[llvm] [CodeGen] Use round-down algorithm for uncooperative constants (PR #99666)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 19 12:17:16 PDT 2024
https://github.com/AtariDreams updated https://github.com/llvm/llvm-project/pull/99666
From e5d13fb994f4b6c2f95405b4e79b61b9058e8098 Mon Sep 17 00:00:00 2001
From: Rose <gfunni234 at gmail.com>
Date: Thu, 18 Jul 2024 12:23:13 -0400
Subject: [PATCH] [CodeGen] Use round-down algorithm for uncooperative
constants
This change switches the unsigned division-by-constant magic-number selection to the round-down algorithm for uncooperative divisors. It is inspired by https://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html
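For reference, a minimal standalone sketch (not part of this patch) of the two
multiplier families the blog post contrasts, using the classical admissibility
conditions as an assumption; width N = 16 is chosen so everything fits in plain
64-bit arithmetic:

  #include <cstdint>

  // Candidate magic multipliers for unsigned division by a constant d at
  // width N = 16. "Round up" is ceil(2^(N+k)/d) and divides with a single
  // high multiply and shift when its excess error is at most 2^k; "round
  // down" is floor(2^(N+k)/d) and needs a fixup step, but only its deficit
  // has to be at most 2^k, which is what helps uncooperative divisors.
  struct Candidates {
    uint64_t RoundUp;
    uint64_t RoundDown;
    bool UpWorks;   // up * d - 2^(N+k) <= 2^k
    bool DownWorks; // 2^(N+k) - down * d <= 2^k
  };

  Candidates candidates(uint32_t d, unsigned k) {
    const unsigned N = 16;
    uint64_t pow = 1ull << (N + k);
    uint64_t down = pow / d;              // floor(2^(N+k) / d)
    uint64_t up = down + (pow % d != 0);  // ceil(2^(N+k) / d)
    return {up, down, up * d - pow <= (1ull << k),
            pow - down * d <= (1ull << k)};
  }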
---
.../llvm/Support/DivisionByConstantInfo.h | 5 +-
llvm/lib/Support/DivisionByConstantInfo.cpp | 86 +-
.../AArch64/GlobalISel/combine-udiv.ll | 138 ++-
.../AArch64/GlobalISel/combine-udiv.mir | 191 ++--
...izer-combiner-divrem-insertpt-conflict.mir | 6 +-
.../prelegalizercombiner-trivial-arith.mir | 34 +-
.../CodeGen/AArch64/arm64-neon-mul-div-cte.ll | 20 +-
llvm/test/CodeGen/AArch64/rotate-extract.ll | 15 +-
llvm/test/CodeGen/AArch64/sve-expand-div.ll | 17 +-
...sve-streaming-mode-fixed-length-int-div.ll | 944 ++++++++----------
llvm/test/CodeGen/AArch64/urem-lkk.ll | 38 +-
.../CodeGen/AArch64/urem-seteq-vec-splat.ll | 13 +-
llvm/test/CodeGen/AArch64/urem-vector-lkk.ll | 135 +--
.../CodeGen/AMDGPU/GlobalISel/udiv.i32.ll | 19 +-
.../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll | 4 +-
.../AMDGPU/amdgpu-codegenprepare-idiv.ll | 36 +-
.../CodeGen/AMDGPU/combine-reg-or-const.ll | 4 +-
llvm/test/CodeGen/AMDGPU/udiv.ll | 145 ++-
llvm/test/CodeGen/AMDGPU/urem.ll | 1 -
.../PowerPC/loop-instr-form-prepare.ll | 6 +-
llvm/test/CodeGen/PowerPC/urem-lkk.ll | 16 +-
llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll | 222 ++--
llvm/test/CodeGen/RISCV/div-by-constant.ll | 241 +++--
llvm/test/CodeGen/RISCV/div.ll | 62 +-
llvm/test/CodeGen/RISCV/pr51206.ll | 5 +-
llvm/test/CodeGen/RISCV/rv64-legal-i32/div.ll | 20 +-
.../rvv/fixed-vectors-buildvec-of-binop.ll | 18 +-
.../RISCV/rvv/fixed-vectors-extract.ll | 26 +-
.../CodeGen/RISCV/rvv/fixed-vectors-int.ll | 371 +++----
llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode.ll | 156 ++-
llvm/test/CodeGen/RISCV/rvv/vremu-sdnode.ll | 150 +--
llvm/test/CodeGen/RISCV/select.ll | 49 +-
.../CodeGen/RISCV/split-udiv-by-constant.ll | 401 ++++----
.../CodeGen/RISCV/split-urem-by-constant.ll | 142 ++-
llvm/test/CodeGen/RISCV/urem-lkk.ll | 45 +-
llvm/test/CodeGen/RISCV/urem-vector-lkk.ll | 30 +-
llvm/test/CodeGen/SystemZ/int-div-06.ll | 62 +-
llvm/test/CodeGen/SystemZ/int-mul-13.ll | 6 +-
.../test/CodeGen/Thumb2/mve-blockplacement.ll | 84 +-
llvm/test/CodeGen/Thumb2/thumb2-select.ll | 119 ++-
llvm/test/CodeGen/VE/Scalar/div.ll | 126 ++-
llvm/test/CodeGen/VE/Scalar/rem.ll | 126 ++-
llvm/test/CodeGen/VE/Vector/vec_divrem.ll | 34 +-
llvm/test/CodeGen/X86/and-encoding.ll | 9 +-
llvm/test/CodeGen/X86/atomic-unordered.ll | 15 +-
llvm/test/CodeGen/X86/bug80500.ll | 3 +-
llvm/test/CodeGen/X86/combine-pmuldq.ll | 100 +-
llvm/test/CodeGen/X86/combine-udiv.ll | 203 ++--
llvm/test/CodeGen/X86/divide-by-constant.ll | 300 +++---
llvm/test/CodeGen/X86/divmod128.ll | 240 ++---
llvm/test/CodeGen/X86/divrem-by-select.ll | 9 +-
llvm/test/CodeGen/X86/freeze.ll | 9 +-
llvm/test/CodeGen/X86/known-bits.ll | 14 +-
llvm/test/CodeGen/X86/known-pow2.ll | 4 +-
.../test/CodeGen/X86/load-scalar-as-vector.ll | 30 +-
...of-two-or-zero-when-comparing-with-zero.ll | 12 +-
llvm/test/CodeGen/X86/pr35636.ll | 8 +-
llvm/test/CodeGen/X86/pr38217.ll | 4 +-
.../CodeGen/X86/prefer-avx256-wide-mul.ll | 42 +-
llvm/test/CodeGen/X86/rem.ll | 3 +-
.../test/CodeGen/X86/rotate-extract-vector.ll | 18 +-
llvm/test/CodeGen/X86/rotate-extract.ll | 40 +-
llvm/test/CodeGen/X86/urem-i8-constant.ll | 4 +-
llvm/test/CodeGen/X86/urem-lkk.ll | 35 +-
llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll | 90 +-
llvm/test/CodeGen/X86/urem-vector-lkk.ll | 283 +++---
llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll | 590 ++++-------
llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll | 579 ++++-------
llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll | 477 +++------
llvm/test/CodeGen/X86/vector-idiv-v2i32.ll | 60 +-
.../X86/vshli-simplify-demanded-bits.ll | 4 +-
llvm/test/CodeGen/X86/x86_64-mul-by-const.ll | 10 +-
.../Support/DivisionByConstantTest.cpp | 28 +-
73 files changed, 3331 insertions(+), 4260 deletions(-)
diff --git a/llvm/include/llvm/Support/DivisionByConstantInfo.h b/llvm/include/llvm/Support/DivisionByConstantInfo.h
index caa0b35e71447..fb0c1382ce821 100644
--- a/llvm/include/llvm/Support/DivisionByConstantInfo.h
+++ b/llvm/include/llvm/Support/DivisionByConstantInfo.h
@@ -26,9 +26,8 @@ struct SignedDivisionByConstantInfo {
/// Magic data for optimising unsigned division by a constant.
struct UnsignedDivisionByConstantInfo {
- static UnsignedDivisionByConstantInfo
- get(const APInt &D, unsigned LeadingZeros = 0,
- bool AllowEvenDivisorOptimization = true);
+ static UnsignedDivisionByConstantInfo get(const APInt &D,
+ unsigned LeadingZeros = 0);
APInt Magic; ///< magic number
bool IsAdd; ///< add indicator
unsigned PostShift; ///< post-shift amount
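For illustration, roughly how a caller invokes the simplified entry point after
this change (a sketch only; the real call sites, such as
TargetLowering::BuildUDIV and llvm/unittests/Support/DivisionByConstantTest.cpp,
are outside this hunk):

  #include <cstdint>
  #include "llvm/ADT/APInt.h"
  #include "llvm/Support/DivisionByConstantInfo.h"
  using namespace llvm;

  // The even-divisor handling is now chosen internally, so there is no
  // AllowEvenDivisorOptimization flag to pass any more.
  UnsignedDivisionByConstantInfo magicFor(uint64_t Divisor, unsigned Bits) {
    APInt D(Bits, Divisor);
    return UnsignedDivisionByConstantInfo::get(D, /*LeadingZeros=*/0);
  }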
diff --git a/llvm/lib/Support/DivisionByConstantInfo.cpp b/llvm/lib/Support/DivisionByConstantInfo.cpp
index b0e503003a680..3fb7c4a56c028 100644
--- a/llvm/lib/Support/DivisionByConstantInfo.cpp
+++ b/llvm/lib/Support/DivisionByConstantInfo.cpp
@@ -70,9 +70,9 @@ SignedDivisionByConstantInfo SignedDivisionByConstantInfo::get(const APInt &D) {
/// S. Warren, Jr., chapter 10.
/// LeadingZeros can be used to simplify the calculation if the upper bits
/// of the divided value are known zero.
-UnsignedDivisionByConstantInfo
-UnsignedDivisionByConstantInfo::get(const APInt &D, unsigned LeadingZeros,
- bool AllowEvenDivisorOptimization) {
+
+static UnsignedDivisionByConstantInfo get2(const APInt &D,
+ unsigned LeadingZeros) {
assert(!D.isZero() && !D.isOne() && "Precondition violation.");
assert(D.getBitWidth() > 1 && "Does not work at smaller bitwidths.");
@@ -132,16 +132,6 @@ UnsignedDivisionByConstantInfo::get(const APInt &D, unsigned LeadingZeros,
} while (P < D.getBitWidth() * 2 &&
(Q1.ult(Delta) || (Q1 == Delta && R1.isZero())));
- if (Retval.IsAdd && !D[0] && AllowEvenDivisorOptimization) {
- unsigned PreShift = D.countr_zero();
- APInt ShiftedD = D.lshr(PreShift);
- Retval =
- UnsignedDivisionByConstantInfo::get(ShiftedD, LeadingZeros + PreShift);
- assert(Retval.IsAdd == 0 && Retval.PreShift == 0);
- Retval.PreShift = PreShift;
- return Retval;
- }
-
Retval.Magic = std::move(Q2); // resulting magic number
++Retval.Magic;
Retval.PostShift = P - D.getBitWidth(); // resulting shift
@@ -153,3 +143,73 @@ UnsignedDivisionByConstantInfo::get(const APInt &D, unsigned LeadingZeros,
Retval.PreShift = 0;
return Retval;
}
+
+UnsignedDivisionByConstantInfo
+UnsignedDivisionByConstantInfo::get(const APInt &D, unsigned LeadingZeros) {
+ assert(!D.isZero() && !D.isOne() && "Precondition violation.");
+ assert(D.getBitWidth() > 1 && "Does not work at smaller bitwidths.");
+
+ if (D.isPowerOf2())
+ return get2(D, LeadingZeros);
+ struct UnsignedDivisionByConstantInfo Retval;
+ APInt SignedMax = APInt::getSignedMaxValue(D.getBitWidth());
+
+  // Initialize Q2 = (2^(W-1) - 1) / D and R2 = (2^(W-1) - 1) % D,
+  // where W is the bit width of D.
+  APInt Q2, R2;
+  APInt::udivrem(SignedMax, D, Q2, R2);
+
+ APInt MultiplierRoundDown = APInt::getZero(D.getBitWidth());
+ unsigned ExponentRoundDown = 0;
+ bool HasMagicDown = false;
+
+ unsigned Log2D = D.ceilLogBase2();
+ unsigned Exponent = 0;
+
+ for (;; Exponent++) {
+ if (R2.uge(D - R2)) {
+ Q2 <<= 1;
+ ++Q2;
+ R2 <<= 1;
+ R2 -= D;
+ } else {
+ Q2 <<= 1;
+ R2 <<= 1;
+ }
+
+ APInt Ule = APInt::getOneBitSet(D.getBitWidth(), Exponent + LeadingZeros);
+
+ if (Exponent + LeadingZeros >= Log2D || (D - R2).ule(Ule))
+ break;
+
+    // Record the round-down multiplier the first time an exponent satisfies
+    // the round-down condition.
+ if (!HasMagicDown && R2.ule(Ule)) {
+ HasMagicDown = true;
+ MultiplierRoundDown = Q2;
+ ExponentRoundDown = Exponent;
+ }
+ }
+
+ if (Exponent < Log2D) {
+    // The round-up multiplier fits at this exponent; no fixup is needed.
+ Retval.Magic = Q2 + 1;
+ Retval.PreShift = 0;
+ Retval.PostShift = Exponent;
+ Retval.IsAdd = false;
+ } else if (D[0]) {
+ Retval.Magic = MultiplierRoundDown;
+ Retval.PreShift = 0;
+ Retval.PostShift = ExponentRoundDown;
+ Retval.IsAdd = true;
+ } else {
+ unsigned PreShift = D.countr_zero();
+ APInt ShiftedD = D.lshr(PreShift);
+ Retval = UnsignedDivisionByConstantInfo::get(
+ ShiftedD, D.getBitWidth() - LeadingZeros - PreShift);
+ assert(Retval.IsAdd == 0 && Retval.PreShift == 0);
+ Retval.PreShift = PreShift;
+ }
+
+ return Retval;
+}
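To make the test updates below easier to read: the fields computed above feed
the usual multiply-and-shift expansion. A minimal sketch of that consumer for
a 32-bit dividend follows (an illustration of the sequences visible in the
updated checks, e.g. umulh + sub + shift + add + shift for the udiv-by-104 MIR
test, not the in-tree lowering code itself, which lives in
TargetLowering::BuildUDIV and the GlobalISel combiner):

  #include <cstdint>

  // High 32 bits of the 64-bit product, i.e. the umulh in the tests.
  static uint32_t mulhu32(uint32_t A, uint32_t B) {
    return (uint32_t)(((uint64_t)A * B) >> 32);
  }

  uint32_t expandUDivByConst(uint32_t N, uint32_t Magic, bool IsAdd,
                             unsigned PreShift, unsigned PostShift) {
    uint32_t Q = mulhu32(N >> PreShift, Magic);
    if (IsAdd) {
      // Fixup path: q = (((n - q) >> 1) + q) >> PostShift, used when the
      // chosen multiplier effectively needs one extra bit.
      uint32_t T = (N - Q) >> 1;
      Q = T + Q;
    }
    return Q >> PostShift;
  }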
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
index d465e0237201b..49e96cc424c1d 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
@@ -6,14 +6,12 @@
define <8 x i16> @combine_vec_udiv_uniform(<8 x i16> %x) {
; SDAG-LABEL: combine_vec_udiv_uniform:
; SDAG: // %bb.0:
-; SDAG-NEXT: mov w8, #25645 // =0x642d
+; SDAG-NEXT: mov w8, #45589 // =0xb215
; SDAG-NEXT: dup v1.8h, w8
; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
-; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h
-; SDAG-NEXT: usra v1.8h, v0.8h, #1
-; SDAG-NEXT: ushr v0.8h, v1.8h, #4
+; SDAG-NEXT: umull v0.4s, v0.4h, v1.4h
+; SDAG-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; SDAG-NEXT: ushr v0.8h, v0.8h, #4
; SDAG-NEXT: ret
;
; GISEL-LABEL: combine_vec_udiv_uniform:
@@ -21,11 +19,9 @@ define <8 x i16> @combine_vec_udiv_uniform(<8 x i16> %x) {
; GISEL-NEXT: adrp x8, .LCPI0_0
; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
-; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; GISEL-NEXT: sub v0.8h, v0.8h, v1.8h
-; GISEL-NEXT: usra v1.8h, v0.8h, #1
-; GISEL-NEXT: ushr v0.8h, v1.8h, #4
+; GISEL-NEXT: umull v0.4s, v0.4h, v1.4h
+; GISEL-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; GISEL-NEXT: ushr v0.8h, v0.8h, #4
; GISEL-NEXT: ret
%1 = udiv <8 x i16> %x, <i16 23, i16 23, i16 23, i16 23, i16 23, i16 23, i16 23, i16 23>
ret <8 x i16> %1
@@ -37,37 +33,30 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
; SDAG-NEXT: adrp x8, .LCPI1_0
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
; SDAG-NEXT: adrp x8, .LCPI1_1
+; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
+; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
+; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI1_1]
; SDAG-NEXT: adrp x8, .LCPI1_2
-; SDAG-NEXT: ushl v1.8h, v0.8h, v1.8h
-; SDAG-NEXT: umull2 v3.4s, v1.8h, v2.8h
-; SDAG-NEXT: umull v1.4s, v1.4h, v2.4h
-; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI1_2]
-; SDAG-NEXT: adrp x8, .LCPI1_3
-; SDAG-NEXT: uzp2 v1.8h, v1.8h, v3.8h
; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h
; SDAG-NEXT: umull2 v3.4s, v0.8h, v2.8h
; SDAG-NEXT: umull v0.4s, v0.4h, v2.4h
; SDAG-NEXT: uzp2 v0.8h, v0.8h, v3.8h
; SDAG-NEXT: add v0.8h, v0.8h, v1.8h
-; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI1_3]
+; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI1_2]
; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h
; SDAG-NEXT: ret
;
; GISEL-LABEL: combine_vec_udiv_nonuniform:
; GISEL: // %bb.0:
-; GISEL-NEXT: adrp x8, .LCPI1_3
-; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI1_3]
; GISEL-NEXT: adrp x8, .LCPI1_2
-; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI1_2]
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI1_2]
; GISEL-NEXT: adrp x8, .LCPI1_1
-; GISEL-NEXT: neg v1.8h, v1.8h
-; GISEL-NEXT: ushl v1.8h, v0.8h, v1.8h
-; GISEL-NEXT: umull2 v3.4s, v1.8h, v2.8h
-; GISEL-NEXT: umull v1.4s, v1.4h, v2.4h
+; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
+; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
+; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI1_1]
; GISEL-NEXT: adrp x8, .LCPI1_0
-; GISEL-NEXT: uzp2 v1.8h, v1.8h, v3.8h
; GISEL-NEXT: sub v0.8h, v0.8h, v1.8h
; GISEL-NEXT: umull2 v3.4s, v0.8h, v2.8h
; GISEL-NEXT: umull v0.4s, v0.4h, v2.4h
@@ -87,13 +76,17 @@ define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
; SDAG-NEXT: adrp x8, .LCPI2_0
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
; SDAG-NEXT: adrp x8, .LCPI2_1
-; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h
-; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI2_1]
-; SDAG-NEXT: adrp x8, .LCPI2_2
; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; SDAG-NEXT: umull v0.4s, v0.4h, v1.4h
+; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
+; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
+; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI2_1]
+; SDAG-NEXT: adrp x8, .LCPI2_2
+; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h
+; SDAG-NEXT: umull2 v3.4s, v0.8h, v2.8h
+; SDAG-NEXT: umull v0.4s, v0.4h, v2.4h
+; SDAG-NEXT: uzp2 v0.8h, v0.8h, v3.8h
+; SDAG-NEXT: add v0.8h, v0.8h, v1.8h
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI2_2]
-; SDAG-NEXT: uzp2 v0.8h, v0.8h, v2.8h
; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h
; SDAG-NEXT: ret
;
@@ -102,15 +95,18 @@ define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
; GISEL-NEXT: adrp x8, .LCPI2_2
; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI2_2]
; GISEL-NEXT: adrp x8, .LCPI2_1
-; GISEL-NEXT: neg v1.8h, v1.8h
-; GISEL-NEXT: ushl v0.8h, v0.8h, v1.8h
-; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI2_1]
-; GISEL-NEXT: adrp x8, .LCPI2_0
; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; GISEL-NEXT: umull v0.4s, v0.4h, v1.4h
-; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
-; GISEL-NEXT: neg v1.8h, v1.8h
-; GISEL-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
+; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
+; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI2_1]
+; GISEL-NEXT: adrp x8, .LCPI2_0
+; GISEL-NEXT: sub v0.8h, v0.8h, v1.8h
+; GISEL-NEXT: umull2 v3.4s, v0.8h, v2.8h
+; GISEL-NEXT: umull v0.4s, v0.4h, v2.4h
+; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI2_0]
+; GISEL-NEXT: uzp2 v0.8h, v0.8h, v3.8h
+; GISEL-NEXT: add v0.8h, v0.8h, v1.8h
+; GISEL-NEXT: neg v1.8h, v2.8h
; GISEL-NEXT: ushl v0.8h, v0.8h, v1.8h
; GISEL-NEXT: ret
%1 = udiv <8 x i16> %x, <i16 -34, i16 35, i16 36, i16 -37, i16 38, i16 -39, i16 40, i16 -41>
@@ -124,12 +120,10 @@ define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
; SDAG-NEXT: adrp x8, .LCPI3_1
; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
-; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h
-; SDAG-NEXT: usra v1.8h, v0.8h, #1
-; SDAG-NEXT: ldr q0, [x8, :lo12:.LCPI3_1]
-; SDAG-NEXT: ushl v0.8h, v1.8h, v0.8h
+; SDAG-NEXT: umull v0.4s, v0.4h, v1.4h
+; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI3_1]
+; SDAG-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h
; SDAG-NEXT: ret
;
; GISEL-LABEL: combine_vec_udiv_nonuniform3:
@@ -138,13 +132,11 @@ define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI3_1]
; GISEL-NEXT: adrp x8, .LCPI3_0
; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
-; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI3_0]
-; GISEL-NEXT: sub v0.8h, v0.8h, v1.8h
-; GISEL-NEXT: usra v1.8h, v0.8h, #1
-; GISEL-NEXT: neg v0.8h, v2.8h
-; GISEL-NEXT: ushl v0.8h, v1.8h, v0.8h
+; GISEL-NEXT: umull v0.4s, v0.4h, v1.4h
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
+; GISEL-NEXT: neg v1.8h, v1.8h
+; GISEL-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; GISEL-NEXT: ushl v0.8h, v0.8h, v1.8h
; GISEL-NEXT: ret
%1 = udiv <8 x i16> %x, <i16 7, i16 23, i16 25, i16 27, i16 31, i16 47, i16 63, i16 127>
ret <8 x i16> %1
@@ -153,7 +145,7 @@ define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SDAG-LABEL: combine_vec_udiv_nonuniform4:
; SDAG: // %bb.0:
-; SDAG-NEXT: movi v1.16b, #171
+; SDAG-NEXT: movi v1.16b, #85
; SDAG-NEXT: adrp x8, .LCPI4_0
; SDAG-NEXT: adrp x9, .LCPI4_1
; SDAG-NEXT: ldr q3, [x9, :lo12:.LCPI4_1]
@@ -162,7 +154,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SDAG-NEXT: and v0.16b, v0.16b, v3.16b
; SDAG-NEXT: uzp2 v1.16b, v1.16b, v2.16b
; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI4_0]
-; SDAG-NEXT: ushr v1.16b, v1.16b, #7
+; SDAG-NEXT: ushr v1.16b, v1.16b, #6
; SDAG-NEXT: and v1.16b, v1.16b, v2.16b
; SDAG-NEXT: orr v0.16b, v0.16b, v1.16b
; SDAG-NEXT: ret
@@ -192,50 +184,36 @@ define <8 x i16> @pr38477(<8 x i16> %a0) {
; SDAG-LABEL: pr38477:
; SDAG: // %bb.0:
; SDAG-NEXT: adrp x8, .LCPI5_0
-; SDAG-NEXT: adrp x9, .LCPI5_4
+; SDAG-NEXT: adrp x9, .LCPI5_3
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI5_0]
; SDAG-NEXT: adrp x8, .LCPI5_1
-; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI5_1]
-; SDAG-NEXT: adrp x8, .LCPI5_2
+; SDAG-NEXT: ldr q3, [x9, :lo12:.LCPI5_3]
; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
-; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; SDAG-NEXT: sub v2.8h, v0.8h, v1.8h
-; SDAG-NEXT: umull2 v4.4s, v2.8h, v3.8h
-; SDAG-NEXT: umull v2.4s, v2.4h, v3.4h
-; SDAG-NEXT: ldr q3, [x9, :lo12:.LCPI5_4]
; SDAG-NEXT: and v0.16b, v0.16b, v3.16b
-; SDAG-NEXT: uzp2 v2.8h, v2.8h, v4.8h
-; SDAG-NEXT: add v1.8h, v2.8h, v1.8h
-; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI5_2]
-; SDAG-NEXT: adrp x8, .LCPI5_3
+; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
+; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI5_1]
+; SDAG-NEXT: adrp x8, .LCPI5_2
; SDAG-NEXT: ushl v1.8h, v1.8h, v2.8h
-; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI5_3]
+; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI5_2]
; SDAG-NEXT: and v1.16b, v1.16b, v2.16b
; SDAG-NEXT: orr v0.16b, v0.16b, v1.16b
; SDAG-NEXT: ret
;
; GISEL-LABEL: pr38477:
; GISEL: // %bb.0:
-; GISEL-NEXT: adrp x8, .LCPI5_3
-; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI5_3]
; GISEL-NEXT: adrp x8, .LCPI5_2
-; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI5_2]
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI5_2]
; GISEL-NEXT: adrp x8, .LCPI5_0
-; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
-; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; GISEL-NEXT: sub v2.8h, v0.8h, v1.8h
-; GISEL-NEXT: umull2 v4.4s, v2.8h, v3.8h
-; GISEL-NEXT: umull v2.4s, v2.4h, v3.4h
; GISEL-NEXT: ldr d3, [x8, :lo12:.LCPI5_0]
; GISEL-NEXT: adrp x8, .LCPI5_1
-; GISEL-NEXT: ushll v3.8h, v3.8b, #0
-; GISEL-NEXT: uzp2 v2.8h, v2.8h, v4.8h
+; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
+; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI5_1]
-; GISEL-NEXT: shl v3.8h, v3.8h, #15
-; GISEL-NEXT: add v1.8h, v2.8h, v1.8h
+; GISEL-NEXT: ushll v3.8h, v3.8b, #0
+; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
; GISEL-NEXT: neg v2.8h, v4.8h
+; GISEL-NEXT: shl v3.8h, v3.8h, #15
; GISEL-NEXT: ushl v1.8h, v1.8h, v2.8h
; GISEL-NEXT: sshr v2.8h, v3.8h, #15
; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
index f8578a694e2d4..8cd2d7e9fc17b 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
@@ -9,13 +9,11 @@ body: |
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 818089009
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
- ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[LSHR]], [[C1]]
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C2]](s32)
- ; CHECK-NEXT: $w0 = COPY [[LSHR1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1636178017
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+ ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[COPY]], [[C]]
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C1]](s32)
+ ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32)
%0:_(s32) = COPY $w0
%cst:_(s32) = G_CONSTANT i32 42
%2:_(s32) = G_UDIV %0(s32), %cst(s32)
@@ -35,18 +33,13 @@ body: |
; CHECK: liveins: $q0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 25645
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 -19947
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16)
; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
- ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]]
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
- ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16)
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[SUB]], [[BUILD_VECTOR2]](<8 x s16>)
- ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[LSHR]], [[UMULH]]
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR1]](<8 x s16>)
- ; CHECK-NEXT: $q0 = COPY [[LSHR1]](<8 x s16>)
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[UMULH]], [[BUILD_VECTOR1]](<8 x s16>)
+ ; CHECK-NEXT: $q0 = COPY [[LSHR]](<8 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<8 x s16>) = COPY $q0
%2:_(s16) = G_CONSTANT i16 23
@@ -72,30 +65,26 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 25645
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -19947
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
- ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -3855
- ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 5
- ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 8195
- ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 13
- ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 3
- ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 9363
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 3855
+ ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+ ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32757
+ ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+ ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 4681
+ ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 512
- ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32767
- ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
- ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32639
- ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C8]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C4]](s16), [[C6]](s16), [[C9]](s16), [[C10]](s16), [[C11]](s16), [[C13]](s16), [[C14]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C5]](s16), [[C7]](s16), [[C1]](s16), [[C1]](s16), [[C12]](s16), [[C12]](s16), [[C1]](s16)
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[COPY]], [[BUILD_VECTOR]](<8 x s16>)
- ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[LSHR]], [[BUILD_VECTOR1]]
+ ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 257
+ ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C4]](s16), [[C6]](s16), [[C8]](s16), [[C10]](s16), [[C5]](s16), [[C11]](s16), [[C9]](s16)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16)
+ ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C5]](s16), [[C7]](s16), [[C9]](s16), [[C1]](s16), [[C1]](s16), [[C12]](s16), [[C1]](s16)
+ ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]]
- ; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR2]]
+ ; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR1]]
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UMULH1]], [[UMULH]]
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR3]](<8 x s16>)
- ; CHECK-NEXT: $q0 = COPY [[LSHR1]](<8 x s16>)
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR2]](<8 x s16>)
+ ; CHECK-NEXT: $q0 = COPY [[LSHR]](<8 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<8 x s16>) = COPY $q0
%2:_(s16) = G_CONSTANT i16 23
@@ -126,26 +115,31 @@ body: |
; CHECK: liveins: $q0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 16393
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
- ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 13
- ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -5617
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 2049
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 11
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -5619
; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 5
- ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 -7281
- ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32749
- ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
- ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 -10347
- ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 8197
- ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 -13107
- ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32747
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C4]](s16), [[C6]](s16), [[C7]](s16), [[C9]](s16), [[C10]](s16), [[C11]](s16), [[C12]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C5]](s16), [[C5]](s16), [[C8]](s16), [[C5]](s16), [[C3]](s16), [[C5]](s16), [[C8]](s16)
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[COPY]], [[BUILD_VECTOR]](<8 x s16>)
- ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[LSHR]], [[BUILD_VECTOR1]]
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[UMULH]], [[BUILD_VECTOR2]](<8 x s16>)
- ; CHECK-NEXT: $q0 = COPY [[LSHR1]](<8 x s16>)
+ ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 -7283
+ ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 16393
+ ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 14
+ ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 -10349
+ ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32749
+ ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+ ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 13107
+ ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s16) = G_CONSTANT i16 3
+ ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s16) = G_CONSTANT i16 8197
+ ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s16) = G_CONSTANT i16 13
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C4]](s16), [[C6]](s16), [[C7]](s16), [[C9]](s16), [[C10]](s16), [[C12]](s16), [[C14]](s16)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
+ ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C5]](s16), [[C5]](s16), [[C8]](s16), [[C5]](s16), [[C11]](s16), [[C13]](s16), [[C15]](s16)
+ ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]]
+ ; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR1]]
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UMULH1]], [[UMULH]]
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR2]](<8 x s16>)
+ ; CHECK-NEXT: $q0 = COPY [[LSHR]](<8 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<8 x s16>) = COPY $q0
%2:_(s16) = G_CONSTANT i16 -34
@@ -176,28 +170,23 @@ body: |
; CHECK: liveins: $q0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 9363
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 25645
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 9363
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -19947
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
- ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 18351
- ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 12137
- ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 2115
- ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 23705
- ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 5
- ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 1041
- ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 517
- ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 6
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C2]](s16), [[C4]](s16), [[C5]](s16), [[C6]](s16), [[C7]](s16), [[C9]](s16), [[C10]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C8]](s16), [[C8]](s16), [[C11]](s16)
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 20971
+ ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 3
+ ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 -26701
+ ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 2115
+ ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 -20917
+ ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 5
+ ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 1041
+ ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 517
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C2]](s16), [[C4]](s16), [[C6]](s16), [[C7]](s16), [[C8]](s16), [[C10]](s16), [[C11]](s16)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C3]](s16), [[C5]](s16), [[C3]](s16), [[C]](s16), [[C9]](s16), [[C]](s16), [[C]](s16)
; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
- ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]]
- ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
- ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C12]](s16), [[C12]](s16), [[C12]](s16), [[C12]](s16), [[C12]](s16), [[C12]](s16), [[C12]](s16), [[C12]](s16)
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[SUB]], [[BUILD_VECTOR2]](<8 x s16>)
- ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[LSHR]], [[UMULH]]
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR1]](<8 x s16>)
- ; CHECK-NEXT: $q0 = COPY [[LSHR1]](<8 x s16>)
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[UMULH]], [[BUILD_VECTOR1]](<8 x s16>)
+ ; CHECK-NEXT: $q0 = COPY [[LSHR]](<8 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<8 x s16>) = COPY $q0
%2:_(s16) = G_CONSTANT i16 7
@@ -229,8 +218,8 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 -85
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s8) = G_CONSTANT i8 7
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 85
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s8) = G_CONSTANT i8 6
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C1]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8)
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C2]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8)
; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<16 x s8>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
@@ -265,30 +254,24 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 4957
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 551
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -8081
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 6
- ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -8079
- ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 4103
- ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 12
- ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 16385
- ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 14
- ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 -29991
- ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 2048
- ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 2115
- ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C4]](s16), [[C5]](s16), [[C7]](s16), [[C9]](s16), [[C10]](s16), [[C11]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C2]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C2]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C3]](s16), [[C3]](s16), [[C6]](s16), [[C8]](s16), [[C3]](s16), [[C]](s16), [[C12]](s16)
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32713
+ ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+ ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32767
+ ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 4443
+ ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 3
+ ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 2048
+ ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 2115
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C2]](s16), [[C4]](s16), [[C6]](s16), [[C7]](s16), [[C9]](s16), [[C10]](s16)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C3]](s16), [[C5]](s16), [[C5]](s16), [[C8]](s16), [[C]](s16), [[C]](s16)
; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
- ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]]
- ; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR1]]
- ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UMULH1]], [[UMULH]]
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR2]](<8 x s16>)
- ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
- ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s1>) = G_BUILD_VECTOR [[C13]](s1), [[C14]](s1), [[C14]](s1), [[C14]](s1), [[C14]](s1), [[C14]](s1), [[C14]](s1), [[C14]](s1)
- ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<8 x s16>) = G_SELECT [[BUILD_VECTOR3]](<8 x s1>), [[COPY]], [[LSHR]]
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[UMULH]], [[BUILD_VECTOR1]](<8 x s16>)
+ ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+ ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s1>) = G_BUILD_VECTOR [[C11]](s1), [[C12]](s1), [[C12]](s1), [[C12]](s1), [[C12]](s1), [[C12]](s1), [[C12]](s1), [[C12]](s1)
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<8 x s16>) = G_SELECT [[BUILD_VECTOR2]](<8 x s1>), [[COPY]], [[LSHR]]
; CHECK-NEXT: $q0 = COPY [[SELECT]](<8 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<8 x s16>) = COPY $q0
@@ -338,11 +321,15 @@ body: |
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1321528399
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 660764199
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[COPY]], [[C]]
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C1]](s32)
- ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32)
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[UMULH]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[SUB]], [[C2]](s32)
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR]], [[UMULH]]
+ ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C1]](s32)
+ ; CHECK-NEXT: $w0 = COPY [[LSHR1]](s32)
; CHECK-NEXT: RET_ReallyLR implicit $w0
%0:_(s32) = COPY $w0
%1:_(s32) = G_CONSTANT i32 104
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir
index 2e879c7e1622a..4e4cc3349fb76 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir
@@ -9,13 +9,11 @@ body: |
bb.1:
; CHECK-LABEL: name: test
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483647
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[C]], [[C1]]
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C2]](s32)
; CHECK-NEXT: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[C]], [[C]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[UREM]](s32)
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LSHR]](s32)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[UMULH]](s32)
; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC]](s8)
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[ZEXT]], [[SEXT]]
; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[OR]](s64)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir
index 0900dd4267a2e..a695c7527a7f3 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir
@@ -26,8 +26,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (x + 0) -> x
- ;
; CHECK-LABEL: name: right_ident_add
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -47,8 +45,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (x * 0) -> 0
- ;
; CHECK-LABEL: name: mul_0
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -69,8 +65,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (x * 0) -> 0
- ;
; CHECK-LABEL: name: mul_0_cant_replace
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -93,8 +87,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (0 / x) -> 0
- ;
; CHECK-LABEL: name: sdiv_0
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -114,8 +106,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (0 / x) -> 0
- ;
; CHECK-LABEL: name: udiv_0
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -135,8 +125,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (0 % x) -> 0
- ;
; CHECK-LABEL: name: srem_0
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -156,8 +144,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (0 % x) -> 0
- ;
; CHECK-LABEL: name: urem_0
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -178,8 +164,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (x || 0) -> x
- ;
; CHECK-LABEL: name: right_ident_or
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -199,8 +183,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (x | 0) -> x
- ;
; CHECK-LABEL: name: right_ident_xor
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -220,8 +202,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (x << 0) -> x
- ;
; CHECK-LABEL: name: right_ident_shl
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -241,8 +221,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (x ashr 0) -> x
- ;
; CHECK-LABEL: name: right_ident_ashr
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -262,8 +240,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (x lshr 0) -> x
- ;
; CHECK-LABEL: name: right_ident_lshr
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -283,8 +259,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Not an identity, no folding.
- ;
; CHECK-LABEL: name: dont_fold_sub
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -325,8 +299,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $x0
- ; Fold (x + 0) -> x
- ;
; CHECK-LABEL: name: right_ident_ptr_add
; CHECK: liveins: $x0
; CHECK-NEXT: {{ $}}
@@ -476,8 +448,10 @@ body: |
; CHECK-LABEL: name: udiv_of_sext
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: $w0 = COPY [[C]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s2) = G_CONSTANT i2 1
+ ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s2) = G_UMULH [[C]], [[C]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMULH]](s2)
+ ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32)
; CHECK-NEXT: RET_ReallyLR implicit $w0
%2:_(s1) = G_CONSTANT i1 true
%4:_(s2) = G_CONSTANT i2 1
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll
index f1458b76c525a..1b192342953c9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll
@@ -50,11 +50,11 @@ define <4 x i32> @div32xi4(<4 x i32> %x) {
define <16 x i8> @udiv16xi8(<16 x i8> %x) {
; CHECK-LABEL: udiv16xi8:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.16b, #121
+; CHECK-NEXT: movi v1.16b, #15
; CHECK-NEXT: umull2 v2.8h, v0.16b, v1.16b
; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT: uzp2 v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: ushr v0.16b, v0.16b, #5
+; CHECK-NEXT: ushr v0.16b, v0.16b, #2
; CHECK-NEXT: ret
%div = udiv <16 x i8> %x, <i8 68, i8 68, i8 68, i8 68, i8 68, i8 68, i8 68, i8 68, i8 68, i8 68, i8 68, i8 68, i8 68, i8 68, i8 68, i8 68>
ret <16 x i8> %div
@@ -63,14 +63,12 @@ define <16 x i8> @udiv16xi8(<16 x i8> %x) {
define <8 x i16> @udiv8xi16(<8 x i16> %x) {
; CHECK-LABEL: udiv8xi16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #16593 // =0x40d1
+; CHECK-NEXT: mov w8, #41063 // =0xa067
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
-; CHECK-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: usra v1.8h, v0.8h, #1
-; CHECK-NEXT: ushr v0.8h, v1.8h, #12
+; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: ushr v0.8h, v0.8h, #12
; CHECK-NEXT: ret
%div = udiv <8 x i16> %x, <i16 6537, i16 6537, i16 6537, i16 6537, i16 6537, i16 6537, i16 6537, i16 6537>
ret <8 x i16> %div
@@ -79,13 +77,13 @@ define <8 x i16> @udiv8xi16(<8 x i16> %x) {
define <4 x i32> @udiv32xi4(<4 x i32> %x) {
; CHECK-LABEL: udiv32xi4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #16747 // =0x416b
-; CHECK-NEXT: movk w8, #31439, lsl #16
+; CHECK-NEXT: mov w8, #41141 // =0xa0b5
+; CHECK-NEXT: movk w8, #15719, lsl #16
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s
; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT: uzp2 v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: ushr v0.4s, v0.4s, #22
+; CHECK-NEXT: ushr v0.4s, v0.4s, #21
; CHECK-NEXT: ret
%div = udiv <4 x i32> %x, <i32 8743143, i32 8743143, i32 8743143, i32 8743143>
ret <4 x i32> %div
diff --git a/llvm/test/CodeGen/AArch64/rotate-extract.ll b/llvm/test/CodeGen/AArch64/rotate-extract.ll
index e3eaf81245ff4..73e0161e64fd5 100644
--- a/llvm/test/CodeGen/AArch64/rotate-extract.ll
+++ b/llvm/test/CodeGen/AArch64/rotate-extract.ll
@@ -50,10 +50,8 @@ define i32 @ror_extract_mul(i32 %i) nounwind {
define i64 @ror_extract_udiv(i64 %i) nounwind {
; CHECK-LABEL: ror_extract_udiv:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-6148914691236517206 // =0xaaaaaaaaaaaaaaaa
-; CHECK-NEXT: movk x8, #43691
+; CHECK-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555
; CHECK-NEXT: umulh x8, x0, x8
-; CHECK-NEXT: lsr x8, x8, #1
; CHECK-NEXT: ror x0, x8, #4
; CHECK-NEXT: ret
%lhs_div = udiv i64 %i, 3
@@ -127,15 +125,14 @@ define i64 @no_extract_mul(i64 %i) nounwind {
define i32 @no_extract_udiv(i32 %i) nounwind {
; CHECK-LABEL: no_extract_udiv:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #43691 // =0xaaab
-; CHECK-NEXT: mov w9, #33437 // =0x829d
-; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: movk w9, #21399, lsl #16
+; CHECK-NEXT: mov w9, #30762 // =0x782a
+; CHECK-NEXT: mov w8, #1431655765 // =0x55555555
+; CHECK-NEXT: movk w9, #1337, lsl #16
; CHECK-NEXT: umull x8, w0, w8
; CHECK-NEXT: umull x9, w0, w9
-; CHECK-NEXT: lsr x8, x8, #33
+; CHECK-NEXT: lsr x8, x8, #32
; CHECK-NEXT: lsr x9, x9, #32
-; CHECK-NEXT: extr w0, w8, w9, #4
+; CHECK-NEXT: orr w0, w9, w8, lsl #28
; CHECK-NEXT: ret
%lhs_div = udiv i32 %i, 3
%rhs_div = udiv i32 %i, 49
diff --git a/llvm/test/CodeGen/AArch64/sve-expand-div.ll b/llvm/test/CodeGen/AArch64/sve-expand-div.ll
index 180c64e0a7de1..9a51f1d1b8e9e 100644
--- a/llvm/test/CodeGen/AArch64/sve-expand-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-expand-div.ll
@@ -71,10 +71,9 @@ define <vscale x 2 x i64> @sdiv_i64(<vscale x 2 x i64> %a) #0 {
define <vscale x 16 x i8> @udiv_i8(<vscale x 16 x i8> %a) #0 {
; CHECK-LABEL: udiv_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.b, #-85 // =0xffffffffffffffab
+; CHECK-NEXT: mov z1.b, #85 // =0x55
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
-; CHECK-NEXT: lsr z0.b, z0.b, #1
; CHECK-NEXT: ret
%div = udiv <vscale x 16 x i8> %a, splat (i8 3)
ret <vscale x 16 x i8> %div
@@ -83,11 +82,9 @@ define <vscale x 16 x i8> @udiv_i8(<vscale x 16 x i8> %a) #0 {
define <vscale x 8 x i16> @udiv_i16(<vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: udiv_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #-21845 // =0xffffaaab
+; CHECK-NEXT: dupm z1.b, #0x55
; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: mov z1.h, w8
; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: lsr z0.h, z0.h, #1
; CHECK-NEXT: ret
%div = udiv <vscale x 8 x i16> %a, splat (i16 3)
ret <vscale x 8 x i16> %div
@@ -96,12 +93,9 @@ define <vscale x 8 x i16> @udiv_i16(<vscale x 8 x i16> %a) #0 {
define <vscale x 4 x i32> @udiv_i32(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: udiv_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #43691 // =0xaaab
+; CHECK-NEXT: dupm z1.b, #0x55
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: mov z1.s, w8
; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: lsr z0.s, z0.s, #1
; CHECK-NEXT: ret
%div = udiv <vscale x 4 x i32> %a, splat (i32 3)
ret <vscale x 4 x i32> %div
@@ -110,12 +104,9 @@ define <vscale x 4 x i32> @udiv_i32(<vscale x 4 x i32> %a) #0 {
define <vscale x 2 x i64> @udiv_i64(<vscale x 2 x i64> %a) #0 {
; CHECK-LABEL: udiv_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-6148914691236517206 // =0xaaaaaaaaaaaaaaaa
+; CHECK-NEXT: dupm z1.b, #0x55
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: movk x8, #43691
-; CHECK-NEXT: mov z1.d, x8
; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: lsr z0.d, z0.d, #1
; CHECK-NEXT: ret
%div = udiv <vscale x 2 x i64> %a, splat (i64 3)
ret <vscale x 2 x i64> %div
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
index 516772b8ca664..6d28f343c32db 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
@@ -26,19 +26,6 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v4i8:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8
-; NEON-NOSVE-NEXT: shl v1.4h, v1.4h, #8
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8
-; NEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8
-; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: xtn v0.4h, v0.4s
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v4i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -63,6 +50,18 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v4i8:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8
+; NEON-NOSVE-NEXT: shl v1.4h, v1.4h, #8
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8
+; NEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8
+; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: xtn v0.4h, v0.4s
+; NEON-NOSVE-NEXT: ret
%res = sdiv <4 x i8> %op1, %op2
ret <4 x i8> %res
}
@@ -91,21 +90,6 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v8i8:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0
-; NEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: sshll2 v2.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: sshll2 v3.4s, v0.8h, #0
-; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; NEON-NOSVE-NEXT: xtn v0.8b, v0.8h
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v8i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -146,6 +130,20 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v8i8:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0
+; NEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: sshll2 v2.4s, v1.8h, #0
+; NEON-NOSVE-NEXT: sshll2 v3.4s, v0.8h, #0
+; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; NEON-NOSVE-NEXT: xtn v0.8b, v0.8h
+; NEON-NOSVE-NEXT: ret
%res = sdiv <8 x i8> %op1, %op2
ret <8 x i8> %res
}
@@ -192,30 +190,6 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v16i8:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: sshll2 v2.8h, v1.16b, #0
-; NEON-NOSVE-NEXT: sshll2 v3.8h, v0.16b, #0
-; NEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0
-; NEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: sshll2 v4.4s, v2.8h, #0
-; NEON-NOSVE-NEXT: sshll2 v5.4s, v3.8h, #0
-; NEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0
-; NEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0
-; NEON-NOSVE-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
-; NEON-NOSVE-NEXT: sshll2 v5.4s, v0.8h, #0
-; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: sshll2 v3.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: sdivr z3.s, p0/m, z3.s, z5.s
-; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h
-; NEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v16i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
@@ -287,6 +261,29 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
; NONEON-NOSVE-NEXT: add sp, sp, #48
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v16i8:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: sshll2 v2.8h, v1.16b, #0
+; NEON-NOSVE-NEXT: sshll2 v3.8h, v0.16b, #0
+; NEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0
+; NEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: sshll2 v4.4s, v2.8h, #0
+; NEON-NOSVE-NEXT: sshll2 v5.4s, v3.8h, #0
+; NEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0
+; NEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0
+; NEON-NOSVE-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
+; NEON-NOSVE-NEXT: sshll2 v5.4s, v0.8h, #0
+; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; NEON-NOSVE-NEXT: sshll2 v3.4s, v1.8h, #0
+; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: sdivr z3.s, p0/m, z3.s, z5.s
+; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h
+; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h
+; NEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; NEON-NOSVE-NEXT: ret
%res = sdiv <16 x i8> %op1, %op2
ret <16 x i8> %res
}
@@ -365,53 +362,6 @@ define void @sdiv_v32i8(ptr %a, ptr %b) {
; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v32i8:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ldp q6, q3, [x1]
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: ldr q2, [x0, #16]
-; NEON-NOSVE-NEXT: sshll2 v1.8h, v3.16b, #0
-; NEON-NOSVE-NEXT: sshll2 v4.8h, v2.16b, #0
-; NEON-NOSVE-NEXT: sshll v3.8h, v3.8b, #0
-; NEON-NOSVE-NEXT: sshll v2.8h, v2.8b, #0
-; NEON-NOSVE-NEXT: sshll2 v7.8h, v6.16b, #0
-; NEON-NOSVE-NEXT: sshll v6.8h, v6.8b, #0
-; NEON-NOSVE-NEXT: sshll2 v0.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: sshll2 v5.4s, v4.8h, #0
-; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: sshll v4.4s, v4.4h, #0
-; NEON-NOSVE-NEXT: sshll2 v17.4s, v7.8h, #0
-; NEON-NOSVE-NEXT: sshll v7.4s, v7.4h, #0
-; NEON-NOSVE-NEXT: sdivr z0.s, p0/m, z0.s, z5.s
-; NEON-NOSVE-NEXT: sshll2 v5.4s, v2.8h, #0
-; NEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0
-; NEON-NOSVE-NEXT: sdivr z1.s, p0/m, z1.s, z4.s
-; NEON-NOSVE-NEXT: sshll2 v4.4s, v3.8h, #0
-; NEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0
-; NEON-NOSVE-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
-; NEON-NOSVE-NEXT: ldr q5, [x0]
-; NEON-NOSVE-NEXT: sshll2 v16.8h, v5.16b, #0
-; NEON-NOSVE-NEXT: sshll v5.8h, v5.8b, #0
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h
-; NEON-NOSVE-NEXT: sshll2 v18.4s, v16.8h, #0
-; NEON-NOSVE-NEXT: sshll v16.4s, v16.4h, #0
-; NEON-NOSVE-NEXT: sdivr z17.s, p0/m, z17.s, z18.s
-; NEON-NOSVE-NEXT: sshll2 v18.4s, v5.8h, #0
-; NEON-NOSVE-NEXT: sshll v5.4s, v5.4h, #0
-; NEON-NOSVE-NEXT: sdivr z7.s, p0/m, z7.s, z16.s
-; NEON-NOSVE-NEXT: sshll2 v16.4s, v6.8h, #0
-; NEON-NOSVE-NEXT: sshll v6.4s, v6.4h, #0
-; NEON-NOSVE-NEXT: sdivr z16.s, p0/m, z16.s, z18.s
-; NEON-NOSVE-NEXT: sdiv z5.s, p0/m, z5.s, z6.s
-; NEON-NOSVE-NEXT: sdiv z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: uzp1 v3.8h, v7.8h, v17.8h
-; NEON-NOSVE-NEXT: uzp1 v5.8h, v5.8h, v16.8h
-; NEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h
-; NEON-NOSVE-NEXT: uzp1 v2.16b, v5.16b, v3.16b
-; NEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b
-; NEON-NOSVE-NEXT: stp q2, q0, [x0]
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v32i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #96
@@ -552,6 +502,52 @@ define void @sdiv_v32i8(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
; NONEON-NOSVE-NEXT: add sp, sp, #96
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v32i8:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ldp q6, q3, [x1]
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: ldr q2, [x0, #16]
+; NEON-NOSVE-NEXT: sshll2 v1.8h, v3.16b, #0
+; NEON-NOSVE-NEXT: sshll2 v4.8h, v2.16b, #0
+; NEON-NOSVE-NEXT: sshll v3.8h, v3.8b, #0
+; NEON-NOSVE-NEXT: sshll v2.8h, v2.8b, #0
+; NEON-NOSVE-NEXT: sshll2 v7.8h, v6.16b, #0
+; NEON-NOSVE-NEXT: sshll v6.8h, v6.8b, #0
+; NEON-NOSVE-NEXT: sshll2 v0.4s, v1.8h, #0
+; NEON-NOSVE-NEXT: sshll2 v5.4s, v4.8h, #0
+; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: sshll v4.4s, v4.4h, #0
+; NEON-NOSVE-NEXT: sshll2 v17.4s, v7.8h, #0
+; NEON-NOSVE-NEXT: sshll v7.4s, v7.4h, #0
+; NEON-NOSVE-NEXT: sdivr z0.s, p0/m, z0.s, z5.s
+; NEON-NOSVE-NEXT: sshll2 v5.4s, v2.8h, #0
+; NEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0
+; NEON-NOSVE-NEXT: sdivr z1.s, p0/m, z1.s, z4.s
+; NEON-NOSVE-NEXT: sshll2 v4.4s, v3.8h, #0
+; NEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0
+; NEON-NOSVE-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
+; NEON-NOSVE-NEXT: ldr q5, [x0]
+; NEON-NOSVE-NEXT: sshll2 v16.8h, v5.16b, #0
+; NEON-NOSVE-NEXT: sshll v5.8h, v5.8b, #0
+; NEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; NEON-NOSVE-NEXT: sshll2 v18.4s, v16.8h, #0
+; NEON-NOSVE-NEXT: sshll v16.4s, v16.4h, #0
+; NEON-NOSVE-NEXT: sdivr z17.s, p0/m, z17.s, z18.s
+; NEON-NOSVE-NEXT: sshll2 v18.4s, v5.8h, #0
+; NEON-NOSVE-NEXT: sshll v5.4s, v5.4h, #0
+; NEON-NOSVE-NEXT: sdivr z7.s, p0/m, z7.s, z16.s
+; NEON-NOSVE-NEXT: sshll2 v16.4s, v6.8h, #0
+; NEON-NOSVE-NEXT: sshll v6.4s, v6.4h, #0
+; NEON-NOSVE-NEXT: sdivr z16.s, p0/m, z16.s, z18.s
+; NEON-NOSVE-NEXT: sdiv z5.s, p0/m, z5.s, z6.s
+; NEON-NOSVE-NEXT: sdiv z2.s, p0/m, z2.s, z3.s
+; NEON-NOSVE-NEXT: uzp1 v3.8h, v7.8h, v17.8h
+; NEON-NOSVE-NEXT: uzp1 v5.8h, v5.8h, v16.8h
+; NEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h
+; NEON-NOSVE-NEXT: uzp1 v2.16b, v5.16b, v3.16b
+; NEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b
+; NEON-NOSVE-NEXT: stp q2, q0, [x0]
+; NEON-NOSVE-NEXT: ret
%op1 = load <32 x i8>, ptr %a
%op2 = load <32 x i8>, ptr %b
%res = sdiv <32 x i8> %op1, %op2
@@ -571,17 +567,6 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v2i16:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: shl v1.2s, v1.2s, #16
-; NEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16
-; NEON-NOSVE-NEXT: ptrue p0.s, vl2
-; NEON-NOSVE-NEXT: sshr v1.2s, v1.2s, #16
-; NEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16
-; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v2i16:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -597,6 +582,16 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v2i16:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: shl v1.2s, v1.2s, #16
+; NEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16
+; NEON-NOSVE-NEXT: ptrue p0.s, vl2
+; NEON-NOSVE-NEXT: sshr v1.2s, v1.2s, #16
+; NEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16
+; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
+; NEON-NOSVE-NEXT: ret
%res = sdiv <2 x i16> %op1, %op2
ret <2 x i16> %res
}
@@ -614,15 +609,6 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v4i16:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: xtn v0.4h, v0.4s
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v4i16:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -647,6 +633,14 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v4i16:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: xtn v0.4h, v0.4s
+; NEON-NOSVE-NEXT: ret
%res = sdiv <4 x i16> %op1, %op2
ret <4 x i16> %res
}
@@ -672,18 +666,6 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v8i16:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: sshll2 v2.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: sshll2 v3.4s, v0.8h, #0
-; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v8i16:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
@@ -723,6 +705,17 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
; NONEON-NOSVE-NEXT: add sp, sp, #48
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v8i16:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: sshll2 v2.4s, v1.8h, #0
+; NEON-NOSVE-NEXT: sshll2 v3.4s, v0.8h, #0
+; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; NEON-NOSVE-NEXT: ret
%res = sdiv <8 x i16> %op1, %op2
ret <8 x i16> %res
}
@@ -760,29 +753,6 @@ define void @sdiv_v16i16(ptr %a, ptr %b) {
; CHECK-NEXT: stp q1, q2, [x0]
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v16i16:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ldp q4, q1, [x1]
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: ldr q0, [x0, #16]
-; NEON-NOSVE-NEXT: sshll2 v2.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: sshll2 v3.4s, v0.8h, #0
-; NEON-NOSVE-NEXT: sshll2 v5.4s, v4.8h, #0
-; NEON-NOSVE-NEXT: sshll v4.4s, v4.4h, #0
-; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: ldr q3, [x0]
-; NEON-NOSVE-NEXT: sshll2 v6.4s, v3.8h, #0
-; NEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0
-; NEON-NOSVE-NEXT: sdivr z5.s, p0/m, z5.s, z6.s
-; NEON-NOSVE-NEXT: sdiv z3.s, p0/m, z3.s, z4.s
-; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v5.8h
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; NEON-NOSVE-NEXT: stp q1, q0, [x0]
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v16i16:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #96
@@ -859,6 +829,28 @@ define void @sdiv_v16i16(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
; NONEON-NOSVE-NEXT: add sp, sp, #96
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v16i16:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ldp q4, q1, [x1]
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: ldr q0, [x0, #16]
+; NEON-NOSVE-NEXT: sshll2 v2.4s, v1.8h, #0
+; NEON-NOSVE-NEXT: sshll2 v3.4s, v0.8h, #0
+; NEON-NOSVE-NEXT: sshll2 v5.4s, v4.8h, #0
+; NEON-NOSVE-NEXT: sshll v4.4s, v4.4h, #0
+; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; NEON-NOSVE-NEXT: ldr q3, [x0]
+; NEON-NOSVE-NEXT: sshll2 v6.4s, v3.8h, #0
+; NEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0
+; NEON-NOSVE-NEXT: sdivr z5.s, p0/m, z5.s, z6.s
+; NEON-NOSVE-NEXT: sdiv z3.s, p0/m, z3.s, z4.s
+; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v5.8h
+; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; NEON-NOSVE-NEXT: stp q1, q0, [x0]
+; NEON-NOSVE-NEXT: ret
%op1 = load <16 x i16>, ptr %a
%op2 = load <16 x i16>, ptr %b
%res = sdiv <16 x i16> %op1, %op2
@@ -876,15 +868,6 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v2i32:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ptrue p0.s, vl2
-; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $z0
-; NEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $z1
-; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v2i32:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -899,6 +882,14 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v2i32:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ptrue p0.s, vl2
+; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $z0
+; NEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $z1
+; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
+; NEON-NOSVE-NEXT: ret
%res = sdiv <2 x i32> %op1, %op2
ret <2 x i32> %res
}
@@ -913,15 +904,6 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v4i32:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 def $z0
-; NEON-NOSVE-NEXT: // kill: def $q1 killed $q1 def $z1
-; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 killed $z0
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v4i32:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
@@ -941,6 +923,14 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
; NONEON-NOSVE-NEXT: add sp, sp, #48
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v4i32:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; NEON-NOSVE-NEXT: // kill: def $q1 killed $q1 def $z1
+; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 killed $z0
+; NEON-NOSVE-NEXT: ret
%res = sdiv <4 x i32> %op1, %op2
ret <4 x i32> %res
}
@@ -957,17 +947,6 @@ define void @sdiv_v8i32(ptr %a, ptr %b) {
; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v8i32:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ldp q0, q3, [x1]
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: ldp q1, q2, [x0]
-; NEON-NOSVE-NEXT: sdivr z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: movprfx z1, z2
-; NEON-NOSVE-NEXT: sdiv z1.s, p0/m, z1.s, z3.s
-; NEON-NOSVE-NEXT: stp q0, q1, [x0]
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v8i32:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #96
@@ -1004,6 +983,16 @@ define void @sdiv_v8i32(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
; NONEON-NOSVE-NEXT: add sp, sp, #96
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v8i32:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ldp q0, q3, [x1]
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: ldp q1, q2, [x0]
+; NEON-NOSVE-NEXT: sdivr z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: movprfx z1, z2
+; NEON-NOSVE-NEXT: sdiv z1.s, p0/m, z1.s, z3.s
+; NEON-NOSVE-NEXT: stp q0, q1, [x0]
+; NEON-NOSVE-NEXT: ret
%op1 = load <8 x i32>, ptr %a
%op2 = load <8 x i32>, ptr %b
%res = sdiv <8 x i32> %op1, %op2
@@ -1021,15 +1010,6 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v1i64:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ptrue p0.d, vl1
-; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $z0
-; NEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $z1
-; NEON-NOSVE-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
-; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v1i64:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #16
@@ -1041,6 +1021,14 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #8]
; NONEON-NOSVE-NEXT: add sp, sp, #16
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v1i64:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ptrue p0.d, vl1
+; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $z0
+; NEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $z1
+; NEON-NOSVE-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
+; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
+; NEON-NOSVE-NEXT: ret
%res = sdiv <1 x i64> %op1, %op2
ret <1 x i64> %res
}
@@ -1055,15 +1043,6 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v2i64:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ptrue p0.d, vl2
-; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 def $z0
-; NEON-NOSVE-NEXT: // kill: def $q1 killed $q1 def $z1
-; NEON-NOSVE-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
-; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 killed $z0
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v2i64:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
@@ -1077,6 +1056,14 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
; NONEON-NOSVE-NEXT: add sp, sp, #48
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v2i64:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ptrue p0.d, vl2
+; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; NEON-NOSVE-NEXT: // kill: def $q1 killed $q1 def $z1
+; NEON-NOSVE-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
+; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 killed $z0
+; NEON-NOSVE-NEXT: ret
%res = sdiv <2 x i64> %op1, %op2
ret <2 x i64> %res
}
@@ -1093,17 +1080,6 @@ define void @sdiv_v4i64(ptr %a, ptr %b) {
; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v4i64:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ldp q0, q3, [x1]
-; NEON-NOSVE-NEXT: ptrue p0.d, vl2
-; NEON-NOSVE-NEXT: ldp q1, q2, [x0]
-; NEON-NOSVE-NEXT: sdivr z0.d, p0/m, z0.d, z1.d
-; NEON-NOSVE-NEXT: movprfx z1, z2
-; NEON-NOSVE-NEXT: sdiv z1.d, p0/m, z1.d, z3.d
-; NEON-NOSVE-NEXT: stp q0, q1, [x0]
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v4i64:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #96
@@ -1128,6 +1104,16 @@ define void @sdiv_v4i64(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
; NONEON-NOSVE-NEXT: add sp, sp, #96
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v4i64:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ldp q0, q3, [x1]
+; NEON-NOSVE-NEXT: ptrue p0.d, vl2
+; NEON-NOSVE-NEXT: ldp q1, q2, [x0]
+; NEON-NOSVE-NEXT: sdivr z0.d, p0/m, z0.d, z1.d
+; NEON-NOSVE-NEXT: movprfx z1, z2
+; NEON-NOSVE-NEXT: sdiv z1.d, p0/m, z1.d, z3.d
+; NEON-NOSVE-NEXT: stp q0, q1, [x0]
+; NEON-NOSVE-NEXT: ret
%op1 = load <4 x i64>, ptr %a
%op2 = load <4 x i64>, ptr %b
%res = sdiv <4 x i64> %op1, %op2
@@ -1154,17 +1140,6 @@ define <4 x i8> @udiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v4i8:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: bic v0.4h, #255, lsl #8
-; NEON-NOSVE-NEXT: bic v1.4h, #255, lsl #8
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: xtn v0.4h, v0.4s
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v4i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -1189,6 +1164,16 @@ define <4 x i8> @udiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v4i8:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: bic v0.4h, #255, lsl #8
+; NEON-NOSVE-NEXT: bic v1.4h, #255, lsl #8
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: xtn v0.4h, v0.4s
+; NEON-NOSVE-NEXT: ret
%res = udiv <4 x i8> %op1, %op2
ret <4 x i8> %res
}
@@ -1217,21 +1202,6 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v8i8:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0
-; NEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: ushll2 v2.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: ushll2 v3.4s, v0.8h, #0
-; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; NEON-NOSVE-NEXT: xtn v0.8b, v0.8h
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v8i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -1272,6 +1242,20 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v8i8:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0
+; NEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: ushll2 v2.4s, v1.8h, #0
+; NEON-NOSVE-NEXT: ushll2 v3.4s, v0.8h, #0
+; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; NEON-NOSVE-NEXT: xtn v0.8b, v0.8h
+; NEON-NOSVE-NEXT: ret
%res = udiv <8 x i8> %op1, %op2
ret <8 x i8> %res
}
@@ -1318,30 +1302,6 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v16i8:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ushll2 v2.8h, v1.16b, #0
-; NEON-NOSVE-NEXT: ushll2 v3.8h, v0.16b, #0
-; NEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0
-; NEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: ushll2 v4.4s, v2.8h, #0
-; NEON-NOSVE-NEXT: ushll2 v5.4s, v3.8h, #0
-; NEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0
-; NEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0
-; NEON-NOSVE-NEXT: udivr z4.s, p0/m, z4.s, z5.s
-; NEON-NOSVE-NEXT: ushll2 v5.4s, v0.8h, #0
-; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: ushll2 v3.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: udivr z3.s, p0/m, z3.s, z5.s
-; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h
-; NEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v16i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
@@ -1413,6 +1373,29 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
; NONEON-NOSVE-NEXT: add sp, sp, #48
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v16i8:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ushll2 v2.8h, v1.16b, #0
+; NEON-NOSVE-NEXT: ushll2 v3.8h, v0.16b, #0
+; NEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0
+; NEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: ushll2 v4.4s, v2.8h, #0
+; NEON-NOSVE-NEXT: ushll2 v5.4s, v3.8h, #0
+; NEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0
+; NEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0
+; NEON-NOSVE-NEXT: udivr z4.s, p0/m, z4.s, z5.s
+; NEON-NOSVE-NEXT: ushll2 v5.4s, v0.8h, #0
+; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; NEON-NOSVE-NEXT: ushll2 v3.4s, v1.8h, #0
+; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: udivr z3.s, p0/m, z3.s, z5.s
+; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h
+; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h
+; NEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; NEON-NOSVE-NEXT: ret
%res = udiv <16 x i8> %op1, %op2
ret <16 x i8> %res
}
@@ -1491,53 +1474,6 @@ define void @udiv_v32i8(ptr %a, ptr %b) {
; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v32i8:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ldp q6, q3, [x1]
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: ldr q2, [x0, #16]
-; NEON-NOSVE-NEXT: ushll2 v1.8h, v3.16b, #0
-; NEON-NOSVE-NEXT: ushll2 v4.8h, v2.16b, #0
-; NEON-NOSVE-NEXT: ushll v3.8h, v3.8b, #0
-; NEON-NOSVE-NEXT: ushll v2.8h, v2.8b, #0
-; NEON-NOSVE-NEXT: ushll2 v7.8h, v6.16b, #0
-; NEON-NOSVE-NEXT: ushll v6.8h, v6.8b, #0
-; NEON-NOSVE-NEXT: ushll2 v0.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: ushll2 v5.4s, v4.8h, #0
-; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: ushll v4.4s, v4.4h, #0
-; NEON-NOSVE-NEXT: ushll2 v17.4s, v7.8h, #0
-; NEON-NOSVE-NEXT: ushll v7.4s, v7.4h, #0
-; NEON-NOSVE-NEXT: udivr z0.s, p0/m, z0.s, z5.s
-; NEON-NOSVE-NEXT: ushll2 v5.4s, v2.8h, #0
-; NEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0
-; NEON-NOSVE-NEXT: udivr z1.s, p0/m, z1.s, z4.s
-; NEON-NOSVE-NEXT: ushll2 v4.4s, v3.8h, #0
-; NEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0
-; NEON-NOSVE-NEXT: udivr z4.s, p0/m, z4.s, z5.s
-; NEON-NOSVE-NEXT: ldr q5, [x0]
-; NEON-NOSVE-NEXT: ushll2 v16.8h, v5.16b, #0
-; NEON-NOSVE-NEXT: ushll v5.8h, v5.8b, #0
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h
-; NEON-NOSVE-NEXT: ushll2 v18.4s, v16.8h, #0
-; NEON-NOSVE-NEXT: ushll v16.4s, v16.4h, #0
-; NEON-NOSVE-NEXT: udivr z17.s, p0/m, z17.s, z18.s
-; NEON-NOSVE-NEXT: ushll2 v18.4s, v5.8h, #0
-; NEON-NOSVE-NEXT: ushll v5.4s, v5.4h, #0
-; NEON-NOSVE-NEXT: udivr z7.s, p0/m, z7.s, z16.s
-; NEON-NOSVE-NEXT: ushll2 v16.4s, v6.8h, #0
-; NEON-NOSVE-NEXT: ushll v6.4s, v6.4h, #0
-; NEON-NOSVE-NEXT: udivr z16.s, p0/m, z16.s, z18.s
-; NEON-NOSVE-NEXT: udiv z5.s, p0/m, z5.s, z6.s
-; NEON-NOSVE-NEXT: udiv z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: uzp1 v3.8h, v7.8h, v17.8h
-; NEON-NOSVE-NEXT: uzp1 v5.8h, v5.8h, v16.8h
-; NEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h
-; NEON-NOSVE-NEXT: uzp1 v2.16b, v5.16b, v3.16b
-; NEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b
-; NEON-NOSVE-NEXT: stp q2, q0, [x0]
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v32i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #96
@@ -1678,6 +1614,52 @@ define void @udiv_v32i8(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
; NONEON-NOSVE-NEXT: add sp, sp, #96
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v32i8:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ldp q6, q3, [x1]
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: ldr q2, [x0, #16]
+; NEON-NOSVE-NEXT: ushll2 v1.8h, v3.16b, #0
+; NEON-NOSVE-NEXT: ushll2 v4.8h, v2.16b, #0
+; NEON-NOSVE-NEXT: ushll v3.8h, v3.8b, #0
+; NEON-NOSVE-NEXT: ushll v2.8h, v2.8b, #0
+; NEON-NOSVE-NEXT: ushll2 v7.8h, v6.16b, #0
+; NEON-NOSVE-NEXT: ushll v6.8h, v6.8b, #0
+; NEON-NOSVE-NEXT: ushll2 v0.4s, v1.8h, #0
+; NEON-NOSVE-NEXT: ushll2 v5.4s, v4.8h, #0
+; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: ushll v4.4s, v4.4h, #0
+; NEON-NOSVE-NEXT: ushll2 v17.4s, v7.8h, #0
+; NEON-NOSVE-NEXT: ushll v7.4s, v7.4h, #0
+; NEON-NOSVE-NEXT: udivr z0.s, p0/m, z0.s, z5.s
+; NEON-NOSVE-NEXT: ushll2 v5.4s, v2.8h, #0
+; NEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0
+; NEON-NOSVE-NEXT: udivr z1.s, p0/m, z1.s, z4.s
+; NEON-NOSVE-NEXT: ushll2 v4.4s, v3.8h, #0
+; NEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0
+; NEON-NOSVE-NEXT: udivr z4.s, p0/m, z4.s, z5.s
+; NEON-NOSVE-NEXT: ldr q5, [x0]
+; NEON-NOSVE-NEXT: ushll2 v16.8h, v5.16b, #0
+; NEON-NOSVE-NEXT: ushll v5.8h, v5.8b, #0
+; NEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; NEON-NOSVE-NEXT: ushll2 v18.4s, v16.8h, #0
+; NEON-NOSVE-NEXT: ushll v16.4s, v16.4h, #0
+; NEON-NOSVE-NEXT: udivr z17.s, p0/m, z17.s, z18.s
+; NEON-NOSVE-NEXT: ushll2 v18.4s, v5.8h, #0
+; NEON-NOSVE-NEXT: ushll v5.4s, v5.4h, #0
+; NEON-NOSVE-NEXT: udivr z7.s, p0/m, z7.s, z16.s
+; NEON-NOSVE-NEXT: ushll2 v16.4s, v6.8h, #0
+; NEON-NOSVE-NEXT: ushll v6.4s, v6.4h, #0
+; NEON-NOSVE-NEXT: udivr z16.s, p0/m, z16.s, z18.s
+; NEON-NOSVE-NEXT: udiv z5.s, p0/m, z5.s, z6.s
+; NEON-NOSVE-NEXT: udiv z2.s, p0/m, z2.s, z3.s
+; NEON-NOSVE-NEXT: uzp1 v3.8h, v7.8h, v17.8h
+; NEON-NOSVE-NEXT: uzp1 v5.8h, v5.8h, v16.8h
+; NEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h
+; NEON-NOSVE-NEXT: uzp1 v2.16b, v5.16b, v3.16b
+; NEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b
+; NEON-NOSVE-NEXT: stp q2, q0, [x0]
+; NEON-NOSVE-NEXT: ret
%op1 = load <32 x i8>, ptr %a
%op2 = load <32 x i8>, ptr %b
%res = udiv <32 x i8> %op1, %op2
@@ -1697,16 +1679,6 @@ define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v2i16:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: movi d2, #0x00ffff0000ffff
-; NEON-NOSVE-NEXT: ptrue p0.s, vl2
-; NEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b
-; NEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b
-; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v2i16:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -1722,6 +1694,15 @@ define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v2i16:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: movi d2, #0x00ffff0000ffff
+; NEON-NOSVE-NEXT: ptrue p0.s, vl2
+; NEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b
+; NEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b
+; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
+; NEON-NOSVE-NEXT: ret
%res = udiv <2 x i16> %op1, %op2
ret <2 x i16> %res
}
@@ -1739,15 +1720,6 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v4i16:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: xtn v0.4h, v0.4s
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v4i16:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -1772,6 +1744,14 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v4i16:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: xtn v0.4h, v0.4s
+; NEON-NOSVE-NEXT: ret
%res = udiv <4 x i16> %op1, %op2
ret <4 x i16> %res
}
@@ -1797,18 +1777,6 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v8i16:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ushll2 v2.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: ushll2 v3.4s, v0.8h, #0
-; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v8i16:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
@@ -1848,6 +1816,17 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
; NONEON-NOSVE-NEXT: add sp, sp, #48
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v8i16:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ushll2 v2.4s, v1.8h, #0
+; NEON-NOSVE-NEXT: ushll2 v3.4s, v0.8h, #0
+; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; NEON-NOSVE-NEXT: ret
%res = udiv <8 x i16> %op1, %op2
ret <8 x i16> %res
}
@@ -1885,29 +1864,6 @@ define void @udiv_v16i16(ptr %a, ptr %b) {
; CHECK-NEXT: stp q1, q2, [x0]
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v16i16:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ldp q4, q1, [x1]
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: ldr q0, [x0, #16]
-; NEON-NOSVE-NEXT: ushll2 v2.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: ushll2 v3.4s, v0.8h, #0
-; NEON-NOSVE-NEXT: ushll2 v5.4s, v4.8h, #0
-; NEON-NOSVE-NEXT: ushll v4.4s, v4.4h, #0
-; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: ldr q3, [x0]
-; NEON-NOSVE-NEXT: ushll2 v6.4s, v3.8h, #0
-; NEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0
-; NEON-NOSVE-NEXT: udivr z5.s, p0/m, z5.s, z6.s
-; NEON-NOSVE-NEXT: udiv z3.s, p0/m, z3.s, z4.s
-; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v5.8h
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; NEON-NOSVE-NEXT: stp q1, q0, [x0]
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v16i16:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #96
@@ -1984,6 +1940,28 @@ define void @udiv_v16i16(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
; NONEON-NOSVE-NEXT: add sp, sp, #96
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v16i16:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ldp q4, q1, [x1]
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: ldr q0, [x0, #16]
+; NEON-NOSVE-NEXT: ushll2 v2.4s, v1.8h, #0
+; NEON-NOSVE-NEXT: ushll2 v3.4s, v0.8h, #0
+; NEON-NOSVE-NEXT: ushll2 v5.4s, v4.8h, #0
+; NEON-NOSVE-NEXT: ushll v4.4s, v4.4h, #0
+; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; NEON-NOSVE-NEXT: ldr q3, [x0]
+; NEON-NOSVE-NEXT: ushll2 v6.4s, v3.8h, #0
+; NEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0
+; NEON-NOSVE-NEXT: udivr z5.s, p0/m, z5.s, z6.s
+; NEON-NOSVE-NEXT: udiv z3.s, p0/m, z3.s, z4.s
+; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v5.8h
+; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; NEON-NOSVE-NEXT: stp q1, q0, [x0]
+; NEON-NOSVE-NEXT: ret
%op1 = load <16 x i16>, ptr %a
%op2 = load <16 x i16>, ptr %b
%res = udiv <16 x i16> %op1, %op2
@@ -2001,15 +1979,6 @@ define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v2i32:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ptrue p0.s, vl2
-; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $z0
-; NEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $z1
-; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v2i32:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -2024,6 +1993,14 @@ define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v2i32:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ptrue p0.s, vl2
+; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $z0
+; NEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $z1
+; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
+; NEON-NOSVE-NEXT: ret
%res = udiv <2 x i32> %op1, %op2
ret <2 x i32> %res
}
@@ -2038,15 +2015,6 @@ define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v4i32:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 def $z0
-; NEON-NOSVE-NEXT: // kill: def $q1 killed $q1 def $z1
-; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 killed $z0
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v4i32:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
@@ -2066,6 +2034,14 @@ define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
; NONEON-NOSVE-NEXT: add sp, sp, #48
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v4i32:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; NEON-NOSVE-NEXT: // kill: def $q1 killed $q1 def $z1
+; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 killed $z0
+; NEON-NOSVE-NEXT: ret
%res = udiv <4 x i32> %op1, %op2
ret <4 x i32> %res
}
@@ -2082,17 +2058,6 @@ define void @udiv_v8i32(ptr %a, ptr %b) {
; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v8i32:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ldp q0, q3, [x1]
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: ldp q1, q2, [x0]
-; NEON-NOSVE-NEXT: udivr z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: movprfx z1, z2
-; NEON-NOSVE-NEXT: udiv z1.s, p0/m, z1.s, z3.s
-; NEON-NOSVE-NEXT: stp q0, q1, [x0]
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v8i32:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #96
@@ -2129,6 +2094,16 @@ define void @udiv_v8i32(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
; NONEON-NOSVE-NEXT: add sp, sp, #96
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v8i32:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ldp q0, q3, [x1]
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: ldp q1, q2, [x0]
+; NEON-NOSVE-NEXT: udivr z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: movprfx z1, z2
+; NEON-NOSVE-NEXT: udiv z1.s, p0/m, z1.s, z3.s
+; NEON-NOSVE-NEXT: stp q0, q1, [x0]
+; NEON-NOSVE-NEXT: ret
%op1 = load <8 x i32>, ptr %a
%op2 = load <8 x i32>, ptr %b
%res = udiv <8 x i32> %op1, %op2
@@ -2146,15 +2121,6 @@ define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v1i64:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ptrue p0.d, vl1
-; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $z0
-; NEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $z1
-; NEON-NOSVE-NEXT: udiv z0.d, p0/m, z0.d, z1.d
-; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v1i64:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #16
@@ -2166,6 +2132,14 @@ define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #8]
; NONEON-NOSVE-NEXT: add sp, sp, #16
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v1i64:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ptrue p0.d, vl1
+; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $z0
+; NEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $z1
+; NEON-NOSVE-NEXT: udiv z0.d, p0/m, z0.d, z1.d
+; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
+; NEON-NOSVE-NEXT: ret
%res = udiv <1 x i64> %op1, %op2
ret <1 x i64> %res
}
@@ -2180,15 +2154,6 @@ define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v2i64:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ptrue p0.d, vl2
-; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 def $z0
-; NEON-NOSVE-NEXT: // kill: def $q1 killed $q1 def $z1
-; NEON-NOSVE-NEXT: udiv z0.d, p0/m, z0.d, z1.d
-; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 killed $z0
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v2i64:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
@@ -2202,6 +2167,14 @@ define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
; NONEON-NOSVE-NEXT: add sp, sp, #48
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v2i64:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ptrue p0.d, vl2
+; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; NEON-NOSVE-NEXT: // kill: def $q1 killed $q1 def $z1
+; NEON-NOSVE-NEXT: udiv z0.d, p0/m, z0.d, z1.d
+; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 killed $z0
+; NEON-NOSVE-NEXT: ret
%res = udiv <2 x i64> %op1, %op2
ret <2 x i64> %res
}
@@ -2218,17 +2191,6 @@ define void @udiv_v4i64(ptr %a, ptr %b) {
; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v4i64:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ldp q0, q3, [x1]
-; NEON-NOSVE-NEXT: ptrue p0.d, vl2
-; NEON-NOSVE-NEXT: ldp q1, q2, [x0]
-; NEON-NOSVE-NEXT: udivr z0.d, p0/m, z0.d, z1.d
-; NEON-NOSVE-NEXT: movprfx z1, z2
-; NEON-NOSVE-NEXT: udiv z1.d, p0/m, z1.d, z3.d
-; NEON-NOSVE-NEXT: stp q0, q1, [x0]
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v4i64:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #96
@@ -2253,6 +2215,16 @@ define void @udiv_v4i64(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
; NONEON-NOSVE-NEXT: add sp, sp, #96
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v4i64:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ldp q0, q3, [x1]
+; NEON-NOSVE-NEXT: ptrue p0.d, vl2
+; NEON-NOSVE-NEXT: ldp q1, q2, [x0]
+; NEON-NOSVE-NEXT: udivr z0.d, p0/m, z0.d, z1.d
+; NEON-NOSVE-NEXT: movprfx z1, z2
+; NEON-NOSVE-NEXT: udiv z1.d, p0/m, z1.d, z3.d
+; NEON-NOSVE-NEXT: stp q0, q1, [x0]
+; NEON-NOSVE-NEXT: ret
%op1 = load <4 x i64>, ptr %a
%op2 = load <4 x i64>, ptr %b
%res = udiv <4 x i64> %op1, %op2
@@ -2263,20 +2235,13 @@ define void @udiv_v4i64(ptr %a, ptr %b) {
define void @udiv_constantsplat_v8i32(ptr %a) {
; SVE-LABEL: udiv_constantsplat_v8i32:
; SVE: // %bb.0:
-; SVE-NEXT: mov w8, #8969 // =0x2309
+; SVE-NEXT: mov w8, #37251 // =0x9183
; SVE-NEXT: ldp q1, q2, [x0]
-; SVE-NEXT: movk w8, #22765, lsl #16
+; SVE-NEXT: movk w8, #44150, lsl #16
; SVE-NEXT: ptrue p0.s, vl4
; SVE-NEXT: mov z0.s, w8
-; SVE-NEXT: movprfx z3, z1
-; SVE-NEXT: umulh z3.s, p0/m, z3.s, z0.s
+; SVE-NEXT: umulh z1.s, p0/m, z1.s, z0.s
; SVE-NEXT: umulh z0.s, p0/m, z0.s, z2.s
-; SVE-NEXT: sub z1.s, z1.s, z3.s
-; SVE-NEXT: sub z2.s, z2.s, z0.s
-; SVE-NEXT: lsr z1.s, z1.s, #1
-; SVE-NEXT: lsr z2.s, z2.s, #1
-; SVE-NEXT: add z1.s, z1.s, z3.s
-; SVE-NEXT: add z0.s, z2.s, z0.s
; SVE-NEXT: lsr z1.s, z1.s, #6
; SVE-NEXT: lsr z0.s, z0.s, #6
; SVE-NEXT: stp q1, q0, [x0]
@@ -2284,21 +2249,58 @@ define void @udiv_constantsplat_v8i32(ptr %a) {
;
; SVE2-LABEL: udiv_constantsplat_v8i32:
; SVE2: // %bb.0:
-; SVE2-NEXT: mov w8, #8969 // =0x2309
+; SVE2-NEXT: mov w8, #37251 // =0x9183
; SVE2-NEXT: ldp q1, q2, [x0]
-; SVE2-NEXT: movk w8, #22765, lsl #16
+; SVE2-NEXT: movk w8, #44150, lsl #16
; SVE2-NEXT: mov z0.s, w8
-; SVE2-NEXT: umulh z3.s, z1.s, z0.s
+; SVE2-NEXT: umulh z1.s, z1.s, z0.s
; SVE2-NEXT: umulh z0.s, z2.s, z0.s
-; SVE2-NEXT: sub z1.s, z1.s, z3.s
-; SVE2-NEXT: sub z2.s, z2.s, z0.s
-; SVE2-NEXT: usra z3.s, z1.s, #1
-; SVE2-NEXT: usra z0.s, z2.s, #1
-; SVE2-NEXT: lsr z1.s, z3.s, #6
+; SVE2-NEXT: lsr z1.s, z1.s, #6
; SVE2-NEXT: lsr z0.s, z0.s, #6
; SVE2-NEXT: stp q1, q0, [x0]
; SVE2-NEXT: ret
;
+; NONEON-NOSVE-LABEL: udiv_constantsplat_v8i32:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT: mov w8, #37251 // =0x9183
+; NONEON-NOSVE-NEXT: movk w8, #44150, lsl #16
+; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT: ldr w9, [sp, #28]
+; NONEON-NOSVE-NEXT: umull x9, w9, w8
+; NONEON-NOSVE-NEXT: lsr x10, x9, #38
+; NONEON-NOSVE-NEXT: ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT: umull x9, w9, w8
+; NONEON-NOSVE-NEXT: lsr x9, x9, #38
+; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT: ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT: umull x9, w9, w8
+; NONEON-NOSVE-NEXT: lsr x11, x9, #38
+; NONEON-NOSVE-NEXT: ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT: umull x9, w9, w8
+; NONEON-NOSVE-NEXT: lsr x9, x9, #38
+; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #48]
+; NONEON-NOSVE-NEXT: ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT: umull x9, w9, w8
+; NONEON-NOSVE-NEXT: lsr x9, x9, #38
+; NONEON-NOSVE-NEXT: str w9, [sp, #44]
+; NONEON-NOSVE-NEXT: ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT: umull x9, w9, w8
+; NONEON-NOSVE-NEXT: lsr x9, x9, #38
+; NONEON-NOSVE-NEXT: str w9, [sp, #40]
+; NONEON-NOSVE-NEXT: ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT: umull x9, w9, w8
+; NONEON-NOSVE-NEXT: lsr x9, x9, #38
+; NONEON-NOSVE-NEXT: str w9, [sp, #36]
+; NONEON-NOSVE-NEXT: ldr w9, [sp]
+; NONEON-NOSVE-NEXT: umull x8, w9, w8
+; NONEON-NOSVE-NEXT: lsr x8, x8, #38
+; NONEON-NOSVE-NEXT: str w8, [sp, #32]
+; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #64
+; NONEON-NOSVE-NEXT: ret
; NEON-NOSVE-LABEL: udiv_constantsplat_v8i32:
; NEON-NOSVE: // %bb.0:
; NEON-NOSVE-NEXT: mov w8, #8969 // =0x2309
@@ -2319,70 +2321,6 @@ define void @udiv_constantsplat_v8i32(ptr %a) {
; NEON-NOSVE-NEXT: ushr v0.4s, v0.4s, #6
; NEON-NOSVE-NEXT: stp q1, q0, [x0]
; NEON-NOSVE-NEXT: ret
-;
-; NONEON-NOSVE-LABEL: udiv_constantsplat_v8i32:
-; NONEON-NOSVE: // %bb.0:
-; NONEON-NOSVE-NEXT: ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT: mov w8, #8969 // =0x2309
-; NONEON-NOSVE-NEXT: movk w8, #22765, lsl #16
-; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]!
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #28]
-; NONEON-NOSVE-NEXT: umull x10, w9, w8
-; NONEON-NOSVE-NEXT: lsr x10, x10, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w10
-; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w11, w9, #6
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #24]
-; NONEON-NOSVE-NEXT: umull x10, w9, w8
-; NONEON-NOSVE-NEXT: lsr x10, x10, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w10
-; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w9, w9, #6
-; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #56]
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #20]
-; NONEON-NOSVE-NEXT: umull x10, w9, w8
-; NONEON-NOSVE-NEXT: lsr x10, x10, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w10
-; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w11, w9, #6
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #16]
-; NONEON-NOSVE-NEXT: umull x10, w9, w8
-; NONEON-NOSVE-NEXT: lsr x10, x10, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w10
-; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w9, w9, #6
-; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #48]
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #12]
-; NONEON-NOSVE-NEXT: umull x10, w9, w8
-; NONEON-NOSVE-NEXT: lsr x10, x10, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w10
-; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w11, w9, #6
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #8]
-; NONEON-NOSVE-NEXT: umull x10, w9, w8
-; NONEON-NOSVE-NEXT: lsr x10, x10, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w10
-; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w9, w9, #6
-; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #40]
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #4]
-; NONEON-NOSVE-NEXT: umull x10, w9, w8
-; NONEON-NOSVE-NEXT: lsr x10, x10, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w10
-; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w11, w9, #6
-; NONEON-NOSVE-NEXT: ldr w9, [sp]
-; NONEON-NOSVE-NEXT: umull x8, w9, w8
-; NONEON-NOSVE-NEXT: lsr x8, x8, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w8
-; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w8, w8, #6
-; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32]
-; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32]
-; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT: add sp, sp, #64
-; NONEON-NOSVE-NEXT: ret
%op1 = load <8 x i32>, ptr %a
%res = udiv <8 x i32> %op1, <i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95>
store <8 x i32> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/urem-lkk.ll b/llvm/test/CodeGen/AArch64/urem-lkk.ll
index 2212e0a633414..0e70596318ef3 100644
--- a/llvm/test/CodeGen/AArch64/urem-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/urem-lkk.ll
@@ -4,14 +4,11 @@
define i32 @fold_urem_positive_odd(i32 %x) {
; CHECK-LABEL: fold_urem_positive_odd:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8969 // =0x2309
-; CHECK-NEXT: movk w8, #22765, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #32
-; CHECK-NEXT: sub w9, w0, w8
-; CHECK-NEXT: add w8, w8, w9, lsr #1
+; CHECK-NEXT: mov w8, #37251 // =0x9183
; CHECK-NEXT: mov w9, #95 // =0x5f
-; CHECK-NEXT: lsr w8, w8, #6
+; CHECK-NEXT: movk w8, #44150, lsl #16
+; CHECK-NEXT: umull x8, w0, w8
+; CHECK-NEXT: lsr x8, x8, #38
; CHECK-NEXT: msub w0, w8, w9, w0
; CHECK-NEXT: ret
%1 = urem i32 %x, 95
@@ -22,7 +19,7 @@ define i32 @fold_urem_positive_odd(i32 %x) {
define i32 @fold_urem_positive_even(i32 %x) {
; CHECK-LABEL: fold_urem_positive_even:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #16323 // =0x3fc3
+; CHECK-NEXT: mov w8, #16321 // =0x3fc1
; CHECK-NEXT: mov w9, #1060 // =0x424
; CHECK-NEXT: movk w8, #63310, lsl #16
; CHECK-NEXT: umull x8, w0, w8
@@ -38,14 +35,11 @@ define i32 @fold_urem_positive_even(i32 %x) {
define i32 @combine_urem_udiv(i32 %x) {
; CHECK-LABEL: combine_urem_udiv:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8969 // =0x2309
-; CHECK-NEXT: movk w8, #22765, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #32
-; CHECK-NEXT: sub w9, w0, w8
-; CHECK-NEXT: add w8, w8, w9, lsr #1
+; CHECK-NEXT: mov w8, #37251 // =0x9183
; CHECK-NEXT: mov w9, #95 // =0x5f
-; CHECK-NEXT: lsr w8, w8, #6
+; CHECK-NEXT: movk w8, #44150, lsl #16
+; CHECK-NEXT: umull x8, w0, w8
+; CHECK-NEXT: lsr x8, x8, #38
; CHECK-NEXT: msub w9, w8, w9, w0
; CHECK-NEXT: add w0, w9, w8
; CHECK-NEXT: ret
@@ -88,14 +82,14 @@ define i32 @dont_fold_urem_i32_umax(i32 %x) {
define i64 @dont_fold_urem_i64(i64 %x) {
; CHECK-LABEL: dont_fold_urem_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x9, #58849 // =0xe5e1
-; CHECK-NEXT: lsr x8, x0, #1
-; CHECK-NEXT: movk x9, #48148, lsl #16
-; CHECK-NEXT: movk x9, #33436, lsl #32
-; CHECK-NEXT: movk x9, #21399, lsl #48
-; CHECK-NEXT: umulh x8, x8, x9
+; CHECK-NEXT: mov x8, #42799 // =0xa72f
+; CHECK-NEXT: movk x8, #58848, lsl #16
+; CHECK-NEXT: movk x8, #48148, lsl #32
+; CHECK-NEXT: movk x8, #668, lsl #48
+; CHECK-NEXT: umulh x8, x0, x8
+; CHECK-NEXT: sub x9, x0, x8
+; CHECK-NEXT: add x8, x8, x9, lsr #1
; CHECK-NEXT: mov w9, #98 // =0x62
-; CHECK-NEXT: lsr x8, x8, #4
; CHECK-NEXT: msub x0, x8, x9, x0
; CHECK-NEXT: ret
%1 = urem i64 %x, 98
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
index ab67be9445ed3..80f3da01db42a 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
@@ -94,14 +94,13 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_urem_odd_undef1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #34079 // =0x851f
-; CHECK-NEXT: movk w8, #20971, lsl #16
+; CHECK-NEXT: mov w8, #28836 // =0x70a4
+; CHECK-NEXT: movk w8, #2621, lsl #16
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s
; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
; CHECK-NEXT: movi v2.4s, #25
-; CHECK-NEXT: ushr v1.4s, v1.4s, #3
; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
@@ -116,14 +115,16 @@ define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_urem_even_undef1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #34079 // =0x851f
-; CHECK-NEXT: movk w8, #20971, lsl #16
+; CHECK-NEXT: mov w8, #49807 // =0xc28f
+; CHECK-NEXT: movk w8, #10485, lsl #16
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s
; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: sub v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: usra v1.4s, v2.4s, #1
; CHECK-NEXT: movi v2.4s, #100
-; CHECK-NEXT: ushr v1.4s, v1.4s, #5
+; CHECK-NEXT: ushr v1.4s, v1.4s, #4
; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
diff --git a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
index 468a33ce5bfcf..b33137549daed 100644
--- a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
@@ -5,23 +5,20 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; CHECK-LABEL: fold_urem_vec_1:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: movi v2.2s, #128, lsl #24
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0]
; CHECK-NEXT: adrp x8, .LCPI0_1
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_1]
-; CHECK-NEXT: adrp x8, .LCPI0_2
-; CHECK-NEXT: ushl v1.4h, v0.4h, v1.4h
-; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h
-; CHECK-NEXT: movi d2, #0000000000000000
-; CHECK-NEXT: shrn v1.4h, v1.4s, #16
+; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
; CHECK-NEXT: fneg d2, d2
+; CHECK-NEXT: shrn v1.4h, v1.4s, #16
; CHECK-NEXT: sub v3.4h, v0.4h, v1.4h
; CHECK-NEXT: umull v2.4s, v3.4h, v2.4h
; CHECK-NEXT: shrn v2.4h, v2.4s, #16
; CHECK-NEXT: add v1.4h, v2.4h, v1.4h
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_2]
-; CHECK-NEXT: adrp x8, .LCPI0_3
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_1]
+; CHECK-NEXT: adrp x8, .LCPI0_2
; CHECK-NEXT: ushl v1.4h, v1.4h, v2.4h
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_3]
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_2]
; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h
; CHECK-NEXT: ret
%1 = urem <4 x i16> %x, <i16 95, i16 124, i16 98, i16 1003>
@@ -31,12 +28,11 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; CHECK-LABEL: fold_urem_vec_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #44151 // =0xac77
+; CHECK-NEXT: mov w8, #690 // =0x2b2
; CHECK-NEXT: movi v2.4h, #95
; CHECK-NEXT: dup v1.4h, w8
; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
-; CHECK-NEXT: ushr v1.4s, v1.4s, #22
-; CHECK-NEXT: xtn v1.4h, v1.4s
+; CHECK-NEXT: shrn v1.4h, v1.4s, #16
; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h
; CHECK-NEXT: ret
%1 = urem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
@@ -48,12 +44,11 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; CHECK-LABEL: combine_urem_udiv:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #44151 // =0xac77
+; CHECK-NEXT: mov w8, #690 // =0x2b2
; CHECK-NEXT: movi v2.4h, #95
; CHECK-NEXT: dup v1.4h, w8
; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
-; CHECK-NEXT: ushr v1.4s, v1.4s, #22
-; CHECK-NEXT: xtn v1.4h, v1.4s
+; CHECK-NEXT: shrn v1.4h, v1.4s, #16
; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h
; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
@@ -72,11 +67,8 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI3_0]
; CHECK-NEXT: adrp x8, .LCPI3_1
; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI3_1]
-; CHECK-NEXT: adrp x8, .LCPI3_2
; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
; CHECK-NEXT: shrn v1.4h, v1.4s, #16
-; CHECK-NEXT: ushl v1.4h, v1.4h, v2.4h
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI3_2]
; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h
; CHECK-NEXT: ret
%1 = urem <4 x i16> %x, <i16 64, i16 32, i16 8, i16 95>
@@ -88,25 +80,19 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; CHECK-LABEL: dont_fold_urem_one:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI4_0
-; CHECK-NEXT: movi d4, #0x0000000000ffff
+; CHECK-NEXT: movi d3, #0x0000000000ffff
+; CHECK-NEXT: movi d4, #0xffffffffffff0000
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_0]
; CHECK-NEXT: adrp x8, .LCPI4_1
-; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_1]
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_1]
; CHECK-NEXT: adrp x8, .LCPI4_2
; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
; CHECK-NEXT: shrn v1.4h, v1.4s, #16
-; CHECK-NEXT: sub v2.4h, v0.4h, v1.4h
-; CHECK-NEXT: umull v2.4s, v2.4h, v3.4h
-; CHECK-NEXT: movi d3, #0xffffffffffff0000
-; CHECK-NEXT: shrn v2.4h, v2.4s, #16
-; CHECK-NEXT: add v1.4h, v2.4h, v1.4h
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_2]
-; CHECK-NEXT: adrp x8, .LCPI4_3
; CHECK-NEXT: ushl v1.4h, v1.4h, v2.4h
-; CHECK-NEXT: and v2.8b, v0.8b, v4.8b
-; CHECK-NEXT: and v1.8b, v1.8b, v3.8b
+; CHECK-NEXT: and v2.8b, v0.8b, v3.8b
+; CHECK-NEXT: and v1.8b, v1.8b, v4.8b
; CHECK-NEXT: orr v1.8b, v2.8b, v1.8b
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_3]
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_2]
; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h
; CHECK-NEXT: ret
%1 = urem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
@@ -126,36 +112,33 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
; CHECK-LABEL: dont_fold_urem_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #17097 // =0x42c9
-; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: mov x10, v1.d[1]
-; CHECK-NEXT: movk x8, #45590, lsl #16
+; CHECK-NEXT: mov x8, #8547 // =0x2163
+; CHECK-NEXT: fmov x10, d1
+; CHECK-NEXT: mov x9, v1.d[1]
+; CHECK-NEXT: movk x8, #22795, lsl #16
+; CHECK-NEXT: mov x12, #35795 // =0x8bd3
; CHECK-NEXT: mov x11, v0.d[1]
-; CHECK-NEXT: mov x12, #12109 // =0x2f4d
-; CHECK-NEXT: movk x8, #34192, lsl #32
-; CHECK-NEXT: movk x12, #52170, lsl #16
+; CHECK-NEXT: movk x8, #17096, lsl #32
+; CHECK-NEXT: movk x12, #29426, lsl #16
+; CHECK-NEXT: mov x13, #54513 // =0xd4f1
+; CHECK-NEXT: movk x8, #45590, lsl #48
+; CHECK-NEXT: movk x12, #56339, lsl #32
+; CHECK-NEXT: movk x13, #400, lsl #16
+; CHECK-NEXT: umulh x8, x10, x8
+; CHECK-NEXT: movk x12, #12374, lsl #48
+; CHECK-NEXT: movk x13, #20242, lsl #32
+; CHECK-NEXT: movk x13, #6413, lsl #48
+; CHECK-NEXT: mov w14, #23 // =0x17
; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: movk x8, #25644, lsl #48
-; CHECK-NEXT: movk x12, #28749, lsl #32
-; CHECK-NEXT: umulh x8, x9, x8
-; CHECK-NEXT: movk x12, #49499, lsl #48
-; CHECK-NEXT: lsr x13, x11, #1
-; CHECK-NEXT: umulh x12, x10, x12
-; CHECK-NEXT: sub x14, x9, x8
-; CHECK-NEXT: add x8, x8, x14, lsr #1
-; CHECK-NEXT: mov x14, #21445 // =0x53c5
-; CHECK-NEXT: movk x14, #1603, lsl #16
-; CHECK-NEXT: movk x14, #15432, lsl #32
+; CHECK-NEXT: umulh x12, x9, x12
+; CHECK-NEXT: umulh x13, x11, x13
; CHECK-NEXT: lsr x8, x8, #4
-; CHECK-NEXT: movk x14, #25653, lsl #48
-; CHECK-NEXT: umulh x13, x13, x14
-; CHECK-NEXT: mov w14, #23 // =0x17
-; CHECK-NEXT: msub x8, x8, x14, x9
-; CHECK-NEXT: lsr x9, x12, #12
+; CHECK-NEXT: msub x8, x8, x14, x10
+; CHECK-NEXT: lsr x10, x12, #10
; CHECK-NEXT: mov w12, #5423 // =0x152f
-; CHECK-NEXT: msub x9, x9, x12, x10
+; CHECK-NEXT: msub x9, x10, x12, x9
+; CHECK-NEXT: lsr x10, x13, #6
; CHECK-NEXT: mov w12, #654 // =0x28e
-; CHECK-NEXT: lsr x10, x13, #7
; CHECK-NEXT: msub x10, x10, x12, x11
; CHECK-NEXT: fmov d1, x8
; CHECK-NEXT: mov v1.d[1], x9
@@ -168,12 +151,12 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
define <16 x i8> @fold_urem_v16i8(<16 x i8> %x) {
; CHECK-LABEL: fold_urem_v16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.16b, #205
+; CHECK-NEXT: movi v1.16b, #51
; CHECK-NEXT: umull2 v2.8h, v0.16b, v1.16b
; CHECK-NEXT: umull v1.8h, v0.8b, v1.8b
; CHECK-NEXT: uzp2 v1.16b, v1.16b, v2.16b
; CHECK-NEXT: movi v2.16b, #10
-; CHECK-NEXT: ushr v1.16b, v1.16b, #3
+; CHECK-NEXT: ushr v1.16b, v1.16b, #1
; CHECK-NEXT: mls v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
%1 = urem <16 x i8> %x, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
@@ -183,11 +166,11 @@ define <16 x i8> @fold_urem_v16i8(<16 x i8> %x) {
define <8 x i8> @fold_urem_v8i8(<8 x i8> %x) {
; CHECK-LABEL: fold_urem_v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.8b, #205
+; CHECK-NEXT: movi v1.8b, #51
; CHECK-NEXT: movi v2.8b, #10
; CHECK-NEXT: umull v1.8h, v0.8b, v1.8b
; CHECK-NEXT: shrn v1.8b, v1.8h, #8
-; CHECK-NEXT: ushr v1.8b, v1.8b, #3
+; CHECK-NEXT: ushr v1.8b, v1.8b, #1
; CHECK-NEXT: mls v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
%1 = urem <8 x i8> %x, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
@@ -197,13 +180,12 @@ define <8 x i8> @fold_urem_v8i8(<8 x i8> %x) {
define <8 x i16> @fold_urem_v8i16(<8 x i16> %x) {
; CHECK-LABEL: fold_urem_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #52429 // =0xcccd
-; CHECK-NEXT: dup v1.8h, w8
+; CHECK-NEXT: movi v1.16b, #51
; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h
; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
; CHECK-NEXT: uzp2 v1.8h, v1.8h, v2.8h
; CHECK-NEXT: movi v2.8h, #10
-; CHECK-NEXT: ushr v1.8h, v1.8h, #3
+; CHECK-NEXT: ushr v1.8h, v1.8h, #1
; CHECK-NEXT: mls v0.8h, v1.8h, v2.8h
; CHECK-NEXT: ret
%1 = urem <8 x i16> %x, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
@@ -213,11 +195,10 @@ define <8 x i16> @fold_urem_v8i16(<8 x i16> %x) {
define <4 x i16> @fold_urem_v4i16(<4 x i16> %x) {
; CHECK-LABEL: fold_urem_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #52429 // =0xcccd
+; CHECK-NEXT: movi v1.8b, #51
; CHECK-NEXT: movi v2.4h, #10
-; CHECK-NEXT: dup v1.4h, w8
; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
-; CHECK-NEXT: ushr v1.4s, v1.4s, #19
+; CHECK-NEXT: ushr v1.4s, v1.4s, #17
; CHECK-NEXT: xtn v1.4h, v1.4s
; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h
; CHECK-NEXT: ret
@@ -228,14 +209,12 @@ define <4 x i16> @fold_urem_v4i16(<4 x i16> %x) {
define <4 x i32> @fold_urem_v4i32(<4 x i32> %x) {
; CHECK-LABEL: fold_urem_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #52429 // =0xcccd
-; CHECK-NEXT: movk w8, #52428, lsl #16
-; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: movi v1.16b, #51
; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s
; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
; CHECK-NEXT: movi v2.4s, #10
-; CHECK-NEXT: ushr v1.4s, v1.4s, #3
+; CHECK-NEXT: ushr v1.4s, v1.4s, #1
; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
; CHECK-NEXT: ret
%1 = urem <4 x i32> %x, <i32 10, i32 10, i32 10, i32 10>
@@ -245,12 +224,10 @@ define <4 x i32> @fold_urem_v4i32(<4 x i32> %x) {
define <2 x i32> @fold_urem_v2i32(<2 x i32> %x) {
; CHECK-LABEL: fold_urem_v2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #52429 // =0xcccd
+; CHECK-NEXT: movi v1.8b, #51
; CHECK-NEXT: movi v2.2s, #10
-; CHECK-NEXT: movk w8, #52428, lsl #16
-; CHECK-NEXT: dup v1.2s, w8
; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: ushr v1.2d, v1.2d, #35
+; CHECK-NEXT: ushr v1.2d, v1.2d, #33
; CHECK-NEXT: xtn v1.2s, v1.2d
; CHECK-NEXT: mls v0.2s, v1.2s, v2.2s
; CHECK-NEXT: ret
@@ -262,15 +239,14 @@ define <2 x i64> @fold_urem_v2i64(<2 x i64> %x) {
; CHECK-LABEL: fold_urem_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov x10, d0
-; CHECK-NEXT: mov x8, #-3689348814741910324 // =0xcccccccccccccccc
+; CHECK-NEXT: mov x8, #3689348814741910323 // =0x3333333333333333
; CHECK-NEXT: mov x9, v0.d[1]
-; CHECK-NEXT: movk x8, #52429
; CHECK-NEXT: mov w12, #10 // =0xa
; CHECK-NEXT: umulh x11, x10, x8
; CHECK-NEXT: umulh x8, x9, x8
-; CHECK-NEXT: lsr x11, x11, #3
+; CHECK-NEXT: lsr x11, x11, #1
; CHECK-NEXT: msub x10, x11, x12, x10
-; CHECK-NEXT: lsr x8, x8, #3
+; CHECK-NEXT: lsr x8, x8, #1
; CHECK-NEXT: msub x8, x8, x12, x9
; CHECK-NEXT: fmov d0, x10
; CHECK-NEXT: mov v0.d[1], x8
@@ -284,11 +260,10 @@ define <1 x i64> @fold_urem_v1i64(<1 x i64> %x) {
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fmov x9, d0
-; CHECK-NEXT: mov x8, #-3689348814741910324 // =0xcccccccccccccccc
+; CHECK-NEXT: mov x8, #3689348814741910323 // =0x3333333333333333
; CHECK-NEXT: mov w10, #10 // =0xa
-; CHECK-NEXT: movk x8, #52429
; CHECK-NEXT: umulh x8, x9, x8
-; CHECK-NEXT: lsr x8, x8, #3
+; CHECK-NEXT: lsr x8, x8, #1
; CHECK-NEXT: msub x8, x8, x10, x9
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: ret
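
To make the constant changes above easier to follow (for example, the i8 divide-by-10 multiplier dropping from 205 to 51, with a correspondingly smaller shift), here is a rough, hand-written sketch of how a round-down magic number for 8-bit unsigned division can be found. It uses the (n + 1) increment variant with a brute-force check, so the helper name and structure are illustrative only; it is not the patch's actual implementation and not necessarily the exact instruction sequence the new lowering emits.

// Illustrative only: search for a round-down multiplier m = floor(2^(8+k) / d)
// and shift for unsigned 8-bit division by d, using the (n + 1) increment
// variant and an exhaustive check over all 8-bit inputs.
#include <cstdint>
#include <cstdio>
#include <optional>

struct RoundDownMagic {
  uint32_t Multiplier;
  unsigned Shift;
};

static std::optional<RoundDownMagic> findRoundDownMagic(uint32_t D) {
  for (unsigned K = 0; K <= 8; ++K) {
    uint32_t M = (1u << (8 + K)) / D; // truncating divide = round-down magic
    bool Works = true;
    for (uint32_t N = 0; N < 256 && Works; ++N)
      Works = ((M * (N + 1)) >> (8 + K)) == N / D;
    if (Works)
      return RoundDownMagic{M, 8 + K};
  }
  return std::nullopt; // no round-down magic in range; use the round-up form
}

int main() {
  if (auto Magic = findRoundDownMagic(10))
    std::printf("d=10: multiplier=%u shift=%u\n", Magic->Multiplier, Magic->Shift);
  // Prints multiplier=51 shift=9, matching the #51 constants in the tests above.
  return 0;
}
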
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
index cd01148fa7dd7..53d38aead2b2d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
@@ -227,11 +227,8 @@ define i32 @v_udiv_i32_oddk_denom(i32 %num) {
; CHECK-LABEL: v_udiv_i32_oddk_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, 0xb2a50881
-; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_mov_b32_e32 v1, 0xd952843f
+; CHECK-NEXT: v_mul_hi_u32 v0, v0, v1
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 20, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = udiv i32 %num, 1235195
@@ -242,15 +239,9 @@ define <2 x i32> @v_udiv_v2i32_oddk_denom(<2 x i32> %num) {
; CHECK-LABEL: v_udiv_v2i32_oddk_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v2, 0xb2a50881
-; CHECK-NEXT: v_mul_hi_u32 v3, v0, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT: v_mov_b32_e32 v2, 0xd952843f
+; CHECK-NEXT: v_mul_hi_u32 v0, v0, v2
+; CHECK-NEXT: v_mul_hi_u32 v1, v1, v2
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 20, v0
; CHECK-NEXT: v_lshrrev_b32_e32 v1, 20, v1
; CHECK-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index d15551365707b..4692c93245c02 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -980,7 +980,7 @@ define i64 @v_udiv_i64_oddk_denom(i64 %num) {
; CHECK-LABEL: v_udiv_i64_oddk_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v2, 0x1fb03c31
+; CHECK-NEXT: v_mov_b32_e32 v2, 0x1fb03c2f
; CHECK-NEXT: v_mov_b32_e32 v3, 0xd9528440
; CHECK-NEXT: v_mul_lo_u32 v4, v1, v2
; CHECK-NEXT: v_mul_lo_u32 v5, v0, v3
@@ -1013,7 +1013,7 @@ define <2 x i64> @v_udiv_v2i64_oddk_denom(<2 x i64> %num) {
; CHECK-LABEL: v_udiv_v2i64_oddk_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v4, 0x1fb03c31
+; CHECK-NEXT: v_mov_b32_e32 v4, 0x1fb03c2f
; CHECK-NEXT: v_mov_b32_e32 v5, 0xd9528440
; CHECK-NEXT: v_mul_lo_u32 v6, v1, v4
; CHECK-NEXT: v_mul_lo_u32 v7, v0, v5
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 7cf18171a6cd7..42c8d9dc002d5 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -5399,14 +5399,11 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
-; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881
+; GFX6-NEXT: v_mov_b32_e32 v0, 0xd952843f
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
@@ -5417,11 +5414,8 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_hi_u32 s2, s4, 0xb2a50881
-; GFX9-NEXT: s_sub_i32 s3, s4, s2
-; GFX9-NEXT: s_lshr_b32 s3, s3, 1
-; GFX9-NEXT: s_add_i32 s3, s3, s2
-; GFX9-NEXT: s_lshr_b32 s2, s3, 20
+; GFX9-NEXT: s_mul_hi_u32 s2, s4, 0xd952843f
+; GFX9-NEXT: s_lshr_b32 s2, s2, 20
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -5560,14 +5554,10 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, <
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0
+; GFX6-NEXT: v_mul_hi_u32 v1, s3, v0
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: s_lshr_b32 s0, s2, 12
; GFX6-NEXT: s_mov_b32 s5, s1
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s3, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v0
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
@@ -5577,12 +5567,8 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, <
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_hi_u32 s4, s3, 0x100101
-; GFX9-NEXT: s_sub_i32 s3, s3, s4
-; GFX9-NEXT: s_lshr_b32 s3, s3, 1
-; GFX9-NEXT: s_add_i32 s3, s3, s4
; GFX9-NEXT: s_lshr_b32 s2, s2, 12
-; GFX9-NEXT: s_lshr_b32 s3, s3, 11
+; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x100101
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -5786,14 +5772,11 @@ define amdgpu_kernel void @urem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
-; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881
+; GFX6-NEXT: v_mov_b32_e32 v0, 0xd952843f
; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0
; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2
; GFX6-NEXT: s_mov_b32 s2, -1
@@ -5807,11 +5790,8 @@ define amdgpu_kernel void @urem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_hi_u32 s2, s4, 0xb2a50881
-; GFX9-NEXT: s_sub_i32 s3, s4, s2
-; GFX9-NEXT: s_lshr_b32 s3, s3, 1
-; GFX9-NEXT: s_add_i32 s3, s3, s2
-; GFX9-NEXT: s_lshr_b32 s2, s3, 20
+; GFX9-NEXT: s_mul_hi_u32 s2, s4, 0xd952843f
+; GFX9-NEXT: s_lshr_b32 s2, s2, 20
; GFX9-NEXT: s_mul_i32 s2, s2, 0x12d8fb
; GFX9-NEXT: s_sub_i32 s2, s4, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
index 5fbcd0bf66999..7eec4bf885642 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
@@ -13,9 +13,9 @@ define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocaptu
; CHECK-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
; CHECK-NEXT: s_and_b32 s4, s0, 0xffff
; CHECK-NEXT: s_mov_b32 s1, 0
-; CHECK-NEXT: s_mul_i32 s6, s4, 0xaaab
+; CHECK-NEXT: s_mul_i32 s6, s4, 0x5555
; CHECK-NEXT: s_lshl_b64 s[4:5], s[0:1], 2
-; CHECK-NEXT: s_lshr_b32 s1, s6, 19
+; CHECK-NEXT: s_lshr_b32 s1, s6, 18
; CHECK-NEXT: s_mul_i32 s1, s1, 12
; CHECK-NEXT: s_sub_i32 s6, s0, s1
; CHECK-NEXT: s_and_b32 s7, s6, 0xffff
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index dfd9a650ff0e9..e291f829498a0 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -1192,7 +1192,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; SI-NEXT: s_mov_b32 s2, 0xfabbd9c1
+; SI-NEXT: s_mov_b32 s2, 0xfabbd9bf
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -1212,7 +1212,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s2, 0xfabbd9c1
+; VI-NEXT: s_mov_b32 s2, 0xfabbd9bf
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1228,7 +1228,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_dword v0, v[0:1]
-; GCN-NEXT: s_mov_b32 s2, 0xfabbd9c1
+; GCN-NEXT: s_mov_b32 s2, 0xfabbd9bf
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_hi_u32 v2, v0, s2
@@ -1244,7 +1244,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
-; GFX1030-NEXT: v_mul_hi_u32 v1, 0xfabbd9c1, v1
+; GFX1030-NEXT: v_mul_hi_u32 v1, 0xfabbd9bf, v1
; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 25, v1
; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT: s_endpgm
@@ -1263,7 +1263,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MULHI * T0.X, T0.X, literal.x,
-; EG-NEXT: -88352319(-4.876880e+35), 0(0.000000e+00)
+; EG-NEXT: -88352321(-4.876880e+35), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, PS, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 25(3.503246e-44), 2(2.802597e-45)
@@ -1286,12 +1286,12 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; SI-NEXT: s_mov_b32 s2, 0x7d5deca3
+; SI-NEXT: s_mov_b32 s2, 0x3eaef651
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_hi_u32 v0, v0, s2
-; SI-NEXT: v_lshrrev_b32_e32 v0, 24, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 23, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -1306,12 +1306,12 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s2, 0x7d5deca3
+; VI-NEXT: s_mov_b32 s2, 0x3eaef651
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_hi_u32 v0, v0, s2
-; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v0
+; VI-NEXT: v_lshrrev_b32_e32 v0, 23, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@@ -1322,12 +1322,12 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_dword v0, v[0:1]
-; GCN-NEXT: s_mov_b32 s2, 0x7d5deca3
+; GCN-NEXT: s_mov_b32 s2, 0x3eaef651
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_hi_u32 v2, v0, s2
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 24, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 23, v2
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
;
@@ -1338,8 +1338,8 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
-; GFX1030-NEXT: v_mul_hi_u32 v1, 0x7d5deca3, v1
-; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 24, v1
+; GFX1030-NEXT: v_mul_hi_u32 v1, 0x3eaef651, v1
+; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 23, v1
; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT: s_endpgm
;
@@ -1357,10 +1357,10 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MULHI * T0.X, T0.X, literal.x,
-; EG-NEXT: 2103307427(1.843675e+37), 0(0.000000e+00)
+; EG-NEXT: 1051653713(3.417230e-01), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, PS, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 24(3.363116e-44), 2(2.802597e-45)
+; EG-NEXT: 23(3.222986e-44), 2(2.802597e-45)
%b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
%a = load i32, ptr addrspace(1) %in
%result = udiv i32 %a, 34259183
@@ -2055,22 +2055,18 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; SI-NEXT: s_mov_b32 s0, 0x1389c755
+; SI-NEXT: s_mov_b32 s0, 0x4e271d53
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v0, 2, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 2, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3
; SI-NEXT: v_mul_hi_u32 v0, v0, s0
; SI-NEXT: v_mul_hi_u32 v1, v1, s0
; SI-NEXT: v_mul_hi_u32 v2, v2, s0
; SI-NEXT: v_mul_hi_u32 v3, v3, s0
-; SI-NEXT: v_lshrrev_b32_e32 v0, 10, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 10, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 10, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 10, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 14, v0
+; SI-NEXT: v_lshrrev_b32_e32 v1, 14, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 14, v2
+; SI-NEXT: v_lshrrev_b32_e32 v3, 14, v3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -2083,22 +2079,18 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; VI-NEXT: s_mov_b32 s0, 0x1389c755
+; VI-NEXT: s_mov_b32 s0, 0x4e271d53
; VI-NEXT: s_mov_b32 s4, s2
; VI-NEXT: s_mov_b32 s5, s3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v0, 2, v0
-; VI-NEXT: v_lshrrev_b32_e32 v1, 2, v1
-; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
-; VI-NEXT: v_lshrrev_b32_e32 v3, 2, v3
; VI-NEXT: v_mul_hi_u32 v0, v0, s0
; VI-NEXT: v_mul_hi_u32 v1, v1, s0
; VI-NEXT: v_mul_hi_u32 v2, v2, s0
; VI-NEXT: v_mul_hi_u32 v3, v3, s0
-; VI-NEXT: v_lshrrev_b32_e32 v0, 10, v0
-; VI-NEXT: v_lshrrev_b32_e32 v1, 10, v1
-; VI-NEXT: v_lshrrev_b32_e32 v2, 10, v2
-; VI-NEXT: v_lshrrev_b32_e32 v3, 10, v3
+; VI-NEXT: v_lshrrev_b32_e32 v0, 14, v0
+; VI-NEXT: v_lshrrev_b32_e32 v1, 14, v1
+; VI-NEXT: v_lshrrev_b32_e32 v2, 14, v2
+; VI-NEXT: v_lshrrev_b32_e32 v3, 14, v3
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@@ -2109,22 +2101,18 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GCN-NEXT: s_mov_b32 s0, 0x1389c755
+; GCN-NEXT: s_mov_b32 s0, 0x4e271d53
; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: v_mov_b32_e32 v5, s3
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 2, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v1
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 2, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 2, v3
; GCN-NEXT: v_mul_hi_u32 v0, v0, s0
; GCN-NEXT: v_mul_hi_u32 v1, v1, s0
; GCN-NEXT: v_mul_hi_u32 v2, v2, s0
; GCN-NEXT: v_mul_hi_u32 v3, v3, s0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 10, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 10, v1
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 10, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 10, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 14, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 14, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 14, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 14, v3
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
;
@@ -2135,18 +2123,14 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
-; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 2, v0
-; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 2, v1
-; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 2, v2
-; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 2, v3
-; GFX1030-NEXT: v_mul_hi_u32 v0, 0x1389c755, v0
-; GFX1030-NEXT: v_mul_hi_u32 v1, 0x1389c755, v1
-; GFX1030-NEXT: v_mul_hi_u32 v2, 0x1389c755, v2
-; GFX1030-NEXT: v_mul_hi_u32 v3, 0x1389c755, v3
-; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 10, v0
-; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 10, v1
-; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 10, v2
-; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 10, v3
+; GFX1030-NEXT: v_mul_hi_u32 v0, 0x4e271d53, v0
+; GFX1030-NEXT: v_mul_hi_u32 v1, 0x4e271d53, v1
+; GFX1030-NEXT: v_mul_hi_u32 v2, 0x4e271d53, v2
+; GFX1030-NEXT: v_mul_hi_u32 v3, 0x4e271d53, v3
+; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 14, v0
+; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 14, v1
+; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 14, v2
+; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 14, v3
; GFX1030-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX1030-NEXT: s_endpgm
;
@@ -2154,7 +2138,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 13, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -2163,27 +2147,20 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Y,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHR T0.W, T0.W, literal.x,
-; EG-NEXT: LSHR * T1.W, T0.Z, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MULHI * T0.Z, PV.W, literal.x,
-; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
-; EG-NEXT: LSHR T1.Z, T0.Y, literal.x,
-; EG-NEXT: LSHR T0.W, PS, literal.y,
-; EG-NEXT: MULHI * T0.Y, T1.W, literal.z,
-; EG-NEXT: 2(2.802597e-45), 10(1.401298e-44)
-; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
+; EG-NEXT: MULHI * T0.W, T0.W, literal.x,
+; EG-NEXT: 1311186259(7.009292e+08), 0(0.000000e+00)
+; EG-NEXT: LSHR T0.W, PS, literal.x,
+; EG-NEXT: MULHI * T0.Z, T0.Z, literal.y,
+; EG-NEXT: 14(1.961818e-44), 1311186259(7.009292e+08)
; EG-NEXT: LSHR T0.Z, PS, literal.x,
-; EG-NEXT: LSHR T1.W, T0.X, literal.y,
-; EG-NEXT: MULHI * T0.X, PV.Z, literal.z,
-; EG-NEXT: 10(1.401298e-44), 2(2.802597e-45)
-; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
+; EG-NEXT: MULHI * T0.Y, T0.Y, literal.y,
+; EG-NEXT: 14(1.961818e-44), 1311186259(7.009292e+08)
; EG-NEXT: LSHR T0.Y, PS, literal.x,
-; EG-NEXT: MULHI * T0.X, PV.W, literal.y,
-; EG-NEXT: 10(1.401298e-44), 327796565(3.478022e-27)
+; EG-NEXT: MULHI * T0.X, T0.X, literal.y,
+; EG-NEXT: 14(1.961818e-44), 1311186259(7.009292e+08)
; EG-NEXT: LSHR T0.X, PS, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Z, literal.y,
-; EG-NEXT: 10(1.401298e-44), 2(2.802597e-45)
+; EG-NEXT: 14(1.961818e-44), 2(2.802597e-45)
%1 = load <4 x i32>, ptr addrspace(1) %in, align 16
%2 = udiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
store <4 x i32> %2, ptr addrspace(1) %out, align 16
@@ -2254,12 +2231,11 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
; SI-LABEL: test_udiv_3_mulhu:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[2:3], 0x9
-; SI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab
+; SI-NEXT: v_mov_b32_e32 v0, 0x55555555
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mul_hi_u32 v0, s0, v0
-; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
@@ -2267,12 +2243,11 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
; VI-LABEL: test_udiv_3_mulhu:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[2:3], 0x24
-; VI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab
+; VI-NEXT: v_mov_b32_e32 v0, 0x55555555
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_hi_u32 v0, s0, v0
-; VI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
@@ -2280,10 +2255,9 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
; GCN-LABEL: test_udiv_3_mulhu:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s0, s[6:7], 0x0
-; GCN-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab
+; GCN-NEXT: v_mov_b32_e32 v0, 0x55555555
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mul_hi_u32 v0, s0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; GCN-NEXT: flat_store_dword v[0:1], v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
@@ -2292,8 +2266,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: s_mul_hi_u32 s0, s0, 0xaaaaaaab
-; GFX1030-NEXT: s_lshr_b32 s0, s0, 1
+; GFX1030-NEXT: s_mul_hi_u32 s0, s0, 0x55555555
; GFX1030-NEXT: v_mov_b32_e32 v0, s0
; GFX1030-NEXT: global_store_dword v[0:1], v0, off
; GFX1030-NEXT: s_waitcnt_vscnt null, 0x0
@@ -2301,16 +2274,14 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
;
; EG-LABEL: test_udiv_3_mulhu:
; EG: ; %bb.0:
-; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: MULHI * T0.X, KC0[2].Y, literal.x,
-; EG-NEXT: -1431655765(-3.031649e-13), 0(0.000000e+00)
-; EG-NEXT: LSHR T0.X, PS, 1,
-; EG-NEXT: MOV * T1.X, literal.x,
-; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
+; EG-NEXT: MOV T0.X, literal.x,
+; EG-NEXT: MULHI * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 0(0.000000e+00), 1431655765(1.466015e+13)
%i = udiv i32 %p, 3
store volatile i32 %i, ptr addrspace(1) undef
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/urem.ll b/llvm/test/CodeGen/AMDGPU/urem.ll
index 4b8127fef822d..d3f0b758be794 100644
--- a/llvm/test/CodeGen/AMDGPU/urem.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem.ll
@@ -21,7 +21,6 @@ define amdgpu_kernel void @test_urem_i32(ptr addrspace(1) %out, ptr addrspace(1)
; FUNC-LABEL: {{^}}test_urem_i32_7:
; SI: s_mov_b32 [[MAGIC:s[0-9]+]], 0x24924925
; SI: v_mul_hi_u32 {{v[0-9]+}}, {{v[0-9]+}}, [[MAGIC]]
-; SI: v_sub_{{[iu]}}32
; SI: v_mul_lo_u32
; SI: v_subrev_{{[iu]}}32
; SI: buffer_store_dword
diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
index cc38e250f183f..8a7d5ffff6fd6 100644
--- a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
+++ b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
@@ -621,9 +621,9 @@ define i64 @test_ds_cross_basic_blocks(ptr %arg, i32 signext %arg1) {
; CHECK-NEXT: .LBB6_4: # %bb5
; CHECK-NEXT: #
; CHECK-NEXT: lbzu r30, 1(r5)
-; CHECK-NEXT: mulli r29, r30, 171
-; CHECK-NEXT: rlwinm r28, r29, 24, 8, 30
-; CHECK-NEXT: srwi r29, r29, 9
+; CHECK-NEXT: mulli r29, r30, 85
+; CHECK-NEXT: rlwinm r28, r29, 25, 7, 30
+; CHECK-NEXT: srwi r29, r29, 8
; CHECK-NEXT: add r29, r29, r28
; CHECK-NEXT: sub r30, r30, r29
; CHECK-NEXT: clrlwi r30, r30, 24
diff --git a/llvm/test/CodeGen/PowerPC/urem-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-lkk.ll
index 43a1e5a2faf6d..b75bcd82d4468 100644
--- a/llvm/test/CodeGen/PowerPC/urem-lkk.ll
+++ b/llvm/test/CodeGen/PowerPC/urem-lkk.ll
@@ -5,12 +5,9 @@
define i32 @fold_urem_positive_odd(i32 %x) {
; CHECK-LABEL: fold_urem_positive_odd:
; CHECK: # %bb.0:
-; CHECK-NEXT: lis 4, 22765
-; CHECK-NEXT: ori 4, 4, 8969
+; CHECK-NEXT: lis 4, -21386
+; CHECK-NEXT: ori 4, 4, 37251
; CHECK-NEXT: mulhwu 4, 3, 4
-; CHECK-NEXT: sub 5, 3, 4
-; CHECK-NEXT: srwi 5, 5, 1
-; CHECK-NEXT: add 4, 5, 4
; CHECK-NEXT: srwi 4, 4, 6
; CHECK-NEXT: mulli 4, 4, 95
; CHECK-NEXT: sub 3, 3, 4
@@ -24,7 +21,7 @@ define i32 @fold_urem_positive_even(i32 %x) {
; CHECK-LABEL: fold_urem_positive_even:
; CHECK: # %bb.0:
; CHECK-NEXT: lis 4, -2226
-; CHECK-NEXT: ori 4, 4, 16323
+; CHECK-NEXT: ori 4, 4, 16321
; CHECK-NEXT: mulhwu 4, 3, 4
; CHECK-NEXT: srwi 4, 4, 10
; CHECK-NEXT: mulli 4, 4, 1060
@@ -39,12 +36,9 @@ define i32 @fold_urem_positive_even(i32 %x) {
define i32 @combine_urem_udiv(i32 %x) {
; CHECK-LABEL: combine_urem_udiv:
; CHECK: # %bb.0:
-; CHECK-NEXT: lis 4, 22765
-; CHECK-NEXT: ori 4, 4, 8969
+; CHECK-NEXT: lis 4, -21386
+; CHECK-NEXT: ori 4, 4, 37251
; CHECK-NEXT: mulhwu 4, 3, 4
-; CHECK-NEXT: sub 5, 3, 4
-; CHECK-NEXT: srwi 5, 5, 1
-; CHECK-NEXT: add 4, 5, 4
; CHECK-NEXT: srwi 4, 4, 6
; CHECK-NEXT: mulli 5, 4, 95
; CHECK-NEXT: sub 3, 3, 5
diff --git a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
index a2ad2946cc8ec..8863a9da800f0 100644
--- a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
@@ -810,39 +810,35 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
; P9LE-LABEL: dont_fold_urem_i64:
; P9LE: # %bb.0:
-; P9LE-NEXT: lis r4, 1602
+; P9LE-NEXT: lis r4, 5698
; P9LE-NEXT: mfvsrld r3, v3
+; P9LE-NEXT: lis r5, 12374
; P9LE-NEXT: ori r4, r4, 51289
-; P9LE-NEXT: rldic r4, r4, 36, 1
-; P9LE-NEXT: oris r4, r4, 45590
-; P9LE-NEXT: ori r4, r4, 17097
+; P9LE-NEXT: ori r5, r5, 56339
+; P9LE-NEXT: rldic r4, r4, 35, 0
+; P9LE-NEXT: rldic r5, r5, 32, 2
+; P9LE-NEXT: oris r4, r4, 22795
+; P9LE-NEXT: oris r5, r5, 29426
+; P9LE-NEXT: ori r4, r4, 8547
+; P9LE-NEXT: ori r5, r5, 35795
; P9LE-NEXT: mulhdu r4, r3, r4
-; P9LE-NEXT: sub r5, r3, r4
-; P9LE-NEXT: rldicl r5, r5, 63, 1
-; P9LE-NEXT: add r4, r5, r4
-; P9LE-NEXT: lis r5, -16037
; P9LE-NEXT: rldicl r4, r4, 60, 4
-; P9LE-NEXT: ori r5, r5, 28749
; P9LE-NEXT: mulli r4, r4, 23
-; P9LE-NEXT: rldic r5, r5, 32, 0
-; P9LE-NEXT: oris r5, r5, 52170
-; P9LE-NEXT: ori r5, r5, 12109
; P9LE-NEXT: sub r3, r3, r4
; P9LE-NEXT: mfvsrd r4, v3
; P9LE-NEXT: mulhdu r5, r4, r5
-; P9LE-NEXT: rldicl r5, r5, 52, 12
+; P9LE-NEXT: rldicl r5, r5, 54, 10
; P9LE-NEXT: mulli r5, r5, 5423
; P9LE-NEXT: sub r4, r4, r5
-; P9LE-NEXT: lis r5, 3206
-; P9LE-NEXT: ori r5, r5, 42889
; P9LE-NEXT: mtvsrdd v3, r4, r3
+; P9LE-NEXT: lis r4, 3206
; P9LE-NEXT: mfvsrd r3, v2
-; P9LE-NEXT: rldic r5, r5, 35, 1
-; P9LE-NEXT: rldicl r4, r3, 63, 1
-; P9LE-NEXT: oris r5, r5, 1603
-; P9LE-NEXT: ori r5, r5, 21445
-; P9LE-NEXT: mulhdu r4, r4, r5
-; P9LE-NEXT: rldicl r4, r4, 57, 7
+; P9LE-NEXT: ori r4, r4, 42889
+; P9LE-NEXT: rldic r4, r4, 33, 3
+; P9LE-NEXT: oris r4, r4, 400
+; P9LE-NEXT: ori r4, r4, 54513
+; P9LE-NEXT: mulhdu r4, r3, r4
+; P9LE-NEXT: rldicl r4, r4, 58, 6
; P9LE-NEXT: mulli r4, r4, 654
; P9LE-NEXT: sub r3, r3, r4
; P9LE-NEXT: li r4, 0
@@ -851,39 +847,35 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
;
; P9BE-LABEL: dont_fold_urem_i64:
; P9BE: # %bb.0:
-; P9BE-NEXT: lis r4, 1602
-; P9BE-NEXT: mfvsrd r3, v3
-; P9BE-NEXT: ori r4, r4, 51289
-; P9BE-NEXT: rldic r4, r4, 36, 1
-; P9BE-NEXT: oris r4, r4, 45590
-; P9BE-NEXT: ori r4, r4, 17097
+; P9BE-NEXT: lis r4, 12374
+; P9BE-NEXT: mfvsrld r3, v3
+; P9BE-NEXT: lis r5, 5698
+; P9BE-NEXT: ori r4, r4, 56339
+; P9BE-NEXT: ori r5, r5, 51289
+; P9BE-NEXT: rldic r4, r4, 32, 2
+; P9BE-NEXT: rldic r5, r5, 35, 0
+; P9BE-NEXT: oris r4, r4, 29426
+; P9BE-NEXT: oris r5, r5, 22795
+; P9BE-NEXT: ori r4, r4, 35795
+; P9BE-NEXT: ori r5, r5, 8547
; P9BE-NEXT: mulhdu r4, r3, r4
-; P9BE-NEXT: sub r5, r3, r4
-; P9BE-NEXT: rldicl r5, r5, 63, 1
-; P9BE-NEXT: add r4, r5, r4
-; P9BE-NEXT: lis r5, -16037
-; P9BE-NEXT: rldicl r4, r4, 60, 4
-; P9BE-NEXT: ori r5, r5, 28749
-; P9BE-NEXT: mulli r4, r4, 23
-; P9BE-NEXT: rldic r5, r5, 32, 0
-; P9BE-NEXT: oris r5, r5, 52170
-; P9BE-NEXT: ori r5, r5, 12109
+; P9BE-NEXT: rldicl r4, r4, 54, 10
+; P9BE-NEXT: mulli r4, r4, 5423
; P9BE-NEXT: sub r3, r3, r4
-; P9BE-NEXT: mfvsrld r4, v3
+; P9BE-NEXT: mfvsrd r4, v3
; P9BE-NEXT: mulhdu r5, r4, r5
-; P9BE-NEXT: rldicl r5, r5, 52, 12
-; P9BE-NEXT: mulli r5, r5, 5423
+; P9BE-NEXT: rldicl r5, r5, 60, 4
+; P9BE-NEXT: mulli r5, r5, 23
; P9BE-NEXT: sub r4, r4, r5
-; P9BE-NEXT: lis r5, 3206
-; P9BE-NEXT: ori r5, r5, 42889
-; P9BE-NEXT: mtvsrdd v3, r3, r4
+; P9BE-NEXT: mtvsrdd v3, r4, r3
+; P9BE-NEXT: lis r4, 3206
; P9BE-NEXT: mfvsrld r3, v2
-; P9BE-NEXT: rldic r5, r5, 35, 1
-; P9BE-NEXT: rldicl r4, r3, 63, 1
-; P9BE-NEXT: oris r5, r5, 1603
-; P9BE-NEXT: ori r5, r5, 21445
-; P9BE-NEXT: mulhdu r4, r4, r5
-; P9BE-NEXT: rldicl r4, r4, 57, 7
+; P9BE-NEXT: ori r4, r4, 42889
+; P9BE-NEXT: rldic r4, r4, 33, 3
+; P9BE-NEXT: oris r4, r4, 400
+; P9BE-NEXT: ori r4, r4, 54513
+; P9BE-NEXT: mulhdu r4, r3, r4
+; P9BE-NEXT: rldicl r4, r4, 58, 6
; P9BE-NEXT: mulli r4, r4, 654
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: mtvsrdd v2, 0, r3
@@ -891,94 +883,86 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
;
; P8LE-LABEL: dont_fold_urem_i64:
; P8LE: # %bb.0:
-; P8LE-NEXT: lis r3, 1602
+; P8LE-NEXT: lis r3, 5698
; P8LE-NEXT: xxswapd vs0, v3
-; P8LE-NEXT: lis r5, 3206
-; P8LE-NEXT: mfvsrd r6, v2
-; P8LE-NEXT: mfvsrd r8, v3
+; P8LE-NEXT: mfvsrd r5, v3
; P8LE-NEXT: ori r3, r3, 51289
+; P8LE-NEXT: mffprd r4, f0
+; P8LE-NEXT: mfvsrd r6, v2
+; P8LE-NEXT: rldic r3, r3, 35, 0
+; P8LE-NEXT: oris r3, r3, 22795
+; P8LE-NEXT: ori r3, r3, 8547
+; P8LE-NEXT: mulhdu r3, r4, r3
+; P8LE-NEXT: rldicl r3, r3, 60, 4
+; P8LE-NEXT: mulli r3, r3, 23
+; P8LE-NEXT: sub r3, r4, r3
+; P8LE-NEXT: lis r4, 12374
+; P8LE-NEXT: ori r4, r4, 56339
+; P8LE-NEXT: mtfprd f0, r3
+; P8LE-NEXT: li r3, 0
+; P8LE-NEXT: rldic r4, r4, 32, 2
+; P8LE-NEXT: oris r4, r4, 29426
+; P8LE-NEXT: ori r4, r4, 35795
+; P8LE-NEXT: mulhdu r4, r5, r4
+; P8LE-NEXT: rldicl r4, r4, 54, 10
+; P8LE-NEXT: mulli r4, r4, 5423
+; P8LE-NEXT: sub r4, r5, r4
+; P8LE-NEXT: lis r5, 3206
; P8LE-NEXT: ori r5, r5, 42889
-; P8LE-NEXT: rldic r4, r3, 36, 1
-; P8LE-NEXT: mffprd r3, f0
-; P8LE-NEXT: rldic r5, r5, 35, 1
-; P8LE-NEXT: rldicl r7, r6, 63, 1
-; P8LE-NEXT: oris r4, r4, 45590
-; P8LE-NEXT: oris r5, r5, 1603
-; P8LE-NEXT: ori r4, r4, 17097
-; P8LE-NEXT: ori r5, r5, 21445
-; P8LE-NEXT: mulhdu r4, r3, r4
-; P8LE-NEXT: mulhdu r5, r7, r5
-; P8LE-NEXT: sub r7, r3, r4
-; P8LE-NEXT: rldicl r5, r5, 57, 7
-; P8LE-NEXT: rldicl r7, r7, 63, 1
+; P8LE-NEXT: mtfprd f1, r4
+; P8LE-NEXT: rldic r5, r5, 33, 3
+; P8LE-NEXT: oris r5, r5, 400
+; P8LE-NEXT: ori r5, r5, 54513
+; P8LE-NEXT: mulhdu r5, r6, r5
+; P8LE-NEXT: rldicl r5, r5, 58, 6
; P8LE-NEXT: mulli r5, r5, 654
-; P8LE-NEXT: add r4, r7, r4
-; P8LE-NEXT: lis r7, -16037
-; P8LE-NEXT: ori r7, r7, 28749
-; P8LE-NEXT: rldicl r4, r4, 60, 4
; P8LE-NEXT: sub r5, r6, r5
-; P8LE-NEXT: rldic r7, r7, 32, 0
-; P8LE-NEXT: mulli r4, r4, 23
-; P8LE-NEXT: oris r7, r7, 52170
-; P8LE-NEXT: ori r7, r7, 12109
-; P8LE-NEXT: sub r3, r3, r4
-; P8LE-NEXT: mulhdu r7, r8, r7
+; P8LE-NEXT: xxmrghd v3, vs1, vs0
; P8LE-NEXT: mtfprd f1, r3
-; P8LE-NEXT: li r3, 0
-; P8LE-NEXT: rldicl r7, r7, 52, 12
-; P8LE-NEXT: mulli r7, r7, 5423
-; P8LE-NEXT: sub r7, r8, r7
-; P8LE-NEXT: mtfprd f0, r7
-; P8LE-NEXT: xxmrghd v3, vs0, vs1
; P8LE-NEXT: mtfprd f0, r5
-; P8LE-NEXT: mtfprd f1, r3
; P8LE-NEXT: xxmrghd v2, vs0, vs1
; P8LE-NEXT: blr
;
; P8BE-LABEL: dont_fold_urem_i64:
; P8BE: # %bb.0:
-; P8BE-NEXT: lis r3, 1602
-; P8BE-NEXT: mfvsrd r4, v3
+; P8BE-NEXT: lis r3, 12374
+; P8BE-NEXT: xxswapd vs0, v3
+; P8BE-NEXT: mfvsrd r5, v3
+; P8BE-NEXT: ori r3, r3, 56339
+; P8BE-NEXT: mffprd r4, f0
+; P8BE-NEXT: xxswapd vs1, v2
+; P8BE-NEXT: mffprd r6, f1
+; P8BE-NEXT: rldic r3, r3, 32, 2
+; P8BE-NEXT: oris r3, r3, 29426
+; P8BE-NEXT: ori r3, r3, 35795
+; P8BE-NEXT: mulhdu r3, r4, r3
+; P8BE-NEXT: rldicl r3, r3, 54, 10
+; P8BE-NEXT: mulli r3, r3, 5423
+; P8BE-NEXT: sub r3, r4, r3
+; P8BE-NEXT: lis r4, 5698
+; P8BE-NEXT: ori r4, r4, 51289
+; P8BE-NEXT: mtfprd f0, r3
+; P8BE-NEXT: li r3, 0
+; P8BE-NEXT: rldic r4, r4, 35, 0
+; P8BE-NEXT: oris r4, r4, 22795
+; P8BE-NEXT: ori r4, r4, 8547
+; P8BE-NEXT: mulhdu r4, r5, r4
+; P8BE-NEXT: rldicl r4, r4, 60, 4
+; P8BE-NEXT: mulli r4, r4, 23
+; P8BE-NEXT: sub r4, r5, r4
; P8BE-NEXT: lis r5, 3206
-; P8BE-NEXT: xxswapd vs0, v2
-; P8BE-NEXT: xxswapd vs1, v3
-; P8BE-NEXT: ori r3, r3, 51289
; P8BE-NEXT: ori r5, r5, 42889
-; P8BE-NEXT: mffprd r6, f0
-; P8BE-NEXT: mffprd r8, f1
-; P8BE-NEXT: rldic r3, r3, 36, 1
-; P8BE-NEXT: rldic r5, r5, 35, 1
-; P8BE-NEXT: oris r3, r3, 45590
-; P8BE-NEXT: oris r5, r5, 1603
-; P8BE-NEXT: rldicl r7, r6, 63, 1
-; P8BE-NEXT: ori r3, r3, 17097
-; P8BE-NEXT: ori r5, r5, 21445
-; P8BE-NEXT: mulhdu r3, r4, r3
-; P8BE-NEXT: mulhdu r5, r7, r5
-; P8BE-NEXT: sub r7, r4, r3
-; P8BE-NEXT: rldicl r5, r5, 57, 7
-; P8BE-NEXT: rldicl r7, r7, 63, 1
+; P8BE-NEXT: mtfprd f1, r4
+; P8BE-NEXT: rldic r5, r5, 33, 3
+; P8BE-NEXT: oris r5, r5, 400
+; P8BE-NEXT: ori r5, r5, 54513
+; P8BE-NEXT: mulhdu r5, r6, r5
+; P8BE-NEXT: rldicl r5, r5, 58, 6
; P8BE-NEXT: mulli r5, r5, 654
-; P8BE-NEXT: add r3, r7, r3
-; P8BE-NEXT: lis r7, -16037
-; P8BE-NEXT: ori r7, r7, 28749
-; P8BE-NEXT: rldicl r3, r3, 60, 4
; P8BE-NEXT: sub r5, r6, r5
-; P8BE-NEXT: rldic r7, r7, 32, 0
-; P8BE-NEXT: mulli r3, r3, 23
-; P8BE-NEXT: oris r7, r7, 52170
-; P8BE-NEXT: ori r7, r7, 12109
-; P8BE-NEXT: sub r3, r4, r3
-; P8BE-NEXT: mulhdu r7, r8, r7
-; P8BE-NEXT: mtfprd f1, r3
-; P8BE-NEXT: li r3, 0
-; P8BE-NEXT: rldicl r7, r7, 52, 12
-; P8BE-NEXT: mulli r7, r7, 5423
-; P8BE-NEXT: sub r7, r8, r7
-; P8BE-NEXT: mtfprd f0, r7
; P8BE-NEXT: xxmrghd v3, vs1, vs0
-; P8BE-NEXT: mtfprd f0, r5
; P8BE-NEXT: mtfprd f1, r3
+; P8BE-NEXT: mtfprd f0, r5
; P8BE-NEXT: xxmrghd v2, vs1, vs0
; P8BE-NEXT: blr
%1 = urem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll
index 91ac7c5ddae3f..bc0ea9db9a1af 100644
--- a/llvm/test/CodeGen/RISCV/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll
@@ -14,21 +14,29 @@
define i32 @udiv_constant_no_add(i32 %a) nounwind {
; RV32-LABEL: udiv_constant_no_add:
; RV32: # %bb.0:
-; RV32-NEXT: lui a1, 838861
-; RV32-NEXT: addi a1, a1, -819
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: mulhu a0, a0, a1
-; RV32-NEXT: srli a0, a0, 2
; RV32-NEXT: ret
;
-; RV64-LABEL: udiv_constant_no_add:
-; RV64: # %bb.0:
-; RV64-NEXT: slli a0, a0, 32
-; RV64-NEXT: lui a1, 838861
-; RV64-NEXT: addi a1, a1, -819
-; RV64-NEXT: slli a1, a1, 32
-; RV64-NEXT: mulhu a0, a0, a1
-; RV64-NEXT: srli a0, a0, 34
-; RV64-NEXT: ret
+; RV64IM-LABEL: udiv_constant_no_add:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: slli a0, a0, 32
+; RV64IM-NEXT: lui a1, 209715
+; RV64IM-NEXT: addi a1, a1, 819
+; RV64IM-NEXT: slli a1, a1, 32
+; RV64IM-NEXT: mulhu a0, a0, a1
+; RV64IM-NEXT: srli a0, a0, 32
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: udiv_constant_no_add:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: zext.w a0, a0
+; RV64IMZB-NEXT: lui a1, 209715
+; RV64IMZB-NEXT: addiw a1, a1, 819
+; RV64IMZB-NEXT: mul a0, a0, a1
+; RV64IMZB-NEXT: srli a0, a0, 32
+; RV64IMZB-NEXT: ret
%1 = udiv i32 %a, 5
ret i32 %1
}
@@ -39,76 +47,88 @@ define i32 @udiv_constant_add(i32 %a) nounwind {
; RV32: # %bb.0:
; RV32-NEXT: lui a1, 149797
; RV32-NEXT: addi a1, a1, -1755
-; RV32-NEXT: mulhu a1, a0, a1
-; RV32-NEXT: sub a0, a0, a1
-; RV32-NEXT: srli a0, a0, 1
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: mulhu a0, a0, a1
; RV32-NEXT: ret
;
; RV64IM-LABEL: udiv_constant_add:
; RV64IM: # %bb.0:
-; RV64IM-NEXT: slli a1, a0, 32
-; RV64IM-NEXT: lui a2, 149797
-; RV64IM-NEXT: addi a2, a2, -1755
-; RV64IM-NEXT: slli a2, a2, 32
-; RV64IM-NEXT: mulhu a1, a1, a2
-; RV64IM-NEXT: srli a1, a1, 32
-; RV64IM-NEXT: subw a0, a0, a1
-; RV64IM-NEXT: srliw a0, a0, 1
-; RV64IM-NEXT: add a0, a0, a1
-; RV64IM-NEXT: srli a0, a0, 2
+; RV64IM-NEXT: slli a0, a0, 32
+; RV64IM-NEXT: lui a1, 149797
+; RV64IM-NEXT: addi a1, a1, -1755
+; RV64IM-NEXT: slli a1, a1, 32
+; RV64IM-NEXT: mulhu a0, a0, a1
+; RV64IM-NEXT: srli a0, a0, 32
; RV64IM-NEXT: ret
;
; RV64IMZB-LABEL: udiv_constant_add:
; RV64IMZB: # %bb.0:
-; RV64IMZB-NEXT: zext.w a1, a0
-; RV64IMZB-NEXT: lui a2, 149797
-; RV64IMZB-NEXT: addiw a2, a2, -1755
-; RV64IMZB-NEXT: mul a1, a1, a2
-; RV64IMZB-NEXT: srli a1, a1, 32
-; RV64IMZB-NEXT: subw a0, a0, a1
-; RV64IMZB-NEXT: srliw a0, a0, 1
-; RV64IMZB-NEXT: add a0, a0, a1
-; RV64IMZB-NEXT: srli a0, a0, 2
+; RV64IMZB-NEXT: zext.w a0, a0
+; RV64IMZB-NEXT: lui a1, 149797
+; RV64IMZB-NEXT: addiw a1, a1, -1755
+; RV64IMZB-NEXT: mul a0, a0, a1
+; RV64IMZB-NEXT: srli a0, a0, 32
; RV64IMZB-NEXT: ret
%1 = udiv i32 %a, 7
ret i32 %1
}
define i64 @udiv64_constant_no_add(i64 %a) nounwind {
-; RV32-LABEL: udiv64_constant_no_add:
-; RV32: # %bb.0:
-; RV32-NEXT: add a2, a0, a1
-; RV32-NEXT: sltu a3, a2, a0
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 838861
-; RV32-NEXT: addi a4, a3, -819
-; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: srli a6, a5, 2
-; RV32-NEXT: andi a5, a5, -4
-; RV32-NEXT: add a5, a5, a6
-; RV32-NEXT: sub a2, a2, a5
-; RV32-NEXT: sub a5, a0, a2
-; RV32-NEXT: addi a3, a3, -820
-; RV32-NEXT: mul a3, a5, a3
-; RV32-NEXT: mulhu a6, a5, a4
-; RV32-NEXT: add a3, a6, a3
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: sub a1, a1, a0
-; RV32-NEXT: mul a1, a1, a4
-; RV32-NEXT: add a1, a3, a1
-; RV32-NEXT: mul a0, a5, a4
-; RV32-NEXT: ret
+; RV32IM-LABEL: udiv64_constant_no_add:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: add a2, a0, a1
+; RV32IM-NEXT: sltu a3, a2, a0
+; RV32IM-NEXT: add a2, a2, a3
+; RV32IM-NEXT: lui a3, 209715
+; RV32IM-NEXT: addi a3, a3, 819
+; RV32IM-NEXT: mulhu a3, a2, a3
+; RV32IM-NEXT: slli a4, a3, 2
+; RV32IM-NEXT: add a3, a4, a3
+; RV32IM-NEXT: sub a2, a2, a3
+; RV32IM-NEXT: sub a3, a0, a2
+; RV32IM-NEXT: lui a4, 838861
+; RV32IM-NEXT: addi a5, a4, -820
+; RV32IM-NEXT: mul a5, a3, a5
+; RV32IM-NEXT: addi a4, a4, -819
+; RV32IM-NEXT: mulhu a6, a3, a4
+; RV32IM-NEXT: add a5, a6, a5
+; RV32IM-NEXT: sltu a0, a0, a2
+; RV32IM-NEXT: sub a1, a1, a0
+; RV32IM-NEXT: mul a1, a1, a4
+; RV32IM-NEXT: add a1, a5, a1
+; RV32IM-NEXT: mul a0, a3, a4
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: udiv64_constant_no_add:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: add a2, a0, a1
+; RV32IMZB-NEXT: sltu a3, a2, a0
+; RV32IMZB-NEXT: add a2, a2, a3
+; RV32IMZB-NEXT: lui a3, 209715
+; RV32IMZB-NEXT: addi a3, a3, 819
+; RV32IMZB-NEXT: mulhu a3, a2, a3
+; RV32IMZB-NEXT: sh2add a3, a3, a3
+; RV32IMZB-NEXT: sub a2, a2, a3
+; RV32IMZB-NEXT: sub a3, a0, a2
+; RV32IMZB-NEXT: lui a4, 838861
+; RV32IMZB-NEXT: addi a5, a4, -820
+; RV32IMZB-NEXT: mul a5, a3, a5
+; RV32IMZB-NEXT: addi a4, a4, -819
+; RV32IMZB-NEXT: mulhu a6, a3, a4
+; RV32IMZB-NEXT: add a5, a6, a5
+; RV32IMZB-NEXT: sltu a0, a0, a2
+; RV32IMZB-NEXT: sub a1, a1, a0
+; RV32IMZB-NEXT: mul a1, a1, a4
+; RV32IMZB-NEXT: add a1, a5, a1
+; RV32IMZB-NEXT: mul a0, a3, a4
+; RV32IMZB-NEXT: ret
;
; RV64-LABEL: udiv64_constant_no_add:
; RV64: # %bb.0:
-; RV64-NEXT: lui a1, 838861
-; RV64-NEXT: addiw a1, a1, -819
+; RV64-NEXT: lui a1, 209715
+; RV64-NEXT: addiw a1, a1, 819
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a0, a0, a1
-; RV64-NEXT: srli a0, a0, 2
; RV64-NEXT: ret
%1 = udiv i64 %a, 5
ret i64 %1
@@ -130,11 +150,7 @@ define i64 @udiv64_constant_add(i64 %a) nounwind {
; RV64: # %bb.0:
; RV64-NEXT: lui a1, %hi(.LCPI3_0)
; RV64-NEXT: ld a1, %lo(.LCPI3_0)(a1)
-; RV64-NEXT: mulhu a1, a0, a1
-; RV64-NEXT: sub a0, a0, a1
-; RV64-NEXT: srli a0, a0, 1
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: srli a0, a0, 2
+; RV64-NEXT: mulhu a0, a0, a1
; RV64-NEXT: ret
%1 = udiv i64 %a, 7
ret i64 %1
@@ -144,17 +160,17 @@ define i8 @udiv8_constant_no_add(i8 %a) nounwind {
; RV32-LABEL: udiv8_constant_no_add:
; RV32: # %bb.0:
; RV32-NEXT: andi a0, a0, 255
-; RV32-NEXT: li a1, 205
+; RV32-NEXT: li a1, 51
; RV32-NEXT: mul a0, a0, a1
-; RV32-NEXT: srli a0, a0, 10
+; RV32-NEXT: srli a0, a0, 8
; RV32-NEXT: ret
;
; RV64-LABEL: udiv8_constant_no_add:
; RV64: # %bb.0:
; RV64-NEXT: andi a0, a0, 255
-; RV64-NEXT: li a1, 205
+; RV64-NEXT: li a1, 51
; RV64-NEXT: mul a0, a0, a1
-; RV64-NEXT: srli a0, a0, 10
+; RV64-NEXT: srli a0, a0, 8
; RV64-NEXT: ret
%1 = udiv i8 %a, 5
ret i8 %1
@@ -163,54 +179,34 @@ define i8 @udiv8_constant_no_add(i8 %a) nounwind {
define i8 @udiv8_constant_add(i8 %a) nounwind {
; RV32IM-LABEL: udiv8_constant_add:
; RV32IM: # %bb.0:
-; RV32IM-NEXT: andi a1, a0, 255
-; RV32IM-NEXT: li a2, 37
-; RV32IM-NEXT: mul a1, a1, a2
-; RV32IM-NEXT: srli a1, a1, 8
-; RV32IM-NEXT: sub a0, a0, a1
-; RV32IM-NEXT: slli a0, a0, 24
-; RV32IM-NEXT: srli a0, a0, 25
-; RV32IM-NEXT: add a0, a0, a1
-; RV32IM-NEXT: srli a0, a0, 2
+; RV32IM-NEXT: andi a0, a0, 255
+; RV32IM-NEXT: li a1, 37
+; RV32IM-NEXT: mul a0, a0, a1
+; RV32IM-NEXT: srli a0, a0, 8
; RV32IM-NEXT: ret
;
; RV32IMZB-LABEL: udiv8_constant_add:
; RV32IMZB: # %bb.0:
-; RV32IMZB-NEXT: andi a1, a0, 255
-; RV32IMZB-NEXT: sh3add a2, a1, a1
-; RV32IMZB-NEXT: sh2add a1, a2, a1
-; RV32IMZB-NEXT: srli a1, a1, 8
-; RV32IMZB-NEXT: sub a0, a0, a1
-; RV32IMZB-NEXT: slli a0, a0, 24
-; RV32IMZB-NEXT: srli a0, a0, 25
-; RV32IMZB-NEXT: add a0, a0, a1
-; RV32IMZB-NEXT: srli a0, a0, 2
+; RV32IMZB-NEXT: andi a0, a0, 255
+; RV32IMZB-NEXT: sh3add a1, a0, a0
+; RV32IMZB-NEXT: sh2add a0, a1, a0
+; RV32IMZB-NEXT: srli a0, a0, 8
; RV32IMZB-NEXT: ret
;
; RV64IM-LABEL: udiv8_constant_add:
; RV64IM: # %bb.0:
-; RV64IM-NEXT: andi a1, a0, 255
-; RV64IM-NEXT: li a2, 37
-; RV64IM-NEXT: mul a1, a1, a2
-; RV64IM-NEXT: srli a1, a1, 8
-; RV64IM-NEXT: subw a0, a0, a1
-; RV64IM-NEXT: slli a0, a0, 56
-; RV64IM-NEXT: srli a0, a0, 57
-; RV64IM-NEXT: add a0, a0, a1
-; RV64IM-NEXT: srli a0, a0, 2
+; RV64IM-NEXT: andi a0, a0, 255
+; RV64IM-NEXT: li a1, 37
+; RV64IM-NEXT: mul a0, a0, a1
+; RV64IM-NEXT: srli a0, a0, 8
; RV64IM-NEXT: ret
;
; RV64IMZB-LABEL: udiv8_constant_add:
; RV64IMZB: # %bb.0:
-; RV64IMZB-NEXT: andi a1, a0, 255
-; RV64IMZB-NEXT: sh3add a2, a1, a1
-; RV64IMZB-NEXT: sh2add a1, a2, a1
-; RV64IMZB-NEXT: srli a1, a1, 8
-; RV64IMZB-NEXT: subw a0, a0, a1
-; RV64IMZB-NEXT: slli a0, a0, 56
-; RV64IMZB-NEXT: srli a0, a0, 57
-; RV64IMZB-NEXT: add a0, a0, a1
-; RV64IMZB-NEXT: srli a0, a0, 2
+; RV64IMZB-NEXT: andi a0, a0, 255
+; RV64IMZB-NEXT: sh3add a1, a0, a0
+; RV64IMZB-NEXT: sh2add a0, a1, a0
+; RV64IMZB-NEXT: srli a0, a0, 8
; RV64IMZB-NEXT: ret
%1 = udiv i8 %a, 7
ret i8 %1
@@ -220,18 +216,17 @@ define i16 @udiv16_constant_no_add(i16 %a) nounwind {
; RV32-LABEL: udiv16_constant_no_add:
; RV32: # %bb.0:
; RV32-NEXT: slli a0, a0, 16
-; RV32-NEXT: lui a1, 838864
+; RV32-NEXT: lui a1, 209712
; RV32-NEXT: mulhu a0, a0, a1
-; RV32-NEXT: srli a0, a0, 18
+; RV32-NEXT: srli a0, a0, 16
; RV32-NEXT: ret
;
; RV64-LABEL: udiv16_constant_no_add:
; RV64: # %bb.0:
-; RV64-NEXT: lui a1, 52429
-; RV64-NEXT: slli a1, a1, 4
; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: lui a1, 209712
; RV64-NEXT: mulhu a0, a0, a1
-; RV64-NEXT: srli a0, a0, 18
+; RV64-NEXT: srli a0, a0, 16
; RV64-NEXT: ret
%1 = udiv i16 %a, 5
ret i16 %1
@@ -240,28 +235,18 @@ define i16 @udiv16_constant_no_add(i16 %a) nounwind {
define i16 @udiv16_constant_add(i16 %a) nounwind {
; RV32-LABEL: udiv16_constant_add:
; RV32: # %bb.0:
-; RV32-NEXT: slli a1, a0, 16
-; RV32-NEXT: lui a2, 149808
-; RV32-NEXT: mulhu a1, a1, a2
-; RV32-NEXT: srli a1, a1, 16
-; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: slli a0, a0, 16
-; RV32-NEXT: srli a0, a0, 17
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: lui a1, 149808
+; RV32-NEXT: mulhu a0, a0, a1
+; RV32-NEXT: srli a0, a0, 16
; RV32-NEXT: ret
;
; RV64-LABEL: udiv16_constant_add:
; RV64: # %bb.0:
-; RV64-NEXT: slli a1, a0, 48
-; RV64-NEXT: lui a2, 149808
-; RV64-NEXT: mulhu a1, a1, a2
-; RV64-NEXT: srli a1, a1, 16
-; RV64-NEXT: subw a0, a0, a1
; RV64-NEXT: slli a0, a0, 48
-; RV64-NEXT: srli a0, a0, 49
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: srli a0, a0, 2
+; RV64-NEXT: lui a1, 149808
+; RV64-NEXT: mulhu a0, a0, a1
+; RV64-NEXT: srli a0, a0, 16
; RV64-NEXT: ret
%1 = udiv i16 %a, 7
ret i16 %1
diff --git a/llvm/test/CodeGen/RISCV/div.ll b/llvm/test/CodeGen/RISCV/div.ll
index 99c83b99497dd..d96d4983c18f1 100644
--- a/llvm/test/CodeGen/RISCV/div.ll
+++ b/llvm/test/CodeGen/RISCV/div.ll
@@ -47,10 +47,9 @@ define i32 @udiv_constant(i32 %a) nounwind {
;
; RV32IM-LABEL: udiv_constant:
; RV32IM: # %bb.0:
-; RV32IM-NEXT: lui a1, 838861
-; RV32IM-NEXT: addi a1, a1, -819
+; RV32IM-NEXT: lui a1, 209715
+; RV32IM-NEXT: addi a1, a1, 819
; RV32IM-NEXT: mulhu a0, a0, a1
-; RV32IM-NEXT: srli a0, a0, 2
; RV32IM-NEXT: ret
;
; RV64I-LABEL: udiv_constant:
@@ -68,11 +67,11 @@ define i32 @udiv_constant(i32 %a) nounwind {
; RV64IM-LABEL: udiv_constant:
; RV64IM: # %bb.0:
; RV64IM-NEXT: slli a0, a0, 32
-; RV64IM-NEXT: lui a1, 838861
-; RV64IM-NEXT: addi a1, a1, -819
+; RV64IM-NEXT: lui a1, 209715
+; RV64IM-NEXT: addi a1, a1, 819
; RV64IM-NEXT: slli a1, a1, 32
; RV64IM-NEXT: mulhu a0, a0, a1
-; RV64IM-NEXT: srli a0, a0, 34
+; RV64IM-NEXT: srli a0, a0, 32
; RV64IM-NEXT: ret
%1 = udiv i32 %a, 5
ret i32 %1
@@ -184,23 +183,24 @@ define i64 @udiv64_constant(i64 %a) nounwind {
; RV32IM-NEXT: add a2, a0, a1
; RV32IM-NEXT: sltu a3, a2, a0
; RV32IM-NEXT: add a2, a2, a3
-; RV32IM-NEXT: lui a3, 838861
-; RV32IM-NEXT: addi a4, a3, -819
-; RV32IM-NEXT: mulhu a5, a2, a4
-; RV32IM-NEXT: srli a6, a5, 2
-; RV32IM-NEXT: andi a5, a5, -4
-; RV32IM-NEXT: add a5, a5, a6
-; RV32IM-NEXT: sub a2, a2, a5
-; RV32IM-NEXT: sub a5, a0, a2
-; RV32IM-NEXT: addi a3, a3, -820
-; RV32IM-NEXT: mul a3, a5, a3
-; RV32IM-NEXT: mulhu a6, a5, a4
-; RV32IM-NEXT: add a3, a6, a3
+; RV32IM-NEXT: lui a3, 209715
+; RV32IM-NEXT: addi a3, a3, 819
+; RV32IM-NEXT: mulhu a3, a2, a3
+; RV32IM-NEXT: slli a4, a3, 2
+; RV32IM-NEXT: add a3, a4, a3
+; RV32IM-NEXT: sub a2, a2, a3
+; RV32IM-NEXT: sub a3, a0, a2
+; RV32IM-NEXT: lui a4, 838861
+; RV32IM-NEXT: addi a5, a4, -820
+; RV32IM-NEXT: mul a5, a3, a5
+; RV32IM-NEXT: addi a4, a4, -819
+; RV32IM-NEXT: mulhu a6, a3, a4
+; RV32IM-NEXT: add a5, a6, a5
; RV32IM-NEXT: sltu a0, a0, a2
; RV32IM-NEXT: sub a1, a1, a0
; RV32IM-NEXT: mul a1, a1, a4
-; RV32IM-NEXT: add a1, a3, a1
-; RV32IM-NEXT: mul a0, a5, a4
+; RV32IM-NEXT: add a1, a5, a1
+; RV32IM-NEXT: mul a0, a3, a4
; RV32IM-NEXT: ret
;
; RV64I-LABEL: udiv64_constant:
@@ -210,12 +210,11 @@ define i64 @udiv64_constant(i64 %a) nounwind {
;
; RV64IM-LABEL: udiv64_constant:
; RV64IM: # %bb.0:
-; RV64IM-NEXT: lui a1, 838861
-; RV64IM-NEXT: addiw a1, a1, -819
+; RV64IM-NEXT: lui a1, 209715
+; RV64IM-NEXT: addiw a1, a1, 819
; RV64IM-NEXT: slli a2, a1, 32
; RV64IM-NEXT: add a1, a1, a2
; RV64IM-NEXT: mulhu a0, a0, a1
-; RV64IM-NEXT: srli a0, a0, 2
; RV64IM-NEXT: ret
%1 = udiv i64 %a, 5
ret i64 %1
@@ -318,9 +317,9 @@ define i8 @udiv8_constant(i8 %a) nounwind {
; RV32IM-LABEL: udiv8_constant:
; RV32IM: # %bb.0:
; RV32IM-NEXT: andi a0, a0, 255
-; RV32IM-NEXT: li a1, 205
+; RV32IM-NEXT: li a1, 51
; RV32IM-NEXT: mul a0, a0, a1
-; RV32IM-NEXT: srli a0, a0, 10
+; RV32IM-NEXT: srli a0, a0, 8
; RV32IM-NEXT: ret
;
; RV64I-LABEL: udiv8_constant:
@@ -337,9 +336,9 @@ define i8 @udiv8_constant(i8 %a) nounwind {
; RV64IM-LABEL: udiv8_constant:
; RV64IM: # %bb.0:
; RV64IM-NEXT: andi a0, a0, 255
-; RV64IM-NEXT: li a1, 205
+; RV64IM-NEXT: li a1, 51
; RV64IM-NEXT: mul a0, a0, a1
-; RV64IM-NEXT: srli a0, a0, 10
+; RV64IM-NEXT: srli a0, a0, 8
; RV64IM-NEXT: ret
%1 = udiv i8 %a, 5
ret i8 %1
@@ -477,9 +476,9 @@ define i16 @udiv16_constant(i16 %a) nounwind {
; RV32IM-LABEL: udiv16_constant:
; RV32IM: # %bb.0:
; RV32IM-NEXT: slli a0, a0, 16
-; RV32IM-NEXT: lui a1, 838864
+; RV32IM-NEXT: lui a1, 209712
; RV32IM-NEXT: mulhu a0, a0, a1
-; RV32IM-NEXT: srli a0, a0, 18
+; RV32IM-NEXT: srli a0, a0, 16
; RV32IM-NEXT: ret
;
; RV64I-LABEL: udiv16_constant:
@@ -496,11 +495,10 @@ define i16 @udiv16_constant(i16 %a) nounwind {
;
; RV64IM-LABEL: udiv16_constant:
; RV64IM: # %bb.0:
-; RV64IM-NEXT: lui a1, 52429
-; RV64IM-NEXT: slli a1, a1, 4
; RV64IM-NEXT: slli a0, a0, 48
+; RV64IM-NEXT: lui a1, 209712
; RV64IM-NEXT: mulhu a0, a0, a1
-; RV64IM-NEXT: srli a0, a0, 18
+; RV64IM-NEXT: srli a0, a0, 16
; RV64IM-NEXT: ret
%1 = udiv i16 %a, 5
ret i16 %1
diff --git a/llvm/test/CodeGen/RISCV/pr51206.ll b/llvm/test/CodeGen/RISCV/pr51206.ll
index 8aa145f6ac5ef..af44ce92a89e0 100644
--- a/llvm/test/CodeGen/RISCV/pr51206.ll
+++ b/llvm/test/CodeGen/RISCV/pr51206.ll
@@ -20,10 +20,9 @@ define signext i32 @wobble() nounwind {
; CHECK-NEXT: sw a0, %lo(global.1)(a2)
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: slli a1, a0, 48
-; CHECK-NEXT: lui a2, 52429
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: lui a2, 209712
; CHECK-NEXT: mulhu a1, a1, a2
-; CHECK-NEXT: srli a1, a1, 18
+; CHECK-NEXT: srli a1, a1, 16
; CHECK-NEXT: lui a2, %hi(global.3)
; CHECK-NEXT: li a3, 5
; CHECK-NEXT: sw a1, %lo(global.3)(a2)
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/div.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/div.ll
index 17d9e9cefe117..1abace8bbba0e 100644
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/div.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/div.ll
@@ -42,11 +42,11 @@ define i32 @udiv_constant(i32 %a) nounwind {
; RV64IM-LABEL: udiv_constant:
; RV64IM: # %bb.0:
; RV64IM-NEXT: slli a0, a0, 32
-; RV64IM-NEXT: lui a1, 838861
-; RV64IM-NEXT: addi a1, a1, -819
+; RV64IM-NEXT: lui a1, 209715
+; RV64IM-NEXT: addi a1, a1, 819
; RV64IM-NEXT: slli a1, a1, 32
; RV64IM-NEXT: mulhu a0, a0, a1
-; RV64IM-NEXT: srli a0, a0, 34
+; RV64IM-NEXT: srli a0, a0, 32
; RV64IM-NEXT: ret
%1 = udiv i32 %a, 5
ret i32 %1
@@ -109,12 +109,11 @@ define i64 @udiv64_constant(i64 %a) nounwind {
;
; RV64IM-LABEL: udiv64_constant:
; RV64IM: # %bb.0:
-; RV64IM-NEXT: lui a1, 838861
-; RV64IM-NEXT: addiw a1, a1, -819
+; RV64IM-NEXT: lui a1, 209715
+; RV64IM-NEXT: addiw a1, a1, 819
; RV64IM-NEXT: slli a2, a1, 32
; RV64IM-NEXT: add a1, a1, a2
; RV64IM-NEXT: mulhu a0, a0, a1
-; RV64IM-NEXT: srli a0, a0, 2
; RV64IM-NEXT: ret
%1 = udiv i64 %a, 5
ret i64 %1
@@ -173,9 +172,9 @@ define i8 @udiv8_constant(i8 %a) nounwind {
; RV64IM-LABEL: udiv8_constant:
; RV64IM: # %bb.0:
; RV64IM-NEXT: andi a0, a0, 255
-; RV64IM-NEXT: li a1, 205
+; RV64IM-NEXT: li a1, 51
; RV64IM-NEXT: mul a0, a0, a1
-; RV64IM-NEXT: srliw a0, a0, 10
+; RV64IM-NEXT: srliw a0, a0, 8
; RV64IM-NEXT: ret
%1 = udiv i8 %a, 5
ret i8 %1
@@ -260,11 +259,10 @@ define i16 @udiv16_constant(i16 %a) nounwind {
;
; RV64IM-LABEL: udiv16_constant:
; RV64IM: # %bb.0:
-; RV64IM-NEXT: lui a1, 52429
-; RV64IM-NEXT: slli a1, a1, 4
; RV64IM-NEXT: slli a0, a0, 48
+; RV64IM-NEXT: lui a1, 209712
; RV64IM-NEXT: mulhu a0, a0, a1
-; RV64IM-NEXT: srliw a0, a0, 18
+; RV64IM-NEXT: srliw a0, a0, 16
; RV64IM-NEXT: ret
%1 = udiv i16 %a, 5
ret i16 %1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
index 65a1035fd815c..3621be0126c34 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
@@ -116,24 +116,18 @@ define <4 x i32> @udiv_constant_rhs(i32 %a, i32 %b, i32 %c, i32 %d) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vslide1down.vx v8, v8, a1
+; CHECK-NEXT: vslide1down.vx v8, v8, a2
; CHECK-NEXT: lui a0, %hi(.LCPI4_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI4_0)
; CHECK-NEXT: vle32.v v9, (a0)
-; CHECK-NEXT: vslide1down.vx v8, v8, a1
-; CHECK-NEXT: vslide1down.vx v8, v8, a2
-; CHECK-NEXT: vslide1down.vx v8, v8, a3
-; CHECK-NEXT: vmulhu.vv v9, v8, v9
-; CHECK-NEXT: vsub.vv v10, v8, v9
-; CHECK-NEXT: vmv.v.i v11, 0
-; CHECK-NEXT: lui a0, 524288
-; CHECK-NEXT: vslide1down.vx v11, v11, a0
; CHECK-NEXT: lui a0, %hi(.LCPI4_1)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI4_1)
-; CHECK-NEXT: vle32.v v12, (a0)
-; CHECK-NEXT: vmulhu.vv v10, v10, v11
-; CHECK-NEXT: vadd.vv v9, v10, v9
+; CHECK-NEXT: vle32.v v10, (a0)
+; CHECK-NEXT: vslide1down.vx v8, v8, a3
+; CHECK-NEXT: vmulhu.vv v9, v8, v9
; CHECK-NEXT: vmv.v.i v0, 4
-; CHECK-NEXT: vsrl.vv v9, v9, v12
+; CHECK-NEXT: vsrl.vv v9, v9, v10
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
; CHECK-NEXT: ret
%e0 = udiv i32 %a, 23
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
index d309da6df7dc7..dd3c1644b1f22 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
@@ -1016,14 +1016,12 @@ define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) {
define i32 @extractelt_udiv_v4i32(<4 x i32> %x) {
; RV32NOM-LABEL: extractelt_udiv_v4i32:
; RV32NOM: # %bb.0:
+; RV32NOM-NEXT: lui a0, 80660
+; RV32NOM-NEXT: addi a0, a0, -1260
; RV32NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32NOM-NEXT: vsrl.vi v8, v8, 0
-; RV32NOM-NEXT: lui a0, 322639
-; RV32NOM-NEXT: addi a0, a0, -945
; RV32NOM-NEXT: vmulhu.vx v8, v8, a0
; RV32NOM-NEXT: vslidedown.vi v8, v8, 2
; RV32NOM-NEXT: vmv.x.s a0, v8
-; RV32NOM-NEXT: srli a0, a0, 2
; RV32NOM-NEXT: ret
;
; RV32M-LABEL: extractelt_udiv_v4i32:
@@ -1031,36 +1029,32 @@ define i32 @extractelt_udiv_v4i32(<4 x i32> %x) {
; RV32M-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV32M-NEXT: vslidedown.vi v8, v8, 2
; RV32M-NEXT: vmv.x.s a0, v8
-; RV32M-NEXT: lui a1, 322639
-; RV32M-NEXT: addi a1, a1, -945
+; RV32M-NEXT: lui a1, 80660
+; RV32M-NEXT: addi a1, a1, -1260
; RV32M-NEXT: mulhu a0, a0, a1
-; RV32M-NEXT: srli a0, a0, 2
; RV32M-NEXT: ret
;
; RV64NOM-LABEL: extractelt_udiv_v4i32:
; RV64NOM: # %bb.0:
+; RV64NOM-NEXT: lui a0, 80660
+; RV64NOM-NEXT: addi a0, a0, -1260
; RV64NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64NOM-NEXT: vsrl.vi v8, v8, 0
-; RV64NOM-NEXT: lui a0, 322639
-; RV64NOM-NEXT: addi a0, a0, -945
; RV64NOM-NEXT: vmulhu.vx v8, v8, a0
; RV64NOM-NEXT: vslidedown.vi v8, v8, 2
; RV64NOM-NEXT: vmv.x.s a0, v8
-; RV64NOM-NEXT: slli a0, a0, 33
-; RV64NOM-NEXT: srli a0, a0, 35
; RV64NOM-NEXT: ret
;
; RV64M-LABEL: extractelt_udiv_v4i32:
; RV64M: # %bb.0:
-; RV64M-NEXT: lui a0, 322639
-; RV64M-NEXT: addi a0, a0, -945
-; RV64M-NEXT: slli a0, a0, 32
+; RV64M-NEXT: lui a0, 20165
+; RV64M-NEXT: addi a0, a0, -315
+; RV64M-NEXT: slli a0, a0, 34
; RV64M-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV64M-NEXT: vslidedown.vi v8, v8, 2
; RV64M-NEXT: vmv.x.s a1, v8
; RV64M-NEXT: slli a1, a1, 32
; RV64M-NEXT: mulhu a0, a1, a0
-; RV64M-NEXT: srli a0, a0, 34
+; RV64M-NEXT: srli a0, a0, 32
; RV64M-NEXT: ret
%bo = udiv <4 x i32> %x, <i32 11, i32 12, i32 13, i32 14>
%ext = extractelement <4 x i32> %bo, i32 2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
index ea2cdae903e5a..e9b4af22a364e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
@@ -1099,47 +1099,25 @@ define void @urem_v2i64(ptr %x, ptr %y) {
define void @mulhu_v16i8(ptr %x) {
; CHECK-LABEL: mulhu_v16i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: vle8.v v9, (a0)
-; CHECK-NEXT: lui a1, 3
-; CHECK-NEXT: addi a1, a1, -2044
-; CHECK-NEXT: vmv.s.x v0, a1
-; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v10, 0
-; CHECK-NEXT: lui a1, 1
-; CHECK-NEXT: addi a2, a1, 32
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vmv.s.x v8, a2
-; CHECK-NEXT: lui a2, %hi(.LCPI65_0)
-; CHECK-NEXT: addi a2, a2, %lo(.LCPI65_0)
-; CHECK-NEXT: vle8.v v11, (a2)
-; CHECK-NEXT: li a2, -128
-; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmerge.vxm v12, v10, a2, v0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT: vsrl.vv v8, v9, v8
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: lui a1, %hi(.LCPI65_0)
+; CHECK-NEXT: addi a1, a1, %lo(.LCPI65_0)
+; CHECK-NEXT: vle8.v v9, (a1)
+; CHECK-NEXT: li a1, -128
+; CHECK-NEXT: vmv.s.x v10, a1
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vsetivli zero, 6, e8, m1, tu, ma
+; CHECK-NEXT: vslideup.vi v11, v10, 5
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vmulhu.vv v9, v8, v9
+; CHECK-NEXT: lui a1, %hi(.LCPI65_1)
+; CHECK-NEXT: addi a1, a1, %lo(.LCPI65_1)
+; CHECK-NEXT: vle8.v v10, (a1)
+; CHECK-NEXT: vsub.vv v8, v8, v9
; CHECK-NEXT: vmulhu.vv v8, v8, v11
-; CHECK-NEXT: vsub.vv v9, v9, v8
-; CHECK-NEXT: vmulhu.vv v9, v9, v12
-; CHECK-NEXT: vadd.vv v9, v9, v8
-; CHECK-NEXT: li a2, 513
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vmv.s.x v0, a2
-; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 4
-; CHECK-NEXT: vmerge.vim v10, v8, 1, v0
-; CHECK-NEXT: addi a1, a1, 78
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vmv.s.x v0, a1
-; CHECK-NEXT: lui a1, 8
-; CHECK-NEXT: addi a1, a1, 304
-; CHECK-NEXT: vmv.s.x v8, a1
-; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmerge.vim v10, v10, 3, v0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vim v8, v10, 2, v0
-; CHECK-NEXT: vsrl.vv v8, v9, v8
+; CHECK-NEXT: vadd.vv v8, v8, v9
+; CHECK-NEXT: vsrl.vv v8, v8, v10
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: ret
%a = load <16 x i8>, ptr %x
@@ -1153,32 +1131,23 @@ define void @mulhu_v8i16(ptr %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vmv.v.i v9, 0
-; CHECK-NEXT: lui a1, 1048568
-; CHECK-NEXT: vmv.v.i v10, 0
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v10, a1
; CHECK-NEXT: lui a1, %hi(.LCPI66_0)
; CHECK-NEXT: addi a1, a1, %lo(.LCPI66_0)
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vle16.v v11, (a1)
-; CHECK-NEXT: vmv.v.i v12, 1
+; CHECK-NEXT: vle16.v v9, (a1)
+; CHECK-NEXT: lui a1, 1048568
+; CHECK-NEXT: vmv.s.x v10, a1
+; CHECK-NEXT: vmv.v.i v11, 0
; CHECK-NEXT: vsetivli zero, 7, e16, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v9, v12, 6
+; CHECK-NEXT: vslideup.vi v11, v10, 6
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vsrl.vv v9, v8, v9
-; CHECK-NEXT: vmulhu.vv v9, v9, v11
+; CHECK-NEXT: vmulhu.vv v9, v8, v9
+; CHECK-NEXT: lui a1, %hi(.LCPI66_1)
+; CHECK-NEXT: addi a1, a1, %lo(.LCPI66_1)
+; CHECK-NEXT: vle16.v v10, (a1)
; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: vmulhu.vv v8, v8, v10
+; CHECK-NEXT: vmulhu.vv v8, v8, v11
; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: li a1, 33
-; CHECK-NEXT: vmv.s.x v0, a1
-; CHECK-NEXT: vmv.v.i v9, 3
-; CHECK-NEXT: vmerge.vim v9, v9, 2, v0
-; CHECK-NEXT: vsetivli zero, 7, e16, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v9, v12, 6
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vsrl.vv v8, v8, v9
+; CHECK-NEXT: vsrl.vv v8, v8, v10
; CHECK-NEXT: vse16.v v8, (a0)
; CHECK-NEXT: ret
%a = load <8 x i16>, ptr %x
@@ -1225,18 +1194,9 @@ define void @mulhu_v4i32(ptr %x) {
; CHECK-NEXT: lui a1, %hi(.LCPI68_0)
; CHECK-NEXT: addi a1, a1, %lo(.LCPI68_0)
; CHECK-NEXT: vle32.v v9, (a1)
-; CHECK-NEXT: lui a1, 524288
-; CHECK-NEXT: vmv.s.x v10, a1
-; CHECK-NEXT: vmv.v.i v11, 0
-; CHECK-NEXT: vsetivli zero, 3, e32, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v11, v10, 2
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vmulhu.vv v9, v8, v9
-; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: vmulhu.vv v8, v8, v11
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: lui a1, 4128
-; CHECK-NEXT: addi a1, a1, 514
+; CHECK-NEXT: vmulhu.vv v8, v8, v9
+; CHECK-NEXT: lui a1, 8192
+; CHECK-NEXT: addi a1, a1, 256
; CHECK-NEXT: vmv.s.x v9, a1
; CHECK-NEXT: vsext.vf4 v10, v9
; CHECK-NEXT: vsrl.vv v8, v8, v10
@@ -1253,19 +1213,16 @@ define void @mulhu_v2i64(ptr %x) {
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: lui a1, %hi(.LCPI69_0)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI69_0)
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vle32.v v9, (a1)
+; RV32-NEXT: vmv.v.x v9, a1
+; RV32-NEXT: vmv.v.i v0, 3
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: vmerge.vxm v9, v9, a1, v0
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vmulhu.vv v8, v8, v9
-; RV32-NEXT: lui a1, 32
-; RV32-NEXT: addi a1, a1, 1
-; RV32-NEXT: vmv.s.x v9, a1
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vsext.vf4 v10, v9
-; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT: vsrl.vv v8, v8, v10
; RV32-NEXT: vse64.v v8, (a0)
; RV32-NEXT: ret
;
@@ -1273,22 +1230,19 @@ define void @mulhu_v2i64(ptr %x) {
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: lui a1, 838861
-; RV64-NEXT: addiw a1, a1, -819
+; RV64-NEXT: lui a1, 209715
+; RV64-NEXT: addiw a1, a1, 819
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: vmv.v.x v9, a1
-; RV64-NEXT: lui a1, 699051
-; RV64-NEXT: addiw a1, a1, -1365
+; RV64-NEXT: lui a1, 349525
+; RV64-NEXT: addiw a1, a1, 1365
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: vsetvli zero, zero, e64, m1, tu, ma
; RV64-NEXT: vmv.s.x v9, a1
; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma
; RV64-NEXT: vmulhu.vv v8, v8, v9
-; RV64-NEXT: vid.v v9
-; RV64-NEXT: vadd.vi v9, v9, 1
-; RV64-NEXT: vsrl.vv v8, v8, v9
; RV64-NEXT: vse64.v v8, (a0)
; RV64-NEXT: ret
%a = load <2 x i64>, ptr %x
@@ -1302,18 +1256,18 @@ define void @mulhs_v16i8(ptr %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: li a1, -123
+; CHECK-NEXT: li a1, 33
; CHECK-NEXT: vmv.v.x v9, a1
; CHECK-NEXT: lui a1, 5
; CHECK-NEXT: addi a1, a1, -1452
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vmv.s.x v0, a1
-; CHECK-NEXT: li a1, 57
+; CHECK-NEXT: li a1, 113
; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT: vmerge.vxm v9, v9, a1, v0
; CHECK-NEXT: vmulhu.vv v8, v8, v9
-; CHECK-NEXT: vmv.v.i v9, 7
-; CHECK-NEXT: vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT: vmv.v.i v9, 5
+; CHECK-NEXT: vmerge.vim v9, v9, 2, v0
; CHECK-NEXT: vsrl.vv v8, v8, v9
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: ret
@@ -3260,46 +3214,44 @@ define void @mulhu_v32i8(ptr %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vle8.v v10, (a0)
-; CHECK-NEXT: vmv.v.i v12, 0
-; CHECK-NEXT: lui a1, 163907
-; CHECK-NEXT: addi a1, a1, -2044
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmv.s.x v0, a1
-; CHECK-NEXT: lui a1, 66049
-; CHECK-NEXT: addi a1, a1, 32
-; CHECK-NEXT: vmv.s.x v8, a1
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vmv.v.i v10, 0
; CHECK-NEXT: lui a1, %hi(.LCPI181_0)
; CHECK-NEXT: addi a1, a1, %lo(.LCPI181_0)
-; CHECK-NEXT: vle8.v v14, (a1)
+; CHECK-NEXT: vle8.v v12, (a1)
+; CHECK-NEXT: lui a1, 512
+; CHECK-NEXT: addi a1, a1, 32
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmv.s.x v0, a1
; CHECK-NEXT: li a1, -128
; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmerge.vxm v16, v12, a1, v0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vim v8, v12, 1, v0
-; CHECK-NEXT: vsrl.vv v8, v10, v8
-; CHECK-NEXT: vmulhu.vv v8, v8, v14
-; CHECK-NEXT: vsub.vv v10, v10, v8
-; CHECK-NEXT: vmulhu.vv v10, v10, v16
-; CHECK-NEXT: vadd.vv v10, v10, v8
-; CHECK-NEXT: lui a1, 8208
-; CHECK-NEXT: addi a1, a1, 513
+; CHECK-NEXT: vmerge.vxm v10, v10, a1, v0
+; CHECK-NEXT: vmulhu.vv v12, v8, v12
+; CHECK-NEXT: vsub.vv v8, v8, v12
+; CHECK-NEXT: vmulhu.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v10, v8, v12
+; CHECK-NEXT: vmv.v.i v12, 3
+; CHECK-NEXT: lui a1, 16528
+; CHECK-NEXT: addi a1, a1, 1033
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vmv.s.x v0, a1
+; CHECK-NEXT: lui a1, 32
+; CHECK-NEXT: addi a1, a1, 2
+; CHECK-NEXT: vmv.s.x v8, a1
; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 4
-; CHECK-NEXT: vmerge.vim v12, v8, 1, v0
-; CHECK-NEXT: lui a1, 66785
-; CHECK-NEXT: addi a1, a1, 78
+; CHECK-NEXT: vmerge.vim v12, v12, 2, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v12, v12, 1, v0
+; CHECK-NEXT: lui a1, 3328
+; CHECK-NEXT: addi a1, a1, 208
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vmv.s.x v0, a1
-; CHECK-NEXT: lui a1, 529160
-; CHECK-NEXT: addi a1, a1, 304
+; CHECK-NEXT: lui a1, 720907
; CHECK-NEXT: vmv.s.x v8, a1
; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmerge.vim v12, v12, 3, v0
+; CHECK-NEXT: vmerge.vim v12, v12, 0, v0
; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vim v8, v12, 2, v0
+; CHECK-NEXT: vmerge.vim v8, v12, 4, v0
; CHECK-NEXT: vsrl.vv v8, v10, v8
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: ret
@@ -3313,37 +3265,36 @@ define void @mulhu_v16i16(ptr %x) {
; RV32-LABEL: mulhu_v16i16:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT: vle16.v v10, (a0)
-; RV32-NEXT: li a1, 257
-; RV32-NEXT: vmv.s.x v0, a1
-; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: lui a1, 1048568
-; RV32-NEXT: vmerge.vxm v12, v8, a1, v0
+; RV32-NEXT: vle16.v v8, (a0)
; RV32-NEXT: lui a1, 4
; RV32-NEXT: addi a1, a1, 64
-; RV32-NEXT: vmv.s.x v8, a1
-; RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; RV32-NEXT: vmv.v.i v9, 0
+; RV32-NEXT: vmv.s.x v0, a1
; RV32-NEXT: lui a1, %hi(.LCPI182_0)
; RV32-NEXT: addi a1, a1, %lo(.LCPI182_0)
-; RV32-NEXT: vle16.v v14, (a1)
-; RV32-NEXT: vmv1r.v v0, v8
-; RV32-NEXT: vmerge.vim v9, v9, 1, v0
-; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v16, v9
-; RV32-NEXT: vsrl.vv v16, v10, v16
-; RV32-NEXT: vmulhu.vv v14, v16, v14
-; RV32-NEXT: vsub.vv v10, v10, v14
-; RV32-NEXT: vmulhu.vv v10, v10, v12
-; RV32-NEXT: vadd.vv v10, v10, v14
+; RV32-NEXT: vle16.v v10, (a1)
+; RV32-NEXT: vmv.v.i v12, 0
+; RV32-NEXT: lui a1, 1048568
+; RV32-NEXT: vmerge.vxm v12, v12, a1, v0
+; RV32-NEXT: vmulhu.vv v10, v8, v10
+; RV32-NEXT: vsub.vv v8, v8, v10
+; RV32-NEXT: vmulhu.vv v8, v8, v12
+; RV32-NEXT: vadd.vv v10, v8, v10
; RV32-NEXT: lui a1, 2
-; RV32-NEXT: addi a1, a1, 289
+; RV32-NEXT: addi a1, a1, 546
+; RV32-NEXT: vmv.s.x v0, a1
+; RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: vmerge.vim v9, v8, 3, v0
+; RV32-NEXT: li a1, 1028
+; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; RV32-NEXT: vmv.s.x v0, a1
+; RV32-NEXT: lui a1, 1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vmv.s.x v8, a1
; RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; RV32-NEXT: vmv.v.i v9, 3
-; RV32-NEXT: vmerge.vim v9, v9, 2, v0
+; RV32-NEXT: vmerge.vim v9, v9, 1, v0
; RV32-NEXT: vmv1r.v v0, v8
-; RV32-NEXT: vmerge.vim v8, v9, 1, v0
+; RV32-NEXT: vmerge.vim v8, v9, 2, v0
; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; RV32-NEXT: vsext.vf2 v12, v8
; RV32-NEXT: vsrl.vv v8, v10, v12
@@ -3354,27 +3305,23 @@ define void @mulhu_v16i16(ptr %x) {
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; RV64-NEXT: vle16.v v8, (a0)
-; RV64-NEXT: li a1, 257
+; RV64-NEXT: lui a1, 4
+; RV64-NEXT: addi a1, a1, 64
; RV64-NEXT: vmv.s.x v0, a1
-; RV64-NEXT: vmv.v.i v10, 0
-; RV64-NEXT: lui a1, 1048568
-; RV64-NEXT: vmerge.vxm v10, v10, a1, v0
; RV64-NEXT: lui a1, %hi(.LCPI182_0)
; RV64-NEXT: addi a1, a1, %lo(.LCPI182_0)
-; RV64-NEXT: vle16.v v12, (a1)
-; RV64-NEXT: li a1, 1
-; RV64-NEXT: slli a1, a1, 48
-; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT: vmv.v.x v14, a1
-; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v14
-; RV64-NEXT: vsrl.vv v14, v8, v16
-; RV64-NEXT: vmulhu.vv v12, v14, v12
-; RV64-NEXT: lui a1, %hi(.LCPI182_1)
-; RV64-NEXT: ld a1, %lo(.LCPI182_1)(a1)
-; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: vmulhu.vv v8, v8, v10
-; RV64-NEXT: vadd.vv v8, v8, v12
+; RV64-NEXT: vle16.v v10, (a1)
+; RV64-NEXT: vmv.v.i v12, 0
+; RV64-NEXT: lui a1, 1048568
+; RV64-NEXT: vmerge.vxm v12, v12, a1, v0
+; RV64-NEXT: vmulhu.vv v10, v8, v10
+; RV64-NEXT: vsub.vv v8, v8, v10
+; RV64-NEXT: vmulhu.vv v8, v8, v12
+; RV64-NEXT: vadd.vv v8, v8, v10
+; RV64-NEXT: lui a1, 12320
+; RV64-NEXT: addiw a1, a1, 1
+; RV64-NEXT: slli a1, a1, 16
+; RV64-NEXT: addi a1, a1, 768
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT: vmv.v.x v10, a1
; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
@@ -3393,20 +3340,12 @@ define void @mulhu_v8i32(ptr %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: li a1, 68
-; CHECK-NEXT: vmv.s.x v0, a1
; CHECK-NEXT: lui a1, %hi(.LCPI183_0)
; CHECK-NEXT: addi a1, a1, %lo(.LCPI183_0)
; CHECK-NEXT: vle32.v v10, (a1)
-; CHECK-NEXT: vmv.v.i v12, 0
-; CHECK-NEXT: lui a1, 524288
-; CHECK-NEXT: vmerge.vxm v12, v12, a1, v0
-; CHECK-NEXT: vmulhu.vv v10, v8, v10
-; CHECK-NEXT: vsub.vv v8, v8, v10
-; CHECK-NEXT: vmulhu.vv v8, v8, v12
-; CHECK-NEXT: vadd.vv v8, v8, v10
-; CHECK-NEXT: lui a1, 4128
-; CHECK-NEXT: addi a1, a1, 514
+; CHECK-NEXT: vmulhu.vv v8, v8, v10
+; CHECK-NEXT: lui a1, 8192
+; CHECK-NEXT: addi a1, a1, 256
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vmv.v.x v10, a1
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
@@ -3430,25 +3369,16 @@ define void @mulhu_v4i64(ptr %x) {
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT: vle32.v v10, (a1)
; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT: vmulhu.vv v10, v8, v10
-; RV32-NEXT: lui a1, 524288
-; RV32-NEXT: vmv.s.x v12, a1
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.i v14, 0
-; RV32-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; RV32-NEXT: vslideup.vi v14, v12, 5
-; RV32-NEXT: lui a1, %hi(.LCPI184_1)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI184_1)
+; RV32-NEXT: vmulhu.vv v8, v8, v10
; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT: vle8.v v12, (a1)
-; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: vmulhu.vv v8, v8, v14
-; RV32-NEXT: vadd.vv v8, v8, v10
+; RV32-NEXT: vmv.v.i v10, 3
+; RV32-NEXT: vmv.v.i v11, 0
+; RV32-NEXT: vsetivli zero, 7, e8, mf2, tu, ma
+; RV32-NEXT: vslideup.vi v11, v10, 6
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf4 v10, v12
+; RV32-NEXT: vsext.vf4 v12, v11
; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT: vsrl.vv v8, v8, v10
+; RV32-NEXT: vsrl.vv v8, v8, v12
; RV32-NEXT: vse64.v v8, (a0)
; RV32-NEXT: ret
;
@@ -3459,19 +3389,8 @@ define void @mulhu_v4i64(ptr %x) {
; RV64-NEXT: lui a1, %hi(.LCPI184_0)
; RV64-NEXT: addi a1, a1, %lo(.LCPI184_0)
; RV64-NEXT: vle64.v v10, (a1)
-; RV64-NEXT: vmulhu.vv v10, v8, v10
-; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: li a1, -1
-; RV64-NEXT: slli a1, a1, 63
-; RV64-NEXT: vmv.s.x v12, a1
-; RV64-NEXT: vmv.v.i v14, 0
-; RV64-NEXT: vsetivli zero, 3, e64, m2, tu, ma
-; RV64-NEXT: vslideup.vi v14, v12, 2
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT: vmulhu.vv v8, v8, v14
-; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: lui a1, 12320
-; RV64-NEXT: addi a1, a1, 513
+; RV64-NEXT: vmulhu.vv v8, v8, v10
+; RV64-NEXT: lui a1, 12288
; RV64-NEXT: vmv.s.x v10, a1
; RV64-NEXT: vsext.vf8 v12, v10
; RV64-NEXT: vsrl.vv v8, v8, v12
@@ -3493,11 +3412,11 @@ define void @mulhs_v32i8(ptr %x) {
; CHECK-NEXT: addi a1, a1, -1452
; CHECK-NEXT: vmv.s.x v0, a1
; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v10, 7
-; CHECK-NEXT: vmerge.vim v10, v10, 1, v0
-; CHECK-NEXT: li a1, -123
+; CHECK-NEXT: vmv.v.i v10, 5
+; CHECK-NEXT: vmerge.vim v10, v10, 2, v0
+; CHECK-NEXT: li a1, 33
; CHECK-NEXT: vmv.v.x v12, a1
-; CHECK-NEXT: li a1, 57
+; CHECK-NEXT: li a1, 113
; CHECK-NEXT: vmerge.vxm v12, v12, a1, v0
; CHECK-NEXT: vmulhu.vv v8, v8, v12
; CHECK-NEXT: vsrl.vv v8, v8, v10
@@ -5573,9 +5492,9 @@ define void @mulhu_vx_v16i8(ptr %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: li a1, 57
+; CHECK-NEXT: li a1, 113
; CHECK-NEXT: vmulhu.vx v8, v8, a1
-; CHECK-NEXT: vsrl.vi v8, v8, 1
+; CHECK-NEXT: vsrl.vi v8, v8, 2
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: ret
%a = load <16 x i8>, ptr %x
@@ -5591,11 +5510,7 @@ define void @mulhu_vx_v8i16(ptr %x) {
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: lui a1, 2
; CHECK-NEXT: addi a1, a1, 1171
-; CHECK-NEXT: vmulhu.vx v9, v8, a1
-; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v8, v8, 1
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v8, v8, 2
+; CHECK-NEXT: vmulhu.vx v8, v8, a1
; CHECK-NEXT: vse16.v v8, (a0)
; CHECK-NEXT: ret
%a = load <8 x i16>, ptr %x
@@ -5609,10 +5524,9 @@ define void @mulhu_vx_v4i32(ptr %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: lui a1, 838861
-; CHECK-NEXT: addi a1, a1, -819
+; CHECK-NEXT: lui a1, 209715
+; CHECK-NEXT: addi a1, a1, 819
; CHECK-NEXT: vmulhu.vx v8, v8, a1
-; CHECK-NEXT: vsrl.vi v8, v8, 2
; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: ret
%a = load <4 x i32>, ptr %x
@@ -5624,33 +5538,26 @@ define void @mulhu_vx_v4i32(ptr %x) {
define void @mulhu_vx_v2i64(ptr %x) {
; RV32-LABEL: mulhu_vx_v2i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: lui a1, 699051
-; RV32-NEXT: addi a2, a1, -1366
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: addi a1, a1, -1365
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vlse64.v v9, (a1), zero
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vmv.v.x v9, a1
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vmulhu.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v8, v8, 1
; RV32-NEXT: vse64.v v8, (a0)
-; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: mulhu_vx_v2i64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: lui a1, 699051
-; RV64-NEXT: addiw a1, a1, -1365
+; RV64-NEXT: lui a1, 349525
+; RV64-NEXT: addiw a1, a1, 1365
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: vmulhu.vx v8, v8, a1
-; RV64-NEXT: vsrl.vi v8, v8, 1
; RV64-NEXT: vse64.v v8, (a0)
; RV64-NEXT: ret
%a = load <2 x i64>, ptr %x
@@ -5664,9 +5571,9 @@ define void @mulhs_vx_v16i8(ptr %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: li a1, -123
+; CHECK-NEXT: li a1, 33
; CHECK-NEXT: vmulhu.vx v8, v8, a1
-; CHECK-NEXT: vsrl.vi v8, v8, 7
+; CHECK-NEXT: vsrl.vi v8, v8, 5
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: ret
%a = load <16 x i8>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode.ll
index 4f2fb937ca73f..6a938a679f57c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode.ll
@@ -29,10 +29,10 @@ define <vscale x 1 x i8> @vdivu_vx_nxv1i8(<vscale x 1 x i8> %va, i8 signext %b)
define <vscale x 1 x i8> @vdivu_vi_nxv1i8_0(<vscale x 1 x i8> %va) {
; CHECK-LABEL: vdivu_vi_nxv1i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 5
+; CHECK-NEXT: vsrl.vi v8, v8, 7
; CHECK-NEXT: ret
%vc = udiv <vscale x 1 x i8> %va, splat (i8 -7)
ret <vscale x 1 x i8> %vc
@@ -83,10 +83,10 @@ define <vscale x 2 x i8> @vdivu_vx_nxv2i8(<vscale x 2 x i8> %va, i8 signext %b)
define <vscale x 2 x i8> @vdivu_vi_nxv2i8_0(<vscale x 2 x i8> %va) {
; CHECK-LABEL: vdivu_vi_nxv2i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 5
+; CHECK-NEXT: vsrl.vi v8, v8, 7
; CHECK-NEXT: ret
%vc = udiv <vscale x 2 x i8> %va, splat (i8 -7)
ret <vscale x 2 x i8> %vc
@@ -117,10 +117,10 @@ define <vscale x 4 x i8> @vdivu_vx_nxv4i8(<vscale x 4 x i8> %va, i8 signext %b)
define <vscale x 4 x i8> @vdivu_vi_nxv4i8_0(<vscale x 4 x i8> %va) {
; CHECK-LABEL: vdivu_vi_nxv4i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 5
+; CHECK-NEXT: vsrl.vi v8, v8, 7
; CHECK-NEXT: ret
%vc = udiv <vscale x 4 x i8> %va, splat (i8 -7)
ret <vscale x 4 x i8> %vc
@@ -151,10 +151,10 @@ define <vscale x 8 x i8> @vdivu_vx_nxv8i8(<vscale x 8 x i8> %va, i8 signext %b)
define <vscale x 8 x i8> @vdivu_vi_nxv8i8_0(<vscale x 8 x i8> %va) {
; CHECK-LABEL: vdivu_vi_nxv8i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 5
+; CHECK-NEXT: vsrl.vi v8, v8, 7
; CHECK-NEXT: ret
%vc = udiv <vscale x 8 x i8> %va, splat (i8 -7)
ret <vscale x 8 x i8> %vc
@@ -185,10 +185,10 @@ define <vscale x 16 x i8> @vdivu_vx_nxv16i8(<vscale x 16 x i8> %va, i8 signext %
define <vscale x 16 x i8> @vdivu_vi_nxv16i8_0(<vscale x 16 x i8> %va) {
; CHECK-LABEL: vdivu_vi_nxv16i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 5
+; CHECK-NEXT: vsrl.vi v8, v8, 7
; CHECK-NEXT: ret
%vc = udiv <vscale x 16 x i8> %va, splat (i8 -7)
ret <vscale x 16 x i8> %vc
@@ -219,10 +219,10 @@ define <vscale x 32 x i8> @vdivu_vx_nxv32i8(<vscale x 32 x i8> %va, i8 signext %
define <vscale x 32 x i8> @vdivu_vi_nxv32i8_0(<vscale x 32 x i8> %va) {
; CHECK-LABEL: vdivu_vi_nxv32i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 5
+; CHECK-NEXT: vsrl.vi v8, v8, 7
; CHECK-NEXT: ret
%vc = udiv <vscale x 32 x i8> %va, splat (i8 -7)
ret <vscale x 32 x i8> %vc
@@ -253,10 +253,10 @@ define <vscale x 64 x i8> @vdivu_vx_nxv64i8(<vscale x 64 x i8> %va, i8 signext %
define <vscale x 64 x i8> @vdivu_vi_nxv64i8_0(<vscale x 64 x i8> %va) {
; CHECK-LABEL: vdivu_vi_nxv64i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 5
+; CHECK-NEXT: vsrl.vi v8, v8, 7
; CHECK-NEXT: ret
%vc = udiv <vscale x 64 x i8> %va, splat (i8 -7)
ret <vscale x 64 x i8> %vc
@@ -287,11 +287,11 @@ define <vscale x 1 x i16> @vdivu_vx_nxv1i16(<vscale x 1 x i16> %va, i16 signext
define <vscale x 1 x i16> @vdivu_vi_nxv1i16_0(<vscale x 1 x i16> %va) {
; CHECK-LABEL: vdivu_vi_nxv1i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 13
+; CHECK-NEXT: vsrl.vi v8, v8, 15
; CHECK-NEXT: ret
%vc = udiv <vscale x 1 x i16> %va, splat (i16 -7)
ret <vscale x 1 x i16> %vc
@@ -322,11 +322,11 @@ define <vscale x 2 x i16> @vdivu_vx_nxv2i16(<vscale x 2 x i16> %va, i16 signext
define <vscale x 2 x i16> @vdivu_vi_nxv2i16_0(<vscale x 2 x i16> %va) {
; CHECK-LABEL: vdivu_vi_nxv2i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 13
+; CHECK-NEXT: vsrl.vi v8, v8, 15
; CHECK-NEXT: ret
%vc = udiv <vscale x 2 x i16> %va, splat (i16 -7)
ret <vscale x 2 x i16> %vc
@@ -357,11 +357,11 @@ define <vscale x 4 x i16> @vdivu_vx_nxv4i16(<vscale x 4 x i16> %va, i16 signext
define <vscale x 4 x i16> @vdivu_vi_nxv4i16_0(<vscale x 4 x i16> %va) {
; CHECK-LABEL: vdivu_vi_nxv4i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 13
+; CHECK-NEXT: vsrl.vi v8, v8, 15
; CHECK-NEXT: ret
%vc = udiv <vscale x 4 x i16> %va, splat (i16 -7)
ret <vscale x 4 x i16> %vc
@@ -392,11 +392,11 @@ define <vscale x 8 x i16> @vdivu_vx_nxv8i16(<vscale x 8 x i16> %va, i16 signext
define <vscale x 8 x i16> @vdivu_vi_nxv8i16_0(<vscale x 8 x i16> %va) {
; CHECK-LABEL: vdivu_vi_nxv8i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 13
+; CHECK-NEXT: vsrl.vi v8, v8, 15
; CHECK-NEXT: ret
%vc = udiv <vscale x 8 x i16> %va, splat (i16 -7)
ret <vscale x 8 x i16> %vc
@@ -427,11 +427,11 @@ define <vscale x 16 x i16> @vdivu_vx_nxv16i16(<vscale x 16 x i16> %va, i16 signe
define <vscale x 16 x i16> @vdivu_vi_nxv16i16_0(<vscale x 16 x i16> %va) {
; CHECK-LABEL: vdivu_vi_nxv16i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 13
+; CHECK-NEXT: vsrl.vi v8, v8, 15
; CHECK-NEXT: ret
%vc = udiv <vscale x 16 x i16> %va, splat (i16 -7)
ret <vscale x 16 x i16> %vc
@@ -462,11 +462,11 @@ define <vscale x 32 x i16> @vdivu_vx_nxv32i16(<vscale x 32 x i16> %va, i16 signe
define <vscale x 32 x i16> @vdivu_vi_nxv32i16_0(<vscale x 32 x i16> %va) {
; CHECK-LABEL: vdivu_vi_nxv32i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 13
+; CHECK-NEXT: vsrl.vi v8, v8, 15
; CHECK-NEXT: ret
%vc = udiv <vscale x 32 x i16> %va, splat (i16 -7)
ret <vscale x 32 x i16> %vc
@@ -497,11 +497,11 @@ define <vscale x 1 x i32> @vdivu_vx_nxv1i32(<vscale x 1 x i32> %va, i32 signext
define <vscale x 1 x i32> @vdivu_vi_nxv1i32_0(<vscale x 1 x i32> %va) {
; CHECK-LABEL: vdivu_vi_nxv1i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 131072
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 29
+; CHECK-NEXT: vsrl.vi v8, v8, 31
; CHECK-NEXT: ret
%vc = udiv <vscale x 1 x i32> %va, splat (i32 -7)
ret <vscale x 1 x i32> %vc
@@ -532,11 +532,11 @@ define <vscale x 2 x i32> @vdivu_vx_nxv2i32(<vscale x 2 x i32> %va, i32 signext
define <vscale x 2 x i32> @vdivu_vi_nxv2i32_0(<vscale x 2 x i32> %va) {
; CHECK-LABEL: vdivu_vi_nxv2i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 131072
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 29
+; CHECK-NEXT: vsrl.vi v8, v8, 31
; CHECK-NEXT: ret
%vc = udiv <vscale x 2 x i32> %va, splat (i32 -7)
ret <vscale x 2 x i32> %vc
@@ -567,11 +567,11 @@ define <vscale x 4 x i32> @vdivu_vx_nxv4i32(<vscale x 4 x i32> %va, i32 signext
define <vscale x 4 x i32> @vdivu_vi_nxv4i32_0(<vscale x 4 x i32> %va) {
; CHECK-LABEL: vdivu_vi_nxv4i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 131072
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 29
+; CHECK-NEXT: vsrl.vi v8, v8, 31
; CHECK-NEXT: ret
%vc = udiv <vscale x 4 x i32> %va, splat (i32 -7)
ret <vscale x 4 x i32> %vc
@@ -602,11 +602,11 @@ define <vscale x 8 x i32> @vdivu_vx_nxv8i32(<vscale x 8 x i32> %va, i32 signext
define <vscale x 8 x i32> @vdivu_vi_nxv8i32_0(<vscale x 8 x i32> %va) {
; CHECK-LABEL: vdivu_vi_nxv8i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 131072
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 29
+; CHECK-NEXT: vsrl.vi v8, v8, 31
; CHECK-NEXT: ret
%vc = udiv <vscale x 8 x i32> %va, splat (i32 -7)
ret <vscale x 8 x i32> %vc
@@ -637,11 +637,11 @@ define <vscale x 16 x i32> @vdivu_vx_nxv16i32(<vscale x 16 x i32> %va, i32 signe
define <vscale x 16 x i32> @vdivu_vi_nxv16i32_0(<vscale x 16 x i32> %va) {
; CHECK-LABEL: vdivu_vi_nxv16i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 131072
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 29
+; CHECK-NEXT: vsrl.vi v8, v8, 31
; CHECK-NEXT: ret
%vc = udiv <vscale x 16 x i32> %va, splat (i32 -7)
ret <vscale x 16 x i32> %vc
@@ -687,15 +687,15 @@ define <vscale x 1 x i64> @vdivu_vi_nxv1i64_0(<vscale x 1 x i64> %va) {
; RV32-V: # %bb.0:
; RV32-V-NEXT: addi sp, sp, -16
; RV32-V-NEXT: .cfi_def_cfa_offset 16
-; RV32-V-NEXT: lui a0, 131072
+; RV32-V-NEXT: lui a0, 524288
; RV32-V-NEXT: sw a0, 12(sp)
-; RV32-V-NEXT: li a0, 1
+; RV32-V-NEXT: li a0, 3
; RV32-V-NEXT: sw a0, 8(sp)
; RV32-V-NEXT: addi a0, sp, 8
; RV32-V-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; RV32-V-NEXT: vlse64.v v9, (a0), zero
; RV32-V-NEXT: vmulhu.vv v8, v8, v9
-; RV32-V-NEXT: li a0, 61
+; RV32-V-NEXT: li a0, 63
; RV32-V-NEXT: vsrl.vx v8, v8, a0
; RV32-V-NEXT: addi sp, sp, 16
; RV32-V-NEXT: ret
@@ -709,12 +709,12 @@ define <vscale x 1 x i64> @vdivu_vi_nxv1i64_0(<vscale x 1 x i64> %va) {
;
; RV64-V-LABEL: vdivu_vi_nxv1i64_0:
; RV64-V: # %bb.0:
-; RV64-V-NEXT: li a0, 1
-; RV64-V-NEXT: slli a0, a0, 61
-; RV64-V-NEXT: addi a0, a0, 1
+; RV64-V-NEXT: li a0, -1
+; RV64-V-NEXT: slli a0, a0, 63
+; RV64-V-NEXT: addi a0, a0, 3
; RV64-V-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; RV64-V-NEXT: vmulhu.vx v8, v8, a0
-; RV64-V-NEXT: li a0, 61
+; RV64-V-NEXT: li a0, 63
; RV64-V-NEXT: vsrl.vx v8, v8, a0
; RV64-V-NEXT: ret
%vc = udiv <vscale x 1 x i64> %va, splat (i64 -7)
@@ -784,15 +784,15 @@ define <vscale x 2 x i64> @vdivu_vi_nxv2i64_0(<vscale x 2 x i64> %va) {
; RV32-V: # %bb.0:
; RV32-V-NEXT: addi sp, sp, -16
; RV32-V-NEXT: .cfi_def_cfa_offset 16
-; RV32-V-NEXT: lui a0, 131072
+; RV32-V-NEXT: lui a0, 524288
; RV32-V-NEXT: sw a0, 12(sp)
-; RV32-V-NEXT: li a0, 1
+; RV32-V-NEXT: li a0, 3
; RV32-V-NEXT: sw a0, 8(sp)
; RV32-V-NEXT: addi a0, sp, 8
; RV32-V-NEXT: vsetvli a1, zero, e64, m2, ta, ma
; RV32-V-NEXT: vlse64.v v10, (a0), zero
; RV32-V-NEXT: vmulhu.vv v8, v8, v10
-; RV32-V-NEXT: li a0, 61
+; RV32-V-NEXT: li a0, 63
; RV32-V-NEXT: vsrl.vx v8, v8, a0
; RV32-V-NEXT: addi sp, sp, 16
; RV32-V-NEXT: ret
@@ -806,12 +806,12 @@ define <vscale x 2 x i64> @vdivu_vi_nxv2i64_0(<vscale x 2 x i64> %va) {
;
; RV64-V-LABEL: vdivu_vi_nxv2i64_0:
; RV64-V: # %bb.0:
-; RV64-V-NEXT: li a0, 1
-; RV64-V-NEXT: slli a0, a0, 61
-; RV64-V-NEXT: addi a0, a0, 1
+; RV64-V-NEXT: li a0, -1
+; RV64-V-NEXT: slli a0, a0, 63
+; RV64-V-NEXT: addi a0, a0, 3
; RV64-V-NEXT: vsetvli a1, zero, e64, m2, ta, ma
; RV64-V-NEXT: vmulhu.vx v8, v8, a0
-; RV64-V-NEXT: li a0, 61
+; RV64-V-NEXT: li a0, 63
; RV64-V-NEXT: vsrl.vx v8, v8, a0
; RV64-V-NEXT: ret
%vc = udiv <vscale x 2 x i64> %va, splat (i64 -7)
@@ -881,15 +881,15 @@ define <vscale x 4 x i64> @vdivu_vi_nxv4i64_0(<vscale x 4 x i64> %va) {
; RV32-V: # %bb.0:
; RV32-V-NEXT: addi sp, sp, -16
; RV32-V-NEXT: .cfi_def_cfa_offset 16
-; RV32-V-NEXT: lui a0, 131072
+; RV32-V-NEXT: lui a0, 524288
; RV32-V-NEXT: sw a0, 12(sp)
-; RV32-V-NEXT: li a0, 1
+; RV32-V-NEXT: li a0, 3
; RV32-V-NEXT: sw a0, 8(sp)
; RV32-V-NEXT: addi a0, sp, 8
; RV32-V-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; RV32-V-NEXT: vlse64.v v12, (a0), zero
; RV32-V-NEXT: vmulhu.vv v8, v8, v12
-; RV32-V-NEXT: li a0, 61
+; RV32-V-NEXT: li a0, 63
; RV32-V-NEXT: vsrl.vx v8, v8, a0
; RV32-V-NEXT: addi sp, sp, 16
; RV32-V-NEXT: ret
@@ -903,12 +903,12 @@ define <vscale x 4 x i64> @vdivu_vi_nxv4i64_0(<vscale x 4 x i64> %va) {
;
; RV64-V-LABEL: vdivu_vi_nxv4i64_0:
; RV64-V: # %bb.0:
-; RV64-V-NEXT: li a0, 1
-; RV64-V-NEXT: slli a0, a0, 61
-; RV64-V-NEXT: addi a0, a0, 1
+; RV64-V-NEXT: li a0, -1
+; RV64-V-NEXT: slli a0, a0, 63
+; RV64-V-NEXT: addi a0, a0, 3
; RV64-V-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; RV64-V-NEXT: vmulhu.vx v8, v8, a0
-; RV64-V-NEXT: li a0, 61
+; RV64-V-NEXT: li a0, 63
; RV64-V-NEXT: vsrl.vx v8, v8, a0
; RV64-V-NEXT: ret
%vc = udiv <vscale x 4 x i64> %va, splat (i64 -7)
@@ -978,15 +978,15 @@ define <vscale x 8 x i64> @vdivu_vi_nxv8i64_0(<vscale x 8 x i64> %va) {
; RV32-V: # %bb.0:
; RV32-V-NEXT: addi sp, sp, -16
; RV32-V-NEXT: .cfi_def_cfa_offset 16
-; RV32-V-NEXT: lui a0, 131072
+; RV32-V-NEXT: lui a0, 524288
; RV32-V-NEXT: sw a0, 12(sp)
-; RV32-V-NEXT: li a0, 1
+; RV32-V-NEXT: li a0, 3
; RV32-V-NEXT: sw a0, 8(sp)
; RV32-V-NEXT: addi a0, sp, 8
; RV32-V-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; RV32-V-NEXT: vlse64.v v16, (a0), zero
; RV32-V-NEXT: vmulhu.vv v8, v8, v16
-; RV32-V-NEXT: li a0, 61
+; RV32-V-NEXT: li a0, 63
; RV32-V-NEXT: vsrl.vx v8, v8, a0
; RV32-V-NEXT: addi sp, sp, 16
; RV32-V-NEXT: ret
@@ -1000,12 +1000,12 @@ define <vscale x 8 x i64> @vdivu_vi_nxv8i64_0(<vscale x 8 x i64> %va) {
;
; RV64-V-LABEL: vdivu_vi_nxv8i64_0:
; RV64-V: # %bb.0:
-; RV64-V-NEXT: li a0, 1
-; RV64-V-NEXT: slli a0, a0, 61
-; RV64-V-NEXT: addi a0, a0, 1
+; RV64-V-NEXT: li a0, -1
+; RV64-V-NEXT: slli a0, a0, 63
+; RV64-V-NEXT: addi a0, a0, 3
; RV64-V-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; RV64-V-NEXT: vmulhu.vx v8, v8, a0
-; RV64-V-NEXT: li a0, 61
+; RV64-V-NEXT: li a0, 63
; RV64-V-NEXT: vsrl.vx v8, v8, a0
; RV64-V-NEXT: ret
%vc = udiv <vscale x 8 x i64> %va, splat (i64 -7)
@@ -1069,11 +1069,7 @@ define <vscale x 8 x i32> @vdivu_vi_mask_nxv8i32(<vscale x 8 x i32> %va, <vscale
; CHECK-NEXT: lui a0, 149797
; CHECK-NEXT: addi a0, a0, -1755
; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; CHECK-NEXT: vmulhu.vx v12, v8, a0
-; CHECK-NEXT: vsub.vv v16, v8, v12
-; CHECK-NEXT: vsrl.vi v16, v16, 1
-; CHECK-NEXT: vadd.vv v12, v16, v12
-; CHECK-NEXT: vsrl.vi v8, v12, 2, v0.t
+; CHECK-NEXT: vmulhu.vx v8, v8, a0, v0.t
; CHECK-NEXT: ret
%vs = select <vscale x 8 x i1> %mask, <vscale x 8 x i32> splat (i32 7), <vscale x 8 x i32> splat (i32 1)
%vc = udiv <vscale x 8 x i32> %va, %vs
diff --git a/llvm/test/CodeGen/RISCV/rvv/vremu-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vremu-sdnode.ll
index ed40f5af4fa4c..c880ae4f44b75 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vremu-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vremu-sdnode.ll
@@ -29,10 +29,10 @@ define <vscale x 1 x i8> @vremu_vx_nxv1i8(<vscale x 1 x i8> %va, i8 signext %b)
define <vscale x 1 x i8> @vremu_vi_nxv1i8_0(<vscale x 1 x i8> %va) {
; CHECK-LABEL: vremu_vi_nxv1i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
; CHECK-NEXT: vmulhu.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v9, v9, 5
+; CHECK-NEXT: vsrl.vi v9, v9, 7
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v9
; CHECK-NEXT: ret
@@ -65,10 +65,10 @@ define <vscale x 2 x i8> @vremu_vx_nxv2i8(<vscale x 2 x i8> %va, i8 signext %b)
define <vscale x 2 x i8> @vremu_vi_nxv2i8_0(<vscale x 2 x i8> %va) {
; CHECK-LABEL: vremu_vi_nxv2i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
; CHECK-NEXT: vmulhu.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v9, v9, 5
+; CHECK-NEXT: vsrl.vi v9, v9, 7
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v9
; CHECK-NEXT: ret
@@ -101,10 +101,10 @@ define <vscale x 4 x i8> @vremu_vx_nxv4i8(<vscale x 4 x i8> %va, i8 signext %b)
define <vscale x 4 x i8> @vremu_vi_nxv4i8_0(<vscale x 4 x i8> %va) {
; CHECK-LABEL: vremu_vi_nxv4i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
; CHECK-NEXT: vmulhu.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v9, v9, 5
+; CHECK-NEXT: vsrl.vi v9, v9, 7
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v9
; CHECK-NEXT: ret
@@ -137,10 +137,10 @@ define <vscale x 8 x i8> @vremu_vx_nxv8i8(<vscale x 8 x i8> %va, i8 signext %b)
define <vscale x 8 x i8> @vremu_vi_nxv8i8_0(<vscale x 8 x i8> %va) {
; CHECK-LABEL: vremu_vi_nxv8i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vmulhu.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v9, v9, 5
+; CHECK-NEXT: vsrl.vi v9, v9, 7
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v9
; CHECK-NEXT: ret
@@ -173,10 +173,10 @@ define <vscale x 16 x i8> @vremu_vx_nxv16i8(<vscale x 16 x i8> %va, i8 signext %
define <vscale x 16 x i8> @vremu_vi_nxv16i8_0(<vscale x 16 x i8> %va) {
; CHECK-LABEL: vremu_vi_nxv16i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
; CHECK-NEXT: vmulhu.vx v10, v8, a0
-; CHECK-NEXT: vsrl.vi v10, v10, 5
+; CHECK-NEXT: vsrl.vi v10, v10, 7
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v10
; CHECK-NEXT: ret
@@ -209,10 +209,10 @@ define <vscale x 32 x i8> @vremu_vx_nxv32i8(<vscale x 32 x i8> %va, i8 signext %
define <vscale x 32 x i8> @vremu_vi_nxv32i8_0(<vscale x 32 x i8> %va) {
; CHECK-LABEL: vremu_vi_nxv32i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
; CHECK-NEXT: vmulhu.vx v12, v8, a0
-; CHECK-NEXT: vsrl.vi v12, v12, 5
+; CHECK-NEXT: vsrl.vi v12, v12, 7
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v12
; CHECK-NEXT: ret
@@ -245,10 +245,10 @@ define <vscale x 64 x i8> @vremu_vx_nxv64i8(<vscale x 64 x i8> %va, i8 signext %
define <vscale x 64 x i8> @vremu_vi_nxv64i8_0(<vscale x 64 x i8> %va) {
; CHECK-LABEL: vremu_vi_nxv64i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma
; CHECK-NEXT: vmulhu.vx v16, v8, a0
-; CHECK-NEXT: vsrl.vi v16, v16, 5
+; CHECK-NEXT: vsrl.vi v16, v16, 7
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v16
; CHECK-NEXT: ret
@@ -281,11 +281,11 @@ define <vscale x 1 x i16> @vremu_vx_nxv1i16(<vscale x 1 x i16> %va, i16 signext
define <vscale x 1 x i16> @vremu_vi_nxv1i16_0(<vscale x 1 x i16> %va) {
; CHECK-LABEL: vremu_vi_nxv1i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
; CHECK-NEXT: vmulhu.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v9, v9, 13
+; CHECK-NEXT: vsrl.vi v9, v9, 15
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v9
; CHECK-NEXT: ret
@@ -318,11 +318,11 @@ define <vscale x 2 x i16> @vremu_vx_nxv2i16(<vscale x 2 x i16> %va, i16 signext
define <vscale x 2 x i16> @vremu_vi_nxv2i16_0(<vscale x 2 x i16> %va) {
; CHECK-LABEL: vremu_vi_nxv2i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
; CHECK-NEXT: vmulhu.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v9, v9, 13
+; CHECK-NEXT: vsrl.vi v9, v9, 15
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v9
; CHECK-NEXT: ret
@@ -355,11 +355,11 @@ define <vscale x 4 x i16> @vremu_vx_nxv4i16(<vscale x 4 x i16> %va, i16 signext
define <vscale x 4 x i16> @vremu_vi_nxv4i16_0(<vscale x 4 x i16> %va) {
; CHECK-LABEL: vremu_vi_nxv4i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vmulhu.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v9, v9, 13
+; CHECK-NEXT: vsrl.vi v9, v9, 15
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v9
; CHECK-NEXT: ret
@@ -392,11 +392,11 @@ define <vscale x 8 x i16> @vremu_vx_nxv8i16(<vscale x 8 x i16> %va, i16 signext
define <vscale x 8 x i16> @vremu_vi_nxv8i16_0(<vscale x 8 x i16> %va) {
; CHECK-LABEL: vremu_vi_nxv8i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vmulhu.vx v10, v8, a0
-; CHECK-NEXT: vsrl.vi v10, v10, 13
+; CHECK-NEXT: vsrl.vi v10, v10, 15
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v10
; CHECK-NEXT: ret
@@ -429,11 +429,11 @@ define <vscale x 16 x i16> @vremu_vx_nxv16i16(<vscale x 16 x i16> %va, i16 signe
define <vscale x 16 x i16> @vremu_vi_nxv16i16_0(<vscale x 16 x i16> %va) {
; CHECK-LABEL: vremu_vi_nxv16i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; CHECK-NEXT: vmulhu.vx v12, v8, a0
-; CHECK-NEXT: vsrl.vi v12, v12, 13
+; CHECK-NEXT: vsrl.vi v12, v12, 15
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v12
; CHECK-NEXT: ret
@@ -466,11 +466,11 @@ define <vscale x 32 x i16> @vremu_vx_nxv32i16(<vscale x 32 x i16> %va, i16 signe
define <vscale x 32 x i16> @vremu_vi_nxv32i16_0(<vscale x 32 x i16> %va) {
; CHECK-LABEL: vremu_vi_nxv32i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
; CHECK-NEXT: vmulhu.vx v16, v8, a0
-; CHECK-NEXT: vsrl.vi v16, v16, 13
+; CHECK-NEXT: vsrl.vi v16, v16, 15
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v16
; CHECK-NEXT: ret
@@ -503,11 +503,11 @@ define <vscale x 1 x i32> @vremu_vx_nxv1i32(<vscale x 1 x i32> %va, i32 signext
define <vscale x 1 x i32> @vremu_vi_nxv1i32_0(<vscale x 1 x i32> %va) {
; CHECK-LABEL: vremu_vi_nxv1i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 131072
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
; CHECK-NEXT: vmulhu.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v9, v9, 29
+; CHECK-NEXT: vsrl.vi v9, v9, 31
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v9
; CHECK-NEXT: ret
@@ -540,11 +540,11 @@ define <vscale x 2 x i32> @vremu_vx_nxv2i32(<vscale x 2 x i32> %va, i32 signext
define <vscale x 2 x i32> @vremu_vi_nxv2i32_0(<vscale x 2 x i32> %va) {
; CHECK-LABEL: vremu_vi_nxv2i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 131072
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vmulhu.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v9, v9, 29
+; CHECK-NEXT: vsrl.vi v9, v9, 31
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v9
; CHECK-NEXT: ret
@@ -577,11 +577,11 @@ define <vscale x 4 x i32> @vremu_vx_nxv4i32(<vscale x 4 x i32> %va, i32 signext
define <vscale x 4 x i32> @vremu_vi_nxv4i32_0(<vscale x 4 x i32> %va) {
; CHECK-LABEL: vremu_vi_nxv4i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 131072
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; CHECK-NEXT: vmulhu.vx v10, v8, a0
-; CHECK-NEXT: vsrl.vi v10, v10, 29
+; CHECK-NEXT: vsrl.vi v10, v10, 31
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v10
; CHECK-NEXT: ret
@@ -614,11 +614,11 @@ define <vscale x 8 x i32> @vremu_vx_nxv8i32(<vscale x 8 x i32> %va, i32 signext
define <vscale x 8 x i32> @vremu_vi_nxv8i32_0(<vscale x 8 x i32> %va) {
; CHECK-LABEL: vremu_vi_nxv8i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 131072
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
; CHECK-NEXT: vmulhu.vx v12, v8, a0
-; CHECK-NEXT: vsrl.vi v12, v12, 29
+; CHECK-NEXT: vsrl.vi v12, v12, 31
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v12
; CHECK-NEXT: ret
@@ -651,11 +651,11 @@ define <vscale x 16 x i32> @vremu_vx_nxv16i32(<vscale x 16 x i32> %va, i32 signe
define <vscale x 16 x i32> @vremu_vi_nxv16i32_0(<vscale x 16 x i32> %va) {
; CHECK-LABEL: vremu_vi_nxv16i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 131072
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma
; CHECK-NEXT: vmulhu.vx v16, v8, a0
-; CHECK-NEXT: vsrl.vi v16, v16, 29
+; CHECK-NEXT: vsrl.vi v16, v16, 31
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v16
; CHECK-NEXT: ret
@@ -703,15 +703,15 @@ define <vscale x 1 x i64> @vremu_vi_nxv1i64_0(<vscale x 1 x i64> %va) {
; RV32-V: # %bb.0:
; RV32-V-NEXT: addi sp, sp, -16
; RV32-V-NEXT: .cfi_def_cfa_offset 16
-; RV32-V-NEXT: lui a0, 131072
+; RV32-V-NEXT: lui a0, 524288
; RV32-V-NEXT: sw a0, 12(sp)
-; RV32-V-NEXT: li a0, 1
+; RV32-V-NEXT: li a0, 3
; RV32-V-NEXT: sw a0, 8(sp)
; RV32-V-NEXT: addi a0, sp, 8
; RV32-V-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; RV32-V-NEXT: vlse64.v v9, (a0), zero
; RV32-V-NEXT: vmulhu.vv v9, v8, v9
-; RV32-V-NEXT: li a0, 61
+; RV32-V-NEXT: li a0, 63
; RV32-V-NEXT: vsrl.vx v9, v9, a0
; RV32-V-NEXT: li a0, -7
; RV32-V-NEXT: vnmsac.vx v8, a0, v9
@@ -727,12 +727,12 @@ define <vscale x 1 x i64> @vremu_vi_nxv1i64_0(<vscale x 1 x i64> %va) {
;
; RV64-V-LABEL: vremu_vi_nxv1i64_0:
; RV64-V: # %bb.0:
-; RV64-V-NEXT: li a0, 1
-; RV64-V-NEXT: slli a0, a0, 61
-; RV64-V-NEXT: addi a0, a0, 1
+; RV64-V-NEXT: li a0, -1
+; RV64-V-NEXT: slli a0, a0, 63
+; RV64-V-NEXT: addi a0, a0, 3
; RV64-V-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; RV64-V-NEXT: vmulhu.vx v9, v8, a0
-; RV64-V-NEXT: li a0, 61
+; RV64-V-NEXT: li a0, 63
; RV64-V-NEXT: vsrl.vx v9, v9, a0
; RV64-V-NEXT: li a0, -7
; RV64-V-NEXT: vnmsac.vx v8, a0, v9
@@ -808,15 +808,15 @@ define <vscale x 2 x i64> @vremu_vi_nxv2i64_0(<vscale x 2 x i64> %va) {
; RV32-V: # %bb.0:
; RV32-V-NEXT: addi sp, sp, -16
; RV32-V-NEXT: .cfi_def_cfa_offset 16
-; RV32-V-NEXT: lui a0, 131072
+; RV32-V-NEXT: lui a0, 524288
; RV32-V-NEXT: sw a0, 12(sp)
-; RV32-V-NEXT: li a0, 1
+; RV32-V-NEXT: li a0, 3
; RV32-V-NEXT: sw a0, 8(sp)
; RV32-V-NEXT: addi a0, sp, 8
; RV32-V-NEXT: vsetvli a1, zero, e64, m2, ta, ma
; RV32-V-NEXT: vlse64.v v10, (a0), zero
; RV32-V-NEXT: vmulhu.vv v10, v8, v10
-; RV32-V-NEXT: li a0, 61
+; RV32-V-NEXT: li a0, 63
; RV32-V-NEXT: vsrl.vx v10, v10, a0
; RV32-V-NEXT: li a0, -7
; RV32-V-NEXT: vnmsac.vx v8, a0, v10
@@ -832,12 +832,12 @@ define <vscale x 2 x i64> @vremu_vi_nxv2i64_0(<vscale x 2 x i64> %va) {
;
; RV64-V-LABEL: vremu_vi_nxv2i64_0:
; RV64-V: # %bb.0:
-; RV64-V-NEXT: li a0, 1
-; RV64-V-NEXT: slli a0, a0, 61
-; RV64-V-NEXT: addi a0, a0, 1
+; RV64-V-NEXT: li a0, -1
+; RV64-V-NEXT: slli a0, a0, 63
+; RV64-V-NEXT: addi a0, a0, 3
; RV64-V-NEXT: vsetvli a1, zero, e64, m2, ta, ma
; RV64-V-NEXT: vmulhu.vx v10, v8, a0
-; RV64-V-NEXT: li a0, 61
+; RV64-V-NEXT: li a0, 63
; RV64-V-NEXT: vsrl.vx v10, v10, a0
; RV64-V-NEXT: li a0, -7
; RV64-V-NEXT: vnmsac.vx v8, a0, v10
@@ -913,15 +913,15 @@ define <vscale x 4 x i64> @vremu_vi_nxv4i64_0(<vscale x 4 x i64> %va) {
; RV32-V: # %bb.0:
; RV32-V-NEXT: addi sp, sp, -16
; RV32-V-NEXT: .cfi_def_cfa_offset 16
-; RV32-V-NEXT: lui a0, 131072
+; RV32-V-NEXT: lui a0, 524288
; RV32-V-NEXT: sw a0, 12(sp)
-; RV32-V-NEXT: li a0, 1
+; RV32-V-NEXT: li a0, 3
; RV32-V-NEXT: sw a0, 8(sp)
; RV32-V-NEXT: addi a0, sp, 8
; RV32-V-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; RV32-V-NEXT: vlse64.v v12, (a0), zero
; RV32-V-NEXT: vmulhu.vv v12, v8, v12
-; RV32-V-NEXT: li a0, 61
+; RV32-V-NEXT: li a0, 63
; RV32-V-NEXT: vsrl.vx v12, v12, a0
; RV32-V-NEXT: li a0, -7
; RV32-V-NEXT: vnmsac.vx v8, a0, v12
@@ -937,12 +937,12 @@ define <vscale x 4 x i64> @vremu_vi_nxv4i64_0(<vscale x 4 x i64> %va) {
;
; RV64-V-LABEL: vremu_vi_nxv4i64_0:
; RV64-V: # %bb.0:
-; RV64-V-NEXT: li a0, 1
-; RV64-V-NEXT: slli a0, a0, 61
-; RV64-V-NEXT: addi a0, a0, 1
+; RV64-V-NEXT: li a0, -1
+; RV64-V-NEXT: slli a0, a0, 63
+; RV64-V-NEXT: addi a0, a0, 3
; RV64-V-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; RV64-V-NEXT: vmulhu.vx v12, v8, a0
-; RV64-V-NEXT: li a0, 61
+; RV64-V-NEXT: li a0, 63
; RV64-V-NEXT: vsrl.vx v12, v12, a0
; RV64-V-NEXT: li a0, -7
; RV64-V-NEXT: vnmsac.vx v8, a0, v12
@@ -1018,15 +1018,15 @@ define <vscale x 8 x i64> @vremu_vi_nxv8i64_0(<vscale x 8 x i64> %va) {
; RV32-V: # %bb.0:
; RV32-V-NEXT: addi sp, sp, -16
; RV32-V-NEXT: .cfi_def_cfa_offset 16
-; RV32-V-NEXT: lui a0, 131072
+; RV32-V-NEXT: lui a0, 524288
; RV32-V-NEXT: sw a0, 12(sp)
-; RV32-V-NEXT: li a0, 1
+; RV32-V-NEXT: li a0, 3
; RV32-V-NEXT: sw a0, 8(sp)
; RV32-V-NEXT: addi a0, sp, 8
; RV32-V-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; RV32-V-NEXT: vlse64.v v16, (a0), zero
; RV32-V-NEXT: vmulhu.vv v16, v8, v16
-; RV32-V-NEXT: li a0, 61
+; RV32-V-NEXT: li a0, 63
; RV32-V-NEXT: vsrl.vx v16, v16, a0
; RV32-V-NEXT: li a0, -7
; RV32-V-NEXT: vnmsac.vx v8, a0, v16
@@ -1042,12 +1042,12 @@ define <vscale x 8 x i64> @vremu_vi_nxv8i64_0(<vscale x 8 x i64> %va) {
;
; RV64-V-LABEL: vremu_vi_nxv8i64_0:
; RV64-V: # %bb.0:
-; RV64-V-NEXT: li a0, 1
-; RV64-V-NEXT: slli a0, a0, 61
-; RV64-V-NEXT: addi a0, a0, 1
+; RV64-V-NEXT: li a0, -1
+; RV64-V-NEXT: slli a0, a0, 63
+; RV64-V-NEXT: addi a0, a0, 3
; RV64-V-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; RV64-V-NEXT: vmulhu.vx v16, v8, a0
-; RV64-V-NEXT: li a0, 61
+; RV64-V-NEXT: li a0, 63
; RV64-V-NEXT: vsrl.vx v16, v16, a0
; RV64-V-NEXT: li a0, -7
; RV64-V-NEXT: vnmsac.vx v8, a0, v16
diff --git a/llvm/test/CodeGen/RISCV/select.ll b/llvm/test/CodeGen/RISCV/select.ll
index ffbbe31412ed2..c2557a91714d0 100644
--- a/llvm/test/CodeGen/RISCV/select.ll
+++ b/llvm/test/CodeGen/RISCV/select.ll
@@ -1096,11 +1096,10 @@ define i32 @select_udiv_3(i1 zeroext %cond, i32 %a) {
; RV32IM: # %bb.0: # %entry
; RV32IM-NEXT: bnez a0, .LBB27_2
; RV32IM-NEXT: # %bb.1: # %entry
-; RV32IM-NEXT: srli a1, a1, 1
-; RV32IM-NEXT: lui a0, 199729
-; RV32IM-NEXT: addi a0, a0, -975
+; RV32IM-NEXT: lui a0, 399458
+; RV32IM-NEXT: addi a0, a0, -1951
; RV32IM-NEXT: mulhu a1, a1, a0
-; RV32IM-NEXT: srli a1, a1, 2
+; RV32IM-NEXT: srli a1, a1, 4
; RV32IM-NEXT: .LBB27_2: # %entry
; RV32IM-NEXT: mv a0, a1
; RV32IM-NEXT: ret
@@ -1109,22 +1108,24 @@ define i32 @select_udiv_3(i1 zeroext %cond, i32 %a) {
; RV64IM: # %bb.0: # %entry
; RV64IM-NEXT: bnez a0, .LBB27_2
; RV64IM-NEXT: # %bb.1: # %entry
-; RV64IM-NEXT: srliw a0, a1, 1
-; RV64IM-NEXT: lui a1, 199729
-; RV64IM-NEXT: addiw a1, a1, -975
-; RV64IM-NEXT: mul a1, a0, a1
-; RV64IM-NEXT: srli a1, a1, 34
+; RV64IM-NEXT: slli a1, a1, 32
+; RV64IM-NEXT: lui a0, 399458
+; RV64IM-NEXT: addi a0, a0, -1951
+; RV64IM-NEXT: slli a0, a0, 32
+; RV64IM-NEXT: mulhu a1, a1, a0
+; RV64IM-NEXT: srli a1, a1, 36
; RV64IM-NEXT: .LBB27_2: # %entry
; RV64IM-NEXT: mv a0, a1
; RV64IM-NEXT: ret
;
; RV64IMXVTCONDOPS-LABEL: select_udiv_3:
; RV64IMXVTCONDOPS: # %bb.0: # %entry
-; RV64IMXVTCONDOPS-NEXT: srliw a2, a1, 1
-; RV64IMXVTCONDOPS-NEXT: lui a3, 199729
-; RV64IMXVTCONDOPS-NEXT: addiw a3, a3, -975
-; RV64IMXVTCONDOPS-NEXT: mul a2, a2, a3
-; RV64IMXVTCONDOPS-NEXT: srli a2, a2, 34
+; RV64IMXVTCONDOPS-NEXT: slli a2, a1, 32
+; RV64IMXVTCONDOPS-NEXT: lui a3, 399458
+; RV64IMXVTCONDOPS-NEXT: addi a3, a3, -1951
+; RV64IMXVTCONDOPS-NEXT: slli a3, a3, 32
+; RV64IMXVTCONDOPS-NEXT: mulhu a2, a2, a3
+; RV64IMXVTCONDOPS-NEXT: srli a2, a2, 36
; RV64IMXVTCONDOPS-NEXT: vt.maskc a1, a1, a0
; RV64IMXVTCONDOPS-NEXT: vt.maskcn a0, a2, a0
; RV64IMXVTCONDOPS-NEXT: or a0, a1, a0
@@ -1132,11 +1133,10 @@ define i32 @select_udiv_3(i1 zeroext %cond, i32 %a) {
;
; RV32IMZICOND-LABEL: select_udiv_3:
; RV32IMZICOND: # %bb.0: # %entry
-; RV32IMZICOND-NEXT: srli a2, a1, 1
-; RV32IMZICOND-NEXT: lui a3, 199729
-; RV32IMZICOND-NEXT: addi a3, a3, -975
-; RV32IMZICOND-NEXT: mulhu a2, a2, a3
-; RV32IMZICOND-NEXT: srli a2, a2, 2
+; RV32IMZICOND-NEXT: lui a2, 399458
+; RV32IMZICOND-NEXT: addi a2, a2, -1951
+; RV32IMZICOND-NEXT: mulhu a2, a1, a2
+; RV32IMZICOND-NEXT: srli a2, a2, 4
; RV32IMZICOND-NEXT: czero.eqz a1, a1, a0
; RV32IMZICOND-NEXT: czero.nez a0, a2, a0
; RV32IMZICOND-NEXT: or a0, a1, a0
@@ -1144,11 +1144,12 @@ define i32 @select_udiv_3(i1 zeroext %cond, i32 %a) {
;
; RV64IMZICOND-LABEL: select_udiv_3:
; RV64IMZICOND: # %bb.0: # %entry
-; RV64IMZICOND-NEXT: srliw a2, a1, 1
-; RV64IMZICOND-NEXT: lui a3, 199729
-; RV64IMZICOND-NEXT: addiw a3, a3, -975
-; RV64IMZICOND-NEXT: mul a2, a2, a3
-; RV64IMZICOND-NEXT: srli a2, a2, 34
+; RV64IMZICOND-NEXT: slli a2, a1, 32
+; RV64IMZICOND-NEXT: lui a3, 399458
+; RV64IMZICOND-NEXT: addi a3, a3, -1951
+; RV64IMZICOND-NEXT: slli a3, a3, 32
+; RV64IMZICOND-NEXT: mulhu a2, a2, a3
+; RV64IMZICOND-NEXT: srli a2, a2, 36
; RV64IMZICOND-NEXT: czero.eqz a1, a1, a0
; RV64IMZICOND-NEXT: czero.nez a0, a2, a0
; RV64IMZICOND-NEXT: or a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
index 5fa802b7f27ca..547ba26a198ca 100644
--- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
@@ -10,23 +10,24 @@ define iXLen2 @test_udiv_3(iXLen2 %x) nounwind {
; RV32-NEXT: add a2, a0, a1
; RV32-NEXT: sltu a3, a2, a0
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 699051
-; RV32-NEXT: addi a4, a3, -1365
-; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: srli a6, a5, 1
-; RV32-NEXT: andi a5, a5, -2
-; RV32-NEXT: add a5, a5, a6
-; RV32-NEXT: sub a2, a2, a5
-; RV32-NEXT: sub a5, a0, a2
-; RV32-NEXT: addi a3, a3, -1366
-; RV32-NEXT: mul a3, a5, a3
-; RV32-NEXT: mulhu a6, a5, a4
-; RV32-NEXT: add a3, a6, a3
+; RV32-NEXT: lui a3, 349525
+; RV32-NEXT: addi a3, a3, 1365
+; RV32-NEXT: mulhu a3, a2, a3
+; RV32-NEXT: slli a4, a3, 1
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: sub a2, a2, a3
+; RV32-NEXT: sub a3, a0, a2
+; RV32-NEXT: lui a4, 699051
+; RV32-NEXT: addi a5, a4, -1366
+; RV32-NEXT: mul a5, a3, a5
+; RV32-NEXT: addi a4, a4, -1365
+; RV32-NEXT: mulhu a6, a3, a4
+; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a1, a1, a0
; RV32-NEXT: mul a1, a1, a4
-; RV32-NEXT: add a1, a3, a1
-; RV32-NEXT: mul a0, a5, a4
+; RV32-NEXT: add a1, a5, a1
+; RV32-NEXT: mul a0, a3, a4
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_3:
@@ -34,26 +35,29 @@ define iXLen2 @test_udiv_3(iXLen2 %x) nounwind {
; RV64-NEXT: add a2, a0, a1
; RV64-NEXT: sltu a3, a2, a0
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: lui a3, 699051
-; RV64-NEXT: addiw a3, a3, -1365
+; RV64-NEXT: lui a3, 349525
+; RV64-NEXT: addiw a3, a3, 1365
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: mulhu a4, a2, a3
-; RV64-NEXT: srli a5, a4, 1
-; RV64-NEXT: andi a4, a4, -2
-; RV64-NEXT: lui a6, %hi(.LCPI0_0)
-; RV64-NEXT: ld a6, %lo(.LCPI0_0)(a6)
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: sub a2, a2, a4
-; RV64-NEXT: sub a4, a0, a2
-; RV64-NEXT: mul a5, a4, a6
-; RV64-NEXT: mulhu a6, a4, a3
-; RV64-NEXT: add a5, a6, a5
+; RV64-NEXT: mulhu a3, a2, a3
+; RV64-NEXT: slli a4, a3, 1
+; RV64-NEXT: lui a5, %hi(.LCPI0_0)
+; RV64-NEXT: ld a5, %lo(.LCPI0_0)(a5)
+; RV64-NEXT: add a3, a4, a3
+; RV64-NEXT: sub a2, a2, a3
+; RV64-NEXT: sub a3, a0, a2
+; RV64-NEXT: mul a4, a3, a5
+; RV64-NEXT: lui a5, 699051
+; RV64-NEXT: addiw a5, a5, -1365
+; RV64-NEXT: slli a6, a5, 32
+; RV64-NEXT: add a5, a5, a6
+; RV64-NEXT: mulhu a6, a3, a5
+; RV64-NEXT: add a4, a6, a4
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a1, a1, a3
-; RV64-NEXT: add a1, a5, a1
-; RV64-NEXT: mul a0, a4, a3
+; RV64-NEXT: mul a1, a1, a5
+; RV64-NEXT: add a1, a4, a1
+; RV64-NEXT: mul a0, a3, a5
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 3
ret iXLen2 %a
@@ -65,23 +69,24 @@ define iXLen2 @test_udiv_5(iXLen2 %x) nounwind {
; RV32-NEXT: add a2, a0, a1
; RV32-NEXT: sltu a3, a2, a0
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 838861
-; RV32-NEXT: addi a4, a3, -819
-; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: srli a6, a5, 2
-; RV32-NEXT: andi a5, a5, -4
-; RV32-NEXT: add a5, a5, a6
-; RV32-NEXT: sub a2, a2, a5
-; RV32-NEXT: sub a5, a0, a2
-; RV32-NEXT: addi a3, a3, -820
-; RV32-NEXT: mul a3, a5, a3
-; RV32-NEXT: mulhu a6, a5, a4
-; RV32-NEXT: add a3, a6, a3
+; RV32-NEXT: lui a3, 209715
+; RV32-NEXT: addi a3, a3, 819
+; RV32-NEXT: mulhu a3, a2, a3
+; RV32-NEXT: slli a4, a3, 2
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: sub a2, a2, a3
+; RV32-NEXT: sub a3, a0, a2
+; RV32-NEXT: lui a4, 838861
+; RV32-NEXT: addi a5, a4, -820
+; RV32-NEXT: mul a5, a3, a5
+; RV32-NEXT: addi a4, a4, -819
+; RV32-NEXT: mulhu a6, a3, a4
+; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a1, a1, a0
; RV32-NEXT: mul a1, a1, a4
-; RV32-NEXT: add a1, a3, a1
-; RV32-NEXT: mul a0, a5, a4
+; RV32-NEXT: add a1, a5, a1
+; RV32-NEXT: mul a0, a3, a4
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_5:
@@ -89,26 +94,29 @@ define iXLen2 @test_udiv_5(iXLen2 %x) nounwind {
; RV64-NEXT: add a2, a0, a1
; RV64-NEXT: sltu a3, a2, a0
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: lui a3, 838861
-; RV64-NEXT: addiw a3, a3, -819
+; RV64-NEXT: lui a3, 209715
+; RV64-NEXT: addiw a3, a3, 819
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: mulhu a4, a2, a3
-; RV64-NEXT: srli a5, a4, 2
-; RV64-NEXT: andi a4, a4, -4
-; RV64-NEXT: lui a6, %hi(.LCPI1_0)
-; RV64-NEXT: ld a6, %lo(.LCPI1_0)(a6)
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: sub a2, a2, a4
-; RV64-NEXT: sub a4, a0, a2
-; RV64-NEXT: mul a5, a4, a6
-; RV64-NEXT: mulhu a6, a4, a3
-; RV64-NEXT: add a5, a6, a5
+; RV64-NEXT: mulhu a3, a2, a3
+; RV64-NEXT: slli a4, a3, 2
+; RV64-NEXT: lui a5, %hi(.LCPI1_0)
+; RV64-NEXT: ld a5, %lo(.LCPI1_0)(a5)
+; RV64-NEXT: add a3, a4, a3
+; RV64-NEXT: sub a2, a2, a3
+; RV64-NEXT: sub a3, a0, a2
+; RV64-NEXT: mul a4, a3, a5
+; RV64-NEXT: lui a5, 838861
+; RV64-NEXT: addiw a5, a5, -819
+; RV64-NEXT: slli a6, a5, 32
+; RV64-NEXT: add a5, a5, a6
+; RV64-NEXT: mulhu a6, a3, a5
+; RV64-NEXT: add a4, a6, a4
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a1, a1, a3
-; RV64-NEXT: add a1, a5, a1
-; RV64-NEXT: mul a0, a4, a3
+; RV64-NEXT: mul a1, a1, a5
+; RV64-NEXT: add a1, a4, a1
+; RV64-NEXT: mul a0, a3, a5
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 5
ret iXLen2 %a
@@ -172,10 +180,9 @@ define iXLen2 @test_udiv_15(iXLen2 %x) nounwind {
; RV32-NEXT: add a2, a0, a1
; RV32-NEXT: sltu a3, a2, a0
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 559241
-; RV32-NEXT: addi a3, a3, -1911
+; RV32-NEXT: lui a3, 69905
+; RV32-NEXT: addi a3, a3, 273
; RV32-NEXT: mulhu a3, a2, a3
-; RV32-NEXT: srli a3, a3, 3
; RV32-NEXT: slli a4, a3, 4
; RV32-NEXT: sub a3, a3, a4
; RV32-NEXT: add a2, a2, a3
@@ -198,12 +205,11 @@ define iXLen2 @test_udiv_15(iXLen2 %x) nounwind {
; RV64-NEXT: add a2, a0, a1
; RV64-NEXT: sltu a3, a2, a0
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: lui a3, 559241
-; RV64-NEXT: addiw a3, a3, -1911
+; RV64-NEXT: lui a3, 69905
+; RV64-NEXT: addiw a3, a3, 273
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a3, a3, a4
; RV64-NEXT: mulhu a3, a2, a3
-; RV64-NEXT: srli a3, a3, 3
; RV64-NEXT: slli a4, a3, 4
; RV64-NEXT: lui a5, %hi(.LCPI4_0)
; RV64-NEXT: ld a5, %lo(.LCPI4_0)(a5)
@@ -233,23 +239,24 @@ define iXLen2 @test_udiv_17(iXLen2 %x) nounwind {
; RV32-NEXT: add a2, a0, a1
; RV32-NEXT: sltu a3, a2, a0
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 986895
-; RV32-NEXT: addi a4, a3, 241
-; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: srli a6, a5, 4
-; RV32-NEXT: andi a5, a5, -16
-; RV32-NEXT: add a5, a5, a6
-; RV32-NEXT: sub a2, a2, a5
-; RV32-NEXT: sub a5, a0, a2
-; RV32-NEXT: addi a3, a3, 240
-; RV32-NEXT: mul a3, a5, a3
-; RV32-NEXT: mulhu a6, a5, a4
-; RV32-NEXT: add a3, a6, a3
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: addi a3, a3, -241
+; RV32-NEXT: mulhu a3, a2, a3
+; RV32-NEXT: slli a4, a3, 4
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: sub a2, a2, a3
+; RV32-NEXT: sub a3, a0, a2
+; RV32-NEXT: lui a4, 986895
+; RV32-NEXT: addi a5, a4, 240
+; RV32-NEXT: mul a5, a3, a5
+; RV32-NEXT: addi a4, a4, 241
+; RV32-NEXT: mulhu a6, a3, a4
+; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a1, a1, a0
; RV32-NEXT: mul a1, a1, a4
-; RV32-NEXT: add a1, a3, a1
-; RV32-NEXT: mul a0, a5, a4
+; RV32-NEXT: add a1, a5, a1
+; RV32-NEXT: mul a0, a3, a4
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_17:
@@ -257,26 +264,29 @@ define iXLen2 @test_udiv_17(iXLen2 %x) nounwind {
; RV64-NEXT: add a2, a0, a1
; RV64-NEXT: sltu a3, a2, a0
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: lui a3, 986895
-; RV64-NEXT: addiw a3, a3, 241
+; RV64-NEXT: lui a3, 61681
+; RV64-NEXT: addiw a3, a3, -241
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: mulhu a4, a2, a3
-; RV64-NEXT: srli a5, a4, 4
-; RV64-NEXT: andi a4, a4, -16
-; RV64-NEXT: lui a6, %hi(.LCPI5_0)
-; RV64-NEXT: ld a6, %lo(.LCPI5_0)(a6)
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: sub a2, a2, a4
-; RV64-NEXT: sub a4, a0, a2
-; RV64-NEXT: mul a5, a4, a6
-; RV64-NEXT: mulhu a6, a4, a3
-; RV64-NEXT: add a5, a6, a5
+; RV64-NEXT: mulhu a3, a2, a3
+; RV64-NEXT: slli a4, a3, 4
+; RV64-NEXT: lui a5, %hi(.LCPI5_0)
+; RV64-NEXT: ld a5, %lo(.LCPI5_0)(a5)
+; RV64-NEXT: add a3, a4, a3
+; RV64-NEXT: sub a2, a2, a3
+; RV64-NEXT: sub a3, a0, a2
+; RV64-NEXT: mul a4, a3, a5
+; RV64-NEXT: lui a5, 986895
+; RV64-NEXT: addiw a5, a5, 241
+; RV64-NEXT: slli a6, a5, 32
+; RV64-NEXT: add a5, a5, a6
+; RV64-NEXT: mulhu a6, a3, a5
+; RV64-NEXT: add a4, a6, a4
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a1, a1, a3
-; RV64-NEXT: add a1, a5, a1
-; RV64-NEXT: mul a0, a4, a3
+; RV64-NEXT: mul a1, a1, a5
+; RV64-NEXT: add a1, a4, a1
+; RV64-NEXT: mul a0, a3, a5
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 17
ret iXLen2 %a
@@ -288,10 +298,9 @@ define iXLen2 @test_udiv_255(iXLen2 %x) nounwind {
; RV32-NEXT: add a2, a0, a1
; RV32-NEXT: sltu a3, a2, a0
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 526344
-; RV32-NEXT: addi a3, a3, 129
+; RV32-NEXT: lui a3, 4112
+; RV32-NEXT: addi a3, a3, 257
; RV32-NEXT: mulhu a3, a2, a3
-; RV32-NEXT: srli a3, a3, 7
; RV32-NEXT: slli a4, a3, 8
; RV32-NEXT: sub a3, a3, a4
; RV32-NEXT: add a2, a2, a3
@@ -314,12 +323,11 @@ define iXLen2 @test_udiv_255(iXLen2 %x) nounwind {
; RV64-NEXT: add a2, a0, a1
; RV64-NEXT: sltu a3, a2, a0
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: lui a3, 526344
-; RV64-NEXT: addiw a3, a3, 129
+; RV64-NEXT: lui a3, 4112
+; RV64-NEXT: addiw a3, a3, 257
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a3, a3, a4
; RV64-NEXT: mulhu a3, a2, a3
-; RV64-NEXT: srli a3, a3, 7
; RV64-NEXT: slli a4, a3, 8
; RV64-NEXT: lui a5, %hi(.LCPI6_0)
; RV64-NEXT: ld a5, %lo(.LCPI6_0)(a5)
@@ -349,23 +357,24 @@ define iXLen2 @test_udiv_257(iXLen2 %x) nounwind {
; RV32-NEXT: add a2, a0, a1
; RV32-NEXT: sltu a3, a2, a0
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 1044496
-; RV32-NEXT: addi a4, a3, -255
-; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: srli a6, a5, 8
-; RV32-NEXT: andi a5, a5, -256
-; RV32-NEXT: add a5, a5, a6
-; RV32-NEXT: sub a2, a2, a5
-; RV32-NEXT: sub a5, a0, a2
-; RV32-NEXT: addi a3, a3, -256
-; RV32-NEXT: mul a3, a5, a3
-; RV32-NEXT: mulhu a6, a5, a4
-; RV32-NEXT: add a3, a6, a3
+; RV32-NEXT: lui a3, 4080
+; RV32-NEXT: addi a3, a3, 255
+; RV32-NEXT: mulhu a3, a2, a3
+; RV32-NEXT: slli a4, a3, 8
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: sub a2, a2, a3
+; RV32-NEXT: sub a3, a0, a2
+; RV32-NEXT: lui a4, 1044496
+; RV32-NEXT: addi a5, a4, -256
+; RV32-NEXT: mul a5, a3, a5
+; RV32-NEXT: addi a4, a4, -255
+; RV32-NEXT: mulhu a6, a3, a4
+; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a1, a1, a0
; RV32-NEXT: mul a1, a1, a4
-; RV32-NEXT: add a1, a3, a1
-; RV32-NEXT: mul a0, a5, a4
+; RV32-NEXT: add a1, a5, a1
+; RV32-NEXT: mul a0, a3, a4
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_257:
@@ -373,26 +382,29 @@ define iXLen2 @test_udiv_257(iXLen2 %x) nounwind {
; RV64-NEXT: add a2, a0, a1
; RV64-NEXT: sltu a3, a2, a0
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: lui a3, 1044496
-; RV64-NEXT: addiw a3, a3, -255
+; RV64-NEXT: lui a3, 4080
+; RV64-NEXT: addiw a3, a3, 255
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: mulhu a4, a2, a3
-; RV64-NEXT: srli a5, a4, 8
-; RV64-NEXT: andi a4, a4, -256
-; RV64-NEXT: lui a6, %hi(.LCPI7_0)
-; RV64-NEXT: ld a6, %lo(.LCPI7_0)(a6)
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: sub a2, a2, a4
-; RV64-NEXT: sub a4, a0, a2
-; RV64-NEXT: mul a5, a4, a6
-; RV64-NEXT: mulhu a6, a4, a3
-; RV64-NEXT: add a5, a6, a5
+; RV64-NEXT: mulhu a3, a2, a3
+; RV64-NEXT: slli a4, a3, 8
+; RV64-NEXT: lui a5, %hi(.LCPI7_0)
+; RV64-NEXT: ld a5, %lo(.LCPI7_0)(a5)
+; RV64-NEXT: add a3, a4, a3
+; RV64-NEXT: sub a2, a2, a3
+; RV64-NEXT: sub a3, a0, a2
+; RV64-NEXT: mul a4, a3, a5
+; RV64-NEXT: lui a5, 1044496
+; RV64-NEXT: addiw a5, a5, -255
+; RV64-NEXT: slli a6, a5, 32
+; RV64-NEXT: add a5, a5, a6
+; RV64-NEXT: mulhu a6, a3, a5
+; RV64-NEXT: add a4, a6, a4
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a1, a1, a3
-; RV64-NEXT: add a1, a5, a1
-; RV64-NEXT: mul a0, a4, a3
+; RV64-NEXT: mul a1, a1, a5
+; RV64-NEXT: add a1, a4, a1
+; RV64-NEXT: mul a0, a3, a5
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 257
ret iXLen2 %a
@@ -404,10 +416,9 @@ define iXLen2 @test_udiv_65535(iXLen2 %x) nounwind {
; RV32-NEXT: add a2, a0, a1
; RV32-NEXT: sltu a3, a2, a0
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 524296
+; RV32-NEXT: lui a3, 16
; RV32-NEXT: addi a3, a3, 1
; RV32-NEXT: mulhu a3, a2, a3
-; RV32-NEXT: srli a3, a3, 15
; RV32-NEXT: slli a4, a3, 16
; RV32-NEXT: sub a3, a3, a4
; RV32-NEXT: add a2, a2, a3
@@ -433,12 +444,11 @@ define iXLen2 @test_udiv_65535(iXLen2 %x) nounwind {
; RV64-NEXT: add a2, a0, a1
; RV64-NEXT: sltu a3, a2, a0
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: lui a3, 524296
+; RV64-NEXT: lui a3, 16
; RV64-NEXT: addiw a3, a3, 1
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a3, a3, a4
; RV64-NEXT: mulhu a3, a2, a3
-; RV64-NEXT: srli a3, a3, 15
; RV64-NEXT: slli a4, a3, 16
; RV64-NEXT: sub a3, a3, a4
; RV64-NEXT: add a2, a2, a3
@@ -471,14 +481,15 @@ define iXLen2 @test_udiv_65537(iXLen2 %x) nounwind {
; RV32-NEXT: add a2, a0, a1
; RV32-NEXT: sltu a3, a2, a0
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 1048560
-; RV32-NEXT: addi a4, a3, 1
-; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: and a3, a5, a3
-; RV32-NEXT: srli a5, a5, 16
-; RV32-NEXT: or a3, a3, a5
+; RV32-NEXT: lui a3, 16
+; RV32-NEXT: addi a3, a3, -1
+; RV32-NEXT: mulhu a3, a2, a3
+; RV32-NEXT: slli a4, a3, 16
+; RV32-NEXT: or a3, a4, a3
; RV32-NEXT: sub a2, a2, a3
; RV32-NEXT: sub a3, a0, a2
+; RV32-NEXT: lui a4, 1048560
+; RV32-NEXT: addi a4, a4, 1
; RV32-NEXT: mulhu a4, a3, a4
; RV32-NEXT: slli a5, a3, 16
; RV32-NEXT: sub a4, a4, a5
@@ -495,28 +506,30 @@ define iXLen2 @test_udiv_65537(iXLen2 %x) nounwind {
; RV64-NEXT: add a2, a0, a1
; RV64-NEXT: sltu a3, a2, a0
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: lui a3, 1048560
-; RV64-NEXT: addiw a4, a3, 1
-; RV64-NEXT: slli a5, a4, 32
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: mulhu a5, a2, a4
-; RV64-NEXT: and a3, a5, a3
-; RV64-NEXT: srli a5, a5, 16
-; RV64-NEXT: add a3, a3, a5
-; RV64-NEXT: sub a2, a2, a3
-; RV64-NEXT: sub a3, a0, a2
-; RV64-NEXT: lui a5, 983041
-; RV64-NEXT: slli a5, a5, 4
-; RV64-NEXT: addi a5, a5, -1
-; RV64-NEXT: slli a5, a5, 16
-; RV64-NEXT: mul a5, a3, a5
-; RV64-NEXT: mulhu a6, a3, a4
-; RV64-NEXT: add a5, a6, a5
+; RV64-NEXT: lui a3, 983041
+; RV64-NEXT: slli a4, a3, 20
+; RV64-NEXT: addi a4, a4, -1
+; RV64-NEXT: srli a4, a4, 16
+; RV64-NEXT: mulhu a4, a2, a4
+; RV64-NEXT: slli a5, a4, 16
+; RV64-NEXT: add a4, a5, a4
+; RV64-NEXT: sub a2, a2, a4
+; RV64-NEXT: sub a4, a0, a2
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: addi a3, a3, -1
+; RV64-NEXT: slli a3, a3, 16
+; RV64-NEXT: mul a3, a4, a3
+; RV64-NEXT: lui a5, 1048560
+; RV64-NEXT: addiw a5, a5, 1
+; RV64-NEXT: slli a6, a5, 32
+; RV64-NEXT: add a5, a5, a6
+; RV64-NEXT: mulhu a6, a4, a5
+; RV64-NEXT: add a3, a6, a3
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a1, a1, a4
-; RV64-NEXT: add a1, a5, a1
-; RV64-NEXT: mul a0, a3, a4
+; RV64-NEXT: mul a1, a1, a5
+; RV64-NEXT: add a1, a3, a1
+; RV64-NEXT: mul a0, a4, a5
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 65537
ret iXLen2 %a
@@ -532,23 +545,24 @@ define iXLen2 @test_udiv_12(iXLen2 %x) nounwind {
; RV32-NEXT: add a2, a0, a1
; RV32-NEXT: sltu a3, a2, a0
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 699051
-; RV32-NEXT: addi a4, a3, -1365
-; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: srli a6, a5, 1
-; RV32-NEXT: andi a5, a5, -2
-; RV32-NEXT: add a5, a5, a6
-; RV32-NEXT: sub a2, a2, a5
-; RV32-NEXT: sub a5, a0, a2
-; RV32-NEXT: addi a3, a3, -1366
-; RV32-NEXT: mul a3, a5, a3
-; RV32-NEXT: mulhu a6, a5, a4
-; RV32-NEXT: add a3, a6, a3
+; RV32-NEXT: lui a3, 349525
+; RV32-NEXT: addi a3, a3, 1365
+; RV32-NEXT: mulhu a3, a2, a3
+; RV32-NEXT: slli a4, a3, 1
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: sub a2, a2, a3
+; RV32-NEXT: sub a3, a0, a2
+; RV32-NEXT: lui a4, 699051
+; RV32-NEXT: addi a5, a4, -1366
+; RV32-NEXT: mul a5, a3, a5
+; RV32-NEXT: addi a4, a4, -1365
+; RV32-NEXT: mulhu a6, a3, a4
+; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a1, a1, a0
; RV32-NEXT: mul a1, a1, a4
-; RV32-NEXT: add a1, a3, a1
-; RV32-NEXT: mul a0, a5, a4
+; RV32-NEXT: add a1, a5, a1
+; RV32-NEXT: mul a0, a3, a4
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_12:
@@ -560,26 +574,29 @@ define iXLen2 @test_udiv_12(iXLen2 %x) nounwind {
; RV64-NEXT: add a2, a0, a1
; RV64-NEXT: sltu a3, a2, a0
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: lui a3, 699051
-; RV64-NEXT: addiw a3, a3, -1365
+; RV64-NEXT: lui a3, 349525
+; RV64-NEXT: addiw a3, a3, 1365
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: mulhu a4, a2, a3
-; RV64-NEXT: srli a5, a4, 1
-; RV64-NEXT: andi a4, a4, -2
-; RV64-NEXT: lui a6, %hi(.LCPI10_0)
-; RV64-NEXT: ld a6, %lo(.LCPI10_0)(a6)
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: sub a2, a2, a4
-; RV64-NEXT: sub a4, a0, a2
-; RV64-NEXT: mul a5, a4, a6
-; RV64-NEXT: mulhu a6, a4, a3
-; RV64-NEXT: add a5, a6, a5
+; RV64-NEXT: mulhu a3, a2, a3
+; RV64-NEXT: slli a4, a3, 1
+; RV64-NEXT: lui a5, %hi(.LCPI10_0)
+; RV64-NEXT: ld a5, %lo(.LCPI10_0)(a5)
+; RV64-NEXT: add a3, a4, a3
+; RV64-NEXT: sub a2, a2, a3
+; RV64-NEXT: sub a3, a0, a2
+; RV64-NEXT: mul a4, a3, a5
+; RV64-NEXT: lui a5, 699051
+; RV64-NEXT: addiw a5, a5, -1365
+; RV64-NEXT: slli a6, a5, 32
+; RV64-NEXT: add a5, a5, a6
+; RV64-NEXT: mulhu a6, a3, a5
+; RV64-NEXT: add a4, a6, a4
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a1, a1, a3
-; RV64-NEXT: add a1, a5, a1
-; RV64-NEXT: mul a0, a4, a3
+; RV64-NEXT: mul a1, a1, a5
+; RV64-NEXT: add a1, a4, a1
+; RV64-NEXT: mul a0, a3, a5
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 12
ret iXLen2 %a
diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
index 8444520fcc771..2c70ae3215c9d 100644
--- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
@@ -10,12 +10,11 @@ define iXLen2 @test_urem_3(iXLen2 %x) nounwind {
; RV32-NEXT: add a1, a0, a1
; RV32-NEXT: sltu a0, a1, a0
; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: lui a1, 699051
-; RV32-NEXT: addi a1, a1, -1365
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
; RV32-NEXT: mulhu a1, a0, a1
-; RV32-NEXT: srli a2, a1, 1
-; RV32-NEXT: andi a1, a1, -2
-; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: slli a2, a1, 1
+; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
@@ -25,14 +24,13 @@ define iXLen2 @test_urem_3(iXLen2 %x) nounwind {
; RV64-NEXT: add a1, a0, a1
; RV64-NEXT: sltu a0, a1, a0
; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: lui a1, 699051
-; RV64-NEXT: addiw a1, a1, -1365
+; RV64-NEXT: lui a1, 349525
+; RV64-NEXT: addiw a1, a1, 1365
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a1, a0, a1
-; RV64-NEXT: srli a2, a1, 1
-; RV64-NEXT: andi a1, a1, -2
-; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: slli a2, a1, 1
+; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
@@ -46,12 +44,11 @@ define iXLen2 @test_urem_5(iXLen2 %x) nounwind {
; RV32-NEXT: add a1, a0, a1
; RV32-NEXT: sltu a0, a1, a0
; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: lui a1, 838861
-; RV32-NEXT: addi a1, a1, -819
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: mulhu a1, a0, a1
-; RV32-NEXT: srli a2, a1, 2
-; RV32-NEXT: andi a1, a1, -4
-; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: slli a2, a1, 2
+; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
@@ -61,14 +58,13 @@ define iXLen2 @test_urem_5(iXLen2 %x) nounwind {
; RV64-NEXT: add a1, a0, a1
; RV64-NEXT: sltu a0, a1, a0
; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: lui a1, 838861
-; RV64-NEXT: addiw a1, a1, -819
+; RV64-NEXT: lui a1, 209715
+; RV64-NEXT: addiw a1, a1, 819
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a1, a0, a1
-; RV64-NEXT: srli a2, a1, 2
-; RV64-NEXT: andi a1, a1, -4
-; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: slli a2, a1, 2
+; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
@@ -134,10 +130,9 @@ define iXLen2 @test_urem_15(iXLen2 %x) nounwind {
; RV32-NEXT: add a1, a0, a1
; RV32-NEXT: sltu a0, a1, a0
; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: lui a1, 559241
-; RV32-NEXT: addi a1, a1, -1911
+; RV32-NEXT: lui a1, 69905
+; RV32-NEXT: addi a1, a1, 273
; RV32-NEXT: mulhu a1, a0, a1
-; RV32-NEXT: srli a1, a1, 3
; RV32-NEXT: slli a2, a1, 4
; RV32-NEXT: sub a1, a1, a2
; RV32-NEXT: add a0, a0, a1
@@ -149,12 +144,11 @@ define iXLen2 @test_urem_15(iXLen2 %x) nounwind {
; RV64-NEXT: add a1, a0, a1
; RV64-NEXT: sltu a0, a1, a0
; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: lui a1, 559241
-; RV64-NEXT: addiw a1, a1, -1911
+; RV64-NEXT: lui a1, 69905
+; RV64-NEXT: addiw a1, a1, 273
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a1, a0, a1
-; RV64-NEXT: srli a1, a1, 3
; RV64-NEXT: slli a2, a1, 4
; RV64-NEXT: sub a1, a1, a2
; RV64-NEXT: add a0, a0, a1
@@ -170,12 +164,11 @@ define iXLen2 @test_urem_17(iXLen2 %x) nounwind {
; RV32-NEXT: add a1, a0, a1
; RV32-NEXT: sltu a0, a1, a0
; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: lui a1, 986895
-; RV32-NEXT: addi a1, a1, 241
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: mulhu a1, a0, a1
-; RV32-NEXT: srli a2, a1, 4
-; RV32-NEXT: andi a1, a1, -16
-; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: slli a2, a1, 4
+; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
@@ -185,14 +178,13 @@ define iXLen2 @test_urem_17(iXLen2 %x) nounwind {
; RV64-NEXT: add a1, a0, a1
; RV64-NEXT: sltu a0, a1, a0
; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: lui a1, 986895
-; RV64-NEXT: addiw a1, a1, 241
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a1, a0, a1
-; RV64-NEXT: srli a2, a1, 4
-; RV64-NEXT: andi a1, a1, -16
-; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: slli a2, a1, 4
+; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
@@ -206,10 +198,9 @@ define iXLen2 @test_urem_255(iXLen2 %x) nounwind {
; RV32-NEXT: add a1, a0, a1
; RV32-NEXT: sltu a0, a1, a0
; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: lui a1, 526344
-; RV32-NEXT: addi a1, a1, 129
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
; RV32-NEXT: mulhu a1, a0, a1
-; RV32-NEXT: srli a1, a1, 7
; RV32-NEXT: slli a2, a1, 8
; RV32-NEXT: sub a1, a1, a2
; RV32-NEXT: add a0, a0, a1
@@ -221,12 +212,11 @@ define iXLen2 @test_urem_255(iXLen2 %x) nounwind {
; RV64-NEXT: add a1, a0, a1
; RV64-NEXT: sltu a0, a1, a0
; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: lui a1, 526344
-; RV64-NEXT: addiw a1, a1, 129
+; RV64-NEXT: lui a1, 4112
+; RV64-NEXT: addiw a1, a1, 257
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a1, a0, a1
-; RV64-NEXT: srli a1, a1, 7
; RV64-NEXT: slli a2, a1, 8
; RV64-NEXT: sub a1, a1, a2
; RV64-NEXT: add a0, a0, a1
@@ -242,12 +232,11 @@ define iXLen2 @test_urem_257(iXLen2 %x) nounwind {
; RV32-NEXT: add a1, a0, a1
; RV32-NEXT: sltu a0, a1, a0
; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: lui a1, 1044496
-; RV32-NEXT: addi a1, a1, -255
+; RV32-NEXT: lui a1, 4080
+; RV32-NEXT: addi a1, a1, 255
; RV32-NEXT: mulhu a1, a0, a1
-; RV32-NEXT: srli a2, a1, 8
-; RV32-NEXT: andi a1, a1, -256
-; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: slli a2, a1, 8
+; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
@@ -257,14 +246,13 @@ define iXLen2 @test_urem_257(iXLen2 %x) nounwind {
; RV64-NEXT: add a1, a0, a1
; RV64-NEXT: sltu a0, a1, a0
; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: lui a1, 1044496
-; RV64-NEXT: addiw a1, a1, -255
+; RV64-NEXT: lui a1, 4080
+; RV64-NEXT: addiw a1, a1, 255
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a1, a0, a1
-; RV64-NEXT: srli a2, a1, 8
-; RV64-NEXT: andi a1, a1, -256
-; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: slli a2, a1, 8
+; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
@@ -278,10 +266,9 @@ define iXLen2 @test_urem_65535(iXLen2 %x) nounwind {
; RV32-NEXT: add a1, a0, a1
; RV32-NEXT: sltu a0, a1, a0
; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: lui a1, 524296
+; RV32-NEXT: lui a1, 16
; RV32-NEXT: addi a1, a1, 1
; RV32-NEXT: mulhu a1, a0, a1
-; RV32-NEXT: srli a1, a1, 15
; RV32-NEXT: slli a2, a1, 16
; RV32-NEXT: sub a1, a1, a2
; RV32-NEXT: add a0, a0, a1
@@ -293,12 +280,11 @@ define iXLen2 @test_urem_65535(iXLen2 %x) nounwind {
; RV64-NEXT: add a1, a0, a1
; RV64-NEXT: sltu a0, a1, a0
; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: lui a1, 524296
+; RV64-NEXT: lui a1, 16
; RV64-NEXT: addiw a1, a1, 1
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a1, a0, a1
-; RV64-NEXT: srli a1, a1, 15
; RV64-NEXT: slli a2, a1, 16
; RV64-NEXT: sub a1, a1, a2
; RV64-NEXT: add a0, a0, a1
@@ -314,12 +300,11 @@ define iXLen2 @test_urem_65537(iXLen2 %x) nounwind {
; RV32-NEXT: add a1, a0, a1
; RV32-NEXT: sltu a0, a1, a0
; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: lui a1, 1048560
-; RV32-NEXT: addi a2, a1, 1
-; RV32-NEXT: mulhu a2, a0, a2
-; RV32-NEXT: and a1, a2, a1
-; RV32-NEXT: srli a2, a2, 16
-; RV32-NEXT: or a1, a1, a2
+; RV32-NEXT: lui a1, 16
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: mulhu a1, a0, a1
+; RV32-NEXT: slli a2, a1, 16
+; RV32-NEXT: or a1, a2, a1
; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
@@ -329,14 +314,13 @@ define iXLen2 @test_urem_65537(iXLen2 %x) nounwind {
; RV64-NEXT: add a1, a0, a1
; RV64-NEXT: sltu a0, a1, a0
; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: lui a1, 1048560
-; RV64-NEXT: addiw a2, a1, 1
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: mulhu a2, a0, a2
-; RV64-NEXT: and a1, a2, a1
-; RV64-NEXT: srli a2, a2, 16
-; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: lui a1, 983041
+; RV64-NEXT: slli a1, a1, 20
+; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: srli a1, a1, 16
+; RV64-NEXT: mulhu a1, a0, a1
+; RV64-NEXT: slli a2, a1, 16
+; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
@@ -354,12 +338,11 @@ define iXLen2 @test_urem_12(iXLen2 %x) nounwind {
; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: sltu a2, a1, a2
; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: lui a2, 699051
-; RV32-NEXT: addi a2, a2, -1365
+; RV32-NEXT: lui a2, 349525
+; RV32-NEXT: addi a2, a2, 1365
; RV32-NEXT: mulhu a2, a1, a2
-; RV32-NEXT: srli a3, a2, 1
-; RV32-NEXT: andi a2, a2, -2
-; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: slli a3, a2, 1
+; RV32-NEXT: add a2, a3, a2
; RV32-NEXT: sub a1, a1, a2
; RV32-NEXT: slli a1, a1, 2
; RV32-NEXT: andi a0, a0, 3
@@ -376,14 +359,13 @@ define iXLen2 @test_urem_12(iXLen2 %x) nounwind {
; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: sltu a2, a1, a2
; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: lui a2, 699051
-; RV64-NEXT: addiw a2, a2, -1365
+; RV64-NEXT: lui a2, 349525
+; RV64-NEXT: addiw a2, a2, 1365
; RV64-NEXT: slli a3, a2, 32
; RV64-NEXT: add a2, a2, a3
; RV64-NEXT: mulhu a2, a1, a2
-; RV64-NEXT: srli a3, a2, 1
-; RV64-NEXT: andi a2, a2, -2
-; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: slli a3, a2, 1
+; RV64-NEXT: add a2, a3, a2
; RV64-NEXT: sub a1, a1, a2
; RV64-NEXT: slli a1, a1, 2
; RV64-NEXT: andi a0, a0, 3
diff --git a/llvm/test/CodeGen/RISCV/urem-lkk.ll b/llvm/test/CodeGen/RISCV/urem-lkk.ll
index f83a933c0b5c8..f78e44f869624 100644
--- a/llvm/test/CodeGen/RISCV/urem-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-lkk.ll
@@ -16,12 +16,9 @@ define i32 @fold_urem_positive_odd(i32 %x) nounwind {
;
; RV32IM-LABEL: fold_urem_positive_odd:
; RV32IM: # %bb.0:
-; RV32IM-NEXT: lui a1, 364242
-; RV32IM-NEXT: addi a1, a1, 777
+; RV32IM-NEXT: lui a1, 706409
+; RV32IM-NEXT: addi a1, a1, 387
; RV32IM-NEXT: mulhu a1, a0, a1
-; RV32IM-NEXT: sub a2, a0, a1
-; RV32IM-NEXT: srli a2, a2, 1
-; RV32IM-NEXT: add a1, a2, a1
; RV32IM-NEXT: srli a1, a1, 6
; RV32IM-NEXT: li a2, 95
; RV32IM-NEXT: mul a1, a1, a2
@@ -43,15 +40,11 @@ define i32 @fold_urem_positive_odd(i32 %x) nounwind {
; RV64IM-LABEL: fold_urem_positive_odd:
; RV64IM: # %bb.0:
; RV64IM-NEXT: slli a1, a0, 32
-; RV64IM-NEXT: lui a2, 364242
-; RV64IM-NEXT: addi a2, a2, 777
+; RV64IM-NEXT: lui a2, 706409
+; RV64IM-NEXT: addi a2, a2, 387
; RV64IM-NEXT: slli a2, a2, 32
; RV64IM-NEXT: mulhu a1, a1, a2
-; RV64IM-NEXT: srli a1, a1, 32
-; RV64IM-NEXT: subw a2, a0, a1
-; RV64IM-NEXT: srliw a2, a2, 1
-; RV64IM-NEXT: add a1, a2, a1
-; RV64IM-NEXT: srli a1, a1, 6
+; RV64IM-NEXT: srli a1, a1, 38
; RV64IM-NEXT: li a2, 95
; RV64IM-NEXT: mul a1, a1, a2
; RV64IM-NEXT: subw a0, a0, a1
@@ -70,7 +63,7 @@ define i32 @fold_urem_positive_even(i32 %x) nounwind {
; RV32IM-LABEL: fold_urem_positive_even:
; RV32IM: # %bb.0:
; RV32IM-NEXT: lui a1, 1012964
-; RV32IM-NEXT: addi a1, a1, -61
+; RV32IM-NEXT: addi a1, a1, -63
; RV32IM-NEXT: mulhu a1, a0, a1
; RV32IM-NEXT: srli a1, a1, 10
; RV32IM-NEXT: li a2, 1060
@@ -94,7 +87,7 @@ define i32 @fold_urem_positive_even(i32 %x) nounwind {
; RV64IM: # %bb.0:
; RV64IM-NEXT: slli a1, a0, 32
; RV64IM-NEXT: lui a2, 1012964
-; RV64IM-NEXT: addi a2, a2, -61
+; RV64IM-NEXT: addi a2, a2, -63
; RV64IM-NEXT: slli a2, a2, 32
; RV64IM-NEXT: mulhu a1, a1, a2
; RV64IM-NEXT: srli a1, a1, 42
@@ -131,12 +124,9 @@ define i32 @combine_urem_udiv(i32 %x) nounwind {
;
; RV32IM-LABEL: combine_urem_udiv:
; RV32IM: # %bb.0:
-; RV32IM-NEXT: lui a1, 364242
-; RV32IM-NEXT: addi a1, a1, 777
+; RV32IM-NEXT: lui a1, 706409
+; RV32IM-NEXT: addi a1, a1, 387
; RV32IM-NEXT: mulhu a1, a0, a1
-; RV32IM-NEXT: sub a2, a0, a1
-; RV32IM-NEXT: srli a2, a2, 1
-; RV32IM-NEXT: add a1, a2, a1
; RV32IM-NEXT: srli a1, a1, 6
; RV32IM-NEXT: li a2, 95
; RV32IM-NEXT: mul a2, a1, a2
@@ -169,15 +159,11 @@ define i32 @combine_urem_udiv(i32 %x) nounwind {
; RV64IM-LABEL: combine_urem_udiv:
; RV64IM: # %bb.0:
; RV64IM-NEXT: slli a1, a0, 32
-; RV64IM-NEXT: lui a2, 364242
-; RV64IM-NEXT: addi a2, a2, 777
+; RV64IM-NEXT: lui a2, 706409
+; RV64IM-NEXT: addi a2, a2, 387
; RV64IM-NEXT: slli a2, a2, 32
; RV64IM-NEXT: mulhu a1, a1, a2
-; RV64IM-NEXT: srli a1, a1, 32
-; RV64IM-NEXT: subw a2, a0, a1
-; RV64IM-NEXT: srliw a2, a2, 1
-; RV64IM-NEXT: add a1, a2, a1
-; RV64IM-NEXT: srli a1, a1, 6
+; RV64IM-NEXT: srli a1, a1, 38
; RV64IM-NEXT: li a2, 95
; RV64IM-NEXT: mul a2, a1, a2
; RV64IM-NEXT: add a0, a0, a1
@@ -251,9 +237,10 @@ define i64 @dont_fold_urem_i64(i64 %x) nounwind {
; RV64IM: # %bb.0:
; RV64IM-NEXT: lui a1, %hi(.LCPI6_0)
; RV64IM-NEXT: ld a1, %lo(.LCPI6_0)(a1)
-; RV64IM-NEXT: srli a2, a0, 1
-; RV64IM-NEXT: mulhu a1, a2, a1
-; RV64IM-NEXT: srli a1, a1, 4
+; RV64IM-NEXT: mulhu a1, a0, a1
+; RV64IM-NEXT: sub a2, a0, a1
+; RV64IM-NEXT: srli a2, a2, 1
+; RV64IM-NEXT: add a1, a2, a1
; RV64IM-NEXT: li a2, 98
; RV64IM-NEXT: mul a1, a1, a2
; RV64IM-NEXT: sub a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index c057c656e0fb7..fedb8c0bed02f 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -956,39 +956,35 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
;
; RV64IM-LABEL: dont_fold_urem_i64:
; RV64IM: # %bb.0:
-; RV64IM-NEXT: ld a2, 16(a1)
+; RV64IM-NEXT: ld a2, 8(a1)
; RV64IM-NEXT: lui a3, %hi(.LCPI6_0)
; RV64IM-NEXT: ld a3, %lo(.LCPI6_0)(a3)
; RV64IM-NEXT: ld a4, 24(a1)
-; RV64IM-NEXT: ld a1, 8(a1)
+; RV64IM-NEXT: ld a1, 16(a1)
; RV64IM-NEXT: mulhu a3, a2, a3
-; RV64IM-NEXT: sub a5, a2, a3
-; RV64IM-NEXT: srli a5, a5, 1
-; RV64IM-NEXT: add a3, a5, a3
-; RV64IM-NEXT: srli a3, a3, 4
-; RV64IM-NEXT: li a5, 23
-; RV64IM-NEXT: lui a6, %hi(.LCPI6_1)
-; RV64IM-NEXT: ld a6, %lo(.LCPI6_1)(a6)
-; RV64IM-NEXT: mul a3, a3, a5
+; RV64IM-NEXT: srli a3, a3, 6
+; RV64IM-NEXT: lui a5, %hi(.LCPI6_1)
+; RV64IM-NEXT: ld a5, %lo(.LCPI6_1)(a5)
+; RV64IM-NEXT: li a6, 654
+; RV64IM-NEXT: mul a3, a3, a6
; RV64IM-NEXT: sub a2, a2, a3
-; RV64IM-NEXT: srli a3, a1, 1
-; RV64IM-NEXT: mulhu a3, a3, a6
-; RV64IM-NEXT: srli a3, a3, 7
+; RV64IM-NEXT: mulhu a3, a1, a5
+; RV64IM-NEXT: srli a3, a3, 4
; RV64IM-NEXT: lui a5, %hi(.LCPI6_2)
; RV64IM-NEXT: ld a5, %lo(.LCPI6_2)(a5)
-; RV64IM-NEXT: li a6, 654
+; RV64IM-NEXT: li a6, 23
; RV64IM-NEXT: mul a3, a3, a6
; RV64IM-NEXT: sub a1, a1, a3
; RV64IM-NEXT: mulhu a3, a4, a5
-; RV64IM-NEXT: srli a3, a3, 12
+; RV64IM-NEXT: srli a3, a3, 10
; RV64IM-NEXT: lui a5, 1
; RV64IM-NEXT: addiw a5, a5, 1327
; RV64IM-NEXT: mul a3, a3, a5
; RV64IM-NEXT: sub a4, a4, a3
; RV64IM-NEXT: sd zero, 0(a0)
; RV64IM-NEXT: sd a4, 24(a0)
-; RV64IM-NEXT: sd a1, 8(a0)
-; RV64IM-NEXT: sd a2, 16(a0)
+; RV64IM-NEXT: sd a1, 16(a0)
+; RV64IM-NEXT: sd a2, 8(a0)
; RV64IM-NEXT: ret
%1 = urem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
ret <4 x i64> %1
diff --git a/llvm/test/CodeGen/SystemZ/int-div-06.ll b/llvm/test/CodeGen/SystemZ/int-div-06.ll
index 9de717857d7d9..f3c8e15873489 100644
--- a/llvm/test/CodeGen/SystemZ/int-div-06.ll
+++ b/llvm/test/CodeGen/SystemZ/int-div-06.ll
@@ -1,16 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; Test that divisions by constants are implemented as multiplications.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu -asm-verbose=0 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; Check signed 32-bit division.
define i32 @f1(i32 %a) {
; CHECK-LABEL: f1:
-; CHECK: lgfr [[REG:%r[0-5]]], %r2
-; CHECK: msgfi [[REG]], 502748801
-; CHECK-DAG: srlg [[RES1:%r[0-5]]], [[REG]], 63
-; CHECK-DAG: srag %r2, [[REG]], 46
-; CHECK: ar %r2, [[RES1]]
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lgfr %r0, %r2
+; CHECK-NEXT: msgfi %r0, 502748801
+; CHECK-NEXT: srlg %r1, %r0, 63
+; CHECK-NEXT: srag %r2, %r0, 46
+; CHECK-NEXT: ar %r2, %r1
+; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
+; CHECK-NEXT: br %r14
%b = sdiv i32 %a, 139968
ret i32 %b
}
@@ -18,10 +21,13 @@ define i32 @f1(i32 %a) {
; Check unsigned 32-bit division.
define i32 @f2(i32 %a) {
; CHECK-LABEL: f2:
-; CHECK: llgfr [[REG:%r[0-5]]], %r2
-; CHECK: msgfi [[REG]], 502748801
-; CHECK: srlg %r2, [[REG]], 46
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: llgfr %r0, %r2
+; CHECK-NEXT: llilf %r1, 4021990407
+; CHECK-NEXT: msgr %r1, %r0
+; CHECK-NEXT: srlg %r2, %r1, 49
+; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
+; CHECK-NEXT: br %r14
%b = udiv i32 %a, 139968
ret i32 %b
}
@@ -29,16 +35,18 @@ define i32 @f2(i32 %a) {
; Check signed 64-bit division.
define i64 @f3(i64 %dummy, i64 %a) {
; CHECK-LABEL: f3:
-; CHECK-DAG: llihf [[CONST:%r[0-5]]], 1005497601
-; CHECK-DAG: oilf [[CONST]], 4251762321
-; CHECK-DAG: srag [[REG:%r[0-5]]], %r3, 63
-; CHECK-DAG: ngr [[REG]], [[CONST]]
-; CHECK-DAG: mlgr %r2, [[CONST]]
-; CHECK: sgr %r2, [[REG]]
-; CHECK: srlg [[RES1:%r[0-5]]], %r2, 63
-; CHECK: srag %r2, %r2, 15
-; CHECK: agr %r2, [[RES1]]
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $r3d killed $r3d def $r2q
+; CHECK-NEXT: srag %r0, %r3, 63
+; CHECK-NEXT: llihf %r1, 1005497601
+; CHECK-NEXT: oilf %r1, 4251762321
+; CHECK-NEXT: ngr %r0, %r1
+; CHECK-NEXT: mlgr %r2, %r1
+; CHECK-NEXT: sgr %r2, %r0
+; CHECK-NEXT: srlg %r0, %r2, 63
+; CHECK-NEXT: srag %r2, %r2, 15
+; CHECK-NEXT: agr %r2, %r0
+; CHECK-NEXT: br %r14
%b = sdiv i64 %a, 139968
ret i64 %b
}
@@ -46,11 +54,13 @@ define i64 @f3(i64 %dummy, i64 %a) {
; Check unsigned 64-bit division.
define i64 @f4(i64 %dummy, i64 %a) {
; CHECK-LABEL: f4:
-; CHECK: llihf [[CONST:%r[0-5]]], 1005497601
-; CHECK: oilf [[CONST]], 4251762321
-; CHECK: mlgr %r2, [[CONST]]
-; CHECK: srlg %r2, %r2, 15
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $r3d killed $r3d def $r2q
+; CHECK-NEXT: llihf %r0, 2010995203
+; CHECK-NEXT: oilf %r0, 4208557345
+; CHECK-NEXT: mlgr %r2, %r0
+; CHECK-NEXT: srlg %r2, %r2, 16
+; CHECK-NEXT: br %r14
%b = udiv i64 %a, 139968
ret i64 %b
}
diff --git a/llvm/test/CodeGen/SystemZ/int-mul-13.ll b/llvm/test/CodeGen/SystemZ/int-mul-13.ll
index 82937cf66c629..db3aa1531d90a 100644
--- a/llvm/test/CodeGen/SystemZ/int-mul-13.ll
+++ b/llvm/test/CodeGen/SystemZ/int-mul-13.ll
@@ -81,11 +81,11 @@ define i64 @f4(i64 %dummy, i64 %a, i64 %b) {
define i64 @f5(i64 %dummy, i64 %a) {
; CHECK-LABEL: f5:
; CHECK: # %bb.0:
-; CHECK-NEXT: llihf %r0, 1782028570
-; CHECK-NEXT: oilf %r0, 598650223
+; CHECK-NEXT: llihf %r0, 891014285
+; CHECK-NEXT: oilf %r0, 299325111
; CHECK-NEXT: # kill: def $r3d killed $r3d def $r2q
; CHECK-NEXT: mlgr %r2, %r0
-; CHECK-NEXT: srlg %r2, %r2, 9
+; CHECK-NEXT: srlg %r2, %r2, 8
; CHECK-NEXT: br %r14
%res = udiv i64 %a, 1234
ret i64 %res
diff --git a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
index 7087041e8dace..e005f57d2a0e7 100644
--- a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
@@ -362,31 +362,29 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: sbcs r0, r1, #0
; CHECK-NEXT: blt.w .LBB1_28
; CHECK-NEXT: @ %bb.1: @ %for.cond2.preheader.lr.ph
-; CHECK-NEXT: movs r0, #1
+; CHECK-NEXT: mov r7, r2
; CHECK-NEXT: cmp r2, #1
-; CHECK-NEXT: csel r7, r2, r0, lt
-; CHECK-NEXT: mov r12, r1
-; CHECK-NEXT: mov r1, r7
+; CHECK-NEXT: it ge
+; CHECK-NEXT: movge r7, #1
+; CHECK-NEXT: mov r0, r7
; CHECK-NEXT: cmp r7, #3
; CHECK-NEXT: it ls
-; CHECK-NEXT: movls r1, #3
-; CHECK-NEXT: mov r4, r2
-; CHECK-NEXT: subs r1, r1, r7
-; CHECK-NEXT: movw r2, #43691
-; CHECK-NEXT: adds r1, #2
-; CHECK-NEXT: movt r2, #43690
+; CHECK-NEXT: movls r0, #3
+; CHECK-NEXT: subs r0, r0, r7
+; CHECK-NEXT: mov r12, r1
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: mov.w r1, #1431655765
; CHECK-NEXT: ldr r6, [sp, #128]
; CHECK-NEXT: movw r8, :lower16:c
-; CHECK-NEXT: umull r1, r2, r1, r2
+; CHECK-NEXT: umull r0, r1, r0, r1
; CHECK-NEXT: movt r8, :upper16:c
-; CHECK-NEXT: movs r1, #4
+; CHECK-NEXT: mov.w r9, #12
+; CHECK-NEXT: @ implicit-def: $r11
; CHECK-NEXT: @ implicit-def: $r10
; CHECK-NEXT: @ implicit-def: $r5
-; CHECK-NEXT: @ implicit-def: $r11
-; CHECK-NEXT: mov.w r9, #12
-; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT: add.w r1, r1, r2, lsr #1
-; CHECK-NEXT: add.w r0, r0, r2, lsr #1
+; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: adds r0, r1, #1
+; CHECK-NEXT: adds r1, #4
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: adr r1, .LCPI1_0
; CHECK-NEXT: vldrw.u32 q0, [r1]
@@ -399,35 +397,31 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: b .LBB1_6
; CHECK-NEXT: .LBB1_2: @ %for.body6.preheader
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: mov r0, r11
-; CHECK-NEXT: cmn.w r11, #4
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: cmn.w r5, #4
; CHECK-NEXT: it le
; CHECK-NEXT: mvnle r0, #3
; CHECK-NEXT: movw r2, #18725
; CHECK-NEXT: adds r0, #6
; CHECK-NEXT: movt r2, #9362
-; CHECK-NEXT: sub.w r1, r0, r11
-; CHECK-NEXT: mov r10, r3
+; CHECK-NEXT: subs r1, r0, r5
+; CHECK-NEXT: mov r5, r3
; CHECK-NEXT: umull r2, r3, r1, r2
-; CHECK-NEXT: subs r2, r1, r3
-; CHECK-NEXT: add.w r2, r3, r2, lsr #1
-; CHECK-NEXT: lsrs r3, r2, #2
-; CHECK-NEXT: lsls r3, r3, #3
-; CHECK-NEXT: sub.w r2, r3, r2, lsr #2
+; CHECK-NEXT: rsb r2, r3, r3, lsl #3
+; CHECK-NEXT: mov r3, r5
; CHECK-NEXT: subs r1, r2, r1
-; CHECK-NEXT: mov r3, r10
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup5.loopexit134.split.loop.exit139
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: add.w r11, r0, #7
+; CHECK-NEXT: adds r5, r0, #7
; CHECK-NEXT: .LBB1_4: @ %for.cond.cleanup5
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: mov.w r10, #0
+; CHECK-NEXT: mov.w r11, #0
; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup5
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: adds r5, #2
-; CHECK-NEXT: subs.w r1, r5, lr
-; CHECK-NEXT: asr.w r0, r5, #31
+; CHECK-NEXT: add.w r10, r10, #2
+; CHECK-NEXT: subs.w r1, r10, lr
+; CHECK-NEXT: asr.w r0, r10, #31
; CHECK-NEXT: sbcs.w r0, r0, r12
; CHECK-NEXT: bge.w .LBB1_28
; CHECK-NEXT: .LBB1_6: @ %for.cond2.preheader
@@ -436,7 +430,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: @ Child Loop BB1_10 Depth 2
; CHECK-NEXT: @ Child Loop BB1_12 Depth 3
; CHECK-NEXT: @ Child Loop BB1_14 Depth 3
-; CHECK-NEXT: cmp.w r11, #2
+; CHECK-NEXT: cmp r5, #2
; CHECK-NEXT: bgt .LBB1_5
; CHECK-NEXT: @ %bb.7: @ %for.body6.lr.ph
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
@@ -458,14 +452,14 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: vdup.32 q0, r2
; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: mov r0, r11
+; CHECK-NEXT: mov r0, r5
; CHECK-NEXT: b .LBB1_10
; CHECK-NEXT: .LBB1_9: @ %for.cond.cleanup17.us
; CHECK-NEXT: @ in Loop: Header=BB1_10 Depth=2
-; CHECK-NEXT: add.w r11, r0, #7
+; CHECK-NEXT: adds r5, r0, #7
; CHECK-NEXT: cmn.w r0, #4
-; CHECK-NEXT: mov.w r10, #0
-; CHECK-NEXT: mov r0, r11
+; CHECK-NEXT: mov.w r11, #0
+; CHECK-NEXT: mov r0, r5
; CHECK-NEXT: bge .LBB1_5
; CHECK-NEXT: .LBB1_10: @ %for.body6.us
; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1
@@ -523,7 +517,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: beq .LBB1_9
; CHECK-NEXT: @ %bb.16: @ %for.cond9.for.cond15.preheader_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB1_10 Depth=2
-; CHECK-NEXT: eor r1, r10, #1
+; CHECK-NEXT: eor r1, r11, #1
; CHECK-NEXT: lsls r1, r1, #31
; CHECK-NEXT: bne .LBB1_9
; CHECK-NEXT: b .LBB1_26
@@ -532,11 +526,11 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: cmp r6, #0
; CHECK-NEXT: beq.w .LBB1_2
; CHECK-NEXT: @ %bb.18: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: mov r0, r11
+; CHECK-NEXT: mov r0, r5
; CHECK-NEXT: .LBB1_19: @ %for.body6.us60
; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: lsls.w r1, r10, #31
+; CHECK-NEXT: lsls.w r1, r11, #31
; CHECK-NEXT: bne .LBB1_27
; CHECK-NEXT: @ %bb.20: @ %for.cond.cleanup17.us63
; CHECK-NEXT: @ in Loop: Header=BB1_19 Depth=2
@@ -552,19 +546,19 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: bgt .LBB1_25
; CHECK-NEXT: @ %bb.23: @ %for.cond.cleanup17.us63.3
; CHECK-NEXT: @ in Loop: Header=BB1_19 Depth=2
-; CHECK-NEXT: add.w r11, r0, #28
+; CHECK-NEXT: add.w r5, r0, #28
; CHECK-NEXT: cmn.w r0, #25
-; CHECK-NEXT: mov.w r10, #0
-; CHECK-NEXT: mov r0, r11
+; CHECK-NEXT: mov.w r11, #0
+; CHECK-NEXT: mov r0, r5
; CHECK-NEXT: blt .LBB1_19
; CHECK-NEXT: b .LBB1_5
; CHECK-NEXT: .LBB1_24: @ %for.cond.cleanup5.loopexit134.split.loop.exit137
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: add.w r11, r0, #14
+; CHECK-NEXT: add.w r5, r0, #14
; CHECK-NEXT: b .LBB1_4
; CHECK-NEXT: .LBB1_25: @ %for.cond.cleanup5.loopexit134.split.loop.exit135
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: add.w r11, r0, #21
+; CHECK-NEXT: add.w r5, r0, #21
; CHECK-NEXT: b .LBB1_4
; CHECK-NEXT: .LBB1_26: @ %for.inc19.us
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/Thumb2/thumb2-select.ll b/llvm/test/CodeGen/Thumb2/thumb2-select.ll
index 105c2672ee1b9..656b6f45f061c 100644
--- a/llvm/test/CodeGen/Thumb2/thumb2-select.ll
+++ b/llvm/test/CodeGen/Thumb2/thumb2-select.ll
@@ -1,11 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 -show-mc-encoding %s -o - \
; RUN: | FileCheck %s
define i32 @f1(i32 %a.s) {
-entry:
; CHECK-LABEL: f1:
-; CHECK: it eq
-; CHECK: moveq
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r1, #3 @ encoding: [0x03,0x21]
+; CHECK-NEXT: cmp r0, #4 @ encoding: [0x04,0x28]
+; CHECK-NEXT: it eq @ encoding: [0x08,0xbf]
+; CHECK-NEXT: moveq r1, #2 @ encoding: [0x02,0x21]
+; CHECK-NEXT: mov r0, r1 @ encoding: [0x08,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
+entry:
%tmp = icmp eq i32 %a.s, 4
%tmp1.s = select i1 %tmp, i32 2, i32 3
@@ -13,30 +19,45 @@ entry:
}
define i32 @f2(i32 %a.s) {
-entry:
; CHECK-LABEL: f2:
-; CHECK: it gt
-; CHECK: movgt
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r1, #3 @ encoding: [0x03,0x21]
+; CHECK-NEXT: cmp r0, #4 @ encoding: [0x04,0x28]
+; CHECK-NEXT: it gt @ encoding: [0xc8,0xbf]
+; CHECK-NEXT: movgt r1, #2 @ encoding: [0x02,0x21]
+; CHECK-NEXT: mov r0, r1 @ encoding: [0x08,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
+entry:
%tmp = icmp sgt i32 %a.s, 4
%tmp1.s = select i1 %tmp, i32 2, i32 3
ret i32 %tmp1.s
}
define i32 @f3(i32 %a.s, i32 %b.s) {
-entry:
; CHECK-LABEL: f3:
-; CHECK: it lt
-; CHECK: movlt
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r2, #3 @ encoding: [0x03,0x22]
+; CHECK-NEXT: cmp r0, r1 @ encoding: [0x88,0x42]
+; CHECK-NEXT: it lt @ encoding: [0xb8,0xbf]
+; CHECK-NEXT: movlt r2, #2 @ encoding: [0x02,0x22]
+; CHECK-NEXT: mov r0, r2 @ encoding: [0x10,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
+entry:
%tmp = icmp slt i32 %a.s, %b.s
%tmp1.s = select i1 %tmp, i32 2, i32 3
ret i32 %tmp1.s
}
define i32 @f4(i32 %a.s, i32 %b.s) {
-entry:
; CHECK-LABEL: f4:
-; CHECK: it le
-; CHECK: movle
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r2, #3 @ encoding: [0x03,0x22]
+; CHECK-NEXT: cmp r0, r1 @ encoding: [0x88,0x42]
+; CHECK-NEXT: it le @ encoding: [0xd8,0xbf]
+; CHECK-NEXT: movle r2, #2 @ encoding: [0x02,0x22]
+; CHECK-NEXT: mov r0, r2 @ encoding: [0x10,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
+entry:
%tmp = icmp sle i32 %a.s, %b.s
%tmp1.s = select i1 %tmp, i32 2, i32 3
@@ -44,30 +65,46 @@ entry:
}
define i32 @f5(i32 %a.u, i32 %b.u) {
-entry:
; CHECK-LABEL: f5:
-; CHECK: it ls
-; CHECK: movls
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r2, #3 @ encoding: [0x03,0x22]
+; CHECK-NEXT: cmp r0, r1 @ encoding: [0x88,0x42]
+; CHECK-NEXT: it ls @ encoding: [0x98,0xbf]
+; CHECK-NEXT: movls r2, #2 @ encoding: [0x02,0x22]
+; CHECK-NEXT: mov r0, r2 @ encoding: [0x10,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
+entry:
%tmp = icmp ule i32 %a.u, %b.u
%tmp1.s = select i1 %tmp, i32 2, i32 3
ret i32 %tmp1.s
}
define i32 @f6(i32 %a.u, i32 %b.u) {
-entry:
; CHECK-LABEL: f6:
-; CHECK: it hi
-; CHECK: movhi
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r2, #3 @ encoding: [0x03,0x22]
+; CHECK-NEXT: cmp r0, r1 @ encoding: [0x88,0x42]
+; CHECK-NEXT: it hi @ encoding: [0x88,0xbf]
+; CHECK-NEXT: movhi r2, #2 @ encoding: [0x02,0x22]
+; CHECK-NEXT: mov r0, r2 @ encoding: [0x10,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
+entry:
%tmp = icmp ugt i32 %a.u, %b.u
%tmp1.s = select i1 %tmp, i32 2, i32 3
ret i32 %tmp1.s
}
define i32 @f7(i32 %a, i32 %b, i32 %c) {
-entry:
; CHECK-LABEL: f7:
-; CHECK: it hi
-; CHECK: lsrhi {{r[0-9]+}}
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: mov.w r3, #1431655765 @ encoding: [0x4f,0xf0,0x55,0x33]
+; CHECK-NEXT: umull r3, r2, r2, r3 @ encoding: [0xa2,0xfb,0x03,0x32]
+; CHECK-NEXT: cmp r0, r1 @ encoding: [0x88,0x42]
+; CHECK-NEXT: it ls @ encoding: [0x98,0xbf]
+; CHECK-NEXT: movls r2, #3 @ encoding: [0x03,0x22]
+; CHECK-NEXT: mov r0, r2 @ encoding: [0x10,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
+entry:
%tmp1 = icmp ugt i32 %a, %b
%tmp2 = udiv i32 %c, 3
%tmp3 = select i1 %tmp1, i32 %tmp2, i32 3
@@ -75,10 +112,15 @@ entry:
}
define i32 @f8(i32 %a, i32 %b, i32 %c) {
-entry:
; CHECK-LABEL: f8:
-; CHECK: it lo
-; CHECK: lsllo {{r[0-9]+}}
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r3, #3 @ encoding: [0x03,0x23]
+; CHECK-NEXT: cmp r0, r1 @ encoding: [0x88,0x42]
+; CHECK-NEXT: it lo @ encoding: [0x38,0xbf]
+; CHECK-NEXT: lsllo r3, r2, #2 @ encoding: [0x93,0x00]
+; CHECK-NEXT: mov r0, r3 @ encoding: [0x18,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
+entry:
%tmp1 = icmp ult i32 %a, %b
%tmp2 = mul i32 %c, 4
%tmp3 = select i1 %tmp1, i32 %tmp2, i32 3
@@ -86,10 +128,15 @@ entry:
}
define i32 @f9(i32 %a, i32 %b, i32 %c) {
-entry:
; CHECK-LABEL: f9:
-; CHECK: it ge
-; CHECK: rorge.w
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r3, #3 @ encoding: [0x03,0x23]
+; CHECK-NEXT: cmp r0, r1 @ encoding: [0x88,0x42]
+; CHECK-NEXT: it ge @ encoding: [0xa8,0xbf]
+; CHECK-NEXT: rorge.w r3, r2, #22 @ encoding: [0x4f,0xea,0xb2,0x53]
+; CHECK-NEXT: mov r0, r3 @ encoding: [0x18,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
+entry:
%tmp1 = icmp sge i32 %a, %b
%tmp2 = shl i32 %c, 10
%tmp3 = lshr i32 %c, 22
@@ -100,7 +147,13 @@ entry:
define i32 @f10(i32 %a, i32 %b) {
; CHECK-LABEL: f10:
-; CHECK: movwne {{r[0-9]+}}, #1234 @ encoding: [0x40,0xf2,0xd2,0x4{{[0-9a-f]+}}]
+; CHECK: @ %bb.0:
+; CHECK-NEXT: movw r2, #12345 @ encoding: [0x43,0xf2,0x39,0x02]
+; CHECK-NEXT: cmp r0, r1 @ encoding: [0x88,0x42]
+; CHECK-NEXT: it ne @ encoding: [0x18,0xbf]
+; CHECK-NEXT: movwne r2, #1234 @ encoding: [0x40,0xf2,0xd2,0x42]
+; CHECK-NEXT: mov r0, r2 @ encoding: [0x10,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
%tst = icmp ne i32 %a, %b
%val = select i1 %tst, i32 1234, i32 12345
ret i32 %val
@@ -109,7 +162,15 @@ define i32 @f10(i32 %a, i32 %b) {
; Make sure we pick the Thumb encoding for movw/movt
define i32 @f11(i32 %a, i32 %b) {
; CHECK-LABEL: f11:
-; CHECK: movwne {{r[0-9]+}}, #50033 @ encoding: [0x4c,0xf2,0x71,0x3{{[0-9a-f]+}}]
+; CHECK: @ %bb.0:
+; CHECK-NEXT: movw r2, #49977 @ encoding: [0x4c,0xf2,0x39,0x32]
+; CHECK-NEXT: movt r2, #8288 @ encoding: [0xc2,0xf2,0x60,0x02]
+; CHECK-NEXT: cmp r0, r1 @ encoding: [0x88,0x42]
+; CHECK-NEXT: itt ne @ encoding: [0x1c,0xbf]
+; CHECK-NEXT: movwne r2, #50033 @ encoding: [0x4c,0xf2,0x71,0x32]
+; CHECK-NEXT: movtne r2, #1883 @ encoding: [0xc0,0xf2,0x5b,0x72]
+; CHECK-NEXT: mov r0, r2 @ encoding: [0x10,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
%tst = icmp ne i32 %a, %b
%val = select i1 %tst, i32 123454321, i32 543212345
ret i32 %val
diff --git a/llvm/test/CodeGen/VE/Scalar/div.ll b/llvm/test/CodeGen/VE/Scalar/div.ll
index 64caf8a835468..a44a669f50a37 100644
--- a/llvm/test/CodeGen/VE/Scalar/div.ll
+++ b/llvm/test/CodeGen/VE/Scalar/div.ll
@@ -1,14 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=ve-unknown-unknown | FileCheck %s
; Function Attrs: norecurse nounwind readnone
define i128 @divi128(i128, i128) {
; CHECK-LABEL: divi128:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB0_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: lea %s4, __divti3@lo
; CHECK-NEXT: and %s4, %s4, (32)0
; CHECK-NEXT: lea.sl %s12, __divti3@hi(, %s4)
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%3 = sdiv i128 %0, %1
ret i128 %3
}
@@ -37,12 +56,30 @@ define signext i32 @divi32(i32 signext %a, i32 signext %b) {
; Function Attrs: norecurse nounwind readnone
define i128 @divu128(i128, i128) {
; CHECK-LABEL: divu128:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB3_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB3_2:
; CHECK-NEXT: lea %s4, __udivti3@lo
; CHECK-NEXT: and %s4, %s4, (32)0
; CHECK-NEXT: lea.sl %s12, __udivti3@hi(, %s4)
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%3 = udiv i128 %0, %1
ret i128 %3
}
@@ -123,7 +160,22 @@ define zeroext i8 @divu8(i8 zeroext %a, i8 zeroext %b) {
; Function Attrs: norecurse nounwind readnone
define i128 @divi128ri(i128) {
; CHECK-LABEL: divi128ri:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB10_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB10_2:
; CHECK-NEXT: lea %s2, __divti3@lo
; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: lea.sl %s12, __divti3@hi(, %s2)
@@ -131,6 +183,9 @@ define i128 @divi128ri(i128) {
; CHECK-NEXT: or %s3, 0, (0)1
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%2 = sdiv i128 %0, 3
ret i128 %2
}
@@ -163,7 +218,22 @@ define signext i32 @divi32ri(i32 signext %a, i32 signext %b) {
; Function Attrs: norecurse nounwind readnone
define i128 @divu128ri(i128) {
; CHECK-LABEL: divu128ri:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB13_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB13_2:
; CHECK-NEXT: lea %s2, __udivti3@lo
; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: lea.sl %s12, __udivti3@hi(, %s2)
@@ -171,6 +241,9 @@ define i128 @divu128ri(i128) {
; CHECK-NEXT: or %s3, 0, (0)1
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%2 = udiv i128 %0, 3
ret i128 %2
}
@@ -189,10 +262,9 @@ define i64 @divu64ri(i64 %a, i64 %b) {
define zeroext i32 @divu32ri(i32 zeroext %a, i32 zeroext %b) {
; CHECK-LABEL: divu32ri:
; CHECK: # %bb.0:
-; CHECK-NEXT: lea %s1, -1431655765
-; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea %s1, 1431655765
; CHECK-NEXT: muls.l %s0, %s0, %s1
-; CHECK-NEXT: srl %s0, %s0, 33
+; CHECK-NEXT: srl %s0, %s0, 32
; CHECK-NEXT: b.l.t (, %s10)
%r = udiv i32 %a, 3
ret i32 %r
@@ -201,7 +273,22 @@ define zeroext i32 @divu32ri(i32 zeroext %a, i32 zeroext %b) {
; Function Attrs: norecurse nounwind readnone
define i128 @divi128li(i128) {
; CHECK-LABEL: divi128li:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB16_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB16_2:
; CHECK-NEXT: or %s3, 0, %s1
; CHECK-NEXT: or %s2, 0, %s0
; CHECK-NEXT: lea %s0, __divti3@lo
@@ -211,6 +298,9 @@ define i128 @divi128li(i128) {
; CHECK-NEXT: or %s1, 0, (0)1
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%2 = sdiv i128 3, %0
ret i128 %2
}
@@ -239,7 +329,22 @@ define signext i32 @divi32li(i32 signext %a, i32 signext %b) {
; Function Attrs: norecurse nounwind readnone
define i128 @divu128li(i128) {
; CHECK-LABEL: divu128li:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB19_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB19_2:
; CHECK-NEXT: or %s3, 0, %s1
; CHECK-NEXT: or %s2, 0, %s0
; CHECK-NEXT: lea %s0, __udivti3@lo
@@ -249,6 +354,9 @@ define i128 @divu128li(i128) {
; CHECK-NEXT: or %s1, 0, (0)1
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%2 = udiv i128 3, %0
ret i128 %2
}
diff --git a/llvm/test/CodeGen/VE/Scalar/rem.ll b/llvm/test/CodeGen/VE/Scalar/rem.ll
index 9911405c6a68d..4625e274baa1e 100644
--- a/llvm/test/CodeGen/VE/Scalar/rem.ll
+++ b/llvm/test/CodeGen/VE/Scalar/rem.ll
@@ -1,14 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=ve-unknown-unknown | FileCheck %s
; Function Attrs: norecurse nounwind readnone
define i128 @remi128(i128 %a, i128 %b) {
; CHECK-LABEL: remi128:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB0_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: lea %s4, __modti3@lo
; CHECK-NEXT: and %s4, %s4, (32)0
; CHECK-NEXT: lea.sl %s12, __modti3@hi(, %s4)
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%r = srem i128 %a, %b
ret i128 %r
}
@@ -41,12 +60,30 @@ define signext i32 @remi32(i32 signext %a, i32 signext %b) {
; Function Attrs: norecurse nounwind readnone
define i128 @remu128(i128 %a, i128 %b) {
; CHECK-LABEL: remu128:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB3_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB3_2:
; CHECK-NEXT: lea %s4, __umodti3@lo
; CHECK-NEXT: and %s4, %s4, (32)0
; CHECK-NEXT: lea.sl %s12, __umodti3@hi(, %s4)
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%r = urem i128 %a, %b
ret i128 %r
}
@@ -137,7 +174,22 @@ define zeroext i8 @remu8(i8 zeroext %a, i8 zeroext %b) {
; Function Attrs: norecurse nounwind readnone
define i128 @remi128ri(i128 %a) {
; CHECK-LABEL: remi128ri:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB10_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB10_2:
; CHECK-NEXT: lea %s2, __modti3@lo
; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: lea.sl %s12, __modti3@hi(, %s2)
@@ -145,6 +197,9 @@ define i128 @remi128ri(i128 %a) {
; CHECK-NEXT: or %s3, 0, (0)1
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%r = srem i128 %a, 3
ret i128 %r
}
@@ -181,7 +236,22 @@ define signext i32 @remi32ri(i32 signext %a) {
; Function Attrs: norecurse nounwind readnone
define i128 @remu128ri(i128 %a) {
; CHECK-LABEL: remu128ri:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB13_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB13_2:
; CHECK-NEXT: lea %s2, __umodti3@lo
; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: lea.sl %s12, __umodti3@hi(, %s2)
@@ -189,6 +259,9 @@ define i128 @remu128ri(i128 %a) {
; CHECK-NEXT: or %s3, 0, (0)1
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%r = urem i128 %a, 11
ret i128 %r
}
@@ -209,10 +282,9 @@ define i64 @remu64ri(i64 %a) {
define zeroext i32 @remu32ri(i32 zeroext %a) {
; CHECK-LABEL: remu32ri:
; CHECK: # %bb.0:
-; CHECK-NEXT: lea %s1, -1431655765
-; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea %s1, 1431655765
; CHECK-NEXT: muls.l %s1, %s0, %s1
-; CHECK-NEXT: srl %s1, %s1, 33
+; CHECK-NEXT: srl %s1, %s1, 32
; CHECK-NEXT: muls.w.sx %s1, 3, %s1
; CHECK-NEXT: subs.w.sx %s0, %s0, %s1
; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1
@@ -224,7 +296,22 @@ define zeroext i32 @remu32ri(i32 zeroext %a) {
; Function Attrs: norecurse nounwind readnone
define i128 @remi128li(i128 %a) {
; CHECK-LABEL: remi128li:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB16_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB16_2:
; CHECK-NEXT: or %s3, 0, %s1
; CHECK-NEXT: or %s2, 0, %s0
; CHECK-NEXT: lea %s0, __modti3@lo
@@ -234,6 +321,9 @@ define i128 @remi128li(i128 %a) {
; CHECK-NEXT: or %s1, 0, (0)1
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%r = srem i128 3, %a
ret i128 %r
}
@@ -266,7 +356,22 @@ define signext i32 @remi32li(i32 signext %a, i32 signext %b) {
; Function Attrs: norecurse nounwind readnone
define i128 @remu128li(i128) {
; CHECK-LABEL: remu128li:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB19_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB19_2:
; CHECK-NEXT: or %s3, 0, %s1
; CHECK-NEXT: or %s2, 0, %s0
; CHECK-NEXT: lea %s0, __umodti3@lo
@@ -276,6 +381,9 @@ define i128 @remu128li(i128) {
; CHECK-NEXT: or %s1, 0, (0)1
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%2 = urem i128 3, %0
ret i128 %2
}
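
A note for anyone auditing the regenerated VE checks above (for instance the
remu32ri change from the 0xAAAAAAAB multiply plus srl 33 to 0x55555555 plus
srl 32): the property a multiply/shift pair has to satisfy can be screened
exhaustively at a narrow bit width. A minimal sketch, assuming a standalone
helper (not code from this patch) and shown with a textbook 16-bit pair
rather than constants taken from the new checks:

  // Screen a candidate (Magic, PostShift) pair for unsigned division by a
  // constant: check floor((X * Magic) >> (Width + PostShift)) == X / Divisor
  // for every X of the given width. Keep Width small enough that X * Magic
  // fits in 64 bits.
  #include <cstdint>
  #include <cstdio>

  static bool screenMagic(uint32_t Divisor, uint64_t Magic, unsigned PostShift,
                          unsigned Width) {
    for (uint64_t X = 0; X < (1ULL << Width); ++X)
      if (((X * Magic) >> (Width + PostShift)) != X / Divisor)
        return false; // first counterexample disproves the pair
    return true;
  }

  int main() {
    // Classic round-up pair for a 16-bit udiv by 3: Magic = ceil(2^17 / 3).
    std::printf("%s\n", screenMagic(3, 43691, 1, 16) ? "ok" : "bad");
  }

The same loop, pointed at the constants in a plain multiply-high plus shift
CHECK sequence, is a quick way to see what that sequence computes; the
fixup-style sequences need the corresponding extra steps.
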
diff --git a/llvm/test/CodeGen/VE/Vector/vec_divrem.ll b/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
index 3bc0aba8d4264..940886ca1b096 100644
--- a/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
+++ b/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
@@ -7,19 +7,10 @@
define <4 x i8> @udiv_by_minus_one(<4 x i8> %x) {
; CHECK-LABEL: udiv_by_minus_one:
; CHECK: # %bb.0:
-; CHECK-NEXT: and %s0, %s0, (56)0
-; CHECK-NEXT: lea %s4, 16843010
-; CHECK-NEXT: muls.l %s0, %s0, %s4
-; CHECK-NEXT: srl %s0, %s0, 32
-; CHECK-NEXT: and %s1, %s1, (56)0
-; CHECK-NEXT: muls.l %s1, %s1, %s4
-; CHECK-NEXT: srl %s1, %s1, 32
-; CHECK-NEXT: and %s2, %s2, (56)0
-; CHECK-NEXT: muls.l %s2, %s2, %s4
-; CHECK-NEXT: srl %s2, %s2, 32
-; CHECK-NEXT: and %s3, %s3, (56)0
-; CHECK-NEXT: muls.l %s3, %s3, %s4
-; CHECK-NEXT: srl %s3, %s3, 32
+; CHECK-NEXT: or %s0, 0, (0)1
+; CHECK-NEXT: or %s1, 0, (0)1
+; CHECK-NEXT: or %s2, 0, (0)1
+; CHECK-NEXT: or %s3, 0, (0)1
; CHECK-NEXT: b.l.t (, %s10)
%r = udiv <4 x i8> %x, <i8 255, i8 255, i8 255, i8 255>
ret <4 x i8> %r
@@ -32,23 +23,6 @@ define <4 x i8> @urem_by_minus_one(<4 x i8> %x) {
; CHECK-NEXT: and %s1, %s1, (56)0
; CHECK-NEXT: and %s2, %s2, (56)0
; CHECK-NEXT: and %s3, %s3, (56)0
-; CHECK-NEXT: lea %s4, 16843010
-; CHECK-NEXT: muls.l %s5, %s3, %s4
-; CHECK-NEXT: srl %s5, %s5, 32
-; CHECK-NEXT: muls.w.sx %s5, %s5, (56)0
-; CHECK-NEXT: subs.w.sx %s3, %s3, %s5
-; CHECK-NEXT: muls.l %s5, %s2, %s4
-; CHECK-NEXT: srl %s5, %s5, 32
-; CHECK-NEXT: muls.w.sx %s5, %s5, (56)0
-; CHECK-NEXT: subs.w.sx %s2, %s2, %s5
-; CHECK-NEXT: muls.l %s5, %s1, %s4
-; CHECK-NEXT: srl %s5, %s5, 32
-; CHECK-NEXT: muls.w.sx %s5, %s5, (56)0
-; CHECK-NEXT: subs.w.sx %s1, %s1, %s5
-; CHECK-NEXT: muls.l %s4, %s0, %s4
-; CHECK-NEXT: srl %s4, %s4, 32
-; CHECK-NEXT: muls.w.sx %s4, %s4, (56)0
-; CHECK-NEXT: subs.w.sx %s0, %s0, %s4
; CHECK-NEXT: b.l.t (, %s10)
%r = urem <4 x i8> %x, <i8 255, i8 255, i8 255, i8 255>
ret <4 x i8> %r
diff --git a/llvm/test/CodeGen/X86/and-encoding.ll b/llvm/test/CodeGen/X86/and-encoding.ll
index 248686ff8b7a2..db60d2f561e3e 100644
--- a/llvm/test/CodeGen/X86/and-encoding.ll
+++ b/llvm/test/CodeGen/X86/and-encoding.ll
@@ -104,11 +104,10 @@ define i64 @lopped64_64to32(i64 %x) {
define i32 @shrinkAndKnownBits(i32 %x) {
; CHECK-LABEL: shrinkAndKnownBits:
; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edi, %ecx # encoding: [0x89,0xf9]
-; CHECK-NEXT: movl $4042322161, %eax # encoding: [0xb8,0xf1,0xf0,0xf0,0xf0]
-; CHECK-NEXT: # imm = 0xF0F0F0F1
-; CHECK-NEXT: imulq %rcx, %rax # encoding: [0x48,0x0f,0xaf,0xc1]
-; CHECK-NEXT: shrq $36, %rax # encoding: [0x48,0xc1,0xe8,0x24]
+; CHECK-NEXT: movl %edi, %eax # encoding: [0x89,0xf8]
+; CHECK-NEXT: imulq $252645135, %rax, %rax # encoding: [0x48,0x69,0xc0,0x0f,0x0f,0x0f,0x0f]
+; CHECK-NEXT: # imm = 0xF0F0F0F
+; CHECK-NEXT: shrq $32, %rax # encoding: [0x48,0xc1,0xe8,0x20]
; CHECK-NEXT: andl $-128, %eax # encoding: [0x83,0xe0,0x80]
; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-NEXT: retq # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index 3fb994cdb751a..e97b813e34921 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -695,9 +695,8 @@ define i64 @load_fold_udiv1(ptr %p) {
; CHECK-O3-LABEL: load_fold_udiv1:
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: movq (%rdi), %rdx
-; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
+; CHECK-O3-NEXT: movabsq $1229782938247303441, %rax # imm = 0x1111111111111111
; CHECK-O3-NEXT: mulxq %rax, %rax, %rax
-; CHECK-O3-NEXT: shrq $3, %rax
; CHECK-O3-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%ret = udiv i64 %v, 15
@@ -882,10 +881,9 @@ define i64 @load_fold_urem1(ptr %p) {
; CHECK-O3-LABEL: load_fold_urem1:
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: movq (%rdi), %rax
-; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
+; CHECK-O3-NEXT: movabsq $1229782938247303441, %rcx # imm = 0x1111111111111111
; CHECK-O3-NEXT: movq %rax, %rdx
; CHECK-O3-NEXT: mulxq %rcx, %rcx, %rcx
-; CHECK-O3-NEXT: shrq $3, %rcx
; CHECK-O3-NEXT: leaq (%rcx,%rcx,4), %rcx
; CHECK-O3-NEXT: leaq (%rcx,%rcx,2), %rcx
; CHECK-O3-NEXT: subq %rcx, %rax
@@ -1493,9 +1491,8 @@ define void @rmw_fold_udiv1(ptr %p, i64 %v) {
; CHECK-LABEL: rmw_fold_udiv1:
; CHECK: # %bb.0:
; CHECK-NEXT: movq (%rdi), %rdx
-; CHECK-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
+; CHECK-NEXT: movabsq $1229782938247303441, %rax # imm = 0x1111111111111111
; CHECK-NEXT: mulxq %rax, %rax, %rax
-; CHECK-NEXT: shrq $3, %rax
; CHECK-NEXT: movq %rax, (%rdi)
; CHECK-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
@@ -1623,10 +1620,9 @@ define void @rmw_fold_urem1(ptr %p, i64 %v) {
; CHECK-O0-LABEL: rmw_fold_urem1:
; CHECK-O0: # %bb.0:
; CHECK-O0-NEXT: movq (%rdi), %rax
-; CHECK-O0-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
+; CHECK-O0-NEXT: movabsq $1229782938247303441, %rcx # imm = 0x1111111111111111
; CHECK-O0-NEXT: movq %rax, %rdx
; CHECK-O0-NEXT: mulxq %rcx, %rcx, %rcx
-; CHECK-O0-NEXT: shrq $3, %rcx
; CHECK-O0-NEXT: leaq (%rcx,%rcx,4), %rcx
; CHECK-O0-NEXT: leaq (%rcx,%rcx,2), %rcx
; CHECK-O0-NEXT: subq %rcx, %rax
@@ -1636,9 +1632,8 @@ define void @rmw_fold_urem1(ptr %p, i64 %v) {
; CHECK-O3-LABEL: rmw_fold_urem1:
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: movq (%rdi), %rdx
-; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
+; CHECK-O3-NEXT: movabsq $1229782938247303441, %rax # imm = 0x1111111111111111
; CHECK-O3-NEXT: mulxq %rax, %rax, %rax
-; CHECK-O3-NEXT: shrq $3, %rax
; CHECK-O3-NEXT: leaq (%rax,%rax,4), %rax
; CHECK-O3-NEXT: leaq (%rax,%rax,2), %rax
; CHECK-O3-NEXT: subq %rax, %rdx
diff --git a/llvm/test/CodeGen/X86/bug80500.ll b/llvm/test/CodeGen/X86/bug80500.ll
index bdf72887ef2f9..90864535c5145 100644
--- a/llvm/test/CodeGen/X86/bug80500.ll
+++ b/llvm/test/CodeGen/X86/bug80500.ll
@@ -7,9 +7,8 @@ define i32 @load_fold_udiv1(ptr %p) {
; CHECK-LABEL: load_fold_udiv1:
; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl $-2004318071, %edx # imm = 0x88888889
+; CHECK-NEXT: movl $286331153, %edx # imm = 0x11111111
; CHECK-NEXT: mulxl (%eax), %eax, %eax
-; CHECK-NEXT: shrl $3, %eax
; CHECK-NEXT: retl
%v = load i32, ptr %p, align 4
%ret = udiv i32 %v, 15
diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll
index aa3bea2791416..163c11c28882c 100644
--- a/llvm/test/CodeGen/X86/combine-pmuldq.ll
+++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll
@@ -203,68 +203,56 @@ define i32 @PR43159(ptr %a0) {
; SSE-LABEL: PR43159:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa (%rdi), %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $1, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [822987745,18122225,2164392967,3105965049]
+; SSE-NEXT: pmuludq %xmm0, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE-NEXT: psubd %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
-; SSE-NEXT: paddd %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: psrld $7, %xmm0
-; SSE-NEXT: psrld $6, %xmm2
-; SSE-NEXT: movd %xmm2, %edi
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $7, %xmm1
; SSE-NEXT: pextrd $1, %xmm0, %esi
-; SSE-NEXT: pextrd $2, %xmm2, %edx
-; SSE-NEXT: pextrd $3, %xmm0, %ecx
+; SSE-NEXT: psrld $5, %xmm0
+; SSE-NEXT: movd %xmm0, %edi
+; SSE-NEXT: pextrd $2, %xmm1, %edx
+; SSE-NEXT: pextrd $3, %xmm1, %ecx
; SSE-NEXT: jmp foo # TAILCALL
;
; AVX1-LABEL: PR43159:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $7, %xmm0, %xmm1
-; AVX1-NEXT: vpsrld $6, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %edi
-; AVX1-NEXT: vpextrd $1, %xmm1, %esi
-; AVX1-NEXT: vpextrd $2, %xmm0, %edx
+; AVX1-NEXT: vpsrld $5, %xmm0, %xmm2
+; AVX1-NEXT: vmovd %xmm2, %edi
+; AVX1-NEXT: vpextrd $1, %xmm0, %esi
+; AVX1-NEXT: vpextrd $2, %xmm1, %edx
; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
; AVX1-NEXT: jmp foo # TAILCALL
;
; AVX2-LABEL: PR43159:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %edi
@@ -276,18 +264,14 @@ define i32 @PR43159(ptr %a0) {
; AVX512VL-LABEL: PR43159:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vmovd %xmm0, %edi
@@ -299,18 +283,14 @@ define i32 @PR43159(ptr %a0) {
; AVX512DQVL-LABEL: PR43159:
; AVX512DQVL: # %bb.0: # %entry
; AVX512DQVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX512DQVL-NEXT: vpsubd %xmm2, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512DQVL-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQVL-NEXT: vmovd %xmm0, %edi
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index d5a481549f851..95ceabfb25f44 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -449,7 +449,7 @@ define i32 @combine_udiv_uniform(i32 %x) {
; CHECK-LABEL: combine_udiv_uniform:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: movl $2987803337, %eax # imm = 0xB21642C9
+; CHECK-NEXT: movl $2987803335, %eax # imm = 0xB21642C7
; CHECK-NEXT: imulq %rcx, %rax
; CHECK-NEXT: shrq $36, %rax
; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
@@ -461,29 +461,19 @@ define i32 @combine_udiv_uniform(i32 %x) {
define <8 x i16> @combine_vec_udiv_uniform(<8 x i16> %x) {
; SSE-LABEL: combine_vec_udiv_uniform:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [25645,25645,25645,25645,25645,25645,25645,25645]
-; SSE-NEXT: pmulhuw %xmm0, %xmm1
-; SSE-NEXT: psubw %xmm1, %xmm0
-; SSE-NEXT: psrlw $1, %xmm0
-; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [45589,45589,45589,45589,45589,45589,45589,45589]
; SSE-NEXT: psrlw $4, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_udiv_uniform:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [25645,25645,25645,25645,25645,25645,25645,25645]
-; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [45589,45589,45589,45589,45589,45589,45589,45589]
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: combine_vec_udiv_uniform:
; XOP: # %bb.0:
-; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [25645,25645,25645,25645,25645,25645,25645,25645]
-; XOP-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: vpsrlw $1, %xmm0, %xmm0
-; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [45589,45589,45589,45589,45589,45589,45589,45589]
; XOP-NEXT: vpsrlw $4, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = udiv <8 x i16> %x, <i16 23, i16 23, i16 23, i16 23, i16 23, i16 23, i16 23, i16 23>
@@ -493,18 +483,12 @@ define <8 x i16> @combine_vec_udiv_uniform(<8 x i16> %x) {
define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_udiv_nonuniform:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrlw $3, %xmm3
-; SSE2-NEXT: pandn %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [45589,3855,32779,4681,512,1,257,2]
+; SSE2-NEXT: pmulhuw %xmm0, %xmm1
; SSE2-NEXT: psubw %xmm1, %xmm0
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,32768,0,0,0,0]
; SSE2-NEXT: paddw %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,0,65535,0]
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
@@ -512,36 +496,31 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
;
; SSE41-LABEL: combine_vec_udiv_nonuniform:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $3, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [45589,3855,32779,4681,512,1,257,2]
+; SSE41-NEXT: pmulhuw %xmm0, %xmm1
; SSE41-NEXT: psubw %xmm1, %xmm0
-; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,32768,0,0,0,0]
; SSE41-NEXT: paddw %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4096,2048,8,u,u,2,2,u]
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4096,32768,2,16384,u,u,256,u]
; SSE41-NEXT: pmulhuw %xmm0, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6],xmm0[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6],xmm0[7]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_udiv_nonuniform:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [45589,3855,32779,4681,512,1,257,2]
; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,0,32768,0,0,0,0]
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [4096,2048,8,u,u,2,2,u]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6],xmm0[7]
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [4096,32768,2,16384,u,u,256,u]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6],xmm0[7]
; AVX-NEXT: retq
;
; XOP-LABEL: combine_vec_udiv_nonuniform:
; XOP: # %bb.0:
-; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
+; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [45589,3855,32779,4681,512,1,257,2]
; XOP-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,0,32768,0,0,0,0]
; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
@@ -550,40 +529,31 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
}
define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
-; SSE2-LABEL: combine_vec_udiv_nonuniform2:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [16393,59919,58255,32787,55189,8197,52429,32789]
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [8,2048,2048,2,2048,8,2048,2]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: combine_vec_udiv_nonuniform2:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16393,59919,58255,32787,55189,8197,52429,32789]
-; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [8,2048,2048,2,2048,8,2048,2]
-; SSE41-NEXT: retq
+; SSE-LABEL: combine_vec_udiv_nonuniform2:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [2049,59917,58253,16393,55187,32787,13107,8197]
+; SSE-NEXT: pmulhuw %xmm0, %xmm1
+; SSE-NEXT: psubw %xmm1, %xmm0
+; SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32,2048,2048,4,2048,2,8192,8]
+; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_udiv_nonuniform2:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16393,59919,58255,32787,55189,8197,52429,32789]
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [8,2048,2048,2,2048,8,2048,2]
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [2049,59917,58253,16393,55187,32787,13107,8197]
+; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32,2048,2048,4,2048,2,8192,8]
; AVX-NEXT: retq
;
; XOP-LABEL: combine_vec_udiv_nonuniform2:
; XOP: # %bb.0:
-; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16393,59919,58255,32787,55189,8197,52429,32789]
+; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [2049,59917,58253,16393,55187,32787,13107,8197]
+; XOP-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
%1 = udiv <8 x i16> %x, <i16 -34, i16 35, i16 36, i16 -37, i16 38, i16 -39, i16 40, i16 -41>
@@ -591,31 +561,33 @@ define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
}
define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
-; SSE-LABEL: combine_vec_udiv_nonuniform3:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9363,25645,18351,12137,2115,23705,1041,517]
-; SSE-NEXT: pmulhuw %xmm0, %xmm1
-; SSE-NEXT: psubw %xmm1, %xmm0
-; SSE-NEXT: psrlw $1, %xmm0
-; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,4096,4096,4096,4096,2048,2048,1024]
-; SSE-NEXT: retq
+; SSE2-LABEL: combine_vec_udiv_nonuniform3:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [9363,45589,20971,38835,2115,44619,1041,517]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,65535,0,0]
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_vec_udiv_nonuniform3:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [9363,45589,20971,38835,2115,44619,1041,517]
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,4096,8192,4096,u,2048,u,u]
+; SSE41-NEXT: pmulhuw %xmm0, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5],xmm0[6,7]
+; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_udiv_nonuniform3:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [9363,25645,18351,12137,2115,23705,1041,517]
-; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16384,4096,4096,4096,4096,2048,2048,1024]
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [9363,45589,20971,38835,2115,44619,1041,517]
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,4096,8192,4096,u,2048,u,u]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5],xmm0[6,7]
; AVX-NEXT: retq
;
; XOP-LABEL: combine_vec_udiv_nonuniform3:
; XOP: # %bb.0:
-; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [9363,25645,18351,12137,2115,23705,1041,517]
-; XOP-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: vpsrlw $1, %xmm0, %xmm0
-; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [9363,45589,20971,38835,2115,44619,1041,517]
; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
%1 = udiv <8 x i16> %x, <i16 7, i16 23, i16 25, i16 27, i16 31, i16 47, i16 63, i16 127>
@@ -631,7 +603,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: psrlw $15, %xmm0
+; SSE2-NEXT: psrlw $14, %xmm0
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
@@ -644,7 +616,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm2
-; SSE41-NEXT: psrlw $7, %xmm2
+; SSE41-NEXT: psrlw $6, %xmm2
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
@@ -657,7 +629,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX-NEXT: vpackuswb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $7, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615]
; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
@@ -665,12 +637,12 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
;
; XOP-LABEL: combine_vec_udiv_nonuniform4:
; XOP: # %bb.0:
-; XOP-NEXT: movl $171, %eax
+; XOP-NEXT: movl $85, %eax
; XOP-NEXT: vmovd %eax, %xmm1
; XOP-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; XOP-NEXT: vpmullw %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpsrlw $8, %xmm1, %xmm1
-; XOP-NEXT: movl $249, %eax
+; XOP-NEXT: movl $250, %eax
; XOP-NEXT: vmovd %eax, %xmm2
; XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm1
; XOP-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615]
@@ -683,54 +655,39 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
define <8 x i16> @pr38477(<8 x i16> %a0) {
; SSE2-LABEL: pr38477:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [u,4957,57457,4103,16385,35545,2048,2115]
-; SSE2-NEXT: pmulhuw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psubw %xmm1, %xmm2
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [u,32768,0,0,0,0,0,32768]
-; SSE2-NEXT: paddw %xmm1, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,0,65535]
-; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [u,551,57455,32823,32769,4443,2048,2115]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,0]
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: pr38477:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,4957,57457,4103,16385,35545,2048,2115]
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,551,57455,32823,32769,4443,2048,2115]
; SSE41-NEXT: pmulhuw %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psubw %xmm1, %xmm2
-; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [u,32768,0,0,0,0,0,32768]
-; SSE41-NEXT: paddw %xmm1, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,1024,1024,16,4,1024,u,4096]
-; SSE41-NEXT: pmulhuw %xmm2, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6],xmm1[7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [u,u,1024,2,2,8192,u,u]
+; SSE41-NEXT: pmulhuw %xmm1, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: pr38477:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,4957,57457,4103,16385,35545,2048,2115]
-; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,32768,0,0,0,0,0,32768]
-; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [u,1024,1024,16,4,1024,u,4096]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7]
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,551,57455,32823,32769,4443,2048,2115]
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [u,u,1024,2,2,8192,u,u]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX-NEXT: retq
;
; XOP-LABEL: pr38477:
; XOP: # %bb.0:
-; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,4957,57457,4103,16385,35545,2048,2115]
-; XOP-NEXT: vpsubw %xmm1, %xmm0, %xmm2
-; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,32768,0,0,0,0,0,32768]
-; XOP-NEXT: vpaddw %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,551,57455,32823,32769,4443,2048,2115]
; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; XOP-NEXT: retq
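
For context on the combine-udiv.ll churn above: the pmulhuw / psubw /
psrlw $1 / paddw shape that several checks add or drop is the usual "add
back" fixup, needed when the round-up magic for a divisor does not fit in the
element width. A scalar 32-bit sketch of both shapes, using long-standing
textbook constants (the same 0x24924925 that appears in the
divide-by-constant.ll checks below) rather than constants produced by this
patch:

  #include <cstdint>

  // Fixup shape: the magic needs 33 bits, so multiply by the truncated magic
  // and recover the dropped top bit with (X - T) / 2 + T before the final shift.
  uint32_t udiv7_fixup(uint32_t X) {
    uint32_t T = (uint64_t(X) * 0x24924925u) >> 32;
    return (((X - T) >> 1) + T) >> 2;
  }

  // Single-multiply shape: when the magic fits, one mulhi plus a shift is
  // enough, which is the shape the shorter CHECK sequences correspond to.
  uint32_t udiv3_single(uint32_t X) {
    return uint32_t((uint64_t(X) * 0xAAAAAAABu) >> 33);
  }
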
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index ac78136b9d8ea..95be53d3e4a30 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -7,14 +7,14 @@ define zeroext i16 @test1(i16 zeroext %x) nounwind {
; X86-LABEL: test1:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull $63551, %eax, %eax # imm = 0xF83F
+; X86-NEXT: imull $63549, %eax, %eax # imm = 0xF83D
; X86-NEXT: shrl $21, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test1:
; X64: # %bb.0: # %entry
-; X64-NEXT: imull $63551, %edi, %eax # imm = 0xF83F
+; X64-NEXT: imull $63549, %edi, %eax # imm = 0xF83D
; X64-NEXT: shrl $21, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
@@ -27,15 +27,15 @@ define zeroext i16 @test2(i8 signext %x, i16 zeroext %c) nounwind readnone ssp n
; X86-LABEL: test2:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull $43691, %eax, %eax # imm = 0xAAAB
-; X86-NEXT: shrl $17, %eax
+; X86-NEXT: imull $21845, %eax, %eax # imm = 0x5555
+; X86-NEXT: shrl $16, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test2:
; X64: # %bb.0: # %entry
-; X64-NEXT: imull $43691, %esi, %eax # imm = 0xAAAB
-; X64-NEXT: shrl $17, %eax
+; X64-NEXT: imull $21845, %esi, %eax # imm = 0x5555
+; X64-NEXT: shrl $16, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
entry:
@@ -48,15 +48,14 @@ define zeroext i8 @test3(i8 zeroext %x, i8 zeroext %c) nounwind readnone ssp nor
; X86-LABEL: test3:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull $171, %eax, %eax
-; X86-NEXT: shrl $9, %eax
-; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: imull $85, %eax, %eax
+; X86-NEXT: movb %ah, %al
; X86-NEXT: retl
;
; X64-LABEL: test3:
; X64: # %bb.0: # %entry
-; X64-NEXT: imull $171, %esi, %eax
-; X64-NEXT: shrl $9, %eax
+; X64-NEXT: imull $85, %esi, %eax
+; X64-NEXT: shrl $8, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
entry:
@@ -93,17 +92,17 @@ entry:
define i32 @test5(i32 %A) nounwind {
; X86-LABEL: test5:
; X86: # %bb.0:
-; X86-NEXT: movl $365384439, %eax # imm = 0x15C752F7
+; X86-NEXT: movl $1461537755, %eax # imm = 0x571D4BDB
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: shrl $27, %eax
+; X86-NEXT: shrl $29, %eax
; X86-NEXT: retl
;
; X64-LABEL: test5:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: imulq $365384439, %rax, %rax # imm = 0x15C752F7
-; X64-NEXT: shrq $59, %rax
+; X64-NEXT: imulq $1461537755, %rax, %rax # imm = 0x571D4BDB
+; X64-NEXT: shrq $61, %rax
; X64-NEXT: # kill: def $eax killed $eax killed $rax
; X64-NEXT: retq
%tmp1 = udiv i32 %A, 1577682821 ; <i32> [#uses=1]
@@ -139,19 +138,27 @@ entry:
define i32 @test7(i32 %x) nounwind {
; X86-LABEL: test7:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $2, %eax
-; X86-NEXT: movl $613566757, %ecx # imm = 0x24924925
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $-1840700271, %edx # imm = 0x92492491
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: subl %edx, %ecx
+; X86-NEXT: shrl %ecx
+; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: shrl $4, %ecx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: test7:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: shrl $2, %edi
-; X64-NEXT: imulq $613566757, %rdi, %rax # imm = 0x24924925
+; X64-NEXT: movl %edi, %ecx
+; X64-NEXT: movl $2454267025, %eax # imm = 0x92492491
+; X64-NEXT: imulq %rcx, %rax
; X64-NEXT: shrq $32, %rax
+; X64-NEXT: subl %eax, %edi
+; X64-NEXT: shrl %edi
+; X64-NEXT: addl %edi, %eax
+; X64-NEXT: shrl $4, %eax
; X64-NEXT: # kill: def $eax killed $eax killed $rax
; X64-NEXT: retq
%div = udiv i32 %x, 28
@@ -163,19 +170,16 @@ define i8 @test8(i8 %x) nounwind {
; X86-LABEL: test8:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrb %al
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: imull $211, %eax, %eax
-; X86-NEXT: shrl $13, %eax
+; X86-NEXT: imull $209, %eax, %eax
+; X86-NEXT: shrl $14, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test8:
; X64: # %bb.0:
-; X64-NEXT: shrb %dil
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: imull $211, %eax, %eax
-; X64-NEXT: shrl $13, %eax
+; X64-NEXT: imull $209, %eax, %eax
+; X64-NEXT: shrl $14, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%div = udiv i8 %x, 78
@@ -186,19 +190,23 @@ define i8 @test9(i8 %x) nounwind {
; X86-LABEL: test9:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrb $2, %al
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: imull $71, %eax, %eax
-; X86-NEXT: shrl $11, %eax
+; X86-NEXT: imull $35, %eax, %ecx
+; X86-NEXT: subb %ch, %al
+; X86-NEXT: shrb %al
+; X86-NEXT: addb %ch, %al
+; X86-NEXT: shrb $4, %al
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test9:
; X64: # %bb.0:
-; X64-NEXT: shrb $2, %dil
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: imull $71, %eax, %eax
-; X64-NEXT: shrl $11, %eax
+; X64-NEXT: imull $35, %eax, %ecx
+; X64-NEXT: shrl $8, %ecx
+; X64-NEXT: subb %cl, %al
+; X64-NEXT: shrb %al
+; X64-NEXT: addb %cl, %al
+; X64-NEXT: shrb $4, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%div = udiv i8 %x, 116
@@ -311,10 +319,9 @@ define i64 @PR23590(i64 %x) nounwind {
;
; X64-FAST-LABEL: PR23590:
; X64-FAST: # %bb.0: # %entry
-; X64-FAST-NEXT: movabsq $6120523590596543007, %rcx # imm = 0x54F077C718E7C21F
+; X64-FAST-NEXT: movabsq $1494268454735485, %rcx # imm = 0x54F077C718E7D
; X64-FAST-NEXT: movq %rdi, %rax
; X64-FAST-NEXT: mulq %rcx
-; X64-FAST-NEXT: shrq $12, %rdx
; X64-FAST-NEXT: imulq $12345, %rdx, %rax # imm = 0x3039
; X64-FAST-NEXT: subq %rax, %rdi
; X64-FAST-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
@@ -325,10 +332,9 @@ define i64 @PR23590(i64 %x) nounwind {
;
; X64-SLOW-LABEL: PR23590:
; X64-SLOW: # %bb.0: # %entry
-; X64-SLOW-NEXT: movabsq $6120523590596543007, %rcx # imm = 0x54F077C718E7C21F
+; X64-SLOW-NEXT: movabsq $1494268454735485, %rcx # imm = 0x54F077C718E7D
; X64-SLOW-NEXT: movq %rdi, %rax
; X64-SLOW-NEXT: mulq %rcx
-; X64-SLOW-NEXT: shrq $12, %rdx
; X64-SLOW-NEXT: imulq $12345, %rdx, %rax # imm = 0x3039
; X64-SLOW-NEXT: subq %rax, %rdi
; X64-SLOW-NEXT: imulq $613566757, %rdi, %rax # imm = 0x24924925
@@ -376,12 +382,14 @@ define { i64, i32 } @PR38622(i64) nounwind {
;
; X64-LABEL: PR38622:
; X64: # %bb.0:
+; X64-NEXT: movabsq $4951760157141521099, %rcx # imm = 0x44B82FA09B5A52CB
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: shrq $11, %rax
-; X64-NEXT: movabsq $4835703278458517, %rcx # imm = 0x112E0BE826D695
; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $9, %rax
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: subq %rdx, %rax
+; X64-NEXT: shrq %rax
+; X64-NEXT: addq %rdx, %rax
+; X64-NEXT: shrq $30, %rax
; X64-NEXT: imull $-294967296, %eax, %ecx # imm = 0xEE6B2800
; X64-NEXT: subl %ecx, %edi
; X64-NEXT: movl %edi, %edx
@@ -455,10 +463,9 @@ define i64 @urem_i64_3(i64 %x) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
+; X86-NEXT: movl $1431655765, %edx # imm = 0x55555555
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edx
-; X86-NEXT: shrl %edx
; X86-NEXT: leal (%edx,%edx,2), %eax
; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, %eax
@@ -467,10 +474,9 @@ define i64 @urem_i64_3(i64 %x) nounwind {
;
; X64-LABEL: urem_i64_3:
; X64: # %bb.0: # %entry
-; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: shrq %rdx
; X64-NEXT: leaq (%rdx,%rdx,2), %rax
; X64-NEXT: subq %rax, %rdi
; X64-NEXT: movq %rdi, %rax
@@ -486,10 +492,9 @@ define i64 @urem_i64_5(i64 %x) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD
+; X86-NEXT: movl $858993459, %edx # imm = 0x33333333
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edx
-; X86-NEXT: shrl $2, %edx
; X86-NEXT: leal (%edx,%edx,4), %eax
; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, %eax
@@ -498,10 +503,9 @@ define i64 @urem_i64_5(i64 %x) nounwind {
;
; X64-LABEL: urem_i64_5:
; X64: # %bb.0: # %entry
-; X64-NEXT: movabsq $-3689348814741910323, %rcx # imm = 0xCCCCCCCCCCCCCCCD
+; X64-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: shrq $2, %rdx
; X64-NEXT: leaq (%rdx,%rdx,4), %rax
; X64-NEXT: subq %rax, %rdi
; X64-NEXT: movq %rdi, %rax
@@ -517,10 +521,9 @@ define i64 @urem_i64_15(i64 %x) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl $-2004318071, %edx # imm = 0x88888889
+; X86-NEXT: movl $286331153, %edx # imm = 0x11111111
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edx
-; X86-NEXT: shrl $3, %edx
; X86-NEXT: leal (%edx,%edx,4), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: subl %eax, %ecx
@@ -530,10 +533,9 @@ define i64 @urem_i64_15(i64 %x) nounwind {
;
; X64-LABEL: urem_i64_15:
; X64: # %bb.0: # %entry
-; X64-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
+; X64-NEXT: movabsq $1229782938247303441, %rcx # imm = 0x1111111111111111
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: shrq $3, %rdx
; X64-NEXT: leaq (%rdx,%rdx,4), %rax
; X64-NEXT: leaq (%rax,%rax,2), %rax
; X64-NEXT: subq %rax, %rdi
@@ -550,28 +552,26 @@ define i64 @urem_i64_17(i64 %x) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl $-252645135, %edx # imm = 0xF0F0F0F1
+; X86-NEXT: movl $252645135, %edx # imm = 0xF0F0F0F
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edx
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: andl $-16, %eax
-; X86-NEXT: shrl $4, %edx
-; X86-NEXT: addl %eax, %edx
-; X86-NEXT: subl %edx, %ecx
+; X86-NEXT: shll $4, %eax
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: retl
;
; X64-LABEL: urem_i64_17:
; X64: # %bb.0: # %entry
-; X64-NEXT: movabsq $-1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F1
+; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: andq $-16, %rax
-; X64-NEXT: shrq $4, %rdx
-; X64-NEXT: addq %rax, %rdx
-; X64-NEXT: subq %rdx, %rdi
+; X64-NEXT: shlq $4, %rax
+; X64-NEXT: addq %rdx, %rax
+; X64-NEXT: subq %rax, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
entry:
@@ -588,9 +588,8 @@ define i64 @urem_i64_255(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: addl %esi, %eax
; X86-NEXT: adcl $0, %eax
-; X86-NEXT: movl $-2139062143, %edx # imm = 0x80808081
+; X86-NEXT: movl $16843009, %edx # imm = 0x1010101
; X86-NEXT: mull %edx
-; X86-NEXT: shrl $7, %edx
; X86-NEXT: movl %edx, %eax
; X86-NEXT: shll $8, %eax
; X86-NEXT: subl %eax, %edx
@@ -603,10 +602,9 @@ define i64 @urem_i64_255(i64 %x) nounwind {
;
; X64-LABEL: urem_i64_255:
; X64: # %bb.0: # %entry
-; X64-NEXT: movabsq $-9187201950435737471, %rcx # imm = 0x8080808080808081
+; X64-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: shrq $7, %rdx
; X64-NEXT: movq %rdx, %rax
; X64-NEXT: shlq $8, %rax
; X64-NEXT: subq %rax, %rdx
@@ -623,28 +621,26 @@ define i64 @urem_i64_257(i64 %x) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl $-16711935, %edx # imm = 0xFF00FF01
+; X86-NEXT: movl $16711935, %edx # imm = 0xFF00FF
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edx
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: andl $-256, %eax
-; X86-NEXT: shrl $8, %edx
-; X86-NEXT: addl %eax, %edx
-; X86-NEXT: subl %edx, %ecx
+; X86-NEXT: shll $8, %eax
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: retl
;
; X64-LABEL: urem_i64_257:
; X64: # %bb.0: # %entry
-; X64-NEXT: movabsq $-71777214294589695, %rcx # imm = 0xFF00FF00FF00FF01
+; X64-NEXT: movabsq $71777214294589695, %rcx # imm = 0xFF00FF00FF00FF
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: andq $-256, %rax
-; X64-NEXT: shrq $8, %rdx
-; X64-NEXT: addq %rax, %rdx
-; X64-NEXT: subq %rdx, %rdi
+; X64-NEXT: shlq $8, %rax
+; X64-NEXT: addq %rdx, %rax
+; X64-NEXT: subq %rax, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
entry:
@@ -661,9 +657,8 @@ define i64 @urem_i64_65535(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: addl %esi, %eax
; X86-NEXT: adcl $0, %eax
-; X86-NEXT: movl $-2147450879, %edx # imm = 0x80008001
+; X86-NEXT: movl $65537, %edx # imm = 0x10001
; X86-NEXT: mull %edx
-; X86-NEXT: shrl $15, %edx
; X86-NEXT: movl %edx, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: subl %eax, %edx
@@ -676,10 +671,9 @@ define i64 @urem_i64_65535(i64 %x) nounwind {
;
; X64-LABEL: urem_i64_65535:
; X64: # %bb.0: # %entry
-; X64-NEXT: movabsq $-9223231297218904063, %rcx # imm = 0x8000800080008001
+; X64-NEXT: movabsq $281479271743489, %rcx # imm = 0x1000100010001
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: shrq $15, %rdx
; X64-NEXT: movq %rdx, %rax
; X64-NEXT: shlq $16, %rax
; X64-NEXT: subq %rax, %rdx
@@ -696,12 +690,12 @@ define i64 @urem_i64_65537(i64 %x) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl $-65535, %edx # imm = 0xFFFF0001
+; X86-NEXT: movl $65535, %edx # imm = 0xFFFF
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edx
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: shrl $16, %eax
-; X86-NEXT: shldl $16, %edx, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: orl %edx, %eax
; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: xorl %edx, %edx
@@ -709,14 +703,13 @@ define i64 @urem_i64_65537(i64 %x) nounwind {
;
; X64-LABEL: urem_i64_65537:
; X64: # %bb.0: # %entry
-; X64-NEXT: movabsq $-281470681808895, %rcx # imm = 0xFFFF0000FFFF0001
+; X64-NEXT: movabsq $281470681808895, %rcx # imm = 0xFFFF0000FFFF
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000
-; X64-NEXT: shrq $16, %rdx
-; X64-NEXT: addq %rax, %rdx
-; X64-NEXT: subq %rdx, %rdi
+; X64-NEXT: shlq $16, %rax
+; X64-NEXT: addq %rdx, %rax
+; X64-NEXT: subq %rax, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
entry:
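
One more note on the urem_i64_* checks just above: every divisor they cover
(3, 5, 15, 17, 255, 257, 65535, 65537) divides 2^32 - 1, so 2^32 mod d == 1
and a 64-bit dividend can be narrowed to 32 bits before the magic multiply;
that is what the addl/adcl $0 prologue shared by the old and new sequences
does. A small illustrative fold (hypothetical helper, not code from the
patch):

  #include <cstdint>

  // (Hi * 2^32 + Lo) mod d == (Hi + Lo) mod d whenever d divides 2^32 - 1,
  // and the carry of Hi + Lo can be folded back in for the same reason.
  uint32_t foldDividendMod(uint64_t X) {
    uint32_t Lo = uint32_t(X), Hi = uint32_t(X >> 32);
    uint32_t S = Lo + Hi;  // addl
    return S + (S < Lo);   // adcl $0
  }
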
@@ -735,10 +728,9 @@ define i64 @urem_i64_12(i64 %x) nounwind {
; X86-NEXT: shldl $30, %esi, %ecx
; X86-NEXT: addl %eax, %ecx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
+; X86-NEXT: movl $1431655765, %edx # imm = 0x55555555
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edx
-; X86-NEXT: shrl %edx
; X86-NEXT: leal (%edx,%edx,2), %eax
; X86-NEXT: subl %eax, %ecx
; X86-NEXT: andl $3, %esi
@@ -749,10 +741,9 @@ define i64 @urem_i64_12(i64 %x) nounwind {
;
; X64-LABEL: urem_i64_12:
; X64: # %bb.0: # %entry
-; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: shrq %rdx
; X64-NEXT: andq $-4, %rdx
; X64-NEXT: leaq (%rdx,%rdx,2), %rax
; X64-NEXT: subq %rax, %rdi
@@ -766,7 +757,6 @@ entry:
define i64 @udiv_i64_3(i64 %x) nounwind {
; X86-LABEL: udiv_i64_3:
; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -774,32 +764,30 @@ define i64 @udiv_i64_3(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl $-1431655765, %ebx # imm = 0xAAAAAAAB
+; X86-NEXT: movl $1431655765, %edx # imm = 0x55555555
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: shrl %edx
+; X86-NEXT: mull %edx
; X86-NEXT: leal (%edx,%edx,2), %eax
; X86-NEXT: subl %eax, %esi
; X86-NEXT: subl %esi, %ecx
; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %edx
; X86-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: udiv_i64_3:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq %rax
; X64-NEXT: retq
entry:
%rem = udiv i64 %x, 3
@@ -809,7 +797,6 @@ entry:
define i64 @udiv_i64_5(i64 %x) nounwind {
; X86-LABEL: udiv_i64_5:
; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -817,32 +804,30 @@ define i64 @udiv_i64_5(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl $-858993459, %ebx # imm = 0xCCCCCCCD
+; X86-NEXT: movl $858993459, %edx # imm = 0x33333333
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: shrl $2, %edx
+; X86-NEXT: mull %edx
; X86-NEXT: leal (%edx,%edx,4), %eax
; X86-NEXT: subl %eax, %esi
; X86-NEXT: subl %esi, %ecx
; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %edx
; X86-NEXT: imull $-858993460, %ecx, %ecx # imm = 0xCCCCCCCC
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: imull $-858993459, %edi, %ecx # imm = 0xCCCCCCCD
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: udiv_i64_5:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movabsq $-3689348814741910323, %rcx # imm = 0xCCCCCCCCCCCCCCCD
+; X64-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $2, %rax
; X64-NEXT: retq
entry:
%rem = udiv i64 %x, 5
@@ -859,10 +844,9 @@ define i64 @udiv_i64_15(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl $-2004318071, %edx # imm = 0x88888889
+; X86-NEXT: movl $286331153, %edx # imm = 0x11111111
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %edx
-; X86-NEXT: shrl $3, %edx
; X86-NEXT: leal (%edx,%edx,4), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: subl %eax, %esi
@@ -882,10 +866,9 @@ define i64 @udiv_i64_15(i64 %x) nounwind {
; X64-LABEL: udiv_i64_15:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
+; X64-NEXT: movabsq $1229782938247303441, %rcx # imm = 0x1111111111111111
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $3, %rax
; X64-NEXT: retq
entry:
%rem = udiv i64 %x, 15
@@ -895,7 +878,6 @@ entry:
define i64 @udiv_i64_17(i64 %x) nounwind {
; X86-LABEL: udiv_i64_17:
; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -903,34 +885,32 @@ define i64 @udiv_i64_17(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl $-252645135, %ebx # imm = 0xF0F0F0F1
+; X86-NEXT: movl $252645135, %edx # imm = 0xF0F0F0F
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %edx
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: andl $-16, %eax
-; X86-NEXT: shrl $4, %edx
-; X86-NEXT: addl %eax, %edx
-; X86-NEXT: subl %edx, %esi
+; X86-NEXT: shll $4, %eax
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: subl %eax, %esi
; X86-NEXT: subl %esi, %ecx
; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: movl $-252645135, %edx # imm = 0xF0F0F0F1
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %edx
; X86-NEXT: imull $-252645136, %ecx, %ecx # imm = 0xF0F0F0F0
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: imull $-252645135, %edi, %ecx # imm = 0xF0F0F0F1
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: udiv_i64_17:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movabsq $-1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F1
+; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $4, %rax
; X64-NEXT: retq
entry:
%rem = udiv i64 %x, 17
@@ -946,9 +926,8 @@ define i64 @udiv_i64_255(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: addl %esi, %eax
; X86-NEXT: adcl $0, %eax
-; X86-NEXT: movl $-2139062143, %edx # imm = 0x80808081
+; X86-NEXT: movl $16843009, %edx # imm = 0x1010101
; X86-NEXT: mull %edx
-; X86-NEXT: shrl $7, %edx
; X86-NEXT: movl %edx, %eax
; X86-NEXT: shll $8, %eax
; X86-NEXT: subl %eax, %edx
@@ -970,10 +949,9 @@ define i64 @udiv_i64_255(i64 %x) nounwind {
; X64-LABEL: udiv_i64_255:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movabsq $-9187201950435737471, %rcx # imm = 0x8080808080808081
+; X64-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $7, %rax
; X64-NEXT: retq
entry:
%rem = udiv i64 %x, 255
@@ -983,7 +961,6 @@ entry:
define i64 @udiv_i64_257(i64 %x) nounwind {
; X86-LABEL: udiv_i64_257:
; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -991,34 +968,32 @@ define i64 @udiv_i64_257(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl $-16711935, %ebx # imm = 0xFF00FF01
+; X86-NEXT: movl $16711935, %edx # imm = 0xFF00FF
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %edx
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: andl $-256, %eax
-; X86-NEXT: shrl $8, %edx
-; X86-NEXT: addl %eax, %edx
-; X86-NEXT: subl %edx, %esi
+; X86-NEXT: shll $8, %eax
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: subl %eax, %esi
; X86-NEXT: subl %esi, %ecx
; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: movl $-16711935, %edx # imm = 0xFF00FF01
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %edx
; X86-NEXT: imull $-16711936, %ecx, %ecx # imm = 0xFF00FF00
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: imull $-16711935, %edi, %ecx # imm = 0xFF00FF01
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: udiv_i64_257:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movabsq $-71777214294589695, %rcx # imm = 0xFF00FF00FF00FF01
+; X64-NEXT: movabsq $71777214294589695, %rcx # imm = 0xFF00FF00FF00FF
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $8, %rax
; X64-NEXT: retq
entry:
%rem = udiv i64 %x, 257
@@ -1034,9 +1009,8 @@ define i64 @udiv_i64_65535(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: addl %esi, %eax
; X86-NEXT: adcl $0, %eax
-; X86-NEXT: movl $-2147450879, %edx # imm = 0x80008001
+; X86-NEXT: movl $65537, %edx # imm = 0x10001
; X86-NEXT: mull %edx
-; X86-NEXT: shrl $15, %edx
; X86-NEXT: movl %edx, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: subl %eax, %edx
@@ -1060,10 +1034,9 @@ define i64 @udiv_i64_65535(i64 %x) nounwind {
; X64-LABEL: udiv_i64_65535:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movabsq $-9223231297218904063, %rcx # imm = 0x8000800080008001
+; X64-NEXT: movabsq $281479271743489, %rcx # imm = 0x1000100010001
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $15, %rax
; X64-NEXT: retq
entry:
%rem = udiv i64 %x, 65535
@@ -1073,7 +1046,6 @@ entry:
define i64 @udiv_i64_65537(i64 %x) nounwind {
; X86-LABEL: udiv_i64_65537:
; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -1081,17 +1053,18 @@ define i64 @udiv_i64_65537(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl $-65535, %ebx # imm = 0xFFFF0001
+; X86-NEXT: movl $65535, %edx # imm = 0xFFFF
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %edx
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: shrl $16, %eax
-; X86-NEXT: shldl $16, %edx, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: orl %edx, %eax
; X86-NEXT: subl %eax, %esi
; X86-NEXT: subl %esi, %ecx
; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: movl $-65535, %edx # imm = 0xFFFF0001
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %edx
; X86-NEXT: shll $16, %ecx
; X86-NEXT: subl %ecx, %edx
; X86-NEXT: movl %edi, %ecx
@@ -1100,16 +1073,14 @@ define i64 @udiv_i64_65537(i64 %x) nounwind {
; X86-NEXT: addl %edi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: udiv_i64_65537:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movabsq $-281470681808895, %rcx # imm = 0xFFFF0000FFFF0001
+; X64-NEXT: movabsq $281470681808895, %rcx # imm = 0xFFFF0000FFFF
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $16, %rax
; X64-NEXT: retq
entry:
%rem = udiv i64 %x, 65537
@@ -1119,7 +1090,6 @@ entry:
define i64 @udiv_i64_12(i64 %x) nounwind {
; X86-LABEL: udiv_i64_12:
; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -1129,32 +1099,31 @@ define i64 @udiv_i64_12(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl $-1431655765, %ebx # imm = 0xAAAAAAAB
+; X86-NEXT: movl $1431655765, %edx # imm = 0x55555555
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: shrl %edx
+; X86-NEXT: mull %edx
; X86-NEXT: leal (%edx,%edx,2), %eax
; X86-NEXT: subl %eax, %esi
; X86-NEXT: subl %esi, %ecx
; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %edx
; X86-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: udiv_i64_12:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $3, %rax
+; X64-NEXT: shrq $2, %rax
; X64-NEXT: retq
entry:
%rem = udiv i64 %x, 12
@@ -1176,10 +1145,9 @@ define i64 @urem_i64_3_optsize(i64 %x) nounwind optsize {
;
; X64-LABEL: urem_i64_3_optsize:
; X64: # %bb.0: # %entry
-; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: shrq %rdx
; X64-NEXT: leaq (%rdx,%rdx,2), %rax
; X64-NEXT: subq %rax, %rdi
; X64-NEXT: movq %rdi, %rax
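
For reference, a minimal C++ sketch of the round-down reciprocal identity that the smaller
constants in the checks above reflect: with p = 32 + ceil(log2 d) and m = floor(2^p / d),
floor(n / d) == (m * (n + 1)) >> p holds for every 32-bit n and every non-power-of-two d.
This is an illustrative sketch only (it assumes Clang/GCC __builtin_clz and __uint128_t)
and is not the exact instruction sequence these tests check for.

#include <cassert>
#include <cstdint>

static uint32_t udiv_round_down(uint32_t n, uint32_t d) {
  // d must not be a power of two; powers of two are handled as plain shifts.
  unsigned ell = 32 - __builtin_clz(d);                    // ceil(log2 d) for non-powers-of-two
  unsigned p = 32 + ell;
  uint64_t m = (uint64_t)(((__uint128_t)1 << p) / d);      // floor(2^p / d), the round-down magic
  return (uint32_t)(((__uint128_t)m * (n + 1ull)) >> p);   // n + 1 is done in 64 bits, so it cannot wrap
}

int main() {
  for (uint32_t d : {3u, 5u, 7u, 17u, 95u, 1060u})
    for (uint32_t n = 0; n < (1u << 16); ++n)
      assert(udiv_round_down(n, d) == n / d);
}
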
diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
index 3796dd796eaf9..df763d04d6681 100644
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -99,13 +99,13 @@ define i64 @udiv128(i128 %x) nounwind {
; X86-64: # %bb.0:
; X86-64-NEXT: addq %rdi, %rsi
; X86-64-NEXT: adcq $0, %rsi
-; X86-64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X86-64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; X86-64-NEXT: movq %rsi, %rax
; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: shrq %rdx
-; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax
-; X86-64-NEXT: subq %rsi, %rax
-; X86-64-NEXT: addq %rdi, %rax
+; X86-64-NEXT: leaq (%rdx,%rdx,2), %rcx
+; X86-64-NEXT: subq %rsi, %rcx
+; X86-64-NEXT: addq %rdi, %rcx
+; X86-64-NEXT: movabsq $-6148914691236517205, %rax # imm = 0xAAAAAAAAAAAAAAAB
; X86-64-NEXT: imulq %rcx, %rax
; X86-64-NEXT: retq
;
@@ -114,14 +114,14 @@ define i64 @udiv128(i128 %x) nounwind {
; WIN64-NEXT: movq %rdx, %r8
; WIN64-NEXT: addq %rcx, %r8
; WIN64-NEXT: adcq $0, %r8
-; WIN64-NEXT: movabsq $-6148914691236517205, %r9 # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT: movabsq $6148914691236517205, %rdx # imm = 0x5555555555555555
; WIN64-NEXT: movq %r8, %rax
-; WIN64-NEXT: mulq %r9
-; WIN64-NEXT: shrq %rdx
-; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax
-; WIN64-NEXT: subq %r8, %rax
-; WIN64-NEXT: addq %rcx, %rax
-; WIN64-NEXT: imulq %r9, %rax
+; WIN64-NEXT: mulq %rdx
+; WIN64-NEXT: leaq (%rdx,%rdx,2), %rdx
+; WIN64-NEXT: subq %r8, %rdx
+; WIN64-NEXT: addq %rcx, %rdx
+; WIN64-NEXT: movabsq $-6148914691236517205, %rax # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT: imulq %rdx, %rax
; WIN64-NEXT: retq
@@ -135,10 +135,9 @@ define i128 @urem_i128_3(i128 %x) nounwind {
; X86-64: # %bb.0: # %entry
; X86-64-NEXT: addq %rsi, %rdi
; X86-64-NEXT: adcq $0, %rdi
-; X86-64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X86-64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: shrq %rdx
; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax
; X86-64-NEXT: subq %rax, %rdi
; X86-64-NEXT: movq %rdi, %rax
@@ -149,10 +148,9 @@ define i128 @urem_i128_3(i128 %x) nounwind {
; WIN64: # %bb.0: # %entry
; WIN64-NEXT: addq %rdx, %rcx
; WIN64-NEXT: adcq $0, %rcx
-; WIN64-NEXT: movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT: movabsq $6148914691236517205, %rdx # imm = 0x5555555555555555
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: shrq %rdx
; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax
; WIN64-NEXT: subq %rax, %rcx
; WIN64-NEXT: movq %rcx, %rax
@@ -168,10 +166,9 @@ define i128 @urem_i128_5(i128 %x) nounwind {
; X86-64: # %bb.0: # %entry
; X86-64-NEXT: addq %rsi, %rdi
; X86-64-NEXT: adcq $0, %rdi
-; X86-64-NEXT: movabsq $-3689348814741910323, %rcx # imm = 0xCCCCCCCCCCCCCCCD
+; X86-64-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: shrq $2, %rdx
; X86-64-NEXT: leaq (%rdx,%rdx,4), %rax
; X86-64-NEXT: subq %rax, %rdi
; X86-64-NEXT: movq %rdi, %rax
@@ -182,10 +179,9 @@ define i128 @urem_i128_5(i128 %x) nounwind {
; WIN64: # %bb.0: # %entry
; WIN64-NEXT: addq %rdx, %rcx
; WIN64-NEXT: adcq $0, %rcx
-; WIN64-NEXT: movabsq $-3689348814741910323, %rdx # imm = 0xCCCCCCCCCCCCCCCD
+; WIN64-NEXT: movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: shrq $2, %rdx
; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax
; WIN64-NEXT: subq %rax, %rcx
; WIN64-NEXT: movq %rcx, %rax
@@ -201,10 +197,9 @@ define i128 @urem_i128_15(i128 %x) nounwind {
; X86-64: # %bb.0: # %entry
; X86-64-NEXT: addq %rsi, %rdi
; X86-64-NEXT: adcq $0, %rdi
-; X86-64-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
+; X86-64-NEXT: movabsq $1229782938247303441, %rcx # imm = 0x1111111111111111
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: shrq $3, %rdx
; X86-64-NEXT: leaq (%rdx,%rdx,4), %rax
; X86-64-NEXT: leaq (%rax,%rax,2), %rax
; X86-64-NEXT: subq %rax, %rdi
@@ -216,10 +211,9 @@ define i128 @urem_i128_15(i128 %x) nounwind {
; WIN64: # %bb.0: # %entry
; WIN64-NEXT: addq %rdx, %rcx
; WIN64-NEXT: adcq $0, %rcx
-; WIN64-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
+; WIN64-NEXT: movabsq $1229782938247303441, %rdx # imm = 0x1111111111111111
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: shrq $3, %rdx
; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax
; WIN64-NEXT: leaq (%rax,%rax,2), %rax
; WIN64-NEXT: subq %rax, %rcx
@@ -236,14 +230,13 @@ define i128 @urem_i128_17(i128 %x) nounwind {
; X86-64: # %bb.0: # %entry
; X86-64-NEXT: addq %rsi, %rdi
; X86-64-NEXT: adcq $0, %rdi
-; X86-64-NEXT: movabsq $-1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F1
+; X86-64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %rcx
; X86-64-NEXT: movq %rdx, %rax
-; X86-64-NEXT: andq $-16, %rax
-; X86-64-NEXT: shrq $4, %rdx
-; X86-64-NEXT: addq %rax, %rdx
-; X86-64-NEXT: subq %rdx, %rdi
+; X86-64-NEXT: shlq $4, %rax
+; X86-64-NEXT: addq %rdx, %rax
+; X86-64-NEXT: subq %rax, %rdi
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: xorl %edx, %edx
; X86-64-NEXT: retq
@@ -252,14 +245,13 @@ define i128 @urem_i128_17(i128 %x) nounwind {
; WIN64: # %bb.0: # %entry
; WIN64-NEXT: addq %rdx, %rcx
; WIN64-NEXT: adcq $0, %rcx
-; WIN64-NEXT: movabsq $-1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F1
+; WIN64-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %rdx
; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: andq $-16, %rax
-; WIN64-NEXT: shrq $4, %rdx
-; WIN64-NEXT: addq %rax, %rdx
-; WIN64-NEXT: subq %rdx, %rcx
+; WIN64-NEXT: shlq $4, %rax
+; WIN64-NEXT: addq %rdx, %rax
+; WIN64-NEXT: subq %rax, %rcx
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: xorl %edx, %edx
; WIN64-NEXT: retq
@@ -274,9 +266,8 @@ define i128 @urem_i128_255(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: addq %rsi, %rax
; X86-64-NEXT: adcq $0, %rax
-; X86-64-NEXT: movabsq $-9187201950435737471, %rcx # imm = 0x8080808080808081
+; X86-64-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: shrq $7, %rdx
; X86-64-NEXT: movq %rdx, %rax
; X86-64-NEXT: shlq $8, %rax
; X86-64-NEXT: subq %rax, %rdx
@@ -292,9 +283,8 @@ define i128 @urem_i128_255(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: addq %rdx, %rax
; WIN64-NEXT: adcq $0, %rax
-; WIN64-NEXT: movabsq $-9187201950435737471, %rdx # imm = 0x8080808080808081
+; WIN64-NEXT: movabsq $72340172838076673, %rdx # imm = 0x101010101010101
; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: shrq $7, %rdx
; WIN64-NEXT: movq %rdx, %rax
; WIN64-NEXT: shlq $8, %rax
; WIN64-NEXT: subq %rax, %rdx
@@ -313,14 +303,13 @@ define i128 @urem_i128_257(i128 %x) nounwind {
; X86-64: # %bb.0: # %entry
; X86-64-NEXT: addq %rsi, %rdi
; X86-64-NEXT: adcq $0, %rdi
-; X86-64-NEXT: movabsq $-71777214294589695, %rcx # imm = 0xFF00FF00FF00FF01
+; X86-64-NEXT: movabsq $71777214294589695, %rcx # imm = 0xFF00FF00FF00FF
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %rcx
; X86-64-NEXT: movq %rdx, %rax
-; X86-64-NEXT: andq $-256, %rax
-; X86-64-NEXT: shrq $8, %rdx
-; X86-64-NEXT: addq %rax, %rdx
-; X86-64-NEXT: subq %rdx, %rdi
+; X86-64-NEXT: shlq $8, %rax
+; X86-64-NEXT: addq %rdx, %rax
+; X86-64-NEXT: subq %rax, %rdi
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: xorl %edx, %edx
; X86-64-NEXT: retq
@@ -329,14 +318,13 @@ define i128 @urem_i128_257(i128 %x) nounwind {
; WIN64: # %bb.0: # %entry
; WIN64-NEXT: addq %rdx, %rcx
; WIN64-NEXT: adcq $0, %rcx
-; WIN64-NEXT: movabsq $-71777214294589695, %rdx # imm = 0xFF00FF00FF00FF01
+; WIN64-NEXT: movabsq $71777214294589695, %rdx # imm = 0xFF00FF00FF00FF
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %rdx
; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: andq $-256, %rax
-; WIN64-NEXT: shrq $8, %rdx
-; WIN64-NEXT: addq %rax, %rdx
-; WIN64-NEXT: subq %rdx, %rcx
+; WIN64-NEXT: shlq $8, %rax
+; WIN64-NEXT: addq %rdx, %rax
+; WIN64-NEXT: subq %rax, %rcx
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: xorl %edx, %edx
; WIN64-NEXT: retq
@@ -351,9 +339,8 @@ define i128 @urem_i128_65535(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: addq %rsi, %rax
; X86-64-NEXT: adcq $0, %rax
-; X86-64-NEXT: movabsq $-9223231297218904063, %rcx # imm = 0x8000800080008001
+; X86-64-NEXT: movabsq $281479271743489, %rcx # imm = 0x1000100010001
; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: shrq $15, %rdx
; X86-64-NEXT: movq %rdx, %rax
; X86-64-NEXT: shlq $16, %rax
; X86-64-NEXT: subq %rax, %rdx
@@ -369,9 +356,8 @@ define i128 @urem_i128_65535(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: addq %rdx, %rax
; WIN64-NEXT: adcq $0, %rax
-; WIN64-NEXT: movabsq $-9223231297218904063, %rdx # imm = 0x8000800080008001
+; WIN64-NEXT: movabsq $281479271743489, %rdx # imm = 0x1000100010001
; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: shrq $15, %rdx
; WIN64-NEXT: movq %rdx, %rax
; WIN64-NEXT: shlq $16, %rax
; WIN64-NEXT: subq %rax, %rdx
@@ -390,14 +376,13 @@ define i128 @urem_i128_65537(i128 %x) nounwind {
; X86-64: # %bb.0: # %entry
; X86-64-NEXT: addq %rsi, %rdi
; X86-64-NEXT: adcq $0, %rdi
-; X86-64-NEXT: movabsq $-281470681808895, %rcx # imm = 0xFFFF0000FFFF0001
+; X86-64-NEXT: movabsq $281470681808895, %rcx # imm = 0xFFFF0000FFFF
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %rcx
; X86-64-NEXT: movq %rdx, %rax
-; X86-64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000
-; X86-64-NEXT: shrq $16, %rdx
-; X86-64-NEXT: addq %rax, %rdx
-; X86-64-NEXT: subq %rdx, %rdi
+; X86-64-NEXT: shlq $16, %rax
+; X86-64-NEXT: addq %rdx, %rax
+; X86-64-NEXT: subq %rax, %rdi
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: xorl %edx, %edx
; X86-64-NEXT: retq
@@ -406,14 +391,13 @@ define i128 @urem_i128_65537(i128 %x) nounwind {
; WIN64: # %bb.0: # %entry
; WIN64-NEXT: addq %rdx, %rcx
; WIN64-NEXT: adcq $0, %rcx
-; WIN64-NEXT: movabsq $-281470681808895, %rdx # imm = 0xFFFF0000FFFF0001
+; WIN64-NEXT: movabsq $281470681808895, %rdx # imm = 0xFFFF0000FFFF
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %rdx
; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000
-; WIN64-NEXT: shrq $16, %rdx
-; WIN64-NEXT: addq %rax, %rdx
-; WIN64-NEXT: subq %rdx, %rcx
+; WIN64-NEXT: shlq $16, %rax
+; WIN64-NEXT: addq %rdx, %rax
+; WIN64-NEXT: subq %rax, %rcx
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: xorl %edx, %edx
; WIN64-NEXT: retq
@@ -430,10 +414,9 @@ define i128 @urem_i128_12(i128 %x) nounwind {
; X86-64-NEXT: shrq $2, %rsi
; X86-64-NEXT: addq %rsi, %rcx
; X86-64-NEXT: adcq $0, %rcx
-; X86-64-NEXT: movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB
+; X86-64-NEXT: movabsq $6148914691236517205, %rdx # imm = 0x5555555555555555
; X86-64-NEXT: movq %rcx, %rax
; X86-64-NEXT: mulq %rdx
-; X86-64-NEXT: shrq %rdx
; X86-64-NEXT: leal (%rdx,%rdx,2), %eax
; X86-64-NEXT: subl %eax, %ecx
; X86-64-NEXT: andl $3, %edi
@@ -448,10 +431,9 @@ define i128 @urem_i128_12(i128 %x) nounwind {
; WIN64-NEXT: shrq $2, %rdx
; WIN64-NEXT: addq %rdx, %r8
; WIN64-NEXT: adcq $0, %r8
-; WIN64-NEXT: movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT: movabsq $6148914691236517205, %rdx # imm = 0x5555555555555555
; WIN64-NEXT: movq %r8, %rax
; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: shrq %rdx
; WIN64-NEXT: leal (%rdx,%rdx,2), %eax
; WIN64-NEXT: subl %eax, %r8d
; WIN64-NEXT: andl $3, %ecx
@@ -469,16 +451,16 @@ define i128 @udiv_i128_3(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rcx
; X86-64-NEXT: addq %rsi, %rcx
; X86-64-NEXT: adcq $0, %rcx
-; X86-64-NEXT: movabsq $-6148914691236517205, %r8 # imm = 0xAAAAAAAAAAAAAAAB
+; X86-64-NEXT: movabsq $6148914691236517205, %rdx # imm = 0x5555555555555555
; X86-64-NEXT: movq %rcx, %rax
-; X86-64-NEXT: mulq %r8
-; X86-64-NEXT: shrq %rdx
+; X86-64-NEXT: mulq %rdx
; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax
; X86-64-NEXT: subq %rax, %rcx
; X86-64-NEXT: subq %rcx, %rdi
; X86-64-NEXT: sbbq $0, %rsi
; X86-64-NEXT: movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA
; X86-64-NEXT: imulq %rdi, %rcx
+; X86-64-NEXT: movabsq $-6148914691236517205, %r8 # imm = 0xAAAAAAAAAAAAAAAB
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %r8
; X86-64-NEXT: addq %rcx, %rdx
@@ -492,16 +474,16 @@ define i128 @udiv_i128_3(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %r9
; WIN64-NEXT: addq %rdx, %r9
; WIN64-NEXT: adcq $0, %r9
-; WIN64-NEXT: movabsq $-6148914691236517205, %r10 # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT: movabsq $6148914691236517205, %rdx # imm = 0x5555555555555555
; WIN64-NEXT: movq %r9, %rax
-; WIN64-NEXT: mulq %r10
-; WIN64-NEXT: shrq %rdx
+; WIN64-NEXT: mulq %rdx
; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax
; WIN64-NEXT: subq %rax, %r9
; WIN64-NEXT: subq %r9, %rcx
; WIN64-NEXT: sbbq $0, %r8
; WIN64-NEXT: movabsq $-6148914691236517206, %r9 # imm = 0xAAAAAAAAAAAAAAAA
; WIN64-NEXT: imulq %rcx, %r9
+; WIN64-NEXT: movabsq $-6148914691236517205, %r10 # imm = 0xAAAAAAAAAAAAAAAB
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
; WIN64-NEXT: addq %r9, %rdx
@@ -519,16 +501,16 @@ define i128 @udiv_i128_5(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rcx
; X86-64-NEXT: addq %rsi, %rcx
; X86-64-NEXT: adcq $0, %rcx
-; X86-64-NEXT: movabsq $-3689348814741910323, %r8 # imm = 0xCCCCCCCCCCCCCCCD
+; X86-64-NEXT: movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
; X86-64-NEXT: movq %rcx, %rax
-; X86-64-NEXT: mulq %r8
-; X86-64-NEXT: shrq $2, %rdx
+; X86-64-NEXT: mulq %rdx
; X86-64-NEXT: leaq (%rdx,%rdx,4), %rax
; X86-64-NEXT: subq %rax, %rcx
; X86-64-NEXT: subq %rcx, %rdi
; X86-64-NEXT: sbbq $0, %rsi
; X86-64-NEXT: movabsq $-3689348814741910324, %rcx # imm = 0xCCCCCCCCCCCCCCCC
; X86-64-NEXT: imulq %rdi, %rcx
+; X86-64-NEXT: movabsq $-3689348814741910323, %r8 # imm = 0xCCCCCCCCCCCCCCCD
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %r8
; X86-64-NEXT: addq %rcx, %rdx
@@ -542,16 +524,16 @@ define i128 @udiv_i128_5(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %r9
; WIN64-NEXT: addq %rdx, %r9
; WIN64-NEXT: adcq $0, %r9
-; WIN64-NEXT: movabsq $-3689348814741910323, %r10 # imm = 0xCCCCCCCCCCCCCCCD
+; WIN64-NEXT: movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
; WIN64-NEXT: movq %r9, %rax
-; WIN64-NEXT: mulq %r10
-; WIN64-NEXT: shrq $2, %rdx
+; WIN64-NEXT: mulq %rdx
; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax
; WIN64-NEXT: subq %rax, %r9
; WIN64-NEXT: subq %r9, %rcx
; WIN64-NEXT: sbbq $0, %r8
; WIN64-NEXT: movabsq $-3689348814741910324, %r9 # imm = 0xCCCCCCCCCCCCCCCC
; WIN64-NEXT: imulq %rcx, %r9
+; WIN64-NEXT: movabsq $-3689348814741910323, %r10 # imm = 0xCCCCCCCCCCCCCCCD
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
; WIN64-NEXT: addq %r9, %rdx
@@ -569,10 +551,9 @@ define i128 @udiv_i128_15(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rcx
; X86-64-NEXT: addq %rsi, %rcx
; X86-64-NEXT: adcq $0, %rcx
-; X86-64-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
+; X86-64-NEXT: movabsq $1229782938247303441, %rdx # imm = 0x1111111111111111
; X86-64-NEXT: movq %rcx, %rax
; X86-64-NEXT: mulq %rdx
-; X86-64-NEXT: shrq $3, %rdx
; X86-64-NEXT: leaq (%rdx,%rdx,4), %rax
; X86-64-NEXT: leaq (%rax,%rax,2), %rax
; X86-64-NEXT: subq %rax, %rcx
@@ -594,10 +575,9 @@ define i128 @udiv_i128_15(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %r9
; WIN64-NEXT: addq %rdx, %r9
; WIN64-NEXT: adcq $0, %r9
-; WIN64-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
+; WIN64-NEXT: movabsq $1229782938247303441, %rdx # imm = 0x1111111111111111
; WIN64-NEXT: movq %r9, %rax
; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: shrq $3, %rdx
; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax
; WIN64-NEXT: leaq (%rax,%rax,2), %rax
; WIN64-NEXT: subq %rax, %r9
@@ -623,18 +603,18 @@ define i128 @udiv_i128_17(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rcx
; X86-64-NEXT: addq %rsi, %rcx
; X86-64-NEXT: adcq $0, %rcx
-; X86-64-NEXT: movabsq $-1085102592571150095, %r8 # imm = 0xF0F0F0F0F0F0F0F1
+; X86-64-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
; X86-64-NEXT: movq %rcx, %rax
-; X86-64-NEXT: mulq %r8
+; X86-64-NEXT: mulq %rdx
; X86-64-NEXT: movq %rdx, %rax
-; X86-64-NEXT: andq $-16, %rax
-; X86-64-NEXT: shrq $4, %rdx
-; X86-64-NEXT: addq %rax, %rdx
-; X86-64-NEXT: subq %rdx, %rcx
+; X86-64-NEXT: shlq $4, %rax
+; X86-64-NEXT: addq %rdx, %rax
+; X86-64-NEXT: subq %rax, %rcx
; X86-64-NEXT: subq %rcx, %rdi
; X86-64-NEXT: sbbq $0, %rsi
; X86-64-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; X86-64-NEXT: imulq %rdi, %rcx
+; X86-64-NEXT: movabsq $-1085102592571150095, %r8 # imm = 0xF0F0F0F0F0F0F0F1
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %r8
; X86-64-NEXT: addq %rcx, %rdx
@@ -648,18 +628,18 @@ define i128 @udiv_i128_17(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %r9
; WIN64-NEXT: addq %rdx, %r9
; WIN64-NEXT: adcq $0, %r9
-; WIN64-NEXT: movabsq $-1085102592571150095, %r10 # imm = 0xF0F0F0F0F0F0F0F1
+; WIN64-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
; WIN64-NEXT: movq %r9, %rax
-; WIN64-NEXT: mulq %r10
+; WIN64-NEXT: mulq %rdx
; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: andq $-16, %rax
-; WIN64-NEXT: shrq $4, %rdx
-; WIN64-NEXT: addq %rax, %rdx
-; WIN64-NEXT: subq %rdx, %r9
+; WIN64-NEXT: shlq $4, %rax
+; WIN64-NEXT: addq %rdx, %rax
+; WIN64-NEXT: subq %rax, %r9
; WIN64-NEXT: subq %r9, %rcx
; WIN64-NEXT: sbbq $0, %r8
; WIN64-NEXT: movabsq $-1085102592571150096, %r9 # imm = 0xF0F0F0F0F0F0F0F0
; WIN64-NEXT: imulq %rcx, %r9
+; WIN64-NEXT: movabsq $-1085102592571150095, %r10 # imm = 0xF0F0F0F0F0F0F0F1
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
; WIN64-NEXT: addq %r9, %rdx
@@ -677,9 +657,8 @@ define i128 @udiv_i128_255(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: addq %rsi, %rax
; X86-64-NEXT: adcq $0, %rax
-; X86-64-NEXT: movabsq $-9187201950435737471, %rcx # imm = 0x8080808080808081
+; X86-64-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: shrq $7, %rdx
; X86-64-NEXT: movq %rdx, %rax
; X86-64-NEXT: shlq $8, %rax
; X86-64-NEXT: subq %rax, %rdx
@@ -704,9 +683,8 @@ define i128 @udiv_i128_255(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: addq %rdx, %rax
; WIN64-NEXT: adcq $0, %rax
-; WIN64-NEXT: movabsq $-9187201950435737471, %rdx # imm = 0x8080808080808081
+; WIN64-NEXT: movabsq $72340172838076673, %rdx # imm = 0x101010101010101
; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: shrq $7, %rdx
; WIN64-NEXT: movq %rdx, %rax
; WIN64-NEXT: shlq $8, %rax
; WIN64-NEXT: subq %rax, %rdx
@@ -735,18 +713,18 @@ define i128 @udiv_i128_257(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rcx
; X86-64-NEXT: addq %rsi, %rcx
; X86-64-NEXT: adcq $0, %rcx
-; X86-64-NEXT: movabsq $-71777214294589695, %r8 # imm = 0xFF00FF00FF00FF01
+; X86-64-NEXT: movabsq $71777214294589695, %rdx # imm = 0xFF00FF00FF00FF
; X86-64-NEXT: movq %rcx, %rax
-; X86-64-NEXT: mulq %r8
+; X86-64-NEXT: mulq %rdx
; X86-64-NEXT: movq %rdx, %rax
-; X86-64-NEXT: andq $-256, %rax
-; X86-64-NEXT: shrq $8, %rdx
-; X86-64-NEXT: addq %rax, %rdx
-; X86-64-NEXT: subq %rdx, %rcx
+; X86-64-NEXT: shlq $8, %rax
+; X86-64-NEXT: addq %rdx, %rax
+; X86-64-NEXT: subq %rax, %rcx
; X86-64-NEXT: subq %rcx, %rdi
; X86-64-NEXT: sbbq $0, %rsi
; X86-64-NEXT: movabsq $-71777214294589696, %rcx # imm = 0xFF00FF00FF00FF00
; X86-64-NEXT: imulq %rdi, %rcx
+; X86-64-NEXT: movabsq $-71777214294589695, %r8 # imm = 0xFF00FF00FF00FF01
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %r8
; X86-64-NEXT: addq %rcx, %rdx
@@ -760,18 +738,18 @@ define i128 @udiv_i128_257(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %r9
; WIN64-NEXT: addq %rdx, %r9
; WIN64-NEXT: adcq $0, %r9
-; WIN64-NEXT: movabsq $-71777214294589695, %r10 # imm = 0xFF00FF00FF00FF01
+; WIN64-NEXT: movabsq $71777214294589695, %rdx # imm = 0xFF00FF00FF00FF
; WIN64-NEXT: movq %r9, %rax
-; WIN64-NEXT: mulq %r10
+; WIN64-NEXT: mulq %rdx
; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: andq $-256, %rax
-; WIN64-NEXT: shrq $8, %rdx
-; WIN64-NEXT: addq %rax, %rdx
-; WIN64-NEXT: subq %rdx, %r9
+; WIN64-NEXT: shlq $8, %rax
+; WIN64-NEXT: addq %rdx, %rax
+; WIN64-NEXT: subq %rax, %r9
; WIN64-NEXT: subq %r9, %rcx
; WIN64-NEXT: sbbq $0, %r8
; WIN64-NEXT: movabsq $-71777214294589696, %r9 # imm = 0xFF00FF00FF00FF00
; WIN64-NEXT: imulq %rcx, %r9
+; WIN64-NEXT: movabsq $-71777214294589695, %r10 # imm = 0xFF00FF00FF00FF01
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
; WIN64-NEXT: addq %r9, %rdx
@@ -789,9 +767,8 @@ define i128 @udiv_i128_65535(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: addq %rsi, %rax
; X86-64-NEXT: adcq $0, %rax
-; X86-64-NEXT: movabsq $-9223231297218904063, %rcx # imm = 0x8000800080008001
+; X86-64-NEXT: movabsq $281479271743489, %rcx # imm = 0x1000100010001
; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: shrq $15, %rdx
; X86-64-NEXT: movq %rdx, %rax
; X86-64-NEXT: shlq $16, %rax
; X86-64-NEXT: subq %rax, %rdx
@@ -816,9 +793,8 @@ define i128 @udiv_i128_65535(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: addq %rdx, %rax
; WIN64-NEXT: adcq $0, %rax
-; WIN64-NEXT: movabsq $-9223231297218904063, %rdx # imm = 0x8000800080008001
+; WIN64-NEXT: movabsq $281479271743489, %rdx # imm = 0x1000100010001
; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: shrq $15, %rdx
; WIN64-NEXT: movq %rdx, %rax
; WIN64-NEXT: shlq $16, %rax
; WIN64-NEXT: subq %rax, %rdx
@@ -847,18 +823,18 @@ define i128 @udiv_i128_65537(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rcx
; X86-64-NEXT: addq %rsi, %rcx
; X86-64-NEXT: adcq $0, %rcx
-; X86-64-NEXT: movabsq $-281470681808895, %r8 # imm = 0xFFFF0000FFFF0001
+; X86-64-NEXT: movabsq $281470681808895, %rdx # imm = 0xFFFF0000FFFF
; X86-64-NEXT: movq %rcx, %rax
-; X86-64-NEXT: mulq %r8
+; X86-64-NEXT: mulq %rdx
; X86-64-NEXT: movq %rdx, %rax
-; X86-64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000
-; X86-64-NEXT: shrq $16, %rdx
-; X86-64-NEXT: addq %rax, %rdx
-; X86-64-NEXT: subq %rdx, %rcx
+; X86-64-NEXT: shlq $16, %rax
+; X86-64-NEXT: addq %rdx, %rax
+; X86-64-NEXT: subq %rax, %rcx
; X86-64-NEXT: subq %rcx, %rdi
; X86-64-NEXT: sbbq $0, %rsi
; X86-64-NEXT: movabsq $-281470681808896, %rcx # imm = 0xFFFF0000FFFF0000
; X86-64-NEXT: imulq %rdi, %rcx
+; X86-64-NEXT: movabsq $-281470681808895, %r8 # imm = 0xFFFF0000FFFF0001
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %r8
; X86-64-NEXT: addq %rcx, %rdx
@@ -872,18 +848,18 @@ define i128 @udiv_i128_65537(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %r9
; WIN64-NEXT: addq %rdx, %r9
; WIN64-NEXT: adcq $0, %r9
-; WIN64-NEXT: movabsq $-281470681808895, %r10 # imm = 0xFFFF0000FFFF0001
+; WIN64-NEXT: movabsq $281470681808895, %rdx # imm = 0xFFFF0000FFFF
; WIN64-NEXT: movq %r9, %rax
-; WIN64-NEXT: mulq %r10
+; WIN64-NEXT: mulq %rdx
; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000
-; WIN64-NEXT: shrq $16, %rdx
-; WIN64-NEXT: addq %rax, %rdx
-; WIN64-NEXT: subq %rdx, %r9
+; WIN64-NEXT: shlq $16, %rax
+; WIN64-NEXT: addq %rdx, %rax
+; WIN64-NEXT: subq %rax, %r9
; WIN64-NEXT: subq %r9, %rcx
; WIN64-NEXT: sbbq $0, %r8
; WIN64-NEXT: movabsq $-281470681808896, %r9 # imm = 0xFFFF0000FFFF0000
; WIN64-NEXT: imulq %rcx, %r9
+; WIN64-NEXT: movabsq $-281470681808895, %r10 # imm = 0xFFFF0000FFFF0001
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
; WIN64-NEXT: addq %r9, %rdx
@@ -903,16 +879,16 @@ define i128 @udiv_i128_12(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rcx
; X86-64-NEXT: addq %rsi, %rcx
; X86-64-NEXT: adcq $0, %rcx
-; X86-64-NEXT: movabsq $-6148914691236517205, %r8 # imm = 0xAAAAAAAAAAAAAAAB
+; X86-64-NEXT: movabsq $6148914691236517205, %rdx # imm = 0x5555555555555555
; X86-64-NEXT: movq %rcx, %rax
-; X86-64-NEXT: mulq %r8
-; X86-64-NEXT: shrq %rdx
+; X86-64-NEXT: mulq %rdx
; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax
; X86-64-NEXT: subq %rax, %rcx
; X86-64-NEXT: subq %rcx, %rdi
; X86-64-NEXT: sbbq $0, %rsi
; X86-64-NEXT: movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA
; X86-64-NEXT: imulq %rdi, %rcx
+; X86-64-NEXT: movabsq $-6148914691236517205, %r8 # imm = 0xAAAAAAAAAAAAAAAB
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %r8
; X86-64-NEXT: addq %rcx, %rdx
@@ -928,16 +904,16 @@ define i128 @udiv_i128_12(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %r9
; WIN64-NEXT: addq %r8, %r9
; WIN64-NEXT: adcq $0, %r9
-; WIN64-NEXT: movabsq $-6148914691236517205, %r10 # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT: movabsq $6148914691236517205, %rdx # imm = 0x5555555555555555
; WIN64-NEXT: movq %r9, %rax
-; WIN64-NEXT: mulq %r10
-; WIN64-NEXT: shrq %rdx
+; WIN64-NEXT: mulq %rdx
; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax
; WIN64-NEXT: subq %rax, %r9
; WIN64-NEXT: subq %r9, %rcx
; WIN64-NEXT: sbbq $0, %r8
; WIN64-NEXT: movabsq $-6148914691236517206, %r9 # imm = 0xAAAAAAAAAAAAAAAA
; WIN64-NEXT: imulq %rcx, %r9
+; WIN64-NEXT: movabsq $-6148914691236517205, %r10 # imm = 0xAAAAAAAAAAAAAAAB
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
; WIN64-NEXT: addq %r9, %rdx
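
The addq/adcq pair at the top of each i128 remainder above is a half-folding step: divisors
such as 3, 5, 15, 17, 255, 257, 65535 and 65537 all divide 2^64 - 1, so 2^64 == 1 (mod d) and
hi*2^64 + lo == hi + lo (mod d); a carry out of the 64-bit add again stands for 2^64 and is
folded back in as +1 (the /12 case first shifts out the power-of-two factor). A hedged C++
sketch of that reduction, using the compiler's built-in % where the tests expect a reciprocal
multiply:

#include <cassert>
#include <cstdint>

static uint64_t urem128_fold(unsigned __int128 x, uint64_t d) {
  // Valid only when d divides 2^64 - 1, so that 2^64 == 1 (mod d).
  uint64_t lo = (uint64_t)x;
  uint64_t hi = (uint64_t)(x >> 64);
  uint64_t sum = lo + hi;
  uint64_t carry = sum < lo;      // the wrapped 2^64 counts as +1 modulo d
  return (sum % d + carry) % d;   // the generated code replaces this % with a multiply
}

int main() {
  unsigned __int128 x = ((unsigned __int128)0x0123456789abcdefull << 64) | 0xfedcba9876543210ull;
  for (uint64_t d : {3ull, 5ull, 15ull, 17ull, 255ull, 257ull, 65535ull, 65537ull})
    assert(urem128_fold(x, d) == (uint64_t)(x % d));
}
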
diff --git a/llvm/test/CodeGen/X86/divrem-by-select.ll b/llvm/test/CodeGen/X86/divrem-by-select.ll
index f9582bb7343ba..d9cb6a506e90d 100644
--- a/llvm/test/CodeGen/X86/divrem-by-select.ll
+++ b/llvm/test/CodeGen/X86/divrem-by-select.ll
@@ -28,14 +28,13 @@ define <2 x i64> @udiv_identity_const(<2 x i1> %c, <2 x i64> %x) {
; CHECK-X64-V4-NEXT: vpsllq $63, %xmm0, %xmm0
; CHECK-X64-V4-NEXT: vpmovq2m %xmm0, %k1
; CHECK-X64-V4-NEXT: vpextrq $1, %xmm1, %rdx
-; CHECK-X64-V4-NEXT: movabsq $3353953467947191203, %rax # imm = 0x2E8BA2E8BA2E8BA3
+; CHECK-X64-V4-NEXT: movabsq $1676976733973595602, %rax # imm = 0x1745D1745D1745D2
; CHECK-X64-V4-NEXT: mulxq %rax, %rcx, %rcx
; CHECK-X64-V4-NEXT: vmovq %rcx, %xmm0
; CHECK-X64-V4-NEXT: vmovq %xmm1, %rdx
; CHECK-X64-V4-NEXT: mulxq %rax, %rax, %rax
; CHECK-X64-V4-NEXT: vmovq %rax, %xmm2
-; CHECK-X64-V4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; CHECK-X64-V4-NEXT: vpsrlq $1, %xmm0, %xmm1 {%k1}
+; CHECK-X64-V4-NEXT: vpunpcklqdq {{.*#+}} xmm1 {%k1} = xmm2[0],xmm0[0]
; CHECK-X64-V4-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-X64-V4-NEXT: retq
%d = select <2 x i1> %c, <2 x i64> <i64 11, i64 11>, <2 x i64> <i64 1, i64 1>
@@ -70,14 +69,14 @@ define <2 x i64> @udiv_identity_const_todo_getter_nonzero(<2 x i1> %c, <2 x i64>
; CHECK-X64-V4-NEXT: vpsllq $63, %xmm0, %xmm0
; CHECK-X64-V4-NEXT: vpmovq2m %xmm0, %k1
; CHECK-X64-V4-NEXT: vpextrq $1, %xmm1, %rdx
-; CHECK-X64-V4-NEXT: movabsq $-3689348814741910323, %rax # imm = 0xCCCCCCCCCCCCCCCD
+; CHECK-X64-V4-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; CHECK-X64-V4-NEXT: mulxq %rax, %rcx, %rcx
; CHECK-X64-V4-NEXT: vmovq %rcx, %xmm0
; CHECK-X64-V4-NEXT: vmovq %xmm1, %rdx
; CHECK-X64-V4-NEXT: mulxq %rax, %rax, %rax
; CHECK-X64-V4-NEXT: vmovq %rax, %xmm2
; CHECK-X64-V4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; CHECK-X64-V4-NEXT: vpsrlq $3, %xmm0, %xmm1 {%k1}
+; CHECK-X64-V4-NEXT: vpsrlq $1, %xmm0, %xmm1 {%k1}
; CHECK-X64-V4-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-X64-V4-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/freeze.ll b/llvm/test/CodeGen/X86/freeze.ll
index 3196f8177cc9b..3d2b1360121f1 100644
--- a/llvm/test/CodeGen/X86/freeze.ll
+++ b/llvm/test/CodeGen/X86/freeze.ll
@@ -127,11 +127,10 @@ define i32 @freeze_zext(i64 %a) nounwind {
; X86ASM: # %bb.0: # %entry
; X86ASM-NEXT: movq %rdi, %rax
; X86ASM-NEXT: movl %eax, %ecx
-; X86ASM-NEXT: movl $3435973837, %edx # imm = 0xCCCCCCCD
-; X86ASM-NEXT: imulq %rcx, %rdx
-; X86ASM-NEXT: shrq $35, %rdx
-; X86ASM-NEXT: addl %edx, %edx
-; X86ASM-NEXT: leal (%rdx,%rdx,4), %ecx
+; X86ASM-NEXT: imulq $858993459, %rcx, %rcx # imm = 0x33333333
+; X86ASM-NEXT: shrq $33, %rcx
+; X86ASM-NEXT: addl %ecx, %ecx
+; X86ASM-NEXT: leal (%rcx,%rcx,4), %ecx
; X86ASM-NEXT: subl %ecx, %eax
; X86ASM-NEXT: # kill: def $eax killed $eax killed $rax
; X86ASM-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/known-bits.ll b/llvm/test/CodeGen/X86/known-bits.ll
index 9741f6f0a5e2d..0a337216c476b 100644
--- a/llvm/test/CodeGen/X86/known-bits.ll
+++ b/llvm/test/CodeGen/X86/known-bits.ll
@@ -8,9 +8,10 @@ define void @knownbits_zext_in_reg(ptr) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzbl (%eax), %ecx
-; X86-NEXT: imull $101, %ecx, %eax
-; X86-NEXT: shrl $14, %eax
-; X86-NEXT: imull $177, %ecx, %edx
+; X86-NEXT: leal (%ecx,%ecx,4), %eax
+; X86-NEXT: leal (%eax,%eax,4), %eax
+; X86-NEXT: shrl $12, %eax
+; X86-NEXT: imull $175, %ecx, %edx
; X86-NEXT: shrl $14, %edx
; X86-NEXT: movzbl %al, %ecx
; X86-NEXT: xorl %ebx, %ebx
@@ -31,9 +32,10 @@ define void @knownbits_zext_in_reg(ptr) nounwind {
; X64-LABEL: knownbits_zext_in_reg:
; X64: # %bb.0: # %BB
; X64-NEXT: movzbl (%rdi), %eax
-; X64-NEXT: imull $101, %eax, %ecx
-; X64-NEXT: shrl $14, %ecx
-; X64-NEXT: imull $177, %eax, %edx
+; X64-NEXT: leal (%rax,%rax,4), %ecx
+; X64-NEXT: leal (%rcx,%rcx,4), %ecx
+; X64-NEXT: shrl $12, %ecx
+; X64-NEXT: imull $175, %eax, %edx
; X64-NEXT: shrl $14, %edx
; X64-NEXT: movzbl %cl, %ecx
; X64-NEXT: xorl %esi, %esi
diff --git a/llvm/test/CodeGen/X86/known-pow2.ll b/llvm/test/CodeGen/X86/known-pow2.ll
index e183bbc15617d..715c95ee2b44c 100644
--- a/llvm/test/CodeGen/X86/known-pow2.ll
+++ b/llvm/test/CodeGen/X86/known-pow2.ll
@@ -24,7 +24,7 @@ define <4 x i32> @pow2_non_splat_vec(<4 x i32> %x) {
define <4 x i32> @pow2_non_splat_vec_fail0(<4 x i32> %x) {
; CHECK-LABEL: pow2_non_splat_vec_fail0:
; CHECK: # %bb.0:
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [954437177,1073741824,268435456,67108864]
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1908874353,1073741824,268435456,67108864]
; CHECK-NEXT: pmuludq %xmm0, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
@@ -32,7 +32,7 @@ define <4 x i32> @pow2_non_splat_vec_fail0(<4 x i32> %x) {
; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; CHECK-NEXT: movdqa %xmm1, %xmm3
-; CHECK-NEXT: psrld $1, %xmm3
+; CHECK-NEXT: psrld $2, %xmm3
; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/load-scalar-as-vector.ll b/llvm/test/CodeGen/X86/load-scalar-as-vector.ll
index 3edbcd1fe18eb..ea5b241f3184d 100644
--- a/llvm/test/CodeGen/X86/load-scalar-as-vector.ll
+++ b/llvm/test/CodeGen/X86/load-scalar-as-vector.ll
@@ -470,21 +470,17 @@ define <4 x i32> @udiv_op0_constant(ptr %p) nounwind {
define <2 x i64> @udiv_op1_constant(ptr %p) nounwind {
; SSE-LABEL: udiv_op1_constant:
; SSE: # %bb.0:
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: movabsq $-4392081922311798003, %rcx # imm = 0xC30C30C30C30C30D
-; SSE-NEXT: mulq %rcx
-; SSE-NEXT: shrq $4, %rdx
+; SSE-NEXT: movabsq $-4392081922311798005, %rax # imm = 0xC30C30C30C30C30B
+; SSE-NEXT: mulq (%rdi)
+; SSE-NEXT: shrq $5, %rdx
; SSE-NEXT: movq %rdx, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: udiv_op1_constant:
; AVX: # %bb.0:
-; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: movabsq $-4392081922311798003, %rcx # imm = 0xC30C30C30C30C30D
-; AVX-NEXT: mulq %rcx
-; AVX-NEXT: shrq $4, %rdx
+; AVX-NEXT: movabsq $-4392081922311798005, %rax # imm = 0xC30C30C30C30C30B
+; AVX-NEXT: mulq (%rdi)
+; AVX-NEXT: shrq $5, %rdx
; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: retq
%x = load i64, ptr %p
@@ -519,11 +515,8 @@ define <16 x i8> @urem_op1_constant(ptr %p) nounwind {
; SSE-LABEL: urem_op1_constant:
; SSE: # %bb.0:
; SSE-NEXT: movzbl (%rdi), %eax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: shrb %cl
-; SSE-NEXT: movzbl %cl, %ecx
-; SSE-NEXT: imull $49, %ecx, %ecx
-; SSE-NEXT: shrl $10, %ecx
+; SSE-NEXT: imull $97, %eax, %ecx
+; SSE-NEXT: shrl $12, %ecx
; SSE-NEXT: imull $42, %ecx, %ecx
; SSE-NEXT: subb %cl, %al
; SSE-NEXT: movzbl %al, %eax
@@ -533,11 +526,8 @@ define <16 x i8> @urem_op1_constant(ptr %p) nounwind {
; AVX-LABEL: urem_op1_constant:
; AVX: # %bb.0:
; AVX-NEXT: movzbl (%rdi), %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrb %cl
-; AVX-NEXT: movzbl %cl, %ecx
-; AVX-NEXT: imull $49, %ecx, %ecx
-; AVX-NEXT: shrl $10, %ecx
+; AVX-NEXT: imull $97, %eax, %ecx
+; AVX-NEXT: shrl $12, %ecx
; AVX-NEXT: imull $42, %ecx, %ecx
; AVX-NEXT: subb %cl, %al
; AVX-NEXT: vmovd %eax, %xmm0
diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
index 9e398096bfcc5..09717fb0bf37c 100644
--- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
+++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
@@ -284,7 +284,7 @@ define <4 x i1> @p8_vector_urem_by_const__nonsplat_undef3(<4 x i32> %x, <4 x i32
; SSE2-LABEL: p8_vector_urem_by_const__nonsplat_undef3:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,1431655765,1431655765,1431655765]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
@@ -292,7 +292,7 @@ define <4 x i1> @p8_vector_urem_by_const__nonsplat_undef3(<4 x i32> %x, <4 x i32
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: psrld $2, %xmm2
+; SSE2-NEXT: psrld $1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
@@ -309,12 +309,12 @@ define <4 x i1> @p8_vector_urem_by_const__nonsplat_undef3(<4 x i32> %x, <4 x i32
; SSE4: # %bb.0:
; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
+; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [1431655765,1431655765,1431655765,1431655765]
; SSE4-NEXT: pmuludq %xmm2, %xmm1
; SSE4-NEXT: pmuludq %xmm0, %xmm2
; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; SSE4-NEXT: psrld $2, %xmm2
+; SSE4-NEXT: psrld $1, %xmm2
; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE4-NEXT: psubd %xmm2, %xmm0
; SSE4-NEXT: pxor %xmm1, %xmm1
@@ -326,12 +326,12 @@ define <4 x i1> @p8_vector_urem_by_const__nonsplat_undef3(<4 x i32> %x, <4 x i32
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [128,128,128,128]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1431655765,1431655765,1431655765,1431655765]
; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
+; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6]
; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/pr35636.ll b/llvm/test/CodeGen/X86/pr35636.ll
index 0b7d64f38c780..34f0ade956c3b 100644
--- a/llvm/test/CodeGen/X86/pr35636.ll
+++ b/llvm/test/CodeGen/X86/pr35636.ll
@@ -5,10 +5,10 @@
define void @_Z15uint64_to_asciimPc(i64 %arg) {
; HSW-LABEL: _Z15uint64_to_asciimPc:
; HSW: # %bb.0: # %bb
-; HSW-NEXT: movabsq $811296384146066817, %rax # imm = 0xB424DC35095CD81
+; HSW-NEXT: movabsq $6490371073168534535, %rax # imm = 0x5A126E1A84AE6C07
; HSW-NEXT: movq %rdi, %rdx
; HSW-NEXT: mulxq %rax, %rax, %rax
-; HSW-NEXT: shrq $42, %rax
+; HSW-NEXT: shrq $45, %rax
; HSW-NEXT: imulq $281474977, %rax, %rax # imm = 0x10C6F7A1
; HSW-NEXT: shrq $20, %rax
; HSW-NEXT: leal (%rax,%rax,4), %eax
@@ -22,10 +22,10 @@ define void @_Z15uint64_to_asciimPc(i64 %arg) {
;
; ZN-LABEL: _Z15uint64_to_asciimPc:
; ZN: # %bb.0: # %bb
-; ZN-NEXT: movabsq $811296384146066817, %rax # imm = 0xB424DC35095CD81
+; ZN-NEXT: movabsq $6490371073168534535, %rax # imm = 0x5A126E1A84AE6C07
; ZN-NEXT: movq %rdi, %rdx
; ZN-NEXT: mulxq %rax, %rax, %rax
-; ZN-NEXT: shrq $42, %rax
+; ZN-NEXT: shrq $45, %rax
; ZN-NEXT: imulq $281474977, %rax, %rax # imm = 0x10C6F7A1
; ZN-NEXT: shrq $20, %rax
; ZN-NEXT: leal 5(%rax,%rax,4), %eax
diff --git a/llvm/test/CodeGen/X86/pr38217.ll b/llvm/test/CodeGen/X86/pr38217.ll
index f1538f3598aec..ce3f8805a6083 100644
--- a/llvm/test/CodeGen/X86/pr38217.ll
+++ b/llvm/test/CodeGen/X86/pr38217.ll
@@ -10,13 +10,13 @@ define dso_local void @_Z12d2s_bufferedmPc(i64 %arg, ptr nocapture %arg1) {
; CHECK-NEXT: jb .LBB0_3
; CHECK-NEXT: # %bb.1: # %bb2.preheader
; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: movabsq $3777893186295716171, %r8 # imm = 0x346DC5D63886594B
+; CHECK-NEXT: movabsq $-3335171328526686933, %r8 # imm = 0xD1B71758E219652B
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_2: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: mulq %r8
-; CHECK-NEXT: shrq $11, %rdx
+; CHECK-NEXT: shrq $13, %rdx
; CHECK-NEXT: imulq $10000, %rdx, %rax # imm = 0x2710
; CHECK-NEXT: movq %rdi, %r9
; CHECK-NEXT: subq %rax, %r9
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
index a950a13b0d8ca..3a7a01cb88124 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
@@ -12,44 +12,26 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) {
; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX256BW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX256BW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX256BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX256BW-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; AVX256BW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX256BW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
-; AVX256BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
-; AVX256BW-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX256BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; AVX256BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX256BW-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX256BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; AVX256BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX256BW-NEXT: vpmullw %ymm3, %ymm0, %ymm0
+; AVX256BW-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX256BW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX256BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_div7_32i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BWVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX512BWVL-NEXT: vpsrlw $8, %zmm1, %zmm1
-; AVX512BWVL-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BWVL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BWVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512BWVL-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; AVX512BW-LABEL: test_div7_32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
%res = udiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
ret <32 x i8> %res
diff --git a/llvm/test/CodeGen/X86/rem.ll b/llvm/test/CodeGen/X86/rem.ll
index 893b49f9a0179..6890240f561be 100644
--- a/llvm/test/CodeGen/X86/rem.ll
+++ b/llvm/test/CodeGen/X86/rem.ll
@@ -40,10 +40,9 @@ define i32 @test3(i32 %X) {
; CHECK-LABEL: test3:
; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl $-2139062143, %edx # imm = 0x80808081
+; CHECK-NEXT: movl $16843009, %edx # imm = 0x1010101
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: mull %edx
-; CHECK-NEXT: shrl $7, %edx
; CHECK-NEXT: movl %edx, %eax
; CHECK-NEXT: shll $8, %eax
; CHECK-NEXT: subl %eax, %edx
diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll
index 1ead3f98ab5d6..b96a644b803b9 100644
--- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll
+++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll
@@ -83,14 +83,13 @@ define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
; X64-LABEL: vrolq_extract_udiv:
; X64: # %bb.0:
; X64-NEXT: vpextrq $1, %xmm0, %rax
-; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; X64-NEXT: mulq %rcx
; X64-NEXT: vmovq %rdx, %xmm1
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: mulq %rcx
; X64-NEXT: vmovq %rdx, %xmm0
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-NEXT: vpsrlq $1, %xmm0, %xmm0
; X64-NEXT: vprolq $57, %xmm0, %xmm0
; X64-NEXT: retq
%lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
@@ -265,7 +264,7 @@ define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
; X64-LABEL: no_extract_udiv:
; X64: # %bb.0:
; X64-NEXT: vpextrq $1, %xmm0, %rcx
-; X64-NEXT: movabsq $-6148914691236517205, %rdi # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT: movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %rdi
; X64-NEXT: vmovq %rdx, %xmm1
@@ -274,14 +273,19 @@ define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
; X64-NEXT: mulq %rdi
; X64-NEXT: vmovq %rdx, %xmm0
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-NEXT: vpsrlq $1, %xmm0, %xmm0
-; X64-NEXT: movabsq $-6180857105216966645, %rdi # imm = 0xAA392F35DC17F00B
+; X64-NEXT: movabsq $-6180857105216966647, %rdi # imm = 0xAA392F35DC17F009
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %rdi
-; X64-NEXT: vmovq %rdx, %xmm1
+; X64-NEXT: subq %rdx, %rcx
+; X64-NEXT: shrq %rcx
+; X64-NEXT: addq %rdx, %rcx
+; X64-NEXT: vmovq %rcx, %xmm1
; X64-NEXT: movq %rsi, %rax
; X64-NEXT: mulq %rdi
-; X64-NEXT: vmovq %rdx, %xmm2
+; X64-NEXT: subq %rdx, %rsi
+; X64-NEXT: shrq %rsi
+; X64-NEXT: addq %rdx, %rsi
+; X64-NEXT: vmovq %rsi, %xmm2
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT: vpsrlq $9, %xmm1, %xmm1
; X64-NEXT: vpsllq $56, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll
index 8f046a4f5aea5..b86e1d6674340 100644
--- a/llvm/test/CodeGen/X86/rotate-extract.ll
+++ b/llvm/test/CodeGen/X86/rotate-extract.ll
@@ -82,17 +82,16 @@ define i8 @rolb_extract_udiv(i8 %i) nounwind {
; X86-LABEL: rolb_extract_udiv:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull $171, %eax, %eax
-; X86-NEXT: shrl $9, %eax
-; X86-NEXT: rolb $4, %al
-; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: imull $85, %eax, %eax
+; X86-NEXT: rolb $4, %ah
+; X86-NEXT: movb %ah, %al
; X86-NEXT: retl
;
; X64-LABEL: rolb_extract_udiv:
; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: imull $171, %eax, %eax
-; X64-NEXT: shrl $9, %eax
+; X64-NEXT: imull $85, %eax, %eax
+; X64-NEXT: shrl $8, %eax
; X64-NEXT: rolb $4, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
@@ -224,32 +223,23 @@ define i8 @no_extract_udiv(i8 %i) nounwind {
; X86-LABEL: no_extract_udiv:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull $171, %eax, %ecx
-; X86-NEXT: imull $79, %eax, %edx
-; X86-NEXT: subb %dh, %al
-; X86-NEXT: shrb %al
-; X86-NEXT: addb %dh, %al
-; X86-NEXT: shrb $5, %al
-; X86-NEXT: shlb $3, %ch
-; X86-NEXT: orb %al, %ch
-; X86-NEXT: andb $-9, %ch
-; X86-NEXT: movb %ch, %al
+; X86-NEXT: imull $85, %eax, %ecx
+; X86-NEXT: imull $83, %eax, %eax
+; X86-NEXT: shlb $4, %ch
+; X86-NEXT: shrl $12, %eax
+; X86-NEXT: orb %ch, %al
+; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
;
; X64-LABEL: no_extract_udiv:
; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %ecx
-; X64-NEXT: imull $171, %ecx, %eax
+; X64-NEXT: imull $85, %ecx, %eax
; X64-NEXT: shrl $8, %eax
-; X64-NEXT: imull $79, %ecx, %edx
-; X64-NEXT: shrl $8, %edx
-; X64-NEXT: subb %dl, %cl
-; X64-NEXT: shrb %cl
-; X64-NEXT: addb %dl, %cl
-; X64-NEXT: shrb $5, %cl
-; X64-NEXT: shlb $3, %al
+; X64-NEXT: imull $83, %ecx, %ecx
+; X64-NEXT: shrl $12, %ecx
+; X64-NEXT: shlb $4, %al
; X64-NEXT: orb %cl, %al
-; X64-NEXT: andb $-9, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%lhs_div = udiv i8 %i, 3
diff --git a/llvm/test/CodeGen/X86/urem-i8-constant.ll b/llvm/test/CodeGen/X86/urem-i8-constant.ll
index ae218405c0ef0..493b69fbf2937 100644
--- a/llvm/test/CodeGen/X86/urem-i8-constant.ll
+++ b/llvm/test/CodeGen/X86/urem-i8-constant.ll
@@ -7,8 +7,8 @@ define i8 @foo(i8 %tmp325) {
; CHECK-LABEL: foo:
; CHECK: # %bb.0:
; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: imull $111, %eax, %ecx
-; CHECK-NEXT: shrl $12, %ecx
+; CHECK-NEXT: imull $55, %eax, %ecx
+; CHECK-NEXT: shrl $11, %ecx
; CHECK-NEXT: leal (%ecx,%ecx,8), %edx
; CHECK-NEXT: leal (%ecx,%edx,4), %ecx
; CHECK-NEXT: subb %cl, %al
diff --git a/llvm/test/CodeGen/X86/urem-lkk.ll b/llvm/test/CodeGen/X86/urem-lkk.ll
index 573f875544cd4..00d7b09d013fe 100644
--- a/llvm/test/CodeGen/X86/urem-lkk.ll
+++ b/llvm/test/CodeGen/X86/urem-lkk.ll
@@ -6,13 +6,9 @@ define i32 @fold_urem_positive_odd(i32 %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: imulq $1491936009, %rcx, %rcx # imm = 0x58ED2309
-; CHECK-NEXT: shrq $32, %rcx
-; CHECK-NEXT: movl %edi, %edx
-; CHECK-NEXT: subl %ecx, %edx
-; CHECK-NEXT: shrl %edx
-; CHECK-NEXT: addl %ecx, %edx
-; CHECK-NEXT: shrl $6, %edx
+; CHECK-NEXT: movl $2893451651, %edx # imm = 0xAC769183
+; CHECK-NEXT: imulq %rcx, %rdx
+; CHECK-NEXT: shrq $38, %rdx
; CHECK-NEXT: imull $95, %edx, %ecx
; CHECK-NEXT: subl %ecx, %eax
; CHECK-NEXT: retq
@@ -26,7 +22,7 @@ define i32 @fold_urem_positive_even(i32 %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: movl $4149100483, %edx # imm = 0xF74E3FC3
+; CHECK-NEXT: movl $4149100481, %edx # imm = 0xF74E3FC1
; CHECK-NEXT: imulq %rcx, %rdx
; CHECK-NEXT: shrq $42, %rdx
; CHECK-NEXT: imull $1060, %edx, %ecx # imm = 0x424
@@ -41,17 +37,14 @@ define i32 @fold_urem_positive_even(i32 %x) {
define i32 @combine_urem_udiv(i32 %x) {
; CHECK-LABEL: combine_urem_udiv:
; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: imulq $1491936009, %rax, %rcx # imm = 0x58ED2309
-; CHECK-NEXT: shrq $32, %rcx
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: subl %ecx, %eax
-; CHECK-NEXT: shrl %eax
-; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: shrl $6, %eax
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: movl $2893451651, %eax # imm = 0xAC769183
+; CHECK-NEXT: imulq %rcx, %rax
+; CHECK-NEXT: shrq $38, %rax
; CHECK-NEXT: imull $95, %eax, %ecx
; CHECK-NEXT: subl %ecx, %edi
; CHECK-NEXT: addl %edi, %eax
+; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-NEXT: retq
%1 = urem i32 %x, 95
%2 = udiv i32 %x, 95
@@ -93,12 +86,14 @@ define i32 @dont_fold_urem_i32_umax(i32 %x) {
define i64 @dont_fold_urem_i64(i64 %x) {
; CHECK-LABEL: dont_fold_urem_i64:
; CHECK: # %bb.0:
+; CHECK-NEXT: movabsq $188232082384791343, %rcx # imm = 0x29CBC14E5E0A72F
; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: movabsq $6023426636313322977, %rcx # imm = 0x5397829CBC14E5E1
; CHECK-NEXT: mulq %rcx
-; CHECK-NEXT: shrq $4, %rdx
-; CHECK-NEXT: imulq $98, %rdx, %rax
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: subq %rdx, %rax
+; CHECK-NEXT: shrq %rax
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: imulq $98, %rax, %rax
; CHECK-NEXT: subq %rax, %rdi
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: retq
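
The refreshed CHECK lines above only pin down which instructions are emitted; they do not by themselves show that a new (multiplier, shift) pair still reproduces the exact quotient for every dividend. A minimal stand-alone checker along the following lines (purely illustrative, not part of this patch; the helper name checkMagic and the constants in main are hypothetical) can exhaustively test a pair such as the (0xAC769183, shift 38) combination used for the divide-by-95 quotient in fold_urem_positive_odd above:

#include <cstdint>
#include <cstdio>

// Returns true iff ((x * Magic) >> Shift) == x / Divisor for every x < 2^Bits.
static bool checkMagic(uint64_t Magic, unsigned Shift, uint64_t Divisor,
                       unsigned Bits) {
  for (uint64_t X = 0, End = 1ULL << Bits; X < End; ++X)
    if ((uint64_t)(((unsigned __int128)X * Magic) >> Shift) != X / Divisor)
      return false;
  return true;
}

int main() {
  // Exhaustive sweep of the full i32 range for the pair shown above.
  std::printf("0xAC769183 >> 38 for d=95: %s\n",
              checkMagic(0xAC769183u, 38, 95, 32) ? "matches udiv" : "differs");
  return 0;
}

A sweep of all 32-bit dividends finishes in seconds; 64-bit pairs such as the movabsq constants above would need a sampled or algebraic argument instead.
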
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
index 2166e43fc4286..fb411b35e1b6a 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
@@ -246,21 +246,18 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_undef1:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [171798692,171798692,171798692,171798692]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psrld $3, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [25,25,25,25]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [25,25,25,25]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
@@ -271,12 +268,11 @@ define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-SSE41-LABEL: test_urem_odd_undef1:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [171798692,171798692,171798692,171798692]
; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: psrld $3, %xmm2
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
@@ -287,12 +283,11 @@ define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: test_urem_odd_undef1:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [171798692,171798692,171798692,171798692]
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -303,12 +298,11 @@ define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX2-LABEL: test_urem_odd_undef1:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [171798692,171798692,171798692,171798692]
; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpsrld $3, %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [25,25,25,25]
; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -320,12 +314,11 @@ define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX512VL-LABEL: test_urem_odd_undef1:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [171798692,171798692,171798692,171798692]
; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX512VL-NEXT: vpsrld $3, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -341,7 +334,7 @@ define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_undef1:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [687194767,687194767,687194767,687194767]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
@@ -349,15 +342,19 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psrld $5, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [100,100,100,100]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm1
+; CHECK-SSE2-NEXT: psrld $1, %xmm1
+; CHECK-SSE2-NEXT: paddd %xmm2, %xmm1
+; CHECK-SSE2-NEXT: psrld $4, %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [100,100,100,100]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
@@ -366,14 +363,18 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-SSE41-LABEL: test_urem_even_undef1:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [687194767,687194767,687194767,687194767]
; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: psrld $5, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrld $1, %xmm1
+; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrld $4, %xmm1
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -382,12 +383,15 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: test_urem_even_undef1:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [687194767,687194767,687194767,687194767]
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpsrld $4, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -398,12 +402,15 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX2-LABEL: test_urem_even_undef1:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [687194767,687194767,687194767,687194767]
; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpsrld $5, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpsrld $1, %xmm2, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $4, %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100]
; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -415,12 +422,15 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX512VL-LABEL: test_urem_even_undef1:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [687194767,687194767,687194767,687194767]
; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX512VL-NEXT: vpsrld $5, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpsrld $1, %xmm2, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $4, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
index 94c7892795c2b..b825d1a6931c8 100644
--- a/llvm/test/CodeGen/X86/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
@@ -6,81 +6,77 @@
define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; SSE-LABEL: fold_urem_vec_1:
; SSE: # %bb.0:
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: shrl $2, %ecx
-; SSE-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211
-; SSE-NEXT: shrl $19, %ecx
-; SSE-NEXT: imull $124, %ecx, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: movd %xmm0, %ecx
-; SSE-NEXT: movzwl %cx, %edx
-; SSE-NEXT: imull $44151, %edx, %edx # imm = 0xAC77
-; SSE-NEXT: shrl $22, %edx
-; SSE-NEXT: imull $95, %edx, %edx
-; SSE-NEXT: subl %edx, %ecx
-; SSE-NEXT: movd %ecx, %xmm1
-; SSE-NEXT: pinsrw $1, %eax, %xmm1
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: shrl %ecx
-; SSE-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73
-; SSE-NEXT: shrl $17, %ecx
-; SSE-NEXT: imull $98, %ecx, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pinsrw $2, %eax, %xmm1
; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: imull $1373, %eax, %ecx # imm = 0x55D
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: shll $6, %ecx
+; SSE-NEXT: leal (%rcx,%rax,2), %ecx
; SSE-NEXT: shrl $16, %ecx
-; SSE-NEXT: movl %eax, %edx
-; SSE-NEXT: subl %ecx, %edx
-; SSE-NEXT: movzwl %dx, %edx
-; SSE-NEXT: shrl %edx
-; SSE-NEXT: addl %ecx, %edx
-; SSE-NEXT: shrl $9, %edx
-; SSE-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB
+; SSE-NEXT: imull $1003, %ecx, %ecx # imm = 0x3EB
; SSE-NEXT: subl %ecx, %eax
+; SSE-NEXT: pextrw $1, %xmm0, %ecx
+; SSE-NEXT: imull $1057, %ecx, %edx # imm = 0x421
+; SSE-NEXT: shrl $16, %edx
+; SSE-NEXT: movl %ecx, %esi
+; SSE-NEXT: subl %edx, %esi
+; SSE-NEXT: movzwl %si, %esi
+; SSE-NEXT: shrl %esi
+; SSE-NEXT: addl %edx, %esi
+; SSE-NEXT: shrl %esi
+; SSE-NEXT: imull $124, %esi, %edx
+; SSE-NEXT: subl %edx, %ecx
+; SSE-NEXT: movd %xmm0, %edx
+; SSE-NEXT: movzwl %dx, %esi
+; SSE-NEXT: imull $690, %esi, %esi # imm = 0x2B2
+; SSE-NEXT: shrl $16, %esi
+; SSE-NEXT: imull $95, %esi, %esi
+; SSE-NEXT: subl %esi, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: pinsrw $1, %ecx, %xmm1
+; SSE-NEXT: pextrw $2, %xmm0, %ecx
+; SSE-NEXT: imull $21399, %ecx, %edx # imm = 0x5397
+; SSE-NEXT: shrl $21, %edx
+; SSE-NEXT: imull $98, %edx, %edx
+; SSE-NEXT: subl %edx, %ecx
+; SSE-NEXT: pinsrw $2, %ecx, %xmm1
; SSE-NEXT: pinsrw $3, %eax, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: fold_urem_vec_1:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $2, %ecx
-; AVX-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211
-; AVX-NEXT: shrl $19, %ecx
-; AVX-NEXT: imull $124, %ecx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vmovd %xmm0, %ecx
-; AVX-NEXT: movzwl %cx, %edx
-; AVX-NEXT: imull $44151, %edx, %edx # imm = 0xAC77
-; AVX-NEXT: shrl $22, %edx
-; AVX-NEXT: imull $95, %edx, %edx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
-; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl %ecx
-; AVX-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73
-; AVX-NEXT: shrl $17, %ecx
-; AVX-NEXT: imull $98, %ecx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: imull $1373, %eax, %ecx # imm = 0x55D
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shll $6, %ecx
+; AVX-NEXT: leal (%rcx,%rax,2), %ecx
; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: movl %eax, %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: shrl %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: shrl $9, %edx
-; AVX-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB
+; AVX-NEXT: imull $1003, %ecx, %ecx # imm = 0x3EB
; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
+; AVX-NEXT: vpextrw $1, %xmm0, %ecx
+; AVX-NEXT: imull $1057, %ecx, %edx # imm = 0x421
+; AVX-NEXT: shrl $16, %edx
+; AVX-NEXT: movl %ecx, %esi
+; AVX-NEXT: subl %edx, %esi
+; AVX-NEXT: movzwl %si, %esi
+; AVX-NEXT: shrl %esi
+; AVX-NEXT: addl %edx, %esi
+; AVX-NEXT: shrl %esi
+; AVX-NEXT: imull $124, %esi, %edx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: vmovd %xmm0, %edx
+; AVX-NEXT: movzwl %dx, %esi
+; AVX-NEXT: imull $690, %esi, %esi # imm = 0x2B2
+; AVX-NEXT: shrl $16, %esi
+; AVX-NEXT: imull $95, %esi, %esi
+; AVX-NEXT: subl %esi, %edx
+; AVX-NEXT: vmovd %edx, %xmm1
+; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
+; AVX-NEXT: vpextrw $2, %xmm0, %ecx
+; AVX-NEXT: imull $21399, %ecx, %edx # imm = 0x5397
+; AVX-NEXT: shrl $21, %edx
+; AVX-NEXT: imull $98, %edx, %edx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0
+; AVX-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = urem <4 x i16> %x, <i16 95, i16 124, i16 98, i16 1003>
ret <4 x i16> %1
@@ -89,17 +85,15 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; SSE-LABEL: fold_urem_vec_2:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [690,690,690,690,690,690,690,690]
; SSE-NEXT: pmulhuw %xmm0, %xmm1
-; SSE-NEXT: psrlw $6, %xmm1
; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,95,95,95,95,95,95,95]
; SSE-NEXT: psubw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: fold_urem_vec_2:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,44151,44151,44151,44151]
-; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [690,690,690,690,690,690,690,690]
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,95,95,95,95,95,95,95]
; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -112,9 +106,8 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; SSE-LABEL: combine_urem_udiv:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [690,690,690,690,690,690,690,690]
; SSE-NEXT: pmulhuw %xmm0, %xmm1
-; SSE-NEXT: psrlw $6, %xmm1
; SSE-NEXT: pmovsxbw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
; SSE-NEXT: pmullw %xmm1, %xmm2
; SSE-NEXT: psubw %xmm2, %xmm0
@@ -123,8 +116,7 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
;
; AVX-LABEL: combine_urem_udiv:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,44151,44151,44151,44151]
-; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [690,690,690,690,690,690,690,690]
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [95,95,95,95,95,95,95,95]
; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -148,8 +140,8 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; SSE-NEXT: andl $7, %eax
; SSE-NEXT: pinsrw $2, %eax, %xmm1
; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
-; SSE-NEXT: shrl $22, %ecx
+; SSE-NEXT: imull $690, %eax, %ecx # imm = 0x2B2
+; SSE-NEXT: shrl $16, %ecx
; SSE-NEXT: imull $95, %ecx, %ecx
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: pinsrw $3, %eax, %xmm1
@@ -166,8 +158,8 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
; AVX1-NEXT: vpextrw $3, %xmm0, %eax
-; AVX1-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
-; AVX1-NEXT: shrl $22, %ecx
+; AVX1-NEXT: imull $690, %eax, %ecx # imm = 0x2B2
+; AVX1-NEXT: shrl $16, %ecx
; AVX1-NEXT: imull $95, %ecx, %ecx
; AVX1-NEXT: subl %ecx, %eax
; AVX1-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
@@ -184,8 +176,8 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
; AVX2-NEXT: vpextrw $3, %xmm0, %eax
-; AVX2-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
-; AVX2-NEXT: shrl $22, %ecx
+; AVX2-NEXT: imull $690, %eax, %ecx # imm = 0x2B2
+; AVX2-NEXT: shrl $16, %ecx
; AVX2-NEXT: imull $95, %ecx, %ecx
; AVX2-NEXT: subl %ecx, %eax
; AVX2-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
@@ -198,30 +190,24 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; SSE-LABEL: dont_fold_urem_one:
; SSE: # %bb.0:
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
-; SSE-NEXT: shrl $16, %ecx
-; SSE-NEXT: movl %eax, %edx
-; SSE-NEXT: subl %ecx, %edx
-; SSE-NEXT: movzwl %dx, %edx
-; SSE-NEXT: shrl %edx
-; SSE-NEXT: addl %ecx, %edx
-; SSE-NEXT: shrl $4, %edx
-; SSE-NEXT: leal (%rdx,%rdx,2), %ecx
-; SSE-NEXT: shll $3, %ecx
-; SSE-NEXT: subl %ecx, %edx
-; SSE-NEXT: addl %eax, %edx
; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B
+; SSE-NEXT: imull $51305, %eax, %ecx # imm = 0xC869
; SSE-NEXT: shrl $25, %ecx
; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pinsrw $1, %eax, %xmm1
-; SSE-NEXT: pinsrw $2, %edx, %xmm1
+; SSE-NEXT: pextrw $2, %xmm0, %eax
+; SSE-NEXT: imull $45589, %eax, %ecx # imm = 0xB215
+; SSE-NEXT: shrl $20, %ecx
+; SSE-NEXT: leal (%rcx,%rcx,2), %edx
+; SSE-NEXT: shll $3, %edx
+; SSE-NEXT: subl %edx, %ecx
+; SSE-NEXT: addl %eax, %ecx
+; SSE-NEXT: pinsrw $2, %ecx, %xmm1
; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
-; SSE-NEXT: shrl $26, %ecx
+; SSE-NEXT: imull $24749, %eax, %ecx # imm = 0x60AD
+; SSE-NEXT: shrl $27, %ecx
; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: pinsrw $3, %eax, %xmm1
@@ -230,30 +216,24 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
;
; AVX-LABEL: dont_fold_urem_one:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
-; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: movl %eax, %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: shrl %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: shrl $4, %edx
-; AVX-NEXT: leal (%rdx,%rdx,2), %ecx
-; AVX-NEXT: shll $3, %ecx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: addl %eax, %edx
; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B
+; AVX-NEXT: imull $51305, %eax, %ecx # imm = 0xC869
; AVX-NEXT: shrl $25, %ecx
; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
+; AVX-NEXT: vpextrw $2, %xmm0, %eax
+; AVX-NEXT: imull $45589, %eax, %ecx # imm = 0xB215
+; AVX-NEXT: shrl $20, %ecx
+; AVX-NEXT: leal (%rcx,%rcx,2), %edx
+; AVX-NEXT: shll $3, %edx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
-; AVX-NEXT: shrl $26, %ecx
+; AVX-NEXT: imull $24749, %eax, %ecx # imm = 0x60AD
+; AVX-NEXT: shrl $27, %ecx
; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
@@ -276,34 +256,29 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
; SSE-LABEL: dont_fold_urem_i64:
; SSE: # %bb.0:
; SSE-NEXT: movq %xmm1, %rcx
-; SSE-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9
+; SSE-NEXT: movabsq $-5614226457215950493, %rdx # imm = 0xB21642C8590B2163
; SSE-NEXT: movq %rcx, %rax
; SSE-NEXT: mulq %rdx
-; SSE-NEXT: movq %rcx, %rax
-; SSE-NEXT: subq %rdx, %rax
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: addq %rdx, %rax
-; SSE-NEXT: shrq $4, %rax
-; SSE-NEXT: leaq (%rax,%rax,2), %rdx
-; SSE-NEXT: shlq $3, %rdx
-; SSE-NEXT: subq %rdx, %rax
-; SSE-NEXT: addq %rcx, %rax
-; SSE-NEXT: movq %rax, %xmm2
+; SSE-NEXT: shrq $4, %rdx
+; SSE-NEXT: leaq (%rdx,%rdx,2), %rax
+; SSE-NEXT: shlq $3, %rax
+; SSE-NEXT: subq %rax, %rdx
+; SSE-NEXT: addq %rcx, %rdx
+; SSE-NEXT: movq %rdx, %xmm2
; SSE-NEXT: pextrq $1, %xmm1, %rcx
-; SSE-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D
+; SSE-NEXT: movabsq $3483213337908644819, %rdx # imm = 0x3056DC1372F28BD3
; SSE-NEXT: movq %rcx, %rax
; SSE-NEXT: mulq %rdx
-; SSE-NEXT: shrq $12, %rdx
+; SSE-NEXT: shrq $10, %rdx
; SSE-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
; SSE-NEXT: subq %rax, %rcx
; SSE-NEXT: movq %rcx, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: movabsq $1805185964399711473, %rdx # imm = 0x190D4F120190D4F1
; SSE-NEXT: movq %rcx, %rax
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
; SSE-NEXT: mulq %rdx
-; SSE-NEXT: shrq $7, %rdx
+; SSE-NEXT: shrq $6, %rdx
; SSE-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
; SSE-NEXT: subq %rax, %rcx
; SSE-NEXT: movq %rcx, %xmm0
@@ -315,34 +290,29 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovq %xmm1, %rcx
-; AVX1-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9
+; AVX1-NEXT: movabsq $-5614226457215950493, %rdx # imm = 0xB21642C8590B2163
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: mulq %rdx
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: addq %rdx, %rax
-; AVX1-NEXT: shrq $4, %rax
-; AVX1-NEXT: leaq (%rax,%rax,2), %rdx
-; AVX1-NEXT: shlq $3, %rdx
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: addq %rcx, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: shrq $4, %rdx
+; AVX1-NEXT: leaq (%rdx,%rdx,2), %rax
+; AVX1-NEXT: shlq $3, %rax
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: addq %rcx, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm2
; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX1-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D
+; AVX1-NEXT: movabsq $3483213337908644819, %rdx # imm = 0x3056DC1372F28BD3
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: mulq %rdx
-; AVX1-NEXT: shrq $12, %rdx
+; AVX1-NEXT: shrq $10, %rdx
; AVX1-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
; AVX1-NEXT: subq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT: movabsq $1805185964399711473, %rdx # imm = 0x190D4F120190D4F1
; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
; AVX1-NEXT: mulq %rdx
-; AVX1-NEXT: shrq $7, %rdx
+; AVX1-NEXT: shrq $6, %rdx
; AVX1-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
; AVX1-NEXT: subq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm0
@@ -354,34 +324,29 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovq %xmm1, %rcx
-; AVX2-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9
+; AVX2-NEXT: movabsq $-5614226457215950493, %rdx # imm = 0xB21642C8590B2163
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: mulq %rdx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: addq %rdx, %rax
-; AVX2-NEXT: shrq $4, %rax
-; AVX2-NEXT: leaq (%rax,%rax,2), %rdx
-; AVX2-NEXT: shlq $3, %rdx
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: shrq $4, %rdx
+; AVX2-NEXT: leaq (%rdx,%rdx,2), %rax
+; AVX2-NEXT: shlq $3, %rax
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D
+; AVX2-NEXT: movabsq $3483213337908644819, %rdx # imm = 0x3056DC1372F28BD3
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: mulq %rdx
-; AVX2-NEXT: shrq $12, %rdx
+; AVX2-NEXT: shrq $10, %rdx
; AVX2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
; AVX2-NEXT: subq %rax, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: movabsq $1805185964399711473, %rdx # imm = 0x190D4F120190D4F1
; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
; AVX2-NEXT: mulq %rdx
-; AVX2-NEXT: shrq $7, %rdx
+; AVX2-NEXT: shrq $6, %rdx
; AVX2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
; AVX2-NEXT: subq %rax, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm0
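
For the 16-bit lanes in this file, the same hypothetical checkMagic sketch from the note after urem-lkk.ll applies with Bits = 16; for example, checkMagic(690, 16, 95, 16) covers the pmulhuw/imull-by-690 sequences above, and the other single multiply-and-shift pairs in this file can be swept the same way.
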
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
index a9427be39ca3e..c61c827898f4a 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -12,67 +12,40 @@
define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_div7_2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq %xmm0, %rcx
-; SSE2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
-; SSE2-NEXT: movq %rcx, %rax
-; SSE2-NEXT: mulq %rsi
-; SSE2-NEXT: subq %rdx, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: addq %rdx, %rcx
-; SSE2-NEXT: movq %rcx, %xmm1
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
+; SSE2-NEXT: mulq %rcx
+; SSE2-NEXT: movq %rdx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rcx
-; SSE2-NEXT: movq %rcx, %rax
-; SSE2-NEXT: mulq %rsi
-; SSE2-NEXT: subq %rdx, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: addq %rdx, %rcx
-; SSE2-NEXT: movq %rcx, %xmm0
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: mulq %rcx
+; SSE2-NEXT: movq %rdx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: psrlq $2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_div7_2i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: pextrq $1, %xmm0, %rcx
-; SSE41-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
-; SSE41-NEXT: movq %rcx, %rax
-; SSE41-NEXT: mulq %rsi
-; SSE41-NEXT: subq %rdx, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: addq %rdx, %rcx
-; SSE41-NEXT: movq %rcx, %xmm1
-; SSE41-NEXT: movq %xmm0, %rcx
-; SSE41-NEXT: movq %rcx, %rax
-; SSE41-NEXT: mulq %rsi
-; SSE41-NEXT: subq %rdx, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: addq %rdx, %rcx
-; SSE41-NEXT: movq %rcx, %xmm0
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
+; SSE41-NEXT: mulq %rcx
+; SSE41-NEXT: movq %rdx, %xmm1
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: mulq %rcx
+; SSE41-NEXT: movq %rdx, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: psrlq $2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_div7_2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm1
-; AVX-NEXT: vmovq %xmm0, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
+; AVX-NEXT: mulq %rcx
+; AVX-NEXT: vmovq %rdx, %xmm1
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: mulq %rcx
+; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: vpsrlq $2, %xmm0, %xmm0
; AVX-NEXT: retq
%res = udiv <2 x i64> %a, <i64 7, i64 7>
ret <2 x i64> %res
@@ -82,17 +55,12 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_div7_4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: psubd %xmm2, %xmm0
-; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT: psrld $2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_div7_4i32:
@@ -100,13 +68,9 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; SSE41-NEXT: pmuludq %xmm2, %xmm1
-; SSE41-NEXT: pmuludq %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; SSE41-NEXT: psubd %xmm2, %xmm0
-; SSE41-NEXT: psrld $1, %xmm0
-; SSE41-NEXT: paddd %xmm2, %xmm0
-; SSE41-NEXT: psrld $2, %xmm0
+; SSE41-NEXT: pmuludq %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_div7_4i32:
@@ -114,13 +78,9 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $2, %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_4i32:
@@ -128,13 +88,9 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $2, %xmm0, %xmm0
+; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: retq
%res = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
ret <4 x i32> %res
@@ -143,21 +99,12 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_div7_8i16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
-; SSE-NEXT: pmulhuw %xmm0, %xmm1
-; SSE-NEXT: psubw %xmm1, %xmm0
-; SSE-NEXT: psrlw $1, %xmm0
-; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: psrlw $2, %xmm0
+; SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [9363,9363,9363,9363,9363,9363,9363,9363]
; SSE-NEXT: retq
;
; AVX-LABEL: test_div7_8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [9363,9363,9363,9363,9363,9363,9363,9363]
-; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [9363,9363,9363,9363,9363,9363,9363,9363]
; AVX-NEXT: retq
%res = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
ret <8 x i16> %res
@@ -172,37 +119,24 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [37,37,37,37,37,37,37,37]
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE2-NEXT: pmullw %xmm3, %xmm4
-; SSE2-NEXT: psrlw $8, %xmm4
-; SSE2-NEXT: packuswb %xmm2, %xmm4
-; SSE2-NEXT: psubb %xmm4, %xmm0
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: paddb %xmm4, %xmm0
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pmullw %xmm3, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_div7_16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37]
-; SSE41-NEXT: pmullw %xmm1, %xmm2
-; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pmullw %xmm1, %xmm3
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: packuswb %xmm2, %xmm3
-; SSE41-NEXT: psubb %xmm3, %xmm0
-; SSE41-NEXT: psrlw $1, %xmm0
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: paddb %xmm3, %xmm0
-; SSE41-NEXT: psrlw $2, %xmm0
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
+; SSE41-NEXT: pmullw %xmm2, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pmullw %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_div7_16i8:
@@ -212,46 +146,29 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2NOBW-LABEL: test_div7_16i8:
; AVX2NOBW: # %bb.0:
-; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX2NOBW-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm0
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2NOBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT: vzeroupper
; AVX2NOBW-NEXT: retq
;
; AVX512BW-LABEL: test_div7_16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%res = udiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
@@ -268,33 +185,29 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,256,256,256,256,256,256,256]
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [147,79,171,117,205,57,57,37]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [145,20,85,185,51,113,113,37]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,256,256,256,256,256,256,128]
-; SSE2-NEXT: psrlw $8, %xmm3
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,32,57,205,117,171,79,147]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,32,113,51,185,85,20,145]
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: packuswb %xmm2, %xmm3
; SSE2-NEXT: psubb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,128,0,0,0,128]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,0,0,0,128,0,0,0]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,0,0,0,0,128]
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: paddb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,64,32,32,32,128,128,64]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,256,64,32,128,64,64,256]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,256,128,32,32,32,64,64]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,256,64,128,32,64,256,32]
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -302,38 +215,28 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-LABEL: test_divconstant_16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; SSE41-NEXT: psllw $7, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3,4,5,6,7]
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [147,79,171,117,205,57,57,37]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [145,20,85,185,51,113,113,37]
+; SSE41-NEXT: psrlw $8, %xmm2
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,32,113,51,185,85,20,145]
; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: psllw $7, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,6],xmm4[7]
-; SSE41-NEXT: psrlw $8, %xmm4
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [37,32,57,205,117,171,79,147]
-; SSE41-NEXT: psrlw $8, %xmm4
-; SSE41-NEXT: packuswb %xmm3, %xmm4
-; SSE41-NEXT: psubb %xmm4, %xmm0
+; SSE41-NEXT: packuswb %xmm2, %xmm3
+; SSE41-NEXT: psubb %xmm3, %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,0,0,128]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,0,0,0,128,0,0,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,0,0,0,0,128]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm0, %xmm2
-; SSE41-NEXT: paddb %xmm4, %xmm2
+; SSE41-NEXT: paddb %xmm3, %xmm2
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,64,32,32,32,128,128,64]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,256,64,32,128,64,64,256]
; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,256,128,32,32,32,64,64]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,256,64,128,32,64,256,32]
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: packuswb %xmm2, %xmm0
; SSE41-NEXT: retq
@@ -341,35 +244,27 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-LABEL: test_divconstant_16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpsllw $7, %xmm3, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [147,79,171,117,205,57,57,37]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [145,20,85,185,51,113,113,37]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37,32,57,205,117,171,79,147]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37,32,113,51,185,85,20,145]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,0,0,128,0,0,0,128]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,0,0,0,128,0,0,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,0,0,0,0,0,128]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [64,64,32,32,32,128,128,64]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,256,64,32,128,64,64,256]
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,256,128,32,32,32,64,64]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,256,64,128,32,64,256,32]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -377,21 +272,19 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; AVX2NOBW-LABEL: test_divconstant_16i8:
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,256,256,256,256,256,128,128,256,256,256,256,256,256,256]
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,32,57,205,117,171,79,147,147,79,171,117,205,57,57,37]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,32,113,51,185,85,20,145,145,20,85,185,51,113,113,37]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,0,0,128,0,0,0,0,0,0,128,0,0,0,128]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,0,0,0,0,0,0,128,128,0,0,0,0,0,0,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,256,128,32,32,32,64,64,64,64,32,32,32,128,128,64]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,256,64,128,32,64,256,32,32,256,64,32,128,64,64,256]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2NOBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -400,20 +293,18 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
;
; AVX512BW-LABEL: test_divconstant_16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,32,57,205,117,171,79,147,147,79,171,117,205,57,57,37]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,32,113,51,185,85,20,145,145,20,85,185,51,113,113,37]
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,0,0,128,0,0,0,0,0,0,128,0,0,0,128]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,0,0,0,0,0,0,128,128,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,0,1,3,3,3,2,2,2,2,3,3,3,1,1,2]
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,2,1,3,2,0,3,3,0,2,3,1,2,2,0]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -434,28 +325,18 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: mulq %rsi
-; SSE2-NEXT: movq %rcx, %rax
-; SSE2-NEXT: subq %rdx, %rax
-; SSE2-NEXT: shrq %rax
-; SSE2-NEXT: addq %rdx, %rax
-; SSE2-NEXT: shrq $2, %rax
-; SSE2-NEXT: leaq (,%rax,8), %rdx
-; SSE2-NEXT: subq %rdx, %rax
-; SSE2-NEXT: addq %rcx, %rax
-; SSE2-NEXT: movq %rax, %xmm1
+; SSE2-NEXT: leaq (,%rdx,8), %rax
+; SSE2-NEXT: subq %rax, %rdx
+; SSE2-NEXT: addq %rcx, %rdx
+; SSE2-NEXT: movq %rdx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: mulq %rsi
-; SSE2-NEXT: movq %rcx, %rax
-; SSE2-NEXT: subq %rdx, %rax
-; SSE2-NEXT: shrq %rax
-; SSE2-NEXT: addq %rdx, %rax
-; SSE2-NEXT: shrq $2, %rax
-; SSE2-NEXT: leaq (,%rax,8), %rdx
-; SSE2-NEXT: subq %rdx, %rax
-; SSE2-NEXT: addq %rcx, %rax
-; SSE2-NEXT: movq %rax, %xmm0
+; SSE2-NEXT: leaq (,%rdx,8), %rax
+; SSE2-NEXT: subq %rax, %rdx
+; SSE2-NEXT: addq %rcx, %rdx
+; SSE2-NEXT: movq %rdx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -466,27 +347,17 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE41-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: mulq %rsi
-; SSE41-NEXT: movq %rcx, %rax
-; SSE41-NEXT: subq %rdx, %rax
-; SSE41-NEXT: shrq %rax
-; SSE41-NEXT: addq %rdx, %rax
-; SSE41-NEXT: shrq $2, %rax
-; SSE41-NEXT: leaq (,%rax,8), %rdx
-; SSE41-NEXT: subq %rdx, %rax
-; SSE41-NEXT: addq %rcx, %rax
-; SSE41-NEXT: movq %rax, %xmm1
+; SSE41-NEXT: leaq (,%rdx,8), %rax
+; SSE41-NEXT: subq %rax, %rdx
+; SSE41-NEXT: addq %rcx, %rdx
+; SSE41-NEXT: movq %rdx, %xmm1
; SSE41-NEXT: movq %xmm0, %rcx
; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: mulq %rsi
-; SSE41-NEXT: movq %rcx, %rax
-; SSE41-NEXT: subq %rdx, %rax
-; SSE41-NEXT: shrq %rax
-; SSE41-NEXT: addq %rdx, %rax
-; SSE41-NEXT: shrq $2, %rax
-; SSE41-NEXT: leaq (,%rax,8), %rdx
-; SSE41-NEXT: subq %rdx, %rax
-; SSE41-NEXT: addq %rcx, %rax
-; SSE41-NEXT: movq %rax, %xmm0
+; SSE41-NEXT: leaq (,%rdx,8), %rax
+; SSE41-NEXT: subq %rax, %rdx
+; SSE41-NEXT: addq %rcx, %rdx
+; SSE41-NEXT: movq %rdx, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
@@ -496,27 +367,17 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm1
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm1
; AVX-NEXT: vmovq %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%res = urem <2 x i64> %a, <i64 7, i64 7>
@@ -534,15 +395,10 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psubd %xmm2, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: psrld $2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pslld $3, %xmm2
-; SSE2-NEXT: psubd %xmm2, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pslld $3, %xmm1
+; SSE2-NEXT: psubd %xmm1, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_4i32:
@@ -553,15 +409,10 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE41-NEXT: pmuludq %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psubd %xmm2, %xmm1
-; SSE41-NEXT: psrld $1, %xmm1
-; SSE41-NEXT: paddd %xmm2, %xmm1
-; SSE41-NEXT: psrld $2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: pslld $3, %xmm2
-; SSE41-NEXT: psubd %xmm2, %xmm1
-; SSE41-NEXT: paddd %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: pslld $3, %xmm1
+; SSE41-NEXT: psubd %xmm1, %xmm2
+; SSE41-NEXT: paddd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_rem7_4i32:
@@ -572,10 +423,6 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
; AVX1-NEXT: vpslld $3, %xmm1, %xmm2
; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -589,10 +436,6 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
; AVX2-NEXT: vpslld $3, %xmm1, %xmm2
; AVX2-NEXT: vpsubd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -606,24 +449,15 @@ define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
; SSE-NEXT: pmulhuw %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psubw %xmm1, %xmm2
-; SSE-NEXT: psrlw $1, %xmm2
-; SSE-NEXT: paddw %xmm1, %xmm2
-; SSE-NEXT: psrlw $2, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: psllw $3, %xmm1
-; SSE-NEXT: psubw %xmm1, %xmm2
-; SSE-NEXT: paddw %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psllw $3, %xmm2
+; SSE-NEXT: psubw %xmm2, %xmm1
+; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_rem7_8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [9363,9363,9363,9363,9363,9363,9363,9363]
-; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vpsrlw $2, %xmm1, %xmm1
; AVX-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX-NEXT: vpsubw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -646,18 +480,11 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: pmullw %xmm3, %xmm4
; SSE2-NEXT: psrlw $8, %xmm4
; SSE2-NEXT: packuswb %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psubb %xmm4, %xmm1
-; SSE2-NEXT: psrlw $1, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: paddb %xmm4, %xmm1
-; SSE2-NEXT: psrlw $2, %xmm1
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: psllw $3, %xmm1
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psllw $3, %xmm2
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: psubb %xmm2, %xmm1
-; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: psubb %xmm1, %xmm4
+; SSE2-NEXT: paddb %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_16i8:
@@ -672,18 +499,11 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: pmullw %xmm1, %xmm3
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: packuswb %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psubb %xmm3, %xmm1
-; SSE41-NEXT: psrlw $1, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm1
+; SSE41-NEXT: psllw $3, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: paddb %xmm3, %xmm1
-; SSE41-NEXT: psrlw $2, %xmm1
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psllw $3, %xmm2
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE41-NEXT: psubb %xmm2, %xmm1
-; SSE41-NEXT: paddb %xmm1, %xmm0
+; SSE41-NEXT: psubb %xmm1, %xmm3
+; SSE41-NEXT: paddb %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_rem7_16i8:
@@ -697,12 +517,6 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
@@ -716,12 +530,6 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm2
-; AVX2NOBW-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
@@ -735,12 +543,6 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm2
-; AVX512BW-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
@@ -761,37 +563,33 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,256,256,256,256,256,256,256]
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [147,79,171,117,205,57,57,37]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [145,20,85,185,51,113,113,37]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,256,256,256,256,256,256,128]
-; SSE2-NEXT: psrlw $8, %xmm3
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,32,57,205,117,171,79,147]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,32,113,51,185,85,20,145]
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: packuswb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psubb %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [0,0,0,128,0,0,0,128]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE2-NEXT: psrlw $8, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,0,0,0,128,0,0,0]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,0,0,0,0,128]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: packuswb %xmm4, %xmm2
; SSE2-NEXT: paddb %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [64,64,32,32,32,128,128,64]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [32,256,64,32,128,64,64,256]
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [14,13,12,11,10,9,9,7]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,256,128,32,32,32,64,64]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [256,256,64,128,32,64,256,32]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,8,9,10,11,12,13,14]
; SSE2-NEXT: pand %xmm4, %xmm2
@@ -802,41 +600,31 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-LABEL: test_remconstant_16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; SSE41-NEXT: psllw $7, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3,4,5,6,7]
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [147,79,171,117,205,57,57,37]
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: psllw $7, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,6],xmm4[7]
-; SSE41-NEXT: psrlw $8, %xmm4
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [37,32,57,205,117,171,79,147]
-; SSE41-NEXT: psrlw $8, %xmm4
-; SSE41-NEXT: packuswb %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psubb %xmm4, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,128,0,0,0,128]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [145,20,85,185,51,113,113,37]
; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [128,0,0,0,128,0,0,0]
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,32,113,51,185,85,20,145]
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: packuswb %xmm2, %xmm3
-; SSE41-NEXT: paddb %xmm4, %xmm3
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [64,64,32,32,32,128,128,64]
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,256,128,32,32,32,64,64]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psubb %xmm3, %xmm2
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: packuswb %xmm3, %xmm2
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [0,0,0,0,0,0,0,128]
+; SSE41-NEXT: psrlw $8, %xmm4
+; SSE41-NEXT: packuswb %xmm2, %xmm4
+; SSE41-NEXT: paddb %xmm3, %xmm4
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [32,256,64,32,128,64,64,256]
+; SSE41-NEXT: psrlw $8, %xmm4
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [256,256,64,128,32,64,256,32]
+; SSE41-NEXT: psrlw $8, %xmm2
+; SSE41-NEXT: packuswb %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
; SSE41-NEXT: psllw $8, %xmm1
@@ -849,35 +637,27 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-LABEL: test_remconstant_16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpsllw $7, %xmm3, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [145,20,85,185,51,113,113,37]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [147,79,171,117,205,57,57,37]
-; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37,32,57,205,117,171,79,147]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37,32,113,51,185,85,20,145]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,0,128,0,0,0,128]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [128,0,0,0,128,0,0,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,0,0,0,0,0,0,128]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [64,64,32,32,32,128,128,64]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,256,64,32,128,64,64,256]
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [64,256,128,32,32,32,64,64]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [256,256,64,128,32,64,256,32]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
@@ -891,21 +671,19 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; AVX2NOBW-LABEL: test_remconstant_16i8:
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,256,256,256,256,256,128,128,256,256,256,256,256,256,256]
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,32,57,205,117,171,79,147,147,79,171,117,205,57,57,37]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,32,113,51,185,85,20,145,145,20,85,185,51,113,113,37]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm2
; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [128,0,0,0,128,0,0,0,0,0,0,128,0,0,0,128]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [0,0,0,0,0,0,0,128,128,0,0,0,0,0,0,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2NOBW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [64,256,128,32,32,32,64,64,64,64,32,32,32,128,128,64]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,64,128,32,64,256,32,32,256,64,32,128,64,64,256]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
@@ -917,20 +695,18 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
;
; AVX512BW-LABEL: test_remconstant_16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,32,57,205,117,171,79,147,147,79,171,117,205,57,57,37]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,32,113,51,185,85,20,145,145,20,85,185,51,113,113,37]
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm2
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [128,0,0,0,128,0,0,0,0,0,0,128,0,0,0,128]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [0,0,0,0,0,0,0,128,128,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [2,0,1,3,3,3,2,2,2,2,3,3,3,1,1,2]
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,2,1,3,2,0,3,3,0,2,3,1,2,2,0]
; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
index 3ed716881281d..a0813134276bb 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -10,79 +10,44 @@
define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: test_div7_4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX1-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: mulq %rsi
-; AVX1-NEXT: subq %rdx, %rcx
-; AVX1-NEXT: shrq %rcx
-; AVX1-NEXT: addq %rdx, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm1
-; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: mulq %rsi
-; AVX1-NEXT: subq %rdx, %rcx
-; AVX1-NEXT: shrq %rcx
-; AVX1-NEXT: addq %rdx, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: mulq %rsi
-; AVX1-NEXT: subq %rdx, %rcx
-; AVX1-NEXT: shrq %rcx
-; AVX1-NEXT: addq %rdx, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm2
-; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: mulq %rsi
-; AVX1-NEXT: subq %rdx, %rcx
-; AVX1-NEXT: shrq %rcx
-; AVX1-NEXT: addq %rdx, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpextrq $1, %xmm1, %rax
+; AVX1-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
+; AVX1-NEXT: mulq %rcx
+; AVX1-NEXT: vmovq %rdx, %xmm2
+; AVX1-NEXT: vmovq %xmm1, %rax
+; AVX1-NEXT: mulq %rcx
+; AVX1-NEXT: vmovq %rdx, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: mulq %rcx
+; AVX1-NEXT: vmovq %rdx, %xmm2
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: mulq %rcx
+; AVX1-NEXT: vmovq %rdx, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX1-NEXT: vpsrlq $2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: subq %rdx, %rcx
-; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm2
-; AVX2-NEXT: vmovq %xmm1, %rcx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: subq %rdx, %rcx
-; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rax
+; AVX2-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
+; AVX2-NEXT: mulq %rcx
+; AVX2-NEXT: vmovq %rdx, %xmm2
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: mulq %rcx
+; AVX2-NEXT: vmovq %rdx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: subq %rdx, %rcx
-; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm2
-; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: subq %rdx, %rcx
-; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm0
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: mulq %rcx
+; AVX2-NEXT: vmovq %rdx, %xmm2
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: mulq %rcx
+; AVX2-NEXT: vmovq %rdx, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlq $2, %ymm0, %ymm0
; AVX2-NEXT: retq
%res = udiv <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
ret <4 x i64> %res
@@ -91,27 +56,19 @@ define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_div7_8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
-; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [613566757,613566757,613566757,613566757]
+; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_8i32:
@@ -119,13 +76,9 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
-; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
-; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $2, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
%res = udiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
ret <8 x i32> %res
@@ -134,28 +87,16 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_div7_16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
-; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
-; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
-; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm0
+; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
; AVX2-NEXT: retq
%res = udiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
ret <16 x i16> %res
@@ -170,31 +111,17 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [37,37,37,37,37,37,37,37]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -205,30 +132,18 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
-; AVX2NOBW-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX2NOBW-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2NOBW-NEXT: retq
;
; AVX512BW-LABEL: test_div7_32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
%res = udiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
ret <32 x i8> %res
@@ -243,66 +158,44 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [147,79,171,117,205,57,32,37]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [145,20,85,185,51,113,32,37]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [187,135,205,27,57,241,16,137]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [185,97,51,107,113,15,16,17]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,0,128,0,0,0,128]
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,128,0,0,0,0,0,0]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [64,64,32,32,32,128,256,64]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [32,256,64,32,128,64,256,256]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [32,16,16,128,64,16,256,32]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [16,32,64,32,32,256,256,256]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm4
-; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [137,16,241,57,27,205,135,187]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [17,16,15,113,107,51,97,185]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [37,32,57,205,117,171,79,147]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [37,32,113,51,185,85,20,145]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,0,0,0,0,128,0]
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,0,0,0,128,0,0,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,0,0,0,0,0,128]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,256,16,64,128,16,16,32]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [256,256,256,32,32,64,32,16]
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,256,128,32,32,32,64,64]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,256,64,128,32,64,256,32]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -312,31 +205,26 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [256,256,256,256,256,256,256,128,128,256,256,256,256,256,256,256]
-; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [17,16,15,113,107,51,97,185,145,20,85,185,51,113,32,37]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [137,16,241,57,27,205,135,187,147,79,171,117,205,57,32,37]
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm4, %ymm3
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37,32,57,205,117,171,79,147,187,135,205,27,57,241,16,137]
+; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37,32,113,51,185,85,20,145,185,97,51,107,113,15,16,17]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm0, %ymm0
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,0,0,0,0,0,128,0,0,0,0,128,0,0,0,128]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,0,0,128,0,0,0,0,128,0,0,0,0,0,0]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2NOBW-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX2NOBW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [32,256,16,64,128,16,16,32,64,64,32,32,32,128,256,64]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [256,256,256,32,32,64,32,16,32,256,64,32,128,64,256,256]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,256,128,32,32,32,64,64,32,16,16,128,64,16,256,32]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,256,64,128,32,64,256,32,16,32,64,32,32,256,256,256]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2NOBW-NEXT: retq
@@ -344,13 +232,12 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX512BW-LABEL: test_divconstant_32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [37,32,57,205,117,171,79,147,137,16,241,57,27,205,135,187,187,135,205,27,57,241,16,137,147,79,171,117,205,57,32,37]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [37,32,113,51,185,85,20,145,17,16,15,113,107,51,97,185,185,97,51,107,113,15,16,17,145,20,85,185,51,113,32,37]
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [128,0,0,0,128,0,0,0,0,0,0,0,0,0,128,0,0,128,0,0,0,0,0,0,0,0,0,128,0,0,0,128]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
@@ -374,52 +261,32 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: mulq %rsi
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: addq %rdx, %rax
-; AVX1-NEXT: shrq $2, %rax
-; AVX1-NEXT: leaq (,%rax,8), %rdx
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: addq %rcx, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: leaq (,%rdx,8), %rax
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: addq %rcx, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: mulq %rsi
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: addq %rdx, %rax
-; AVX1-NEXT: shrq $2, %rax
-; AVX1-NEXT: leaq (,%rax,8), %rdx
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: addq %rcx, %rax
-; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: leaq (,%rdx,8), %rax
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: addq %rcx, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: mulq %rsi
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: addq %rdx, %rax
-; AVX1-NEXT: shrq $2, %rax
-; AVX1-NEXT: leaq (,%rax,8), %rdx
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: addq %rcx, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: leaq (,%rdx,8), %rax
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: addq %rcx, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm2
; AVX1-NEXT: vmovq %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: mulq %rsi
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: addq %rdx, %rax
-; AVX1-NEXT: shrq $2, %rax
-; AVX1-NEXT: leaq (,%rax,8), %rdx
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: addq %rcx, %rax
-; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: leaq (,%rdx,8), %rax
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: addq %rcx, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -431,52 +298,32 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: addq %rdx, %rax
-; AVX2-NEXT: shrq $2, %rax
-; AVX2-NEXT: leaq (,%rax,8), %rdx
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: leaq (,%rdx,8), %rax
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: addq %rdx, %rax
-; AVX2-NEXT: shrq $2, %rax
-; AVX2-NEXT: leaq (,%rax,8), %rdx
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vmovq %rax, %xmm1
+; AVX2-NEXT: leaq (,%rdx,8), %rax
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: addq %rdx, %rax
-; AVX2-NEXT: shrq $2, %rax
-; AVX2-NEXT: leaq (,%rax,8), %rdx
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: leaq (,%rdx,8), %rax
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: vmovq %xmm0, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: addq %rdx, %rax
-; AVX2-NEXT: shrq $2, %rax
-; AVX2-NEXT: leaq (,%rax,8), %rdx
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: leaq (,%rdx,8), %rax
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -494,10 +341,6 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3],xmm4[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm4
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
; AVX1-NEXT: vpslld $3, %xmm2, %xmm4
; AVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
@@ -506,10 +349,6 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
; AVX1-NEXT: vpslld $3, %xmm2, %xmm3
; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
@@ -524,10 +363,6 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
-; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpsrld $2, %ymm1, %ymm1
; AVX2-NEXT: vpslld $3, %ymm1, %ymm2
; AVX2-NEXT: vpsubd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
@@ -542,18 +377,10 @@ define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm4
-; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4
-; AVX1-NEXT: vpaddw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
; AVX1-NEXT: vpsllw $3, %xmm3, %xmm4
; AVX1-NEXT: vpsubw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
-; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $3, %xmm2, %xmm3
; AVX1-NEXT: vpsubw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
@@ -563,10 +390,6 @@ define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
; AVX2-LABEL: test_rem7_16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
-; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $3, %ymm1, %ymm2
; AVX2-NEXT: vpsubw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
@@ -588,18 +411,10 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm5
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm5
-; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm5
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpsllw $3, %xmm3, %xmm5
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpsllw $3, %xmm3, %xmm7
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
-; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vpsubb %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpsubb %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
@@ -608,14 +423,8 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $3, %xmm2, %xmm3
-; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -632,12 +441,6 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm2
-; AVX2NOBW-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsllw $3, %ymm1, %ymm2
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
@@ -650,12 +453,6 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm2
-; AVX512BW-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm1
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpsllw $3, %ymm1, %ymm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512BW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
@@ -674,77 +471,55 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [147,79,171,117,205,57,32,37]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [145,20,85,185,51,113,32,37]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [187,135,205,27,57,241,16,137]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [185,97,51,107,113,15,16,17]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [0,0,0,128,0,0,0,128]
-; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,128,0,0,0,0,0,0]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [64,64,32,32,32,128,256,64]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [32,256,64,32,128,64,256,256]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [32,16,16,128,64,16,256,32]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [16,32,64,32,32,256,256,256]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm5 # [22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm3, %xmm5, %xmm5
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
-; AVX1-NEXT: vpsllw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm5
-; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [137,16,241,57,27,205,135,187]
+; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm4 # [22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
+; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [17,16,15,113,107,51,97,185]
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [37,32,113,51,185,85,20,145]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpsllw $7, %xmm6, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6],xmm6[7]
-; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [37,32,57,205,117,171,79,147]
-; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT: vpackuswb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm5
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 # [0,0,0,0,0,0,128,0]
-; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [128,0,0,0,128,0,0,0]
-; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT: vpackuswb %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,256,16,64,128,16,16,32]
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [64,256,128,32,32,32,64,64]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,0,0,0,0,0,128]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0]
-; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpackuswb %xmm1, %xmm4, %xmm4
+; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [256,256,256,32,32,64,32,16]
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [256,256,64,128,32,64,256,32]
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0]
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22]
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
@@ -756,31 +531,26 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [256,256,256,256,256,256,256,128,128,256,256,256,256,256,256,256]
-; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [137,16,241,57,27,205,135,187,147,79,171,117,205,57,32,37]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [17,16,15,113,107,51,97,185,145,20,85,185,51,113,32,37]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm4, %ymm3
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37,32,57,205,117,171,79,147,187,135,205,27,57,241,16,137]
+; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37,32,113,51,185,85,20,145,185,97,51,107,113,15,16,17]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm0, %ymm3
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,128,0,0,0,0,128,0,0,0,128]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [128,0,0,0,128,0,0,0,0,128,0,0,0,0,0,0]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpackuswb %ymm4, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpaddb %ymm2, %ymm3, %ymm2
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [32,256,16,64,128,16,16,32,64,64,32,32,32,128,256,64]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [256,256,256,32,32,64,32,16,32,256,64,32,128,64,256,256]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [64,256,128,32,32,32,64,64,32,16,16,128,64,16,256,32]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,64,128,32,64,256,32,16,32,64,32,32,256,256,256]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
@@ -794,13 +564,12 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX512BW-LABEL: test_remconstant_32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [37,32,57,205,117,171,79,147,137,16,241,57,27,205,135,187,187,135,205,27,57,241,16,137,147,79,171,117,205,57,32,37]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [37,32,113,51,185,85,20,145,17,16,15,113,107,51,97,185,185,97,51,107,113,15,16,17,145,20,85,185,51,113,32,37]
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm2
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [128,0,0,0,128,0,0,0,0,0,0,0,0,0,128,0,0,128,0,0,0,0,0,0,0,0,0,128,0,0,0,128]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
index ef6129cc85889..590e1f5240366 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -10,73 +10,40 @@ define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind {
; AVX-LABEL: test_div7_8i64:
; AVX: # %bb.0:
; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm2
-; AVX-NEXT: vmovq %xmm1, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: vpextrq $1, %xmm1, %rax
+; AVX-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
+; AVX-NEXT: mulq %rcx
+; AVX-NEXT: vmovq %rdx, %xmm2
+; AVX-NEXT: vmovq %xmm1, %rax
+; AVX-NEXT: mulq %rcx
+; AVX-NEXT: vmovq %rdx, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX-NEXT: vpextrq $1, %xmm2, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm3
-; AVX-NEXT: vmovq %xmm2, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vpextrq $1, %xmm2, %rax
+; AVX-NEXT: mulq %rcx
+; AVX-NEXT: vmovq %rdx, %xmm3
+; AVX-NEXT: vmovq %xmm2, %rax
+; AVX-NEXT: mulq %rcx
+; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX-NEXT: vpextrq $1, %xmm2, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm3
-; AVX-NEXT: vmovq %xmm2, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vpextrq $1, %xmm2, %rax
+; AVX-NEXT: mulq %rcx
+; AVX-NEXT: vmovq %rdx, %xmm3
+; AVX-NEXT: vmovq %xmm2, %rax
+; AVX-NEXT: mulq %rcx
+; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm3
-; AVX-NEXT: vmovq %xmm0, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: mulq %rcx
+; AVX-NEXT: vmovq %rdx, %xmm3
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: mulq %rcx
+; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX-NEXT: vpsrlq $2, %zmm0, %zmm0
; AVX-NEXT: retq
%res = udiv <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
ret <8 x i64> %res
@@ -87,14 +54,10 @@ define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
; AVX: # %bb.0:
; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
; AVX-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
-; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; AVX-NEXT: vpmuludq %zmm1, %zmm3, %zmm1
-; AVX-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
-; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; AVX-NEXT: vpsubd %zmm3, %zmm0, %zmm0
-; AVX-NEXT: vpsrld $1, %zmm0, %zmm0
-; AVX-NEXT: vpaddd %zmm3, %zmm0, %zmm0
-; AVX-NEXT: vpsrld $2, %zmm0, %zmm0
+; AVX-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; AVX-NEXT: vpmuludq %zmm1, %zmm0, %zmm1
+; AVX-NEXT: vpmovsxbd {{.*#+}} zmm0 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
+; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm0
; AVX-NEXT: retq
%res = udiv <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
ret <16 x i32> %res
@@ -103,28 +66,16 @@ define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: test_div7_32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
-; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm1
-; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_div7_32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
-; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
; AVX512BW-NEXT: retq
%res = udiv <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
ret <32 x i16> %res
@@ -133,36 +84,24 @@ define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: test_div7_64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
-; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpackuswb %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpsubb %ymm2, %ymm0, %ymm4
-; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4
-; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
-; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
+; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpsubb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_div7_64i8:
@@ -172,16 +111,10 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; AVX512BW-NEXT: vpmullw %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
-; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512BW-NEXT: vpmullw %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%res = udiv <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
ret <64 x i8> %res
@@ -197,57 +130,43 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [256,256,64,256,256,256,256,256,128,256,256,256,256,256,256,256]
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [137,27,37,19,79,41,171,101,147,79,171,117,205,57,32,37]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [17,9,145,151,39,163,85,177,145,20,85,185,51,113,32,37]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,256,128,256,256,256,256,256,256,256]
-; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [27,111,57,235,241,249,8,9,187,135,205,27,57,241,16,137]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [107,55,113,233,15,247,8,9,185,97,51,107,113,15,16,17]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,128,0,0,0,0,0,128,0,0,0,128,0,0,0,128]
-; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [0,0,0,0,0,0,0,128,0,128,0,0,0,0,0,0]
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [0,0,128,0,128,0,0,0,128,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm2, %ymm1, %ymm2
; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [16,16,256,128,32,64,16,16,64,64,32,32,32,128,256,64]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [128,256,16,16,64,16,32,16,32,256,64,32,128,64,256,256]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [64,16,32,8,8,8,256,16,32,16,16,128,64,16,256,32]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [16,32,16,8,128,8,256,256,16,32,64,32,32,256,256,256]
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [256,256,256,256,256,256,256,128,256,256,256,256,256,256,256,256]
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [17,16,15,113,107,51,97,185,9,8,247,15,233,113,55,107]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,128,256,256,256,256,256,64,256,256]
-; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [37,32,113,51,185,85,20,145,177,85,163,39,151,145,9,17]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,128,0,128,0,0,0,0,0,0,0]
-; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,0,0,128,0,0,0,128,0,0,0,0,0,128,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,0,0,0,0,0,0,128,0,0,0,128,0,128,0,0]
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [32,256,16,64,128,16,16,32,16,256,8,8,8,32,16,64]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [256,256,256,32,32,64,32,16,256,256,8,128,8,16,32,16]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,256,128,32,32,32,64,64,16,16,64,32,128,256,16,16]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,256,64,128,32,64,256,32,16,32,16,64,16,16,256,128]
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
@@ -257,22 +176,18 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27,137,27,37,19,79,41,171,101,147,79,171,117,205,57,32,37]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [17,16,15,113,107,51,97,185,9,8,247,15,233,113,55,107,17,9,145,151,39,163,85,177,145,20,85,185,51,113,32,37]
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
-; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137,27,111,57,235,241,249,8,9,187,135,205,27,57,241,16,137]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [37,32,113,51,185,85,20,145,177,85,163,39,151,145,9,17,107,55,113,233,15,247,8,9,185,97,51,107,113,15,16,17]
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [0,0,0,0,0,0,128,0,128,0,0,0,0,0,0,0,0,128,0,0,0,0,0,128,0,0,0,128,0,0,0,128]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,128,0,0,0,128,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [128,0,0,0,128,0,0,0,128,0,0,0,0,0,128,0,0,0,0,0,0,0,0,128,0,128,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,0,0,0,0,0,0,128,0,0,0,128,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
@@ -300,105 +215,65 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm2
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vmovq %xmm1, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm1
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; AVX-NEXT: vpextrq $1, %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm3
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm3
; AVX-NEXT: vmovq %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm2
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX-NEXT: vpextrq $1, %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm3
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm3
; AVX-NEXT: vmovq %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm2
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vpextrq $1, %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm3
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm3
; AVX-NEXT: vmovq %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -416,12 +291,8 @@ define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind {
; AVX-NEXT: vpmuludq %zmm1, %zmm3, %zmm1
; AVX-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; AVX-NEXT: vpsubd %zmm3, %zmm0, %zmm1
-; AVX-NEXT: vpsrld $1, %zmm1, %zmm1
-; AVX-NEXT: vpaddd %zmm3, %zmm1, %zmm1
-; AVX-NEXT: vpsrld $2, %zmm1, %zmm1
-; AVX-NEXT: vpslld $3, %zmm1, %zmm2
-; AVX-NEXT: vpsubd %zmm2, %zmm1, %zmm1
+; AVX-NEXT: vpslld $3, %zmm3, %zmm1
+; AVX-NEXT: vpsubd %zmm1, %zmm3, %zmm1
; AVX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX-NEXT: retq
%res = urem <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
@@ -434,18 +305,10 @@ define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm4
-; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddw %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3
; AVX512F-NEXT: vpsllw $3, %ymm3, %ymm4
; AVX512F-NEXT: vpsubw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $3, %ymm2, %ymm3
; AVX512F-NEXT: vpsubw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
@@ -455,10 +318,6 @@ define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
; AVX512BW-LABEL: test_rem7_32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
-; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddw %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllw $3, %zmm1, %zmm2
; AVX512BW-NEXT: vpsubw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
@@ -480,18 +339,10 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3
-; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm5
-; AVX512F-NEXT: vpsrlw $1, %ymm5, %ymm5
-; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpsllw $3, %ymm3, %ymm5
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm3
-; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3
-; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpsllw $3, %ymm3, %ymm7
-; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
-; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm7
-; AVX512F-NEXT: vpsubb %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpsubb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
@@ -500,14 +351,8 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpsubb %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
-; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $3, %ymm2, %ymm3
-; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3
+; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -524,12 +369,6 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpmullw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllw $3, %zmm1, %zmm2
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1
@@ -549,68 +388,54 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [256,256,64,256,256,256,256,256,128,256,256,256,256,256,256,256]
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [137,27,37,19,79,41,171,101,147,79,171,117,205,57,32,37]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [17,9,145,151,39,163,85,177,145,20,85,185,51,113,32,37]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,256,128,256,256,256,256,256,256,256]
-; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [27,111,57,235,241,249,8,9,187,135,205,27,57,241,16,137]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [107,55,113,233,15,247,8,9,185,97,51,107,113,15,16,17]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15],ymm4[24],ymm1[24],ymm4[25],ymm1[25],ymm4[26],ymm1[26],ymm4[27],ymm1[27],ymm4[28],ymm1[28],ymm4[29],ymm1[29],ymm4[30],ymm1[30],ymm4[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [0,128,0,0,0,0,0,128,0,0,0,128,0,0,0,128]
-; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,0,128,0,128,0,0,0,0,0,0]
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15],ymm4[24],ymm1[24],ymm4[25],ymm1[25],ymm4[26],ymm1[26],ymm4[27],ymm1[27],ymm4[28],ymm1[28],ymm4[29],ymm1[29],ymm4[30],ymm1[30],ymm4[31],ymm1[31]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,128,0,128,0,0,0,128,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpackuswb %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm4
; AVX512F-NEXT: vpaddb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [16,16,256,128,32,64,16,16,64,64,32,32,32,128,256,64]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [128,256,16,16,64,16,32,16,32,256,64,32,128,64,256,256]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [64,16,32,8,8,8,256,16,32,16,16,128,64,16,256,32]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [16,32,16,8,128,8,256,256,16,32,64,32,32,256,256,256]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm4
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5 # [38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm5
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
-; AVX512F-NEXT: vpsllw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpor %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,128,256,256,256,256,256,256,256,256]
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 # [38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
+; AVX512F-NEXT: vpsllw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [17,16,15,113,107,51,97,185,9,8,247,15,233,113,55,107]
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [37,32,113,51,185,85,20,145,177,85,163,39,151,145,9,17]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27]
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm4
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,0,128,0,0,0,128,0,128,0,0]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [256,256,256,256,256,256,256,128,256,256,256,256,256,64,256,256]
-; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137]
-; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpsubb %ymm4, %ymm0, %ymm5
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11],ymm5[12],ymm1[12],ymm5[13],ymm1[13],ymm5[14],ymm1[14],ymm5[15],ymm1[15],ymm5[24],ymm1[24],ymm5[25],ymm1[25],ymm5[26],ymm1[26],ymm5[27],ymm1[27],ymm5[28],ymm1[28],ymm5[29],ymm1[29],ymm5[30],ymm1[30],ymm5[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 # [0,0,0,0,0,0,128,0,128,0,0,0,0,0,0,0]
-; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[4],ymm1[4],ymm5[5],ymm1[5],ymm5[6],ymm1[6],ymm5[7],ymm1[7],ymm5[16],ymm1[16],ymm5[17],ymm1[17],ymm5[18],ymm1[18],ymm5[19],ymm1[19],ymm5[20],ymm1[20],ymm5[21],ymm1[21],ymm5[22],ymm1[22],ymm5[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [128,0,0,0,128,0,0,0,128,0,0,0,0,0,128,0]
-; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vpackuswb %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT: vpaddb %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15],ymm4[24],ymm1[24],ymm4[25],ymm1[25],ymm4[26],ymm1[26],ymm4[27],ymm1[27],ymm4[28],ymm1[28],ymm4[29],ymm1[29],ymm4[30],ymm1[30],ymm4[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [32,256,16,64,128,16,16,32,16,256,8,8,8,32,16,64]
-; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [64,256,128,32,32,32,64,64,16,16,64,32,128,256,16,16]
+; AVX512F-NEXT: vpackuswb %ymm1, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,32,32,64,32,16,256,256,8,128,8,16,32,16]
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,64,128,32,64,256,32,16,32,16,64,16,16,256,128]
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0]
-; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0]
+; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38]
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
@@ -622,22 +447,18 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27,137,27,37,19,79,41,171,101,147,79,171,117,205,57,32,37]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [17,16,15,113,107,51,97,185,9,8,247,15,233,113,55,107,17,9,145,151,39,163,85,177,145,20,85,185,51,113,32,37]
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
-; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137,27,111,57,235,241,249,8,9,187,135,205,27,57,241,16,137]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [37,32,113,51,185,85,20,145,177,85,163,39,151,145,9,17,107,55,113,233,15,247,8,9,185,97,51,107,113,15,16,17]
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm3
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm3[8],zmm1[8],zmm3[9],zmm1[9],zmm3[10],zmm1[10],zmm3[11],zmm1[11],zmm3[12],zmm1[12],zmm3[13],zmm1[13],zmm3[14],zmm1[14],zmm3[15],zmm1[15],zmm3[24],zmm1[24],zmm3[25],zmm1[25],zmm3[26],zmm1[26],zmm3[27],zmm1[27],zmm3[28],zmm1[28],zmm3[29],zmm1[29],zmm3[30],zmm1[30],zmm3[31],zmm1[31],zmm3[40],zmm1[40],zmm3[41],zmm1[41],zmm3[42],zmm1[42],zmm3[43],zmm1[43],zmm3[44],zmm1[44],zmm3[45],zmm1[45],zmm3[46],zmm1[46],zmm3[47],zmm1[47],zmm3[56],zmm1[56],zmm3[57],zmm1[57],zmm3[58],zmm1[58],zmm3[59],zmm1[59],zmm3[60],zmm1[60],zmm3[61],zmm1[61],zmm3[62],zmm1[62],zmm3[63],zmm1[63]
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm4 # [0,0,0,0,0,0,128,0,128,0,0,0,0,0,0,0,0,128,0,0,0,0,0,128,0,0,0,128,0,0,0,128]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm4 # [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,128,0,0,0,128,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm4, %zmm4
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0],zmm1[0],zmm3[1],zmm1[1],zmm3[2],zmm1[2],zmm3[3],zmm1[3],zmm3[4],zmm1[4],zmm3[5],zmm1[5],zmm3[6],zmm1[6],zmm3[7],zmm1[7],zmm3[16],zmm1[16],zmm3[17],zmm1[17],zmm3[18],zmm1[18],zmm3[19],zmm1[19],zmm3[20],zmm1[20],zmm3[21],zmm1[21],zmm3[22],zmm1[22],zmm3[23],zmm1[23],zmm3[32],zmm1[32],zmm3[33],zmm1[33],zmm3[34],zmm1[34],zmm3[35],zmm1[35],zmm3[36],zmm1[36],zmm3[37],zmm1[37],zmm3[38],zmm1[38],zmm3[39],zmm1[39],zmm3[48],zmm1[48],zmm3[49],zmm1[49],zmm3[50],zmm1[50],zmm3[51],zmm1[51],zmm3[52],zmm1[52],zmm3[53],zmm1[53],zmm3[54],zmm1[54],zmm3[55],zmm1[55]
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [128,0,0,0,128,0,0,0,128,0,0,0,0,0,128,0,0,0,0,0,0,0,0,128,0,128,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [0,0,0,0,0,0,0,128,0,0,0,128,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512BW-NEXT: vpackuswb %zmm4, %zmm3, %zmm3
; AVX512BW-NEXT: vpaddb %zmm2, %zmm3, %zmm2
diff --git a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll
index 10a840218c864..27c7cd0c9be97 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll
@@ -7,17 +7,12 @@ define void @test_udiv7_v2i32(ptr %x, ptr %y) nounwind {
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X64-NEXT: pmuludq %xmm1, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; X64-NEXT: pmuludq %xmm1, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; X64-NEXT: pmuludq %xmm1, %xmm3
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-NEXT: psubd %xmm2, %xmm0
-; X64-NEXT: psrld $1, %xmm0
-; X64-NEXT: paddd %xmm2, %xmm0
-; X64-NEXT: psrld $2, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: movq %xmm0, (%rsi)
; X64-NEXT: retq
;
@@ -30,16 +25,11 @@ define void @test_udiv7_v2i32(ptr %x, ptr %y) nounwind {
; X86-NEXT: movdqa %xmm0, %xmm2
; X86-NEXT: pmuludq %xmm1, %xmm2
; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X86-NEXT: movdqa %xmm0, %xmm3
-; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
-; X86-NEXT: pmuludq %xmm1, %xmm3
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-NEXT: psubd %xmm2, %xmm0
-; X86-NEXT: psrld $1, %xmm0
-; X86-NEXT: paddd %xmm2, %xmm0
-; X86-NEXT: psrld $2, %xmm0
-; X86-NEXT: movq %xmm0, (%eax)
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT: pmuludq %xmm1, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X86-NEXT: movq %xmm2, (%eax)
; X86-NEXT: retl
%a = load <2 x i32>, ptr %x
%b = udiv <2 x i32> %a, <i32 7, i32 7>
@@ -59,16 +49,11 @@ define void @test_urem7_v2i32(ptr %x, ptr %y) nounwind {
; X64-NEXT: pmuludq %xmm1, %xmm3
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-NEXT: movdqa %xmm0, %xmm1
-; X64-NEXT: psubd %xmm2, %xmm1
-; X64-NEXT: psrld $1, %xmm1
-; X64-NEXT: paddd %xmm2, %xmm1
-; X64-NEXT: psrld $2, %xmm1
-; X64-NEXT: movdqa %xmm1, %xmm2
-; X64-NEXT: pslld $3, %xmm2
-; X64-NEXT: psubd %xmm2, %xmm1
-; X64-NEXT: paddd %xmm0, %xmm1
-; X64-NEXT: movq %xmm1, (%rsi)
+; X64-NEXT: movdqa %xmm2, %xmm1
+; X64-NEXT: pslld $3, %xmm1
+; X64-NEXT: psubd %xmm1, %xmm2
+; X64-NEXT: paddd %xmm0, %xmm2
+; X64-NEXT: movq %xmm2, (%rsi)
; X64-NEXT: retq
;
; X86-LABEL: test_urem7_v2i32:
@@ -85,16 +70,11 @@ define void @test_urem7_v2i32(ptr %x, ptr %y) nounwind {
; X86-NEXT: pmuludq %xmm1, %xmm3
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-NEXT: movdqa %xmm0, %xmm1
-; X86-NEXT: psubd %xmm2, %xmm1
-; X86-NEXT: psrld $1, %xmm1
-; X86-NEXT: paddd %xmm2, %xmm1
-; X86-NEXT: psrld $2, %xmm1
-; X86-NEXT: movdqa %xmm1, %xmm2
-; X86-NEXT: pslld $3, %xmm2
-; X86-NEXT: psubd %xmm2, %xmm1
-; X86-NEXT: paddd %xmm0, %xmm1
-; X86-NEXT: movq %xmm1, (%eax)
+; X86-NEXT: movdqa %xmm2, %xmm1
+; X86-NEXT: pslld $3, %xmm1
+; X86-NEXT: psubd %xmm1, %xmm2
+; X86-NEXT: paddd %xmm0, %xmm2
+; X86-NEXT: movq %xmm2, (%eax)
; X86-NEXT: retl
%a = load <2 x i32>, ptr %x
%b = urem <2 x i32> %a, <i32 7, i32 7>
diff --git a/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll b/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll
index 6007c4f0b0231..f12f525fd5995 100644
--- a/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll
@@ -8,21 +8,19 @@
define <8 x i8> @vshli_target_constant(<8 x i16> %arg, <8 x i32> %arg1) {
; CHECK-LABEL: vshli_target_constant:
; CHECK: # %bb.0: # %bb
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2863311531,2863311531,2863311531,2863311531]
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [1431655765,1431655765,1431655765,1431655765]
; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; CHECK-NEXT: pmuludq %xmm0, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; CHECK-NEXT: pmuludq %xmm0, %xmm3
; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; CHECK-NEXT: psrld $1, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; CHECK-NEXT: pmuludq %xmm0, %xmm2
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; CHECK-NEXT: pmuludq %xmm0, %xmm3
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; CHECK-NEXT: psrld $1, %xmm2
; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; CHECK-NEXT: pand %xmm3, %xmm2
; CHECK-NEXT: pand %xmm3, %xmm1
diff --git a/llvm/test/CodeGen/X86/x86_64-mul-by-const.ll b/llvm/test/CodeGen/X86/x86_64-mul-by-const.ll
index df48a29156caa..5cb0e7e08ea6d 100644
--- a/llvm/test/CodeGen/X86/x86_64-mul-by-const.ll
+++ b/llvm/test/CodeGen/X86/x86_64-mul-by-const.ll
@@ -1,9 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
; Formerly there were two shifts. rdar://8771012.
define i32 @f9188_mul365384439_shift27(i32 %A) nounwind {
-; CHECK: imulq $365384439,
-; CHECK: shrq $59, %rax
+; CHECK-LABEL: f9188_mul365384439_shift27:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: imulq $1461537755, %rax, %rax ## imm = 0x571D4BDB
+; CHECK-NEXT: shrq $61, %rax
+; CHECK-NEXT: ## kill: def $eax killed $eax killed $rax
+; CHECK-NEXT: retq
%tmp1 = udiv i32 %A, 1577682821 ; <i32> [#uses=1]
ret i32 %tmp1
}
diff --git a/llvm/unittests/Support/DivisionByConstantTest.cpp b/llvm/unittests/Support/DivisionByConstantTest.cpp
index 2b17f98bb75b2..260899d92c8fc 100644
--- a/llvm/unittests/Support/DivisionByConstantTest.cpp
+++ b/llvm/unittests/Support/DivisionByConstantTest.cpp
@@ -96,8 +96,7 @@ APInt MULHU(APInt X, APInt Y) {
}
APInt UnsignedDivideUsingMagic(const APInt &Numerator, const APInt &Divisor,
- bool LZOptimization,
- bool AllowEvenDivisorOptimization, bool ForceNPQ,
+ bool LZOptimization, bool ForceNPQ,
UnsignedDivisionByConstantInfo Magics) {
assert(!Divisor.isOne() && "Division by 1 is not supported using Magic.");
@@ -108,8 +107,7 @@ APInt UnsignedDivideUsingMagic(const APInt &Numerator, const APInt &Divisor,
// Clip to the number of leading zeros in the divisor.
LeadingZeros = std::min(LeadingZeros, Divisor.countl_zero());
if (LeadingZeros > 0) {
- Magics = UnsignedDivisionByConstantInfo::get(
- Divisor, LeadingZeros, AllowEvenDivisorOptimization);
+ Magics = UnsignedDivisionByConstantInfo::get(Divisor, LeadingZeros);
assert(!Magics.IsAdd && "Should use cheap fixup now");
}
}
@@ -166,21 +164,17 @@ TEST(UnsignedDivisionByConstantTest, Test) {
EnumerateAPInts(Bits, [Divisor, Magics, Bits](const APInt &Numerator) {
APInt NativeResult = Numerator.udiv(Divisor);
for (bool LZOptimization : {true, false}) {
- for (bool AllowEvenDivisorOptimization : {true, false}) {
- for (bool ForceNPQ : {false, true}) {
- APInt MagicResult = UnsignedDivideUsingMagic(
- Numerator, Divisor, LZOptimization,
- AllowEvenDivisorOptimization, ForceNPQ, Magics);
- ASSERT_EQ(MagicResult, NativeResult)
- << " ... given the operation: urem i" << Bits << " "
- << Numerator << ", " << Divisor
- << " (allow LZ optimization = "
- << LZOptimization << ", allow even divisior optimization = "
- << AllowEvenDivisorOptimization << ", force NPQ = "
- << ForceNPQ << ")";
- }
+ for (bool ForceNPQ : {false, true}) {
+ APInt MagicResult = UnsignedDivideUsingMagic(
+ Numerator, Divisor, LZOptimization, ForceNPQ, Magics);
+ ASSERT_EQ(MagicResult, NativeResult)
+ << " ... given the operation: urem i" << Bits << " "
+ << Numerator << ", " << Divisor
+ << " (allow LZ optimization = " << LZOptimization
+ << ", force NPQ = " << ForceNPQ << ")";
}
}
+ }
});
});
}
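
As a refresher for anyone skimming the CHECK-line churn above: the removed sequences are the classic round-up-magic fixup for "uncooperative" divisors (subtract, shift by 1, add high part back, shift by 2), while the new output drops that fixup and keeps only a multiply-high plus a short remainder tail. The general round-down idea is to pick the magic by rounding 2^(N+s)/d down instead of up and to compensate with a cheap increment of the dividend. Below is a minimal, self-contained C++ sketch of that idea for 32-bit division by 7; the constants are derived here purely for illustration and are not the ones this patch emits for the i64/vector cases in the tests.

// Sketch only: round-down magic for unsigned 32-bit division by 7.
// The smallest exact round-up magic for d=7 needs 33 bits, which is what
// forces the sub/shr/add/shr fixup removed in the CHECK lines above.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Round-down: M = floor(2^(32+s) / d), q = (M * (n + 1)) >> (32 + s).
// For d = 7 and s = 2, M = floor(2^34 / 7) = 2454267026 fits in 32 bits,
// so the quotient is a single widening multiply plus one shift.
uint32_t udiv7_round_down(uint32_t n) {
  const uint64_t M = 2454267026u; // floor(2^34 / 7)
  return (uint32_t)((M * ((uint64_t)n + 1)) >> 34);
}

uint32_t urem7_round_down(uint32_t n) {
  uint32_t q = udiv7_round_down(n);
  return n - 7 * q; // often lowered as n + (q - 8*q), like the lea/sub/add tail above
}

int main() {
  // Spot-check edges and a spread of values against the native operators.
  for (uint64_t n : {0ull, 1ull, 6ull, 7ull, 123456789ull, 0x80000000ull,
                     0xFFFFFFFEull, 0xFFFFFFFFull}) {
    assert(udiv7_round_down((uint32_t)n) == n / 7);
    assert(urem7_round_down((uint32_t)n) == n % 7);
  }
  printf("ok\n");
}

Once the quotient is a single multiply-high, the remainder is just n - 7*q, which matches the shift-by-3/subtract/add tail that survives in the new AVX and SSE output above.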