[llvm] [CodeGen] Use round-down algorithm for uncooperative constants (PR #99666)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 19 10:05:28 PDT 2024
https://github.com/AtariDreams updated https://github.com/llvm/llvm-project/pull/99666
>From 638f1c73738a3e4874e4ebd47f40501b52ee2676 Mon Sep 17 00:00:00 2001
From: Rose <gfunni234 at gmail.com>
Date: Thu, 18 Jul 2024 12:23:13 -0400
Subject: [PATCH 1/3] [CodeGen] Use round-down algorithm for uncooperative
constants
This is inspired by the round-down unsigned-division algorithm described at
https://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html
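
For context (not part of the patch itself): the usual magic number for an
N-bit divisor d is ceil(2^(N+k)/d), which for "uncooperative" divisors needs
N+1 bits. The round-down approach instead uses floor(2^(N+k)/d) and
compensates by effectively dividing n+1 rather than n. Below is a minimal
standalone sketch of that identity; the 8-bit width, the divisor 7, and the
hand-picked shift are purely illustrative and are not taken from this patch.

  // Illustration of the round-down magic-number identity; this is not the
  // code the patch emits.
  #include <cassert>
  #include <cstdint>
  #include <cstdio>

  int main() {
    const unsigned N = 8;               // bit width of the dividend
    const uint64_t D = 7;               // divisor (uncooperative at 8 bits)
    const unsigned K = 1;               // smallest shift with 2^(N+K) % D <= 2^K
    const uint64_t Magic = (1ull << (N + K)) / D; // floor(2^(N+K)/D) == 73

    // Round-down identity: n / D == (Magic * (n + 1)) >> (N + K).
    // Real codegen must guard n + 1 against overflow (e.g. via a saturating
    // increment); 64-bit arithmetic sidesteps that here.
    for (uint64_t n = 0; n < (1ull << N); ++n) {
      uint64_t q = (Magic * (n + 1)) >> (N + K);
      assert(q == n / D);
    }
    printf("magic %llu, shift %u divides all 8-bit values by %llu\n",
           (unsigned long long)Magic, N + K, (unsigned long long)D);
  }

The blog post gives the condition under which such a shift exists and shows
how the increment is handled without overflow in generated code.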
---
.../llvm/Support/DivisionByConstantInfo.h | 5 +-
llvm/lib/Support/DivisionByConstantInfo.cpp | 99 +-
.../AArch64/GlobalISel/combine-udiv.ll | 100 +-
.../AArch64/GlobalISel/combine-udiv.mir | 185 ++--
...izer-combiner-divrem-insertpt-conflict.mir | 6 +-
.../prelegalizercombiner-trivial-arith.mir | 34 +-
.../CodeGen/AArch64/arm64-neon-mul-div-cte.ll | 20 +-
llvm/test/CodeGen/AArch64/rotate-extract.ll | 15 +-
llvm/test/CodeGen/AArch64/sve-expand-div.ll | 17 +-
...sve-streaming-mode-fixed-length-int-div.ll | 944 ++++++++----------
llvm/test/CodeGen/AArch64/urem-lkk.ll | 38 +-
.../CodeGen/AArch64/urem-seteq-vec-splat.ll | 13 +-
llvm/test/CodeGen/AArch64/urem-vector-lkk.ll | 142 ++-
.../CodeGen/AMDGPU/GlobalISel/udiv.i32.ll | 39 +-
.../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll | 90 +-
.../AMDGPU/amdgpu-codegenprepare-idiv.ll | 36 +-
.../CodeGen/AMDGPU/combine-reg-or-const.ll | 4 +-
llvm/test/CodeGen/AMDGPU/div_i128.ll | 357 ++++++-
llvm/test/CodeGen/AMDGPU/udiv.ll | 145 ++-
llvm/test/CodeGen/AMDGPU/urem.ll | 1 -
.../PowerPC/loop-instr-form-prepare.ll | 6 +-
llvm/test/CodeGen/PowerPC/urem-lkk.ll | 16 +-
llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll | 222 ++--
llvm/test/CodeGen/RISCV/div-by-constant.ll | 241 +++--
llvm/test/CodeGen/RISCV/div.ll | 62 +-
llvm/test/CodeGen/RISCV/pr51206.ll | 5 +-
llvm/test/CodeGen/RISCV/rv64-legal-i32/div.ll | 20 +-
.../rvv/fixed-vectors-buildvec-of-binop.ll | 18 +-
.../RISCV/rvv/fixed-vectors-extract.ll | 26 +-
.../CodeGen/RISCV/rvv/fixed-vectors-int.ll | 371 +++----
llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode.ll | 156 ++-
llvm/test/CodeGen/RISCV/rvv/vremu-sdnode.ll | 150 +--
llvm/test/CodeGen/RISCV/select.ll | 49 +-
.../CodeGen/RISCV/split-udiv-by-constant.ll | 401 ++++----
.../CodeGen/RISCV/split-urem-by-constant.ll | 142 ++-
llvm/test/CodeGen/RISCV/urem-lkk.ll | 45 +-
llvm/test/CodeGen/RISCV/urem-vector-lkk.ll | 30 +-
llvm/test/CodeGen/SystemZ/int-div-06.ll | 62 +-
llvm/test/CodeGen/SystemZ/int-mul-13.ll | 6 +-
.../test/CodeGen/Thumb2/mve-blockplacement.ll | 84 +-
llvm/test/CodeGen/Thumb2/thumb2-select.ll | 119 ++-
llvm/test/CodeGen/VE/Scalar/div.ll | 126 ++-
llvm/test/CodeGen/VE/Scalar/rem.ll | 126 ++-
llvm/test/CodeGen/VE/Vector/vec_divrem.ll | 34 +-
llvm/test/CodeGen/X86/and-encoding.ll | 9 +-
llvm/test/CodeGen/X86/atomic-unordered.ll | 15 +-
llvm/test/CodeGen/X86/bug80500.ll | 3 +-
llvm/test/CodeGen/X86/combine-pmuldq.ll | 100 +-
llvm/test/CodeGen/X86/combine-udiv.ll | 178 ++--
llvm/test/CodeGen/X86/divide-by-constant.ll | 300 +++---
llvm/test/CodeGen/X86/divmod128.ll | 240 ++---
llvm/test/CodeGen/X86/divrem-by-select.ll | 9 +-
llvm/test/CodeGen/X86/freeze.ll | 9 +-
llvm/test/CodeGen/X86/known-bits.ll | 14 +-
llvm/test/CodeGen/X86/known-pow2.ll | 30 +-
.../test/CodeGen/X86/load-scalar-as-vector.ll | 30 +-
...of-two-or-zero-when-comparing-with-zero.ll | 12 +-
llvm/test/CodeGen/X86/pr35636.ll | 8 +-
llvm/test/CodeGen/X86/pr38217.ll | 4 +-
.../CodeGen/X86/prefer-avx256-wide-mul.ll | 42 +-
llvm/test/CodeGen/X86/rem.ll | 3 +-
.../test/CodeGen/X86/rotate-extract-vector.ll | 18 +-
llvm/test/CodeGen/X86/rotate-extract.ll | 40 +-
llvm/test/CodeGen/X86/urem-i8-constant.ll | 4 +-
llvm/test/CodeGen/X86/urem-lkk.ll | 35 +-
llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll | 90 +-
llvm/test/CodeGen/X86/urem-vector-lkk.ll | 283 +++---
llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll | 590 ++++-------
llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll | 567 ++++-------
llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll | 467 +++------
llvm/test/CodeGen/X86/vector-idiv-v2i32.ll | 60 +-
.../X86/vshli-simplify-demanded-bits.ll | 4 +-
llvm/test/CodeGen/X86/x86_64-mul-by-const.ll | 10 +-
.../Support/DivisionByConstantTest.cpp | 28 +-
74 files changed, 3751 insertions(+), 4258 deletions(-)
diff --git a/llvm/include/llvm/Support/DivisionByConstantInfo.h b/llvm/include/llvm/Support/DivisionByConstantInfo.h
index caa0b35e71447..fb0c1382ce821 100644
--- a/llvm/include/llvm/Support/DivisionByConstantInfo.h
+++ b/llvm/include/llvm/Support/DivisionByConstantInfo.h
@@ -26,9 +26,8 @@ struct SignedDivisionByConstantInfo {
/// Magic data for optimising unsigned division by a constant.
struct UnsignedDivisionByConstantInfo {
- static UnsignedDivisionByConstantInfo
- get(const APInt &D, unsigned LeadingZeros = 0,
- bool AllowEvenDivisorOptimization = true);
+ static UnsignedDivisionByConstantInfo get(const APInt &D,
+ unsigned LeadingZeros = 0);
APInt Magic; ///< magic number
bool IsAdd; ///< add indicator
unsigned PostShift; ///< post-shift amount
diff --git a/llvm/lib/Support/DivisionByConstantInfo.cpp b/llvm/lib/Support/DivisionByConstantInfo.cpp
index b0e503003a680..3af537cc39686 100644
--- a/llvm/lib/Support/DivisionByConstantInfo.cpp
+++ b/llvm/lib/Support/DivisionByConstantInfo.cpp
@@ -71,85 +71,70 @@ SignedDivisionByConstantInfo SignedDivisionByConstantInfo::get(const APInt &D) {
/// LeadingZeros can be used to simplify the calculation if the upper bits
/// of the divided value are known zero.
UnsignedDivisionByConstantInfo
-UnsignedDivisionByConstantInfo::get(const APInt &D, unsigned LeadingZeros,
- bool AllowEvenDivisorOptimization) {
+UnsignedDivisionByConstantInfo::get(const APInt &D, unsigned LeadingZeros) {
assert(!D.isZero() && !D.isOne() && "Precondition violation.");
assert(D.getBitWidth() > 1 && "Does not work at smaller bitwidths.");
- APInt Delta;
struct UnsignedDivisionByConstantInfo Retval;
- Retval.IsAdd = false; // initialize "add" indicator
- APInt AllOnes =
- APInt::getLowBitsSet(D.getBitWidth(), D.getBitWidth() - LeadingZeros);
- APInt SignedMin = APInt::getSignedMinValue(D.getBitWidth());
APInt SignedMax = APInt::getSignedMaxValue(D.getBitWidth());
// Calculate NC, the largest dividend such that NC.urem(D) == D-1.
- APInt NC = AllOnes - (AllOnes + 1 - D).urem(D);
- assert(NC.urem(D) == D - 1 && "Unexpected NC value");
- unsigned P = D.getBitWidth() - 1; // initialize P
- APInt Q1, R1, Q2, R2;
- // initialize Q1 = 2P/NC; R1 = rem(2P,NC)
- APInt::udivrem(SignedMin, NC, Q1, R1);
- // initialize Q2 = (2P-1)/D; R2 = rem((2P-1),D)
+ APInt Q2, R2;
+  // initialize Q2 = (2P-1)/D; R2 = rem((2P-1),D)
APInt::udivrem(SignedMax, D, Q2, R2);
- do {
- P = P + 1;
- if (R1.uge(NC - R1)) {
- // update Q1
- Q1 <<= 1;
- ++Q1;
- // update R1
- R1 <<= 1;
- R1 -= NC;
- } else {
- Q1 <<= 1; // update Q1
- R1 <<= 1; // update R1
- }
- if ((R2 + 1).uge(D - R2)) {
- if (Q2.uge(SignedMax))
- Retval.IsAdd = true;
- // update Q2
+
+  APInt DownMultiplier = APInt::getZero(D.getBitWidth());
+  unsigned DownExponent = 0;
+  bool HasMagicDown = false;
+
+ unsigned Log2D = D.ceilLogBase2();
+ unsigned Exponent = 0;
+
+ for (;; Exponent++) {
+ if (R2.uge(D - R2)) {
Q2 <<= 1;
++Q2;
- // update R2
R2 <<= 1;
- ++R2;
R2 -= D;
} else {
- if (Q2.uge(SignedMin))
- Retval.IsAdd = true;
- // update Q2
Q2 <<= 1;
- // update R2
R2 <<= 1;
- ++R2;
}
- // Delta = D - 1 - R2
- Delta = D;
- --Delta;
- Delta -= R2;
- } while (P < D.getBitWidth() * 2 &&
- (Q1.ult(Delta) || (Q1 == Delta && R1.isZero())));
- if (Retval.IsAdd && !D[0] && AllowEvenDivisorOptimization) {
+ APInt Ule = APInt::getOneBitSet(D.getBitWidth(), Exponent + LeadingZeros);
+
+ if (Exponent + LeadingZeros >= Log2D || (D - R2).ule(Ule))
+ break;
+
+    // Record the first exponent that satisfies the bound for the round-down
+    // algorithm, along with its multiplier.
+    if (!HasMagicDown && R2.ule(Ule)) {
+      HasMagicDown = true;
+      DownMultiplier = Q2;
+      DownExponent = Exponent;
+ }
+ }
+
+ if (Exponent < Log2D) {
+    // Use the round-up magic number directly; no pre-shift or add fixup needed.
+ Retval.Magic = Q2 + 1;
+ Retval.PreShift = 0;
+ Retval.PostShift = Exponent;
+ Retval.IsAdd = false;
+  } else if (!D[0]) {
+    // Use the round-down multiplier and signal the add-based expansion via IsAdd.
+    Retval.Magic = DownMultiplier;
+    Retval.PreShift = 0;
+    Retval.PostShift = DownExponent;
+    Retval.IsAdd = true;
+ } else {
unsigned PreShift = D.countr_zero();
APInt ShiftedD = D.lshr(PreShift);
- Retval =
- UnsignedDivisionByConstantInfo::get(ShiftedD, LeadingZeros + PreShift);
+ Retval = UnsignedDivisionByConstantInfo::get(
+ ShiftedD, D.getBitWidth() - LeadingZeros - PreShift);
assert(Retval.IsAdd == 0 && Retval.PreShift == 0);
Retval.PreShift = PreShift;
- return Retval;
}
- Retval.Magic = std::move(Q2); // resulting magic number
- ++Retval.Magic;
- Retval.PostShift = P - D.getBitWidth(); // resulting shift
- // Reduce shift amount for IsAdd.
- if (Retval.IsAdd) {
- assert(Retval.PostShift > 0 && "Unexpected shift");
- Retval.PostShift -= 1;
- }
- Retval.PreShift = 0;
return Retval;
}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
index d465e0237201b..2269f2ae3815e 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
@@ -6,14 +6,12 @@
define <8 x i16> @combine_vec_udiv_uniform(<8 x i16> %x) {
; SDAG-LABEL: combine_vec_udiv_uniform:
; SDAG: // %bb.0:
-; SDAG-NEXT: mov w8, #25645 // =0x642d
+; SDAG-NEXT: mov w8, #45589 // =0xb215
; SDAG-NEXT: dup v1.8h, w8
; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
-; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h
-; SDAG-NEXT: usra v1.8h, v0.8h, #1
-; SDAG-NEXT: ushr v0.8h, v1.8h, #4
+; SDAG-NEXT: umull v0.4s, v0.4h, v1.4h
+; SDAG-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; SDAG-NEXT: ushr v0.8h, v0.8h, #4
; SDAG-NEXT: ret
;
; GISEL-LABEL: combine_vec_udiv_uniform:
@@ -21,11 +19,9 @@ define <8 x i16> @combine_vec_udiv_uniform(<8 x i16> %x) {
; GISEL-NEXT: adrp x8, .LCPI0_0
; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
-; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; GISEL-NEXT: sub v0.8h, v0.8h, v1.8h
-; GISEL-NEXT: usra v1.8h, v0.8h, #1
-; GISEL-NEXT: ushr v0.8h, v1.8h, #4
+; GISEL-NEXT: umull v0.4s, v0.4h, v1.4h
+; GISEL-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; GISEL-NEXT: ushr v0.8h, v0.8h, #4
; GISEL-NEXT: ret
%1 = udiv <8 x i16> %x, <i16 23, i16 23, i16 23, i16 23, i16 23, i16 23, i16 23, i16 23>
ret <8 x i16> %1
@@ -37,37 +33,30 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
; SDAG-NEXT: adrp x8, .LCPI1_0
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
; SDAG-NEXT: adrp x8, .LCPI1_1
+; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
+; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
+; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI1_1]
; SDAG-NEXT: adrp x8, .LCPI1_2
-; SDAG-NEXT: ushl v1.8h, v0.8h, v1.8h
-; SDAG-NEXT: umull2 v3.4s, v1.8h, v2.8h
-; SDAG-NEXT: umull v1.4s, v1.4h, v2.4h
-; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI1_2]
-; SDAG-NEXT: adrp x8, .LCPI1_3
-; SDAG-NEXT: uzp2 v1.8h, v1.8h, v3.8h
; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h
; SDAG-NEXT: umull2 v3.4s, v0.8h, v2.8h
; SDAG-NEXT: umull v0.4s, v0.4h, v2.4h
; SDAG-NEXT: uzp2 v0.8h, v0.8h, v3.8h
; SDAG-NEXT: add v0.8h, v0.8h, v1.8h
-; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI1_3]
+; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI1_2]
; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h
; SDAG-NEXT: ret
;
; GISEL-LABEL: combine_vec_udiv_nonuniform:
; GISEL: // %bb.0:
-; GISEL-NEXT: adrp x8, .LCPI1_3
-; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI1_3]
; GISEL-NEXT: adrp x8, .LCPI1_2
-; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI1_2]
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI1_2]
; GISEL-NEXT: adrp x8, .LCPI1_1
-; GISEL-NEXT: neg v1.8h, v1.8h
-; GISEL-NEXT: ushl v1.8h, v0.8h, v1.8h
-; GISEL-NEXT: umull2 v3.4s, v1.8h, v2.8h
-; GISEL-NEXT: umull v1.4s, v1.4h, v2.4h
+; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
+; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
+; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI1_1]
; GISEL-NEXT: adrp x8, .LCPI1_0
-; GISEL-NEXT: uzp2 v1.8h, v1.8h, v3.8h
; GISEL-NEXT: sub v0.8h, v0.8h, v1.8h
; GISEL-NEXT: umull2 v3.4s, v0.8h, v2.8h
; GISEL-NEXT: umull v0.4s, v0.4h, v2.4h
@@ -87,13 +76,17 @@ define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
; SDAG-NEXT: adrp x8, .LCPI2_0
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
; SDAG-NEXT: adrp x8, .LCPI2_1
-; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h
-; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI2_1]
-; SDAG-NEXT: adrp x8, .LCPI2_2
; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; SDAG-NEXT: umull v0.4s, v0.4h, v1.4h
+; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
+; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
+; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI2_1]
+; SDAG-NEXT: adrp x8, .LCPI2_2
+; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h
+; SDAG-NEXT: umull2 v3.4s, v0.8h, v2.8h
+; SDAG-NEXT: umull v0.4s, v0.4h, v2.4h
+; SDAG-NEXT: uzp2 v0.8h, v0.8h, v3.8h
+; SDAG-NEXT: add v0.8h, v0.8h, v1.8h
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI2_2]
-; SDAG-NEXT: uzp2 v0.8h, v0.8h, v2.8h
; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h
; SDAG-NEXT: ret
;
@@ -102,15 +95,18 @@ define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
; GISEL-NEXT: adrp x8, .LCPI2_2
; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI2_2]
; GISEL-NEXT: adrp x8, .LCPI2_1
-; GISEL-NEXT: neg v1.8h, v1.8h
-; GISEL-NEXT: ushl v0.8h, v0.8h, v1.8h
-; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI2_1]
-; GISEL-NEXT: adrp x8, .LCPI2_0
; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; GISEL-NEXT: umull v0.4s, v0.4h, v1.4h
-; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
-; GISEL-NEXT: neg v1.8h, v1.8h
-; GISEL-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
+; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
+; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI2_1]
+; GISEL-NEXT: adrp x8, .LCPI2_0
+; GISEL-NEXT: sub v0.8h, v0.8h, v1.8h
+; GISEL-NEXT: umull2 v3.4s, v0.8h, v2.8h
+; GISEL-NEXT: umull v0.4s, v0.4h, v2.4h
+; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI2_0]
+; GISEL-NEXT: uzp2 v0.8h, v0.8h, v3.8h
+; GISEL-NEXT: add v0.8h, v0.8h, v1.8h
+; GISEL-NEXT: neg v1.8h, v2.8h
; GISEL-NEXT: ushl v0.8h, v0.8h, v1.8h
; GISEL-NEXT: ret
%1 = udiv <8 x i16> %x, <i16 -34, i16 35, i16 36, i16 -37, i16 38, i16 -39, i16 40, i16 -41>
@@ -124,12 +120,10 @@ define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
; SDAG-NEXT: adrp x8, .LCPI3_1
; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
-; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h
-; SDAG-NEXT: usra v1.8h, v0.8h, #1
-; SDAG-NEXT: ldr q0, [x8, :lo12:.LCPI3_1]
-; SDAG-NEXT: ushl v0.8h, v1.8h, v0.8h
+; SDAG-NEXT: umull v0.4s, v0.4h, v1.4h
+; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI3_1]
+; SDAG-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h
; SDAG-NEXT: ret
;
; GISEL-LABEL: combine_vec_udiv_nonuniform3:
@@ -138,13 +132,11 @@ define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI3_1]
; GISEL-NEXT: adrp x8, .LCPI3_0
; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
-; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI3_0]
-; GISEL-NEXT: sub v0.8h, v0.8h, v1.8h
-; GISEL-NEXT: usra v1.8h, v0.8h, #1
-; GISEL-NEXT: neg v0.8h, v2.8h
-; GISEL-NEXT: ushl v0.8h, v1.8h, v0.8h
+; GISEL-NEXT: umull v0.4s, v0.4h, v1.4h
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
+; GISEL-NEXT: neg v1.8h, v1.8h
+; GISEL-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; GISEL-NEXT: ushl v0.8h, v0.8h, v1.8h
; GISEL-NEXT: ret
%1 = udiv <8 x i16> %x, <i16 7, i16 23, i16 25, i16 27, i16 31, i16 47, i16 63, i16 127>
ret <8 x i16> %1
@@ -153,7 +145,7 @@ define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SDAG-LABEL: combine_vec_udiv_nonuniform4:
; SDAG: // %bb.0:
-; SDAG-NEXT: movi v1.16b, #171
+; SDAG-NEXT: movi v1.16b, #85
; SDAG-NEXT: adrp x8, .LCPI4_0
; SDAG-NEXT: adrp x9, .LCPI4_1
; SDAG-NEXT: ldr q3, [x9, :lo12:.LCPI4_1]
@@ -162,7 +154,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SDAG-NEXT: and v0.16b, v0.16b, v3.16b
; SDAG-NEXT: uzp2 v1.16b, v1.16b, v2.16b
; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI4_0]
-; SDAG-NEXT: ushr v1.16b, v1.16b, #7
+; SDAG-NEXT: ushr v1.16b, v1.16b, #6
; SDAG-NEXT: and v1.16b, v1.16b, v2.16b
; SDAG-NEXT: orr v0.16b, v0.16b, v1.16b
; SDAG-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
index f8578a694e2d4..fdb5a76887143 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
@@ -9,13 +9,11 @@ body: |
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 818089009
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
- ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[LSHR]], [[C1]]
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C2]](s32)
- ; CHECK-NEXT: $w0 = COPY [[LSHR1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1636178017
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+ ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[COPY]], [[C]]
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C1]](s32)
+ ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32)
%0:_(s32) = COPY $w0
%cst:_(s32) = G_CONSTANT i32 42
%2:_(s32) = G_UDIV %0(s32), %cst(s32)
@@ -35,18 +33,13 @@ body: |
; CHECK: liveins: $q0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 25645
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 -19947
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16)
; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
- ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]]
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
- ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16)
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[SUB]], [[BUILD_VECTOR2]](<8 x s16>)
- ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[LSHR]], [[UMULH]]
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR1]](<8 x s16>)
- ; CHECK-NEXT: $q0 = COPY [[LSHR1]](<8 x s16>)
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[UMULH]], [[BUILD_VECTOR1]](<8 x s16>)
+ ; CHECK-NEXT: $q0 = COPY [[LSHR]](<8 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<8 x s16>) = COPY $q0
%2:_(s16) = G_CONSTANT i16 23
@@ -72,30 +65,28 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 25645
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -19947
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
- ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -3855
- ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 5
- ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 8195
- ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 13
- ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 3
- ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 9363
- ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 512
- ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32767
- ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
- ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32639
- ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C8]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C4]](s16), [[C6]](s16), [[C9]](s16), [[C10]](s16), [[C11]](s16), [[C13]](s16), [[C14]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C5]](s16), [[C7]](s16), [[C1]](s16), [[C1]](s16), [[C12]](s16), [[C12]](s16), [[C1]](s16)
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[COPY]], [[BUILD_VECTOR]](<8 x s16>)
- ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[LSHR]], [[BUILD_VECTOR1]]
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 3855
+ ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+ ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32757
+ ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+ ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 4681
+ ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
+ ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767
+ ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 6
+ ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 257
+ ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+ ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s16) = G_CONSTANT i16 14
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C4]](s16), [[C6]](s16), [[C8]](s16), [[C10]](s16), [[C5]](s16), [[C12]](s16), [[C10]](s16)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C]](s16), [[C]](s16), [[C1]](s16), [[C1]](s16), [[C]](s16)
+ ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C5]](s16), [[C7]](s16), [[C9]](s16), [[C11]](s16), [[C1]](s16), [[C13]](s16), [[C14]](s16)
+ ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]]
- ; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR2]]
+ ; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR1]]
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UMULH1]], [[UMULH]]
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR3]](<8 x s16>)
- ; CHECK-NEXT: $q0 = COPY [[LSHR1]](<8 x s16>)
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR2]](<8 x s16>)
+ ; CHECK-NEXT: $q0 = COPY [[LSHR]](<8 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<8 x s16>) = COPY $q0
%2:_(s16) = G_CONSTANT i16 23
@@ -126,26 +117,31 @@ body: |
; CHECK: liveins: $q0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 16393
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
- ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 13
- ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -5617
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 2049
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 11
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -5619
; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 5
- ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 -7281
- ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32749
- ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
- ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 -10347
- ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 8197
- ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 -13107
- ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32747
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C4]](s16), [[C6]](s16), [[C7]](s16), [[C9]](s16), [[C10]](s16), [[C11]](s16), [[C12]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C5]](s16), [[C5]](s16), [[C8]](s16), [[C5]](s16), [[C3]](s16), [[C5]](s16), [[C8]](s16)
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[COPY]], [[BUILD_VECTOR]](<8 x s16>)
- ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[LSHR]], [[BUILD_VECTOR1]]
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[UMULH]], [[BUILD_VECTOR2]](<8 x s16>)
- ; CHECK-NEXT: $q0 = COPY [[LSHR1]](<8 x s16>)
+ ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 -7283
+ ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 16393
+ ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 14
+ ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 -10349
+ ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32749
+ ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+ ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 13107
+ ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s16) = G_CONSTANT i16 3
+ ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s16) = G_CONSTANT i16 8197
+ ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s16) = G_CONSTANT i16 13
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C4]](s16), [[C6]](s16), [[C7]](s16), [[C9]](s16), [[C10]](s16), [[C12]](s16), [[C14]](s16)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
+ ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C5]](s16), [[C5]](s16), [[C8]](s16), [[C5]](s16), [[C11]](s16), [[C13]](s16), [[C15]](s16)
+ ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]]
+ ; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR1]]
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UMULH1]], [[UMULH]]
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR2]](<8 x s16>)
+ ; CHECK-NEXT: $q0 = COPY [[LSHR]](<8 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<8 x s16>) = COPY $q0
%2:_(s16) = G_CONSTANT i16 -34
@@ -176,28 +172,23 @@ body: |
; CHECK: liveins: $q0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 9363
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 25645
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 9363
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -19947
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
- ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 18351
- ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 12137
- ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 2115
- ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 23705
- ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 5
- ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 1041
- ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 517
- ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 6
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C2]](s16), [[C4]](s16), [[C5]](s16), [[C6]](s16), [[C7]](s16), [[C9]](s16), [[C10]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C8]](s16), [[C8]](s16), [[C11]](s16)
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 20971
+ ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 3
+ ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 -26701
+ ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 2115
+ ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 -20917
+ ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 5
+ ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 1041
+ ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 517
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C2]](s16), [[C4]](s16), [[C6]](s16), [[C7]](s16), [[C8]](s16), [[C10]](s16), [[C11]](s16)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C3]](s16), [[C5]](s16), [[C3]](s16), [[C]](s16), [[C9]](s16), [[C]](s16), [[C]](s16)
; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
- ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]]
- ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
- ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C12]](s16), [[C12]](s16), [[C12]](s16), [[C12]](s16), [[C12]](s16), [[C12]](s16), [[C12]](s16), [[C12]](s16)
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[SUB]], [[BUILD_VECTOR2]](<8 x s16>)
- ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[LSHR]], [[UMULH]]
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR1]](<8 x s16>)
- ; CHECK-NEXT: $q0 = COPY [[LSHR1]](<8 x s16>)
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[UMULH]], [[BUILD_VECTOR1]](<8 x s16>)
+ ; CHECK-NEXT: $q0 = COPY [[LSHR]](<8 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<8 x s16>) = COPY $q0
%2:_(s16) = G_CONSTANT i16 7
@@ -229,8 +220,8 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 -85
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s8) = G_CONSTANT i8 7
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 85
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s8) = G_CONSTANT i8 6
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C1]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8)
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C2]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8)
; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<16 x s8>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
@@ -265,21 +256,21 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 4957
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 551
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -8081
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 6
- ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -8079
- ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 4103
- ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 12
- ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 16385
- ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 14
- ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 -29991
- ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 2048
- ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 2115
- ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C4]](s16), [[C5]](s16), [[C7]](s16), [[C9]](s16), [[C10]](s16), [[C11]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C2]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C2]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C3]](s16), [[C3]](s16), [[C6]](s16), [[C8]](s16), [[C3]](s16), [[C]](s16), [[C12]](s16)
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32713
+ ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+ ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32767
+ ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 4443
+ ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 3
+ ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 32767
+ ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
+ ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
+ ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 2115
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C2]](s16), [[C4]](s16), [[C6]](s16), [[C7]](s16), [[C9]](s16), [[C12]](s16)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C10]](s16), [[C]](s16)
+ ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C3]](s16), [[C5]](s16), [[C5]](s16), [[C8]](s16), [[C11]](s16), [[C]](s16)
; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]]
; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR1]]
@@ -338,11 +329,15 @@ body: |
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1321528399
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 660764199
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[COPY]], [[C]]
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C1]](s32)
- ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32)
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[UMULH]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[SUB]], [[C2]](s32)
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR]], [[UMULH]]
+ ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[ADD]], [[C1]](s32)
+ ; CHECK-NEXT: $w0 = COPY [[LSHR1]](s32)
; CHECK-NEXT: RET_ReallyLR implicit $w0
%0:_(s32) = COPY $w0
%1:_(s32) = G_CONSTANT i32 104
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir
index 2e879c7e1622a..4e4cc3349fb76 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir
@@ -9,13 +9,11 @@ body: |
bb.1:
; CHECK-LABEL: name: test
; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483647
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[C]], [[C1]]
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C2]](s32)
; CHECK-NEXT: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[C]], [[C]]
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[UREM]](s32)
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LSHR]](s32)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[UMULH]](s32)
; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC]](s8)
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[ZEXT]], [[SEXT]]
; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[OR]](s64)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir
index 0900dd4267a2e..a695c7527a7f3 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir
@@ -26,8 +26,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (x + 0) -> x
- ;
; CHECK-LABEL: name: right_ident_add
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -47,8 +45,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (x * 0) -> 0
- ;
; CHECK-LABEL: name: mul_0
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -69,8 +65,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (x * 0) -> 0
- ;
; CHECK-LABEL: name: mul_0_cant_replace
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -93,8 +87,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (0 / x) -> 0
- ;
; CHECK-LABEL: name: sdiv_0
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -114,8 +106,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (0 / x) -> 0
- ;
; CHECK-LABEL: name: udiv_0
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -135,8 +125,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (0 % x) -> 0
- ;
; CHECK-LABEL: name: srem_0
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -156,8 +144,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (0 % x) -> 0
- ;
; CHECK-LABEL: name: urem_0
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -178,8 +164,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (x || 0) -> x
- ;
; CHECK-LABEL: name: right_ident_or
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -199,8 +183,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (x | 0) -> x
- ;
; CHECK-LABEL: name: right_ident_xor
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -220,8 +202,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (x << 0) -> x
- ;
; CHECK-LABEL: name: right_ident_shl
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -241,8 +221,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (x ashr 0) -> x
- ;
; CHECK-LABEL: name: right_ident_ashr
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -262,8 +240,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Fold (x lshr 0) -> x
- ;
; CHECK-LABEL: name: right_ident_lshr
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -283,8 +259,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $w0
- ; Not an identity, no folding.
- ;
; CHECK-LABEL: name: dont_fold_sub
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
@@ -325,8 +299,6 @@ tracksRegLiveness: true
body: |
bb.1.entry:
liveins: $x0
- ; Fold (x + 0) -> x
- ;
; CHECK-LABEL: name: right_ident_ptr_add
; CHECK: liveins: $x0
; CHECK-NEXT: {{ $}}
@@ -476,8 +448,10 @@ body: |
; CHECK-LABEL: name: udiv_of_sext
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: $w0 = COPY [[C]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s2) = G_CONSTANT i2 1
+ ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s2) = G_UMULH [[C]], [[C]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UMULH]](s2)
+ ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32)
; CHECK-NEXT: RET_ReallyLR implicit $w0
%2:_(s1) = G_CONSTANT i1 true
%4:_(s2) = G_CONSTANT i2 1
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll
index f1458b76c525a..1b192342953c9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll
@@ -50,11 +50,11 @@ define <4 x i32> @div32xi4(<4 x i32> %x) {
define <16 x i8> @udiv16xi8(<16 x i8> %x) {
; CHECK-LABEL: udiv16xi8:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.16b, #121
+; CHECK-NEXT: movi v1.16b, #15
; CHECK-NEXT: umull2 v2.8h, v0.16b, v1.16b
; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT: uzp2 v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: ushr v0.16b, v0.16b, #5
+; CHECK-NEXT: ushr v0.16b, v0.16b, #2
; CHECK-NEXT: ret
%div = udiv <16 x i8> %x, <i8 68, i8 68, i8 68, i8 68, i8 68, i8 68, i8 68, i8 68, i8 68, i8 68, i8 68, i8 68, i8 68, i8 68, i8 68, i8 68>
ret <16 x i8> %div
@@ -63,14 +63,12 @@ define <16 x i8> @udiv16xi8(<16 x i8> %x) {
define <8 x i16> @udiv8xi16(<8 x i16> %x) {
; CHECK-LABEL: udiv8xi16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #16593 // =0x40d1
+; CHECK-NEXT: mov w8, #41063 // =0xa067
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
-; CHECK-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: usra v1.8h, v0.8h, #1
-; CHECK-NEXT: ushr v0.8h, v1.8h, #12
+; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
+; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: ushr v0.8h, v0.8h, #12
; CHECK-NEXT: ret
%div = udiv <8 x i16> %x, <i16 6537, i16 6537, i16 6537, i16 6537, i16 6537, i16 6537, i16 6537, i16 6537>
ret <8 x i16> %div
@@ -79,13 +77,13 @@ define <8 x i16> @udiv8xi16(<8 x i16> %x) {
define <4 x i32> @udiv32xi4(<4 x i32> %x) {
; CHECK-LABEL: udiv32xi4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #16747 // =0x416b
-; CHECK-NEXT: movk w8, #31439, lsl #16
+; CHECK-NEXT: mov w8, #41141 // =0xa0b5
+; CHECK-NEXT: movk w8, #15719, lsl #16
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s
; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT: uzp2 v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: ushr v0.4s, v0.4s, #22
+; CHECK-NEXT: ushr v0.4s, v0.4s, #21
; CHECK-NEXT: ret
%div = udiv <4 x i32> %x, <i32 8743143, i32 8743143, i32 8743143, i32 8743143>
ret <4 x i32> %div
diff --git a/llvm/test/CodeGen/AArch64/rotate-extract.ll b/llvm/test/CodeGen/AArch64/rotate-extract.ll
index e3eaf81245ff4..73e0161e64fd5 100644
--- a/llvm/test/CodeGen/AArch64/rotate-extract.ll
+++ b/llvm/test/CodeGen/AArch64/rotate-extract.ll
@@ -50,10 +50,8 @@ define i32 @ror_extract_mul(i32 %i) nounwind {
define i64 @ror_extract_udiv(i64 %i) nounwind {
; CHECK-LABEL: ror_extract_udiv:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-6148914691236517206 // =0xaaaaaaaaaaaaaaaa
-; CHECK-NEXT: movk x8, #43691
+; CHECK-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555
; CHECK-NEXT: umulh x8, x0, x8
-; CHECK-NEXT: lsr x8, x8, #1
; CHECK-NEXT: ror x0, x8, #4
; CHECK-NEXT: ret
%lhs_div = udiv i64 %i, 3
@@ -127,15 +125,14 @@ define i64 @no_extract_mul(i64 %i) nounwind {
define i32 @no_extract_udiv(i32 %i) nounwind {
; CHECK-LABEL: no_extract_udiv:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #43691 // =0xaaab
-; CHECK-NEXT: mov w9, #33437 // =0x829d
-; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: movk w9, #21399, lsl #16
+; CHECK-NEXT: mov w9, #30762 // =0x782a
+; CHECK-NEXT: mov w8, #1431655765 // =0x55555555
+; CHECK-NEXT: movk w9, #1337, lsl #16
; CHECK-NEXT: umull x8, w0, w8
; CHECK-NEXT: umull x9, w0, w9
-; CHECK-NEXT: lsr x8, x8, #33
+; CHECK-NEXT: lsr x8, x8, #32
; CHECK-NEXT: lsr x9, x9, #32
-; CHECK-NEXT: extr w0, w8, w9, #4
+; CHECK-NEXT: orr w0, w9, w8, lsl #28
; CHECK-NEXT: ret
%lhs_div = udiv i32 %i, 3
%rhs_div = udiv i32 %i, 49
diff --git a/llvm/test/CodeGen/AArch64/sve-expand-div.ll b/llvm/test/CodeGen/AArch64/sve-expand-div.ll
index 180c64e0a7de1..9a51f1d1b8e9e 100644
--- a/llvm/test/CodeGen/AArch64/sve-expand-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-expand-div.ll
@@ -71,10 +71,9 @@ define <vscale x 2 x i64> @sdiv_i64(<vscale x 2 x i64> %a) #0 {
define <vscale x 16 x i8> @udiv_i8(<vscale x 16 x i8> %a) #0 {
; CHECK-LABEL: udiv_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.b, #-85 // =0xffffffffffffffab
+; CHECK-NEXT: mov z1.b, #85 // =0x55
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
-; CHECK-NEXT: lsr z0.b, z0.b, #1
; CHECK-NEXT: ret
%div = udiv <vscale x 16 x i8> %a, splat (i8 3)
ret <vscale x 16 x i8> %div
@@ -83,11 +82,9 @@ define <vscale x 16 x i8> @udiv_i8(<vscale x 16 x i8> %a) #0 {
define <vscale x 8 x i16> @udiv_i16(<vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: udiv_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #-21845 // =0xffffaaab
+; CHECK-NEXT: dupm z1.b, #0x55
; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: mov z1.h, w8
; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: lsr z0.h, z0.h, #1
; CHECK-NEXT: ret
%div = udiv <vscale x 8 x i16> %a, splat (i16 3)
ret <vscale x 8 x i16> %div
@@ -96,12 +93,9 @@ define <vscale x 8 x i16> @udiv_i16(<vscale x 8 x i16> %a) #0 {
define <vscale x 4 x i32> @udiv_i32(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: udiv_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #43691 // =0xaaab
+; CHECK-NEXT: dupm z1.b, #0x55
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: mov z1.s, w8
; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: lsr z0.s, z0.s, #1
; CHECK-NEXT: ret
%div = udiv <vscale x 4 x i32> %a, splat (i32 3)
ret <vscale x 4 x i32> %div
@@ -110,12 +104,9 @@ define <vscale x 4 x i32> @udiv_i32(<vscale x 4 x i32> %a) #0 {
define <vscale x 2 x i64> @udiv_i64(<vscale x 2 x i64> %a) #0 {
; CHECK-LABEL: udiv_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-6148914691236517206 // =0xaaaaaaaaaaaaaaaa
+; CHECK-NEXT: dupm z1.b, #0x55
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: movk x8, #43691
-; CHECK-NEXT: mov z1.d, x8
; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: lsr z0.d, z0.d, #1
; CHECK-NEXT: ret
%div = udiv <vscale x 2 x i64> %a, splat (i64 3)
ret <vscale x 2 x i64> %div
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
index 516772b8ca664..6d28f343c32db 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
@@ -26,19 +26,6 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v4i8:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8
-; NEON-NOSVE-NEXT: shl v1.4h, v1.4h, #8
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8
-; NEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8
-; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: xtn v0.4h, v0.4s
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v4i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -63,6 +50,18 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v4i8:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8
+; NEON-NOSVE-NEXT: shl v1.4h, v1.4h, #8
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8
+; NEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8
+; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: xtn v0.4h, v0.4s
+; NEON-NOSVE-NEXT: ret
%res = sdiv <4 x i8> %op1, %op2
ret <4 x i8> %res
}
@@ -91,21 +90,6 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v8i8:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0
-; NEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: sshll2 v2.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: sshll2 v3.4s, v0.8h, #0
-; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; NEON-NOSVE-NEXT: xtn v0.8b, v0.8h
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v8i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -146,6 +130,20 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v8i8:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0
+; NEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: sshll2 v2.4s, v1.8h, #0
+; NEON-NOSVE-NEXT: sshll2 v3.4s, v0.8h, #0
+; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; NEON-NOSVE-NEXT: xtn v0.8b, v0.8h
+; NEON-NOSVE-NEXT: ret
%res = sdiv <8 x i8> %op1, %op2
ret <8 x i8> %res
}
@@ -192,30 +190,6 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v16i8:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: sshll2 v2.8h, v1.16b, #0
-; NEON-NOSVE-NEXT: sshll2 v3.8h, v0.16b, #0
-; NEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0
-; NEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: sshll2 v4.4s, v2.8h, #0
-; NEON-NOSVE-NEXT: sshll2 v5.4s, v3.8h, #0
-; NEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0
-; NEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0
-; NEON-NOSVE-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
-; NEON-NOSVE-NEXT: sshll2 v5.4s, v0.8h, #0
-; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: sshll2 v3.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: sdivr z3.s, p0/m, z3.s, z5.s
-; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h
-; NEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v16i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
@@ -287,6 +261,29 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
; NONEON-NOSVE-NEXT: add sp, sp, #48
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v16i8:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: sshll2 v2.8h, v1.16b, #0
+; NEON-NOSVE-NEXT: sshll2 v3.8h, v0.16b, #0
+; NEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0
+; NEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: sshll2 v4.4s, v2.8h, #0
+; NEON-NOSVE-NEXT: sshll2 v5.4s, v3.8h, #0
+; NEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0
+; NEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0
+; NEON-NOSVE-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
+; NEON-NOSVE-NEXT: sshll2 v5.4s, v0.8h, #0
+; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; NEON-NOSVE-NEXT: sshll2 v3.4s, v1.8h, #0
+; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: sdivr z3.s, p0/m, z3.s, z5.s
+; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h
+; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h
+; NEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; NEON-NOSVE-NEXT: ret
%res = sdiv <16 x i8> %op1, %op2
ret <16 x i8> %res
}
@@ -365,53 +362,6 @@ define void @sdiv_v32i8(ptr %a, ptr %b) {
; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v32i8:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ldp q6, q3, [x1]
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: ldr q2, [x0, #16]
-; NEON-NOSVE-NEXT: sshll2 v1.8h, v3.16b, #0
-; NEON-NOSVE-NEXT: sshll2 v4.8h, v2.16b, #0
-; NEON-NOSVE-NEXT: sshll v3.8h, v3.8b, #0
-; NEON-NOSVE-NEXT: sshll v2.8h, v2.8b, #0
-; NEON-NOSVE-NEXT: sshll2 v7.8h, v6.16b, #0
-; NEON-NOSVE-NEXT: sshll v6.8h, v6.8b, #0
-; NEON-NOSVE-NEXT: sshll2 v0.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: sshll2 v5.4s, v4.8h, #0
-; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: sshll v4.4s, v4.4h, #0
-; NEON-NOSVE-NEXT: sshll2 v17.4s, v7.8h, #0
-; NEON-NOSVE-NEXT: sshll v7.4s, v7.4h, #0
-; NEON-NOSVE-NEXT: sdivr z0.s, p0/m, z0.s, z5.s
-; NEON-NOSVE-NEXT: sshll2 v5.4s, v2.8h, #0
-; NEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0
-; NEON-NOSVE-NEXT: sdivr z1.s, p0/m, z1.s, z4.s
-; NEON-NOSVE-NEXT: sshll2 v4.4s, v3.8h, #0
-; NEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0
-; NEON-NOSVE-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
-; NEON-NOSVE-NEXT: ldr q5, [x0]
-; NEON-NOSVE-NEXT: sshll2 v16.8h, v5.16b, #0
-; NEON-NOSVE-NEXT: sshll v5.8h, v5.8b, #0
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h
-; NEON-NOSVE-NEXT: sshll2 v18.4s, v16.8h, #0
-; NEON-NOSVE-NEXT: sshll v16.4s, v16.4h, #0
-; NEON-NOSVE-NEXT: sdivr z17.s, p0/m, z17.s, z18.s
-; NEON-NOSVE-NEXT: sshll2 v18.4s, v5.8h, #0
-; NEON-NOSVE-NEXT: sshll v5.4s, v5.4h, #0
-; NEON-NOSVE-NEXT: sdivr z7.s, p0/m, z7.s, z16.s
-; NEON-NOSVE-NEXT: sshll2 v16.4s, v6.8h, #0
-; NEON-NOSVE-NEXT: sshll v6.4s, v6.4h, #0
-; NEON-NOSVE-NEXT: sdivr z16.s, p0/m, z16.s, z18.s
-; NEON-NOSVE-NEXT: sdiv z5.s, p0/m, z5.s, z6.s
-; NEON-NOSVE-NEXT: sdiv z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: uzp1 v3.8h, v7.8h, v17.8h
-; NEON-NOSVE-NEXT: uzp1 v5.8h, v5.8h, v16.8h
-; NEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h
-; NEON-NOSVE-NEXT: uzp1 v2.16b, v5.16b, v3.16b
-; NEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b
-; NEON-NOSVE-NEXT: stp q2, q0, [x0]
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v32i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #96
@@ -552,6 +502,52 @@ define void @sdiv_v32i8(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
; NONEON-NOSVE-NEXT: add sp, sp, #96
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v32i8:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ldp q6, q3, [x1]
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: ldr q2, [x0, #16]
+; NEON-NOSVE-NEXT: sshll2 v1.8h, v3.16b, #0
+; NEON-NOSVE-NEXT: sshll2 v4.8h, v2.16b, #0
+; NEON-NOSVE-NEXT: sshll v3.8h, v3.8b, #0
+; NEON-NOSVE-NEXT: sshll v2.8h, v2.8b, #0
+; NEON-NOSVE-NEXT: sshll2 v7.8h, v6.16b, #0
+; NEON-NOSVE-NEXT: sshll v6.8h, v6.8b, #0
+; NEON-NOSVE-NEXT: sshll2 v0.4s, v1.8h, #0
+; NEON-NOSVE-NEXT: sshll2 v5.4s, v4.8h, #0
+; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: sshll v4.4s, v4.4h, #0
+; NEON-NOSVE-NEXT: sshll2 v17.4s, v7.8h, #0
+; NEON-NOSVE-NEXT: sshll v7.4s, v7.4h, #0
+; NEON-NOSVE-NEXT: sdivr z0.s, p0/m, z0.s, z5.s
+; NEON-NOSVE-NEXT: sshll2 v5.4s, v2.8h, #0
+; NEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0
+; NEON-NOSVE-NEXT: sdivr z1.s, p0/m, z1.s, z4.s
+; NEON-NOSVE-NEXT: sshll2 v4.4s, v3.8h, #0
+; NEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0
+; NEON-NOSVE-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
+; NEON-NOSVE-NEXT: ldr q5, [x0]
+; NEON-NOSVE-NEXT: sshll2 v16.8h, v5.16b, #0
+; NEON-NOSVE-NEXT: sshll v5.8h, v5.8b, #0
+; NEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; NEON-NOSVE-NEXT: sshll2 v18.4s, v16.8h, #0
+; NEON-NOSVE-NEXT: sshll v16.4s, v16.4h, #0
+; NEON-NOSVE-NEXT: sdivr z17.s, p0/m, z17.s, z18.s
+; NEON-NOSVE-NEXT: sshll2 v18.4s, v5.8h, #0
+; NEON-NOSVE-NEXT: sshll v5.4s, v5.4h, #0
+; NEON-NOSVE-NEXT: sdivr z7.s, p0/m, z7.s, z16.s
+; NEON-NOSVE-NEXT: sshll2 v16.4s, v6.8h, #0
+; NEON-NOSVE-NEXT: sshll v6.4s, v6.4h, #0
+; NEON-NOSVE-NEXT: sdivr z16.s, p0/m, z16.s, z18.s
+; NEON-NOSVE-NEXT: sdiv z5.s, p0/m, z5.s, z6.s
+; NEON-NOSVE-NEXT: sdiv z2.s, p0/m, z2.s, z3.s
+; NEON-NOSVE-NEXT: uzp1 v3.8h, v7.8h, v17.8h
+; NEON-NOSVE-NEXT: uzp1 v5.8h, v5.8h, v16.8h
+; NEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h
+; NEON-NOSVE-NEXT: uzp1 v2.16b, v5.16b, v3.16b
+; NEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b
+; NEON-NOSVE-NEXT: stp q2, q0, [x0]
+; NEON-NOSVE-NEXT: ret
%op1 = load <32 x i8>, ptr %a
%op2 = load <32 x i8>, ptr %b
%res = sdiv <32 x i8> %op1, %op2
@@ -571,17 +567,6 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v2i16:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: shl v1.2s, v1.2s, #16
-; NEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16
-; NEON-NOSVE-NEXT: ptrue p0.s, vl2
-; NEON-NOSVE-NEXT: sshr v1.2s, v1.2s, #16
-; NEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16
-; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v2i16:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -597,6 +582,16 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v2i16:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: shl v1.2s, v1.2s, #16
+; NEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16
+; NEON-NOSVE-NEXT: ptrue p0.s, vl2
+; NEON-NOSVE-NEXT: sshr v1.2s, v1.2s, #16
+; NEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16
+; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
+; NEON-NOSVE-NEXT: ret
%res = sdiv <2 x i16> %op1, %op2
ret <2 x i16> %res
}
@@ -614,15 +609,6 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v4i16:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: xtn v0.4h, v0.4s
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v4i16:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -647,6 +633,14 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v4i16:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: xtn v0.4h, v0.4s
+; NEON-NOSVE-NEXT: ret
%res = sdiv <4 x i16> %op1, %op2
ret <4 x i16> %res
}
@@ -672,18 +666,6 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v8i16:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: sshll2 v2.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: sshll2 v3.4s, v0.8h, #0
-; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v8i16:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
@@ -723,6 +705,17 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
; NONEON-NOSVE-NEXT: add sp, sp, #48
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v8i16:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: sshll2 v2.4s, v1.8h, #0
+; NEON-NOSVE-NEXT: sshll2 v3.4s, v0.8h, #0
+; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; NEON-NOSVE-NEXT: ret
%res = sdiv <8 x i16> %op1, %op2
ret <8 x i16> %res
}
@@ -760,29 +753,6 @@ define void @sdiv_v16i16(ptr %a, ptr %b) {
; CHECK-NEXT: stp q1, q2, [x0]
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v16i16:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ldp q4, q1, [x1]
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: ldr q0, [x0, #16]
-; NEON-NOSVE-NEXT: sshll2 v2.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: sshll2 v3.4s, v0.8h, #0
-; NEON-NOSVE-NEXT: sshll2 v5.4s, v4.8h, #0
-; NEON-NOSVE-NEXT: sshll v4.4s, v4.4h, #0
-; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: ldr q3, [x0]
-; NEON-NOSVE-NEXT: sshll2 v6.4s, v3.8h, #0
-; NEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0
-; NEON-NOSVE-NEXT: sdivr z5.s, p0/m, z5.s, z6.s
-; NEON-NOSVE-NEXT: sdiv z3.s, p0/m, z3.s, z4.s
-; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v5.8h
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; NEON-NOSVE-NEXT: stp q1, q0, [x0]
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v16i16:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #96
@@ -859,6 +829,28 @@ define void @sdiv_v16i16(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
; NONEON-NOSVE-NEXT: add sp, sp, #96
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v16i16:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ldp q4, q1, [x1]
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: ldr q0, [x0, #16]
+; NEON-NOSVE-NEXT: sshll2 v2.4s, v1.8h, #0
+; NEON-NOSVE-NEXT: sshll2 v3.4s, v0.8h, #0
+; NEON-NOSVE-NEXT: sshll2 v5.4s, v4.8h, #0
+; NEON-NOSVE-NEXT: sshll v4.4s, v4.4h, #0
+; NEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
+; NEON-NOSVE-NEXT: ldr q3, [x0]
+; NEON-NOSVE-NEXT: sshll2 v6.4s, v3.8h, #0
+; NEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0
+; NEON-NOSVE-NEXT: sdivr z5.s, p0/m, z5.s, z6.s
+; NEON-NOSVE-NEXT: sdiv z3.s, p0/m, z3.s, z4.s
+; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v5.8h
+; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; NEON-NOSVE-NEXT: stp q1, q0, [x0]
+; NEON-NOSVE-NEXT: ret
%op1 = load <16 x i16>, ptr %a
%op2 = load <16 x i16>, ptr %b
%res = sdiv <16 x i16> %op1, %op2
@@ -876,15 +868,6 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v2i32:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ptrue p0.s, vl2
-; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $z0
-; NEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $z1
-; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v2i32:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -899,6 +882,14 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v2i32:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ptrue p0.s, vl2
+; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $z0
+; NEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $z1
+; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
+; NEON-NOSVE-NEXT: ret
%res = sdiv <2 x i32> %op1, %op2
ret <2 x i32> %res
}
@@ -913,15 +904,6 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v4i32:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 def $z0
-; NEON-NOSVE-NEXT: // kill: def $q1 killed $q1 def $z1
-; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 killed $z0
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v4i32:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
@@ -941,6 +923,14 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
; NONEON-NOSVE-NEXT: add sp, sp, #48
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v4i32:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; NEON-NOSVE-NEXT: // kill: def $q1 killed $q1 def $z1
+; NEON-NOSVE-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 killed $z0
+; NEON-NOSVE-NEXT: ret
%res = sdiv <4 x i32> %op1, %op2
ret <4 x i32> %res
}
@@ -957,17 +947,6 @@ define void @sdiv_v8i32(ptr %a, ptr %b) {
; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v8i32:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ldp q0, q3, [x1]
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: ldp q1, q2, [x0]
-; NEON-NOSVE-NEXT: sdivr z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: movprfx z1, z2
-; NEON-NOSVE-NEXT: sdiv z1.s, p0/m, z1.s, z3.s
-; NEON-NOSVE-NEXT: stp q0, q1, [x0]
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v8i32:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #96
@@ -1004,6 +983,16 @@ define void @sdiv_v8i32(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
; NONEON-NOSVE-NEXT: add sp, sp, #96
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v8i32:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ldp q0, q3, [x1]
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: ldp q1, q2, [x0]
+; NEON-NOSVE-NEXT: sdivr z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: movprfx z1, z2
+; NEON-NOSVE-NEXT: sdiv z1.s, p0/m, z1.s, z3.s
+; NEON-NOSVE-NEXT: stp q0, q1, [x0]
+; NEON-NOSVE-NEXT: ret
%op1 = load <8 x i32>, ptr %a
%op2 = load <8 x i32>, ptr %b
%res = sdiv <8 x i32> %op1, %op2
@@ -1021,15 +1010,6 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v1i64:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ptrue p0.d, vl1
-; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $z0
-; NEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $z1
-; NEON-NOSVE-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
-; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v1i64:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #16
@@ -1041,6 +1021,14 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #8]
; NONEON-NOSVE-NEXT: add sp, sp, #16
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v1i64:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ptrue p0.d, vl1
+; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $z0
+; NEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $z1
+; NEON-NOSVE-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
+; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
+; NEON-NOSVE-NEXT: ret
%res = sdiv <1 x i64> %op1, %op2
ret <1 x i64> %res
}
@@ -1055,15 +1043,6 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v2i64:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ptrue p0.d, vl2
-; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 def $z0
-; NEON-NOSVE-NEXT: // kill: def $q1 killed $q1 def $z1
-; NEON-NOSVE-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
-; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 killed $z0
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v2i64:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
@@ -1077,6 +1056,14 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
; NONEON-NOSVE-NEXT: add sp, sp, #48
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v2i64:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ptrue p0.d, vl2
+; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; NEON-NOSVE-NEXT: // kill: def $q1 killed $q1 def $z1
+; NEON-NOSVE-NEXT: sdiv z0.d, p0/m, z0.d, z1.d
+; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 killed $z0
+; NEON-NOSVE-NEXT: ret
%res = sdiv <2 x i64> %op1, %op2
ret <2 x i64> %res
}
@@ -1093,17 +1080,6 @@ define void @sdiv_v4i64(ptr %a, ptr %b) {
; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: sdiv_v4i64:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ldp q0, q3, [x1]
-; NEON-NOSVE-NEXT: ptrue p0.d, vl2
-; NEON-NOSVE-NEXT: ldp q1, q2, [x0]
-; NEON-NOSVE-NEXT: sdivr z0.d, p0/m, z0.d, z1.d
-; NEON-NOSVE-NEXT: movprfx z1, z2
-; NEON-NOSVE-NEXT: sdiv z1.d, p0/m, z1.d, z3.d
-; NEON-NOSVE-NEXT: stp q0, q1, [x0]
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: sdiv_v4i64:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #96
@@ -1128,6 +1104,16 @@ define void @sdiv_v4i64(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
; NONEON-NOSVE-NEXT: add sp, sp, #96
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: sdiv_v4i64:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ldp q0, q3, [x1]
+; NEON-NOSVE-NEXT: ptrue p0.d, vl2
+; NEON-NOSVE-NEXT: ldp q1, q2, [x0]
+; NEON-NOSVE-NEXT: sdivr z0.d, p0/m, z0.d, z1.d
+; NEON-NOSVE-NEXT: movprfx z1, z2
+; NEON-NOSVE-NEXT: sdiv z1.d, p0/m, z1.d, z3.d
+; NEON-NOSVE-NEXT: stp q0, q1, [x0]
+; NEON-NOSVE-NEXT: ret
%op1 = load <4 x i64>, ptr %a
%op2 = load <4 x i64>, ptr %b
%res = sdiv <4 x i64> %op1, %op2
@@ -1154,17 +1140,6 @@ define <4 x i8> @udiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v4i8:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: bic v0.4h, #255, lsl #8
-; NEON-NOSVE-NEXT: bic v1.4h, #255, lsl #8
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: xtn v0.4h, v0.4s
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v4i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -1189,6 +1164,16 @@ define <4 x i8> @udiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v4i8:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: bic v0.4h, #255, lsl #8
+; NEON-NOSVE-NEXT: bic v1.4h, #255, lsl #8
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: xtn v0.4h, v0.4s
+; NEON-NOSVE-NEXT: ret
%res = udiv <4 x i8> %op1, %op2
ret <4 x i8> %res
}
@@ -1217,21 +1202,6 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v8i8:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0
-; NEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: ushll2 v2.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: ushll2 v3.4s, v0.8h, #0
-; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; NEON-NOSVE-NEXT: xtn v0.8b, v0.8h
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v8i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -1272,6 +1242,20 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v8i8:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0
+; NEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: ushll2 v2.4s, v1.8h, #0
+; NEON-NOSVE-NEXT: ushll2 v3.4s, v0.8h, #0
+; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; NEON-NOSVE-NEXT: xtn v0.8b, v0.8h
+; NEON-NOSVE-NEXT: ret
%res = udiv <8 x i8> %op1, %op2
ret <8 x i8> %res
}
@@ -1318,30 +1302,6 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v16i8:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ushll2 v2.8h, v1.16b, #0
-; NEON-NOSVE-NEXT: ushll2 v3.8h, v0.16b, #0
-; NEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0
-; NEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: ushll2 v4.4s, v2.8h, #0
-; NEON-NOSVE-NEXT: ushll2 v5.4s, v3.8h, #0
-; NEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0
-; NEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0
-; NEON-NOSVE-NEXT: udivr z4.s, p0/m, z4.s, z5.s
-; NEON-NOSVE-NEXT: ushll2 v5.4s, v0.8h, #0
-; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: ushll2 v3.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: udivr z3.s, p0/m, z3.s, z5.s
-; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h
-; NEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v16i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
@@ -1413,6 +1373,29 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
; NONEON-NOSVE-NEXT: add sp, sp, #48
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v16i8:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ushll2 v2.8h, v1.16b, #0
+; NEON-NOSVE-NEXT: ushll2 v3.8h, v0.16b, #0
+; NEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0
+; NEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: ushll2 v4.4s, v2.8h, #0
+; NEON-NOSVE-NEXT: ushll2 v5.4s, v3.8h, #0
+; NEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0
+; NEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0
+; NEON-NOSVE-NEXT: udivr z4.s, p0/m, z4.s, z5.s
+; NEON-NOSVE-NEXT: ushll2 v5.4s, v0.8h, #0
+; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; NEON-NOSVE-NEXT: ushll2 v3.4s, v1.8h, #0
+; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: udivr z3.s, p0/m, z3.s, z5.s
+; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h
+; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h
+; NEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; NEON-NOSVE-NEXT: ret
%res = udiv <16 x i8> %op1, %op2
ret <16 x i8> %res
}
@@ -1491,53 +1474,6 @@ define void @udiv_v32i8(ptr %a, ptr %b) {
; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v32i8:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ldp q6, q3, [x1]
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: ldr q2, [x0, #16]
-; NEON-NOSVE-NEXT: ushll2 v1.8h, v3.16b, #0
-; NEON-NOSVE-NEXT: ushll2 v4.8h, v2.16b, #0
-; NEON-NOSVE-NEXT: ushll v3.8h, v3.8b, #0
-; NEON-NOSVE-NEXT: ushll v2.8h, v2.8b, #0
-; NEON-NOSVE-NEXT: ushll2 v7.8h, v6.16b, #0
-; NEON-NOSVE-NEXT: ushll v6.8h, v6.8b, #0
-; NEON-NOSVE-NEXT: ushll2 v0.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: ushll2 v5.4s, v4.8h, #0
-; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: ushll v4.4s, v4.4h, #0
-; NEON-NOSVE-NEXT: ushll2 v17.4s, v7.8h, #0
-; NEON-NOSVE-NEXT: ushll v7.4s, v7.4h, #0
-; NEON-NOSVE-NEXT: udivr z0.s, p0/m, z0.s, z5.s
-; NEON-NOSVE-NEXT: ushll2 v5.4s, v2.8h, #0
-; NEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0
-; NEON-NOSVE-NEXT: udivr z1.s, p0/m, z1.s, z4.s
-; NEON-NOSVE-NEXT: ushll2 v4.4s, v3.8h, #0
-; NEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0
-; NEON-NOSVE-NEXT: udivr z4.s, p0/m, z4.s, z5.s
-; NEON-NOSVE-NEXT: ldr q5, [x0]
-; NEON-NOSVE-NEXT: ushll2 v16.8h, v5.16b, #0
-; NEON-NOSVE-NEXT: ushll v5.8h, v5.8b, #0
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h
-; NEON-NOSVE-NEXT: ushll2 v18.4s, v16.8h, #0
-; NEON-NOSVE-NEXT: ushll v16.4s, v16.4h, #0
-; NEON-NOSVE-NEXT: udivr z17.s, p0/m, z17.s, z18.s
-; NEON-NOSVE-NEXT: ushll2 v18.4s, v5.8h, #0
-; NEON-NOSVE-NEXT: ushll v5.4s, v5.4h, #0
-; NEON-NOSVE-NEXT: udivr z7.s, p0/m, z7.s, z16.s
-; NEON-NOSVE-NEXT: ushll2 v16.4s, v6.8h, #0
-; NEON-NOSVE-NEXT: ushll v6.4s, v6.4h, #0
-; NEON-NOSVE-NEXT: udivr z16.s, p0/m, z16.s, z18.s
-; NEON-NOSVE-NEXT: udiv z5.s, p0/m, z5.s, z6.s
-; NEON-NOSVE-NEXT: udiv z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: uzp1 v3.8h, v7.8h, v17.8h
-; NEON-NOSVE-NEXT: uzp1 v5.8h, v5.8h, v16.8h
-; NEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h
-; NEON-NOSVE-NEXT: uzp1 v2.16b, v5.16b, v3.16b
-; NEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b
-; NEON-NOSVE-NEXT: stp q2, q0, [x0]
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v32i8:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #96
@@ -1678,6 +1614,52 @@ define void @udiv_v32i8(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
; NONEON-NOSVE-NEXT: add sp, sp, #96
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v32i8:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ldp q6, q3, [x1]
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: ldr q2, [x0, #16]
+; NEON-NOSVE-NEXT: ushll2 v1.8h, v3.16b, #0
+; NEON-NOSVE-NEXT: ushll2 v4.8h, v2.16b, #0
+; NEON-NOSVE-NEXT: ushll v3.8h, v3.8b, #0
+; NEON-NOSVE-NEXT: ushll v2.8h, v2.8b, #0
+; NEON-NOSVE-NEXT: ushll2 v7.8h, v6.16b, #0
+; NEON-NOSVE-NEXT: ushll v6.8h, v6.8b, #0
+; NEON-NOSVE-NEXT: ushll2 v0.4s, v1.8h, #0
+; NEON-NOSVE-NEXT: ushll2 v5.4s, v4.8h, #0
+; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: ushll v4.4s, v4.4h, #0
+; NEON-NOSVE-NEXT: ushll2 v17.4s, v7.8h, #0
+; NEON-NOSVE-NEXT: ushll v7.4s, v7.4h, #0
+; NEON-NOSVE-NEXT: udivr z0.s, p0/m, z0.s, z5.s
+; NEON-NOSVE-NEXT: ushll2 v5.4s, v2.8h, #0
+; NEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0
+; NEON-NOSVE-NEXT: udivr z1.s, p0/m, z1.s, z4.s
+; NEON-NOSVE-NEXT: ushll2 v4.4s, v3.8h, #0
+; NEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0
+; NEON-NOSVE-NEXT: udivr z4.s, p0/m, z4.s, z5.s
+; NEON-NOSVE-NEXT: ldr q5, [x0]
+; NEON-NOSVE-NEXT: ushll2 v16.8h, v5.16b, #0
+; NEON-NOSVE-NEXT: ushll v5.8h, v5.8b, #0
+; NEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; NEON-NOSVE-NEXT: ushll2 v18.4s, v16.8h, #0
+; NEON-NOSVE-NEXT: ushll v16.4s, v16.4h, #0
+; NEON-NOSVE-NEXT: udivr z17.s, p0/m, z17.s, z18.s
+; NEON-NOSVE-NEXT: ushll2 v18.4s, v5.8h, #0
+; NEON-NOSVE-NEXT: ushll v5.4s, v5.4h, #0
+; NEON-NOSVE-NEXT: udivr z7.s, p0/m, z7.s, z16.s
+; NEON-NOSVE-NEXT: ushll2 v16.4s, v6.8h, #0
+; NEON-NOSVE-NEXT: ushll v6.4s, v6.4h, #0
+; NEON-NOSVE-NEXT: udivr z16.s, p0/m, z16.s, z18.s
+; NEON-NOSVE-NEXT: udiv z5.s, p0/m, z5.s, z6.s
+; NEON-NOSVE-NEXT: udiv z2.s, p0/m, z2.s, z3.s
+; NEON-NOSVE-NEXT: uzp1 v3.8h, v7.8h, v17.8h
+; NEON-NOSVE-NEXT: uzp1 v5.8h, v5.8h, v16.8h
+; NEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v4.8h
+; NEON-NOSVE-NEXT: uzp1 v2.16b, v5.16b, v3.16b
+; NEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b
+; NEON-NOSVE-NEXT: stp q2, q0, [x0]
+; NEON-NOSVE-NEXT: ret
%op1 = load <32 x i8>, ptr %a
%op2 = load <32 x i8>, ptr %b
%res = udiv <32 x i8> %op1, %op2
@@ -1697,16 +1679,6 @@ define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v2i16:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: movi d2, #0x00ffff0000ffff
-; NEON-NOSVE-NEXT: ptrue p0.s, vl2
-; NEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b
-; NEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b
-; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v2i16:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -1722,6 +1694,15 @@ define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v2i16:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: movi d2, #0x00ffff0000ffff
+; NEON-NOSVE-NEXT: ptrue p0.s, vl2
+; NEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b
+; NEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b
+; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
+; NEON-NOSVE-NEXT: ret
%res = udiv <2 x i16> %op1, %op2
ret <2 x i16> %res
}
@@ -1739,15 +1720,6 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v4i16:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: xtn v0.4h, v0.4s
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v4i16:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -1772,6 +1744,14 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v4i16:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: xtn v0.4h, v0.4s
+; NEON-NOSVE-NEXT: ret
%res = udiv <4 x i16> %op1, %op2
ret <4 x i16> %res
}
@@ -1797,18 +1777,6 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v8i16:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ushll2 v2.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: ushll2 v3.4s, v0.8h, #0
-; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v8i16:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
@@ -1848,6 +1816,17 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
; NONEON-NOSVE-NEXT: add sp, sp, #48
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v8i16:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ushll2 v2.4s, v1.8h, #0
+; NEON-NOSVE-NEXT: ushll2 v3.4s, v0.8h, #0
+; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; NEON-NOSVE-NEXT: ret
%res = udiv <8 x i16> %op1, %op2
ret <8 x i16> %res
}
@@ -1885,29 +1864,6 @@ define void @udiv_v16i16(ptr %a, ptr %b) {
; CHECK-NEXT: stp q1, q2, [x0]
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v16i16:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ldp q4, q1, [x1]
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: ldr q0, [x0, #16]
-; NEON-NOSVE-NEXT: ushll2 v2.4s, v1.8h, #0
-; NEON-NOSVE-NEXT: ushll2 v3.4s, v0.8h, #0
-; NEON-NOSVE-NEXT: ushll2 v5.4s, v4.8h, #0
-; NEON-NOSVE-NEXT: ushll v4.4s, v4.4h, #0
-; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
-; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
-; NEON-NOSVE-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; NEON-NOSVE-NEXT: ldr q3, [x0]
-; NEON-NOSVE-NEXT: ushll2 v6.4s, v3.8h, #0
-; NEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0
-; NEON-NOSVE-NEXT: udivr z5.s, p0/m, z5.s, z6.s
-; NEON-NOSVE-NEXT: udiv z3.s, p0/m, z3.s, z4.s
-; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v5.8h
-; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; NEON-NOSVE-NEXT: stp q1, q0, [x0]
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v16i16:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #96
@@ -1984,6 +1940,28 @@ define void @udiv_v16i16(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
; NONEON-NOSVE-NEXT: add sp, sp, #96
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v16i16:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ldp q4, q1, [x1]
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: ldr q0, [x0, #16]
+; NEON-NOSVE-NEXT: ushll2 v2.4s, v1.8h, #0
+; NEON-NOSVE-NEXT: ushll2 v3.4s, v0.8h, #0
+; NEON-NOSVE-NEXT: ushll2 v5.4s, v4.8h, #0
+; NEON-NOSVE-NEXT: ushll v4.4s, v4.4h, #0
+; NEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0
+; NEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0
+; NEON-NOSVE-NEXT: udivr z2.s, p0/m, z2.s, z3.s
+; NEON-NOSVE-NEXT: ldr q3, [x0]
+; NEON-NOSVE-NEXT: ushll2 v6.4s, v3.8h, #0
+; NEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0
+; NEON-NOSVE-NEXT: udivr z5.s, p0/m, z5.s, z6.s
+; NEON-NOSVE-NEXT: udiv z3.s, p0/m, z3.s, z4.s
+; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v5.8h
+; NEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; NEON-NOSVE-NEXT: stp q1, q0, [x0]
+; NEON-NOSVE-NEXT: ret
%op1 = load <16 x i16>, ptr %a
%op2 = load <16 x i16>, ptr %b
%res = udiv <16 x i16> %op1, %op2
@@ -2001,15 +1979,6 @@ define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v2i32:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ptrue p0.s, vl2
-; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $z0
-; NEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $z1
-; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v2i32:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #32
@@ -2024,6 +1993,14 @@ define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v2i32:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ptrue p0.s, vl2
+; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $z0
+; NEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $z1
+; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
+; NEON-NOSVE-NEXT: ret
%res = udiv <2 x i32> %op1, %op2
ret <2 x i32> %res
}
@@ -2038,15 +2015,6 @@ define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v4i32:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 def $z0
-; NEON-NOSVE-NEXT: // kill: def $q1 killed $q1 def $z1
-; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 killed $z0
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v4i32:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
@@ -2066,6 +2034,14 @@ define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
; NONEON-NOSVE-NEXT: add sp, sp, #48
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v4i32:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; NEON-NOSVE-NEXT: // kill: def $q1 killed $q1 def $z1
+; NEON-NOSVE-NEXT: udiv z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 killed $z0
+; NEON-NOSVE-NEXT: ret
%res = udiv <4 x i32> %op1, %op2
ret <4 x i32> %res
}
@@ -2082,17 +2058,6 @@ define void @udiv_v8i32(ptr %a, ptr %b) {
; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v8i32:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ldp q0, q3, [x1]
-; NEON-NOSVE-NEXT: ptrue p0.s, vl4
-; NEON-NOSVE-NEXT: ldp q1, q2, [x0]
-; NEON-NOSVE-NEXT: udivr z0.s, p0/m, z0.s, z1.s
-; NEON-NOSVE-NEXT: movprfx z1, z2
-; NEON-NOSVE-NEXT: udiv z1.s, p0/m, z1.s, z3.s
-; NEON-NOSVE-NEXT: stp q0, q1, [x0]
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v8i32:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #96
@@ -2129,6 +2094,16 @@ define void @udiv_v8i32(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
; NONEON-NOSVE-NEXT: add sp, sp, #96
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v8i32:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ldp q0, q3, [x1]
+; NEON-NOSVE-NEXT: ptrue p0.s, vl4
+; NEON-NOSVE-NEXT: ldp q1, q2, [x0]
+; NEON-NOSVE-NEXT: udivr z0.s, p0/m, z0.s, z1.s
+; NEON-NOSVE-NEXT: movprfx z1, z2
+; NEON-NOSVE-NEXT: udiv z1.s, p0/m, z1.s, z3.s
+; NEON-NOSVE-NEXT: stp q0, q1, [x0]
+; NEON-NOSVE-NEXT: ret
%op1 = load <8 x i32>, ptr %a
%op2 = load <8 x i32>, ptr %b
%res = udiv <8 x i32> %op1, %op2
@@ -2146,15 +2121,6 @@ define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v1i64:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ptrue p0.d, vl1
-; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $z0
-; NEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $z1
-; NEON-NOSVE-NEXT: udiv z0.d, p0/m, z0.d, z1.d
-; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v1i64:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #16
@@ -2166,6 +2132,14 @@ define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
; NONEON-NOSVE-NEXT: ldr d0, [sp, #8]
; NONEON-NOSVE-NEXT: add sp, sp, #16
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v1i64:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ptrue p0.d, vl1
+; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $z0
+; NEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $z1
+; NEON-NOSVE-NEXT: udiv z0.d, p0/m, z0.d, z1.d
+; NEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $z0
+; NEON-NOSVE-NEXT: ret
%res = udiv <1 x i64> %op1, %op2
ret <1 x i64> %res
}
@@ -2180,15 +2154,6 @@ define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v2i64:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ptrue p0.d, vl2
-; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 def $z0
-; NEON-NOSVE-NEXT: // kill: def $q1 killed $q1 def $z1
-; NEON-NOSVE-NEXT: udiv z0.d, p0/m, z0.d, z1.d
-; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 killed $z0
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v2i64:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]!
@@ -2202,6 +2167,14 @@ define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
; NONEON-NOSVE-NEXT: ldr q0, [sp, #32]
; NONEON-NOSVE-NEXT: add sp, sp, #48
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v2i64:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ptrue p0.d, vl2
+; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; NEON-NOSVE-NEXT: // kill: def $q1 killed $q1 def $z1
+; NEON-NOSVE-NEXT: udiv z0.d, p0/m, z0.d, z1.d
+; NEON-NOSVE-NEXT: // kill: def $q0 killed $q0 killed $z0
+; NEON-NOSVE-NEXT: ret
%res = udiv <2 x i64> %op1, %op2
ret <2 x i64> %res
}
@@ -2218,17 +2191,6 @@ define void @udiv_v4i64(ptr %a, ptr %b) {
; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ret
;
-; NEON-NOSVE-LABEL: udiv_v4i64:
-; NEON-NOSVE: // %bb.0:
-; NEON-NOSVE-NEXT: ldp q0, q3, [x1]
-; NEON-NOSVE-NEXT: ptrue p0.d, vl2
-; NEON-NOSVE-NEXT: ldp q1, q2, [x0]
-; NEON-NOSVE-NEXT: udivr z0.d, p0/m, z0.d, z1.d
-; NEON-NOSVE-NEXT: movprfx z1, z2
-; NEON-NOSVE-NEXT: udiv z1.d, p0/m, z1.d, z3.d
-; NEON-NOSVE-NEXT: stp q0, q1, [x0]
-; NEON-NOSVE-NEXT: ret
-;
; NONEON-NOSVE-LABEL: udiv_v4i64:
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: sub sp, sp, #96
@@ -2253,6 +2215,16 @@ define void @udiv_v4i64(ptr %a, ptr %b) {
; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
; NONEON-NOSVE-NEXT: add sp, sp, #96
; NONEON-NOSVE-NEXT: ret
+; NEON-NOSVE-LABEL: udiv_v4i64:
+; NEON-NOSVE: // %bb.0:
+; NEON-NOSVE-NEXT: ldp q0, q3, [x1]
+; NEON-NOSVE-NEXT: ptrue p0.d, vl2
+; NEON-NOSVE-NEXT: ldp q1, q2, [x0]
+; NEON-NOSVE-NEXT: udivr z0.d, p0/m, z0.d, z1.d
+; NEON-NOSVE-NEXT: movprfx z1, z2
+; NEON-NOSVE-NEXT: udiv z1.d, p0/m, z1.d, z3.d
+; NEON-NOSVE-NEXT: stp q0, q1, [x0]
+; NEON-NOSVE-NEXT: ret
%op1 = load <4 x i64>, ptr %a
%op2 = load <4 x i64>, ptr %b
%res = udiv <4 x i64> %op1, %op2
@@ -2263,20 +2235,13 @@ define void @udiv_v4i64(ptr %a, ptr %b) {
define void @udiv_constantsplat_v8i32(ptr %a) {
; SVE-LABEL: udiv_constantsplat_v8i32:
; SVE: // %bb.0:
-; SVE-NEXT: mov w8, #8969 // =0x2309
+; SVE-NEXT: mov w8, #37251 // =0x9183
; SVE-NEXT: ldp q1, q2, [x0]
-; SVE-NEXT: movk w8, #22765, lsl #16
+; SVE-NEXT: movk w8, #44150, lsl #16
; SVE-NEXT: ptrue p0.s, vl4
; SVE-NEXT: mov z0.s, w8
-; SVE-NEXT: movprfx z3, z1
-; SVE-NEXT: umulh z3.s, p0/m, z3.s, z0.s
+; SVE-NEXT: umulh z1.s, p0/m, z1.s, z0.s
; SVE-NEXT: umulh z0.s, p0/m, z0.s, z2.s
-; SVE-NEXT: sub z1.s, z1.s, z3.s
-; SVE-NEXT: sub z2.s, z2.s, z0.s
-; SVE-NEXT: lsr z1.s, z1.s, #1
-; SVE-NEXT: lsr z2.s, z2.s, #1
-; SVE-NEXT: add z1.s, z1.s, z3.s
-; SVE-NEXT: add z0.s, z2.s, z0.s
; SVE-NEXT: lsr z1.s, z1.s, #6
; SVE-NEXT: lsr z0.s, z0.s, #6
; SVE-NEXT: stp q1, q0, [x0]
@@ -2284,21 +2249,58 @@ define void @udiv_constantsplat_v8i32(ptr %a) {
;
; SVE2-LABEL: udiv_constantsplat_v8i32:
; SVE2: // %bb.0:
-; SVE2-NEXT: mov w8, #8969 // =0x2309
+; SVE2-NEXT: mov w8, #37251 // =0x9183
; SVE2-NEXT: ldp q1, q2, [x0]
-; SVE2-NEXT: movk w8, #22765, lsl #16
+; SVE2-NEXT: movk w8, #44150, lsl #16
; SVE2-NEXT: mov z0.s, w8
-; SVE2-NEXT: umulh z3.s, z1.s, z0.s
+; SVE2-NEXT: umulh z1.s, z1.s, z0.s
; SVE2-NEXT: umulh z0.s, z2.s, z0.s
-; SVE2-NEXT: sub z1.s, z1.s, z3.s
-; SVE2-NEXT: sub z2.s, z2.s, z0.s
-; SVE2-NEXT: usra z3.s, z1.s, #1
-; SVE2-NEXT: usra z0.s, z2.s, #1
-; SVE2-NEXT: lsr z1.s, z3.s, #6
+; SVE2-NEXT: lsr z1.s, z1.s, #6
; SVE2-NEXT: lsr z0.s, z0.s, #6
; SVE2-NEXT: stp q1, q0, [x0]
; SVE2-NEXT: ret
;
+; NONEON-NOSVE-LABEL: udiv_constantsplat_v8i32:
+; NONEON-NOSVE: // %bb.0:
+; NONEON-NOSVE-NEXT: ldp q1, q0, [x0]
+; NONEON-NOSVE-NEXT: mov w8, #37251 // =0x9183
+; NONEON-NOSVE-NEXT: movk w8, #44150, lsl #16
+; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]!
+; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64
+; NONEON-NOSVE-NEXT: ldr w9, [sp, #28]
+; NONEON-NOSVE-NEXT: umull x9, w9, w8
+; NONEON-NOSVE-NEXT: lsr x10, x9, #38
+; NONEON-NOSVE-NEXT: ldr w9, [sp, #24]
+; NONEON-NOSVE-NEXT: umull x9, w9, w8
+; NONEON-NOSVE-NEXT: lsr x9, x9, #38
+; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #56]
+; NONEON-NOSVE-NEXT: ldr w9, [sp, #20]
+; NONEON-NOSVE-NEXT: umull x9, w9, w8
+; NONEON-NOSVE-NEXT: lsr x11, x9, #38
+; NONEON-NOSVE-NEXT: ldr w9, [sp, #16]
+; NONEON-NOSVE-NEXT: umull x9, w9, w8
+; NONEON-NOSVE-NEXT: lsr x9, x9, #38
+; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #48]
+; NONEON-NOSVE-NEXT: ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT: umull x9, w9, w8
+; NONEON-NOSVE-NEXT: lsr x9, x9, #38
+; NONEON-NOSVE-NEXT: str w9, [sp, #44]
+; NONEON-NOSVE-NEXT: ldr w9, [sp, #8]
+; NONEON-NOSVE-NEXT: umull x9, w9, w8
+; NONEON-NOSVE-NEXT: lsr x9, x9, #38
+; NONEON-NOSVE-NEXT: str w9, [sp, #40]
+; NONEON-NOSVE-NEXT: ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT: umull x9, w9, w8
+; NONEON-NOSVE-NEXT: lsr x9, x9, #38
+; NONEON-NOSVE-NEXT: str w9, [sp, #36]
+; NONEON-NOSVE-NEXT: ldr w9, [sp]
+; NONEON-NOSVE-NEXT: umull x8, w9, w8
+; NONEON-NOSVE-NEXT: lsr x8, x8, #38
+; NONEON-NOSVE-NEXT: str w8, [sp, #32]
+; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32]
+; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
+; NONEON-NOSVE-NEXT: add sp, sp, #64
+; NONEON-NOSVE-NEXT: ret
; NEON-NOSVE-LABEL: udiv_constantsplat_v8i32:
; NEON-NOSVE: // %bb.0:
; NEON-NOSVE-NEXT: mov w8, #8969 // =0x2309
@@ -2319,70 +2321,6 @@ define void @udiv_constantsplat_v8i32(ptr %a) {
; NEON-NOSVE-NEXT: ushr v0.4s, v0.4s, #6
; NEON-NOSVE-NEXT: stp q1, q0, [x0]
; NEON-NOSVE-NEXT: ret
-;
-; NONEON-NOSVE-LABEL: udiv_constantsplat_v8i32:
-; NONEON-NOSVE: // %bb.0:
-; NONEON-NOSVE-NEXT: ldp q1, q0, [x0]
-; NONEON-NOSVE-NEXT: mov w8, #8969 // =0x2309
-; NONEON-NOSVE-NEXT: movk w8, #22765, lsl #16
-; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]!
-; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #28]
-; NONEON-NOSVE-NEXT: umull x10, w9, w8
-; NONEON-NOSVE-NEXT: lsr x10, x10, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w10
-; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w11, w9, #6
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #24]
-; NONEON-NOSVE-NEXT: umull x10, w9, w8
-; NONEON-NOSVE-NEXT: lsr x10, x10, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w10
-; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w9, w9, #6
-; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #56]
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #20]
-; NONEON-NOSVE-NEXT: umull x10, w9, w8
-; NONEON-NOSVE-NEXT: lsr x10, x10, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w10
-; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w11, w9, #6
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #16]
-; NONEON-NOSVE-NEXT: umull x10, w9, w8
-; NONEON-NOSVE-NEXT: lsr x10, x10, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w10
-; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w9, w9, #6
-; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #48]
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #12]
-; NONEON-NOSVE-NEXT: umull x10, w9, w8
-; NONEON-NOSVE-NEXT: lsr x10, x10, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w10
-; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w11, w9, #6
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #8]
-; NONEON-NOSVE-NEXT: umull x10, w9, w8
-; NONEON-NOSVE-NEXT: lsr x10, x10, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w10
-; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w9, w9, #6
-; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #40]
-; NONEON-NOSVE-NEXT: ldr w9, [sp, #4]
-; NONEON-NOSVE-NEXT: umull x10, w9, w8
-; NONEON-NOSVE-NEXT: lsr x10, x10, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w10
-; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w11, w9, #6
-; NONEON-NOSVE-NEXT: ldr w9, [sp]
-; NONEON-NOSVE-NEXT: umull x8, w9, w8
-; NONEON-NOSVE-NEXT: lsr x8, x8, #32
-; NONEON-NOSVE-NEXT: sub w9, w9, w8
-; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #1
-; NONEON-NOSVE-NEXT: lsr w8, w8, #6
-; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32]
-; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32]
-; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
-; NONEON-NOSVE-NEXT: add sp, sp, #64
-; NONEON-NOSVE-NEXT: ret
%op1 = load <8 x i32>, ptr %a
%res = udiv <8 x i32> %op1, <i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95>
store <8 x i32> %res, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/urem-lkk.ll b/llvm/test/CodeGen/AArch64/urem-lkk.ll
index 2212e0a633414..0e70596318ef3 100644
--- a/llvm/test/CodeGen/AArch64/urem-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/urem-lkk.ll
@@ -4,14 +4,11 @@
define i32 @fold_urem_positive_odd(i32 %x) {
; CHECK-LABEL: fold_urem_positive_odd:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8969 // =0x2309
-; CHECK-NEXT: movk w8, #22765, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #32
-; CHECK-NEXT: sub w9, w0, w8
-; CHECK-NEXT: add w8, w8, w9, lsr #1
+; CHECK-NEXT: mov w8, #37251 // =0x9183
; CHECK-NEXT: mov w9, #95 // =0x5f
-; CHECK-NEXT: lsr w8, w8, #6
+; CHECK-NEXT: movk w8, #44150, lsl #16
+; CHECK-NEXT: umull x8, w0, w8
+; CHECK-NEXT: lsr x8, x8, #38
; CHECK-NEXT: msub w0, w8, w9, w0
; CHECK-NEXT: ret
%1 = urem i32 %x, 95
@@ -22,7 +19,7 @@ define i32 @fold_urem_positive_odd(i32 %x) {
define i32 @fold_urem_positive_even(i32 %x) {
; CHECK-LABEL: fold_urem_positive_even:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #16323 // =0x3fc3
+; CHECK-NEXT: mov w8, #16321 // =0x3fc1
; CHECK-NEXT: mov w9, #1060 // =0x424
; CHECK-NEXT: movk w8, #63310, lsl #16
; CHECK-NEXT: umull x8, w0, w8
@@ -38,14 +35,11 @@ define i32 @fold_urem_positive_even(i32 %x) {
define i32 @combine_urem_udiv(i32 %x) {
; CHECK-LABEL: combine_urem_udiv:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8969 // =0x2309
-; CHECK-NEXT: movk w8, #22765, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #32
-; CHECK-NEXT: sub w9, w0, w8
-; CHECK-NEXT: add w8, w8, w9, lsr #1
+; CHECK-NEXT: mov w8, #37251 // =0x9183
; CHECK-NEXT: mov w9, #95 // =0x5f
-; CHECK-NEXT: lsr w8, w8, #6
+; CHECK-NEXT: movk w8, #44150, lsl #16
+; CHECK-NEXT: umull x8, w0, w8
+; CHECK-NEXT: lsr x8, x8, #38
; CHECK-NEXT: msub w9, w8, w9, w0
; CHECK-NEXT: add w0, w9, w8
; CHECK-NEXT: ret
@@ -88,14 +82,14 @@ define i32 @dont_fold_urem_i32_umax(i32 %x) {
define i64 @dont_fold_urem_i64(i64 %x) {
; CHECK-LABEL: dont_fold_urem_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x9, #58849 // =0xe5e1
-; CHECK-NEXT: lsr x8, x0, #1
-; CHECK-NEXT: movk x9, #48148, lsl #16
-; CHECK-NEXT: movk x9, #33436, lsl #32
-; CHECK-NEXT: movk x9, #21399, lsl #48
-; CHECK-NEXT: umulh x8, x8, x9
+; CHECK-NEXT: mov x8, #42799 // =0xa72f
+; CHECK-NEXT: movk x8, #58848, lsl #16
+; CHECK-NEXT: movk x8, #48148, lsl #32
+; CHECK-NEXT: movk x8, #668, lsl #48
+; CHECK-NEXT: umulh x8, x0, x8
+; CHECK-NEXT: sub x9, x0, x8
+; CHECK-NEXT: add x8, x8, x9, lsr #1
; CHECK-NEXT: mov w9, #98 // =0x62
-; CHECK-NEXT: lsr x8, x8, #4
; CHECK-NEXT: msub x0, x8, x9, x0
; CHECK-NEXT: ret
%1 = urem i64 %x, 98
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
index ab67be9445ed3..80f3da01db42a 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
@@ -94,14 +94,13 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_urem_odd_undef1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #34079 // =0x851f
-; CHECK-NEXT: movk w8, #20971, lsl #16
+; CHECK-NEXT: mov w8, #28836 // =0x70a4
+; CHECK-NEXT: movk w8, #2621, lsl #16
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s
; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
; CHECK-NEXT: movi v2.4s, #25
-; CHECK-NEXT: ushr v1.4s, v1.4s, #3
; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
@@ -116,14 +115,16 @@ define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_urem_even_undef1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #34079 // =0x851f
-; CHECK-NEXT: movk w8, #20971, lsl #16
+; CHECK-NEXT: mov w8, #49807 // =0xc28f
+; CHECK-NEXT: movk w8, #10485, lsl #16
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s
; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: sub v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: usra v1.4s, v2.4s, #1
; CHECK-NEXT: movi v2.4s, #100
-; CHECK-NEXT: ushr v1.4s, v1.4s, #5
+; CHECK-NEXT: ushr v1.4s, v1.4s, #4
; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
diff --git a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
index 468a33ce5bfcf..8d6e6251b1ed2 100644
--- a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
@@ -5,23 +5,20 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; CHECK-LABEL: fold_urem_vec_1:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: movi v2.2s, #128, lsl #24
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0]
; CHECK-NEXT: adrp x8, .LCPI0_1
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_1]
-; CHECK-NEXT: adrp x8, .LCPI0_2
-; CHECK-NEXT: ushl v1.4h, v0.4h, v1.4h
-; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h
-; CHECK-NEXT: movi d2, #0000000000000000
-; CHECK-NEXT: shrn v1.4h, v1.4s, #16
+; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
; CHECK-NEXT: fneg d2, d2
+; CHECK-NEXT: shrn v1.4h, v1.4s, #16
; CHECK-NEXT: sub v3.4h, v0.4h, v1.4h
; CHECK-NEXT: umull v2.4s, v3.4h, v2.4h
; CHECK-NEXT: shrn v2.4h, v2.4s, #16
; CHECK-NEXT: add v1.4h, v2.4h, v1.4h
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_2]
-; CHECK-NEXT: adrp x8, .LCPI0_3
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_1]
+; CHECK-NEXT: adrp x8, .LCPI0_2
; CHECK-NEXT: ushl v1.4h, v1.4h, v2.4h
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_3]
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_2]
; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h
; CHECK-NEXT: ret
%1 = urem <4 x i16> %x, <i16 95, i16 124, i16 98, i16 1003>
@@ -31,12 +28,11 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; CHECK-LABEL: fold_urem_vec_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #44151 // =0xac77
+; CHECK-NEXT: mov w8, #690 // =0x2b2
; CHECK-NEXT: movi v2.4h, #95
; CHECK-NEXT: dup v1.4h, w8
; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
-; CHECK-NEXT: ushr v1.4s, v1.4s, #22
-; CHECK-NEXT: xtn v1.4h, v1.4s
+; CHECK-NEXT: shrn v1.4h, v1.4s, #16
; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h
; CHECK-NEXT: ret
%1 = urem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
@@ -48,12 +44,11 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; CHECK-LABEL: combine_urem_udiv:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #44151 // =0xac77
+; CHECK-NEXT: mov w8, #690 // =0x2b2
; CHECK-NEXT: movi v2.4h, #95
; CHECK-NEXT: dup v1.4h, w8
; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
-; CHECK-NEXT: ushr v1.4s, v1.4s, #22
-; CHECK-NEXT: xtn v1.4h, v1.4s
+; CHECK-NEXT: shrn v1.4h, v1.4s, #16
; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h
; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
@@ -69,12 +64,18 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; CHECK-LABEL: dont_fold_urem_power_of_two:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI3_0
+; CHECK-NEXT: movi v2.4h, #128, lsl #8
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI3_0]
; CHECK-NEXT: adrp x8, .LCPI3_1
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI3_1]
-; CHECK-NEXT: adrp x8, .LCPI3_2
; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
+; CHECK-NEXT: fneg d2, d2
; CHECK-NEXT: shrn v1.4h, v1.4s, #16
+; CHECK-NEXT: sub v3.4h, v0.4h, v1.4h
+; CHECK-NEXT: umull v2.4s, v3.4h, v2.4h
+; CHECK-NEXT: shrn v2.4h, v2.4s, #16
+; CHECK-NEXT: add v1.4h, v2.4h, v1.4h
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI3_1]
+; CHECK-NEXT: adrp x8, .LCPI3_2
; CHECK-NEXT: ushl v1.4h, v1.4h, v2.4h
; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI3_2]
; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h
@@ -88,25 +89,19 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; CHECK-LABEL: dont_fold_urem_one:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI4_0
-; CHECK-NEXT: movi d4, #0x0000000000ffff
+; CHECK-NEXT: movi d3, #0x0000000000ffff
+; CHECK-NEXT: movi d4, #0xffffffffffff0000
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_0]
; CHECK-NEXT: adrp x8, .LCPI4_1
-; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_1]
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_1]
; CHECK-NEXT: adrp x8, .LCPI4_2
; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
; CHECK-NEXT: shrn v1.4h, v1.4s, #16
-; CHECK-NEXT: sub v2.4h, v0.4h, v1.4h
-; CHECK-NEXT: umull v2.4s, v2.4h, v3.4h
-; CHECK-NEXT: movi d3, #0xffffffffffff0000
-; CHECK-NEXT: shrn v2.4h, v2.4s, #16
-; CHECK-NEXT: add v1.4h, v2.4h, v1.4h
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_2]
-; CHECK-NEXT: adrp x8, .LCPI4_3
; CHECK-NEXT: ushl v1.4h, v1.4h, v2.4h
-; CHECK-NEXT: and v2.8b, v0.8b, v4.8b
-; CHECK-NEXT: and v1.8b, v1.8b, v3.8b
+; CHECK-NEXT: and v2.8b, v0.8b, v3.8b
+; CHECK-NEXT: and v1.8b, v1.8b, v4.8b
; CHECK-NEXT: orr v1.8b, v2.8b, v1.8b
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_3]
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_2]
; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h
; CHECK-NEXT: ret
%1 = urem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
@@ -126,36 +121,33 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
; CHECK-LABEL: dont_fold_urem_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #17097 // =0x42c9
-; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: mov x10, v1.d[1]
-; CHECK-NEXT: movk x8, #45590, lsl #16
+; CHECK-NEXT: mov x8, #8547 // =0x2163
+; CHECK-NEXT: fmov x10, d1
+; CHECK-NEXT: mov x9, v1.d[1]
+; CHECK-NEXT: movk x8, #22795, lsl #16
+; CHECK-NEXT: mov x12, #35795 // =0x8bd3
; CHECK-NEXT: mov x11, v0.d[1]
-; CHECK-NEXT: mov x12, #12109 // =0x2f4d
-; CHECK-NEXT: movk x8, #34192, lsl #32
-; CHECK-NEXT: movk x12, #52170, lsl #16
+; CHECK-NEXT: movk x8, #17096, lsl #32
+; CHECK-NEXT: movk x12, #29426, lsl #16
+; CHECK-NEXT: mov x13, #54513 // =0xd4f1
+; CHECK-NEXT: movk x8, #45590, lsl #48
+; CHECK-NEXT: movk x12, #56339, lsl #32
+; CHECK-NEXT: movk x13, #400, lsl #16
+; CHECK-NEXT: umulh x8, x10, x8
+; CHECK-NEXT: movk x12, #12374, lsl #48
+; CHECK-NEXT: movk x13, #20242, lsl #32
+; CHECK-NEXT: movk x13, #6413, lsl #48
+; CHECK-NEXT: mov w14, #23 // =0x17
; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: movk x8, #25644, lsl #48
-; CHECK-NEXT: movk x12, #28749, lsl #32
-; CHECK-NEXT: umulh x8, x9, x8
-; CHECK-NEXT: movk x12, #49499, lsl #48
-; CHECK-NEXT: lsr x13, x11, #1
-; CHECK-NEXT: umulh x12, x10, x12
-; CHECK-NEXT: sub x14, x9, x8
-; CHECK-NEXT: add x8, x8, x14, lsr #1
-; CHECK-NEXT: mov x14, #21445 // =0x53c5
-; CHECK-NEXT: movk x14, #1603, lsl #16
-; CHECK-NEXT: movk x14, #15432, lsl #32
+; CHECK-NEXT: umulh x12, x9, x12
+; CHECK-NEXT: umulh x13, x11, x13
; CHECK-NEXT: lsr x8, x8, #4
-; CHECK-NEXT: movk x14, #25653, lsl #48
-; CHECK-NEXT: umulh x13, x13, x14
-; CHECK-NEXT: mov w14, #23 // =0x17
-; CHECK-NEXT: msub x8, x8, x14, x9
-; CHECK-NEXT: lsr x9, x12, #12
+; CHECK-NEXT: msub x8, x8, x14, x10
+; CHECK-NEXT: lsr x10, x12, #10
; CHECK-NEXT: mov w12, #5423 // =0x152f
-; CHECK-NEXT: msub x9, x9, x12, x10
+; CHECK-NEXT: msub x9, x10, x12, x9
+; CHECK-NEXT: lsr x10, x13, #6
; CHECK-NEXT: mov w12, #654 // =0x28e
-; CHECK-NEXT: lsr x10, x13, #7
; CHECK-NEXT: msub x10, x10, x12, x11
; CHECK-NEXT: fmov d1, x8
; CHECK-NEXT: mov v1.d[1], x9
@@ -168,12 +160,12 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
define <16 x i8> @fold_urem_v16i8(<16 x i8> %x) {
; CHECK-LABEL: fold_urem_v16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.16b, #205
+; CHECK-NEXT: movi v1.16b, #51
; CHECK-NEXT: umull2 v2.8h, v0.16b, v1.16b
; CHECK-NEXT: umull v1.8h, v0.8b, v1.8b
; CHECK-NEXT: uzp2 v1.16b, v1.16b, v2.16b
; CHECK-NEXT: movi v2.16b, #10
-; CHECK-NEXT: ushr v1.16b, v1.16b, #3
+; CHECK-NEXT: ushr v1.16b, v1.16b, #1
; CHECK-NEXT: mls v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
%1 = urem <16 x i8> %x, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
@@ -183,11 +175,11 @@ define <16 x i8> @fold_urem_v16i8(<16 x i8> %x) {
define <8 x i8> @fold_urem_v8i8(<8 x i8> %x) {
; CHECK-LABEL: fold_urem_v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.8b, #205
+; CHECK-NEXT: movi v1.8b, #51
; CHECK-NEXT: movi v2.8b, #10
; CHECK-NEXT: umull v1.8h, v0.8b, v1.8b
; CHECK-NEXT: shrn v1.8b, v1.8h, #8
-; CHECK-NEXT: ushr v1.8b, v1.8b, #3
+; CHECK-NEXT: ushr v1.8b, v1.8b, #1
; CHECK-NEXT: mls v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
%1 = urem <8 x i8> %x, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
@@ -197,13 +189,12 @@ define <8 x i8> @fold_urem_v8i8(<8 x i8> %x) {
define <8 x i16> @fold_urem_v8i16(<8 x i16> %x) {
; CHECK-LABEL: fold_urem_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #52429 // =0xcccd
-; CHECK-NEXT: dup v1.8h, w8
+; CHECK-NEXT: movi v1.16b, #51
; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h
; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
; CHECK-NEXT: uzp2 v1.8h, v1.8h, v2.8h
; CHECK-NEXT: movi v2.8h, #10
-; CHECK-NEXT: ushr v1.8h, v1.8h, #3
+; CHECK-NEXT: ushr v1.8h, v1.8h, #1
; CHECK-NEXT: mls v0.8h, v1.8h, v2.8h
; CHECK-NEXT: ret
%1 = urem <8 x i16> %x, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
@@ -213,11 +204,10 @@ define <8 x i16> @fold_urem_v8i16(<8 x i16> %x) {
define <4 x i16> @fold_urem_v4i16(<4 x i16> %x) {
; CHECK-LABEL: fold_urem_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #52429 // =0xcccd
+; CHECK-NEXT: movi v1.8b, #51
; CHECK-NEXT: movi v2.4h, #10
-; CHECK-NEXT: dup v1.4h, w8
; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
-; CHECK-NEXT: ushr v1.4s, v1.4s, #19
+; CHECK-NEXT: ushr v1.4s, v1.4s, #17
; CHECK-NEXT: xtn v1.4h, v1.4s
; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h
; CHECK-NEXT: ret
@@ -228,14 +218,12 @@ define <4 x i16> @fold_urem_v4i16(<4 x i16> %x) {
define <4 x i32> @fold_urem_v4i32(<4 x i32> %x) {
; CHECK-LABEL: fold_urem_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #52429 // =0xcccd
-; CHECK-NEXT: movk w8, #52428, lsl #16
-; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: movi v1.16b, #51
; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s
; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
; CHECK-NEXT: movi v2.4s, #10
-; CHECK-NEXT: ushr v1.4s, v1.4s, #3
+; CHECK-NEXT: ushr v1.4s, v1.4s, #1
; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
; CHECK-NEXT: ret
%1 = urem <4 x i32> %x, <i32 10, i32 10, i32 10, i32 10>
@@ -245,12 +233,10 @@ define <4 x i32> @fold_urem_v4i32(<4 x i32> %x) {
define <2 x i32> @fold_urem_v2i32(<2 x i32> %x) {
; CHECK-LABEL: fold_urem_v2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #52429 // =0xcccd
+; CHECK-NEXT: movi v1.8b, #51
; CHECK-NEXT: movi v2.2s, #10
-; CHECK-NEXT: movk w8, #52428, lsl #16
-; CHECK-NEXT: dup v1.2s, w8
; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: ushr v1.2d, v1.2d, #35
+; CHECK-NEXT: ushr v1.2d, v1.2d, #33
; CHECK-NEXT: xtn v1.2s, v1.2d
; CHECK-NEXT: mls v0.2s, v1.2s, v2.2s
; CHECK-NEXT: ret
@@ -262,15 +248,14 @@ define <2 x i64> @fold_urem_v2i64(<2 x i64> %x) {
; CHECK-LABEL: fold_urem_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov x10, d0
-; CHECK-NEXT: mov x8, #-3689348814741910324 // =0xcccccccccccccccc
+; CHECK-NEXT: mov x8, #3689348814741910323 // =0x3333333333333333
; CHECK-NEXT: mov x9, v0.d[1]
-; CHECK-NEXT: movk x8, #52429
; CHECK-NEXT: mov w12, #10 // =0xa
; CHECK-NEXT: umulh x11, x10, x8
; CHECK-NEXT: umulh x8, x9, x8
-; CHECK-NEXT: lsr x11, x11, #3
+; CHECK-NEXT: lsr x11, x11, #1
; CHECK-NEXT: msub x10, x11, x12, x10
-; CHECK-NEXT: lsr x8, x8, #3
+; CHECK-NEXT: lsr x8, x8, #1
; CHECK-NEXT: msub x8, x8, x12, x9
; CHECK-NEXT: fmov d0, x10
; CHECK-NEXT: mov v0.d[1], x8
@@ -284,11 +269,10 @@ define <1 x i64> @fold_urem_v1i64(<1 x i64> %x) {
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fmov x9, d0
-; CHECK-NEXT: mov x8, #-3689348814741910324 // =0xcccccccccccccccc
+; CHECK-NEXT: mov x8, #3689348814741910323 // =0x3333333333333333
; CHECK-NEXT: mov w10, #10 // =0xa
-; CHECK-NEXT: movk x8, #52429
; CHECK-NEXT: umulh x8, x9, x8
-; CHECK-NEXT: lsr x8, x8, #3
+; CHECK-NEXT: lsr x8, x8, #1
; CHECK-NEXT: msub x8, x8, x10, x9
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
index cd01148fa7dd7..07477849f6455 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
@@ -206,7 +206,12 @@ define i32 @v_udiv_i32_pow2k_denom(i32 %num) {
; CHECK-LABEL: v_udiv_i32_pow2k_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_lshrrev_b32_e32 v0, 12, v0
+; CHECK-NEXT: v_bfrev_b32_e32 v1, -2
+; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_lshrrev_b32_e32 v0, 11, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = udiv i32 %num, 4096
ret i32 %result
@@ -216,8 +221,17 @@ define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) {
; CHECK-LABEL: v_udiv_v2i32_pow2k_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_lshrrev_b32_e32 v0, 12, v0
-; CHECK-NEXT: v_lshrrev_b32_e32 v1, 12, v1
+; CHECK-NEXT: v_bfrev_b32_e32 v2, -2
+; CHECK-NEXT: v_mul_hi_u32 v3, v0, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; CHECK-NEXT: v_lshrrev_b32_e32 v1, 1, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT: v_lshrrev_b32_e32 v0, 11, v0
+; CHECK-NEXT: v_lshrrev_b32_e32 v1, 11, v1
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = udiv <2 x i32> %num, <i32 4096, i32 4096>
ret <2 x i32> %result
@@ -227,11 +241,8 @@ define i32 @v_udiv_i32_oddk_denom(i32 %num) {
; CHECK-LABEL: v_udiv_i32_oddk_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, 0xb2a50881
-; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_mov_b32_e32 v1, 0xd952843f
+; CHECK-NEXT: v_mul_hi_u32 v0, v0, v1
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 20, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = udiv i32 %num, 1235195
@@ -242,15 +253,9 @@ define <2 x i32> @v_udiv_v2i32_oddk_denom(<2 x i32> %num) {
; CHECK-LABEL: v_udiv_v2i32_oddk_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v2, 0xb2a50881
-; CHECK-NEXT: v_mul_hi_u32 v3, v0, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT: v_mov_b32_e32 v2, 0xd952843f
+; CHECK-NEXT: v_mul_hi_u32 v0, v0, v2
+; CHECK-NEXT: v_mul_hi_u32 v1, v1, v2
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 20, v0
; CHECK-NEXT: v_lshrrev_b32_e32 v1, 20, v1
; CHECK-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index d15551365707b..6f99bf79fc4bd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -959,7 +959,34 @@ define i64 @v_udiv_i64_pow2k_denom(i64 %num) {
; CHECK-LABEL: v_udiv_i64_pow2k_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_lshr_b64 v[0:1], v[0:1], 12
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
+; CHECK-NEXT: v_bfrev_b32_e32 v3, -2
+; CHECK-NEXT: v_mul_hi_u32 v4, v0, -1
+; CHECK-NEXT: v_mul_hi_u32 v5, v1, -1
+; CHECK-NEXT: v_mul_lo_u32 v6, v0, v3
+; CHECK-NEXT: v_mul_lo_u32 v7, v1, v3
+; CHECK-NEXT: v_mul_hi_u32 v8, v0, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT: v_lshr_b64 v[0:1], v[0:1], 1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT: v_lshr_b64 v[0:1], v[0:1], 11
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = udiv i64 %num, 4096
ret i64 %result
@@ -969,8 +996,61 @@ define <2 x i64> @v_udiv_v2i64_pow2k_denom(<2 x i64> %num) {
; CHECK-LABEL: v_udiv_v2i64_pow2k_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_lshr_b64 v[0:1], v[0:1], 12
-; CHECK-NEXT: v_lshr_b64 v[2:3], v[2:3], 12
+; CHECK-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
+; CHECK-NEXT: v_bfrev_b32_e32 v5, -2
+; CHECK-NEXT: v_mul_hi_u32 v6, v0, -1
+; CHECK-NEXT: v_mul_hi_u32 v7, v1, -1
+; CHECK-NEXT: v_sub_i32_e32 v8, vcc, 0, v3
+; CHECK-NEXT: v_mul_hi_u32 v9, v2, -1
+; CHECK-NEXT: v_mul_hi_u32 v10, v3, -1
+; CHECK-NEXT: v_mul_lo_u32 v11, v0, v5
+; CHECK-NEXT: v_mul_lo_u32 v12, v1, v5
+; CHECK-NEXT: v_mul_hi_u32 v13, v0, v5
+; CHECK-NEXT: v_mul_hi_u32 v14, v1, v5
+; CHECK-NEXT: v_mul_lo_u32 v15, v2, v5
+; CHECK-NEXT: v_mul_lo_u32 v16, v3, v5
+; CHECK-NEXT: v_mul_hi_u32 v17, v2, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v3, v5
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v11
+; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v15
+; CHECK-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v16, v10
+; CHECK-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v13
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v17
+; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v11, v4
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v15, v8
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v16, v10
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v10, v9
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v14, v6
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
+; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc
+; CHECK-NEXT: v_lshr_b64 v[0:1], v[0:1], 1
+; CHECK-NEXT: v_lshr_b64 v[2:3], v[2:3], 1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; CHECK-NEXT: v_lshr_b64 v[0:1], v[0:1], 11
+; CHECK-NEXT: v_lshr_b64 v[2:3], v[2:3], 11
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = udiv <2 x i64> %num, <i64 4096, i64 4096>
ret <2 x i64> %result
@@ -980,7 +1060,7 @@ define i64 @v_udiv_i64_oddk_denom(i64 %num) {
; CHECK-LABEL: v_udiv_i64_oddk_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v2, 0x1fb03c31
+; CHECK-NEXT: v_mov_b32_e32 v2, 0x1fb03c2f
; CHECK-NEXT: v_mov_b32_e32 v3, 0xd9528440
; CHECK-NEXT: v_mul_lo_u32 v4, v1, v2
; CHECK-NEXT: v_mul_lo_u32 v5, v0, v3
@@ -1013,7 +1093,7 @@ define <2 x i64> @v_udiv_v2i64_oddk_denom(<2 x i64> %num) {
; CHECK-LABEL: v_udiv_v2i64_oddk_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v4, 0x1fb03c31
+; CHECK-NEXT: v_mov_b32_e32 v4, 0x1fb03c2f
; CHECK-NEXT: v_mov_b32_e32 v5, 0xd9528440
; CHECK-NEXT: v_mul_lo_u32 v6, v1, v4
; CHECK-NEXT: v_mul_lo_u32 v7, v0, v5
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 7cf18171a6cd7..42c8d9dc002d5 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -5399,14 +5399,11 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
-; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881
+; GFX6-NEXT: v_mov_b32_e32 v0, 0xd952843f
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
@@ -5417,11 +5414,8 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_hi_u32 s2, s4, 0xb2a50881
-; GFX9-NEXT: s_sub_i32 s3, s4, s2
-; GFX9-NEXT: s_lshr_b32 s3, s3, 1
-; GFX9-NEXT: s_add_i32 s3, s3, s2
-; GFX9-NEXT: s_lshr_b32 s2, s3, 20
+; GFX9-NEXT: s_mul_hi_u32 s2, s4, 0xd952843f
+; GFX9-NEXT: s_lshr_b32 s2, s2, 20
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -5560,14 +5554,10 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, <
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0
+; GFX6-NEXT: v_mul_hi_u32 v1, s3, v0
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: s_lshr_b32 s0, s2, 12
; GFX6-NEXT: s_mov_b32 s5, s1
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s3, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v0
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
@@ -5577,12 +5567,8 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, <
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_hi_u32 s4, s3, 0x100101
-; GFX9-NEXT: s_sub_i32 s3, s3, s4
-; GFX9-NEXT: s_lshr_b32 s3, s3, 1
-; GFX9-NEXT: s_add_i32 s3, s3, s4
; GFX9-NEXT: s_lshr_b32 s2, s2, 12
-; GFX9-NEXT: s_lshr_b32 s3, s3, 11
+; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x100101
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -5786,14 +5772,11 @@ define amdgpu_kernel void @urem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s4, s[2:3], 0xb
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
-; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881
+; GFX6-NEXT: v_mov_b32_e32 v0, 0xd952843f
; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0
; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2
; GFX6-NEXT: s_mov_b32 s2, -1
@@ -5807,11 +5790,8 @@ define amdgpu_kernel void @urem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_hi_u32 s2, s4, 0xb2a50881
-; GFX9-NEXT: s_sub_i32 s3, s4, s2
-; GFX9-NEXT: s_lshr_b32 s3, s3, 1
-; GFX9-NEXT: s_add_i32 s3, s3, s2
-; GFX9-NEXT: s_lshr_b32 s2, s3, 20
+; GFX9-NEXT: s_mul_hi_u32 s2, s4, 0xd952843f
+; GFX9-NEXT: s_lshr_b32 s2, s2, 20
; GFX9-NEXT: s_mul_i32 s2, s2, 0x12d8fb
; GFX9-NEXT: s_sub_i32 s2, s4, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
index 5fbcd0bf66999..7eec4bf885642 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
@@ -13,9 +13,9 @@ define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocaptu
; CHECK-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
; CHECK-NEXT: s_and_b32 s4, s0, 0xffff
; CHECK-NEXT: s_mov_b32 s1, 0
-; CHECK-NEXT: s_mul_i32 s6, s4, 0xaaab
+; CHECK-NEXT: s_mul_i32 s6, s4, 0x5555
; CHECK-NEXT: s_lshl_b64 s[4:5], s[0:1], 2
-; CHECK-NEXT: s_lshr_b32 s1, s6, 19
+; CHECK-NEXT: s_lshr_b32 s1, s6, 18
; CHECK-NEXT: s_mul_i32 s1, s1, 12
; CHECK-NEXT: s_sub_i32 s6, s0, s1
; CHECK-NEXT: s_and_b32 s7, s6, 0xffff
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index f0ab3a5342e01..1eaddf6c34664 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -4644,35 +4644,354 @@ define i128 @v_udiv_i128_v_pow2k(i128 %lhs) {
; GFX9-G-LABEL: v_udiv_i128_v_pow2k:
; GFX9-G: ; %bb.0:
; GFX9-G-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-G-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3]
-; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v4
-; GFX9-G-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v3
+; GFX9-G-NEXT: v_mul_hi_u32 v6, v0, -1
+; GFX9-G-NEXT: v_sub_u32_e32 v4, 0, v1
+; GFX9-G-NEXT: v_sub_u32_e32 v5, 0, v0
+; GFX9-G-NEXT: v_add_co_u32_e32 v7, vcc, v4, v5
+; GFX9-G-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v7, vcc, v7, v6
+; GFX9-G-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-G-NEXT: v_mul_hi_u32 v9, v1, -1
+; GFX9-G-NEXT: v_add_u32_e32 v7, v8, v7
+; GFX9-G-NEXT: v_sub_u32_e32 v8, 0, v2
+; GFX9-G-NEXT: v_add_co_u32_e32 v10, vcc, v8, v4
+; GFX9-G-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v5, vcc, v10, v5
+; GFX9-G-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9
+; GFX9-G-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6
+; GFX9-G-NEXT: v_add3_u32 v10, v11, v10, v12
+; GFX9-G-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7
+; GFX9-G-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-G-NEXT: v_add3_u32 v5, v10, v11, v5
+; GFX9-G-NEXT: v_bfrev_b32_e32 v10, -2
+; GFX9-G-NEXT: v_mul_lo_u32 v11, v0, v10
+; GFX9-G-NEXT: v_sub_u32_e32 v7, 0, v3
+; GFX9-G-NEXT: v_mul_hi_u32 v12, v2, -1
+; GFX9-G-NEXT: v_add_co_u32_e32 v8, vcc, v7, v8
+; GFX9-G-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4
+; GFX9-G-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v4, vcc, v4, v11
+; GFX9-G-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v4, vcc, v4, v12
+; GFX9-G-NEXT: v_add3_u32 v11, v13, v14, v11
+; GFX9-G-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9
+; GFX9-G-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9-G-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9-G-NEXT: v_add3_u32 v11, v11, v14, v15
+; GFX9-G-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-G-NEXT: v_mul_lo_u32 v5, v1, v10
+; GFX9-G-NEXT: v_add3_u32 v4, v11, v6, v4
+; GFX9-G-NEXT: v_mul_hi_u32 v6, v3, -1
+; GFX9-G-NEXT: v_mul_hi_u32 v11, v0, v10
+; GFX9-G-NEXT: v_add_co_u32_e32 v5, vcc, v8, v5
+; GFX9-G-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6
+; GFX9-G-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v5, vcc, v5, v12
+; GFX9-G-NEXT: v_add3_u32 v8, v13, v8, v14
+; GFX9-G-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9
+; GFX9-G-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v5, vcc, v5, v11
+; GFX9-G-NEXT: v_mul_lo_u32 v11, v2, v10
+; GFX9-G-NEXT: v_add3_u32 v8, v8, v13, v9
+; GFX9-G-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4
+; GFX9-G-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-G-NEXT: v_add3_u32 v5, v8, v9, v5
+; GFX9-G-NEXT: v_mul_hi_u32 v8, v1, v10
+; GFX9-G-NEXT: v_add_co_u32_e32 v7, vcc, v7, v11
+; GFX9-G-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v7, vcc, v7, v6
+; GFX9-G-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v7, vcc, v7, v12
+; GFX9-G-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8
+; GFX9-G-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5
+; GFX9-G-NEXT: v_add3_u32 v9, v9, v11, v12
+; GFX9-G-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-G-NEXT: v_mul_lo_u32 v11, v3, v10
+; GFX9-G-NEXT: v_add3_u32 v7, v9, v8, v7
+; GFX9-G-NEXT: v_mul_hi_u32 v8, v2, v10
+; GFX9-G-NEXT: v_add_co_u32_e32 v6, vcc, v11, v6
+; GFX9-G-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v6, vcc, v6, v8
+; GFX9-G-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-G-NEXT: v_add_u32_e32 v8, v9, v8
+; GFX9-G-NEXT: v_mul_hi_u32 v9, v3, v10
+; GFX9-G-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7
+; GFX9-G-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-G-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9-G-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX9-G-NEXT: v_add3_u32 v7, v8, v7, v9
+; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1]
+; GFX9-G-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc
+; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
+; GFX9-G-NEXT: v_lshl_or_b32 v8, v2, 31, v1
+; GFX9-G-NEXT: v_lshrrev_b64 v[1:2], 1, v[2:3]
+; GFX9-G-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
+; GFX9-G-NEXT: v_addc_co_u32_e32 v0, vcc, v8, v5, vcc
+; GFX9-G-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9-G-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v7, vcc
; GFX9-G-NEXT: v_mov_b32_e32 v3, 0
; GFX9-G-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-G-O0-LABEL: v_udiv_i128_v_pow2k:
; GFX9-G-O0: ; %bb.0:
; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b32 s7, -1
+; GFX9-G-O0-NEXT: s_mov_b32 s6, -1
+; GFX9-G-O0-NEXT: s_mov_b32 s5, -1
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 0x7fffffff
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v2, v2, v3
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s8
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v4, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-G-O0-NEXT: v_mul_hi_u32 v5, v6, v5
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[8:9], v2, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[8:9]
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[8:9], v4, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[8:9]
+; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v2, v4
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v2, v2, v0
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s8
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v4, v3
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s8
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v8, v5, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-G-O0-NEXT: v_mul_hi_u32 v9, v3, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-G-O0-NEXT: v_mul_hi_u32 v5, v6, v5
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[8:9], v2, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[8:9]
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[8:9], v4, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[8:9]
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[8:9], v4, v9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[8:9]
+; GFX9-G-O0-NEXT: v_add3_u32 v2, v2, v8, v9
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v5, s[8:9], v4, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[8:9]
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v5, s[8:9], v5, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[8:9]
+; GFX9-G-O0-NEXT: v_add3_u32 v7, v2, v4, v5
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v2, v2, v1
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s8
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v4, v0
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s8
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v10, v5, v3
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4
+; GFX9-G-O0-NEXT: v_mul_lo_u32 v11, v6, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-G-O0-NEXT: v_mul_hi_u32 v8, v0, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-G-O0-NEXT: v_mul_hi_u32 v9, v3, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-G-O0-NEXT: v_mul_hi_u32 v5, v6, v5
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[8:9], v2, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[8:9]
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[8:9], v4, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[8:9]
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[8:9], v4, v11
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[8:9]
+; GFX9-G-O0-NEXT: v_add3_u32 v2, v2, v10, v11
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[8:9], v4, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[8:9]
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[8:9], v4, v9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[8:9]
+; GFX9-G-O0-NEXT: v_add3_u32 v2, v2, v8, v9
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v5, s[8:9], v4, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[8:9]
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v5, s[8:9], v5, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[8:9]
+; GFX9-G-O0-NEXT: v_add3_u32 v7, v2, v4, v5
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v2, v2, v1
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s8
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v4, v0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4
+; GFX9-G-O0-NEXT: v_mul_lo_u32 v10, v3, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-G-O0-NEXT: v_mul_hi_u32 v11, v1, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-G-O0-NEXT: v_mul_hi_u32 v8, v0, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-G-O0-NEXT: v_mul_hi_u32 v9, v3, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4
+; GFX9-G-O0-NEXT: v_mul_hi_u32 v5, v6, v5
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[8:9], v2, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v2, v2, v12, s[8:9]
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[8:9], v4, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[8:9]
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[8:9], v4, v11
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[8:9]
+; GFX9-G-O0-NEXT: v_add3_u32 v2, v2, v10, v11
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[8:9], v4, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[8:9]
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[8:9], v4, v9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[8:9]
+; GFX9-G-O0-NEXT: v_add3_u32 v2, v2, v8, v9
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v5, s[8:9], v4, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[8:9]
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v7, s[8:9], v5, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[8:9]
+; GFX9-G-O0-NEXT: v_add3_u32 v8, v2, v4, v5
+; GFX9-G-O0-NEXT: s_mov_b32 s7, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s7
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v2, v2, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-G-O0-NEXT: v_mul_lo_u32 v4, v0, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-G-O0-NEXT: v_mul_hi_u32 v9, v1, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-G-O0-NEXT: v_mul_hi_u32 v10, v0, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4
+; GFX9-G-O0-NEXT: v_mul_hi_u32 v5, v3, v5
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v2, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v2, v2, v11, s[6:7]
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v4, v9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[6:7]
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v4, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[6:7]
+; GFX9-G-O0-NEXT: v_add3_u32 v2, v2, v9, v10
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v4, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[6:7]
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v5, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[6:7]
+; GFX9-G-O0-NEXT: v_add3_u32 v8, v2, v4, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-G-O0-NEXT: v_mul_lo_u32 v2, v1, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s5
+; GFX9-G-O0-NEXT: v_mul_hi_u32 v4, v1, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s4
+; GFX9-G-O0-NEXT: v_mul_hi_u32 v9, v0, v9
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v2, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[6:7]
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v4, v9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7]
+; GFX9-G-O0-NEXT: v_add_u32_e64 v2, v2, v9
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v4, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[6:7]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s4
+; GFX9-G-O0-NEXT: v_mul_hi_u32 v9, v1, v9
+; GFX9-G-O0-NEXT: v_add3_u32 v2, v2, v8, v9
+; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v10, s[4:5], v6, v7
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v3, s[4:5], v3, v5, s[4:5]
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v0, s[4:5], v0, v4, s[4:5]
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v1, s[4:5], v1, v2, s[4:5]
; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1
+; GFX9-G-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-G-O0-NEXT: v_lshrrev_b64 v[11:12], v1, v[10:11]
+; GFX9-G-O0-NEXT: s_mov_b32 s5, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-G-O0-NEXT: s_mov_b32 s5, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v11
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v12
+; GFX9-G-O0-NEXT: v_or_b32_e64 v6, v6, v10
+; GFX9-G-O0-NEXT: v_lshl_or_b32 v0, v0, v1, v3
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-G-O0-NEXT: v_lshrrev_b64 v[8:9], v1, v[8:9]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v9
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v6, s[4:5], v6, v7
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v5, s[4:5]
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v3, s[4:5], v3, v4, s[4:5]
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v2, s[4:5], v1, v2, s[4:5]
; GFX9-G-O0-NEXT: s_mov_b32 s4, 0
; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[4:5]
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4
; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v4
-; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v2
-; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
-; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v2, v3
+; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v3
; GFX9-G-O0-NEXT: s_mov_b32 s4, 0
; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s4
; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index dfd9a650ff0e9..e291f829498a0 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -1192,7 +1192,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; SI-NEXT: s_mov_b32 s2, 0xfabbd9c1
+; SI-NEXT: s_mov_b32 s2, 0xfabbd9bf
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -1212,7 +1212,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s2, 0xfabbd9c1
+; VI-NEXT: s_mov_b32 s2, 0xfabbd9bf
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1228,7 +1228,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_dword v0, v[0:1]
-; GCN-NEXT: s_mov_b32 s2, 0xfabbd9c1
+; GCN-NEXT: s_mov_b32 s2, 0xfabbd9bf
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_hi_u32 v2, v0, s2
@@ -1244,7 +1244,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
-; GFX1030-NEXT: v_mul_hi_u32 v1, 0xfabbd9c1, v1
+; GFX1030-NEXT: v_mul_hi_u32 v1, 0xfabbd9bf, v1
; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 25, v1
; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT: s_endpgm
@@ -1263,7 +1263,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MULHI * T0.X, T0.X, literal.x,
-; EG-NEXT: -88352319(-4.876880e+35), 0(0.000000e+00)
+; EG-NEXT: -88352321(-4.876880e+35), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, PS, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 25(3.503246e-44), 2(2.802597e-45)
@@ -1286,12 +1286,12 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; SI-NEXT: s_mov_b32 s2, 0x7d5deca3
+; SI-NEXT: s_mov_b32 s2, 0x3eaef651
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_hi_u32 v0, v0, s2
-; SI-NEXT: v_lshrrev_b32_e32 v0, 24, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 23, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -1306,12 +1306,12 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s2, 0x7d5deca3
+; VI-NEXT: s_mov_b32 s2, 0x3eaef651
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_hi_u32 v0, v0, s2
-; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v0
+; VI-NEXT: v_lshrrev_b32_e32 v0, 23, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@@ -1322,12 +1322,12 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_dword v0, v[0:1]
-; GCN-NEXT: s_mov_b32 s2, 0x7d5deca3
+; GCN-NEXT: s_mov_b32 s2, 0x3eaef651
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_hi_u32 v2, v0, s2
; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 24, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 23, v2
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
;
@@ -1338,8 +1338,8 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
-; GFX1030-NEXT: v_mul_hi_u32 v1, 0x7d5deca3, v1
-; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 24, v1
+; GFX1030-NEXT: v_mul_hi_u32 v1, 0x3eaef651, v1
+; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 23, v1
; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT: s_endpgm
;
@@ -1357,10 +1357,10 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MULHI * T0.X, T0.X, literal.x,
-; EG-NEXT: 2103307427(1.843675e+37), 0(0.000000e+00)
+; EG-NEXT: 1051653713(3.417230e-01), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, PS, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 24(3.363116e-44), 2(2.802597e-45)
+; EG-NEXT: 23(3.222986e-44), 2(2.802597e-45)
%b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
%a = load i32, ptr addrspace(1) %in
%result = udiv i32 %a, 34259183
@@ -2055,22 +2055,18 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; SI-NEXT: s_mov_b32 s0, 0x1389c755
+; SI-NEXT: s_mov_b32 s0, 0x4e271d53
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v0, 2, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 2, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3
; SI-NEXT: v_mul_hi_u32 v0, v0, s0
; SI-NEXT: v_mul_hi_u32 v1, v1, s0
; SI-NEXT: v_mul_hi_u32 v2, v2, s0
; SI-NEXT: v_mul_hi_u32 v3, v3, s0
-; SI-NEXT: v_lshrrev_b32_e32 v0, 10, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 10, v1
-; SI-NEXT: v_lshrrev_b32_e32 v2, 10, v2
-; SI-NEXT: v_lshrrev_b32_e32 v3, 10, v3
+; SI-NEXT: v_lshrrev_b32_e32 v0, 14, v0
+; SI-NEXT: v_lshrrev_b32_e32 v1, 14, v1
+; SI-NEXT: v_lshrrev_b32_e32 v2, 14, v2
+; SI-NEXT: v_lshrrev_b32_e32 v3, 14, v3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -2083,22 +2079,18 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; VI-NEXT: s_mov_b32 s0, 0x1389c755
+; VI-NEXT: s_mov_b32 s0, 0x4e271d53
; VI-NEXT: s_mov_b32 s4, s2
; VI-NEXT: s_mov_b32 s5, s3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v0, 2, v0
-; VI-NEXT: v_lshrrev_b32_e32 v1, 2, v1
-; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
-; VI-NEXT: v_lshrrev_b32_e32 v3, 2, v3
; VI-NEXT: v_mul_hi_u32 v0, v0, s0
; VI-NEXT: v_mul_hi_u32 v1, v1, s0
; VI-NEXT: v_mul_hi_u32 v2, v2, s0
; VI-NEXT: v_mul_hi_u32 v3, v3, s0
-; VI-NEXT: v_lshrrev_b32_e32 v0, 10, v0
-; VI-NEXT: v_lshrrev_b32_e32 v1, 10, v1
-; VI-NEXT: v_lshrrev_b32_e32 v2, 10, v2
-; VI-NEXT: v_lshrrev_b32_e32 v3, 10, v3
+; VI-NEXT: v_lshrrev_b32_e32 v0, 14, v0
+; VI-NEXT: v_lshrrev_b32_e32 v1, 14, v1
+; VI-NEXT: v_lshrrev_b32_e32 v2, 14, v2
+; VI-NEXT: v_lshrrev_b32_e32 v3, 14, v3
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
@@ -2109,22 +2101,18 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GCN-NEXT: s_mov_b32 s0, 0x1389c755
+; GCN-NEXT: s_mov_b32 s0, 0x4e271d53
; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: v_mov_b32_e32 v5, s3
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 2, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v1
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 2, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 2, v3
; GCN-NEXT: v_mul_hi_u32 v0, v0, s0
; GCN-NEXT: v_mul_hi_u32 v1, v1, s0
; GCN-NEXT: v_mul_hi_u32 v2, v2, s0
; GCN-NEXT: v_mul_hi_u32 v3, v3, s0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 10, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 10, v1
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 10, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 10, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 14, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 14, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 14, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 14, v3
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
;
@@ -2135,18 +2123,14 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
-; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 2, v0
-; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 2, v1
-; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 2, v2
-; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 2, v3
-; GFX1030-NEXT: v_mul_hi_u32 v0, 0x1389c755, v0
-; GFX1030-NEXT: v_mul_hi_u32 v1, 0x1389c755, v1
-; GFX1030-NEXT: v_mul_hi_u32 v2, 0x1389c755, v2
-; GFX1030-NEXT: v_mul_hi_u32 v3, 0x1389c755, v3
-; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 10, v0
-; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 10, v1
-; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 10, v2
-; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 10, v3
+; GFX1030-NEXT: v_mul_hi_u32 v0, 0x4e271d53, v0
+; GFX1030-NEXT: v_mul_hi_u32 v1, 0x4e271d53, v1
+; GFX1030-NEXT: v_mul_hi_u32 v2, 0x4e271d53, v2
+; GFX1030-NEXT: v_mul_hi_u32 v3, 0x4e271d53, v3
+; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 14, v0
+; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 14, v1
+; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 14, v2
+; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 14, v3
; GFX1030-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX1030-NEXT: s_endpgm
;
@@ -2154,7 +2138,7 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 13, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -2163,27 +2147,20 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Y,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHR T0.W, T0.W, literal.x,
-; EG-NEXT: LSHR * T1.W, T0.Z, literal.x,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: MULHI * T0.Z, PV.W, literal.x,
-; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
-; EG-NEXT: LSHR T1.Z, T0.Y, literal.x,
-; EG-NEXT: LSHR T0.W, PS, literal.y,
-; EG-NEXT: MULHI * T0.Y, T1.W, literal.z,
-; EG-NEXT: 2(2.802597e-45), 10(1.401298e-44)
-; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
+; EG-NEXT: MULHI * T0.W, T0.W, literal.x,
+; EG-NEXT: 1311186259(7.009292e+08), 0(0.000000e+00)
+; EG-NEXT: LSHR T0.W, PS, literal.x,
+; EG-NEXT: MULHI * T0.Z, T0.Z, literal.y,
+; EG-NEXT: 14(1.961818e-44), 1311186259(7.009292e+08)
; EG-NEXT: LSHR T0.Z, PS, literal.x,
-; EG-NEXT: LSHR T1.W, T0.X, literal.y,
-; EG-NEXT: MULHI * T0.X, PV.Z, literal.z,
-; EG-NEXT: 10(1.401298e-44), 2(2.802597e-45)
-; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
+; EG-NEXT: MULHI * T0.Y, T0.Y, literal.y,
+; EG-NEXT: 14(1.961818e-44), 1311186259(7.009292e+08)
; EG-NEXT: LSHR T0.Y, PS, literal.x,
-; EG-NEXT: MULHI * T0.X, PV.W, literal.y,
-; EG-NEXT: 10(1.401298e-44), 327796565(3.478022e-27)
+; EG-NEXT: MULHI * T0.X, T0.X, literal.y,
+; EG-NEXT: 14(1.961818e-44), 1311186259(7.009292e+08)
; EG-NEXT: LSHR T0.X, PS, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Z, literal.y,
-; EG-NEXT: 10(1.401298e-44), 2(2.802597e-45)
+; EG-NEXT: 14(1.961818e-44), 2(2.802597e-45)
%1 = load <4 x i32>, ptr addrspace(1) %in, align 16
%2 = udiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
store <4 x i32> %2, ptr addrspace(1) %out, align 16
@@ -2254,12 +2231,11 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
; SI-LABEL: test_udiv_3_mulhu:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[2:3], 0x9
-; SI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab
+; SI-NEXT: v_mov_b32_e32 v0, 0x55555555
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mul_hi_u32 v0, s0, v0
-; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
@@ -2267,12 +2243,11 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
; VI-LABEL: test_udiv_3_mulhu:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[2:3], 0x24
-; VI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab
+; VI-NEXT: v_mov_b32_e32 v0, 0x55555555
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_hi_u32 v0, s0, v0
-; VI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
@@ -2280,10 +2255,9 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
; GCN-LABEL: test_udiv_3_mulhu:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s0, s[6:7], 0x0
-; GCN-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab
+; GCN-NEXT: v_mov_b32_e32 v0, 0x55555555
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mul_hi_u32 v0, s0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; GCN-NEXT: flat_store_dword v[0:1], v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
@@ -2292,8 +2266,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: s_mul_hi_u32 s0, s0, 0xaaaaaaab
-; GFX1030-NEXT: s_lshr_b32 s0, s0, 1
+; GFX1030-NEXT: s_mul_hi_u32 s0, s0, 0x55555555
; GFX1030-NEXT: v_mov_b32_e32 v0, s0
; GFX1030-NEXT: global_store_dword v[0:1], v0, off
; GFX1030-NEXT: s_waitcnt_vscnt null, 0x0
@@ -2301,16 +2274,14 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
;
; EG-LABEL: test_udiv_3_mulhu:
; EG: ; %bb.0:
-; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: MULHI * T0.X, KC0[2].Y, literal.x,
-; EG-NEXT: -1431655765(-3.031649e-13), 0(0.000000e+00)
-; EG-NEXT: LSHR T0.X, PS, 1,
-; EG-NEXT: MOV * T1.X, literal.x,
-; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
+; EG-NEXT: MOV T0.X, literal.x,
+; EG-NEXT: MULHI * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 0(0.000000e+00), 1431655765(1.466015e+13)
%i = udiv i32 %p, 3
store volatile i32 %i, ptr addrspace(1) undef
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/urem.ll b/llvm/test/CodeGen/AMDGPU/urem.ll
index 4b8127fef822d..d3f0b758be794 100644
--- a/llvm/test/CodeGen/AMDGPU/urem.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem.ll
@@ -21,7 +21,6 @@ define amdgpu_kernel void @test_urem_i32(ptr addrspace(1) %out, ptr addrspace(1)
; FUNC-LABEL: {{^}}test_urem_i32_7:
; SI: s_mov_b32 [[MAGIC:s[0-9]+]], 0x24924925
; SI: v_mul_hi_u32 {{v[0-9]+}}, {{v[0-9]+}}, [[MAGIC]]
-; SI: v_sub_{{[iu]}}32
; SI: v_mul_lo_u32
; SI: v_subrev_{{[iu]}}32
; SI: buffer_store_dword
diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
index cc38e250f183f..8a7d5ffff6fd6 100644
--- a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
+++ b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
@@ -621,9 +621,9 @@ define i64 @test_ds_cross_basic_blocks(ptr %arg, i32 signext %arg1) {
; CHECK-NEXT: .LBB6_4: # %bb5
; CHECK-NEXT: #
; CHECK-NEXT: lbzu r30, 1(r5)
-; CHECK-NEXT: mulli r29, r30, 171
-; CHECK-NEXT: rlwinm r28, r29, 24, 8, 30
-; CHECK-NEXT: srwi r29, r29, 9
+; CHECK-NEXT: mulli r29, r30, 85
+; CHECK-NEXT: rlwinm r28, r29, 25, 7, 30
+; CHECK-NEXT: srwi r29, r29, 8
; CHECK-NEXT: add r29, r29, r28
; CHECK-NEXT: sub r30, r30, r29
; CHECK-NEXT: clrlwi r30, r30, 24
diff --git a/llvm/test/CodeGen/PowerPC/urem-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-lkk.ll
index 43a1e5a2faf6d..b75bcd82d4468 100644
--- a/llvm/test/CodeGen/PowerPC/urem-lkk.ll
+++ b/llvm/test/CodeGen/PowerPC/urem-lkk.ll
@@ -5,12 +5,9 @@
define i32 @fold_urem_positive_odd(i32 %x) {
; CHECK-LABEL: fold_urem_positive_odd:
; CHECK: # %bb.0:
-; CHECK-NEXT: lis 4, 22765
-; CHECK-NEXT: ori 4, 4, 8969
+; CHECK-NEXT: lis 4, -21386
+; CHECK-NEXT: ori 4, 4, 37251
; CHECK-NEXT: mulhwu 4, 3, 4
-; CHECK-NEXT: sub 5, 3, 4
-; CHECK-NEXT: srwi 5, 5, 1
-; CHECK-NEXT: add 4, 5, 4
; CHECK-NEXT: srwi 4, 4, 6
; CHECK-NEXT: mulli 4, 4, 95
; CHECK-NEXT: sub 3, 3, 4
@@ -24,7 +21,7 @@ define i32 @fold_urem_positive_even(i32 %x) {
; CHECK-LABEL: fold_urem_positive_even:
; CHECK: # %bb.0:
; CHECK-NEXT: lis 4, -2226
-; CHECK-NEXT: ori 4, 4, 16323
+; CHECK-NEXT: ori 4, 4, 16321
; CHECK-NEXT: mulhwu 4, 3, 4
; CHECK-NEXT: srwi 4, 4, 10
; CHECK-NEXT: mulli 4, 4, 1060
@@ -39,12 +36,9 @@ define i32 @fold_urem_positive_even(i32 %x) {
define i32 @combine_urem_udiv(i32 %x) {
; CHECK-LABEL: combine_urem_udiv:
; CHECK: # %bb.0:
-; CHECK-NEXT: lis 4, 22765
-; CHECK-NEXT: ori 4, 4, 8969
+; CHECK-NEXT: lis 4, -21386
+; CHECK-NEXT: ori 4, 4, 37251
; CHECK-NEXT: mulhwu 4, 3, 4
-; CHECK-NEXT: sub 5, 3, 4
-; CHECK-NEXT: srwi 5, 5, 1
-; CHECK-NEXT: add 4, 5, 4
; CHECK-NEXT: srwi 4, 4, 6
; CHECK-NEXT: mulli 5, 4, 95
; CHECK-NEXT: sub 3, 3, 5
diff --git a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
index a2ad2946cc8ec..8863a9da800f0 100644
--- a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
@@ -810,39 +810,35 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
; P9LE-LABEL: dont_fold_urem_i64:
; P9LE: # %bb.0:
-; P9LE-NEXT: lis r4, 1602
+; P9LE-NEXT: lis r4, 5698
; P9LE-NEXT: mfvsrld r3, v3
+; P9LE-NEXT: lis r5, 12374
; P9LE-NEXT: ori r4, r4, 51289
-; P9LE-NEXT: rldic r4, r4, 36, 1
-; P9LE-NEXT: oris r4, r4, 45590
-; P9LE-NEXT: ori r4, r4, 17097
+; P9LE-NEXT: ori r5, r5, 56339
+; P9LE-NEXT: rldic r4, r4, 35, 0
+; P9LE-NEXT: rldic r5, r5, 32, 2
+; P9LE-NEXT: oris r4, r4, 22795
+; P9LE-NEXT: oris r5, r5, 29426
+; P9LE-NEXT: ori r4, r4, 8547
+; P9LE-NEXT: ori r5, r5, 35795
; P9LE-NEXT: mulhdu r4, r3, r4
-; P9LE-NEXT: sub r5, r3, r4
-; P9LE-NEXT: rldicl r5, r5, 63, 1
-; P9LE-NEXT: add r4, r5, r4
-; P9LE-NEXT: lis r5, -16037
; P9LE-NEXT: rldicl r4, r4, 60, 4
-; P9LE-NEXT: ori r5, r5, 28749
; P9LE-NEXT: mulli r4, r4, 23
-; P9LE-NEXT: rldic r5, r5, 32, 0
-; P9LE-NEXT: oris r5, r5, 52170
-; P9LE-NEXT: ori r5, r5, 12109
; P9LE-NEXT: sub r3, r3, r4
; P9LE-NEXT: mfvsrd r4, v3
; P9LE-NEXT: mulhdu r5, r4, r5
-; P9LE-NEXT: rldicl r5, r5, 52, 12
+; P9LE-NEXT: rldicl r5, r5, 54, 10
; P9LE-NEXT: mulli r5, r5, 5423
; P9LE-NEXT: sub r4, r4, r5
-; P9LE-NEXT: lis r5, 3206
-; P9LE-NEXT: ori r5, r5, 42889
; P9LE-NEXT: mtvsrdd v3, r4, r3
+; P9LE-NEXT: lis r4, 3206
; P9LE-NEXT: mfvsrd r3, v2
-; P9LE-NEXT: rldic r5, r5, 35, 1
-; P9LE-NEXT: rldicl r4, r3, 63, 1
-; P9LE-NEXT: oris r5, r5, 1603
-; P9LE-NEXT: ori r5, r5, 21445
-; P9LE-NEXT: mulhdu r4, r4, r5
-; P9LE-NEXT: rldicl r4, r4, 57, 7
+; P9LE-NEXT: ori r4, r4, 42889
+; P9LE-NEXT: rldic r4, r4, 33, 3
+; P9LE-NEXT: oris r4, r4, 400
+; P9LE-NEXT: ori r4, r4, 54513
+; P9LE-NEXT: mulhdu r4, r3, r4
+; P9LE-NEXT: rldicl r4, r4, 58, 6
; P9LE-NEXT: mulli r4, r4, 654
; P9LE-NEXT: sub r3, r3, r4
; P9LE-NEXT: li r4, 0
@@ -851,39 +847,35 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
;
; P9BE-LABEL: dont_fold_urem_i64:
; P9BE: # %bb.0:
-; P9BE-NEXT: lis r4, 1602
-; P9BE-NEXT: mfvsrd r3, v3
-; P9BE-NEXT: ori r4, r4, 51289
-; P9BE-NEXT: rldic r4, r4, 36, 1
-; P9BE-NEXT: oris r4, r4, 45590
-; P9BE-NEXT: ori r4, r4, 17097
+; P9BE-NEXT: lis r4, 12374
+; P9BE-NEXT: mfvsrld r3, v3
+; P9BE-NEXT: lis r5, 5698
+; P9BE-NEXT: ori r4, r4, 56339
+; P9BE-NEXT: ori r5, r5, 51289
+; P9BE-NEXT: rldic r4, r4, 32, 2
+; P9BE-NEXT: rldic r5, r5, 35, 0
+; P9BE-NEXT: oris r4, r4, 29426
+; P9BE-NEXT: oris r5, r5, 22795
+; P9BE-NEXT: ori r4, r4, 35795
+; P9BE-NEXT: ori r5, r5, 8547
; P9BE-NEXT: mulhdu r4, r3, r4
-; P9BE-NEXT: sub r5, r3, r4
-; P9BE-NEXT: rldicl r5, r5, 63, 1
-; P9BE-NEXT: add r4, r5, r4
-; P9BE-NEXT: lis r5, -16037
-; P9BE-NEXT: rldicl r4, r4, 60, 4
-; P9BE-NEXT: ori r5, r5, 28749
-; P9BE-NEXT: mulli r4, r4, 23
-; P9BE-NEXT: rldic r5, r5, 32, 0
-; P9BE-NEXT: oris r5, r5, 52170
-; P9BE-NEXT: ori r5, r5, 12109
+; P9BE-NEXT: rldicl r4, r4, 54, 10
+; P9BE-NEXT: mulli r4, r4, 5423
; P9BE-NEXT: sub r3, r3, r4
-; P9BE-NEXT: mfvsrld r4, v3
+; P9BE-NEXT: mfvsrd r4, v3
; P9BE-NEXT: mulhdu r5, r4, r5
-; P9BE-NEXT: rldicl r5, r5, 52, 12
-; P9BE-NEXT: mulli r5, r5, 5423
+; P9BE-NEXT: rldicl r5, r5, 60, 4
+; P9BE-NEXT: mulli r5, r5, 23
; P9BE-NEXT: sub r4, r4, r5
-; P9BE-NEXT: lis r5, 3206
-; P9BE-NEXT: ori r5, r5, 42889
-; P9BE-NEXT: mtvsrdd v3, r3, r4
+; P9BE-NEXT: mtvsrdd v3, r4, r3
+; P9BE-NEXT: lis r4, 3206
; P9BE-NEXT: mfvsrld r3, v2
-; P9BE-NEXT: rldic r5, r5, 35, 1
-; P9BE-NEXT: rldicl r4, r3, 63, 1
-; P9BE-NEXT: oris r5, r5, 1603
-; P9BE-NEXT: ori r5, r5, 21445
-; P9BE-NEXT: mulhdu r4, r4, r5
-; P9BE-NEXT: rldicl r4, r4, 57, 7
+; P9BE-NEXT: ori r4, r4, 42889
+; P9BE-NEXT: rldic r4, r4, 33, 3
+; P9BE-NEXT: oris r4, r4, 400
+; P9BE-NEXT: ori r4, r4, 54513
+; P9BE-NEXT: mulhdu r4, r3, r4
+; P9BE-NEXT: rldicl r4, r4, 58, 6
; P9BE-NEXT: mulli r4, r4, 654
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: mtvsrdd v2, 0, r3
@@ -891,94 +883,86 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
;
; P8LE-LABEL: dont_fold_urem_i64:
; P8LE: # %bb.0:
-; P8LE-NEXT: lis r3, 1602
+; P8LE-NEXT: lis r3, 5698
; P8LE-NEXT: xxswapd vs0, v3
-; P8LE-NEXT: lis r5, 3206
-; P8LE-NEXT: mfvsrd r6, v2
-; P8LE-NEXT: mfvsrd r8, v3
+; P8LE-NEXT: mfvsrd r5, v3
; P8LE-NEXT: ori r3, r3, 51289
+; P8LE-NEXT: mffprd r4, f0
+; P8LE-NEXT: mfvsrd r6, v2
+; P8LE-NEXT: rldic r3, r3, 35, 0
+; P8LE-NEXT: oris r3, r3, 22795
+; P8LE-NEXT: ori r3, r3, 8547
+; P8LE-NEXT: mulhdu r3, r4, r3
+; P8LE-NEXT: rldicl r3, r3, 60, 4
+; P8LE-NEXT: mulli r3, r3, 23
+; P8LE-NEXT: sub r3, r4, r3
+; P8LE-NEXT: lis r4, 12374
+; P8LE-NEXT: ori r4, r4, 56339
+; P8LE-NEXT: mtfprd f0, r3
+; P8LE-NEXT: li r3, 0
+; P8LE-NEXT: rldic r4, r4, 32, 2
+; P8LE-NEXT: oris r4, r4, 29426
+; P8LE-NEXT: ori r4, r4, 35795
+; P8LE-NEXT: mulhdu r4, r5, r4
+; P8LE-NEXT: rldicl r4, r4, 54, 10
+; P8LE-NEXT: mulli r4, r4, 5423
+; P8LE-NEXT: sub r4, r5, r4
+; P8LE-NEXT: lis r5, 3206
; P8LE-NEXT: ori r5, r5, 42889
-; P8LE-NEXT: rldic r4, r3, 36, 1
-; P8LE-NEXT: mffprd r3, f0
-; P8LE-NEXT: rldic r5, r5, 35, 1
-; P8LE-NEXT: rldicl r7, r6, 63, 1
-; P8LE-NEXT: oris r4, r4, 45590
-; P8LE-NEXT: oris r5, r5, 1603
-; P8LE-NEXT: ori r4, r4, 17097
-; P8LE-NEXT: ori r5, r5, 21445
-; P8LE-NEXT: mulhdu r4, r3, r4
-; P8LE-NEXT: mulhdu r5, r7, r5
-; P8LE-NEXT: sub r7, r3, r4
-; P8LE-NEXT: rldicl r5, r5, 57, 7
-; P8LE-NEXT: rldicl r7, r7, 63, 1
+; P8LE-NEXT: mtfprd f1, r4
+; P8LE-NEXT: rldic r5, r5, 33, 3
+; P8LE-NEXT: oris r5, r5, 400
+; P8LE-NEXT: ori r5, r5, 54513
+; P8LE-NEXT: mulhdu r5, r6, r5
+; P8LE-NEXT: rldicl r5, r5, 58, 6
; P8LE-NEXT: mulli r5, r5, 654
-; P8LE-NEXT: add r4, r7, r4
-; P8LE-NEXT: lis r7, -16037
-; P8LE-NEXT: ori r7, r7, 28749
-; P8LE-NEXT: rldicl r4, r4, 60, 4
; P8LE-NEXT: sub r5, r6, r5
-; P8LE-NEXT: rldic r7, r7, 32, 0
-; P8LE-NEXT: mulli r4, r4, 23
-; P8LE-NEXT: oris r7, r7, 52170
-; P8LE-NEXT: ori r7, r7, 12109
-; P8LE-NEXT: sub r3, r3, r4
-; P8LE-NEXT: mulhdu r7, r8, r7
+; P8LE-NEXT: xxmrghd v3, vs1, vs0
; P8LE-NEXT: mtfprd f1, r3
-; P8LE-NEXT: li r3, 0
-; P8LE-NEXT: rldicl r7, r7, 52, 12
-; P8LE-NEXT: mulli r7, r7, 5423
-; P8LE-NEXT: sub r7, r8, r7
-; P8LE-NEXT: mtfprd f0, r7
-; P8LE-NEXT: xxmrghd v3, vs0, vs1
; P8LE-NEXT: mtfprd f0, r5
-; P8LE-NEXT: mtfprd f1, r3
; P8LE-NEXT: xxmrghd v2, vs0, vs1
; P8LE-NEXT: blr
;
; P8BE-LABEL: dont_fold_urem_i64:
; P8BE: # %bb.0:
-; P8BE-NEXT: lis r3, 1602
-; P8BE-NEXT: mfvsrd r4, v3
+; P8BE-NEXT: lis r3, 12374
+; P8BE-NEXT: xxswapd vs0, v3
+; P8BE-NEXT: mfvsrd r5, v3
+; P8BE-NEXT: ori r3, r3, 56339
+; P8BE-NEXT: mffprd r4, f0
+; P8BE-NEXT: xxswapd vs1, v2
+; P8BE-NEXT: mffprd r6, f1
+; P8BE-NEXT: rldic r3, r3, 32, 2
+; P8BE-NEXT: oris r3, r3, 29426
+; P8BE-NEXT: ori r3, r3, 35795
+; P8BE-NEXT: mulhdu r3, r4, r3
+; P8BE-NEXT: rldicl r3, r3, 54, 10
+; P8BE-NEXT: mulli r3, r3, 5423
+; P8BE-NEXT: sub r3, r4, r3
+; P8BE-NEXT: lis r4, 5698
+; P8BE-NEXT: ori r4, r4, 51289
+; P8BE-NEXT: mtfprd f0, r3
+; P8BE-NEXT: li r3, 0
+; P8BE-NEXT: rldic r4, r4, 35, 0
+; P8BE-NEXT: oris r4, r4, 22795
+; P8BE-NEXT: ori r4, r4, 8547
+; P8BE-NEXT: mulhdu r4, r5, r4
+; P8BE-NEXT: rldicl r4, r4, 60, 4
+; P8BE-NEXT: mulli r4, r4, 23
+; P8BE-NEXT: sub r4, r5, r4
; P8BE-NEXT: lis r5, 3206
-; P8BE-NEXT: xxswapd vs0, v2
-; P8BE-NEXT: xxswapd vs1, v3
-; P8BE-NEXT: ori r3, r3, 51289
; P8BE-NEXT: ori r5, r5, 42889
-; P8BE-NEXT: mffprd r6, f0
-; P8BE-NEXT: mffprd r8, f1
-; P8BE-NEXT: rldic r3, r3, 36, 1
-; P8BE-NEXT: rldic r5, r5, 35, 1
-; P8BE-NEXT: oris r3, r3, 45590
-; P8BE-NEXT: oris r5, r5, 1603
-; P8BE-NEXT: rldicl r7, r6, 63, 1
-; P8BE-NEXT: ori r3, r3, 17097
-; P8BE-NEXT: ori r5, r5, 21445
-; P8BE-NEXT: mulhdu r3, r4, r3
-; P8BE-NEXT: mulhdu r5, r7, r5
-; P8BE-NEXT: sub r7, r4, r3
-; P8BE-NEXT: rldicl r5, r5, 57, 7
-; P8BE-NEXT: rldicl r7, r7, 63, 1
+; P8BE-NEXT: mtfprd f1, r4
+; P8BE-NEXT: rldic r5, r5, 33, 3
+; P8BE-NEXT: oris r5, r5, 400
+; P8BE-NEXT: ori r5, r5, 54513
+; P8BE-NEXT: mulhdu r5, r6, r5
+; P8BE-NEXT: rldicl r5, r5, 58, 6
; P8BE-NEXT: mulli r5, r5, 654
-; P8BE-NEXT: add r3, r7, r3
-; P8BE-NEXT: lis r7, -16037
-; P8BE-NEXT: ori r7, r7, 28749
-; P8BE-NEXT: rldicl r3, r3, 60, 4
; P8BE-NEXT: sub r5, r6, r5
-; P8BE-NEXT: rldic r7, r7, 32, 0
-; P8BE-NEXT: mulli r3, r3, 23
-; P8BE-NEXT: oris r7, r7, 52170
-; P8BE-NEXT: ori r7, r7, 12109
-; P8BE-NEXT: sub r3, r4, r3
-; P8BE-NEXT: mulhdu r7, r8, r7
-; P8BE-NEXT: mtfprd f1, r3
-; P8BE-NEXT: li r3, 0
-; P8BE-NEXT: rldicl r7, r7, 52, 12
-; P8BE-NEXT: mulli r7, r7, 5423
-; P8BE-NEXT: sub r7, r8, r7
-; P8BE-NEXT: mtfprd f0, r7
; P8BE-NEXT: xxmrghd v3, vs1, vs0
-; P8BE-NEXT: mtfprd f0, r5
; P8BE-NEXT: mtfprd f1, r3
+; P8BE-NEXT: mtfprd f0, r5
; P8BE-NEXT: xxmrghd v2, vs1, vs0
; P8BE-NEXT: blr
%1 = urem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll
index 91ac7c5ddae3f..bc0ea9db9a1af 100644
--- a/llvm/test/CodeGen/RISCV/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll
@@ -14,21 +14,29 @@
define i32 @udiv_constant_no_add(i32 %a) nounwind {
; RV32-LABEL: udiv_constant_no_add:
; RV32: # %bb.0:
-; RV32-NEXT: lui a1, 838861
-; RV32-NEXT: addi a1, a1, -819
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: mulhu a0, a0, a1
-; RV32-NEXT: srli a0, a0, 2
; RV32-NEXT: ret
;
-; RV64-LABEL: udiv_constant_no_add:
-; RV64: # %bb.0:
-; RV64-NEXT: slli a0, a0, 32
-; RV64-NEXT: lui a1, 838861
-; RV64-NEXT: addi a1, a1, -819
-; RV64-NEXT: slli a1, a1, 32
-; RV64-NEXT: mulhu a0, a0, a1
-; RV64-NEXT: srli a0, a0, 34
-; RV64-NEXT: ret
+; RV64IM-LABEL: udiv_constant_no_add:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: slli a0, a0, 32
+; RV64IM-NEXT: lui a1, 209715
+; RV64IM-NEXT: addi a1, a1, 819
+; RV64IM-NEXT: slli a1, a1, 32
+; RV64IM-NEXT: mulhu a0, a0, a1
+; RV64IM-NEXT: srli a0, a0, 32
+; RV64IM-NEXT: ret
+;
+; RV64IMZB-LABEL: udiv_constant_no_add:
+; RV64IMZB: # %bb.0:
+; RV64IMZB-NEXT: zext.w a0, a0
+; RV64IMZB-NEXT: lui a1, 209715
+; RV64IMZB-NEXT: addiw a1, a1, 819
+; RV64IMZB-NEXT: mul a0, a0, a1
+; RV64IMZB-NEXT: srli a0, a0, 32
+; RV64IMZB-NEXT: ret
%1 = udiv i32 %a, 5
ret i32 %1
}
@@ -39,76 +47,88 @@ define i32 @udiv_constant_add(i32 %a) nounwind {
; RV32: # %bb.0:
; RV32-NEXT: lui a1, 149797
; RV32-NEXT: addi a1, a1, -1755
-; RV32-NEXT: mulhu a1, a0, a1
-; RV32-NEXT: sub a0, a0, a1
-; RV32-NEXT: srli a0, a0, 1
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: mulhu a0, a0, a1
; RV32-NEXT: ret
;
; RV64IM-LABEL: udiv_constant_add:
; RV64IM: # %bb.0:
-; RV64IM-NEXT: slli a1, a0, 32
-; RV64IM-NEXT: lui a2, 149797
-; RV64IM-NEXT: addi a2, a2, -1755
-; RV64IM-NEXT: slli a2, a2, 32
-; RV64IM-NEXT: mulhu a1, a1, a2
-; RV64IM-NEXT: srli a1, a1, 32
-; RV64IM-NEXT: subw a0, a0, a1
-; RV64IM-NEXT: srliw a0, a0, 1
-; RV64IM-NEXT: add a0, a0, a1
-; RV64IM-NEXT: srli a0, a0, 2
+; RV64IM-NEXT: slli a0, a0, 32
+; RV64IM-NEXT: lui a1, 149797
+; RV64IM-NEXT: addi a1, a1, -1755
+; RV64IM-NEXT: slli a1, a1, 32
+; RV64IM-NEXT: mulhu a0, a0, a1
+; RV64IM-NEXT: srli a0, a0, 32
; RV64IM-NEXT: ret
;
; RV64IMZB-LABEL: udiv_constant_add:
; RV64IMZB: # %bb.0:
-; RV64IMZB-NEXT: zext.w a1, a0
-; RV64IMZB-NEXT: lui a2, 149797
-; RV64IMZB-NEXT: addiw a2, a2, -1755
-; RV64IMZB-NEXT: mul a1, a1, a2
-; RV64IMZB-NEXT: srli a1, a1, 32
-; RV64IMZB-NEXT: subw a0, a0, a1
-; RV64IMZB-NEXT: srliw a0, a0, 1
-; RV64IMZB-NEXT: add a0, a0, a1
-; RV64IMZB-NEXT: srli a0, a0, 2
+; RV64IMZB-NEXT: zext.w a0, a0
+; RV64IMZB-NEXT: lui a1, 149797
+; RV64IMZB-NEXT: addiw a1, a1, -1755
+; RV64IMZB-NEXT: mul a0, a0, a1
+; RV64IMZB-NEXT: srli a0, a0, 32
; RV64IMZB-NEXT: ret
%1 = udiv i32 %a, 7
ret i32 %1
}
define i64 @udiv64_constant_no_add(i64 %a) nounwind {
-; RV32-LABEL: udiv64_constant_no_add:
-; RV32: # %bb.0:
-; RV32-NEXT: add a2, a0, a1
-; RV32-NEXT: sltu a3, a2, a0
-; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 838861
-; RV32-NEXT: addi a4, a3, -819
-; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: srli a6, a5, 2
-; RV32-NEXT: andi a5, a5, -4
-; RV32-NEXT: add a5, a5, a6
-; RV32-NEXT: sub a2, a2, a5
-; RV32-NEXT: sub a5, a0, a2
-; RV32-NEXT: addi a3, a3, -820
-; RV32-NEXT: mul a3, a5, a3
-; RV32-NEXT: mulhu a6, a5, a4
-; RV32-NEXT: add a3, a6, a3
-; RV32-NEXT: sltu a0, a0, a2
-; RV32-NEXT: sub a1, a1, a0
-; RV32-NEXT: mul a1, a1, a4
-; RV32-NEXT: add a1, a3, a1
-; RV32-NEXT: mul a0, a5, a4
-; RV32-NEXT: ret
+; RV32IM-LABEL: udiv64_constant_no_add:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: add a2, a0, a1
+; RV32IM-NEXT: sltu a3, a2, a0
+; RV32IM-NEXT: add a2, a2, a3
+; RV32IM-NEXT: lui a3, 209715
+; RV32IM-NEXT: addi a3, a3, 819
+; RV32IM-NEXT: mulhu a3, a2, a3
+; RV32IM-NEXT: slli a4, a3, 2
+; RV32IM-NEXT: add a3, a4, a3
+; RV32IM-NEXT: sub a2, a2, a3
+; RV32IM-NEXT: sub a3, a0, a2
+; RV32IM-NEXT: lui a4, 838861
+; RV32IM-NEXT: addi a5, a4, -820
+; RV32IM-NEXT: mul a5, a3, a5
+; RV32IM-NEXT: addi a4, a4, -819
+; RV32IM-NEXT: mulhu a6, a3, a4
+; RV32IM-NEXT: add a5, a6, a5
+; RV32IM-NEXT: sltu a0, a0, a2
+; RV32IM-NEXT: sub a1, a1, a0
+; RV32IM-NEXT: mul a1, a1, a4
+; RV32IM-NEXT: add a1, a5, a1
+; RV32IM-NEXT: mul a0, a3, a4
+; RV32IM-NEXT: ret
+;
+; RV32IMZB-LABEL: udiv64_constant_no_add:
+; RV32IMZB: # %bb.0:
+; RV32IMZB-NEXT: add a2, a0, a1
+; RV32IMZB-NEXT: sltu a3, a2, a0
+; RV32IMZB-NEXT: add a2, a2, a3
+; RV32IMZB-NEXT: lui a3, 209715
+; RV32IMZB-NEXT: addi a3, a3, 819
+; RV32IMZB-NEXT: mulhu a3, a2, a3
+; RV32IMZB-NEXT: sh2add a3, a3, a3
+; RV32IMZB-NEXT: sub a2, a2, a3
+; RV32IMZB-NEXT: sub a3, a0, a2
+; RV32IMZB-NEXT: lui a4, 838861
+; RV32IMZB-NEXT: addi a5, a4, -820
+; RV32IMZB-NEXT: mul a5, a3, a5
+; RV32IMZB-NEXT: addi a4, a4, -819
+; RV32IMZB-NEXT: mulhu a6, a3, a4
+; RV32IMZB-NEXT: add a5, a6, a5
+; RV32IMZB-NEXT: sltu a0, a0, a2
+; RV32IMZB-NEXT: sub a1, a1, a0
+; RV32IMZB-NEXT: mul a1, a1, a4
+; RV32IMZB-NEXT: add a1, a5, a1
+; RV32IMZB-NEXT: mul a0, a3, a4
+; RV32IMZB-NEXT: ret
;
; RV64-LABEL: udiv64_constant_no_add:
; RV64: # %bb.0:
-; RV64-NEXT: lui a1, 838861
-; RV64-NEXT: addiw a1, a1, -819
+; RV64-NEXT: lui a1, 209715
+; RV64-NEXT: addiw a1, a1, 819
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a0, a0, a1
-; RV64-NEXT: srli a0, a0, 2
; RV64-NEXT: ret
%1 = udiv i64 %a, 5
ret i64 %1
@@ -130,11 +150,7 @@ define i64 @udiv64_constant_add(i64 %a) nounwind {
; RV64: # %bb.0:
; RV64-NEXT: lui a1, %hi(.LCPI3_0)
; RV64-NEXT: ld a1, %lo(.LCPI3_0)(a1)
-; RV64-NEXT: mulhu a1, a0, a1
-; RV64-NEXT: sub a0, a0, a1
-; RV64-NEXT: srli a0, a0, 1
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: srli a0, a0, 2
+; RV64-NEXT: mulhu a0, a0, a1
; RV64-NEXT: ret
%1 = udiv i64 %a, 7
ret i64 %1
@@ -144,17 +160,17 @@ define i8 @udiv8_constant_no_add(i8 %a) nounwind {
; RV32-LABEL: udiv8_constant_no_add:
; RV32: # %bb.0:
; RV32-NEXT: andi a0, a0, 255
-; RV32-NEXT: li a1, 205
+; RV32-NEXT: li a1, 51
; RV32-NEXT: mul a0, a0, a1
-; RV32-NEXT: srli a0, a0, 10
+; RV32-NEXT: srli a0, a0, 8
; RV32-NEXT: ret
;
; RV64-LABEL: udiv8_constant_no_add:
; RV64: # %bb.0:
; RV64-NEXT: andi a0, a0, 255
-; RV64-NEXT: li a1, 205
+; RV64-NEXT: li a1, 51
; RV64-NEXT: mul a0, a0, a1
-; RV64-NEXT: srli a0, a0, 10
+; RV64-NEXT: srli a0, a0, 8
; RV64-NEXT: ret
%1 = udiv i8 %a, 5
ret i8 %1
@@ -163,54 +179,34 @@ define i8 @udiv8_constant_no_add(i8 %a) nounwind {
define i8 @udiv8_constant_add(i8 %a) nounwind {
; RV32IM-LABEL: udiv8_constant_add:
; RV32IM: # %bb.0:
-; RV32IM-NEXT: andi a1, a0, 255
-; RV32IM-NEXT: li a2, 37
-; RV32IM-NEXT: mul a1, a1, a2
-; RV32IM-NEXT: srli a1, a1, 8
-; RV32IM-NEXT: sub a0, a0, a1
-; RV32IM-NEXT: slli a0, a0, 24
-; RV32IM-NEXT: srli a0, a0, 25
-; RV32IM-NEXT: add a0, a0, a1
-; RV32IM-NEXT: srli a0, a0, 2
+; RV32IM-NEXT: andi a0, a0, 255
+; RV32IM-NEXT: li a1, 37
+; RV32IM-NEXT: mul a0, a0, a1
+; RV32IM-NEXT: srli a0, a0, 8
; RV32IM-NEXT: ret
;
; RV32IMZB-LABEL: udiv8_constant_add:
; RV32IMZB: # %bb.0:
-; RV32IMZB-NEXT: andi a1, a0, 255
-; RV32IMZB-NEXT: sh3add a2, a1, a1
-; RV32IMZB-NEXT: sh2add a1, a2, a1
-; RV32IMZB-NEXT: srli a1, a1, 8
-; RV32IMZB-NEXT: sub a0, a0, a1
-; RV32IMZB-NEXT: slli a0, a0, 24
-; RV32IMZB-NEXT: srli a0, a0, 25
-; RV32IMZB-NEXT: add a0, a0, a1
-; RV32IMZB-NEXT: srli a0, a0, 2
+; RV32IMZB-NEXT: andi a0, a0, 255
+; RV32IMZB-NEXT: sh3add a1, a0, a0
+; RV32IMZB-NEXT: sh2add a0, a1, a0
+; RV32IMZB-NEXT: srli a0, a0, 8
; RV32IMZB-NEXT: ret
;
; RV64IM-LABEL: udiv8_constant_add:
; RV64IM: # %bb.0:
-; RV64IM-NEXT: andi a1, a0, 255
-; RV64IM-NEXT: li a2, 37
-; RV64IM-NEXT: mul a1, a1, a2
-; RV64IM-NEXT: srli a1, a1, 8
-; RV64IM-NEXT: subw a0, a0, a1
-; RV64IM-NEXT: slli a0, a0, 56
-; RV64IM-NEXT: srli a0, a0, 57
-; RV64IM-NEXT: add a0, a0, a1
-; RV64IM-NEXT: srli a0, a0, 2
+; RV64IM-NEXT: andi a0, a0, 255
+; RV64IM-NEXT: li a1, 37
+; RV64IM-NEXT: mul a0, a0, a1
+; RV64IM-NEXT: srli a0, a0, 8
; RV64IM-NEXT: ret
;
; RV64IMZB-LABEL: udiv8_constant_add:
; RV64IMZB: # %bb.0:
-; RV64IMZB-NEXT: andi a1, a0, 255
-; RV64IMZB-NEXT: sh3add a2, a1, a1
-; RV64IMZB-NEXT: sh2add a1, a2, a1
-; RV64IMZB-NEXT: srli a1, a1, 8
-; RV64IMZB-NEXT: subw a0, a0, a1
-; RV64IMZB-NEXT: slli a0, a0, 56
-; RV64IMZB-NEXT: srli a0, a0, 57
-; RV64IMZB-NEXT: add a0, a0, a1
-; RV64IMZB-NEXT: srli a0, a0, 2
+; RV64IMZB-NEXT: andi a0, a0, 255
+; RV64IMZB-NEXT: sh3add a1, a0, a0
+; RV64IMZB-NEXT: sh2add a0, a1, a0
+; RV64IMZB-NEXT: srli a0, a0, 8
; RV64IMZB-NEXT: ret
%1 = udiv i8 %a, 7
ret i8 %1
@@ -220,18 +216,17 @@ define i16 @udiv16_constant_no_add(i16 %a) nounwind {
; RV32-LABEL: udiv16_constant_no_add:
; RV32: # %bb.0:
; RV32-NEXT: slli a0, a0, 16
-; RV32-NEXT: lui a1, 838864
+; RV32-NEXT: lui a1, 209712
; RV32-NEXT: mulhu a0, a0, a1
-; RV32-NEXT: srli a0, a0, 18
+; RV32-NEXT: srli a0, a0, 16
; RV32-NEXT: ret
;
; RV64-LABEL: udiv16_constant_no_add:
; RV64: # %bb.0:
-; RV64-NEXT: lui a1, 52429
-; RV64-NEXT: slli a1, a1, 4
; RV64-NEXT: slli a0, a0, 48
+; RV64-NEXT: lui a1, 209712
; RV64-NEXT: mulhu a0, a0, a1
-; RV64-NEXT: srli a0, a0, 18
+; RV64-NEXT: srli a0, a0, 16
; RV64-NEXT: ret
%1 = udiv i16 %a, 5
ret i16 %1
@@ -240,28 +235,18 @@ define i16 @udiv16_constant_no_add(i16 %a) nounwind {
define i16 @udiv16_constant_add(i16 %a) nounwind {
; RV32-LABEL: udiv16_constant_add:
; RV32: # %bb.0:
-; RV32-NEXT: slli a1, a0, 16
-; RV32-NEXT: lui a2, 149808
-; RV32-NEXT: mulhu a1, a1, a2
-; RV32-NEXT: srli a1, a1, 16
-; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: slli a0, a0, 16
-; RV32-NEXT: srli a0, a0, 17
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: srli a0, a0, 2
+; RV32-NEXT: lui a1, 149808
+; RV32-NEXT: mulhu a0, a0, a1
+; RV32-NEXT: srli a0, a0, 16
; RV32-NEXT: ret
;
; RV64-LABEL: udiv16_constant_add:
; RV64: # %bb.0:
-; RV64-NEXT: slli a1, a0, 48
-; RV64-NEXT: lui a2, 149808
-; RV64-NEXT: mulhu a1, a1, a2
-; RV64-NEXT: srli a1, a1, 16
-; RV64-NEXT: subw a0, a0, a1
; RV64-NEXT: slli a0, a0, 48
-; RV64-NEXT: srli a0, a0, 49
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: srli a0, a0, 2
+; RV64-NEXT: lui a1, 149808
+; RV64-NEXT: mulhu a0, a0, a1
+; RV64-NEXT: srli a0, a0, 16
; RV64-NEXT: ret
%1 = udiv i16 %a, 7
ret i16 %1
diff --git a/llvm/test/CodeGen/RISCV/div.ll b/llvm/test/CodeGen/RISCV/div.ll
index 99c83b99497dd..d96d4983c18f1 100644
--- a/llvm/test/CodeGen/RISCV/div.ll
+++ b/llvm/test/CodeGen/RISCV/div.ll
@@ -47,10 +47,9 @@ define i32 @udiv_constant(i32 %a) nounwind {
;
; RV32IM-LABEL: udiv_constant:
; RV32IM: # %bb.0:
-; RV32IM-NEXT: lui a1, 838861
-; RV32IM-NEXT: addi a1, a1, -819
+; RV32IM-NEXT: lui a1, 209715
+; RV32IM-NEXT: addi a1, a1, 819
; RV32IM-NEXT: mulhu a0, a0, a1
-; RV32IM-NEXT: srli a0, a0, 2
; RV32IM-NEXT: ret
;
; RV64I-LABEL: udiv_constant:
@@ -68,11 +67,11 @@ define i32 @udiv_constant(i32 %a) nounwind {
; RV64IM-LABEL: udiv_constant:
; RV64IM: # %bb.0:
; RV64IM-NEXT: slli a0, a0, 32
-; RV64IM-NEXT: lui a1, 838861
-; RV64IM-NEXT: addi a1, a1, -819
+; RV64IM-NEXT: lui a1, 209715
+; RV64IM-NEXT: addi a1, a1, 819
; RV64IM-NEXT: slli a1, a1, 32
; RV64IM-NEXT: mulhu a0, a0, a1
-; RV64IM-NEXT: srli a0, a0, 34
+; RV64IM-NEXT: srli a0, a0, 32
; RV64IM-NEXT: ret
%1 = udiv i32 %a, 5
ret i32 %1
@@ -184,23 +183,24 @@ define i64 @udiv64_constant(i64 %a) nounwind {
; RV32IM-NEXT: add a2, a0, a1
; RV32IM-NEXT: sltu a3, a2, a0
; RV32IM-NEXT: add a2, a2, a3
-; RV32IM-NEXT: lui a3, 838861
-; RV32IM-NEXT: addi a4, a3, -819
-; RV32IM-NEXT: mulhu a5, a2, a4
-; RV32IM-NEXT: srli a6, a5, 2
-; RV32IM-NEXT: andi a5, a5, -4
-; RV32IM-NEXT: add a5, a5, a6
-; RV32IM-NEXT: sub a2, a2, a5
-; RV32IM-NEXT: sub a5, a0, a2
-; RV32IM-NEXT: addi a3, a3, -820
-; RV32IM-NEXT: mul a3, a5, a3
-; RV32IM-NEXT: mulhu a6, a5, a4
-; RV32IM-NEXT: add a3, a6, a3
+; RV32IM-NEXT: lui a3, 209715
+; RV32IM-NEXT: addi a3, a3, 819
+; RV32IM-NEXT: mulhu a3, a2, a3
+; RV32IM-NEXT: slli a4, a3, 2
+; RV32IM-NEXT: add a3, a4, a3
+; RV32IM-NEXT: sub a2, a2, a3
+; RV32IM-NEXT: sub a3, a0, a2
+; RV32IM-NEXT: lui a4, 838861
+; RV32IM-NEXT: addi a5, a4, -820
+; RV32IM-NEXT: mul a5, a3, a5
+; RV32IM-NEXT: addi a4, a4, -819
+; RV32IM-NEXT: mulhu a6, a3, a4
+; RV32IM-NEXT: add a5, a6, a5
; RV32IM-NEXT: sltu a0, a0, a2
; RV32IM-NEXT: sub a1, a1, a0
; RV32IM-NEXT: mul a1, a1, a4
-; RV32IM-NEXT: add a1, a3, a1
-; RV32IM-NEXT: mul a0, a5, a4
+; RV32IM-NEXT: add a1, a5, a1
+; RV32IM-NEXT: mul a0, a3, a4
; RV32IM-NEXT: ret
;
; RV64I-LABEL: udiv64_constant:
@@ -210,12 +210,11 @@ define i64 @udiv64_constant(i64 %a) nounwind {
;
; RV64IM-LABEL: udiv64_constant:
; RV64IM: # %bb.0:
-; RV64IM-NEXT: lui a1, 838861
-; RV64IM-NEXT: addiw a1, a1, -819
+; RV64IM-NEXT: lui a1, 209715
+; RV64IM-NEXT: addiw a1, a1, 819
; RV64IM-NEXT: slli a2, a1, 32
; RV64IM-NEXT: add a1, a1, a2
; RV64IM-NEXT: mulhu a0, a0, a1
-; RV64IM-NEXT: srli a0, a0, 2
; RV64IM-NEXT: ret
%1 = udiv i64 %a, 5
ret i64 %1
@@ -318,9 +317,9 @@ define i8 @udiv8_constant(i8 %a) nounwind {
; RV32IM-LABEL: udiv8_constant:
; RV32IM: # %bb.0:
; RV32IM-NEXT: andi a0, a0, 255
-; RV32IM-NEXT: li a1, 205
+; RV32IM-NEXT: li a1, 51
; RV32IM-NEXT: mul a0, a0, a1
-; RV32IM-NEXT: srli a0, a0, 10
+; RV32IM-NEXT: srli a0, a0, 8
; RV32IM-NEXT: ret
;
; RV64I-LABEL: udiv8_constant:
@@ -337,9 +336,9 @@ define i8 @udiv8_constant(i8 %a) nounwind {
; RV64IM-LABEL: udiv8_constant:
; RV64IM: # %bb.0:
; RV64IM-NEXT: andi a0, a0, 255
-; RV64IM-NEXT: li a1, 205
+; RV64IM-NEXT: li a1, 51
; RV64IM-NEXT: mul a0, a0, a1
-; RV64IM-NEXT: srli a0, a0, 10
+; RV64IM-NEXT: srli a0, a0, 8
; RV64IM-NEXT: ret
%1 = udiv i8 %a, 5
ret i8 %1
@@ -477,9 +476,9 @@ define i16 @udiv16_constant(i16 %a) nounwind {
; RV32IM-LABEL: udiv16_constant:
; RV32IM: # %bb.0:
; RV32IM-NEXT: slli a0, a0, 16
-; RV32IM-NEXT: lui a1, 838864
+; RV32IM-NEXT: lui a1, 209712
; RV32IM-NEXT: mulhu a0, a0, a1
-; RV32IM-NEXT: srli a0, a0, 18
+; RV32IM-NEXT: srli a0, a0, 16
; RV32IM-NEXT: ret
;
; RV64I-LABEL: udiv16_constant:
@@ -496,11 +495,10 @@ define i16 @udiv16_constant(i16 %a) nounwind {
;
; RV64IM-LABEL: udiv16_constant:
; RV64IM: # %bb.0:
-; RV64IM-NEXT: lui a1, 52429
-; RV64IM-NEXT: slli a1, a1, 4
; RV64IM-NEXT: slli a0, a0, 48
+; RV64IM-NEXT: lui a1, 209712
; RV64IM-NEXT: mulhu a0, a0, a1
-; RV64IM-NEXT: srli a0, a0, 18
+; RV64IM-NEXT: srli a0, a0, 16
; RV64IM-NEXT: ret
%1 = udiv i16 %a, 5
ret i16 %1
diff --git a/llvm/test/CodeGen/RISCV/pr51206.ll b/llvm/test/CodeGen/RISCV/pr51206.ll
index 8aa145f6ac5ef..af44ce92a89e0 100644
--- a/llvm/test/CodeGen/RISCV/pr51206.ll
+++ b/llvm/test/CodeGen/RISCV/pr51206.ll
@@ -20,10 +20,9 @@ define signext i32 @wobble() nounwind {
; CHECK-NEXT: sw a0, %lo(global.1)(a2)
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: slli a1, a0, 48
-; CHECK-NEXT: lui a2, 52429
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: lui a2, 209712
; CHECK-NEXT: mulhu a1, a1, a2
-; CHECK-NEXT: srli a1, a1, 18
+; CHECK-NEXT: srli a1, a1, 16
; CHECK-NEXT: lui a2, %hi(global.3)
; CHECK-NEXT: li a3, 5
; CHECK-NEXT: sw a1, %lo(global.3)(a2)
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/div.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/div.ll
index 17d9e9cefe117..1abace8bbba0e 100644
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/div.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/div.ll
@@ -42,11 +42,11 @@ define i32 @udiv_constant(i32 %a) nounwind {
; RV64IM-LABEL: udiv_constant:
; RV64IM: # %bb.0:
; RV64IM-NEXT: slli a0, a0, 32
-; RV64IM-NEXT: lui a1, 838861
-; RV64IM-NEXT: addi a1, a1, -819
+; RV64IM-NEXT: lui a1, 209715
+; RV64IM-NEXT: addi a1, a1, 819
; RV64IM-NEXT: slli a1, a1, 32
; RV64IM-NEXT: mulhu a0, a0, a1
-; RV64IM-NEXT: srli a0, a0, 34
+; RV64IM-NEXT: srli a0, a0, 32
; RV64IM-NEXT: ret
%1 = udiv i32 %a, 5
ret i32 %1
@@ -109,12 +109,11 @@ define i64 @udiv64_constant(i64 %a) nounwind {
;
; RV64IM-LABEL: udiv64_constant:
; RV64IM: # %bb.0:
-; RV64IM-NEXT: lui a1, 838861
-; RV64IM-NEXT: addiw a1, a1, -819
+; RV64IM-NEXT: lui a1, 209715
+; RV64IM-NEXT: addiw a1, a1, 819
; RV64IM-NEXT: slli a2, a1, 32
; RV64IM-NEXT: add a1, a1, a2
; RV64IM-NEXT: mulhu a0, a0, a1
-; RV64IM-NEXT: srli a0, a0, 2
; RV64IM-NEXT: ret
%1 = udiv i64 %a, 5
ret i64 %1
@@ -173,9 +172,9 @@ define i8 @udiv8_constant(i8 %a) nounwind {
; RV64IM-LABEL: udiv8_constant:
; RV64IM: # %bb.0:
; RV64IM-NEXT: andi a0, a0, 255
-; RV64IM-NEXT: li a1, 205
+; RV64IM-NEXT: li a1, 51
; RV64IM-NEXT: mul a0, a0, a1
-; RV64IM-NEXT: srliw a0, a0, 10
+; RV64IM-NEXT: srliw a0, a0, 8
; RV64IM-NEXT: ret
%1 = udiv i8 %a, 5
ret i8 %1
@@ -260,11 +259,10 @@ define i16 @udiv16_constant(i16 %a) nounwind {
;
; RV64IM-LABEL: udiv16_constant:
; RV64IM: # %bb.0:
-; RV64IM-NEXT: lui a1, 52429
-; RV64IM-NEXT: slli a1, a1, 4
; RV64IM-NEXT: slli a0, a0, 48
+; RV64IM-NEXT: lui a1, 209712
; RV64IM-NEXT: mulhu a0, a0, a1
-; RV64IM-NEXT: srliw a0, a0, 18
+; RV64IM-NEXT: srliw a0, a0, 16
; RV64IM-NEXT: ret
%1 = udiv i16 %a, 5
ret i16 %1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
index 65a1035fd815c..3621be0126c34 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
@@ -116,24 +116,18 @@ define <4 x i32> @udiv_constant_rhs(i32 %a, i32 %b, i32 %c, i32 %d) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vslide1down.vx v8, v8, a1
+; CHECK-NEXT: vslide1down.vx v8, v8, a2
; CHECK-NEXT: lui a0, %hi(.LCPI4_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI4_0)
; CHECK-NEXT: vle32.v v9, (a0)
-; CHECK-NEXT: vslide1down.vx v8, v8, a1
-; CHECK-NEXT: vslide1down.vx v8, v8, a2
-; CHECK-NEXT: vslide1down.vx v8, v8, a3
-; CHECK-NEXT: vmulhu.vv v9, v8, v9
-; CHECK-NEXT: vsub.vv v10, v8, v9
-; CHECK-NEXT: vmv.v.i v11, 0
-; CHECK-NEXT: lui a0, 524288
-; CHECK-NEXT: vslide1down.vx v11, v11, a0
; CHECK-NEXT: lui a0, %hi(.LCPI4_1)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI4_1)
-; CHECK-NEXT: vle32.v v12, (a0)
-; CHECK-NEXT: vmulhu.vv v10, v10, v11
-; CHECK-NEXT: vadd.vv v9, v10, v9
+; CHECK-NEXT: vle32.v v10, (a0)
+; CHECK-NEXT: vslide1down.vx v8, v8, a3
+; CHECK-NEXT: vmulhu.vv v9, v8, v9
; CHECK-NEXT: vmv.v.i v0, 4
-; CHECK-NEXT: vsrl.vv v9, v9, v12
+; CHECK-NEXT: vsrl.vv v9, v9, v10
; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
; CHECK-NEXT: ret
%e0 = udiv i32 %a, 23
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
index d309da6df7dc7..dd3c1644b1f22 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
@@ -1016,14 +1016,12 @@ define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) {
define i32 @extractelt_udiv_v4i32(<4 x i32> %x) {
; RV32NOM-LABEL: extractelt_udiv_v4i32:
; RV32NOM: # %bb.0:
+; RV32NOM-NEXT: lui a0, 80660
+; RV32NOM-NEXT: addi a0, a0, -1260
; RV32NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32NOM-NEXT: vsrl.vi v8, v8, 0
-; RV32NOM-NEXT: lui a0, 322639
-; RV32NOM-NEXT: addi a0, a0, -945
; RV32NOM-NEXT: vmulhu.vx v8, v8, a0
; RV32NOM-NEXT: vslidedown.vi v8, v8, 2
; RV32NOM-NEXT: vmv.x.s a0, v8
-; RV32NOM-NEXT: srli a0, a0, 2
; RV32NOM-NEXT: ret
;
; RV32M-LABEL: extractelt_udiv_v4i32:
@@ -1031,36 +1029,32 @@ define i32 @extractelt_udiv_v4i32(<4 x i32> %x) {
; RV32M-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV32M-NEXT: vslidedown.vi v8, v8, 2
; RV32M-NEXT: vmv.x.s a0, v8
-; RV32M-NEXT: lui a1, 322639
-; RV32M-NEXT: addi a1, a1, -945
+; RV32M-NEXT: lui a1, 80660
+; RV32M-NEXT: addi a1, a1, -1260
; RV32M-NEXT: mulhu a0, a0, a1
-; RV32M-NEXT: srli a0, a0, 2
; RV32M-NEXT: ret
;
; RV64NOM-LABEL: extractelt_udiv_v4i32:
; RV64NOM: # %bb.0:
+; RV64NOM-NEXT: lui a0, 80660
+; RV64NOM-NEXT: addi a0, a0, -1260
; RV64NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64NOM-NEXT: vsrl.vi v8, v8, 0
-; RV64NOM-NEXT: lui a0, 322639
-; RV64NOM-NEXT: addi a0, a0, -945
; RV64NOM-NEXT: vmulhu.vx v8, v8, a0
; RV64NOM-NEXT: vslidedown.vi v8, v8, 2
; RV64NOM-NEXT: vmv.x.s a0, v8
-; RV64NOM-NEXT: slli a0, a0, 33
-; RV64NOM-NEXT: srli a0, a0, 35
; RV64NOM-NEXT: ret
;
; RV64M-LABEL: extractelt_udiv_v4i32:
; RV64M: # %bb.0:
-; RV64M-NEXT: lui a0, 322639
-; RV64M-NEXT: addi a0, a0, -945
-; RV64M-NEXT: slli a0, a0, 32
+; RV64M-NEXT: lui a0, 20165
+; RV64M-NEXT: addi a0, a0, -315
+; RV64M-NEXT: slli a0, a0, 34
; RV64M-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV64M-NEXT: vslidedown.vi v8, v8, 2
; RV64M-NEXT: vmv.x.s a1, v8
; RV64M-NEXT: slli a1, a1, 32
; RV64M-NEXT: mulhu a0, a1, a0
-; RV64M-NEXT: srli a0, a0, 34
+; RV64M-NEXT: srli a0, a0, 32
; RV64M-NEXT: ret
%bo = udiv <4 x i32> %x, <i32 11, i32 12, i32 13, i32 14>
%ext = extractelement <4 x i32> %bo, i32 2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
index ea2cdae903e5a..e9b4af22a364e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
@@ -1099,47 +1099,25 @@ define void @urem_v2i64(ptr %x, ptr %y) {
define void @mulhu_v16i8(ptr %x) {
; CHECK-LABEL: mulhu_v16i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: vle8.v v9, (a0)
-; CHECK-NEXT: lui a1, 3
-; CHECK-NEXT: addi a1, a1, -2044
-; CHECK-NEXT: vmv.s.x v0, a1
-; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v10, 0
-; CHECK-NEXT: lui a1, 1
-; CHECK-NEXT: addi a2, a1, 32
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vmv.s.x v8, a2
-; CHECK-NEXT: lui a2, %hi(.LCPI65_0)
-; CHECK-NEXT: addi a2, a2, %lo(.LCPI65_0)
-; CHECK-NEXT: vle8.v v11, (a2)
-; CHECK-NEXT: li a2, -128
-; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmerge.vxm v12, v10, a2, v0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT: vsrl.vv v8, v9, v8
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: lui a1, %hi(.LCPI65_0)
+; CHECK-NEXT: addi a1, a1, %lo(.LCPI65_0)
+; CHECK-NEXT: vle8.v v9, (a1)
+; CHECK-NEXT: li a1, -128
+; CHECK-NEXT: vmv.s.x v10, a1
+; CHECK-NEXT: vmv.v.i v11, 0
+; CHECK-NEXT: vsetivli zero, 6, e8, m1, tu, ma
+; CHECK-NEXT: vslideup.vi v11, v10, 5
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vmulhu.vv v9, v8, v9
+; CHECK-NEXT: lui a1, %hi(.LCPI65_1)
+; CHECK-NEXT: addi a1, a1, %lo(.LCPI65_1)
+; CHECK-NEXT: vle8.v v10, (a1)
+; CHECK-NEXT: vsub.vv v8, v8, v9
; CHECK-NEXT: vmulhu.vv v8, v8, v11
-; CHECK-NEXT: vsub.vv v9, v9, v8
-; CHECK-NEXT: vmulhu.vv v9, v9, v12
-; CHECK-NEXT: vadd.vv v9, v9, v8
-; CHECK-NEXT: li a2, 513
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vmv.s.x v0, a2
-; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 4
-; CHECK-NEXT: vmerge.vim v10, v8, 1, v0
-; CHECK-NEXT: addi a1, a1, 78
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vmv.s.x v0, a1
-; CHECK-NEXT: lui a1, 8
-; CHECK-NEXT: addi a1, a1, 304
-; CHECK-NEXT: vmv.s.x v8, a1
-; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT: vmerge.vim v10, v10, 3, v0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vim v8, v10, 2, v0
-; CHECK-NEXT: vsrl.vv v8, v9, v8
+; CHECK-NEXT: vadd.vv v8, v8, v9
+; CHECK-NEXT: vsrl.vv v8, v8, v10
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: ret
%a = load <16 x i8>, ptr %x
@@ -1153,32 +1131,23 @@ define void @mulhu_v8i16(ptr %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vmv.v.i v9, 0
-; CHECK-NEXT: lui a1, 1048568
-; CHECK-NEXT: vmv.v.i v10, 0
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma
-; CHECK-NEXT: vmv.s.x v10, a1
; CHECK-NEXT: lui a1, %hi(.LCPI66_0)
; CHECK-NEXT: addi a1, a1, %lo(.LCPI66_0)
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vle16.v v11, (a1)
-; CHECK-NEXT: vmv.v.i v12, 1
+; CHECK-NEXT: vle16.v v9, (a1)
+; CHECK-NEXT: lui a1, 1048568
+; CHECK-NEXT: vmv.s.x v10, a1
+; CHECK-NEXT: vmv.v.i v11, 0
; CHECK-NEXT: vsetivli zero, 7, e16, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v9, v12, 6
+; CHECK-NEXT: vslideup.vi v11, v10, 6
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vsrl.vv v9, v8, v9
-; CHECK-NEXT: vmulhu.vv v9, v9, v11
+; CHECK-NEXT: vmulhu.vv v9, v8, v9
+; CHECK-NEXT: lui a1, %hi(.LCPI66_1)
+; CHECK-NEXT: addi a1, a1, %lo(.LCPI66_1)
+; CHECK-NEXT: vle16.v v10, (a1)
; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: vmulhu.vv v8, v8, v10
+; CHECK-NEXT: vmulhu.vv v8, v8, v11
; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: li a1, 33
-; CHECK-NEXT: vmv.s.x v0, a1
-; CHECK-NEXT: vmv.v.i v9, 3
-; CHECK-NEXT: vmerge.vim v9, v9, 2, v0
-; CHECK-NEXT: vsetivli zero, 7, e16, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v9, v12, 6
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vsrl.vv v8, v8, v9
+; CHECK-NEXT: vsrl.vv v8, v8, v10
; CHECK-NEXT: vse16.v v8, (a0)
; CHECK-NEXT: ret
%a = load <8 x i16>, ptr %x
@@ -1225,18 +1194,9 @@ define void @mulhu_v4i32(ptr %x) {
; CHECK-NEXT: lui a1, %hi(.LCPI68_0)
; CHECK-NEXT: addi a1, a1, %lo(.LCPI68_0)
; CHECK-NEXT: vle32.v v9, (a1)
-; CHECK-NEXT: lui a1, 524288
-; CHECK-NEXT: vmv.s.x v10, a1
-; CHECK-NEXT: vmv.v.i v11, 0
-; CHECK-NEXT: vsetivli zero, 3, e32, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v11, v10, 2
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vmulhu.vv v9, v8, v9
-; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: vmulhu.vv v8, v8, v11
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: lui a1, 4128
-; CHECK-NEXT: addi a1, a1, 514
+; CHECK-NEXT: vmulhu.vv v8, v8, v9
+; CHECK-NEXT: lui a1, 8192
+; CHECK-NEXT: addi a1, a1, 256
; CHECK-NEXT: vmv.s.x v9, a1
; CHECK-NEXT: vsext.vf4 v10, v9
; CHECK-NEXT: vsrl.vv v8, v8, v10
@@ -1253,19 +1213,16 @@ define void @mulhu_v2i64(ptr %x) {
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: lui a1, %hi(.LCPI69_0)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI69_0)
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vle32.v v9, (a1)
+; RV32-NEXT: vmv.v.x v9, a1
+; RV32-NEXT: vmv.v.i v0, 3
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: vmerge.vxm v9, v9, a1, v0
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vmulhu.vv v8, v8, v9
-; RV32-NEXT: lui a1, 32
-; RV32-NEXT: addi a1, a1, 1
-; RV32-NEXT: vmv.s.x v9, a1
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vsext.vf4 v10, v9
-; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT: vsrl.vv v8, v8, v10
; RV32-NEXT: vse64.v v8, (a0)
; RV32-NEXT: ret
;
@@ -1273,22 +1230,19 @@ define void @mulhu_v2i64(ptr %x) {
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: lui a1, 838861
-; RV64-NEXT: addiw a1, a1, -819
+; RV64-NEXT: lui a1, 209715
+; RV64-NEXT: addiw a1, a1, 819
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: vmv.v.x v9, a1
-; RV64-NEXT: lui a1, 699051
-; RV64-NEXT: addiw a1, a1, -1365
+; RV64-NEXT: lui a1, 349525
+; RV64-NEXT: addiw a1, a1, 1365
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: vsetvli zero, zero, e64, m1, tu, ma
; RV64-NEXT: vmv.s.x v9, a1
; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma
; RV64-NEXT: vmulhu.vv v8, v8, v9
-; RV64-NEXT: vid.v v9
-; RV64-NEXT: vadd.vi v9, v9, 1
-; RV64-NEXT: vsrl.vv v8, v8, v9
; RV64-NEXT: vse64.v v8, (a0)
; RV64-NEXT: ret
%a = load <2 x i64>, ptr %x
@@ -1302,18 +1256,18 @@ define void @mulhs_v16i8(ptr %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: li a1, -123
+; CHECK-NEXT: li a1, 33
; CHECK-NEXT: vmv.v.x v9, a1
; CHECK-NEXT: lui a1, 5
; CHECK-NEXT: addi a1, a1, -1452
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT: vmv.s.x v0, a1
-; CHECK-NEXT: li a1, 57
+; CHECK-NEXT: li a1, 113
; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT: vmerge.vxm v9, v9, a1, v0
; CHECK-NEXT: vmulhu.vv v8, v8, v9
-; CHECK-NEXT: vmv.v.i v9, 7
-; CHECK-NEXT: vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT: vmv.v.i v9, 5
+; CHECK-NEXT: vmerge.vim v9, v9, 2, v0
; CHECK-NEXT: vsrl.vv v8, v8, v9
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: ret
@@ -3260,46 +3214,44 @@ define void @mulhu_v32i8(ptr %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vle8.v v10, (a0)
-; CHECK-NEXT: vmv.v.i v12, 0
-; CHECK-NEXT: lui a1, 163907
-; CHECK-NEXT: addi a1, a1, -2044
-; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vmv.s.x v0, a1
-; CHECK-NEXT: lui a1, 66049
-; CHECK-NEXT: addi a1, a1, 32
-; CHECK-NEXT: vmv.s.x v8, a1
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vmv.v.i v10, 0
; CHECK-NEXT: lui a1, %hi(.LCPI181_0)
; CHECK-NEXT: addi a1, a1, %lo(.LCPI181_0)
-; CHECK-NEXT: vle8.v v14, (a1)
+; CHECK-NEXT: vle8.v v12, (a1)
+; CHECK-NEXT: lui a1, 512
+; CHECK-NEXT: addi a1, a1, 32
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmv.s.x v0, a1
; CHECK-NEXT: li a1, -128
; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmerge.vxm v16, v12, a1, v0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vim v8, v12, 1, v0
-; CHECK-NEXT: vsrl.vv v8, v10, v8
-; CHECK-NEXT: vmulhu.vv v8, v8, v14
-; CHECK-NEXT: vsub.vv v10, v10, v8
-; CHECK-NEXT: vmulhu.vv v10, v10, v16
-; CHECK-NEXT: vadd.vv v10, v10, v8
-; CHECK-NEXT: lui a1, 8208
-; CHECK-NEXT: addi a1, a1, 513
+; CHECK-NEXT: vmerge.vxm v10, v10, a1, v0
+; CHECK-NEXT: vmulhu.vv v12, v8, v12
+; CHECK-NEXT: vsub.vv v8, v8, v12
+; CHECK-NEXT: vmulhu.vv v8, v8, v10
+; CHECK-NEXT: vadd.vv v10, v8, v12
+; CHECK-NEXT: vmv.v.i v12, 3
+; CHECK-NEXT: lui a1, 16528
+; CHECK-NEXT: addi a1, a1, 1033
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vmv.s.x v0, a1
+; CHECK-NEXT: lui a1, 32
+; CHECK-NEXT: addi a1, a1, 2
+; CHECK-NEXT: vmv.s.x v8, a1
; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v8, 4
-; CHECK-NEXT: vmerge.vim v12, v8, 1, v0
-; CHECK-NEXT: lui a1, 66785
-; CHECK-NEXT: addi a1, a1, 78
+; CHECK-NEXT: vmerge.vim v12, v12, 2, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v12, v12, 1, v0
+; CHECK-NEXT: lui a1, 3328
+; CHECK-NEXT: addi a1, a1, 208
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vmv.s.x v0, a1
-; CHECK-NEXT: lui a1, 529160
-; CHECK-NEXT: addi a1, a1, 304
+; CHECK-NEXT: lui a1, 720907
; CHECK-NEXT: vmv.s.x v8, a1
; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmerge.vim v12, v12, 3, v0
+; CHECK-NEXT: vmerge.vim v12, v12, 0, v0
; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vim v8, v12, 2, v0
+; CHECK-NEXT: vmerge.vim v8, v12, 4, v0
; CHECK-NEXT: vsrl.vv v8, v10, v8
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: ret
@@ -3313,37 +3265,36 @@ define void @mulhu_v16i16(ptr %x) {
; RV32-LABEL: mulhu_v16i16:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT: vle16.v v10, (a0)
-; RV32-NEXT: li a1, 257
-; RV32-NEXT: vmv.s.x v0, a1
-; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: lui a1, 1048568
-; RV32-NEXT: vmerge.vxm v12, v8, a1, v0
+; RV32-NEXT: vle16.v v8, (a0)
; RV32-NEXT: lui a1, 4
; RV32-NEXT: addi a1, a1, 64
-; RV32-NEXT: vmv.s.x v8, a1
-; RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; RV32-NEXT: vmv.v.i v9, 0
+; RV32-NEXT: vmv.s.x v0, a1
; RV32-NEXT: lui a1, %hi(.LCPI182_0)
; RV32-NEXT: addi a1, a1, %lo(.LCPI182_0)
-; RV32-NEXT: vle16.v v14, (a1)
-; RV32-NEXT: vmv1r.v v0, v8
-; RV32-NEXT: vmerge.vim v9, v9, 1, v0
-; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; RV32-NEXT: vsext.vf2 v16, v9
-; RV32-NEXT: vsrl.vv v16, v10, v16
-; RV32-NEXT: vmulhu.vv v14, v16, v14
-; RV32-NEXT: vsub.vv v10, v10, v14
-; RV32-NEXT: vmulhu.vv v10, v10, v12
-; RV32-NEXT: vadd.vv v10, v10, v14
+; RV32-NEXT: vle16.v v10, (a1)
+; RV32-NEXT: vmv.v.i v12, 0
+; RV32-NEXT: lui a1, 1048568
+; RV32-NEXT: vmerge.vxm v12, v12, a1, v0
+; RV32-NEXT: vmulhu.vv v10, v8, v10
+; RV32-NEXT: vsub.vv v8, v8, v10
+; RV32-NEXT: vmulhu.vv v8, v8, v12
+; RV32-NEXT: vadd.vv v10, v8, v10
; RV32-NEXT: lui a1, 2
-; RV32-NEXT: addi a1, a1, 289
+; RV32-NEXT: addi a1, a1, 546
+; RV32-NEXT: vmv.s.x v0, a1
+; RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma
+; RV32-NEXT: vmv.v.i v8, 0
+; RV32-NEXT: vmerge.vim v9, v8, 3, v0
+; RV32-NEXT: li a1, 1028
+; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; RV32-NEXT: vmv.s.x v0, a1
+; RV32-NEXT: lui a1, 1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vmv.s.x v8, a1
; RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; RV32-NEXT: vmv.v.i v9, 3
-; RV32-NEXT: vmerge.vim v9, v9, 2, v0
+; RV32-NEXT: vmerge.vim v9, v9, 1, v0
; RV32-NEXT: vmv1r.v v0, v8
-; RV32-NEXT: vmerge.vim v8, v9, 1, v0
+; RV32-NEXT: vmerge.vim v8, v9, 2, v0
; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; RV32-NEXT: vsext.vf2 v12, v8
; RV32-NEXT: vsrl.vv v8, v10, v12
@@ -3354,27 +3305,23 @@ define void @mulhu_v16i16(ptr %x) {
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; RV64-NEXT: vle16.v v8, (a0)
-; RV64-NEXT: li a1, 257
+; RV64-NEXT: lui a1, 4
+; RV64-NEXT: addi a1, a1, 64
; RV64-NEXT: vmv.s.x v0, a1
-; RV64-NEXT: vmv.v.i v10, 0
-; RV64-NEXT: lui a1, 1048568
-; RV64-NEXT: vmerge.vxm v10, v10, a1, v0
; RV64-NEXT: lui a1, %hi(.LCPI182_0)
; RV64-NEXT: addi a1, a1, %lo(.LCPI182_0)
-; RV64-NEXT: vle16.v v12, (a1)
-; RV64-NEXT: li a1, 1
-; RV64-NEXT: slli a1, a1, 48
-; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT: vmv.v.x v14, a1
-; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v14
-; RV64-NEXT: vsrl.vv v14, v8, v16
-; RV64-NEXT: vmulhu.vv v12, v14, v12
-; RV64-NEXT: lui a1, %hi(.LCPI182_1)
-; RV64-NEXT: ld a1, %lo(.LCPI182_1)(a1)
-; RV64-NEXT: vsub.vv v8, v8, v12
-; RV64-NEXT: vmulhu.vv v8, v8, v10
-; RV64-NEXT: vadd.vv v8, v8, v12
+; RV64-NEXT: vle16.v v10, (a1)
+; RV64-NEXT: vmv.v.i v12, 0
+; RV64-NEXT: lui a1, 1048568
+; RV64-NEXT: vmerge.vxm v12, v12, a1, v0
+; RV64-NEXT: vmulhu.vv v10, v8, v10
+; RV64-NEXT: vsub.vv v8, v8, v10
+; RV64-NEXT: vmulhu.vv v8, v8, v12
+; RV64-NEXT: vadd.vv v8, v8, v10
+; RV64-NEXT: lui a1, 12320
+; RV64-NEXT: addiw a1, a1, 1
+; RV64-NEXT: slli a1, a1, 16
+; RV64-NEXT: addi a1, a1, 768
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT: vmv.v.x v10, a1
; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
@@ -3393,20 +3340,12 @@ define void @mulhu_v8i32(ptr %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: li a1, 68
-; CHECK-NEXT: vmv.s.x v0, a1
; CHECK-NEXT: lui a1, %hi(.LCPI183_0)
; CHECK-NEXT: addi a1, a1, %lo(.LCPI183_0)
; CHECK-NEXT: vle32.v v10, (a1)
-; CHECK-NEXT: vmv.v.i v12, 0
-; CHECK-NEXT: lui a1, 524288
-; CHECK-NEXT: vmerge.vxm v12, v12, a1, v0
-; CHECK-NEXT: vmulhu.vv v10, v8, v10
-; CHECK-NEXT: vsub.vv v8, v8, v10
-; CHECK-NEXT: vmulhu.vv v8, v8, v12
-; CHECK-NEXT: vadd.vv v8, v8, v10
-; CHECK-NEXT: lui a1, 4128
-; CHECK-NEXT: addi a1, a1, 514
+; CHECK-NEXT: vmulhu.vv v8, v8, v10
+; CHECK-NEXT: lui a1, 8192
+; CHECK-NEXT: addi a1, a1, 256
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vmv.v.x v10, a1
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
@@ -3430,25 +3369,16 @@ define void @mulhu_v4i64(ptr %x) {
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT: vle32.v v10, (a1)
; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT: vmulhu.vv v10, v8, v10
-; RV32-NEXT: lui a1, 524288
-; RV32-NEXT: vmv.s.x v12, a1
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.i v14, 0
-; RV32-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; RV32-NEXT: vslideup.vi v14, v12, 5
-; RV32-NEXT: lui a1, %hi(.LCPI184_1)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI184_1)
+; RV32-NEXT: vmulhu.vv v8, v8, v10
; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT: vle8.v v12, (a1)
-; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT: vsub.vv v8, v8, v10
-; RV32-NEXT: vmulhu.vv v8, v8, v14
-; RV32-NEXT: vadd.vv v8, v8, v10
+; RV32-NEXT: vmv.v.i v10, 3
+; RV32-NEXT: vmv.v.i v11, 0
+; RV32-NEXT: vsetivli zero, 7, e8, mf2, tu, ma
+; RV32-NEXT: vslideup.vi v11, v10, 6
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vsext.vf4 v10, v12
+; RV32-NEXT: vsext.vf4 v12, v11
; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT: vsrl.vv v8, v8, v10
+; RV32-NEXT: vsrl.vv v8, v8, v12
; RV32-NEXT: vse64.v v8, (a0)
; RV32-NEXT: ret
;
@@ -3459,19 +3389,8 @@ define void @mulhu_v4i64(ptr %x) {
; RV64-NEXT: lui a1, %hi(.LCPI184_0)
; RV64-NEXT: addi a1, a1, %lo(.LCPI184_0)
; RV64-NEXT: vle64.v v10, (a1)
-; RV64-NEXT: vmulhu.vv v10, v8, v10
-; RV64-NEXT: vsub.vv v8, v8, v10
-; RV64-NEXT: li a1, -1
-; RV64-NEXT: slli a1, a1, 63
-; RV64-NEXT: vmv.s.x v12, a1
-; RV64-NEXT: vmv.v.i v14, 0
-; RV64-NEXT: vsetivli zero, 3, e64, m2, tu, ma
-; RV64-NEXT: vslideup.vi v14, v12, 2
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT: vmulhu.vv v8, v8, v14
-; RV64-NEXT: vadd.vv v8, v8, v10
-; RV64-NEXT: lui a1, 12320
-; RV64-NEXT: addi a1, a1, 513
+; RV64-NEXT: vmulhu.vv v8, v8, v10
+; RV64-NEXT: lui a1, 12288
; RV64-NEXT: vmv.s.x v10, a1
; RV64-NEXT: vsext.vf8 v12, v10
; RV64-NEXT: vsrl.vv v8, v8, v12
@@ -3493,11 +3412,11 @@ define void @mulhs_v32i8(ptr %x) {
; CHECK-NEXT: addi a1, a1, -1452
; CHECK-NEXT: vmv.s.x v0, a1
; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v10, 7
-; CHECK-NEXT: vmerge.vim v10, v10, 1, v0
-; CHECK-NEXT: li a1, -123
+; CHECK-NEXT: vmv.v.i v10, 5
+; CHECK-NEXT: vmerge.vim v10, v10, 2, v0
+; CHECK-NEXT: li a1, 33
; CHECK-NEXT: vmv.v.x v12, a1
-; CHECK-NEXT: li a1, 57
+; CHECK-NEXT: li a1, 113
; CHECK-NEXT: vmerge.vxm v12, v12, a1, v0
; CHECK-NEXT: vmulhu.vv v8, v8, v12
; CHECK-NEXT: vsrl.vv v8, v8, v10
@@ -5573,9 +5492,9 @@ define void @mulhu_vx_v16i8(ptr %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: li a1, 57
+; CHECK-NEXT: li a1, 113
; CHECK-NEXT: vmulhu.vx v8, v8, a1
-; CHECK-NEXT: vsrl.vi v8, v8, 1
+; CHECK-NEXT: vsrl.vi v8, v8, 2
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: ret
%a = load <16 x i8>, ptr %x
@@ -5591,11 +5510,7 @@ define void @mulhu_vx_v8i16(ptr %x) {
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: lui a1, 2
; CHECK-NEXT: addi a1, a1, 1171
-; CHECK-NEXT: vmulhu.vx v9, v8, a1
-; CHECK-NEXT: vsub.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v8, v8, 1
-; CHECK-NEXT: vadd.vv v8, v8, v9
-; CHECK-NEXT: vsrl.vi v8, v8, 2
+; CHECK-NEXT: vmulhu.vx v8, v8, a1
; CHECK-NEXT: vse16.v v8, (a0)
; CHECK-NEXT: ret
%a = load <8 x i16>, ptr %x
@@ -5609,10 +5524,9 @@ define void @mulhu_vx_v4i32(ptr %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: lui a1, 838861
-; CHECK-NEXT: addi a1, a1, -819
+; CHECK-NEXT: lui a1, 209715
+; CHECK-NEXT: addi a1, a1, 819
; CHECK-NEXT: vmulhu.vx v8, v8, a1
-; CHECK-NEXT: vsrl.vi v8, v8, 2
; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: ret
%a = load <4 x i32>, ptr %x
@@ -5624,33 +5538,26 @@ define void @mulhu_vx_v4i32(ptr %x) {
define void @mulhu_vx_v2i64(ptr %x) {
; RV32-LABEL: mulhu_vx_v2i64:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: lui a1, 699051
-; RV32-NEXT: addi a2, a1, -1366
-; RV32-NEXT: sw a2, 12(sp)
-; RV32-NEXT: addi a1, a1, -1365
-; RV32-NEXT: sw a1, 8(sp)
-; RV32-NEXT: addi a1, sp, 8
-; RV32-NEXT: vlse64.v v9, (a1), zero
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vmv.v.x v9, a1
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vmulhu.vv v8, v8, v9
-; RV32-NEXT: vsrl.vi v8, v8, 1
; RV32-NEXT: vse64.v v8, (a0)
-; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: mulhu_vx_v2i64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: lui a1, 699051
-; RV64-NEXT: addiw a1, a1, -1365
+; RV64-NEXT: lui a1, 349525
+; RV64-NEXT: addiw a1, a1, 1365
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: vmulhu.vx v8, v8, a1
-; RV64-NEXT: vsrl.vi v8, v8, 1
; RV64-NEXT: vse64.v v8, (a0)
; RV64-NEXT: ret
%a = load <2 x i64>, ptr %x
@@ -5664,9 +5571,9 @@ define void @mulhs_vx_v16i8(ptr %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: li a1, -123
+; CHECK-NEXT: li a1, 33
; CHECK-NEXT: vmulhu.vx v8, v8, a1
-; CHECK-NEXT: vsrl.vi v8, v8, 7
+; CHECK-NEXT: vsrl.vi v8, v8, 5
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: ret
%a = load <16 x i8>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode.ll
index 4f2fb937ca73f..6a938a679f57c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode.ll
@@ -29,10 +29,10 @@ define <vscale x 1 x i8> @vdivu_vx_nxv1i8(<vscale x 1 x i8> %va, i8 signext %b)
define <vscale x 1 x i8> @vdivu_vi_nxv1i8_0(<vscale x 1 x i8> %va) {
; CHECK-LABEL: vdivu_vi_nxv1i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 5
+; CHECK-NEXT: vsrl.vi v8, v8, 7
; CHECK-NEXT: ret
%vc = udiv <vscale x 1 x i8> %va, splat (i8 -7)
ret <vscale x 1 x i8> %vc
@@ -83,10 +83,10 @@ define <vscale x 2 x i8> @vdivu_vx_nxv2i8(<vscale x 2 x i8> %va, i8 signext %b)
define <vscale x 2 x i8> @vdivu_vi_nxv2i8_0(<vscale x 2 x i8> %va) {
; CHECK-LABEL: vdivu_vi_nxv2i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 5
+; CHECK-NEXT: vsrl.vi v8, v8, 7
; CHECK-NEXT: ret
%vc = udiv <vscale x 2 x i8> %va, splat (i8 -7)
ret <vscale x 2 x i8> %vc
@@ -117,10 +117,10 @@ define <vscale x 4 x i8> @vdivu_vx_nxv4i8(<vscale x 4 x i8> %va, i8 signext %b)
define <vscale x 4 x i8> @vdivu_vi_nxv4i8_0(<vscale x 4 x i8> %va) {
; CHECK-LABEL: vdivu_vi_nxv4i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 5
+; CHECK-NEXT: vsrl.vi v8, v8, 7
; CHECK-NEXT: ret
%vc = udiv <vscale x 4 x i8> %va, splat (i8 -7)
ret <vscale x 4 x i8> %vc
@@ -151,10 +151,10 @@ define <vscale x 8 x i8> @vdivu_vx_nxv8i8(<vscale x 8 x i8> %va, i8 signext %b)
define <vscale x 8 x i8> @vdivu_vi_nxv8i8_0(<vscale x 8 x i8> %va) {
; CHECK-LABEL: vdivu_vi_nxv8i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 5
+; CHECK-NEXT: vsrl.vi v8, v8, 7
; CHECK-NEXT: ret
%vc = udiv <vscale x 8 x i8> %va, splat (i8 -7)
ret <vscale x 8 x i8> %vc
@@ -185,10 +185,10 @@ define <vscale x 16 x i8> @vdivu_vx_nxv16i8(<vscale x 16 x i8> %va, i8 signext %
define <vscale x 16 x i8> @vdivu_vi_nxv16i8_0(<vscale x 16 x i8> %va) {
; CHECK-LABEL: vdivu_vi_nxv16i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 5
+; CHECK-NEXT: vsrl.vi v8, v8, 7
; CHECK-NEXT: ret
%vc = udiv <vscale x 16 x i8> %va, splat (i8 -7)
ret <vscale x 16 x i8> %vc
@@ -219,10 +219,10 @@ define <vscale x 32 x i8> @vdivu_vx_nxv32i8(<vscale x 32 x i8> %va, i8 signext %
define <vscale x 32 x i8> @vdivu_vi_nxv32i8_0(<vscale x 32 x i8> %va) {
; CHECK-LABEL: vdivu_vi_nxv32i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 5
+; CHECK-NEXT: vsrl.vi v8, v8, 7
; CHECK-NEXT: ret
%vc = udiv <vscale x 32 x i8> %va, splat (i8 -7)
ret <vscale x 32 x i8> %vc
@@ -253,10 +253,10 @@ define <vscale x 64 x i8> @vdivu_vx_nxv64i8(<vscale x 64 x i8> %va, i8 signext %
define <vscale x 64 x i8> @vdivu_vi_nxv64i8_0(<vscale x 64 x i8> %va) {
; CHECK-LABEL: vdivu_vi_nxv64i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 5
+; CHECK-NEXT: vsrl.vi v8, v8, 7
; CHECK-NEXT: ret
%vc = udiv <vscale x 64 x i8> %va, splat (i8 -7)
ret <vscale x 64 x i8> %vc
@@ -287,11 +287,11 @@ define <vscale x 1 x i16> @vdivu_vx_nxv1i16(<vscale x 1 x i16> %va, i16 signext
define <vscale x 1 x i16> @vdivu_vi_nxv1i16_0(<vscale x 1 x i16> %va) {
; CHECK-LABEL: vdivu_vi_nxv1i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 13
+; CHECK-NEXT: vsrl.vi v8, v8, 15
; CHECK-NEXT: ret
%vc = udiv <vscale x 1 x i16> %va, splat (i16 -7)
ret <vscale x 1 x i16> %vc
@@ -322,11 +322,11 @@ define <vscale x 2 x i16> @vdivu_vx_nxv2i16(<vscale x 2 x i16> %va, i16 signext
define <vscale x 2 x i16> @vdivu_vi_nxv2i16_0(<vscale x 2 x i16> %va) {
; CHECK-LABEL: vdivu_vi_nxv2i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 13
+; CHECK-NEXT: vsrl.vi v8, v8, 15
; CHECK-NEXT: ret
%vc = udiv <vscale x 2 x i16> %va, splat (i16 -7)
ret <vscale x 2 x i16> %vc
@@ -357,11 +357,11 @@ define <vscale x 4 x i16> @vdivu_vx_nxv4i16(<vscale x 4 x i16> %va, i16 signext
define <vscale x 4 x i16> @vdivu_vi_nxv4i16_0(<vscale x 4 x i16> %va) {
; CHECK-LABEL: vdivu_vi_nxv4i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 13
+; CHECK-NEXT: vsrl.vi v8, v8, 15
; CHECK-NEXT: ret
%vc = udiv <vscale x 4 x i16> %va, splat (i16 -7)
ret <vscale x 4 x i16> %vc
@@ -392,11 +392,11 @@ define <vscale x 8 x i16> @vdivu_vx_nxv8i16(<vscale x 8 x i16> %va, i16 signext
define <vscale x 8 x i16> @vdivu_vi_nxv8i16_0(<vscale x 8 x i16> %va) {
; CHECK-LABEL: vdivu_vi_nxv8i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 13
+; CHECK-NEXT: vsrl.vi v8, v8, 15
; CHECK-NEXT: ret
%vc = udiv <vscale x 8 x i16> %va, splat (i16 -7)
ret <vscale x 8 x i16> %vc
@@ -427,11 +427,11 @@ define <vscale x 16 x i16> @vdivu_vx_nxv16i16(<vscale x 16 x i16> %va, i16 signe
define <vscale x 16 x i16> @vdivu_vi_nxv16i16_0(<vscale x 16 x i16> %va) {
; CHECK-LABEL: vdivu_vi_nxv16i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 13
+; CHECK-NEXT: vsrl.vi v8, v8, 15
; CHECK-NEXT: ret
%vc = udiv <vscale x 16 x i16> %va, splat (i16 -7)
ret <vscale x 16 x i16> %vc
@@ -462,11 +462,11 @@ define <vscale x 32 x i16> @vdivu_vx_nxv32i16(<vscale x 32 x i16> %va, i16 signe
define <vscale x 32 x i16> @vdivu_vi_nxv32i16_0(<vscale x 32 x i16> %va) {
; CHECK-LABEL: vdivu_vi_nxv32i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 13
+; CHECK-NEXT: vsrl.vi v8, v8, 15
; CHECK-NEXT: ret
%vc = udiv <vscale x 32 x i16> %va, splat (i16 -7)
ret <vscale x 32 x i16> %vc
@@ -497,11 +497,11 @@ define <vscale x 1 x i32> @vdivu_vx_nxv1i32(<vscale x 1 x i32> %va, i32 signext
define <vscale x 1 x i32> @vdivu_vi_nxv1i32_0(<vscale x 1 x i32> %va) {
; CHECK-LABEL: vdivu_vi_nxv1i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 131072
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 29
+; CHECK-NEXT: vsrl.vi v8, v8, 31
; CHECK-NEXT: ret
%vc = udiv <vscale x 1 x i32> %va, splat (i32 -7)
ret <vscale x 1 x i32> %vc
@@ -532,11 +532,11 @@ define <vscale x 2 x i32> @vdivu_vx_nxv2i32(<vscale x 2 x i32> %va, i32 signext
define <vscale x 2 x i32> @vdivu_vi_nxv2i32_0(<vscale x 2 x i32> %va) {
; CHECK-LABEL: vdivu_vi_nxv2i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 131072
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 29
+; CHECK-NEXT: vsrl.vi v8, v8, 31
; CHECK-NEXT: ret
%vc = udiv <vscale x 2 x i32> %va, splat (i32 -7)
ret <vscale x 2 x i32> %vc
@@ -567,11 +567,11 @@ define <vscale x 4 x i32> @vdivu_vx_nxv4i32(<vscale x 4 x i32> %va, i32 signext
define <vscale x 4 x i32> @vdivu_vi_nxv4i32_0(<vscale x 4 x i32> %va) {
; CHECK-LABEL: vdivu_vi_nxv4i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 131072
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 29
+; CHECK-NEXT: vsrl.vi v8, v8, 31
; CHECK-NEXT: ret
%vc = udiv <vscale x 4 x i32> %va, splat (i32 -7)
ret <vscale x 4 x i32> %vc
@@ -602,11 +602,11 @@ define <vscale x 8 x i32> @vdivu_vx_nxv8i32(<vscale x 8 x i32> %va, i32 signext
define <vscale x 8 x i32> @vdivu_vi_nxv8i32_0(<vscale x 8 x i32> %va) {
; CHECK-LABEL: vdivu_vi_nxv8i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 131072
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 29
+; CHECK-NEXT: vsrl.vi v8, v8, 31
; CHECK-NEXT: ret
%vc = udiv <vscale x 8 x i32> %va, splat (i32 -7)
ret <vscale x 8 x i32> %vc
@@ -637,11 +637,11 @@ define <vscale x 16 x i32> @vdivu_vx_nxv16i32(<vscale x 16 x i32> %va, i32 signe
define <vscale x 16 x i32> @vdivu_vi_nxv16i32_0(<vscale x 16 x i32> %va) {
; CHECK-LABEL: vdivu_vi_nxv16i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 131072
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma
; CHECK-NEXT: vmulhu.vx v8, v8, a0
-; CHECK-NEXT: vsrl.vi v8, v8, 29
+; CHECK-NEXT: vsrl.vi v8, v8, 31
; CHECK-NEXT: ret
%vc = udiv <vscale x 16 x i32> %va, splat (i32 -7)
ret <vscale x 16 x i32> %vc
@@ -687,15 +687,15 @@ define <vscale x 1 x i64> @vdivu_vi_nxv1i64_0(<vscale x 1 x i64> %va) {
; RV32-V: # %bb.0:
; RV32-V-NEXT: addi sp, sp, -16
; RV32-V-NEXT: .cfi_def_cfa_offset 16
-; RV32-V-NEXT: lui a0, 131072
+; RV32-V-NEXT: lui a0, 524288
; RV32-V-NEXT: sw a0, 12(sp)
-; RV32-V-NEXT: li a0, 1
+; RV32-V-NEXT: li a0, 3
; RV32-V-NEXT: sw a0, 8(sp)
; RV32-V-NEXT: addi a0, sp, 8
; RV32-V-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; RV32-V-NEXT: vlse64.v v9, (a0), zero
; RV32-V-NEXT: vmulhu.vv v8, v8, v9
-; RV32-V-NEXT: li a0, 61
+; RV32-V-NEXT: li a0, 63
; RV32-V-NEXT: vsrl.vx v8, v8, a0
; RV32-V-NEXT: addi sp, sp, 16
; RV32-V-NEXT: ret
@@ -709,12 +709,12 @@ define <vscale x 1 x i64> @vdivu_vi_nxv1i64_0(<vscale x 1 x i64> %va) {
;
; RV64-V-LABEL: vdivu_vi_nxv1i64_0:
; RV64-V: # %bb.0:
-; RV64-V-NEXT: li a0, 1
-; RV64-V-NEXT: slli a0, a0, 61
-; RV64-V-NEXT: addi a0, a0, 1
+; RV64-V-NEXT: li a0, -1
+; RV64-V-NEXT: slli a0, a0, 63
+; RV64-V-NEXT: addi a0, a0, 3
; RV64-V-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; RV64-V-NEXT: vmulhu.vx v8, v8, a0
-; RV64-V-NEXT: li a0, 61
+; RV64-V-NEXT: li a0, 63
; RV64-V-NEXT: vsrl.vx v8, v8, a0
; RV64-V-NEXT: ret
%vc = udiv <vscale x 1 x i64> %va, splat (i64 -7)
@@ -784,15 +784,15 @@ define <vscale x 2 x i64> @vdivu_vi_nxv2i64_0(<vscale x 2 x i64> %va) {
; RV32-V: # %bb.0:
; RV32-V-NEXT: addi sp, sp, -16
; RV32-V-NEXT: .cfi_def_cfa_offset 16
-; RV32-V-NEXT: lui a0, 131072
+; RV32-V-NEXT: lui a0, 524288
; RV32-V-NEXT: sw a0, 12(sp)
-; RV32-V-NEXT: li a0, 1
+; RV32-V-NEXT: li a0, 3
; RV32-V-NEXT: sw a0, 8(sp)
; RV32-V-NEXT: addi a0, sp, 8
; RV32-V-NEXT: vsetvli a1, zero, e64, m2, ta, ma
; RV32-V-NEXT: vlse64.v v10, (a0), zero
; RV32-V-NEXT: vmulhu.vv v8, v8, v10
-; RV32-V-NEXT: li a0, 61
+; RV32-V-NEXT: li a0, 63
; RV32-V-NEXT: vsrl.vx v8, v8, a0
; RV32-V-NEXT: addi sp, sp, 16
; RV32-V-NEXT: ret
@@ -806,12 +806,12 @@ define <vscale x 2 x i64> @vdivu_vi_nxv2i64_0(<vscale x 2 x i64> %va) {
;
; RV64-V-LABEL: vdivu_vi_nxv2i64_0:
; RV64-V: # %bb.0:
-; RV64-V-NEXT: li a0, 1
-; RV64-V-NEXT: slli a0, a0, 61
-; RV64-V-NEXT: addi a0, a0, 1
+; RV64-V-NEXT: li a0, -1
+; RV64-V-NEXT: slli a0, a0, 63
+; RV64-V-NEXT: addi a0, a0, 3
; RV64-V-NEXT: vsetvli a1, zero, e64, m2, ta, ma
; RV64-V-NEXT: vmulhu.vx v8, v8, a0
-; RV64-V-NEXT: li a0, 61
+; RV64-V-NEXT: li a0, 63
; RV64-V-NEXT: vsrl.vx v8, v8, a0
; RV64-V-NEXT: ret
%vc = udiv <vscale x 2 x i64> %va, splat (i64 -7)
@@ -881,15 +881,15 @@ define <vscale x 4 x i64> @vdivu_vi_nxv4i64_0(<vscale x 4 x i64> %va) {
; RV32-V: # %bb.0:
; RV32-V-NEXT: addi sp, sp, -16
; RV32-V-NEXT: .cfi_def_cfa_offset 16
-; RV32-V-NEXT: lui a0, 131072
+; RV32-V-NEXT: lui a0, 524288
; RV32-V-NEXT: sw a0, 12(sp)
-; RV32-V-NEXT: li a0, 1
+; RV32-V-NEXT: li a0, 3
; RV32-V-NEXT: sw a0, 8(sp)
; RV32-V-NEXT: addi a0, sp, 8
; RV32-V-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; RV32-V-NEXT: vlse64.v v12, (a0), zero
; RV32-V-NEXT: vmulhu.vv v8, v8, v12
-; RV32-V-NEXT: li a0, 61
+; RV32-V-NEXT: li a0, 63
; RV32-V-NEXT: vsrl.vx v8, v8, a0
; RV32-V-NEXT: addi sp, sp, 16
; RV32-V-NEXT: ret
@@ -903,12 +903,12 @@ define <vscale x 4 x i64> @vdivu_vi_nxv4i64_0(<vscale x 4 x i64> %va) {
;
; RV64-V-LABEL: vdivu_vi_nxv4i64_0:
; RV64-V: # %bb.0:
-; RV64-V-NEXT: li a0, 1
-; RV64-V-NEXT: slli a0, a0, 61
-; RV64-V-NEXT: addi a0, a0, 1
+; RV64-V-NEXT: li a0, -1
+; RV64-V-NEXT: slli a0, a0, 63
+; RV64-V-NEXT: addi a0, a0, 3
; RV64-V-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; RV64-V-NEXT: vmulhu.vx v8, v8, a0
-; RV64-V-NEXT: li a0, 61
+; RV64-V-NEXT: li a0, 63
; RV64-V-NEXT: vsrl.vx v8, v8, a0
; RV64-V-NEXT: ret
%vc = udiv <vscale x 4 x i64> %va, splat (i64 -7)
@@ -978,15 +978,15 @@ define <vscale x 8 x i64> @vdivu_vi_nxv8i64_0(<vscale x 8 x i64> %va) {
; RV32-V: # %bb.0:
; RV32-V-NEXT: addi sp, sp, -16
; RV32-V-NEXT: .cfi_def_cfa_offset 16
-; RV32-V-NEXT: lui a0, 131072
+; RV32-V-NEXT: lui a0, 524288
; RV32-V-NEXT: sw a0, 12(sp)
-; RV32-V-NEXT: li a0, 1
+; RV32-V-NEXT: li a0, 3
; RV32-V-NEXT: sw a0, 8(sp)
; RV32-V-NEXT: addi a0, sp, 8
; RV32-V-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; RV32-V-NEXT: vlse64.v v16, (a0), zero
; RV32-V-NEXT: vmulhu.vv v8, v8, v16
-; RV32-V-NEXT: li a0, 61
+; RV32-V-NEXT: li a0, 63
; RV32-V-NEXT: vsrl.vx v8, v8, a0
; RV32-V-NEXT: addi sp, sp, 16
; RV32-V-NEXT: ret
@@ -1000,12 +1000,12 @@ define <vscale x 8 x i64> @vdivu_vi_nxv8i64_0(<vscale x 8 x i64> %va) {
;
; RV64-V-LABEL: vdivu_vi_nxv8i64_0:
; RV64-V: # %bb.0:
-; RV64-V-NEXT: li a0, 1
-; RV64-V-NEXT: slli a0, a0, 61
-; RV64-V-NEXT: addi a0, a0, 1
+; RV64-V-NEXT: li a0, -1
+; RV64-V-NEXT: slli a0, a0, 63
+; RV64-V-NEXT: addi a0, a0, 3
; RV64-V-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; RV64-V-NEXT: vmulhu.vx v8, v8, a0
-; RV64-V-NEXT: li a0, 61
+; RV64-V-NEXT: li a0, 63
; RV64-V-NEXT: vsrl.vx v8, v8, a0
; RV64-V-NEXT: ret
%vc = udiv <vscale x 8 x i64> %va, splat (i64 -7)
@@ -1069,11 +1069,7 @@ define <vscale x 8 x i32> @vdivu_vi_mask_nxv8i32(<vscale x 8 x i32> %va, <vscale
; CHECK-NEXT: lui a0, 149797
; CHECK-NEXT: addi a0, a0, -1755
; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; CHECK-NEXT: vmulhu.vx v12, v8, a0
-; CHECK-NEXT: vsub.vv v16, v8, v12
-; CHECK-NEXT: vsrl.vi v16, v16, 1
-; CHECK-NEXT: vadd.vv v12, v16, v12
-; CHECK-NEXT: vsrl.vi v8, v12, 2, v0.t
+; CHECK-NEXT: vmulhu.vx v8, v8, a0, v0.t
; CHECK-NEXT: ret
%vs = select <vscale x 8 x i1> %mask, <vscale x 8 x i32> splat (i32 7), <vscale x 8 x i32> splat (i32 1)
%vc = udiv <vscale x 8 x i32> %va, %vs
diff --git a/llvm/test/CodeGen/RISCV/rvv/vremu-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vremu-sdnode.ll
index ed40f5af4fa4c..c880ae4f44b75 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vremu-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vremu-sdnode.ll
@@ -29,10 +29,10 @@ define <vscale x 1 x i8> @vremu_vx_nxv1i8(<vscale x 1 x i8> %va, i8 signext %b)
define <vscale x 1 x i8> @vremu_vi_nxv1i8_0(<vscale x 1 x i8> %va) {
; CHECK-LABEL: vremu_vi_nxv1i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
; CHECK-NEXT: vmulhu.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v9, v9, 5
+; CHECK-NEXT: vsrl.vi v9, v9, 7
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v9
; CHECK-NEXT: ret
@@ -65,10 +65,10 @@ define <vscale x 2 x i8> @vremu_vx_nxv2i8(<vscale x 2 x i8> %va, i8 signext %b)
define <vscale x 2 x i8> @vremu_vi_nxv2i8_0(<vscale x 2 x i8> %va) {
; CHECK-LABEL: vremu_vi_nxv2i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
; CHECK-NEXT: vmulhu.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v9, v9, 5
+; CHECK-NEXT: vsrl.vi v9, v9, 7
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v9
; CHECK-NEXT: ret
@@ -101,10 +101,10 @@ define <vscale x 4 x i8> @vremu_vx_nxv4i8(<vscale x 4 x i8> %va, i8 signext %b)
define <vscale x 4 x i8> @vremu_vi_nxv4i8_0(<vscale x 4 x i8> %va) {
; CHECK-LABEL: vremu_vi_nxv4i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
; CHECK-NEXT: vmulhu.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v9, v9, 5
+; CHECK-NEXT: vsrl.vi v9, v9, 7
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v9
; CHECK-NEXT: ret
@@ -137,10 +137,10 @@ define <vscale x 8 x i8> @vremu_vx_nxv8i8(<vscale x 8 x i8> %va, i8 signext %b)
define <vscale x 8 x i8> @vremu_vi_nxv8i8_0(<vscale x 8 x i8> %va) {
; CHECK-LABEL: vremu_vi_nxv8i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vmulhu.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v9, v9, 5
+; CHECK-NEXT: vsrl.vi v9, v9, 7
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v9
; CHECK-NEXT: ret
@@ -173,10 +173,10 @@ define <vscale x 16 x i8> @vremu_vx_nxv16i8(<vscale x 16 x i8> %va, i8 signext %
define <vscale x 16 x i8> @vremu_vi_nxv16i8_0(<vscale x 16 x i8> %va) {
; CHECK-LABEL: vremu_vi_nxv16i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
; CHECK-NEXT: vmulhu.vx v10, v8, a0
-; CHECK-NEXT: vsrl.vi v10, v10, 5
+; CHECK-NEXT: vsrl.vi v10, v10, 7
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v10
; CHECK-NEXT: ret
@@ -209,10 +209,10 @@ define <vscale x 32 x i8> @vremu_vx_nxv32i8(<vscale x 32 x i8> %va, i8 signext %
define <vscale x 32 x i8> @vremu_vi_nxv32i8_0(<vscale x 32 x i8> %va) {
; CHECK-LABEL: vremu_vi_nxv32i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
; CHECK-NEXT: vmulhu.vx v12, v8, a0
-; CHECK-NEXT: vsrl.vi v12, v12, 5
+; CHECK-NEXT: vsrl.vi v12, v12, 7
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v12
; CHECK-NEXT: ret
@@ -245,10 +245,10 @@ define <vscale x 64 x i8> @vremu_vx_nxv64i8(<vscale x 64 x i8> %va, i8 signext %
define <vscale x 64 x i8> @vremu_vi_nxv64i8_0(<vscale x 64 x i8> %va) {
; CHECK-LABEL: vremu_vi_nxv64i8_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 33
+; CHECK-NEXT: li a0, -125
; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma
; CHECK-NEXT: vmulhu.vx v16, v8, a0
-; CHECK-NEXT: vsrl.vi v16, v16, 5
+; CHECK-NEXT: vsrl.vi v16, v16, 7
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v16
; CHECK-NEXT: ret
@@ -281,11 +281,11 @@ define <vscale x 1 x i16> @vremu_vx_nxv1i16(<vscale x 1 x i16> %va, i16 signext
define <vscale x 1 x i16> @vremu_vi_nxv1i16_0(<vscale x 1 x i16> %va) {
; CHECK-LABEL: vremu_vi_nxv1i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
; CHECK-NEXT: vmulhu.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v9, v9, 13
+; CHECK-NEXT: vsrl.vi v9, v9, 15
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v9
; CHECK-NEXT: ret
@@ -318,11 +318,11 @@ define <vscale x 2 x i16> @vremu_vx_nxv2i16(<vscale x 2 x i16> %va, i16 signext
define <vscale x 2 x i16> @vremu_vi_nxv2i16_0(<vscale x 2 x i16> %va) {
; CHECK-LABEL: vremu_vi_nxv2i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
; CHECK-NEXT: vmulhu.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v9, v9, 13
+; CHECK-NEXT: vsrl.vi v9, v9, 15
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v9
; CHECK-NEXT: ret
@@ -355,11 +355,11 @@ define <vscale x 4 x i16> @vremu_vx_nxv4i16(<vscale x 4 x i16> %va, i16 signext
define <vscale x 4 x i16> @vremu_vi_nxv4i16_0(<vscale x 4 x i16> %va) {
; CHECK-LABEL: vremu_vi_nxv4i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vmulhu.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v9, v9, 13
+; CHECK-NEXT: vsrl.vi v9, v9, 15
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v9
; CHECK-NEXT: ret
@@ -392,11 +392,11 @@ define <vscale x 8 x i16> @vremu_vx_nxv8i16(<vscale x 8 x i16> %va, i16 signext
define <vscale x 8 x i16> @vremu_vi_nxv8i16_0(<vscale x 8 x i16> %va) {
; CHECK-LABEL: vremu_vi_nxv8i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vmulhu.vx v10, v8, a0
-; CHECK-NEXT: vsrl.vi v10, v10, 13
+; CHECK-NEXT: vsrl.vi v10, v10, 15
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v10
; CHECK-NEXT: ret
@@ -429,11 +429,11 @@ define <vscale x 16 x i16> @vremu_vx_nxv16i16(<vscale x 16 x i16> %va, i16 signe
define <vscale x 16 x i16> @vremu_vi_nxv16i16_0(<vscale x 16 x i16> %va) {
; CHECK-LABEL: vremu_vi_nxv16i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; CHECK-NEXT: vmulhu.vx v12, v8, a0
-; CHECK-NEXT: vsrl.vi v12, v12, 13
+; CHECK-NEXT: vsrl.vi v12, v12, 15
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v12
; CHECK-NEXT: ret
@@ -466,11 +466,11 @@ define <vscale x 32 x i16> @vremu_vx_nxv32i16(<vscale x 32 x i16> %va, i16 signe
define <vscale x 32 x i16> @vremu_vi_nxv32i16_0(<vscale x 32 x i16> %va) {
; CHECK-LABEL: vremu_vi_nxv32i16_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 2
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
; CHECK-NEXT: vmulhu.vx v16, v8, a0
-; CHECK-NEXT: vsrl.vi v16, v16, 13
+; CHECK-NEXT: vsrl.vi v16, v16, 15
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v16
; CHECK-NEXT: ret
@@ -503,11 +503,11 @@ define <vscale x 1 x i32> @vremu_vx_nxv1i32(<vscale x 1 x i32> %va, i32 signext
define <vscale x 1 x i32> @vremu_vi_nxv1i32_0(<vscale x 1 x i32> %va) {
; CHECK-LABEL: vremu_vi_nxv1i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 131072
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
; CHECK-NEXT: vmulhu.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v9, v9, 29
+; CHECK-NEXT: vsrl.vi v9, v9, 31
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v9
; CHECK-NEXT: ret
@@ -540,11 +540,11 @@ define <vscale x 2 x i32> @vremu_vx_nxv2i32(<vscale x 2 x i32> %va, i32 signext
define <vscale x 2 x i32> @vremu_vi_nxv2i32_0(<vscale x 2 x i32> %va) {
; CHECK-LABEL: vremu_vi_nxv2i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 131072
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vmulhu.vx v9, v8, a0
-; CHECK-NEXT: vsrl.vi v9, v9, 29
+; CHECK-NEXT: vsrl.vi v9, v9, 31
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v9
; CHECK-NEXT: ret
@@ -577,11 +577,11 @@ define <vscale x 4 x i32> @vremu_vx_nxv4i32(<vscale x 4 x i32> %va, i32 signext
define <vscale x 4 x i32> @vremu_vi_nxv4i32_0(<vscale x 4 x i32> %va) {
; CHECK-LABEL: vremu_vi_nxv4i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 131072
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; CHECK-NEXT: vmulhu.vx v10, v8, a0
-; CHECK-NEXT: vsrl.vi v10, v10, 29
+; CHECK-NEXT: vsrl.vi v10, v10, 31
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v10
; CHECK-NEXT: ret
@@ -614,11 +614,11 @@ define <vscale x 8 x i32> @vremu_vx_nxv8i32(<vscale x 8 x i32> %va, i32 signext
define <vscale x 8 x i32> @vremu_vi_nxv8i32_0(<vscale x 8 x i32> %va) {
; CHECK-LABEL: vremu_vi_nxv8i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 131072
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
; CHECK-NEXT: vmulhu.vx v12, v8, a0
-; CHECK-NEXT: vsrl.vi v12, v12, 29
+; CHECK-NEXT: vsrl.vi v12, v12, 31
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v12
; CHECK-NEXT: ret
@@ -651,11 +651,11 @@ define <vscale x 16 x i32> @vremu_vx_nxv16i32(<vscale x 16 x i32> %va, i32 signe
define <vscale x 16 x i32> @vremu_vi_nxv16i32_0(<vscale x 16 x i32> %va) {
; CHECK-LABEL: vremu_vi_nxv16i32_0:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a0, 131072
-; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: addi a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma
; CHECK-NEXT: vmulhu.vx v16, v8, a0
-; CHECK-NEXT: vsrl.vi v16, v16, 29
+; CHECK-NEXT: vsrl.vi v16, v16, 31
; CHECK-NEXT: li a0, -7
; CHECK-NEXT: vnmsac.vx v8, a0, v16
; CHECK-NEXT: ret
@@ -703,15 +703,15 @@ define <vscale x 1 x i64> @vremu_vi_nxv1i64_0(<vscale x 1 x i64> %va) {
; RV32-V: # %bb.0:
; RV32-V-NEXT: addi sp, sp, -16
; RV32-V-NEXT: .cfi_def_cfa_offset 16
-; RV32-V-NEXT: lui a0, 131072
+; RV32-V-NEXT: lui a0, 524288
; RV32-V-NEXT: sw a0, 12(sp)
-; RV32-V-NEXT: li a0, 1
+; RV32-V-NEXT: li a0, 3
; RV32-V-NEXT: sw a0, 8(sp)
; RV32-V-NEXT: addi a0, sp, 8
; RV32-V-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; RV32-V-NEXT: vlse64.v v9, (a0), zero
; RV32-V-NEXT: vmulhu.vv v9, v8, v9
-; RV32-V-NEXT: li a0, 61
+; RV32-V-NEXT: li a0, 63
; RV32-V-NEXT: vsrl.vx v9, v9, a0
; RV32-V-NEXT: li a0, -7
; RV32-V-NEXT: vnmsac.vx v8, a0, v9
@@ -727,12 +727,12 @@ define <vscale x 1 x i64> @vremu_vi_nxv1i64_0(<vscale x 1 x i64> %va) {
;
; RV64-V-LABEL: vremu_vi_nxv1i64_0:
; RV64-V: # %bb.0:
-; RV64-V-NEXT: li a0, 1
-; RV64-V-NEXT: slli a0, a0, 61
-; RV64-V-NEXT: addi a0, a0, 1
+; RV64-V-NEXT: li a0, -1
+; RV64-V-NEXT: slli a0, a0, 63
+; RV64-V-NEXT: addi a0, a0, 3
; RV64-V-NEXT: vsetvli a1, zero, e64, m1, ta, ma
; RV64-V-NEXT: vmulhu.vx v9, v8, a0
-; RV64-V-NEXT: li a0, 61
+; RV64-V-NEXT: li a0, 63
; RV64-V-NEXT: vsrl.vx v9, v9, a0
; RV64-V-NEXT: li a0, -7
; RV64-V-NEXT: vnmsac.vx v8, a0, v9
@@ -808,15 +808,15 @@ define <vscale x 2 x i64> @vremu_vi_nxv2i64_0(<vscale x 2 x i64> %va) {
; RV32-V: # %bb.0:
; RV32-V-NEXT: addi sp, sp, -16
; RV32-V-NEXT: .cfi_def_cfa_offset 16
-; RV32-V-NEXT: lui a0, 131072
+; RV32-V-NEXT: lui a0, 524288
; RV32-V-NEXT: sw a0, 12(sp)
-; RV32-V-NEXT: li a0, 1
+; RV32-V-NEXT: li a0, 3
; RV32-V-NEXT: sw a0, 8(sp)
; RV32-V-NEXT: addi a0, sp, 8
; RV32-V-NEXT: vsetvli a1, zero, e64, m2, ta, ma
; RV32-V-NEXT: vlse64.v v10, (a0), zero
; RV32-V-NEXT: vmulhu.vv v10, v8, v10
-; RV32-V-NEXT: li a0, 61
+; RV32-V-NEXT: li a0, 63
; RV32-V-NEXT: vsrl.vx v10, v10, a0
; RV32-V-NEXT: li a0, -7
; RV32-V-NEXT: vnmsac.vx v8, a0, v10
@@ -832,12 +832,12 @@ define <vscale x 2 x i64> @vremu_vi_nxv2i64_0(<vscale x 2 x i64> %va) {
;
; RV64-V-LABEL: vremu_vi_nxv2i64_0:
; RV64-V: # %bb.0:
-; RV64-V-NEXT: li a0, 1
-; RV64-V-NEXT: slli a0, a0, 61
-; RV64-V-NEXT: addi a0, a0, 1
+; RV64-V-NEXT: li a0, -1
+; RV64-V-NEXT: slli a0, a0, 63
+; RV64-V-NEXT: addi a0, a0, 3
; RV64-V-NEXT: vsetvli a1, zero, e64, m2, ta, ma
; RV64-V-NEXT: vmulhu.vx v10, v8, a0
-; RV64-V-NEXT: li a0, 61
+; RV64-V-NEXT: li a0, 63
; RV64-V-NEXT: vsrl.vx v10, v10, a0
; RV64-V-NEXT: li a0, -7
; RV64-V-NEXT: vnmsac.vx v8, a0, v10
@@ -913,15 +913,15 @@ define <vscale x 4 x i64> @vremu_vi_nxv4i64_0(<vscale x 4 x i64> %va) {
; RV32-V: # %bb.0:
; RV32-V-NEXT: addi sp, sp, -16
; RV32-V-NEXT: .cfi_def_cfa_offset 16
-; RV32-V-NEXT: lui a0, 131072
+; RV32-V-NEXT: lui a0, 524288
; RV32-V-NEXT: sw a0, 12(sp)
-; RV32-V-NEXT: li a0, 1
+; RV32-V-NEXT: li a0, 3
; RV32-V-NEXT: sw a0, 8(sp)
; RV32-V-NEXT: addi a0, sp, 8
; RV32-V-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; RV32-V-NEXT: vlse64.v v12, (a0), zero
; RV32-V-NEXT: vmulhu.vv v12, v8, v12
-; RV32-V-NEXT: li a0, 61
+; RV32-V-NEXT: li a0, 63
; RV32-V-NEXT: vsrl.vx v12, v12, a0
; RV32-V-NEXT: li a0, -7
; RV32-V-NEXT: vnmsac.vx v8, a0, v12
@@ -937,12 +937,12 @@ define <vscale x 4 x i64> @vremu_vi_nxv4i64_0(<vscale x 4 x i64> %va) {
;
; RV64-V-LABEL: vremu_vi_nxv4i64_0:
; RV64-V: # %bb.0:
-; RV64-V-NEXT: li a0, 1
-; RV64-V-NEXT: slli a0, a0, 61
-; RV64-V-NEXT: addi a0, a0, 1
+; RV64-V-NEXT: li a0, -1
+; RV64-V-NEXT: slli a0, a0, 63
+; RV64-V-NEXT: addi a0, a0, 3
; RV64-V-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; RV64-V-NEXT: vmulhu.vx v12, v8, a0
-; RV64-V-NEXT: li a0, 61
+; RV64-V-NEXT: li a0, 63
; RV64-V-NEXT: vsrl.vx v12, v12, a0
; RV64-V-NEXT: li a0, -7
; RV64-V-NEXT: vnmsac.vx v8, a0, v12
@@ -1018,15 +1018,15 @@ define <vscale x 8 x i64> @vremu_vi_nxv8i64_0(<vscale x 8 x i64> %va) {
; RV32-V: # %bb.0:
; RV32-V-NEXT: addi sp, sp, -16
; RV32-V-NEXT: .cfi_def_cfa_offset 16
-; RV32-V-NEXT: lui a0, 131072
+; RV32-V-NEXT: lui a0, 524288
; RV32-V-NEXT: sw a0, 12(sp)
-; RV32-V-NEXT: li a0, 1
+; RV32-V-NEXT: li a0, 3
; RV32-V-NEXT: sw a0, 8(sp)
; RV32-V-NEXT: addi a0, sp, 8
; RV32-V-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; RV32-V-NEXT: vlse64.v v16, (a0), zero
; RV32-V-NEXT: vmulhu.vv v16, v8, v16
-; RV32-V-NEXT: li a0, 61
+; RV32-V-NEXT: li a0, 63
; RV32-V-NEXT: vsrl.vx v16, v16, a0
; RV32-V-NEXT: li a0, -7
; RV32-V-NEXT: vnmsac.vx v8, a0, v16
@@ -1042,12 +1042,12 @@ define <vscale x 8 x i64> @vremu_vi_nxv8i64_0(<vscale x 8 x i64> %va) {
;
; RV64-V-LABEL: vremu_vi_nxv8i64_0:
; RV64-V: # %bb.0:
-; RV64-V-NEXT: li a0, 1
-; RV64-V-NEXT: slli a0, a0, 61
-; RV64-V-NEXT: addi a0, a0, 1
+; RV64-V-NEXT: li a0, -1
+; RV64-V-NEXT: slli a0, a0, 63
+; RV64-V-NEXT: addi a0, a0, 3
; RV64-V-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; RV64-V-NEXT: vmulhu.vx v16, v8, a0
-; RV64-V-NEXT: li a0, 61
+; RV64-V-NEXT: li a0, 63
; RV64-V-NEXT: vsrl.vx v16, v16, a0
; RV64-V-NEXT: li a0, -7
; RV64-V-NEXT: vnmsac.vx v8, a0, v16
diff --git a/llvm/test/CodeGen/RISCV/select.ll b/llvm/test/CodeGen/RISCV/select.ll
index ffbbe31412ed2..c2557a91714d0 100644
--- a/llvm/test/CodeGen/RISCV/select.ll
+++ b/llvm/test/CodeGen/RISCV/select.ll
@@ -1096,11 +1096,10 @@ define i32 @select_udiv_3(i1 zeroext %cond, i32 %a) {
; RV32IM: # %bb.0: # %entry
; RV32IM-NEXT: bnez a0, .LBB27_2
; RV32IM-NEXT: # %bb.1: # %entry
-; RV32IM-NEXT: srli a1, a1, 1
-; RV32IM-NEXT: lui a0, 199729
-; RV32IM-NEXT: addi a0, a0, -975
+; RV32IM-NEXT: lui a0, 399458
+; RV32IM-NEXT: addi a0, a0, -1951
; RV32IM-NEXT: mulhu a1, a1, a0
-; RV32IM-NEXT: srli a1, a1, 2
+; RV32IM-NEXT: srli a1, a1, 4
; RV32IM-NEXT: .LBB27_2: # %entry
; RV32IM-NEXT: mv a0, a1
; RV32IM-NEXT: ret
@@ -1109,22 +1108,24 @@ define i32 @select_udiv_3(i1 zeroext %cond, i32 %a) {
; RV64IM: # %bb.0: # %entry
; RV64IM-NEXT: bnez a0, .LBB27_2
; RV64IM-NEXT: # %bb.1: # %entry
-; RV64IM-NEXT: srliw a0, a1, 1
-; RV64IM-NEXT: lui a1, 199729
-; RV64IM-NEXT: addiw a1, a1, -975
-; RV64IM-NEXT: mul a1, a0, a1
-; RV64IM-NEXT: srli a1, a1, 34
+; RV64IM-NEXT: slli a1, a1, 32
+; RV64IM-NEXT: lui a0, 399458
+; RV64IM-NEXT: addi a0, a0, -1951
+; RV64IM-NEXT: slli a0, a0, 32
+; RV64IM-NEXT: mulhu a1, a1, a0
+; RV64IM-NEXT: srli a1, a1, 36
; RV64IM-NEXT: .LBB27_2: # %entry
; RV64IM-NEXT: mv a0, a1
; RV64IM-NEXT: ret
;
; RV64IMXVTCONDOPS-LABEL: select_udiv_3:
; RV64IMXVTCONDOPS: # %bb.0: # %entry
-; RV64IMXVTCONDOPS-NEXT: srliw a2, a1, 1
-; RV64IMXVTCONDOPS-NEXT: lui a3, 199729
-; RV64IMXVTCONDOPS-NEXT: addiw a3, a3, -975
-; RV64IMXVTCONDOPS-NEXT: mul a2, a2, a3
-; RV64IMXVTCONDOPS-NEXT: srli a2, a2, 34
+; RV64IMXVTCONDOPS-NEXT: slli a2, a1, 32
+; RV64IMXVTCONDOPS-NEXT: lui a3, 399458
+; RV64IMXVTCONDOPS-NEXT: addi a3, a3, -1951
+; RV64IMXVTCONDOPS-NEXT: slli a3, a3, 32
+; RV64IMXVTCONDOPS-NEXT: mulhu a2, a2, a3
+; RV64IMXVTCONDOPS-NEXT: srli a2, a2, 36
; RV64IMXVTCONDOPS-NEXT: vt.maskc a1, a1, a0
; RV64IMXVTCONDOPS-NEXT: vt.maskcn a0, a2, a0
; RV64IMXVTCONDOPS-NEXT: or a0, a1, a0
@@ -1132,11 +1133,10 @@ define i32 @select_udiv_3(i1 zeroext %cond, i32 %a) {
;
; RV32IMZICOND-LABEL: select_udiv_3:
; RV32IMZICOND: # %bb.0: # %entry
-; RV32IMZICOND-NEXT: srli a2, a1, 1
-; RV32IMZICOND-NEXT: lui a3, 199729
-; RV32IMZICOND-NEXT: addi a3, a3, -975
-; RV32IMZICOND-NEXT: mulhu a2, a2, a3
-; RV32IMZICOND-NEXT: srli a2, a2, 2
+; RV32IMZICOND-NEXT: lui a2, 399458
+; RV32IMZICOND-NEXT: addi a2, a2, -1951
+; RV32IMZICOND-NEXT: mulhu a2, a1, a2
+; RV32IMZICOND-NEXT: srli a2, a2, 4
; RV32IMZICOND-NEXT: czero.eqz a1, a1, a0
; RV32IMZICOND-NEXT: czero.nez a0, a2, a0
; RV32IMZICOND-NEXT: or a0, a1, a0
@@ -1144,11 +1144,12 @@ define i32 @select_udiv_3(i1 zeroext %cond, i32 %a) {
;
; RV64IMZICOND-LABEL: select_udiv_3:
; RV64IMZICOND: # %bb.0: # %entry
-; RV64IMZICOND-NEXT: srliw a2, a1, 1
-; RV64IMZICOND-NEXT: lui a3, 199729
-; RV64IMZICOND-NEXT: addiw a3, a3, -975
-; RV64IMZICOND-NEXT: mul a2, a2, a3
-; RV64IMZICOND-NEXT: srli a2, a2, 34
+; RV64IMZICOND-NEXT: slli a2, a1, 32
+; RV64IMZICOND-NEXT: lui a3, 399458
+; RV64IMZICOND-NEXT: addi a3, a3, -1951
+; RV64IMZICOND-NEXT: slli a3, a3, 32
+; RV64IMZICOND-NEXT: mulhu a2, a2, a3
+; RV64IMZICOND-NEXT: srli a2, a2, 36
; RV64IMZICOND-NEXT: czero.eqz a1, a1, a0
; RV64IMZICOND-NEXT: czero.nez a0, a2, a0
; RV64IMZICOND-NEXT: or a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
index 5fa802b7f27ca..547ba26a198ca 100644
--- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
@@ -10,23 +10,24 @@ define iXLen2 @test_udiv_3(iXLen2 %x) nounwind {
; RV32-NEXT: add a2, a0, a1
; RV32-NEXT: sltu a3, a2, a0
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 699051
-; RV32-NEXT: addi a4, a3, -1365
-; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: srli a6, a5, 1
-; RV32-NEXT: andi a5, a5, -2
-; RV32-NEXT: add a5, a5, a6
-; RV32-NEXT: sub a2, a2, a5
-; RV32-NEXT: sub a5, a0, a2
-; RV32-NEXT: addi a3, a3, -1366
-; RV32-NEXT: mul a3, a5, a3
-; RV32-NEXT: mulhu a6, a5, a4
-; RV32-NEXT: add a3, a6, a3
+; RV32-NEXT: lui a3, 349525
+; RV32-NEXT: addi a3, a3, 1365
+; RV32-NEXT: mulhu a3, a2, a3
+; RV32-NEXT: slli a4, a3, 1
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: sub a2, a2, a3
+; RV32-NEXT: sub a3, a0, a2
+; RV32-NEXT: lui a4, 699051
+; RV32-NEXT: addi a5, a4, -1366
+; RV32-NEXT: mul a5, a3, a5
+; RV32-NEXT: addi a4, a4, -1365
+; RV32-NEXT: mulhu a6, a3, a4
+; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a1, a1, a0
; RV32-NEXT: mul a1, a1, a4
-; RV32-NEXT: add a1, a3, a1
-; RV32-NEXT: mul a0, a5, a4
+; RV32-NEXT: add a1, a5, a1
+; RV32-NEXT: mul a0, a3, a4
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_3:
@@ -34,26 +35,29 @@ define iXLen2 @test_udiv_3(iXLen2 %x) nounwind {
; RV64-NEXT: add a2, a0, a1
; RV64-NEXT: sltu a3, a2, a0
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: lui a3, 699051
-; RV64-NEXT: addiw a3, a3, -1365
+; RV64-NEXT: lui a3, 349525
+; RV64-NEXT: addiw a3, a3, 1365
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: mulhu a4, a2, a3
-; RV64-NEXT: srli a5, a4, 1
-; RV64-NEXT: andi a4, a4, -2
-; RV64-NEXT: lui a6, %hi(.LCPI0_0)
-; RV64-NEXT: ld a6, %lo(.LCPI0_0)(a6)
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: sub a2, a2, a4
-; RV64-NEXT: sub a4, a0, a2
-; RV64-NEXT: mul a5, a4, a6
-; RV64-NEXT: mulhu a6, a4, a3
-; RV64-NEXT: add a5, a6, a5
+; RV64-NEXT: mulhu a3, a2, a3
+; RV64-NEXT: slli a4, a3, 1
+; RV64-NEXT: lui a5, %hi(.LCPI0_0)
+; RV64-NEXT: ld a5, %lo(.LCPI0_0)(a5)
+; RV64-NEXT: add a3, a4, a3
+; RV64-NEXT: sub a2, a2, a3
+; RV64-NEXT: sub a3, a0, a2
+; RV64-NEXT: mul a4, a3, a5
+; RV64-NEXT: lui a5, 699051
+; RV64-NEXT: addiw a5, a5, -1365
+; RV64-NEXT: slli a6, a5, 32
+; RV64-NEXT: add a5, a5, a6
+; RV64-NEXT: mulhu a6, a3, a5
+; RV64-NEXT: add a4, a6, a4
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a1, a1, a3
-; RV64-NEXT: add a1, a5, a1
-; RV64-NEXT: mul a0, a4, a3
+; RV64-NEXT: mul a1, a1, a5
+; RV64-NEXT: add a1, a4, a1
+; RV64-NEXT: mul a0, a3, a5
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 3
ret iXLen2 %a
@@ -65,23 +69,24 @@ define iXLen2 @test_udiv_5(iXLen2 %x) nounwind {
; RV32-NEXT: add a2, a0, a1
; RV32-NEXT: sltu a3, a2, a0
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 838861
-; RV32-NEXT: addi a4, a3, -819
-; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: srli a6, a5, 2
-; RV32-NEXT: andi a5, a5, -4
-; RV32-NEXT: add a5, a5, a6
-; RV32-NEXT: sub a2, a2, a5
-; RV32-NEXT: sub a5, a0, a2
-; RV32-NEXT: addi a3, a3, -820
-; RV32-NEXT: mul a3, a5, a3
-; RV32-NEXT: mulhu a6, a5, a4
-; RV32-NEXT: add a3, a6, a3
+; RV32-NEXT: lui a3, 209715
+; RV32-NEXT: addi a3, a3, 819
+; RV32-NEXT: mulhu a3, a2, a3
+; RV32-NEXT: slli a4, a3, 2
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: sub a2, a2, a3
+; RV32-NEXT: sub a3, a0, a2
+; RV32-NEXT: lui a4, 838861
+; RV32-NEXT: addi a5, a4, -820
+; RV32-NEXT: mul a5, a3, a5
+; RV32-NEXT: addi a4, a4, -819
+; RV32-NEXT: mulhu a6, a3, a4
+; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a1, a1, a0
; RV32-NEXT: mul a1, a1, a4
-; RV32-NEXT: add a1, a3, a1
-; RV32-NEXT: mul a0, a5, a4
+; RV32-NEXT: add a1, a5, a1
+; RV32-NEXT: mul a0, a3, a4
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_5:
@@ -89,26 +94,29 @@ define iXLen2 @test_udiv_5(iXLen2 %x) nounwind {
; RV64-NEXT: add a2, a0, a1
; RV64-NEXT: sltu a3, a2, a0
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: lui a3, 838861
-; RV64-NEXT: addiw a3, a3, -819
+; RV64-NEXT: lui a3, 209715
+; RV64-NEXT: addiw a3, a3, 819
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: mulhu a4, a2, a3
-; RV64-NEXT: srli a5, a4, 2
-; RV64-NEXT: andi a4, a4, -4
-; RV64-NEXT: lui a6, %hi(.LCPI1_0)
-; RV64-NEXT: ld a6, %lo(.LCPI1_0)(a6)
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: sub a2, a2, a4
-; RV64-NEXT: sub a4, a0, a2
-; RV64-NEXT: mul a5, a4, a6
-; RV64-NEXT: mulhu a6, a4, a3
-; RV64-NEXT: add a5, a6, a5
+; RV64-NEXT: mulhu a3, a2, a3
+; RV64-NEXT: slli a4, a3, 2
+; RV64-NEXT: lui a5, %hi(.LCPI1_0)
+; RV64-NEXT: ld a5, %lo(.LCPI1_0)(a5)
+; RV64-NEXT: add a3, a4, a3
+; RV64-NEXT: sub a2, a2, a3
+; RV64-NEXT: sub a3, a0, a2
+; RV64-NEXT: mul a4, a3, a5
+; RV64-NEXT: lui a5, 838861
+; RV64-NEXT: addiw a5, a5, -819
+; RV64-NEXT: slli a6, a5, 32
+; RV64-NEXT: add a5, a5, a6
+; RV64-NEXT: mulhu a6, a3, a5
+; RV64-NEXT: add a4, a6, a4
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a1, a1, a3
-; RV64-NEXT: add a1, a5, a1
-; RV64-NEXT: mul a0, a4, a3
+; RV64-NEXT: mul a1, a1, a5
+; RV64-NEXT: add a1, a4, a1
+; RV64-NEXT: mul a0, a3, a5
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 5
ret iXLen2 %a
@@ -172,10 +180,9 @@ define iXLen2 @test_udiv_15(iXLen2 %x) nounwind {
; RV32-NEXT: add a2, a0, a1
; RV32-NEXT: sltu a3, a2, a0
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 559241
-; RV32-NEXT: addi a3, a3, -1911
+; RV32-NEXT: lui a3, 69905
+; RV32-NEXT: addi a3, a3, 273
; RV32-NEXT: mulhu a3, a2, a3
-; RV32-NEXT: srli a3, a3, 3
; RV32-NEXT: slli a4, a3, 4
; RV32-NEXT: sub a3, a3, a4
; RV32-NEXT: add a2, a2, a3
@@ -198,12 +205,11 @@ define iXLen2 @test_udiv_15(iXLen2 %x) nounwind {
; RV64-NEXT: add a2, a0, a1
; RV64-NEXT: sltu a3, a2, a0
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: lui a3, 559241
-; RV64-NEXT: addiw a3, a3, -1911
+; RV64-NEXT: lui a3, 69905
+; RV64-NEXT: addiw a3, a3, 273
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a3, a3, a4
; RV64-NEXT: mulhu a3, a2, a3
-; RV64-NEXT: srli a3, a3, 3
; RV64-NEXT: slli a4, a3, 4
; RV64-NEXT: lui a5, %hi(.LCPI4_0)
; RV64-NEXT: ld a5, %lo(.LCPI4_0)(a5)
@@ -233,23 +239,24 @@ define iXLen2 @test_udiv_17(iXLen2 %x) nounwind {
; RV32-NEXT: add a2, a0, a1
; RV32-NEXT: sltu a3, a2, a0
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 986895
-; RV32-NEXT: addi a4, a3, 241
-; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: srli a6, a5, 4
-; RV32-NEXT: andi a5, a5, -16
-; RV32-NEXT: add a5, a5, a6
-; RV32-NEXT: sub a2, a2, a5
-; RV32-NEXT: sub a5, a0, a2
-; RV32-NEXT: addi a3, a3, 240
-; RV32-NEXT: mul a3, a5, a3
-; RV32-NEXT: mulhu a6, a5, a4
-; RV32-NEXT: add a3, a6, a3
+; RV32-NEXT: lui a3, 61681
+; RV32-NEXT: addi a3, a3, -241
+; RV32-NEXT: mulhu a3, a2, a3
+; RV32-NEXT: slli a4, a3, 4
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: sub a2, a2, a3
+; RV32-NEXT: sub a3, a0, a2
+; RV32-NEXT: lui a4, 986895
+; RV32-NEXT: addi a5, a4, 240
+; RV32-NEXT: mul a5, a3, a5
+; RV32-NEXT: addi a4, a4, 241
+; RV32-NEXT: mulhu a6, a3, a4
+; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a1, a1, a0
; RV32-NEXT: mul a1, a1, a4
-; RV32-NEXT: add a1, a3, a1
-; RV32-NEXT: mul a0, a5, a4
+; RV32-NEXT: add a1, a5, a1
+; RV32-NEXT: mul a0, a3, a4
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_17:
@@ -257,26 +264,29 @@ define iXLen2 @test_udiv_17(iXLen2 %x) nounwind {
; RV64-NEXT: add a2, a0, a1
; RV64-NEXT: sltu a3, a2, a0
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: lui a3, 986895
-; RV64-NEXT: addiw a3, a3, 241
+; RV64-NEXT: lui a3, 61681
+; RV64-NEXT: addiw a3, a3, -241
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: mulhu a4, a2, a3
-; RV64-NEXT: srli a5, a4, 4
-; RV64-NEXT: andi a4, a4, -16
-; RV64-NEXT: lui a6, %hi(.LCPI5_0)
-; RV64-NEXT: ld a6, %lo(.LCPI5_0)(a6)
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: sub a2, a2, a4
-; RV64-NEXT: sub a4, a0, a2
-; RV64-NEXT: mul a5, a4, a6
-; RV64-NEXT: mulhu a6, a4, a3
-; RV64-NEXT: add a5, a6, a5
+; RV64-NEXT: mulhu a3, a2, a3
+; RV64-NEXT: slli a4, a3, 4
+; RV64-NEXT: lui a5, %hi(.LCPI5_0)
+; RV64-NEXT: ld a5, %lo(.LCPI5_0)(a5)
+; RV64-NEXT: add a3, a4, a3
+; RV64-NEXT: sub a2, a2, a3
+; RV64-NEXT: sub a3, a0, a2
+; RV64-NEXT: mul a4, a3, a5
+; RV64-NEXT: lui a5, 986895
+; RV64-NEXT: addiw a5, a5, 241
+; RV64-NEXT: slli a6, a5, 32
+; RV64-NEXT: add a5, a5, a6
+; RV64-NEXT: mulhu a6, a3, a5
+; RV64-NEXT: add a4, a6, a4
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a1, a1, a3
-; RV64-NEXT: add a1, a5, a1
-; RV64-NEXT: mul a0, a4, a3
+; RV64-NEXT: mul a1, a1, a5
+; RV64-NEXT: add a1, a4, a1
+; RV64-NEXT: mul a0, a3, a5
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 17
ret iXLen2 %a
@@ -288,10 +298,9 @@ define iXLen2 @test_udiv_255(iXLen2 %x) nounwind {
; RV32-NEXT: add a2, a0, a1
; RV32-NEXT: sltu a3, a2, a0
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 526344
-; RV32-NEXT: addi a3, a3, 129
+; RV32-NEXT: lui a3, 4112
+; RV32-NEXT: addi a3, a3, 257
; RV32-NEXT: mulhu a3, a2, a3
-; RV32-NEXT: srli a3, a3, 7
; RV32-NEXT: slli a4, a3, 8
; RV32-NEXT: sub a3, a3, a4
; RV32-NEXT: add a2, a2, a3
@@ -314,12 +323,11 @@ define iXLen2 @test_udiv_255(iXLen2 %x) nounwind {
; RV64-NEXT: add a2, a0, a1
; RV64-NEXT: sltu a3, a2, a0
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: lui a3, 526344
-; RV64-NEXT: addiw a3, a3, 129
+; RV64-NEXT: lui a3, 4112
+; RV64-NEXT: addiw a3, a3, 257
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a3, a3, a4
; RV64-NEXT: mulhu a3, a2, a3
-; RV64-NEXT: srli a3, a3, 7
; RV64-NEXT: slli a4, a3, 8
; RV64-NEXT: lui a5, %hi(.LCPI6_0)
; RV64-NEXT: ld a5, %lo(.LCPI6_0)(a5)
@@ -349,23 +357,24 @@ define iXLen2 @test_udiv_257(iXLen2 %x) nounwind {
; RV32-NEXT: add a2, a0, a1
; RV32-NEXT: sltu a3, a2, a0
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 1044496
-; RV32-NEXT: addi a4, a3, -255
-; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: srli a6, a5, 8
-; RV32-NEXT: andi a5, a5, -256
-; RV32-NEXT: add a5, a5, a6
-; RV32-NEXT: sub a2, a2, a5
-; RV32-NEXT: sub a5, a0, a2
-; RV32-NEXT: addi a3, a3, -256
-; RV32-NEXT: mul a3, a5, a3
-; RV32-NEXT: mulhu a6, a5, a4
-; RV32-NEXT: add a3, a6, a3
+; RV32-NEXT: lui a3, 4080
+; RV32-NEXT: addi a3, a3, 255
+; RV32-NEXT: mulhu a3, a2, a3
+; RV32-NEXT: slli a4, a3, 8
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: sub a2, a2, a3
+; RV32-NEXT: sub a3, a0, a2
+; RV32-NEXT: lui a4, 1044496
+; RV32-NEXT: addi a5, a4, -256
+; RV32-NEXT: mul a5, a3, a5
+; RV32-NEXT: addi a4, a4, -255
+; RV32-NEXT: mulhu a6, a3, a4
+; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a1, a1, a0
; RV32-NEXT: mul a1, a1, a4
-; RV32-NEXT: add a1, a3, a1
-; RV32-NEXT: mul a0, a5, a4
+; RV32-NEXT: add a1, a5, a1
+; RV32-NEXT: mul a0, a3, a4
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_257:
@@ -373,26 +382,29 @@ define iXLen2 @test_udiv_257(iXLen2 %x) nounwind {
; RV64-NEXT: add a2, a0, a1
; RV64-NEXT: sltu a3, a2, a0
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: lui a3, 1044496
-; RV64-NEXT: addiw a3, a3, -255
+; RV64-NEXT: lui a3, 4080
+; RV64-NEXT: addiw a3, a3, 255
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: mulhu a4, a2, a3
-; RV64-NEXT: srli a5, a4, 8
-; RV64-NEXT: andi a4, a4, -256
-; RV64-NEXT: lui a6, %hi(.LCPI7_0)
-; RV64-NEXT: ld a6, %lo(.LCPI7_0)(a6)
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: sub a2, a2, a4
-; RV64-NEXT: sub a4, a0, a2
-; RV64-NEXT: mul a5, a4, a6
-; RV64-NEXT: mulhu a6, a4, a3
-; RV64-NEXT: add a5, a6, a5
+; RV64-NEXT: mulhu a3, a2, a3
+; RV64-NEXT: slli a4, a3, 8
+; RV64-NEXT: lui a5, %hi(.LCPI7_0)
+; RV64-NEXT: ld a5, %lo(.LCPI7_0)(a5)
+; RV64-NEXT: add a3, a4, a3
+; RV64-NEXT: sub a2, a2, a3
+; RV64-NEXT: sub a3, a0, a2
+; RV64-NEXT: mul a4, a3, a5
+; RV64-NEXT: lui a5, 1044496
+; RV64-NEXT: addiw a5, a5, -255
+; RV64-NEXT: slli a6, a5, 32
+; RV64-NEXT: add a5, a5, a6
+; RV64-NEXT: mulhu a6, a3, a5
+; RV64-NEXT: add a4, a6, a4
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a1, a1, a3
-; RV64-NEXT: add a1, a5, a1
-; RV64-NEXT: mul a0, a4, a3
+; RV64-NEXT: mul a1, a1, a5
+; RV64-NEXT: add a1, a4, a1
+; RV64-NEXT: mul a0, a3, a5
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 257
ret iXLen2 %a
@@ -404,10 +416,9 @@ define iXLen2 @test_udiv_65535(iXLen2 %x) nounwind {
; RV32-NEXT: add a2, a0, a1
; RV32-NEXT: sltu a3, a2, a0
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 524296
+; RV32-NEXT: lui a3, 16
; RV32-NEXT: addi a3, a3, 1
; RV32-NEXT: mulhu a3, a2, a3
-; RV32-NEXT: srli a3, a3, 15
; RV32-NEXT: slli a4, a3, 16
; RV32-NEXT: sub a3, a3, a4
; RV32-NEXT: add a2, a2, a3
@@ -433,12 +444,11 @@ define iXLen2 @test_udiv_65535(iXLen2 %x) nounwind {
; RV64-NEXT: add a2, a0, a1
; RV64-NEXT: sltu a3, a2, a0
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: lui a3, 524296
+; RV64-NEXT: lui a3, 16
; RV64-NEXT: addiw a3, a3, 1
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a3, a3, a4
; RV64-NEXT: mulhu a3, a2, a3
-; RV64-NEXT: srli a3, a3, 15
; RV64-NEXT: slli a4, a3, 16
; RV64-NEXT: sub a3, a3, a4
; RV64-NEXT: add a2, a2, a3
@@ -471,14 +481,15 @@ define iXLen2 @test_udiv_65537(iXLen2 %x) nounwind {
; RV32-NEXT: add a2, a0, a1
; RV32-NEXT: sltu a3, a2, a0
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 1048560
-; RV32-NEXT: addi a4, a3, 1
-; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: and a3, a5, a3
-; RV32-NEXT: srli a5, a5, 16
-; RV32-NEXT: or a3, a3, a5
+; RV32-NEXT: lui a3, 16
+; RV32-NEXT: addi a3, a3, -1
+; RV32-NEXT: mulhu a3, a2, a3
+; RV32-NEXT: slli a4, a3, 16
+; RV32-NEXT: or a3, a4, a3
; RV32-NEXT: sub a2, a2, a3
; RV32-NEXT: sub a3, a0, a2
+; RV32-NEXT: lui a4, 1048560
+; RV32-NEXT: addi a4, a4, 1
; RV32-NEXT: mulhu a4, a3, a4
; RV32-NEXT: slli a5, a3, 16
; RV32-NEXT: sub a4, a4, a5
@@ -495,28 +506,30 @@ define iXLen2 @test_udiv_65537(iXLen2 %x) nounwind {
; RV64-NEXT: add a2, a0, a1
; RV64-NEXT: sltu a3, a2, a0
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: lui a3, 1048560
-; RV64-NEXT: addiw a4, a3, 1
-; RV64-NEXT: slli a5, a4, 32
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: mulhu a5, a2, a4
-; RV64-NEXT: and a3, a5, a3
-; RV64-NEXT: srli a5, a5, 16
-; RV64-NEXT: add a3, a3, a5
-; RV64-NEXT: sub a2, a2, a3
-; RV64-NEXT: sub a3, a0, a2
-; RV64-NEXT: lui a5, 983041
-; RV64-NEXT: slli a5, a5, 4
-; RV64-NEXT: addi a5, a5, -1
-; RV64-NEXT: slli a5, a5, 16
-; RV64-NEXT: mul a5, a3, a5
-; RV64-NEXT: mulhu a6, a3, a4
-; RV64-NEXT: add a5, a6, a5
+; RV64-NEXT: lui a3, 983041
+; RV64-NEXT: slli a4, a3, 20
+; RV64-NEXT: addi a4, a4, -1
+; RV64-NEXT: srli a4, a4, 16
+; RV64-NEXT: mulhu a4, a2, a4
+; RV64-NEXT: slli a5, a4, 16
+; RV64-NEXT: add a4, a5, a4
+; RV64-NEXT: sub a2, a2, a4
+; RV64-NEXT: sub a4, a0, a2
+; RV64-NEXT: slli a3, a3, 4
+; RV64-NEXT: addi a3, a3, -1
+; RV64-NEXT: slli a3, a3, 16
+; RV64-NEXT: mul a3, a4, a3
+; RV64-NEXT: lui a5, 1048560
+; RV64-NEXT: addiw a5, a5, 1
+; RV64-NEXT: slli a6, a5, 32
+; RV64-NEXT: add a5, a5, a6
+; RV64-NEXT: mulhu a6, a4, a5
+; RV64-NEXT: add a3, a6, a3
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a1, a1, a4
-; RV64-NEXT: add a1, a5, a1
-; RV64-NEXT: mul a0, a3, a4
+; RV64-NEXT: mul a1, a1, a5
+; RV64-NEXT: add a1, a3, a1
+; RV64-NEXT: mul a0, a4, a5
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 65537
ret iXLen2 %a
@@ -532,23 +545,24 @@ define iXLen2 @test_udiv_12(iXLen2 %x) nounwind {
; RV32-NEXT: add a2, a0, a1
; RV32-NEXT: sltu a3, a2, a0
; RV32-NEXT: add a2, a2, a3
-; RV32-NEXT: lui a3, 699051
-; RV32-NEXT: addi a4, a3, -1365
-; RV32-NEXT: mulhu a5, a2, a4
-; RV32-NEXT: srli a6, a5, 1
-; RV32-NEXT: andi a5, a5, -2
-; RV32-NEXT: add a5, a5, a6
-; RV32-NEXT: sub a2, a2, a5
-; RV32-NEXT: sub a5, a0, a2
-; RV32-NEXT: addi a3, a3, -1366
-; RV32-NEXT: mul a3, a5, a3
-; RV32-NEXT: mulhu a6, a5, a4
-; RV32-NEXT: add a3, a6, a3
+; RV32-NEXT: lui a3, 349525
+; RV32-NEXT: addi a3, a3, 1365
+; RV32-NEXT: mulhu a3, a2, a3
+; RV32-NEXT: slli a4, a3, 1
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: sub a2, a2, a3
+; RV32-NEXT: sub a3, a0, a2
+; RV32-NEXT: lui a4, 699051
+; RV32-NEXT: addi a5, a4, -1366
+; RV32-NEXT: mul a5, a3, a5
+; RV32-NEXT: addi a4, a4, -1365
+; RV32-NEXT: mulhu a6, a3, a4
+; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: sub a1, a1, a0
; RV32-NEXT: mul a1, a1, a4
-; RV32-NEXT: add a1, a3, a1
-; RV32-NEXT: mul a0, a5, a4
+; RV32-NEXT: add a1, a5, a1
+; RV32-NEXT: mul a0, a3, a4
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_12:
@@ -560,26 +574,29 @@ define iXLen2 @test_udiv_12(iXLen2 %x) nounwind {
; RV64-NEXT: add a2, a0, a1
; RV64-NEXT: sltu a3, a2, a0
; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: lui a3, 699051
-; RV64-NEXT: addiw a3, a3, -1365
+; RV64-NEXT: lui a3, 349525
+; RV64-NEXT: addiw a3, a3, 1365
; RV64-NEXT: slli a4, a3, 32
; RV64-NEXT: add a3, a3, a4
-; RV64-NEXT: mulhu a4, a2, a3
-; RV64-NEXT: srli a5, a4, 1
-; RV64-NEXT: andi a4, a4, -2
-; RV64-NEXT: lui a6, %hi(.LCPI10_0)
-; RV64-NEXT: ld a6, %lo(.LCPI10_0)(a6)
-; RV64-NEXT: add a4, a4, a5
-; RV64-NEXT: sub a2, a2, a4
-; RV64-NEXT: sub a4, a0, a2
-; RV64-NEXT: mul a5, a4, a6
-; RV64-NEXT: mulhu a6, a4, a3
-; RV64-NEXT: add a5, a6, a5
+; RV64-NEXT: mulhu a3, a2, a3
+; RV64-NEXT: slli a4, a3, 1
+; RV64-NEXT: lui a5, %hi(.LCPI10_0)
+; RV64-NEXT: ld a5, %lo(.LCPI10_0)(a5)
+; RV64-NEXT: add a3, a4, a3
+; RV64-NEXT: sub a2, a2, a3
+; RV64-NEXT: sub a3, a0, a2
+; RV64-NEXT: mul a4, a3, a5
+; RV64-NEXT: lui a5, 699051
+; RV64-NEXT: addiw a5, a5, -1365
+; RV64-NEXT: slli a6, a5, 32
+; RV64-NEXT: add a5, a5, a6
+; RV64-NEXT: mulhu a6, a3, a5
+; RV64-NEXT: add a4, a6, a4
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: sub a1, a1, a0
-; RV64-NEXT: mul a1, a1, a3
-; RV64-NEXT: add a1, a5, a1
-; RV64-NEXT: mul a0, a4, a3
+; RV64-NEXT: mul a1, a1, a5
+; RV64-NEXT: add a1, a4, a1
+; RV64-NEXT: mul a0, a3, a5
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 12
ret iXLen2 %a
diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
index 8444520fcc771..2c70ae3215c9d 100644
--- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
@@ -10,12 +10,11 @@ define iXLen2 @test_urem_3(iXLen2 %x) nounwind {
; RV32-NEXT: add a1, a0, a1
; RV32-NEXT: sltu a0, a1, a0
; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: lui a1, 699051
-; RV32-NEXT: addi a1, a1, -1365
+; RV32-NEXT: lui a1, 349525
+; RV32-NEXT: addi a1, a1, 1365
; RV32-NEXT: mulhu a1, a0, a1
-; RV32-NEXT: srli a2, a1, 1
-; RV32-NEXT: andi a1, a1, -2
-; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: slli a2, a1, 1
+; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
@@ -25,14 +24,13 @@ define iXLen2 @test_urem_3(iXLen2 %x) nounwind {
; RV64-NEXT: add a1, a0, a1
; RV64-NEXT: sltu a0, a1, a0
; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: lui a1, 699051
-; RV64-NEXT: addiw a1, a1, -1365
+; RV64-NEXT: lui a1, 349525
+; RV64-NEXT: addiw a1, a1, 1365
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a1, a0, a1
-; RV64-NEXT: srli a2, a1, 1
-; RV64-NEXT: andi a1, a1, -2
-; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: slli a2, a1, 1
+; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
@@ -46,12 +44,11 @@ define iXLen2 @test_urem_5(iXLen2 %x) nounwind {
; RV32-NEXT: add a1, a0, a1
; RV32-NEXT: sltu a0, a1, a0
; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: lui a1, 838861
-; RV32-NEXT: addi a1, a1, -819
+; RV32-NEXT: lui a1, 209715
+; RV32-NEXT: addi a1, a1, 819
; RV32-NEXT: mulhu a1, a0, a1
-; RV32-NEXT: srli a2, a1, 2
-; RV32-NEXT: andi a1, a1, -4
-; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: slli a2, a1, 2
+; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
@@ -61,14 +58,13 @@ define iXLen2 @test_urem_5(iXLen2 %x) nounwind {
; RV64-NEXT: add a1, a0, a1
; RV64-NEXT: sltu a0, a1, a0
; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: lui a1, 838861
-; RV64-NEXT: addiw a1, a1, -819
+; RV64-NEXT: lui a1, 209715
+; RV64-NEXT: addiw a1, a1, 819
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a1, a0, a1
-; RV64-NEXT: srli a2, a1, 2
-; RV64-NEXT: andi a1, a1, -4
-; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: slli a2, a1, 2
+; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
@@ -134,10 +130,9 @@ define iXLen2 @test_urem_15(iXLen2 %x) nounwind {
; RV32-NEXT: add a1, a0, a1
; RV32-NEXT: sltu a0, a1, a0
; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: lui a1, 559241
-; RV32-NEXT: addi a1, a1, -1911
+; RV32-NEXT: lui a1, 69905
+; RV32-NEXT: addi a1, a1, 273
; RV32-NEXT: mulhu a1, a0, a1
-; RV32-NEXT: srli a1, a1, 3
; RV32-NEXT: slli a2, a1, 4
; RV32-NEXT: sub a1, a1, a2
; RV32-NEXT: add a0, a0, a1
@@ -149,12 +144,11 @@ define iXLen2 @test_urem_15(iXLen2 %x) nounwind {
; RV64-NEXT: add a1, a0, a1
; RV64-NEXT: sltu a0, a1, a0
; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: lui a1, 559241
-; RV64-NEXT: addiw a1, a1, -1911
+; RV64-NEXT: lui a1, 69905
+; RV64-NEXT: addiw a1, a1, 273
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a1, a0, a1
-; RV64-NEXT: srli a1, a1, 3
; RV64-NEXT: slli a2, a1, 4
; RV64-NEXT: sub a1, a1, a2
; RV64-NEXT: add a0, a0, a1
@@ -170,12 +164,11 @@ define iXLen2 @test_urem_17(iXLen2 %x) nounwind {
; RV32-NEXT: add a1, a0, a1
; RV32-NEXT: sltu a0, a1, a0
; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: lui a1, 986895
-; RV32-NEXT: addi a1, a1, 241
+; RV32-NEXT: lui a1, 61681
+; RV32-NEXT: addi a1, a1, -241
; RV32-NEXT: mulhu a1, a0, a1
-; RV32-NEXT: srli a2, a1, 4
-; RV32-NEXT: andi a1, a1, -16
-; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: slli a2, a1, 4
+; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
@@ -185,14 +178,13 @@ define iXLen2 @test_urem_17(iXLen2 %x) nounwind {
; RV64-NEXT: add a1, a0, a1
; RV64-NEXT: sltu a0, a1, a0
; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: lui a1, 986895
-; RV64-NEXT: addiw a1, a1, 241
+; RV64-NEXT: lui a1, 61681
+; RV64-NEXT: addiw a1, a1, -241
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a1, a0, a1
-; RV64-NEXT: srli a2, a1, 4
-; RV64-NEXT: andi a1, a1, -16
-; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: slli a2, a1, 4
+; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
@@ -206,10 +198,9 @@ define iXLen2 @test_urem_255(iXLen2 %x) nounwind {
; RV32-NEXT: add a1, a0, a1
; RV32-NEXT: sltu a0, a1, a0
; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: lui a1, 526344
-; RV32-NEXT: addi a1, a1, 129
+; RV32-NEXT: lui a1, 4112
+; RV32-NEXT: addi a1, a1, 257
; RV32-NEXT: mulhu a1, a0, a1
-; RV32-NEXT: srli a1, a1, 7
; RV32-NEXT: slli a2, a1, 8
; RV32-NEXT: sub a1, a1, a2
; RV32-NEXT: add a0, a0, a1
@@ -221,12 +212,11 @@ define iXLen2 @test_urem_255(iXLen2 %x) nounwind {
; RV64-NEXT: add a1, a0, a1
; RV64-NEXT: sltu a0, a1, a0
; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: lui a1, 526344
-; RV64-NEXT: addiw a1, a1, 129
+; RV64-NEXT: lui a1, 4112
+; RV64-NEXT: addiw a1, a1, 257
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a1, a0, a1
-; RV64-NEXT: srli a1, a1, 7
; RV64-NEXT: slli a2, a1, 8
; RV64-NEXT: sub a1, a1, a2
; RV64-NEXT: add a0, a0, a1
@@ -242,12 +232,11 @@ define iXLen2 @test_urem_257(iXLen2 %x) nounwind {
; RV32-NEXT: add a1, a0, a1
; RV32-NEXT: sltu a0, a1, a0
; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: lui a1, 1044496
-; RV32-NEXT: addi a1, a1, -255
+; RV32-NEXT: lui a1, 4080
+; RV32-NEXT: addi a1, a1, 255
; RV32-NEXT: mulhu a1, a0, a1
-; RV32-NEXT: srli a2, a1, 8
-; RV32-NEXT: andi a1, a1, -256
-; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: slli a2, a1, 8
+; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
@@ -257,14 +246,13 @@ define iXLen2 @test_urem_257(iXLen2 %x) nounwind {
; RV64-NEXT: add a1, a0, a1
; RV64-NEXT: sltu a0, a1, a0
; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: lui a1, 1044496
-; RV64-NEXT: addiw a1, a1, -255
+; RV64-NEXT: lui a1, 4080
+; RV64-NEXT: addiw a1, a1, 255
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a1, a0, a1
-; RV64-NEXT: srli a2, a1, 8
-; RV64-NEXT: andi a1, a1, -256
-; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: slli a2, a1, 8
+; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
@@ -278,10 +266,9 @@ define iXLen2 @test_urem_65535(iXLen2 %x) nounwind {
; RV32-NEXT: add a1, a0, a1
; RV32-NEXT: sltu a0, a1, a0
; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: lui a1, 524296
+; RV32-NEXT: lui a1, 16
; RV32-NEXT: addi a1, a1, 1
; RV32-NEXT: mulhu a1, a0, a1
-; RV32-NEXT: srli a1, a1, 15
; RV32-NEXT: slli a2, a1, 16
; RV32-NEXT: sub a1, a1, a2
; RV32-NEXT: add a0, a0, a1
@@ -293,12 +280,11 @@ define iXLen2 @test_urem_65535(iXLen2 %x) nounwind {
; RV64-NEXT: add a1, a0, a1
; RV64-NEXT: sltu a0, a1, a0
; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: lui a1, 524296
+; RV64-NEXT: lui a1, 16
; RV64-NEXT: addiw a1, a1, 1
; RV64-NEXT: slli a2, a1, 32
; RV64-NEXT: add a1, a1, a2
; RV64-NEXT: mulhu a1, a0, a1
-; RV64-NEXT: srli a1, a1, 15
; RV64-NEXT: slli a2, a1, 16
; RV64-NEXT: sub a1, a1, a2
; RV64-NEXT: add a0, a0, a1
@@ -314,12 +300,11 @@ define iXLen2 @test_urem_65537(iXLen2 %x) nounwind {
; RV32-NEXT: add a1, a0, a1
; RV32-NEXT: sltu a0, a1, a0
; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: lui a1, 1048560
-; RV32-NEXT: addi a2, a1, 1
-; RV32-NEXT: mulhu a2, a0, a2
-; RV32-NEXT: and a1, a2, a1
-; RV32-NEXT: srli a2, a2, 16
-; RV32-NEXT: or a1, a1, a2
+; RV32-NEXT: lui a1, 16
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: mulhu a1, a0, a1
+; RV32-NEXT: slli a2, a1, 16
+; RV32-NEXT: or a1, a2, a1
; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
@@ -329,14 +314,13 @@ define iXLen2 @test_urem_65537(iXLen2 %x) nounwind {
; RV64-NEXT: add a1, a0, a1
; RV64-NEXT: sltu a0, a1, a0
; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: lui a1, 1048560
-; RV64-NEXT: addiw a2, a1, 1
-; RV64-NEXT: slli a3, a2, 32
-; RV64-NEXT: add a2, a2, a3
-; RV64-NEXT: mulhu a2, a0, a2
-; RV64-NEXT: and a1, a2, a1
-; RV64-NEXT: srli a2, a2, 16
-; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: lui a1, 983041
+; RV64-NEXT: slli a1, a1, 20
+; RV64-NEXT: addi a1, a1, -1
+; RV64-NEXT: srli a1, a1, 16
+; RV64-NEXT: mulhu a1, a0, a1
+; RV64-NEXT: slli a2, a1, 16
+; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
@@ -354,12 +338,11 @@ define iXLen2 @test_urem_12(iXLen2 %x) nounwind {
; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: sltu a2, a1, a2
; RV32-NEXT: add a1, a1, a2
-; RV32-NEXT: lui a2, 699051
-; RV32-NEXT: addi a2, a2, -1365
+; RV32-NEXT: lui a2, 349525
+; RV32-NEXT: addi a2, a2, 1365
; RV32-NEXT: mulhu a2, a1, a2
-; RV32-NEXT: srli a3, a2, 1
-; RV32-NEXT: andi a2, a2, -2
-; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: slli a3, a2, 1
+; RV32-NEXT: add a2, a3, a2
; RV32-NEXT: sub a1, a1, a2
; RV32-NEXT: slli a1, a1, 2
; RV32-NEXT: andi a0, a0, 3
@@ -376,14 +359,13 @@ define iXLen2 @test_urem_12(iXLen2 %x) nounwind {
; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: sltu a2, a1, a2
; RV64-NEXT: add a1, a1, a2
-; RV64-NEXT: lui a2, 699051
-; RV64-NEXT: addiw a2, a2, -1365
+; RV64-NEXT: lui a2, 349525
+; RV64-NEXT: addiw a2, a2, 1365
; RV64-NEXT: slli a3, a2, 32
; RV64-NEXT: add a2, a2, a3
; RV64-NEXT: mulhu a2, a1, a2
-; RV64-NEXT: srli a3, a2, 1
-; RV64-NEXT: andi a2, a2, -2
-; RV64-NEXT: add a2, a2, a3
+; RV64-NEXT: slli a3, a2, 1
+; RV64-NEXT: add a2, a3, a2
; RV64-NEXT: sub a1, a1, a2
; RV64-NEXT: slli a1, a1, 2
; RV64-NEXT: andi a0, a0, 3
diff --git a/llvm/test/CodeGen/RISCV/urem-lkk.ll b/llvm/test/CodeGen/RISCV/urem-lkk.ll
index f83a933c0b5c8..f78e44f869624 100644
--- a/llvm/test/CodeGen/RISCV/urem-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-lkk.ll
@@ -16,12 +16,9 @@ define i32 @fold_urem_positive_odd(i32 %x) nounwind {
;
; RV32IM-LABEL: fold_urem_positive_odd:
; RV32IM: # %bb.0:
-; RV32IM-NEXT: lui a1, 364242
-; RV32IM-NEXT: addi a1, a1, 777
+; RV32IM-NEXT: lui a1, 706409
+; RV32IM-NEXT: addi a1, a1, 387
; RV32IM-NEXT: mulhu a1, a0, a1
-; RV32IM-NEXT: sub a2, a0, a1
-; RV32IM-NEXT: srli a2, a2, 1
-; RV32IM-NEXT: add a1, a2, a1
; RV32IM-NEXT: srli a1, a1, 6
; RV32IM-NEXT: li a2, 95
; RV32IM-NEXT: mul a1, a1, a2
@@ -43,15 +40,11 @@ define i32 @fold_urem_positive_odd(i32 %x) nounwind {
; RV64IM-LABEL: fold_urem_positive_odd:
; RV64IM: # %bb.0:
; RV64IM-NEXT: slli a1, a0, 32
-; RV64IM-NEXT: lui a2, 364242
-; RV64IM-NEXT: addi a2, a2, 777
+; RV64IM-NEXT: lui a2, 706409
+; RV64IM-NEXT: addi a2, a2, 387
; RV64IM-NEXT: slli a2, a2, 32
; RV64IM-NEXT: mulhu a1, a1, a2
-; RV64IM-NEXT: srli a1, a1, 32
-; RV64IM-NEXT: subw a2, a0, a1
-; RV64IM-NEXT: srliw a2, a2, 1
-; RV64IM-NEXT: add a1, a2, a1
-; RV64IM-NEXT: srli a1, a1, 6
+; RV64IM-NEXT: srli a1, a1, 38
; RV64IM-NEXT: li a2, 95
; RV64IM-NEXT: mul a1, a1, a2
; RV64IM-NEXT: subw a0, a0, a1
@@ -70,7 +63,7 @@ define i32 @fold_urem_positive_even(i32 %x) nounwind {
; RV32IM-LABEL: fold_urem_positive_even:
; RV32IM: # %bb.0:
; RV32IM-NEXT: lui a1, 1012964
-; RV32IM-NEXT: addi a1, a1, -61
+; RV32IM-NEXT: addi a1, a1, -63
; RV32IM-NEXT: mulhu a1, a0, a1
; RV32IM-NEXT: srli a1, a1, 10
; RV32IM-NEXT: li a2, 1060
@@ -94,7 +87,7 @@ define i32 @fold_urem_positive_even(i32 %x) nounwind {
; RV64IM: # %bb.0:
; RV64IM-NEXT: slli a1, a0, 32
; RV64IM-NEXT: lui a2, 1012964
-; RV64IM-NEXT: addi a2, a2, -61
+; RV64IM-NEXT: addi a2, a2, -63
; RV64IM-NEXT: slli a2, a2, 32
; RV64IM-NEXT: mulhu a1, a1, a2
; RV64IM-NEXT: srli a1, a1, 42
@@ -131,12 +124,9 @@ define i32 @combine_urem_udiv(i32 %x) nounwind {
;
; RV32IM-LABEL: combine_urem_udiv:
; RV32IM: # %bb.0:
-; RV32IM-NEXT: lui a1, 364242
-; RV32IM-NEXT: addi a1, a1, 777
+; RV32IM-NEXT: lui a1, 706409
+; RV32IM-NEXT: addi a1, a1, 387
; RV32IM-NEXT: mulhu a1, a0, a1
-; RV32IM-NEXT: sub a2, a0, a1
-; RV32IM-NEXT: srli a2, a2, 1
-; RV32IM-NEXT: add a1, a2, a1
; RV32IM-NEXT: srli a1, a1, 6
; RV32IM-NEXT: li a2, 95
; RV32IM-NEXT: mul a2, a1, a2
@@ -169,15 +159,11 @@ define i32 @combine_urem_udiv(i32 %x) nounwind {
; RV64IM-LABEL: combine_urem_udiv:
; RV64IM: # %bb.0:
; RV64IM-NEXT: slli a1, a0, 32
-; RV64IM-NEXT: lui a2, 364242
-; RV64IM-NEXT: addi a2, a2, 777
+; RV64IM-NEXT: lui a2, 706409
+; RV64IM-NEXT: addi a2, a2, 387
; RV64IM-NEXT: slli a2, a2, 32
; RV64IM-NEXT: mulhu a1, a1, a2
-; RV64IM-NEXT: srli a1, a1, 32
-; RV64IM-NEXT: subw a2, a0, a1
-; RV64IM-NEXT: srliw a2, a2, 1
-; RV64IM-NEXT: add a1, a2, a1
-; RV64IM-NEXT: srli a1, a1, 6
+; RV64IM-NEXT: srli a1, a1, 38
; RV64IM-NEXT: li a2, 95
; RV64IM-NEXT: mul a2, a1, a2
; RV64IM-NEXT: add a0, a0, a1
@@ -251,9 +237,10 @@ define i64 @dont_fold_urem_i64(i64 %x) nounwind {
; RV64IM: # %bb.0:
; RV64IM-NEXT: lui a1, %hi(.LCPI6_0)
; RV64IM-NEXT: ld a1, %lo(.LCPI6_0)(a1)
-; RV64IM-NEXT: srli a2, a0, 1
-; RV64IM-NEXT: mulhu a1, a2, a1
-; RV64IM-NEXT: srli a1, a1, 4
+; RV64IM-NEXT: mulhu a1, a0, a1
+; RV64IM-NEXT: sub a2, a0, a1
+; RV64IM-NEXT: srli a2, a2, 1
+; RV64IM-NEXT: add a1, a2, a1
; RV64IM-NEXT: li a2, 98
; RV64IM-NEXT: mul a1, a1, a2
; RV64IM-NEXT: sub a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index c057c656e0fb7..fedb8c0bed02f 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -956,39 +956,35 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
;
; RV64IM-LABEL: dont_fold_urem_i64:
; RV64IM: # %bb.0:
-; RV64IM-NEXT: ld a2, 16(a1)
+; RV64IM-NEXT: ld a2, 8(a1)
; RV64IM-NEXT: lui a3, %hi(.LCPI6_0)
; RV64IM-NEXT: ld a3, %lo(.LCPI6_0)(a3)
; RV64IM-NEXT: ld a4, 24(a1)
-; RV64IM-NEXT: ld a1, 8(a1)
+; RV64IM-NEXT: ld a1, 16(a1)
; RV64IM-NEXT: mulhu a3, a2, a3
-; RV64IM-NEXT: sub a5, a2, a3
-; RV64IM-NEXT: srli a5, a5, 1
-; RV64IM-NEXT: add a3, a5, a3
-; RV64IM-NEXT: srli a3, a3, 4
-; RV64IM-NEXT: li a5, 23
-; RV64IM-NEXT: lui a6, %hi(.LCPI6_1)
-; RV64IM-NEXT: ld a6, %lo(.LCPI6_1)(a6)
-; RV64IM-NEXT: mul a3, a3, a5
+; RV64IM-NEXT: srli a3, a3, 6
+; RV64IM-NEXT: lui a5, %hi(.LCPI6_1)
+; RV64IM-NEXT: ld a5, %lo(.LCPI6_1)(a5)
+; RV64IM-NEXT: li a6, 654
+; RV64IM-NEXT: mul a3, a3, a6
; RV64IM-NEXT: sub a2, a2, a3
-; RV64IM-NEXT: srli a3, a1, 1
-; RV64IM-NEXT: mulhu a3, a3, a6
-; RV64IM-NEXT: srli a3, a3, 7
+; RV64IM-NEXT: mulhu a3, a1, a5
+; RV64IM-NEXT: srli a3, a3, 4
; RV64IM-NEXT: lui a5, %hi(.LCPI6_2)
; RV64IM-NEXT: ld a5, %lo(.LCPI6_2)(a5)
-; RV64IM-NEXT: li a6, 654
+; RV64IM-NEXT: li a6, 23
; RV64IM-NEXT: mul a3, a3, a6
; RV64IM-NEXT: sub a1, a1, a3
; RV64IM-NEXT: mulhu a3, a4, a5
-; RV64IM-NEXT: srli a3, a3, 12
+; RV64IM-NEXT: srli a3, a3, 10
; RV64IM-NEXT: lui a5, 1
; RV64IM-NEXT: addiw a5, a5, 1327
; RV64IM-NEXT: mul a3, a3, a5
; RV64IM-NEXT: sub a4, a4, a3
; RV64IM-NEXT: sd zero, 0(a0)
; RV64IM-NEXT: sd a4, 24(a0)
-; RV64IM-NEXT: sd a1, 8(a0)
-; RV64IM-NEXT: sd a2, 16(a0)
+; RV64IM-NEXT: sd a1, 16(a0)
+; RV64IM-NEXT: sd a2, 8(a0)
; RV64IM-NEXT: ret
%1 = urem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
ret <4 x i64> %1
diff --git a/llvm/test/CodeGen/SystemZ/int-div-06.ll b/llvm/test/CodeGen/SystemZ/int-div-06.ll
index 9de717857d7d9..f3c8e15873489 100644
--- a/llvm/test/CodeGen/SystemZ/int-div-06.ll
+++ b/llvm/test/CodeGen/SystemZ/int-div-06.ll
@@ -1,16 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; Test that divisions by constants are implemented as multiplications.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu -asm-verbose=0 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
; Check signed 32-bit division.
define i32 @f1(i32 %a) {
; CHECK-LABEL: f1:
-; CHECK: lgfr [[REG:%r[0-5]]], %r2
-; CHECK: msgfi [[REG]], 502748801
-; CHECK-DAG: srlg [[RES1:%r[0-5]]], [[REG]], 63
-; CHECK-DAG: srag %r2, [[REG]], 46
-; CHECK: ar %r2, [[RES1]]
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lgfr %r0, %r2
+; CHECK-NEXT: msgfi %r0, 502748801
+; CHECK-NEXT: srlg %r1, %r0, 63
+; CHECK-NEXT: srag %r2, %r0, 46
+; CHECK-NEXT: ar %r2, %r1
+; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
+; CHECK-NEXT: br %r14
%b = sdiv i32 %a, 139968
ret i32 %b
}
@@ -18,10 +21,13 @@ define i32 @f1(i32 %a) {
; Check unsigned 32-bit division.
define i32 @f2(i32 %a) {
; CHECK-LABEL: f2:
-; CHECK: llgfr [[REG:%r[0-5]]], %r2
-; CHECK: msgfi [[REG]], 502748801
-; CHECK: srlg %r2, [[REG]], 46
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: llgfr %r0, %r2
+; CHECK-NEXT: llilf %r1, 4021990407
+; CHECK-NEXT: msgr %r1, %r0
+; CHECK-NEXT: srlg %r2, %r1, 49
+; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
+; CHECK-NEXT: br %r14
%b = udiv i32 %a, 139968
ret i32 %b
}
@@ -29,16 +35,18 @@ define i32 @f2(i32 %a) {
; Check signed 64-bit division.
define i64 @f3(i64 %dummy, i64 %a) {
; CHECK-LABEL: f3:
-; CHECK-DAG: llihf [[CONST:%r[0-5]]], 1005497601
-; CHECK-DAG: oilf [[CONST]], 4251762321
-; CHECK-DAG: srag [[REG:%r[0-5]]], %r3, 63
-; CHECK-DAG: ngr [[REG]], [[CONST]]
-; CHECK-DAG: mlgr %r2, [[CONST]]
-; CHECK: sgr %r2, [[REG]]
-; CHECK: srlg [[RES1:%r[0-5]]], %r2, 63
-; CHECK: srag %r2, %r2, 15
-; CHECK: agr %r2, [[RES1]]
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $r3d killed $r3d def $r2q
+; CHECK-NEXT: srag %r0, %r3, 63
+; CHECK-NEXT: llihf %r1, 1005497601
+; CHECK-NEXT: oilf %r1, 4251762321
+; CHECK-NEXT: ngr %r0, %r1
+; CHECK-NEXT: mlgr %r2, %r1
+; CHECK-NEXT: sgr %r2, %r0
+; CHECK-NEXT: srlg %r0, %r2, 63
+; CHECK-NEXT: srag %r2, %r2, 15
+; CHECK-NEXT: agr %r2, %r0
+; CHECK-NEXT: br %r14
%b = sdiv i64 %a, 139968
ret i64 %b
}
@@ -46,11 +54,13 @@ define i64 @f3(i64 %dummy, i64 %a) {
; Check unsigned 64-bit division.
define i64 @f4(i64 %dummy, i64 %a) {
; CHECK-LABEL: f4:
-; CHECK: llihf [[CONST:%r[0-5]]], 1005497601
-; CHECK: oilf [[CONST]], 4251762321
-; CHECK: mlgr %r2, [[CONST]]
-; CHECK: srlg %r2, %r2, 15
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $r3d killed $r3d def $r2q
+; CHECK-NEXT: llihf %r0, 2010995203
+; CHECK-NEXT: oilf %r0, 4208557345
+; CHECK-NEXT: mlgr %r2, %r0
+; CHECK-NEXT: srlg %r2, %r2, 16
+; CHECK-NEXT: br %r14
%b = udiv i64 %a, 139968
ret i64 %b
}
diff --git a/llvm/test/CodeGen/SystemZ/int-mul-13.ll b/llvm/test/CodeGen/SystemZ/int-mul-13.ll
index 82937cf66c629..db3aa1531d90a 100644
--- a/llvm/test/CodeGen/SystemZ/int-mul-13.ll
+++ b/llvm/test/CodeGen/SystemZ/int-mul-13.ll
@@ -81,11 +81,11 @@ define i64 @f4(i64 %dummy, i64 %a, i64 %b) {
define i64 @f5(i64 %dummy, i64 %a) {
; CHECK-LABEL: f5:
; CHECK: # %bb.0:
-; CHECK-NEXT: llihf %r0, 1782028570
-; CHECK-NEXT: oilf %r0, 598650223
+; CHECK-NEXT: llihf %r0, 891014285
+; CHECK-NEXT: oilf %r0, 299325111
; CHECK-NEXT: # kill: def $r3d killed $r3d def $r2q
; CHECK-NEXT: mlgr %r2, %r0
-; CHECK-NEXT: srlg %r2, %r2, 9
+; CHECK-NEXT: srlg %r2, %r2, 8
; CHECK-NEXT: br %r14
%res = udiv i64 %a, 1234
ret i64 %res
diff --git a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
index 7087041e8dace..e005f57d2a0e7 100644
--- a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
@@ -362,31 +362,29 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: sbcs r0, r1, #0
; CHECK-NEXT: blt.w .LBB1_28
; CHECK-NEXT: @ %bb.1: @ %for.cond2.preheader.lr.ph
-; CHECK-NEXT: movs r0, #1
+; CHECK-NEXT: mov r7, r2
; CHECK-NEXT: cmp r2, #1
-; CHECK-NEXT: csel r7, r2, r0, lt
-; CHECK-NEXT: mov r12, r1
-; CHECK-NEXT: mov r1, r7
+; CHECK-NEXT: it ge
+; CHECK-NEXT: movge r7, #1
+; CHECK-NEXT: mov r0, r7
; CHECK-NEXT: cmp r7, #3
; CHECK-NEXT: it ls
-; CHECK-NEXT: movls r1, #3
-; CHECK-NEXT: mov r4, r2
-; CHECK-NEXT: subs r1, r1, r7
-; CHECK-NEXT: movw r2, #43691
-; CHECK-NEXT: adds r1, #2
-; CHECK-NEXT: movt r2, #43690
+; CHECK-NEXT: movls r0, #3
+; CHECK-NEXT: subs r0, r0, r7
+; CHECK-NEXT: mov r12, r1
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: mov.w r1, #1431655765
; CHECK-NEXT: ldr r6, [sp, #128]
; CHECK-NEXT: movw r8, :lower16:c
-; CHECK-NEXT: umull r1, r2, r1, r2
+; CHECK-NEXT: umull r0, r1, r0, r1
; CHECK-NEXT: movt r8, :upper16:c
-; CHECK-NEXT: movs r1, #4
+; CHECK-NEXT: mov.w r9, #12
+; CHECK-NEXT: @ implicit-def: $r11
; CHECK-NEXT: @ implicit-def: $r10
; CHECK-NEXT: @ implicit-def: $r5
-; CHECK-NEXT: @ implicit-def: $r11
-; CHECK-NEXT: mov.w r9, #12
-; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT: add.w r1, r1, r2, lsr #1
-; CHECK-NEXT: add.w r0, r0, r2, lsr #1
+; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: adds r0, r1, #1
+; CHECK-NEXT: adds r1, #4
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: adr r1, .LCPI1_0
; CHECK-NEXT: vldrw.u32 q0, [r1]
@@ -399,35 +397,31 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: b .LBB1_6
; CHECK-NEXT: .LBB1_2: @ %for.body6.preheader
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: mov r0, r11
-; CHECK-NEXT: cmn.w r11, #4
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: cmn.w r5, #4
; CHECK-NEXT: it le
; CHECK-NEXT: mvnle r0, #3
; CHECK-NEXT: movw r2, #18725
; CHECK-NEXT: adds r0, #6
; CHECK-NEXT: movt r2, #9362
-; CHECK-NEXT: sub.w r1, r0, r11
-; CHECK-NEXT: mov r10, r3
+; CHECK-NEXT: subs r1, r0, r5
+; CHECK-NEXT: mov r5, r3
; CHECK-NEXT: umull r2, r3, r1, r2
-; CHECK-NEXT: subs r2, r1, r3
-; CHECK-NEXT: add.w r2, r3, r2, lsr #1
-; CHECK-NEXT: lsrs r3, r2, #2
-; CHECK-NEXT: lsls r3, r3, #3
-; CHECK-NEXT: sub.w r2, r3, r2, lsr #2
+; CHECK-NEXT: rsb r2, r3, r3, lsl #3
+; CHECK-NEXT: mov r3, r5
; CHECK-NEXT: subs r1, r2, r1
-; CHECK-NEXT: mov r3, r10
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup5.loopexit134.split.loop.exit139
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: add.w r11, r0, #7
+; CHECK-NEXT: adds r5, r0, #7
; CHECK-NEXT: .LBB1_4: @ %for.cond.cleanup5
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: mov.w r10, #0
+; CHECK-NEXT: mov.w r11, #0
; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup5
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: adds r5, #2
-; CHECK-NEXT: subs.w r1, r5, lr
-; CHECK-NEXT: asr.w r0, r5, #31
+; CHECK-NEXT: add.w r10, r10, #2
+; CHECK-NEXT: subs.w r1, r10, lr
+; CHECK-NEXT: asr.w r0, r10, #31
; CHECK-NEXT: sbcs.w r0, r0, r12
; CHECK-NEXT: bge.w .LBB1_28
; CHECK-NEXT: .LBB1_6: @ %for.cond2.preheader
@@ -436,7 +430,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: @ Child Loop BB1_10 Depth 2
; CHECK-NEXT: @ Child Loop BB1_12 Depth 3
; CHECK-NEXT: @ Child Loop BB1_14 Depth 3
-; CHECK-NEXT: cmp.w r11, #2
+; CHECK-NEXT: cmp r5, #2
; CHECK-NEXT: bgt .LBB1_5
; CHECK-NEXT: @ %bb.7: @ %for.body6.lr.ph
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
@@ -458,14 +452,14 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: vdup.32 q0, r2
; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: mov r0, r11
+; CHECK-NEXT: mov r0, r5
; CHECK-NEXT: b .LBB1_10
; CHECK-NEXT: .LBB1_9: @ %for.cond.cleanup17.us
; CHECK-NEXT: @ in Loop: Header=BB1_10 Depth=2
-; CHECK-NEXT: add.w r11, r0, #7
+; CHECK-NEXT: adds r5, r0, #7
; CHECK-NEXT: cmn.w r0, #4
-; CHECK-NEXT: mov.w r10, #0
-; CHECK-NEXT: mov r0, r11
+; CHECK-NEXT: mov.w r11, #0
+; CHECK-NEXT: mov r0, r5
; CHECK-NEXT: bge .LBB1_5
; CHECK-NEXT: .LBB1_10: @ %for.body6.us
; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1
@@ -523,7 +517,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: beq .LBB1_9
; CHECK-NEXT: @ %bb.16: @ %for.cond9.for.cond15.preheader_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB1_10 Depth=2
-; CHECK-NEXT: eor r1, r10, #1
+; CHECK-NEXT: eor r1, r11, #1
; CHECK-NEXT: lsls r1, r1, #31
; CHECK-NEXT: bne .LBB1_9
; CHECK-NEXT: b .LBB1_26
@@ -532,11 +526,11 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: cmp r6, #0
; CHECK-NEXT: beq.w .LBB1_2
; CHECK-NEXT: @ %bb.18: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: mov r0, r11
+; CHECK-NEXT: mov r0, r5
; CHECK-NEXT: .LBB1_19: @ %for.body6.us60
; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: lsls.w r1, r10, #31
+; CHECK-NEXT: lsls.w r1, r11, #31
; CHECK-NEXT: bne .LBB1_27
; CHECK-NEXT: @ %bb.20: @ %for.cond.cleanup17.us63
; CHECK-NEXT: @ in Loop: Header=BB1_19 Depth=2
@@ -552,19 +546,19 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: bgt .LBB1_25
; CHECK-NEXT: @ %bb.23: @ %for.cond.cleanup17.us63.3
; CHECK-NEXT: @ in Loop: Header=BB1_19 Depth=2
-; CHECK-NEXT: add.w r11, r0, #28
+; CHECK-NEXT: add.w r5, r0, #28
; CHECK-NEXT: cmn.w r0, #25
-; CHECK-NEXT: mov.w r10, #0
-; CHECK-NEXT: mov r0, r11
+; CHECK-NEXT: mov.w r11, #0
+; CHECK-NEXT: mov r0, r5
; CHECK-NEXT: blt .LBB1_19
; CHECK-NEXT: b .LBB1_5
; CHECK-NEXT: .LBB1_24: @ %for.cond.cleanup5.loopexit134.split.loop.exit137
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: add.w r11, r0, #14
+; CHECK-NEXT: add.w r5, r0, #14
; CHECK-NEXT: b .LBB1_4
; CHECK-NEXT: .LBB1_25: @ %for.cond.cleanup5.loopexit134.split.loop.exit135
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: add.w r11, r0, #21
+; CHECK-NEXT: add.w r5, r0, #21
; CHECK-NEXT: b .LBB1_4
; CHECK-NEXT: .LBB1_26: @ %for.inc19.us
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/Thumb2/thumb2-select.ll b/llvm/test/CodeGen/Thumb2/thumb2-select.ll
index 105c2672ee1b9..656b6f45f061c 100644
--- a/llvm/test/CodeGen/Thumb2/thumb2-select.ll
+++ b/llvm/test/CodeGen/Thumb2/thumb2-select.ll
@@ -1,11 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 -show-mc-encoding %s -o - \
; RUN: | FileCheck %s
define i32 @f1(i32 %a.s) {
-entry:
; CHECK-LABEL: f1:
-; CHECK: it eq
-; CHECK: moveq
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r1, #3 @ encoding: [0x03,0x21]
+; CHECK-NEXT: cmp r0, #4 @ encoding: [0x04,0x28]
+; CHECK-NEXT: it eq @ encoding: [0x08,0xbf]
+; CHECK-NEXT: moveq r1, #2 @ encoding: [0x02,0x21]
+; CHECK-NEXT: mov r0, r1 @ encoding: [0x08,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
+entry:
%tmp = icmp eq i32 %a.s, 4
%tmp1.s = select i1 %tmp, i32 2, i32 3
@@ -13,30 +19,45 @@ entry:
}
define i32 @f2(i32 %a.s) {
-entry:
; CHECK-LABEL: f2:
-; CHECK: it gt
-; CHECK: movgt
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r1, #3 @ encoding: [0x03,0x21]
+; CHECK-NEXT: cmp r0, #4 @ encoding: [0x04,0x28]
+; CHECK-NEXT: it gt @ encoding: [0xc8,0xbf]
+; CHECK-NEXT: movgt r1, #2 @ encoding: [0x02,0x21]
+; CHECK-NEXT: mov r0, r1 @ encoding: [0x08,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
+entry:
%tmp = icmp sgt i32 %a.s, 4
%tmp1.s = select i1 %tmp, i32 2, i32 3
ret i32 %tmp1.s
}
define i32 @f3(i32 %a.s, i32 %b.s) {
-entry:
; CHECK-LABEL: f3:
-; CHECK: it lt
-; CHECK: movlt
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r2, #3 @ encoding: [0x03,0x22]
+; CHECK-NEXT: cmp r0, r1 @ encoding: [0x88,0x42]
+; CHECK-NEXT: it lt @ encoding: [0xb8,0xbf]
+; CHECK-NEXT: movlt r2, #2 @ encoding: [0x02,0x22]
+; CHECK-NEXT: mov r0, r2 @ encoding: [0x10,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
+entry:
%tmp = icmp slt i32 %a.s, %b.s
%tmp1.s = select i1 %tmp, i32 2, i32 3
ret i32 %tmp1.s
}
define i32 @f4(i32 %a.s, i32 %b.s) {
-entry:
; CHECK-LABEL: f4:
-; CHECK: it le
-; CHECK: movle
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r2, #3 @ encoding: [0x03,0x22]
+; CHECK-NEXT: cmp r0, r1 @ encoding: [0x88,0x42]
+; CHECK-NEXT: it le @ encoding: [0xd8,0xbf]
+; CHECK-NEXT: movle r2, #2 @ encoding: [0x02,0x22]
+; CHECK-NEXT: mov r0, r2 @ encoding: [0x10,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
+entry:
%tmp = icmp sle i32 %a.s, %b.s
%tmp1.s = select i1 %tmp, i32 2, i32 3
@@ -44,30 +65,46 @@ entry:
}
define i32 @f5(i32 %a.u, i32 %b.u) {
-entry:
; CHECK-LABEL: f5:
-; CHECK: it ls
-; CHECK: movls
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r2, #3 @ encoding: [0x03,0x22]
+; CHECK-NEXT: cmp r0, r1 @ encoding: [0x88,0x42]
+; CHECK-NEXT: it ls @ encoding: [0x98,0xbf]
+; CHECK-NEXT: movls r2, #2 @ encoding: [0x02,0x22]
+; CHECK-NEXT: mov r0, r2 @ encoding: [0x10,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
+entry:
%tmp = icmp ule i32 %a.u, %b.u
%tmp1.s = select i1 %tmp, i32 2, i32 3
ret i32 %tmp1.s
}
define i32 @f6(i32 %a.u, i32 %b.u) {
-entry:
; CHECK-LABEL: f6:
-; CHECK: it hi
-; CHECK: movhi
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r2, #3 @ encoding: [0x03,0x22]
+; CHECK-NEXT: cmp r0, r1 @ encoding: [0x88,0x42]
+; CHECK-NEXT: it hi @ encoding: [0x88,0xbf]
+; CHECK-NEXT: movhi r2, #2 @ encoding: [0x02,0x22]
+; CHECK-NEXT: mov r0, r2 @ encoding: [0x10,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
+entry:
%tmp = icmp ugt i32 %a.u, %b.u
%tmp1.s = select i1 %tmp, i32 2, i32 3
ret i32 %tmp1.s
}
define i32 @f7(i32 %a, i32 %b, i32 %c) {
-entry:
; CHECK-LABEL: f7:
-; CHECK: it hi
-; CHECK: lsrhi {{r[0-9]+}}
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: mov.w r3, #1431655765 @ encoding: [0x4f,0xf0,0x55,0x33]
+; CHECK-NEXT: umull r3, r2, r2, r3 @ encoding: [0xa2,0xfb,0x03,0x32]
+; CHECK-NEXT: cmp r0, r1 @ encoding: [0x88,0x42]
+; CHECK-NEXT: it ls @ encoding: [0x98,0xbf]
+; CHECK-NEXT: movls r2, #3 @ encoding: [0x03,0x22]
+; CHECK-NEXT: mov r0, r2 @ encoding: [0x10,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
+entry:
%tmp1 = icmp ugt i32 %a, %b
%tmp2 = udiv i32 %c, 3
%tmp3 = select i1 %tmp1, i32 %tmp2, i32 3
@@ -75,10 +112,15 @@ entry:
}
define i32 @f8(i32 %a, i32 %b, i32 %c) {
-entry:
; CHECK-LABEL: f8:
-; CHECK: it lo
-; CHECK: lsllo {{r[0-9]+}}
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r3, #3 @ encoding: [0x03,0x23]
+; CHECK-NEXT: cmp r0, r1 @ encoding: [0x88,0x42]
+; CHECK-NEXT: it lo @ encoding: [0x38,0xbf]
+; CHECK-NEXT: lsllo r3, r2, #2 @ encoding: [0x93,0x00]
+; CHECK-NEXT: mov r0, r3 @ encoding: [0x18,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
+entry:
%tmp1 = icmp ult i32 %a, %b
%tmp2 = mul i32 %c, 4
%tmp3 = select i1 %tmp1, i32 %tmp2, i32 3
@@ -86,10 +128,15 @@ entry:
}
define i32 @f9(i32 %a, i32 %b, i32 %c) {
-entry:
; CHECK-LABEL: f9:
-; CHECK: it ge
-; CHECK: rorge.w
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r3, #3 @ encoding: [0x03,0x23]
+; CHECK-NEXT: cmp r0, r1 @ encoding: [0x88,0x42]
+; CHECK-NEXT: it ge @ encoding: [0xa8,0xbf]
+; CHECK-NEXT: rorge.w r3, r2, #22 @ encoding: [0x4f,0xea,0xb2,0x53]
+; CHECK-NEXT: mov r0, r3 @ encoding: [0x18,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
+entry:
%tmp1 = icmp sge i32 %a, %b
%tmp2 = shl i32 %c, 10
%tmp3 = lshr i32 %c, 22
@@ -100,7 +147,13 @@ entry:
define i32 @f10(i32 %a, i32 %b) {
; CHECK-LABEL: f10:
-; CHECK: movwne {{r[0-9]+}}, #1234 @ encoding: [0x40,0xf2,0xd2,0x4{{[0-9a-f]+}}]
+; CHECK: @ %bb.0:
+; CHECK-NEXT: movw r2, #12345 @ encoding: [0x43,0xf2,0x39,0x02]
+; CHECK-NEXT: cmp r0, r1 @ encoding: [0x88,0x42]
+; CHECK-NEXT: it ne @ encoding: [0x18,0xbf]
+; CHECK-NEXT: movwne r2, #1234 @ encoding: [0x40,0xf2,0xd2,0x42]
+; CHECK-NEXT: mov r0, r2 @ encoding: [0x10,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
%tst = icmp ne i32 %a, %b
%val = select i1 %tst, i32 1234, i32 12345
ret i32 %val
@@ -109,7 +162,15 @@ define i32 @f10(i32 %a, i32 %b) {
; Make sure we pick the Thumb encoding for movw/movt
define i32 @f11(i32 %a, i32 %b) {
; CHECK-LABEL: f11:
-; CHECK: movwne {{r[0-9]+}}, #50033 @ encoding: [0x4c,0xf2,0x71,0x3{{[0-9a-f]+}}]
+; CHECK: @ %bb.0:
+; CHECK-NEXT: movw r2, #49977 @ encoding: [0x4c,0xf2,0x39,0x32]
+; CHECK-NEXT: movt r2, #8288 @ encoding: [0xc2,0xf2,0x60,0x02]
+; CHECK-NEXT: cmp r0, r1 @ encoding: [0x88,0x42]
+; CHECK-NEXT: itt ne @ encoding: [0x1c,0xbf]
+; CHECK-NEXT: movwne r2, #50033 @ encoding: [0x4c,0xf2,0x71,0x32]
+; CHECK-NEXT: movtne r2, #1883 @ encoding: [0xc0,0xf2,0x5b,0x72]
+; CHECK-NEXT: mov r0, r2 @ encoding: [0x10,0x46]
+; CHECK-NEXT: bx lr @ encoding: [0x70,0x47]
%tst = icmp ne i32 %a, %b
%val = select i1 %tst, i32 123454321, i32 543212345
ret i32 %val
diff --git a/llvm/test/CodeGen/VE/Scalar/div.ll b/llvm/test/CodeGen/VE/Scalar/div.ll
index 64caf8a835468..a44a669f50a37 100644
--- a/llvm/test/CodeGen/VE/Scalar/div.ll
+++ b/llvm/test/CodeGen/VE/Scalar/div.ll
@@ -1,14 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=ve-unknown-unknown | FileCheck %s
; Function Attrs: norecurse nounwind readnone
define i128 @divi128(i128, i128) {
; CHECK-LABEL: divi128:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB0_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: lea %s4, __divti3@lo
; CHECK-NEXT: and %s4, %s4, (32)0
; CHECK-NEXT: lea.sl %s12, __divti3@hi(, %s4)
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%3 = sdiv i128 %0, %1
ret i128 %3
}
@@ -37,12 +56,30 @@ define signext i32 @divi32(i32 signext %a, i32 signext %b) {
; Function Attrs: norecurse nounwind readnone
define i128 @divu128(i128, i128) {
; CHECK-LABEL: divu128:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB3_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB3_2:
; CHECK-NEXT: lea %s4, __udivti3@lo
; CHECK-NEXT: and %s4, %s4, (32)0
; CHECK-NEXT: lea.sl %s12, __udivti3@hi(, %s4)
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%3 = udiv i128 %0, %1
ret i128 %3
}
@@ -123,7 +160,22 @@ define zeroext i8 @divu8(i8 zeroext %a, i8 zeroext %b) {
; Function Attrs: norecurse nounwind readnone
define i128 @divi128ri(i128) {
; CHECK-LABEL: divi128ri:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB10_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB10_2:
; CHECK-NEXT: lea %s2, __divti3@lo
; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: lea.sl %s12, __divti3@hi(, %s2)
@@ -131,6 +183,9 @@ define i128 @divi128ri(i128) {
; CHECK-NEXT: or %s3, 0, (0)1
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%2 = sdiv i128 %0, 3
ret i128 %2
}
@@ -163,7 +218,22 @@ define signext i32 @divi32ri(i32 signext %a, i32 signext %b) {
; Function Attrs: norecurse nounwind readnone
define i128 @divu128ri(i128) {
; CHECK-LABEL: divu128ri:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB13_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB13_2:
; CHECK-NEXT: lea %s2, __udivti3@lo
; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: lea.sl %s12, __udivti3@hi(, %s2)
@@ -171,6 +241,9 @@ define i128 @divu128ri(i128) {
; CHECK-NEXT: or %s3, 0, (0)1
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%2 = udiv i128 %0, 3
ret i128 %2
}
@@ -189,10 +262,9 @@ define i64 @divu64ri(i64 %a, i64 %b) {
define zeroext i32 @divu32ri(i32 zeroext %a, i32 zeroext %b) {
; CHECK-LABEL: divu32ri:
; CHECK: # %bb.0:
-; CHECK-NEXT: lea %s1, -1431655765
-; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea %s1, 1431655765
; CHECK-NEXT: muls.l %s0, %s0, %s1
-; CHECK-NEXT: srl %s0, %s0, 33
+; CHECK-NEXT: srl %s0, %s0, 32
; CHECK-NEXT: b.l.t (, %s10)
%r = udiv i32 %a, 3
ret i32 %r
@@ -201,7 +273,22 @@ define zeroext i32 @divu32ri(i32 zeroext %a, i32 zeroext %b) {
; Function Attrs: norecurse nounwind readnone
define i128 @divi128li(i128) {
; CHECK-LABEL: divi128li:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB16_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB16_2:
; CHECK-NEXT: or %s3, 0, %s1
; CHECK-NEXT: or %s2, 0, %s0
; CHECK-NEXT: lea %s0, __divti3@lo
@@ -211,6 +298,9 @@ define i128 @divi128li(i128) {
; CHECK-NEXT: or %s1, 0, (0)1
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%2 = sdiv i128 3, %0
ret i128 %2
}
@@ -239,7 +329,22 @@ define signext i32 @divi32li(i32 signext %a, i32 signext %b) {
; Function Attrs: norecurse nounwind readnone
define i128 @divu128li(i128) {
; CHECK-LABEL: divu128li:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB19_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB19_2:
; CHECK-NEXT: or %s3, 0, %s1
; CHECK-NEXT: or %s2, 0, %s0
; CHECK-NEXT: lea %s0, __udivti3@lo
@@ -249,6 +354,9 @@ define i128 @divu128li(i128) {
; CHECK-NEXT: or %s1, 0, (0)1
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%2 = udiv i128 3, %0
ret i128 %2
}
diff --git a/llvm/test/CodeGen/VE/Scalar/rem.ll b/llvm/test/CodeGen/VE/Scalar/rem.ll
index 9911405c6a68d..4625e274baa1e 100644
--- a/llvm/test/CodeGen/VE/Scalar/rem.ll
+++ b/llvm/test/CodeGen/VE/Scalar/rem.ll
@@ -1,14 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=ve-unknown-unknown | FileCheck %s
; Function Attrs: norecurse nounwind readnone
define i128 @remi128(i128 %a, i128 %b) {
; CHECK-LABEL: remi128:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB0_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: lea %s4, __modti3@lo
; CHECK-NEXT: and %s4, %s4, (32)0
; CHECK-NEXT: lea.sl %s12, __modti3@hi(, %s4)
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%r = srem i128 %a, %b
ret i128 %r
}
@@ -41,12 +60,30 @@ define signext i32 @remi32(i32 signext %a, i32 signext %b) {
; Function Attrs: norecurse nounwind readnone
define i128 @remu128(i128 %a, i128 %b) {
; CHECK-LABEL: remu128:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB3_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB3_2:
; CHECK-NEXT: lea %s4, __umodti3@lo
; CHECK-NEXT: and %s4, %s4, (32)0
; CHECK-NEXT: lea.sl %s12, __umodti3@hi(, %s4)
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%r = urem i128 %a, %b
ret i128 %r
}
@@ -137,7 +174,22 @@ define zeroext i8 @remu8(i8 zeroext %a, i8 zeroext %b) {
; Function Attrs: norecurse nounwind readnone
define i128 @remi128ri(i128 %a) {
; CHECK-LABEL: remi128ri:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB10_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB10_2:
; CHECK-NEXT: lea %s2, __modti3@lo
; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: lea.sl %s12, __modti3@hi(, %s2)
@@ -145,6 +197,9 @@ define i128 @remi128ri(i128 %a) {
; CHECK-NEXT: or %s3, 0, (0)1
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%r = srem i128 %a, 3
ret i128 %r
}
@@ -181,7 +236,22 @@ define signext i32 @remi32ri(i32 signext %a) {
; Function Attrs: norecurse nounwind readnone
define i128 @remu128ri(i128 %a) {
; CHECK-LABEL: remu128ri:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB13_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB13_2:
; CHECK-NEXT: lea %s2, __umodti3@lo
; CHECK-NEXT: and %s2, %s2, (32)0
; CHECK-NEXT: lea.sl %s12, __umodti3@hi(, %s2)
@@ -189,6 +259,9 @@ define i128 @remu128ri(i128 %a) {
; CHECK-NEXT: or %s3, 0, (0)1
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%r = urem i128 %a, 11
ret i128 %r
}
@@ -209,10 +282,9 @@ define i64 @remu64ri(i64 %a) {
define zeroext i32 @remu32ri(i32 zeroext %a) {
; CHECK-LABEL: remu32ri:
; CHECK: # %bb.0:
-; CHECK-NEXT: lea %s1, -1431655765
-; CHECK-NEXT: and %s1, %s1, (32)0
+; CHECK-NEXT: lea %s1, 1431655765
; CHECK-NEXT: muls.l %s1, %s0, %s1
-; CHECK-NEXT: srl %s1, %s1, 33
+; CHECK-NEXT: srl %s1, %s1, 32
; CHECK-NEXT: muls.w.sx %s1, 3, %s1
; CHECK-NEXT: subs.w.sx %s0, %s0, %s1
; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1
@@ -224,7 +296,22 @@ define zeroext i32 @remu32ri(i32 zeroext %a) {
; Function Attrs: norecurse nounwind readnone
define i128 @remi128li(i128 %a) {
; CHECK-LABEL: remi128li:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB16_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB16_2:
; CHECK-NEXT: or %s3, 0, %s1
; CHECK-NEXT: or %s2, 0, %s0
; CHECK-NEXT: lea %s0, __modti3@lo
@@ -234,6 +321,9 @@ define i128 @remi128li(i128 %a) {
; CHECK-NEXT: or %s1, 0, (0)1
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%r = srem i128 3, %a
ret i128 %r
}
@@ -266,7 +356,22 @@ define signext i32 @remi32li(i32 signext %a, i32 signext %b) {
; Function Attrs: norecurse nounwind readnone
define i128 @remu128li(i128) {
; CHECK-LABEL: remu128li:
-; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: st %s9, (, %s11)
+; CHECK-NEXT: st %s10, 8(, %s11)
+; CHECK-NEXT: or %s9, 0, %s11
+; CHECK-NEXT: lea %s11, -240(, %s11)
+; CHECK-NEXT: brge.l.t %s11, %s8, .LBB19_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: ld %s61, 24(, %s14)
+; CHECK-NEXT: or %s62, 0, %s0
+; CHECK-NEXT: lea %s63, 315
+; CHECK-NEXT: shm.l %s63, (%s61)
+; CHECK-NEXT: shm.l %s8, 8(%s61)
+; CHECK-NEXT: shm.l %s11, 16(%s61)
+; CHECK-NEXT: monc
+; CHECK-NEXT: or %s0, 0, %s62
+; CHECK-NEXT: .LBB19_2:
; CHECK-NEXT: or %s3, 0, %s1
; CHECK-NEXT: or %s2, 0, %s0
; CHECK-NEXT: lea %s0, __umodti3@lo
@@ -276,6 +381,9 @@ define i128 @remu128li(i128) {
; CHECK-NEXT: or %s1, 0, (0)1
; CHECK-NEXT: bsic %s10, (, %s12)
; CHECK-NEXT: or %s11, 0, %s9
+; CHECK-NEXT: ld %s10, 8(, %s11)
+; CHECK-NEXT: ld %s9, (, %s11)
+; CHECK-NEXT: b.l.t (, %s10)
%2 = urem i128 3, %0
ret i128 %2
}
diff --git a/llvm/test/CodeGen/VE/Vector/vec_divrem.ll b/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
index 3bc0aba8d4264..940886ca1b096 100644
--- a/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
+++ b/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
@@ -7,19 +7,10 @@
define <4 x i8> @udiv_by_minus_one(<4 x i8> %x) {
; CHECK-LABEL: udiv_by_minus_one:
; CHECK: # %bb.0:
-; CHECK-NEXT: and %s0, %s0, (56)0
-; CHECK-NEXT: lea %s4, 16843010
-; CHECK-NEXT: muls.l %s0, %s0, %s4
-; CHECK-NEXT: srl %s0, %s0, 32
-; CHECK-NEXT: and %s1, %s1, (56)0
-; CHECK-NEXT: muls.l %s1, %s1, %s4
-; CHECK-NEXT: srl %s1, %s1, 32
-; CHECK-NEXT: and %s2, %s2, (56)0
-; CHECK-NEXT: muls.l %s2, %s2, %s4
-; CHECK-NEXT: srl %s2, %s2, 32
-; CHECK-NEXT: and %s3, %s3, (56)0
-; CHECK-NEXT: muls.l %s3, %s3, %s4
-; CHECK-NEXT: srl %s3, %s3, 32
+; CHECK-NEXT: or %s0, 0, (0)1
+; CHECK-NEXT: or %s1, 0, (0)1
+; CHECK-NEXT: or %s2, 0, (0)1
+; CHECK-NEXT: or %s3, 0, (0)1
; CHECK-NEXT: b.l.t (, %s10)
%r = udiv <4 x i8> %x, <i8 255, i8 255, i8 255, i8 255>
ret <4 x i8> %r
@@ -32,23 +23,6 @@ define <4 x i8> @urem_by_minus_one(<4 x i8> %x) {
; CHECK-NEXT: and %s1, %s1, (56)0
; CHECK-NEXT: and %s2, %s2, (56)0
; CHECK-NEXT: and %s3, %s3, (56)0
-; CHECK-NEXT: lea %s4, 16843010
-; CHECK-NEXT: muls.l %s5, %s3, %s4
-; CHECK-NEXT: srl %s5, %s5, 32
-; CHECK-NEXT: muls.w.sx %s5, %s5, (56)0
-; CHECK-NEXT: subs.w.sx %s3, %s3, %s5
-; CHECK-NEXT: muls.l %s5, %s2, %s4
-; CHECK-NEXT: srl %s5, %s5, 32
-; CHECK-NEXT: muls.w.sx %s5, %s5, (56)0
-; CHECK-NEXT: subs.w.sx %s2, %s2, %s5
-; CHECK-NEXT: muls.l %s5, %s1, %s4
-; CHECK-NEXT: srl %s5, %s5, 32
-; CHECK-NEXT: muls.w.sx %s5, %s5, (56)0
-; CHECK-NEXT: subs.w.sx %s1, %s1, %s5
-; CHECK-NEXT: muls.l %s4, %s0, %s4
-; CHECK-NEXT: srl %s4, %s4, 32
-; CHECK-NEXT: muls.w.sx %s4, %s4, (56)0
-; CHECK-NEXT: subs.w.sx %s0, %s0, %s4
; CHECK-NEXT: b.l.t (, %s10)
%r = urem <4 x i8> %x, <i8 255, i8 255, i8 255, i8 255>
ret <4 x i8> %r
diff --git a/llvm/test/CodeGen/X86/and-encoding.ll b/llvm/test/CodeGen/X86/and-encoding.ll
index 248686ff8b7a2..db60d2f561e3e 100644
--- a/llvm/test/CodeGen/X86/and-encoding.ll
+++ b/llvm/test/CodeGen/X86/and-encoding.ll
@@ -104,11 +104,10 @@ define i64 @lopped64_64to32(i64 %x) {
define i32 @shrinkAndKnownBits(i32 %x) {
; CHECK-LABEL: shrinkAndKnownBits:
; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edi, %ecx # encoding: [0x89,0xf9]
-; CHECK-NEXT: movl $4042322161, %eax # encoding: [0xb8,0xf1,0xf0,0xf0,0xf0]
-; CHECK-NEXT: # imm = 0xF0F0F0F1
-; CHECK-NEXT: imulq %rcx, %rax # encoding: [0x48,0x0f,0xaf,0xc1]
-; CHECK-NEXT: shrq $36, %rax # encoding: [0x48,0xc1,0xe8,0x24]
+; CHECK-NEXT: movl %edi, %eax # encoding: [0x89,0xf8]
+; CHECK-NEXT: imulq $252645135, %rax, %rax # encoding: [0x48,0x69,0xc0,0x0f,0x0f,0x0f,0x0f]
+; CHECK-NEXT: # imm = 0xF0F0F0F
+; CHECK-NEXT: shrq $32, %rax # encoding: [0x48,0xc1,0xe8,0x20]
; CHECK-NEXT: andl $-128, %eax # encoding: [0x83,0xe0,0x80]
; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-NEXT: retq # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index 3fb994cdb751a..e97b813e34921 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -695,9 +695,8 @@ define i64 @load_fold_udiv1(ptr %p) {
; CHECK-O3-LABEL: load_fold_udiv1:
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: movq (%rdi), %rdx
-; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
+; CHECK-O3-NEXT: movabsq $1229782938247303441, %rax # imm = 0x1111111111111111
; CHECK-O3-NEXT: mulxq %rax, %rax, %rax
-; CHECK-O3-NEXT: shrq $3, %rax
; CHECK-O3-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
%ret = udiv i64 %v, 15
@@ -882,10 +881,9 @@ define i64 @load_fold_urem1(ptr %p) {
; CHECK-O3-LABEL: load_fold_urem1:
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: movq (%rdi), %rax
-; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
+; CHECK-O3-NEXT: movabsq $1229782938247303441, %rcx # imm = 0x1111111111111111
; CHECK-O3-NEXT: movq %rax, %rdx
; CHECK-O3-NEXT: mulxq %rcx, %rcx, %rcx
-; CHECK-O3-NEXT: shrq $3, %rcx
; CHECK-O3-NEXT: leaq (%rcx,%rcx,4), %rcx
; CHECK-O3-NEXT: leaq (%rcx,%rcx,2), %rcx
; CHECK-O3-NEXT: subq %rcx, %rax
@@ -1493,9 +1491,8 @@ define void @rmw_fold_udiv1(ptr %p, i64 %v) {
; CHECK-LABEL: rmw_fold_udiv1:
; CHECK: # %bb.0:
; CHECK-NEXT: movq (%rdi), %rdx
-; CHECK-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
+; CHECK-NEXT: movabsq $1229782938247303441, %rax # imm = 0x1111111111111111
; CHECK-NEXT: mulxq %rax, %rax, %rax
-; CHECK-NEXT: shrq $3, %rax
; CHECK-NEXT: movq %rax, (%rdi)
; CHECK-NEXT: retq
%prev = load atomic i64, ptr %p unordered, align 8
@@ -1623,10 +1620,9 @@ define void @rmw_fold_urem1(ptr %p, i64 %v) {
; CHECK-O0-LABEL: rmw_fold_urem1:
; CHECK-O0: # %bb.0:
; CHECK-O0-NEXT: movq (%rdi), %rax
-; CHECK-O0-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
+; CHECK-O0-NEXT: movabsq $1229782938247303441, %rcx # imm = 0x1111111111111111
; CHECK-O0-NEXT: movq %rax, %rdx
; CHECK-O0-NEXT: mulxq %rcx, %rcx, %rcx
-; CHECK-O0-NEXT: shrq $3, %rcx
; CHECK-O0-NEXT: leaq (%rcx,%rcx,4), %rcx
; CHECK-O0-NEXT: leaq (%rcx,%rcx,2), %rcx
; CHECK-O0-NEXT: subq %rcx, %rax
@@ -1636,9 +1632,8 @@ define void @rmw_fold_urem1(ptr %p, i64 %v) {
; CHECK-O3-LABEL: rmw_fold_urem1:
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: movq (%rdi), %rdx
-; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
+; CHECK-O3-NEXT: movabsq $1229782938247303441, %rax # imm = 0x1111111111111111
; CHECK-O3-NEXT: mulxq %rax, %rax, %rax
-; CHECK-O3-NEXT: shrq $3, %rax
; CHECK-O3-NEXT: leaq (%rax,%rax,4), %rax
; CHECK-O3-NEXT: leaq (%rax,%rax,2), %rax
; CHECK-O3-NEXT: subq %rax, %rdx
diff --git a/llvm/test/CodeGen/X86/bug80500.ll b/llvm/test/CodeGen/X86/bug80500.ll
index bdf72887ef2f9..90864535c5145 100644
--- a/llvm/test/CodeGen/X86/bug80500.ll
+++ b/llvm/test/CodeGen/X86/bug80500.ll
@@ -7,9 +7,8 @@ define i32 @load_fold_udiv1(ptr %p) {
; CHECK-LABEL: load_fold_udiv1:
; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl $-2004318071, %edx # imm = 0x88888889
+; CHECK-NEXT: movl $286331153, %edx # imm = 0x11111111
; CHECK-NEXT: mulxl (%eax), %eax, %eax
-; CHECK-NEXT: shrl $3, %eax
; CHECK-NEXT: retl
%v = load i32, ptr %p, align 4
%ret = udiv i32 %v, 15
diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll
index aa3bea2791416..163c11c28882c 100644
--- a/llvm/test/CodeGen/X86/combine-pmuldq.ll
+++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll
@@ -203,68 +203,56 @@ define i32 @PR43159(ptr %a0) {
; SSE-LABEL: PR43159:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa (%rdi), %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $1, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [822987745,18122225,2164392967,3105965049]
+; SSE-NEXT: pmuludq %xmm0, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE-NEXT: psubd %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
-; SSE-NEXT: paddd %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: psrld $7, %xmm0
-; SSE-NEXT: psrld $6, %xmm2
-; SSE-NEXT: movd %xmm2, %edi
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $7, %xmm1
; SSE-NEXT: pextrd $1, %xmm0, %esi
-; SSE-NEXT: pextrd $2, %xmm2, %edx
-; SSE-NEXT: pextrd $3, %xmm0, %ecx
+; SSE-NEXT: psrld $5, %xmm0
+; SSE-NEXT: movd %xmm0, %edi
+; SSE-NEXT: pextrd $2, %xmm1, %edx
+; SSE-NEXT: pextrd $3, %xmm1, %ecx
; SSE-NEXT: jmp foo # TAILCALL
;
; AVX1-LABEL: PR43159:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $7, %xmm0, %xmm1
-; AVX1-NEXT: vpsrld $6, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %edi
-; AVX1-NEXT: vpextrd $1, %xmm1, %esi
-; AVX1-NEXT: vpextrd $2, %xmm0, %edx
+; AVX1-NEXT: vpsrld $5, %xmm0, %xmm2
+; AVX1-NEXT: vmovd %xmm2, %edi
+; AVX1-NEXT: vpextrd $1, %xmm0, %esi
+; AVX1-NEXT: vpextrd $2, %xmm1, %edx
; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
; AVX1-NEXT: jmp foo # TAILCALL
;
; AVX2-LABEL: PR43159:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %edi
@@ -276,18 +264,14 @@ define i32 @PR43159(ptr %a0) {
; AVX512VL-LABEL: PR43159:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vmovd %xmm0, %edi
@@ -299,18 +283,14 @@ define i32 @PR43159(ptr %a0) {
; AVX512DQVL-LABEL: PR43159:
; AVX512DQVL: # %bb.0: # %entry
; AVX512DQVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX512DQVL-NEXT: vpsubd %xmm2, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512DQVL-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512DQVL-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQVL-NEXT: vmovd %xmm0, %edi
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index d5a481549f851..d65399c84f7e7 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -449,7 +449,7 @@ define i32 @combine_udiv_uniform(i32 %x) {
; CHECK-LABEL: combine_udiv_uniform:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: movl $2987803337, %eax # imm = 0xB21642C9
+; CHECK-NEXT: movl $2987803335, %eax # imm = 0xB21642C7
; CHECK-NEXT: imulq %rcx, %rax
; CHECK-NEXT: shrq $36, %rax
; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
@@ -461,29 +461,19 @@ define i32 @combine_udiv_uniform(i32 %x) {
define <8 x i16> @combine_vec_udiv_uniform(<8 x i16> %x) {
; SSE-LABEL: combine_vec_udiv_uniform:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [25645,25645,25645,25645,25645,25645,25645,25645]
-; SSE-NEXT: pmulhuw %xmm0, %xmm1
-; SSE-NEXT: psubw %xmm1, %xmm0
-; SSE-NEXT: psrlw $1, %xmm0
-; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [45589,45589,45589,45589,45589,45589,45589,45589]
; SSE-NEXT: psrlw $4, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_udiv_uniform:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [25645,25645,25645,25645,25645,25645,25645,25645]
-; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [45589,45589,45589,45589,45589,45589,45589,45589]
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: combine_vec_udiv_uniform:
; XOP: # %bb.0:
-; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [25645,25645,25645,25645,25645,25645,25645,25645]
-; XOP-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: vpsrlw $1, %xmm0, %xmm0
-; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [45589,45589,45589,45589,45589,45589,45589,45589]
; XOP-NEXT: vpsrlw $4, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = udiv <8 x i16> %x, <i16 23, i16 23, i16 23, i16 23, i16 23, i16 23, i16 23, i16 23>
@@ -493,18 +483,12 @@ define <8 x i16> @combine_vec_udiv_uniform(<8 x i16> %x) {
define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_udiv_nonuniform:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrlw $3, %xmm3
-; SSE2-NEXT: pandn %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [45589,3855,32779,4681,32767,1,257,32767]
+; SSE2-NEXT: pmulhuw %xmm0, %xmm1
; SSE2-NEXT: psubw %xmm1, %xmm0
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,32768,32768,0,0,32768]
; SSE2-NEXT: paddw %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535]
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
@@ -512,36 +496,31 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
;
; SSE41-LABEL: combine_vec_udiv_nonuniform:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $3, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [45589,3855,32779,4681,32767,1,257,32767]
+; SSE41-NEXT: pmulhuw %xmm0, %xmm1
; SSE41-NEXT: psubw %xmm1, %xmm0
-; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,32768,32768,0,0,32768]
; SSE41-NEXT: paddw %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4096,2048,8,u,u,2,2,u]
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4096,32768,2,16384,1024,u,256,4]
; SSE41-NEXT: pmulhuw %xmm0, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6],xmm0[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_udiv_nonuniform:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [45589,3855,32779,4681,32767,1,257,32767]
; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,0,32768,32768,0,0,32768]
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [4096,2048,8,u,u,2,2,u]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6],xmm0[7]
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [4096,32768,2,16384,1024,u,256,4]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7]
; AVX-NEXT: retq
;
; XOP-LABEL: combine_vec_udiv_nonuniform:
; XOP: # %bb.0:
-; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [25645,61681,8195,9363,512,32769,32897,2]
+; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [45589,3855,32779,4681,32767,1,257,32767]
; XOP-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,0,32768,32768,0,0,32768]
; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
@@ -550,40 +529,31 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
}
define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
-; SSE2-LABEL: combine_vec_udiv_nonuniform2:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [16393,59919,58255,32787,55189,8197,52429,32789]
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [8,2048,2048,2,2048,8,2048,2]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: combine_vec_udiv_nonuniform2:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16393,59919,58255,32787,55189,8197,52429,32789]
-; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [8,2048,2048,2,2048,8,2048,2]
-; SSE41-NEXT: retq
+; SSE-LABEL: combine_vec_udiv_nonuniform2:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [2049,59917,58253,16393,55187,32787,13107,8197]
+; SSE-NEXT: pmulhuw %xmm0, %xmm1
+; SSE-NEXT: psubw %xmm1, %xmm0
+; SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32,2048,2048,4,2048,2,8192,8]
+; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_udiv_nonuniform2:
; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16393,59919,58255,32787,55189,8197,52429,32789]
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [8,2048,2048,2,2048,8,2048,2]
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [2049,59917,58253,16393,55187,32787,13107,8197]
+; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32,2048,2048,4,2048,2,8192,8]
; AVX-NEXT: retq
;
; XOP-LABEL: combine_vec_udiv_nonuniform2:
; XOP: # %bb.0:
-; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16393,59919,58255,32787,55189,8197,52429,32789]
+; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [2049,59917,58253,16393,55187,32787,13107,8197]
+; XOP-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
%1 = udiv <8 x i16> %x, <i16 -34, i16 35, i16 36, i16 -37, i16 38, i16 -39, i16 40, i16 -41>
@@ -591,31 +561,33 @@ define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
}
define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
-; SSE-LABEL: combine_vec_udiv_nonuniform3:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9363,25645,18351,12137,2115,23705,1041,517]
-; SSE-NEXT: pmulhuw %xmm0, %xmm1
-; SSE-NEXT: psubw %xmm1, %xmm0
-; SSE-NEXT: psrlw $1, %xmm0
-; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,4096,4096,4096,4096,2048,2048,1024]
-; SSE-NEXT: retq
+; SSE2-LABEL: combine_vec_udiv_nonuniform3:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [9363,45589,20971,38835,2115,44619,1041,517]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,65535,0,0]
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_vec_udiv_nonuniform3:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [9363,45589,20971,38835,2115,44619,1041,517]
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,4096,8192,4096,u,2048,u,u]
+; SSE41-NEXT: pmulhuw %xmm0, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5],xmm0[6,7]
+; SSE41-NEXT: retq
;
; AVX-LABEL: combine_vec_udiv_nonuniform3:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [9363,25645,18351,12137,2115,23705,1041,517]
-; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16384,4096,4096,4096,4096,2048,2048,1024]
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [9363,45589,20971,38835,2115,44619,1041,517]
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,4096,8192,4096,u,2048,u,u]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5],xmm0[6,7]
; AVX-NEXT: retq
;
; XOP-LABEL: combine_vec_udiv_nonuniform3:
; XOP: # %bb.0:
-; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [9363,25645,18351,12137,2115,23705,1041,517]
-; XOP-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: vpsrlw $1, %xmm0, %xmm0
-; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [9363,45589,20971,38835,2115,44619,1041,517]
; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
%1 = udiv <8 x i16> %x, <i16 7, i16 23, i16 25, i16 27, i16 31, i16 47, i16 63, i16 127>
@@ -631,7 +603,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: psrlw $15, %xmm0
+; SSE2-NEXT: psrlw $14, %xmm0
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
@@ -644,7 +616,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm2
-; SSE41-NEXT: psrlw $7, %xmm2
+; SSE41-NEXT: psrlw $6, %xmm2
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
@@ -657,7 +629,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX-NEXT: vpackuswb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $7, %xmm1, %xmm1
+; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615]
; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
@@ -665,12 +637,12 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
;
; XOP-LABEL: combine_vec_udiv_nonuniform4:
; XOP: # %bb.0:
-; XOP-NEXT: movl $171, %eax
+; XOP-NEXT: movl $85, %eax
; XOP-NEXT: vmovd %eax, %xmm1
; XOP-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; XOP-NEXT: vpmullw %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpsrlw $8, %xmm1, %xmm1
-; XOP-NEXT: movl $249, %eax
+; XOP-NEXT: movl $250, %eax
; XOP-NEXT: vmovd %eax, %xmm2
; XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm1
; XOP-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615]
@@ -683,13 +655,13 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
define <8 x i16> @pr38477(<8 x i16> %a0) {
; SSE2-LABEL: pr38477:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [u,4957,57457,4103,16385,35545,2048,2115]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [u,551,57455,32823,32769,4443,32767,2115]
; SSE2-NEXT: pmulhuw %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psubw %xmm1, %xmm2
-; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [u,32768,0,0,0,0,0,32768]
+; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [u,0,0,0,0,0,32768,0]
; SSE2-NEXT: paddw %xmm1, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,0,65535]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,0]
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
@@ -702,34 +674,34 @@ define <8 x i16> @pr38477(<8 x i16> %a0) {
;
; SSE41-LABEL: pr38477:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,4957,57457,4103,16385,35545,2048,2115]
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,551,57455,32823,32769,4443,32767,2115]
; SSE41-NEXT: pmulhuw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psubw %xmm1, %xmm2
-; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [u,32768,0,0,0,0,0,32768]
+; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [u,0,0,0,0,0,32768,0]
; SSE41-NEXT: paddw %xmm1, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,1024,1024,16,4,1024,u,4096]
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,u,1024,2,2,8192,4096,u]
; SSE41-NEXT: pmulhuw %xmm2, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6],xmm1[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6],xmm2[7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: pr38477:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,4957,57457,4103,16385,35545,2048,2115]
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,551,57455,32823,32769,4443,32767,2115]
; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,32768,0,0,0,0,0,32768]
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,0,0,0,0,0,32768,0]
; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [u,1024,1024,16,4,1024,u,4096]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7]
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [u,u,1024,2,2,8192,4096,u]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5,6],xmm1[7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX-NEXT: retq
;
; XOP-LABEL: pr38477:
; XOP: # %bb.0:
-; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,4957,57457,4103,16385,35545,2048,2115]
+; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,551,57455,32823,32769,4443,32767,2115]
; XOP-NEXT: vpsubw %xmm1, %xmm0, %xmm2
-; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,32768,0,0,0,0,0,32768]
+; XOP-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,0,0,0,0,0,32768,0]
; XOP-NEXT: vpaddw %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index ac78136b9d8ea..95be53d3e4a30 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -7,14 +7,14 @@ define zeroext i16 @test1(i16 zeroext %x) nounwind {
; X86-LABEL: test1:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull $63551, %eax, %eax # imm = 0xF83F
+; X86-NEXT: imull $63549, %eax, %eax # imm = 0xF83D
; X86-NEXT: shrl $21, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test1:
; X64: # %bb.0: # %entry
-; X64-NEXT: imull $63551, %edi, %eax # imm = 0xF83F
+; X64-NEXT: imull $63549, %edi, %eax # imm = 0xF83D
; X64-NEXT: shrl $21, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
@@ -27,15 +27,15 @@ define zeroext i16 @test2(i8 signext %x, i16 zeroext %c) nounwind readnone ssp n
; X86-LABEL: test2:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull $43691, %eax, %eax # imm = 0xAAAB
-; X86-NEXT: shrl $17, %eax
+; X86-NEXT: imull $21845, %eax, %eax # imm = 0x5555
+; X86-NEXT: shrl $16, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test2:
; X64: # %bb.0: # %entry
-; X64-NEXT: imull $43691, %esi, %eax # imm = 0xAAAB
-; X64-NEXT: shrl $17, %eax
+; X64-NEXT: imull $21845, %esi, %eax # imm = 0x5555
+; X64-NEXT: shrl $16, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
entry:
@@ -48,15 +48,14 @@ define zeroext i8 @test3(i8 zeroext %x, i8 zeroext %c) nounwind readnone ssp nor
; X86-LABEL: test3:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull $171, %eax, %eax
-; X86-NEXT: shrl $9, %eax
-; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: imull $85, %eax, %eax
+; X86-NEXT: movb %ah, %al
; X86-NEXT: retl
;
; X64-LABEL: test3:
; X64: # %bb.0: # %entry
-; X64-NEXT: imull $171, %esi, %eax
-; X64-NEXT: shrl $9, %eax
+; X64-NEXT: imull $85, %esi, %eax
+; X64-NEXT: shrl $8, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
entry:
@@ -93,17 +92,17 @@ entry:
define i32 @test5(i32 %A) nounwind {
; X86-LABEL: test5:
; X86: # %bb.0:
-; X86-NEXT: movl $365384439, %eax # imm = 0x15C752F7
+; X86-NEXT: movl $1461537755, %eax # imm = 0x571D4BDB
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: shrl $27, %eax
+; X86-NEXT: shrl $29, %eax
; X86-NEXT: retl
;
; X64-LABEL: test5:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: imulq $365384439, %rax, %rax # imm = 0x15C752F7
-; X64-NEXT: shrq $59, %rax
+; X64-NEXT: imulq $1461537755, %rax, %rax # imm = 0x571D4BDB
+; X64-NEXT: shrq $61, %rax
; X64-NEXT: # kill: def $eax killed $eax killed $rax
; X64-NEXT: retq
%tmp1 = udiv i32 %A, 1577682821 ; <i32> [#uses=1]
@@ -139,19 +138,27 @@ entry:
define i32 @test7(i32 %x) nounwind {
; X86-LABEL: test7:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $2, %eax
-; X86-NEXT: movl $613566757, %ecx # imm = 0x24924925
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $-1840700271, %edx # imm = 0x92492491
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: subl %edx, %ecx
+; X86-NEXT: shrl %ecx
+; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: shrl $4, %ecx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: test7:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: shrl $2, %edi
-; X64-NEXT: imulq $613566757, %rdi, %rax # imm = 0x24924925
+; X64-NEXT: movl %edi, %ecx
+; X64-NEXT: movl $2454267025, %eax # imm = 0x92492491
+; X64-NEXT: imulq %rcx, %rax
; X64-NEXT: shrq $32, %rax
+; X64-NEXT: subl %eax, %edi
+; X64-NEXT: shrl %edi
+; X64-NEXT: addl %edi, %eax
+; X64-NEXT: shrl $4, %eax
; X64-NEXT: # kill: def $eax killed $eax killed $rax
; X64-NEXT: retq
%div = udiv i32 %x, 28
@@ -163,19 +170,16 @@ define i8 @test8(i8 %x) nounwind {
; X86-LABEL: test8:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrb %al
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: imull $211, %eax, %eax
-; X86-NEXT: shrl $13, %eax
+; X86-NEXT: imull $209, %eax, %eax
+; X86-NEXT: shrl $14, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test8:
; X64: # %bb.0:
-; X64-NEXT: shrb %dil
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: imull $211, %eax, %eax
-; X64-NEXT: shrl $13, %eax
+; X64-NEXT: imull $209, %eax, %eax
+; X64-NEXT: shrl $14, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%div = udiv i8 %x, 78
@@ -186,19 +190,23 @@ define i8 @test9(i8 %x) nounwind {
; X86-LABEL: test9:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrb $2, %al
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: imull $71, %eax, %eax
-; X86-NEXT: shrl $11, %eax
+; X86-NEXT: imull $35, %eax, %ecx
+; X86-NEXT: subb %ch, %al
+; X86-NEXT: shrb %al
+; X86-NEXT: addb %ch, %al
+; X86-NEXT: shrb $4, %al
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
;
; X64-LABEL: test9:
; X64: # %bb.0:
-; X64-NEXT: shrb $2, %dil
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: imull $71, %eax, %eax
-; X64-NEXT: shrl $11, %eax
+; X64-NEXT: imull $35, %eax, %ecx
+; X64-NEXT: shrl $8, %ecx
+; X64-NEXT: subb %cl, %al
+; X64-NEXT: shrb %al
+; X64-NEXT: addb %cl, %al
+; X64-NEXT: shrb $4, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%div = udiv i8 %x, 116
@@ -311,10 +319,9 @@ define i64 @PR23590(i64 %x) nounwind {
;
; X64-FAST-LABEL: PR23590:
; X64-FAST: # %bb.0: # %entry
-; X64-FAST-NEXT: movabsq $6120523590596543007, %rcx # imm = 0x54F077C718E7C21F
+; X64-FAST-NEXT: movabsq $1494268454735485, %rcx # imm = 0x54F077C718E7D
; X64-FAST-NEXT: movq %rdi, %rax
; X64-FAST-NEXT: mulq %rcx
-; X64-FAST-NEXT: shrq $12, %rdx
; X64-FAST-NEXT: imulq $12345, %rdx, %rax # imm = 0x3039
; X64-FAST-NEXT: subq %rax, %rdi
; X64-FAST-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
@@ -325,10 +332,9 @@ define i64 @PR23590(i64 %x) nounwind {
;
; X64-SLOW-LABEL: PR23590:
; X64-SLOW: # %bb.0: # %entry
-; X64-SLOW-NEXT: movabsq $6120523590596543007, %rcx # imm = 0x54F077C718E7C21F
+; X64-SLOW-NEXT: movabsq $1494268454735485, %rcx # imm = 0x54F077C718E7D
; X64-SLOW-NEXT: movq %rdi, %rax
; X64-SLOW-NEXT: mulq %rcx
-; X64-SLOW-NEXT: shrq $12, %rdx
; X64-SLOW-NEXT: imulq $12345, %rdx, %rax # imm = 0x3039
; X64-SLOW-NEXT: subq %rax, %rdi
; X64-SLOW-NEXT: imulq $613566757, %rdi, %rax # imm = 0x24924925
@@ -376,12 +382,14 @@ define { i64, i32 } @PR38622(i64) nounwind {
;
; X64-LABEL: PR38622:
; X64: # %bb.0:
+; X64-NEXT: movabsq $4951760157141521099, %rcx # imm = 0x44B82FA09B5A52CB
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: shrq $11, %rax
-; X64-NEXT: movabsq $4835703278458517, %rcx # imm = 0x112E0BE826D695
; X64-NEXT: mulq %rcx
-; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $9, %rax
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: subq %rdx, %rax
+; X64-NEXT: shrq %rax
+; X64-NEXT: addq %rdx, %rax
+; X64-NEXT: shrq $30, %rax
; X64-NEXT: imull $-294967296, %eax, %ecx # imm = 0xEE6B2800
; X64-NEXT: subl %ecx, %edi
; X64-NEXT: movl %edi, %edx
@@ -455,10 +463,9 @@ define i64 @urem_i64_3(i64 %x) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
+; X86-NEXT: movl $1431655765, %edx # imm = 0x55555555
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edx
-; X86-NEXT: shrl %edx
; X86-NEXT: leal (%edx,%edx,2), %eax
; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, %eax
@@ -467,10 +474,9 @@ define i64 @urem_i64_3(i64 %x) nounwind {
;
; X64-LABEL: urem_i64_3:
; X64: # %bb.0: # %entry
-; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: shrq %rdx
; X64-NEXT: leaq (%rdx,%rdx,2), %rax
; X64-NEXT: subq %rax, %rdi
; X64-NEXT: movq %rdi, %rax
@@ -486,10 +492,9 @@ define i64 @urem_i64_5(i64 %x) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD
+; X86-NEXT: movl $858993459, %edx # imm = 0x33333333
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edx
-; X86-NEXT: shrl $2, %edx
; X86-NEXT: leal (%edx,%edx,4), %eax
; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, %eax
@@ -498,10 +503,9 @@ define i64 @urem_i64_5(i64 %x) nounwind {
;
; X64-LABEL: urem_i64_5:
; X64: # %bb.0: # %entry
-; X64-NEXT: movabsq $-3689348814741910323, %rcx # imm = 0xCCCCCCCCCCCCCCCD
+; X64-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: shrq $2, %rdx
; X64-NEXT: leaq (%rdx,%rdx,4), %rax
; X64-NEXT: subq %rax, %rdi
; X64-NEXT: movq %rdi, %rax
@@ -517,10 +521,9 @@ define i64 @urem_i64_15(i64 %x) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl $-2004318071, %edx # imm = 0x88888889
+; X86-NEXT: movl $286331153, %edx # imm = 0x11111111
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edx
-; X86-NEXT: shrl $3, %edx
; X86-NEXT: leal (%edx,%edx,4), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: subl %eax, %ecx
@@ -530,10 +533,9 @@ define i64 @urem_i64_15(i64 %x) nounwind {
;
; X64-LABEL: urem_i64_15:
; X64: # %bb.0: # %entry
-; X64-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
+; X64-NEXT: movabsq $1229782938247303441, %rcx # imm = 0x1111111111111111
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: shrq $3, %rdx
; X64-NEXT: leaq (%rdx,%rdx,4), %rax
; X64-NEXT: leaq (%rax,%rax,2), %rax
; X64-NEXT: subq %rax, %rdi
@@ -550,28 +552,26 @@ define i64 @urem_i64_17(i64 %x) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl $-252645135, %edx # imm = 0xF0F0F0F1
+; X86-NEXT: movl $252645135, %edx # imm = 0xF0F0F0F
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edx
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: andl $-16, %eax
-; X86-NEXT: shrl $4, %edx
-; X86-NEXT: addl %eax, %edx
-; X86-NEXT: subl %edx, %ecx
+; X86-NEXT: shll $4, %eax
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: retl
;
; X64-LABEL: urem_i64_17:
; X64: # %bb.0: # %entry
-; X64-NEXT: movabsq $-1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F1
+; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: andq $-16, %rax
-; X64-NEXT: shrq $4, %rdx
-; X64-NEXT: addq %rax, %rdx
-; X64-NEXT: subq %rdx, %rdi
+; X64-NEXT: shlq $4, %rax
+; X64-NEXT: addq %rdx, %rax
+; X64-NEXT: subq %rax, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
entry:
@@ -588,9 +588,8 @@ define i64 @urem_i64_255(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: addl %esi, %eax
; X86-NEXT: adcl $0, %eax
-; X86-NEXT: movl $-2139062143, %edx # imm = 0x80808081
+; X86-NEXT: movl $16843009, %edx # imm = 0x1010101
; X86-NEXT: mull %edx
-; X86-NEXT: shrl $7, %edx
; X86-NEXT: movl %edx, %eax
; X86-NEXT: shll $8, %eax
; X86-NEXT: subl %eax, %edx
@@ -603,10 +602,9 @@ define i64 @urem_i64_255(i64 %x) nounwind {
;
; X64-LABEL: urem_i64_255:
; X64: # %bb.0: # %entry
-; X64-NEXT: movabsq $-9187201950435737471, %rcx # imm = 0x8080808080808081
+; X64-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: shrq $7, %rdx
; X64-NEXT: movq %rdx, %rax
; X64-NEXT: shlq $8, %rax
; X64-NEXT: subq %rax, %rdx
@@ -623,28 +621,26 @@ define i64 @urem_i64_257(i64 %x) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl $-16711935, %edx # imm = 0xFF00FF01
+; X86-NEXT: movl $16711935, %edx # imm = 0xFF00FF
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edx
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: andl $-256, %eax
-; X86-NEXT: shrl $8, %edx
-; X86-NEXT: addl %eax, %edx
-; X86-NEXT: subl %edx, %ecx
+; X86-NEXT: shll $8, %eax
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: retl
;
; X64-LABEL: urem_i64_257:
; X64: # %bb.0: # %entry
-; X64-NEXT: movabsq $-71777214294589695, %rcx # imm = 0xFF00FF00FF00FF01
+; X64-NEXT: movabsq $71777214294589695, %rcx # imm = 0xFF00FF00FF00FF
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: andq $-256, %rax
-; X64-NEXT: shrq $8, %rdx
-; X64-NEXT: addq %rax, %rdx
-; X64-NEXT: subq %rdx, %rdi
+; X64-NEXT: shlq $8, %rax
+; X64-NEXT: addq %rdx, %rax
+; X64-NEXT: subq %rax, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
entry:
@@ -661,9 +657,8 @@ define i64 @urem_i64_65535(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: addl %esi, %eax
; X86-NEXT: adcl $0, %eax
-; X86-NEXT: movl $-2147450879, %edx # imm = 0x80008001
+; X86-NEXT: movl $65537, %edx # imm = 0x10001
; X86-NEXT: mull %edx
-; X86-NEXT: shrl $15, %edx
; X86-NEXT: movl %edx, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: subl %eax, %edx
@@ -676,10 +671,9 @@ define i64 @urem_i64_65535(i64 %x) nounwind {
;
; X64-LABEL: urem_i64_65535:
; X64: # %bb.0: # %entry
-; X64-NEXT: movabsq $-9223231297218904063, %rcx # imm = 0x8000800080008001
+; X64-NEXT: movabsq $281479271743489, %rcx # imm = 0x1000100010001
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: shrq $15, %rdx
; X64-NEXT: movq %rdx, %rax
; X64-NEXT: shlq $16, %rax
; X64-NEXT: subq %rax, %rdx
@@ -696,12 +690,12 @@ define i64 @urem_i64_65537(i64 %x) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl $-65535, %edx # imm = 0xFFFF0001
+; X86-NEXT: movl $65535, %edx # imm = 0xFFFF
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edx
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: shrl $16, %eax
-; X86-NEXT: shldl $16, %edx, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: orl %edx, %eax
; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: xorl %edx, %edx
@@ -709,14 +703,13 @@ define i64 @urem_i64_65537(i64 %x) nounwind {
;
; X64-LABEL: urem_i64_65537:
; X64: # %bb.0: # %entry
-; X64-NEXT: movabsq $-281470681808895, %rcx # imm = 0xFFFF0000FFFF0001
+; X64-NEXT: movabsq $281470681808895, %rcx # imm = 0xFFFF0000FFFF
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000
-; X64-NEXT: shrq $16, %rdx
-; X64-NEXT: addq %rax, %rdx
-; X64-NEXT: subq %rdx, %rdi
+; X64-NEXT: shlq $16, %rax
+; X64-NEXT: addq %rdx, %rax
+; X64-NEXT: subq %rax, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
entry:
@@ -735,10 +728,9 @@ define i64 @urem_i64_12(i64 %x) nounwind {
; X86-NEXT: shldl $30, %esi, %ecx
; X86-NEXT: addl %eax, %ecx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
+; X86-NEXT: movl $1431655765, %edx # imm = 0x55555555
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edx
-; X86-NEXT: shrl %edx
; X86-NEXT: leal (%edx,%edx,2), %eax
; X86-NEXT: subl %eax, %ecx
; X86-NEXT: andl $3, %esi
@@ -749,10 +741,9 @@ define i64 @urem_i64_12(i64 %x) nounwind {
;
; X64-LABEL: urem_i64_12:
; X64: # %bb.0: # %entry
-; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: shrq %rdx
; X64-NEXT: andq $-4, %rdx
; X64-NEXT: leaq (%rdx,%rdx,2), %rax
; X64-NEXT: subq %rax, %rdi
@@ -766,7 +757,6 @@ entry:
define i64 @udiv_i64_3(i64 %x) nounwind {
; X86-LABEL: udiv_i64_3:
; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -774,32 +764,30 @@ define i64 @udiv_i64_3(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl $-1431655765, %ebx # imm = 0xAAAAAAAB
+; X86-NEXT: movl $1431655765, %edx # imm = 0x55555555
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: shrl %edx
+; X86-NEXT: mull %edx
; X86-NEXT: leal (%edx,%edx,2), %eax
; X86-NEXT: subl %eax, %esi
; X86-NEXT: subl %esi, %ecx
; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %edx
; X86-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: udiv_i64_3:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq %rax
; X64-NEXT: retq
entry:
%rem = udiv i64 %x, 3
@@ -809,7 +797,6 @@ entry:
define i64 @udiv_i64_5(i64 %x) nounwind {
; X86-LABEL: udiv_i64_5:
; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -817,32 +804,30 @@ define i64 @udiv_i64_5(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl $-858993459, %ebx # imm = 0xCCCCCCCD
+; X86-NEXT: movl $858993459, %edx # imm = 0x33333333
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: shrl $2, %edx
+; X86-NEXT: mull %edx
; X86-NEXT: leal (%edx,%edx,4), %eax
; X86-NEXT: subl %eax, %esi
; X86-NEXT: subl %esi, %ecx
; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: movl $-858993459, %edx # imm = 0xCCCCCCCD
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %edx
; X86-NEXT: imull $-858993460, %ecx, %ecx # imm = 0xCCCCCCCC
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: imull $-858993459, %edi, %ecx # imm = 0xCCCCCCCD
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: udiv_i64_5:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movabsq $-3689348814741910323, %rcx # imm = 0xCCCCCCCCCCCCCCCD
+; X64-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $2, %rax
; X64-NEXT: retq
entry:
%rem = udiv i64 %x, 5
@@ -859,10 +844,9 @@ define i64 @udiv_i64_15(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl $-2004318071, %edx # imm = 0x88888889
+; X86-NEXT: movl $286331153, %edx # imm = 0x11111111
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %edx
-; X86-NEXT: shrl $3, %edx
; X86-NEXT: leal (%edx,%edx,4), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: subl %eax, %esi
@@ -882,10 +866,9 @@ define i64 @udiv_i64_15(i64 %x) nounwind {
; X64-LABEL: udiv_i64_15:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
+; X64-NEXT: movabsq $1229782938247303441, %rcx # imm = 0x1111111111111111
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $3, %rax
; X64-NEXT: retq
entry:
%rem = udiv i64 %x, 15
@@ -895,7 +878,6 @@ entry:
define i64 @udiv_i64_17(i64 %x) nounwind {
; X86-LABEL: udiv_i64_17:
; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -903,34 +885,32 @@ define i64 @udiv_i64_17(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl $-252645135, %ebx # imm = 0xF0F0F0F1
+; X86-NEXT: movl $252645135, %edx # imm = 0xF0F0F0F
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %edx
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: andl $-16, %eax
-; X86-NEXT: shrl $4, %edx
-; X86-NEXT: addl %eax, %edx
-; X86-NEXT: subl %edx, %esi
+; X86-NEXT: shll $4, %eax
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: subl %eax, %esi
; X86-NEXT: subl %esi, %ecx
; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: movl $-252645135, %edx # imm = 0xF0F0F0F1
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %edx
; X86-NEXT: imull $-252645136, %ecx, %ecx # imm = 0xF0F0F0F0
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: imull $-252645135, %edi, %ecx # imm = 0xF0F0F0F1
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: udiv_i64_17:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movabsq $-1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F1
+; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $4, %rax
; X64-NEXT: retq
entry:
%rem = udiv i64 %x, 17
@@ -946,9 +926,8 @@ define i64 @udiv_i64_255(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: addl %esi, %eax
; X86-NEXT: adcl $0, %eax
-; X86-NEXT: movl $-2139062143, %edx # imm = 0x80808081
+; X86-NEXT: movl $16843009, %edx # imm = 0x1010101
; X86-NEXT: mull %edx
-; X86-NEXT: shrl $7, %edx
; X86-NEXT: movl %edx, %eax
; X86-NEXT: shll $8, %eax
; X86-NEXT: subl %eax, %edx
@@ -970,10 +949,9 @@ define i64 @udiv_i64_255(i64 %x) nounwind {
; X64-LABEL: udiv_i64_255:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movabsq $-9187201950435737471, %rcx # imm = 0x8080808080808081
+; X64-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $7, %rax
; X64-NEXT: retq
entry:
%rem = udiv i64 %x, 255
@@ -983,7 +961,6 @@ entry:
define i64 @udiv_i64_257(i64 %x) nounwind {
; X86-LABEL: udiv_i64_257:
; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -991,34 +968,32 @@ define i64 @udiv_i64_257(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl $-16711935, %ebx # imm = 0xFF00FF01
+; X86-NEXT: movl $16711935, %edx # imm = 0xFF00FF
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %edx
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: andl $-256, %eax
-; X86-NEXT: shrl $8, %edx
-; X86-NEXT: addl %eax, %edx
-; X86-NEXT: subl %edx, %esi
+; X86-NEXT: shll $8, %eax
+; X86-NEXT: addl %edx, %eax
+; X86-NEXT: subl %eax, %esi
; X86-NEXT: subl %esi, %ecx
; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: movl $-16711935, %edx # imm = 0xFF00FF01
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %edx
; X86-NEXT: imull $-16711936, %ecx, %ecx # imm = 0xFF00FF00
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: imull $-16711935, %edi, %ecx # imm = 0xFF00FF01
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: udiv_i64_257:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movabsq $-71777214294589695, %rcx # imm = 0xFF00FF00FF00FF01
+; X64-NEXT: movabsq $71777214294589695, %rcx # imm = 0xFF00FF00FF00FF
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $8, %rax
; X64-NEXT: retq
entry:
%rem = udiv i64 %x, 257
@@ -1034,9 +1009,8 @@ define i64 @udiv_i64_65535(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: addl %esi, %eax
; X86-NEXT: adcl $0, %eax
-; X86-NEXT: movl $-2147450879, %edx # imm = 0x80008001
+; X86-NEXT: movl $65537, %edx # imm = 0x10001
; X86-NEXT: mull %edx
-; X86-NEXT: shrl $15, %edx
; X86-NEXT: movl %edx, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: subl %eax, %edx
@@ -1060,10 +1034,9 @@ define i64 @udiv_i64_65535(i64 %x) nounwind {
; X64-LABEL: udiv_i64_65535:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movabsq $-9223231297218904063, %rcx # imm = 0x8000800080008001
+; X64-NEXT: movabsq $281479271743489, %rcx # imm = 0x1000100010001
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $15, %rax
; X64-NEXT: retq
entry:
%rem = udiv i64 %x, 65535
@@ -1073,7 +1046,6 @@ entry:
define i64 @udiv_i64_65537(i64 %x) nounwind {
; X86-LABEL: udiv_i64_65537:
; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -1081,17 +1053,18 @@ define i64 @udiv_i64_65537(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl $-65535, %ebx # imm = 0xFFFF0001
+; X86-NEXT: movl $65535, %edx # imm = 0xFFFF
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %edx
; X86-NEXT: movl %edx, %eax
-; X86-NEXT: shrl $16, %eax
-; X86-NEXT: shldl $16, %edx, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: orl %edx, %eax
; X86-NEXT: subl %eax, %esi
; X86-NEXT: subl %esi, %ecx
; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: movl $-65535, %edx # imm = 0xFFFF0001
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %edx
; X86-NEXT: shll $16, %ecx
; X86-NEXT: subl %ecx, %edx
; X86-NEXT: movl %edi, %ecx
@@ -1100,16 +1073,14 @@ define i64 @udiv_i64_65537(i64 %x) nounwind {
; X86-NEXT: addl %edi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: udiv_i64_65537:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movabsq $-281470681808895, %rcx # imm = 0xFFFF0000FFFF0001
+; X64-NEXT: movabsq $281470681808895, %rcx # imm = 0xFFFF0000FFFF
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $16, %rax
; X64-NEXT: retq
entry:
%rem = udiv i64 %x, 65537
@@ -1119,7 +1090,6 @@ entry:
define i64 @udiv_i64_12(i64 %x) nounwind {
; X86-LABEL: udiv_i64_12:
; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -1129,32 +1099,31 @@ define i64 @udiv_i64_12(i64 %x) nounwind {
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: movl $-1431655765, %ebx # imm = 0xAAAAAAAB
+; X86-NEXT: movl $1431655765, %edx # imm = 0x55555555
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: shrl %edx
+; X86-NEXT: mull %edx
; X86-NEXT: leal (%edx,%edx,2), %eax
; X86-NEXT: subl %eax, %esi
; X86-NEXT: subl %esi, %ecx
; X86-NEXT: sbbl $0, %edi
+; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %edx
; X86-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-LABEL: udiv_i64_12:
; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; X64-NEXT: mulq %rcx
; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: shrq $3, %rax
+; X64-NEXT: shrq $2, %rax
; X64-NEXT: retq
entry:
%rem = udiv i64 %x, 12
@@ -1176,10 +1145,9 @@ define i64 @urem_i64_3_optsize(i64 %x) nounwind optsize {
;
; X64-LABEL: urem_i64_3_optsize:
; X64: # %bb.0: # %entry
-; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rcx
-; X64-NEXT: shrq %rdx
; X64-NEXT: leaq (%rdx,%rdx,2), %rax
; X64-NEXT: subq %rax, %rdi
; X64-NEXT: movq %rdi, %rax
diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
index 3796dd796eaf9..df763d04d6681 100644
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -99,13 +99,13 @@ define i64 @udiv128(i128 %x) nounwind {
; X86-64: # %bb.0:
; X86-64-NEXT: addq %rdi, %rsi
; X86-64-NEXT: adcq $0, %rsi
-; X86-64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X86-64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; X86-64-NEXT: movq %rsi, %rax
; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: shrq %rdx
-; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax
-; X86-64-NEXT: subq %rsi, %rax
-; X86-64-NEXT: addq %rdi, %rax
+; X86-64-NEXT: leaq (%rdx,%rdx,2), %rcx
+; X86-64-NEXT: subq %rsi, %rcx
+; X86-64-NEXT: addq %rdi, %rcx
+; X86-64-NEXT: movabsq $-6148914691236517205, %rax # imm = 0xAAAAAAAAAAAAAAAB
; X86-64-NEXT: imulq %rcx, %rax
; X86-64-NEXT: retq
;
@@ -114,14 +114,14 @@ define i64 @udiv128(i128 %x) nounwind {
; WIN64-NEXT: movq %rdx, %r8
; WIN64-NEXT: addq %rcx, %r8
; WIN64-NEXT: adcq $0, %r8
-; WIN64-NEXT: movabsq $-6148914691236517205, %r9 # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT: movabsq $6148914691236517205, %rdx # imm = 0x5555555555555555
; WIN64-NEXT: movq %r8, %rax
-; WIN64-NEXT: mulq %r9
-; WIN64-NEXT: shrq %rdx
-; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax
-; WIN64-NEXT: subq %r8, %rax
-; WIN64-NEXT: addq %rcx, %rax
-; WIN64-NEXT: imulq %r9, %rax
+; WIN64-NEXT: mulq %rdx
+; WIN64-NEXT: leaq (%rdx,%rdx,2), %rdx
+; WIN64-NEXT: subq %r8, %rdx
+; WIN64-NEXT: addq %rcx, %rdx
+; WIN64-NEXT: movabsq $-6148914691236517205, %rax # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT: imulq %rdx, %rax
; WIN64-NEXT: retq
@@ -135,10 +135,9 @@ define i128 @urem_i128_3(i128 %x) nounwind {
; X86-64: # %bb.0: # %entry
; X86-64-NEXT: addq %rsi, %rdi
; X86-64-NEXT: adcq $0, %rdi
-; X86-64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X86-64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: shrq %rdx
; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax
; X86-64-NEXT: subq %rax, %rdi
; X86-64-NEXT: movq %rdi, %rax
@@ -149,10 +148,9 @@ define i128 @urem_i128_3(i128 %x) nounwind {
; WIN64: # %bb.0: # %entry
; WIN64-NEXT: addq %rdx, %rcx
; WIN64-NEXT: adcq $0, %rcx
-; WIN64-NEXT: movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT: movabsq $6148914691236517205, %rdx # imm = 0x5555555555555555
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: shrq %rdx
; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax
; WIN64-NEXT: subq %rax, %rcx
; WIN64-NEXT: movq %rcx, %rax
@@ -168,10 +166,9 @@ define i128 @urem_i128_5(i128 %x) nounwind {
; X86-64: # %bb.0: # %entry
; X86-64-NEXT: addq %rsi, %rdi
; X86-64-NEXT: adcq $0, %rdi
-; X86-64-NEXT: movabsq $-3689348814741910323, %rcx # imm = 0xCCCCCCCCCCCCCCCD
+; X86-64-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: shrq $2, %rdx
; X86-64-NEXT: leaq (%rdx,%rdx,4), %rax
; X86-64-NEXT: subq %rax, %rdi
; X86-64-NEXT: movq %rdi, %rax
@@ -182,10 +179,9 @@ define i128 @urem_i128_5(i128 %x) nounwind {
; WIN64: # %bb.0: # %entry
; WIN64-NEXT: addq %rdx, %rcx
; WIN64-NEXT: adcq $0, %rcx
-; WIN64-NEXT: movabsq $-3689348814741910323, %rdx # imm = 0xCCCCCCCCCCCCCCCD
+; WIN64-NEXT: movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: shrq $2, %rdx
; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax
; WIN64-NEXT: subq %rax, %rcx
; WIN64-NEXT: movq %rcx, %rax
@@ -201,10 +197,9 @@ define i128 @urem_i128_15(i128 %x) nounwind {
; X86-64: # %bb.0: # %entry
; X86-64-NEXT: addq %rsi, %rdi
; X86-64-NEXT: adcq $0, %rdi
-; X86-64-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
+; X86-64-NEXT: movabsq $1229782938247303441, %rcx # imm = 0x1111111111111111
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: shrq $3, %rdx
; X86-64-NEXT: leaq (%rdx,%rdx,4), %rax
; X86-64-NEXT: leaq (%rax,%rax,2), %rax
; X86-64-NEXT: subq %rax, %rdi
@@ -216,10 +211,9 @@ define i128 @urem_i128_15(i128 %x) nounwind {
; WIN64: # %bb.0: # %entry
; WIN64-NEXT: addq %rdx, %rcx
; WIN64-NEXT: adcq $0, %rcx
-; WIN64-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
+; WIN64-NEXT: movabsq $1229782938247303441, %rdx # imm = 0x1111111111111111
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: shrq $3, %rdx
; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax
; WIN64-NEXT: leaq (%rax,%rax,2), %rax
; WIN64-NEXT: subq %rax, %rcx
@@ -236,14 +230,13 @@ define i128 @urem_i128_17(i128 %x) nounwind {
; X86-64: # %bb.0: # %entry
; X86-64-NEXT: addq %rsi, %rdi
; X86-64-NEXT: adcq $0, %rdi
-; X86-64-NEXT: movabsq $-1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F1
+; X86-64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %rcx
; X86-64-NEXT: movq %rdx, %rax
-; X86-64-NEXT: andq $-16, %rax
-; X86-64-NEXT: shrq $4, %rdx
-; X86-64-NEXT: addq %rax, %rdx
-; X86-64-NEXT: subq %rdx, %rdi
+; X86-64-NEXT: shlq $4, %rax
+; X86-64-NEXT: addq %rdx, %rax
+; X86-64-NEXT: subq %rax, %rdi
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: xorl %edx, %edx
; X86-64-NEXT: retq
@@ -252,14 +245,13 @@ define i128 @urem_i128_17(i128 %x) nounwind {
; WIN64: # %bb.0: # %entry
; WIN64-NEXT: addq %rdx, %rcx
; WIN64-NEXT: adcq $0, %rcx
-; WIN64-NEXT: movabsq $-1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F1
+; WIN64-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %rdx
; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: andq $-16, %rax
-; WIN64-NEXT: shrq $4, %rdx
-; WIN64-NEXT: addq %rax, %rdx
-; WIN64-NEXT: subq %rdx, %rcx
+; WIN64-NEXT: shlq $4, %rax
+; WIN64-NEXT: addq %rdx, %rax
+; WIN64-NEXT: subq %rax, %rcx
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: xorl %edx, %edx
; WIN64-NEXT: retq
@@ -274,9 +266,8 @@ define i128 @urem_i128_255(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: addq %rsi, %rax
; X86-64-NEXT: adcq $0, %rax
-; X86-64-NEXT: movabsq $-9187201950435737471, %rcx # imm = 0x8080808080808081
+; X86-64-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: shrq $7, %rdx
; X86-64-NEXT: movq %rdx, %rax
; X86-64-NEXT: shlq $8, %rax
; X86-64-NEXT: subq %rax, %rdx
@@ -292,9 +283,8 @@ define i128 @urem_i128_255(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: addq %rdx, %rax
; WIN64-NEXT: adcq $0, %rax
-; WIN64-NEXT: movabsq $-9187201950435737471, %rdx # imm = 0x8080808080808081
+; WIN64-NEXT: movabsq $72340172838076673, %rdx # imm = 0x101010101010101
; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: shrq $7, %rdx
; WIN64-NEXT: movq %rdx, %rax
; WIN64-NEXT: shlq $8, %rax
; WIN64-NEXT: subq %rax, %rdx
@@ -313,14 +303,13 @@ define i128 @urem_i128_257(i128 %x) nounwind {
; X86-64: # %bb.0: # %entry
; X86-64-NEXT: addq %rsi, %rdi
; X86-64-NEXT: adcq $0, %rdi
-; X86-64-NEXT: movabsq $-71777214294589695, %rcx # imm = 0xFF00FF00FF00FF01
+; X86-64-NEXT: movabsq $71777214294589695, %rcx # imm = 0xFF00FF00FF00FF
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %rcx
; X86-64-NEXT: movq %rdx, %rax
-; X86-64-NEXT: andq $-256, %rax
-; X86-64-NEXT: shrq $8, %rdx
-; X86-64-NEXT: addq %rax, %rdx
-; X86-64-NEXT: subq %rdx, %rdi
+; X86-64-NEXT: shlq $8, %rax
+; X86-64-NEXT: addq %rdx, %rax
+; X86-64-NEXT: subq %rax, %rdi
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: xorl %edx, %edx
; X86-64-NEXT: retq
@@ -329,14 +318,13 @@ define i128 @urem_i128_257(i128 %x) nounwind {
; WIN64: # %bb.0: # %entry
; WIN64-NEXT: addq %rdx, %rcx
; WIN64-NEXT: adcq $0, %rcx
-; WIN64-NEXT: movabsq $-71777214294589695, %rdx # imm = 0xFF00FF00FF00FF01
+; WIN64-NEXT: movabsq $71777214294589695, %rdx # imm = 0xFF00FF00FF00FF
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %rdx
; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: andq $-256, %rax
-; WIN64-NEXT: shrq $8, %rdx
-; WIN64-NEXT: addq %rax, %rdx
-; WIN64-NEXT: subq %rdx, %rcx
+; WIN64-NEXT: shlq $8, %rax
+; WIN64-NEXT: addq %rdx, %rax
+; WIN64-NEXT: subq %rax, %rcx
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: xorl %edx, %edx
; WIN64-NEXT: retq
@@ -351,9 +339,8 @@ define i128 @urem_i128_65535(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: addq %rsi, %rax
; X86-64-NEXT: adcq $0, %rax
-; X86-64-NEXT: movabsq $-9223231297218904063, %rcx # imm = 0x8000800080008001
+; X86-64-NEXT: movabsq $281479271743489, %rcx # imm = 0x1000100010001
; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: shrq $15, %rdx
; X86-64-NEXT: movq %rdx, %rax
; X86-64-NEXT: shlq $16, %rax
; X86-64-NEXT: subq %rax, %rdx
@@ -369,9 +356,8 @@ define i128 @urem_i128_65535(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: addq %rdx, %rax
; WIN64-NEXT: adcq $0, %rax
-; WIN64-NEXT: movabsq $-9223231297218904063, %rdx # imm = 0x8000800080008001
+; WIN64-NEXT: movabsq $281479271743489, %rdx # imm = 0x1000100010001
; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: shrq $15, %rdx
; WIN64-NEXT: movq %rdx, %rax
; WIN64-NEXT: shlq $16, %rax
; WIN64-NEXT: subq %rax, %rdx
@@ -390,14 +376,13 @@ define i128 @urem_i128_65537(i128 %x) nounwind {
; X86-64: # %bb.0: # %entry
; X86-64-NEXT: addq %rsi, %rdi
; X86-64-NEXT: adcq $0, %rdi
-; X86-64-NEXT: movabsq $-281470681808895, %rcx # imm = 0xFFFF0000FFFF0001
+; X86-64-NEXT: movabsq $281470681808895, %rcx # imm = 0xFFFF0000FFFF
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %rcx
; X86-64-NEXT: movq %rdx, %rax
-; X86-64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000
-; X86-64-NEXT: shrq $16, %rdx
-; X86-64-NEXT: addq %rax, %rdx
-; X86-64-NEXT: subq %rdx, %rdi
+; X86-64-NEXT: shlq $16, %rax
+; X86-64-NEXT: addq %rdx, %rax
+; X86-64-NEXT: subq %rax, %rdi
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: xorl %edx, %edx
; X86-64-NEXT: retq
@@ -406,14 +391,13 @@ define i128 @urem_i128_65537(i128 %x) nounwind {
; WIN64: # %bb.0: # %entry
; WIN64-NEXT: addq %rdx, %rcx
; WIN64-NEXT: adcq $0, %rcx
-; WIN64-NEXT: movabsq $-281470681808895, %rdx # imm = 0xFFFF0000FFFF0001
+; WIN64-NEXT: movabsq $281470681808895, %rdx # imm = 0xFFFF0000FFFF
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %rdx
; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000
-; WIN64-NEXT: shrq $16, %rdx
-; WIN64-NEXT: addq %rax, %rdx
-; WIN64-NEXT: subq %rdx, %rcx
+; WIN64-NEXT: shlq $16, %rax
+; WIN64-NEXT: addq %rdx, %rax
+; WIN64-NEXT: subq %rax, %rcx
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: xorl %edx, %edx
; WIN64-NEXT: retq
@@ -430,10 +414,9 @@ define i128 @urem_i128_12(i128 %x) nounwind {
; X86-64-NEXT: shrq $2, %rsi
; X86-64-NEXT: addq %rsi, %rcx
; X86-64-NEXT: adcq $0, %rcx
-; X86-64-NEXT: movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB
+; X86-64-NEXT: movabsq $6148914691236517205, %rdx # imm = 0x5555555555555555
; X86-64-NEXT: movq %rcx, %rax
; X86-64-NEXT: mulq %rdx
-; X86-64-NEXT: shrq %rdx
; X86-64-NEXT: leal (%rdx,%rdx,2), %eax
; X86-64-NEXT: subl %eax, %ecx
; X86-64-NEXT: andl $3, %edi
@@ -448,10 +431,9 @@ define i128 @urem_i128_12(i128 %x) nounwind {
; WIN64-NEXT: shrq $2, %rdx
; WIN64-NEXT: addq %rdx, %r8
; WIN64-NEXT: adcq $0, %r8
-; WIN64-NEXT: movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT: movabsq $6148914691236517205, %rdx # imm = 0x5555555555555555
; WIN64-NEXT: movq %r8, %rax
; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: shrq %rdx
; WIN64-NEXT: leal (%rdx,%rdx,2), %eax
; WIN64-NEXT: subl %eax, %r8d
; WIN64-NEXT: andl $3, %ecx
@@ -469,16 +451,16 @@ define i128 @udiv_i128_3(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rcx
; X86-64-NEXT: addq %rsi, %rcx
; X86-64-NEXT: adcq $0, %rcx
-; X86-64-NEXT: movabsq $-6148914691236517205, %r8 # imm = 0xAAAAAAAAAAAAAAAB
+; X86-64-NEXT: movabsq $6148914691236517205, %rdx # imm = 0x5555555555555555
; X86-64-NEXT: movq %rcx, %rax
-; X86-64-NEXT: mulq %r8
-; X86-64-NEXT: shrq %rdx
+; X86-64-NEXT: mulq %rdx
; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax
; X86-64-NEXT: subq %rax, %rcx
; X86-64-NEXT: subq %rcx, %rdi
; X86-64-NEXT: sbbq $0, %rsi
; X86-64-NEXT: movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA
; X86-64-NEXT: imulq %rdi, %rcx
+; X86-64-NEXT: movabsq $-6148914691236517205, %r8 # imm = 0xAAAAAAAAAAAAAAAB
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %r8
; X86-64-NEXT: addq %rcx, %rdx
@@ -492,16 +474,16 @@ define i128 @udiv_i128_3(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %r9
; WIN64-NEXT: addq %rdx, %r9
; WIN64-NEXT: adcq $0, %r9
-; WIN64-NEXT: movabsq $-6148914691236517205, %r10 # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT: movabsq $6148914691236517205, %rdx # imm = 0x5555555555555555
; WIN64-NEXT: movq %r9, %rax
-; WIN64-NEXT: mulq %r10
-; WIN64-NEXT: shrq %rdx
+; WIN64-NEXT: mulq %rdx
; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax
; WIN64-NEXT: subq %rax, %r9
; WIN64-NEXT: subq %r9, %rcx
; WIN64-NEXT: sbbq $0, %r8
; WIN64-NEXT: movabsq $-6148914691236517206, %r9 # imm = 0xAAAAAAAAAAAAAAAA
; WIN64-NEXT: imulq %rcx, %r9
+; WIN64-NEXT: movabsq $-6148914691236517205, %r10 # imm = 0xAAAAAAAAAAAAAAAB
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
; WIN64-NEXT: addq %r9, %rdx
@@ -519,16 +501,16 @@ define i128 @udiv_i128_5(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rcx
; X86-64-NEXT: addq %rsi, %rcx
; X86-64-NEXT: adcq $0, %rcx
-; X86-64-NEXT: movabsq $-3689348814741910323, %r8 # imm = 0xCCCCCCCCCCCCCCCD
+; X86-64-NEXT: movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
; X86-64-NEXT: movq %rcx, %rax
-; X86-64-NEXT: mulq %r8
-; X86-64-NEXT: shrq $2, %rdx
+; X86-64-NEXT: mulq %rdx
; X86-64-NEXT: leaq (%rdx,%rdx,4), %rax
; X86-64-NEXT: subq %rax, %rcx
; X86-64-NEXT: subq %rcx, %rdi
; X86-64-NEXT: sbbq $0, %rsi
; X86-64-NEXT: movabsq $-3689348814741910324, %rcx # imm = 0xCCCCCCCCCCCCCCCC
; X86-64-NEXT: imulq %rdi, %rcx
+; X86-64-NEXT: movabsq $-3689348814741910323, %r8 # imm = 0xCCCCCCCCCCCCCCCD
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %r8
; X86-64-NEXT: addq %rcx, %rdx
@@ -542,16 +524,16 @@ define i128 @udiv_i128_5(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %r9
; WIN64-NEXT: addq %rdx, %r9
; WIN64-NEXT: adcq $0, %r9
-; WIN64-NEXT: movabsq $-3689348814741910323, %r10 # imm = 0xCCCCCCCCCCCCCCCD
+; WIN64-NEXT: movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
; WIN64-NEXT: movq %r9, %rax
-; WIN64-NEXT: mulq %r10
-; WIN64-NEXT: shrq $2, %rdx
+; WIN64-NEXT: mulq %rdx
; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax
; WIN64-NEXT: subq %rax, %r9
; WIN64-NEXT: subq %r9, %rcx
; WIN64-NEXT: sbbq $0, %r8
; WIN64-NEXT: movabsq $-3689348814741910324, %r9 # imm = 0xCCCCCCCCCCCCCCCC
; WIN64-NEXT: imulq %rcx, %r9
+; WIN64-NEXT: movabsq $-3689348814741910323, %r10 # imm = 0xCCCCCCCCCCCCCCCD
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
; WIN64-NEXT: addq %r9, %rdx
@@ -569,10 +551,9 @@ define i128 @udiv_i128_15(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rcx
; X86-64-NEXT: addq %rsi, %rcx
; X86-64-NEXT: adcq $0, %rcx
-; X86-64-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
+; X86-64-NEXT: movabsq $1229782938247303441, %rdx # imm = 0x1111111111111111
; X86-64-NEXT: movq %rcx, %rax
; X86-64-NEXT: mulq %rdx
-; X86-64-NEXT: shrq $3, %rdx
; X86-64-NEXT: leaq (%rdx,%rdx,4), %rax
; X86-64-NEXT: leaq (%rax,%rax,2), %rax
; X86-64-NEXT: subq %rax, %rcx
@@ -594,10 +575,9 @@ define i128 @udiv_i128_15(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %r9
; WIN64-NEXT: addq %rdx, %r9
; WIN64-NEXT: adcq $0, %r9
-; WIN64-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
+; WIN64-NEXT: movabsq $1229782938247303441, %rdx # imm = 0x1111111111111111
; WIN64-NEXT: movq %r9, %rax
; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: shrq $3, %rdx
; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax
; WIN64-NEXT: leaq (%rax,%rax,2), %rax
; WIN64-NEXT: subq %rax, %r9
@@ -623,18 +603,18 @@ define i128 @udiv_i128_17(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rcx
; X86-64-NEXT: addq %rsi, %rcx
; X86-64-NEXT: adcq $0, %rcx
-; X86-64-NEXT: movabsq $-1085102592571150095, %r8 # imm = 0xF0F0F0F0F0F0F0F1
+; X86-64-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
; X86-64-NEXT: movq %rcx, %rax
-; X86-64-NEXT: mulq %r8
+; X86-64-NEXT: mulq %rdx
; X86-64-NEXT: movq %rdx, %rax
-; X86-64-NEXT: andq $-16, %rax
-; X86-64-NEXT: shrq $4, %rdx
-; X86-64-NEXT: addq %rax, %rdx
-; X86-64-NEXT: subq %rdx, %rcx
+; X86-64-NEXT: shlq $4, %rax
+; X86-64-NEXT: addq %rdx, %rax
+; X86-64-NEXT: subq %rax, %rcx
; X86-64-NEXT: subq %rcx, %rdi
; X86-64-NEXT: sbbq $0, %rsi
; X86-64-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; X86-64-NEXT: imulq %rdi, %rcx
+; X86-64-NEXT: movabsq $-1085102592571150095, %r8 # imm = 0xF0F0F0F0F0F0F0F1
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %r8
; X86-64-NEXT: addq %rcx, %rdx
@@ -648,18 +628,18 @@ define i128 @udiv_i128_17(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %r9
; WIN64-NEXT: addq %rdx, %r9
; WIN64-NEXT: adcq $0, %r9
-; WIN64-NEXT: movabsq $-1085102592571150095, %r10 # imm = 0xF0F0F0F0F0F0F0F1
+; WIN64-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
; WIN64-NEXT: movq %r9, %rax
-; WIN64-NEXT: mulq %r10
+; WIN64-NEXT: mulq %rdx
; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: andq $-16, %rax
-; WIN64-NEXT: shrq $4, %rdx
-; WIN64-NEXT: addq %rax, %rdx
-; WIN64-NEXT: subq %rdx, %r9
+; WIN64-NEXT: shlq $4, %rax
+; WIN64-NEXT: addq %rdx, %rax
+; WIN64-NEXT: subq %rax, %r9
; WIN64-NEXT: subq %r9, %rcx
; WIN64-NEXT: sbbq $0, %r8
; WIN64-NEXT: movabsq $-1085102592571150096, %r9 # imm = 0xF0F0F0F0F0F0F0F0
; WIN64-NEXT: imulq %rcx, %r9
+; WIN64-NEXT: movabsq $-1085102592571150095, %r10 # imm = 0xF0F0F0F0F0F0F0F1
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
; WIN64-NEXT: addq %r9, %rdx
@@ -677,9 +657,8 @@ define i128 @udiv_i128_255(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: addq %rsi, %rax
; X86-64-NEXT: adcq $0, %rax
-; X86-64-NEXT: movabsq $-9187201950435737471, %rcx # imm = 0x8080808080808081
+; X86-64-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: shrq $7, %rdx
; X86-64-NEXT: movq %rdx, %rax
; X86-64-NEXT: shlq $8, %rax
; X86-64-NEXT: subq %rax, %rdx
@@ -704,9 +683,8 @@ define i128 @udiv_i128_255(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: addq %rdx, %rax
; WIN64-NEXT: adcq $0, %rax
-; WIN64-NEXT: movabsq $-9187201950435737471, %rdx # imm = 0x8080808080808081
+; WIN64-NEXT: movabsq $72340172838076673, %rdx # imm = 0x101010101010101
; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: shrq $7, %rdx
; WIN64-NEXT: movq %rdx, %rax
; WIN64-NEXT: shlq $8, %rax
; WIN64-NEXT: subq %rax, %rdx
@@ -735,18 +713,18 @@ define i128 @udiv_i128_257(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rcx
; X86-64-NEXT: addq %rsi, %rcx
; X86-64-NEXT: adcq $0, %rcx
-; X86-64-NEXT: movabsq $-71777214294589695, %r8 # imm = 0xFF00FF00FF00FF01
+; X86-64-NEXT: movabsq $71777214294589695, %rdx # imm = 0xFF00FF00FF00FF
; X86-64-NEXT: movq %rcx, %rax
-; X86-64-NEXT: mulq %r8
+; X86-64-NEXT: mulq %rdx
; X86-64-NEXT: movq %rdx, %rax
-; X86-64-NEXT: andq $-256, %rax
-; X86-64-NEXT: shrq $8, %rdx
-; X86-64-NEXT: addq %rax, %rdx
-; X86-64-NEXT: subq %rdx, %rcx
+; X86-64-NEXT: shlq $8, %rax
+; X86-64-NEXT: addq %rdx, %rax
+; X86-64-NEXT: subq %rax, %rcx
; X86-64-NEXT: subq %rcx, %rdi
; X86-64-NEXT: sbbq $0, %rsi
; X86-64-NEXT: movabsq $-71777214294589696, %rcx # imm = 0xFF00FF00FF00FF00
; X86-64-NEXT: imulq %rdi, %rcx
+; X86-64-NEXT: movabsq $-71777214294589695, %r8 # imm = 0xFF00FF00FF00FF01
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %r8
; X86-64-NEXT: addq %rcx, %rdx
@@ -760,18 +738,18 @@ define i128 @udiv_i128_257(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %r9
; WIN64-NEXT: addq %rdx, %r9
; WIN64-NEXT: adcq $0, %r9
-; WIN64-NEXT: movabsq $-71777214294589695, %r10 # imm = 0xFF00FF00FF00FF01
+; WIN64-NEXT: movabsq $71777214294589695, %rdx # imm = 0xFF00FF00FF00FF
; WIN64-NEXT: movq %r9, %rax
-; WIN64-NEXT: mulq %r10
+; WIN64-NEXT: mulq %rdx
; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: andq $-256, %rax
-; WIN64-NEXT: shrq $8, %rdx
-; WIN64-NEXT: addq %rax, %rdx
-; WIN64-NEXT: subq %rdx, %r9
+; WIN64-NEXT: shlq $8, %rax
+; WIN64-NEXT: addq %rdx, %rax
+; WIN64-NEXT: subq %rax, %r9
; WIN64-NEXT: subq %r9, %rcx
; WIN64-NEXT: sbbq $0, %r8
; WIN64-NEXT: movabsq $-71777214294589696, %r9 # imm = 0xFF00FF00FF00FF00
; WIN64-NEXT: imulq %rcx, %r9
+; WIN64-NEXT: movabsq $-71777214294589695, %r10 # imm = 0xFF00FF00FF00FF01
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
; WIN64-NEXT: addq %r9, %rdx
@@ -789,9 +767,8 @@ define i128 @udiv_i128_65535(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: addq %rsi, %rax
; X86-64-NEXT: adcq $0, %rax
-; X86-64-NEXT: movabsq $-9223231297218904063, %rcx # imm = 0x8000800080008001
+; X86-64-NEXT: movabsq $281479271743489, %rcx # imm = 0x1000100010001
; X86-64-NEXT: mulq %rcx
-; X86-64-NEXT: shrq $15, %rdx
; X86-64-NEXT: movq %rdx, %rax
; X86-64-NEXT: shlq $16, %rax
; X86-64-NEXT: subq %rax, %rdx
@@ -816,9 +793,8 @@ define i128 @udiv_i128_65535(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: addq %rdx, %rax
; WIN64-NEXT: adcq $0, %rax
-; WIN64-NEXT: movabsq $-9223231297218904063, %rdx # imm = 0x8000800080008001
+; WIN64-NEXT: movabsq $281479271743489, %rdx # imm = 0x1000100010001
; WIN64-NEXT: mulq %rdx
-; WIN64-NEXT: shrq $15, %rdx
; WIN64-NEXT: movq %rdx, %rax
; WIN64-NEXT: shlq $16, %rax
; WIN64-NEXT: subq %rax, %rdx
@@ -847,18 +823,18 @@ define i128 @udiv_i128_65537(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rcx
; X86-64-NEXT: addq %rsi, %rcx
; X86-64-NEXT: adcq $0, %rcx
-; X86-64-NEXT: movabsq $-281470681808895, %r8 # imm = 0xFFFF0000FFFF0001
+; X86-64-NEXT: movabsq $281470681808895, %rdx # imm = 0xFFFF0000FFFF
; X86-64-NEXT: movq %rcx, %rax
-; X86-64-NEXT: mulq %r8
+; X86-64-NEXT: mulq %rdx
; X86-64-NEXT: movq %rdx, %rax
-; X86-64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000
-; X86-64-NEXT: shrq $16, %rdx
-; X86-64-NEXT: addq %rax, %rdx
-; X86-64-NEXT: subq %rdx, %rcx
+; X86-64-NEXT: shlq $16, %rax
+; X86-64-NEXT: addq %rdx, %rax
+; X86-64-NEXT: subq %rax, %rcx
; X86-64-NEXT: subq %rcx, %rdi
; X86-64-NEXT: sbbq $0, %rsi
; X86-64-NEXT: movabsq $-281470681808896, %rcx # imm = 0xFFFF0000FFFF0000
; X86-64-NEXT: imulq %rdi, %rcx
+; X86-64-NEXT: movabsq $-281470681808895, %r8 # imm = 0xFFFF0000FFFF0001
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %r8
; X86-64-NEXT: addq %rcx, %rdx
@@ -872,18 +848,18 @@ define i128 @udiv_i128_65537(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %r9
; WIN64-NEXT: addq %rdx, %r9
; WIN64-NEXT: adcq $0, %r9
-; WIN64-NEXT: movabsq $-281470681808895, %r10 # imm = 0xFFFF0000FFFF0001
+; WIN64-NEXT: movabsq $281470681808895, %rdx # imm = 0xFFFF0000FFFF
; WIN64-NEXT: movq %r9, %rax
-; WIN64-NEXT: mulq %r10
+; WIN64-NEXT: mulq %rdx
; WIN64-NEXT: movq %rdx, %rax
-; WIN64-NEXT: andq $-65536, %rax # imm = 0xFFFF0000
-; WIN64-NEXT: shrq $16, %rdx
-; WIN64-NEXT: addq %rax, %rdx
-; WIN64-NEXT: subq %rdx, %r9
+; WIN64-NEXT: shlq $16, %rax
+; WIN64-NEXT: addq %rdx, %rax
+; WIN64-NEXT: subq %rax, %r9
; WIN64-NEXT: subq %r9, %rcx
; WIN64-NEXT: sbbq $0, %r8
; WIN64-NEXT: movabsq $-281470681808896, %r9 # imm = 0xFFFF0000FFFF0000
; WIN64-NEXT: imulq %rcx, %r9
+; WIN64-NEXT: movabsq $-281470681808895, %r10 # imm = 0xFFFF0000FFFF0001
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
; WIN64-NEXT: addq %r9, %rdx
@@ -903,16 +879,16 @@ define i128 @udiv_i128_12(i128 %x) nounwind {
; X86-64-NEXT: movq %rdi, %rcx
; X86-64-NEXT: addq %rsi, %rcx
; X86-64-NEXT: adcq $0, %rcx
-; X86-64-NEXT: movabsq $-6148914691236517205, %r8 # imm = 0xAAAAAAAAAAAAAAAB
+; X86-64-NEXT: movabsq $6148914691236517205, %rdx # imm = 0x5555555555555555
; X86-64-NEXT: movq %rcx, %rax
-; X86-64-NEXT: mulq %r8
-; X86-64-NEXT: shrq %rdx
+; X86-64-NEXT: mulq %rdx
; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax
; X86-64-NEXT: subq %rax, %rcx
; X86-64-NEXT: subq %rcx, %rdi
; X86-64-NEXT: sbbq $0, %rsi
; X86-64-NEXT: movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA
; X86-64-NEXT: imulq %rdi, %rcx
+; X86-64-NEXT: movabsq $-6148914691236517205, %r8 # imm = 0xAAAAAAAAAAAAAAAB
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: mulq %r8
; X86-64-NEXT: addq %rcx, %rdx
@@ -928,16 +904,16 @@ define i128 @udiv_i128_12(i128 %x) nounwind {
; WIN64-NEXT: movq %rcx, %r9
; WIN64-NEXT: addq %r8, %r9
; WIN64-NEXT: adcq $0, %r9
-; WIN64-NEXT: movabsq $-6148914691236517205, %r10 # imm = 0xAAAAAAAAAAAAAAAB
+; WIN64-NEXT: movabsq $6148914691236517205, %rdx # imm = 0x5555555555555555
; WIN64-NEXT: movq %r9, %rax
-; WIN64-NEXT: mulq %r10
-; WIN64-NEXT: shrq %rdx
+; WIN64-NEXT: mulq %rdx
; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax
; WIN64-NEXT: subq %rax, %r9
; WIN64-NEXT: subq %r9, %rcx
; WIN64-NEXT: sbbq $0, %r8
; WIN64-NEXT: movabsq $-6148914691236517206, %r9 # imm = 0xAAAAAAAAAAAAAAAA
; WIN64-NEXT: imulq %rcx, %r9
+; WIN64-NEXT: movabsq $-6148914691236517205, %r10 # imm = 0xAAAAAAAAAAAAAAAB
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: mulq %r10
; WIN64-NEXT: addq %r9, %rdx
diff --git a/llvm/test/CodeGen/X86/divrem-by-select.ll b/llvm/test/CodeGen/X86/divrem-by-select.ll
index f9582bb7343ba..d9cb6a506e90d 100644
--- a/llvm/test/CodeGen/X86/divrem-by-select.ll
+++ b/llvm/test/CodeGen/X86/divrem-by-select.ll
@@ -28,14 +28,13 @@ define <2 x i64> @udiv_identity_const(<2 x i1> %c, <2 x i64> %x) {
; CHECK-X64-V4-NEXT: vpsllq $63, %xmm0, %xmm0
; CHECK-X64-V4-NEXT: vpmovq2m %xmm0, %k1
; CHECK-X64-V4-NEXT: vpextrq $1, %xmm1, %rdx
-; CHECK-X64-V4-NEXT: movabsq $3353953467947191203, %rax # imm = 0x2E8BA2E8BA2E8BA3
+; CHECK-X64-V4-NEXT: movabsq $1676976733973595602, %rax # imm = 0x1745D1745D1745D2
; CHECK-X64-V4-NEXT: mulxq %rax, %rcx, %rcx
; CHECK-X64-V4-NEXT: vmovq %rcx, %xmm0
; CHECK-X64-V4-NEXT: vmovq %xmm1, %rdx
; CHECK-X64-V4-NEXT: mulxq %rax, %rax, %rax
; CHECK-X64-V4-NEXT: vmovq %rax, %xmm2
-; CHECK-X64-V4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; CHECK-X64-V4-NEXT: vpsrlq $1, %xmm0, %xmm1 {%k1}
+; CHECK-X64-V4-NEXT: vpunpcklqdq {{.*#+}} xmm1 {%k1} = xmm2[0],xmm0[0]
; CHECK-X64-V4-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-X64-V4-NEXT: retq
%d = select <2 x i1> %c, <2 x i64> <i64 11, i64 11>, <2 x i64> <i64 1, i64 1>
@@ -70,14 +69,14 @@ define <2 x i64> @udiv_identity_const_todo_getter_nonzero(<2 x i1> %c, <2 x i64>
; CHECK-X64-V4-NEXT: vpsllq $63, %xmm0, %xmm0
; CHECK-X64-V4-NEXT: vpmovq2m %xmm0, %k1
; CHECK-X64-V4-NEXT: vpextrq $1, %xmm1, %rdx
-; CHECK-X64-V4-NEXT: movabsq $-3689348814741910323, %rax # imm = 0xCCCCCCCCCCCCCCCD
+; CHECK-X64-V4-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; CHECK-X64-V4-NEXT: mulxq %rax, %rcx, %rcx
; CHECK-X64-V4-NEXT: vmovq %rcx, %xmm0
; CHECK-X64-V4-NEXT: vmovq %xmm1, %rdx
; CHECK-X64-V4-NEXT: mulxq %rax, %rax, %rax
; CHECK-X64-V4-NEXT: vmovq %rax, %xmm2
; CHECK-X64-V4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; CHECK-X64-V4-NEXT: vpsrlq $3, %xmm0, %xmm1 {%k1}
+; CHECK-X64-V4-NEXT: vpsrlq $1, %xmm0, %xmm1 {%k1}
; CHECK-X64-V4-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-X64-V4-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/freeze.ll b/llvm/test/CodeGen/X86/freeze.ll
index 3196f8177cc9b..3d2b1360121f1 100644
--- a/llvm/test/CodeGen/X86/freeze.ll
+++ b/llvm/test/CodeGen/X86/freeze.ll
@@ -127,11 +127,10 @@ define i32 @freeze_zext(i64 %a) nounwind {
; X86ASM: # %bb.0: # %entry
; X86ASM-NEXT: movq %rdi, %rax
; X86ASM-NEXT: movl %eax, %ecx
-; X86ASM-NEXT: movl $3435973837, %edx # imm = 0xCCCCCCCD
-; X86ASM-NEXT: imulq %rcx, %rdx
-; X86ASM-NEXT: shrq $35, %rdx
-; X86ASM-NEXT: addl %edx, %edx
-; X86ASM-NEXT: leal (%rdx,%rdx,4), %ecx
+; X86ASM-NEXT: imulq $858993459, %rcx, %rcx # imm = 0x33333333
+; X86ASM-NEXT: shrq $33, %rcx
+; X86ASM-NEXT: addl %ecx, %ecx
+; X86ASM-NEXT: leal (%rcx,%rcx,4), %ecx
; X86ASM-NEXT: subl %ecx, %eax
; X86ASM-NEXT: # kill: def $eax killed $eax killed $rax
; X86ASM-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/known-bits.ll b/llvm/test/CodeGen/X86/known-bits.ll
index 9741f6f0a5e2d..0a337216c476b 100644
--- a/llvm/test/CodeGen/X86/known-bits.ll
+++ b/llvm/test/CodeGen/X86/known-bits.ll
@@ -8,9 +8,10 @@ define void @knownbits_zext_in_reg(ptr) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzbl (%eax), %ecx
-; X86-NEXT: imull $101, %ecx, %eax
-; X86-NEXT: shrl $14, %eax
-; X86-NEXT: imull $177, %ecx, %edx
+; X86-NEXT: leal (%ecx,%ecx,4), %eax
+; X86-NEXT: leal (%eax,%eax,4), %eax
+; X86-NEXT: shrl $12, %eax
+; X86-NEXT: imull $175, %ecx, %edx
; X86-NEXT: shrl $14, %edx
; X86-NEXT: movzbl %al, %ecx
; X86-NEXT: xorl %ebx, %ebx
@@ -31,9 +32,10 @@ define void @knownbits_zext_in_reg(ptr) nounwind {
; X64-LABEL: knownbits_zext_in_reg:
; X64: # %bb.0: # %BB
; X64-NEXT: movzbl (%rdi), %eax
-; X64-NEXT: imull $101, %eax, %ecx
-; X64-NEXT: shrl $14, %ecx
-; X64-NEXT: imull $177, %eax, %edx
+; X64-NEXT: leal (%rax,%rax,4), %ecx
+; X64-NEXT: leal (%rcx,%rcx,4), %ecx
+; X64-NEXT: shrl $12, %ecx
+; X64-NEXT: imull $175, %eax, %edx
; X64-NEXT: shrl $14, %edx
; X64-NEXT: movzbl %cl, %ecx
; X64-NEXT: xorl %esi, %esi
diff --git a/llvm/test/CodeGen/X86/known-pow2.ll b/llvm/test/CodeGen/X86/known-pow2.ll
index e183bbc15617d..ac4323351b0ca 100644
--- a/llvm/test/CodeGen/X86/known-pow2.ll
+++ b/llvm/test/CodeGen/X86/known-pow2.ll
@@ -24,23 +24,37 @@ define <4 x i32> @pow2_non_splat_vec(<4 x i32> %x) {
define <4 x i32> @pow2_non_splat_vec_fail0(<4 x i32> %x) {
; CHECK-LABEL: pow2_non_splat_vec_fail0:
; CHECK: # %bb.0:
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [954437177,1073741824,268435456,67108864]
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1908874353,u,2147483647,u]
; CHECK-NEXT: pmuludq %xmm0, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
-; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; CHECK-NEXT: movdqa %xmm1, %xmm3
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: psubd %xmm1, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-NEXT: paddd %xmm1, %xmm2
+; CHECK-NEXT: movdqa %xmm2, %xmm1
+; CHECK-NEXT: psrld $5, %xmm1
+; CHECK-NEXT: movdqa %xmm2, %xmm3
; CHECK-NEXT: psrld $1, %xmm3
-; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
+; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; CHECK-NEXT: psrld $2, %xmm2
+; CHECK-NEXT: psrld $3, %xmm3
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-NEXT: psubd %xmm1, %xmm0
+; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; CHECK-NEXT: psubd %xmm2, %xmm0
; CHECK-NEXT: retq
%r = urem <4 x i32> %x, <i32 9, i32 4, i32 16, i32 64>
ret <4 x i32> %r
diff --git a/llvm/test/CodeGen/X86/load-scalar-as-vector.ll b/llvm/test/CodeGen/X86/load-scalar-as-vector.ll
index 3edbcd1fe18eb..ea5b241f3184d 100644
--- a/llvm/test/CodeGen/X86/load-scalar-as-vector.ll
+++ b/llvm/test/CodeGen/X86/load-scalar-as-vector.ll
@@ -470,21 +470,17 @@ define <4 x i32> @udiv_op0_constant(ptr %p) nounwind {
define <2 x i64> @udiv_op1_constant(ptr %p) nounwind {
; SSE-LABEL: udiv_op1_constant:
; SSE: # %bb.0:
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: movabsq $-4392081922311798003, %rcx # imm = 0xC30C30C30C30C30D
-; SSE-NEXT: mulq %rcx
-; SSE-NEXT: shrq $4, %rdx
+; SSE-NEXT: movabsq $-4392081922311798005, %rax # imm = 0xC30C30C30C30C30B
+; SSE-NEXT: mulq (%rdi)
+; SSE-NEXT: shrq $5, %rdx
; SSE-NEXT: movq %rdx, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: udiv_op1_constant:
; AVX: # %bb.0:
-; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: movabsq $-4392081922311798003, %rcx # imm = 0xC30C30C30C30C30D
-; AVX-NEXT: mulq %rcx
-; AVX-NEXT: shrq $4, %rdx
+; AVX-NEXT: movabsq $-4392081922311798005, %rax # imm = 0xC30C30C30C30C30B
+; AVX-NEXT: mulq (%rdi)
+; AVX-NEXT: shrq $5, %rdx
; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: retq
%x = load i64, ptr %p
@@ -519,11 +515,8 @@ define <16 x i8> @urem_op1_constant(ptr %p) nounwind {
; SSE-LABEL: urem_op1_constant:
; SSE: # %bb.0:
; SSE-NEXT: movzbl (%rdi), %eax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: shrb %cl
-; SSE-NEXT: movzbl %cl, %ecx
-; SSE-NEXT: imull $49, %ecx, %ecx
-; SSE-NEXT: shrl $10, %ecx
+; SSE-NEXT: imull $97, %eax, %ecx
+; SSE-NEXT: shrl $12, %ecx
; SSE-NEXT: imull $42, %ecx, %ecx
; SSE-NEXT: subb %cl, %al
; SSE-NEXT: movzbl %al, %eax
@@ -533,11 +526,8 @@ define <16 x i8> @urem_op1_constant(ptr %p) nounwind {
; AVX-LABEL: urem_op1_constant:
; AVX: # %bb.0:
; AVX-NEXT: movzbl (%rdi), %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrb %cl
-; AVX-NEXT: movzbl %cl, %ecx
-; AVX-NEXT: imull $49, %ecx, %ecx
-; AVX-NEXT: shrl $10, %ecx
+; AVX-NEXT: imull $97, %eax, %ecx
+; AVX-NEXT: shrl $12, %ecx
; AVX-NEXT: imull $42, %ecx, %ecx
; AVX-NEXT: subb %cl, %al
; AVX-NEXT: vmovd %eax, %xmm0
diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
index 9e398096bfcc5..09717fb0bf37c 100644
--- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
+++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
@@ -284,7 +284,7 @@ define <4 x i1> @p8_vector_urem_by_const__nonsplat_undef3(<4 x i32> %x, <4 x i32
; SSE2-LABEL: p8_vector_urem_by_const__nonsplat_undef3:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1431655765,1431655765,1431655765,1431655765]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
@@ -292,7 +292,7 @@ define <4 x i1> @p8_vector_urem_by_const__nonsplat_undef3(<4 x i32> %x, <4 x i32
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: psrld $2, %xmm2
+; SSE2-NEXT: psrld $1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
@@ -309,12 +309,12 @@ define <4 x i1> @p8_vector_urem_by_const__nonsplat_undef3(<4 x i32> %x, <4 x i32
; SSE4: # %bb.0:
; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
+; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [1431655765,1431655765,1431655765,1431655765]
; SSE4-NEXT: pmuludq %xmm2, %xmm1
; SSE4-NEXT: pmuludq %xmm0, %xmm2
; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; SSE4-NEXT: psrld $2, %xmm2
+; SSE4-NEXT: psrld $1, %xmm2
; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE4-NEXT: psubd %xmm2, %xmm0
; SSE4-NEXT: pxor %xmm1, %xmm1
@@ -326,12 +326,12 @@ define <4 x i1> @p8_vector_urem_by_const__nonsplat_undef3(<4 x i32> %x, <4 x i32
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [128,128,128,128]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1431655765,1431655765,1431655765,1431655765]
; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
+; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6]
; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/pr35636.ll b/llvm/test/CodeGen/X86/pr35636.ll
index 0b7d64f38c780..34f0ade956c3b 100644
--- a/llvm/test/CodeGen/X86/pr35636.ll
+++ b/llvm/test/CodeGen/X86/pr35636.ll
@@ -5,10 +5,10 @@
define void @_Z15uint64_to_asciimPc(i64 %arg) {
; HSW-LABEL: _Z15uint64_to_asciimPc:
; HSW: # %bb.0: # %bb
-; HSW-NEXT: movabsq $811296384146066817, %rax # imm = 0xB424DC35095CD81
+; HSW-NEXT: movabsq $6490371073168534535, %rax # imm = 0x5A126E1A84AE6C07
; HSW-NEXT: movq %rdi, %rdx
; HSW-NEXT: mulxq %rax, %rax, %rax
-; HSW-NEXT: shrq $42, %rax
+; HSW-NEXT: shrq $45, %rax
; HSW-NEXT: imulq $281474977, %rax, %rax # imm = 0x10C6F7A1
; HSW-NEXT: shrq $20, %rax
; HSW-NEXT: leal (%rax,%rax,4), %eax
@@ -22,10 +22,10 @@ define void @_Z15uint64_to_asciimPc(i64 %arg) {
;
; ZN-LABEL: _Z15uint64_to_asciimPc:
; ZN: # %bb.0: # %bb
-; ZN-NEXT: movabsq $811296384146066817, %rax # imm = 0xB424DC35095CD81
+; ZN-NEXT: movabsq $6490371073168534535, %rax # imm = 0x5A126E1A84AE6C07
; ZN-NEXT: movq %rdi, %rdx
; ZN-NEXT: mulxq %rax, %rax, %rax
-; ZN-NEXT: shrq $42, %rax
+; ZN-NEXT: shrq $45, %rax
; ZN-NEXT: imulq $281474977, %rax, %rax # imm = 0x10C6F7A1
; ZN-NEXT: shrq $20, %rax
; ZN-NEXT: leal 5(%rax,%rax,4), %eax
diff --git a/llvm/test/CodeGen/X86/pr38217.ll b/llvm/test/CodeGen/X86/pr38217.ll
index f1538f3598aec..ce3f8805a6083 100644
--- a/llvm/test/CodeGen/X86/pr38217.ll
+++ b/llvm/test/CodeGen/X86/pr38217.ll
@@ -10,13 +10,13 @@ define dso_local void @_Z12d2s_bufferedmPc(i64 %arg, ptr nocapture %arg1) {
; CHECK-NEXT: jb .LBB0_3
; CHECK-NEXT: # %bb.1: # %bb2.preheader
; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: movabsq $3777893186295716171, %r8 # imm = 0x346DC5D63886594B
+; CHECK-NEXT: movabsq $-3335171328526686933, %r8 # imm = 0xD1B71758E219652B
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_2: # %bb2
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: mulq %r8
-; CHECK-NEXT: shrq $11, %rdx
+; CHECK-NEXT: shrq $13, %rdx
; CHECK-NEXT: imulq $10000, %rdx, %rax # imm = 0x2710
; CHECK-NEXT: movq %rdi, %r9
; CHECK-NEXT: subq %rax, %r9
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
index a950a13b0d8ca..3a7a01cb88124 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
@@ -12,44 +12,26 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) {
; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX256BW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX256BW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX256BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX256BW-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; AVX256BW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX256BW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
-; AVX256BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
-; AVX256BW-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX256BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; AVX256BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX256BW-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX256BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; AVX256BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX256BW-NEXT: vpmullw %ymm3, %ymm0, %ymm0
+; AVX256BW-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX256BW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX256BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_div7_32i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BWVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX512BWVL-NEXT: vpsrlw $8, %zmm1, %zmm1
-; AVX512BWVL-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BWVL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BWVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512BWVL-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; AVX512BW-LABEL: test_div7_32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
%res = udiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
ret <32 x i8> %res
diff --git a/llvm/test/CodeGen/X86/rem.ll b/llvm/test/CodeGen/X86/rem.ll
index 893b49f9a0179..6890240f561be 100644
--- a/llvm/test/CodeGen/X86/rem.ll
+++ b/llvm/test/CodeGen/X86/rem.ll
@@ -40,10 +40,9 @@ define i32 @test3(i32 %X) {
; CHECK-LABEL: test3:
; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl $-2139062143, %edx # imm = 0x80808081
+; CHECK-NEXT: movl $16843009, %edx # imm = 0x1010101
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: mull %edx
-; CHECK-NEXT: shrl $7, %edx
; CHECK-NEXT: movl %edx, %eax
; CHECK-NEXT: shll $8, %eax
; CHECK-NEXT: subl %eax, %edx
diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll
index 1ead3f98ab5d6..b96a644b803b9 100644
--- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll
+++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll
@@ -83,14 +83,13 @@ define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
; X64-LABEL: vrolq_extract_udiv:
; X64: # %bb.0:
; X64-NEXT: vpextrq $1, %xmm0, %rax
-; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; X64-NEXT: mulq %rcx
; X64-NEXT: vmovq %rdx, %xmm1
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: mulq %rcx
; X64-NEXT: vmovq %rdx, %xmm0
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-NEXT: vpsrlq $1, %xmm0, %xmm0
; X64-NEXT: vprolq $57, %xmm0, %xmm0
; X64-NEXT: retq
%lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
@@ -265,7 +264,7 @@ define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
; X64-LABEL: no_extract_udiv:
; X64: # %bb.0:
; X64-NEXT: vpextrq $1, %xmm0, %rcx
-; X64-NEXT: movabsq $-6148914691236517205, %rdi # imm = 0xAAAAAAAAAAAAAAAB
+; X64-NEXT: movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %rdi
; X64-NEXT: vmovq %rdx, %xmm1
@@ -274,14 +273,19 @@ define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
; X64-NEXT: mulq %rdi
; X64-NEXT: vmovq %rdx, %xmm0
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-NEXT: vpsrlq $1, %xmm0, %xmm0
-; X64-NEXT: movabsq $-6180857105216966645, %rdi # imm = 0xAA392F35DC17F00B
+; X64-NEXT: movabsq $-6180857105216966647, %rdi # imm = 0xAA392F35DC17F009
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %rdi
-; X64-NEXT: vmovq %rdx, %xmm1
+; X64-NEXT: subq %rdx, %rcx
+; X64-NEXT: shrq %rcx
+; X64-NEXT: addq %rdx, %rcx
+; X64-NEXT: vmovq %rcx, %xmm1
; X64-NEXT: movq %rsi, %rax
; X64-NEXT: mulq %rdi
-; X64-NEXT: vmovq %rdx, %xmm2
+; X64-NEXT: subq %rdx, %rsi
+; X64-NEXT: shrq %rsi
+; X64-NEXT: addq %rdx, %rsi
+; X64-NEXT: vmovq %rsi, %xmm2
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT: vpsrlq $9, %xmm1, %xmm1
; X64-NEXT: vpsllq $56, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll
index 8f046a4f5aea5..b86e1d6674340 100644
--- a/llvm/test/CodeGen/X86/rotate-extract.ll
+++ b/llvm/test/CodeGen/X86/rotate-extract.ll
@@ -82,17 +82,16 @@ define i8 @rolb_extract_udiv(i8 %i) nounwind {
; X86-LABEL: rolb_extract_udiv:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull $171, %eax, %eax
-; X86-NEXT: shrl $9, %eax
-; X86-NEXT: rolb $4, %al
-; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: imull $85, %eax, %eax
+; X86-NEXT: rolb $4, %ah
+; X86-NEXT: movb %ah, %al
; X86-NEXT: retl
;
; X64-LABEL: rolb_extract_udiv:
; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: imull $171, %eax, %eax
-; X64-NEXT: shrl $9, %eax
+; X64-NEXT: imull $85, %eax, %eax
+; X64-NEXT: shrl $8, %eax
; X64-NEXT: rolb $4, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
@@ -224,32 +223,23 @@ define i8 @no_extract_udiv(i8 %i) nounwind {
; X86-LABEL: no_extract_udiv:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: imull $171, %eax, %ecx
-; X86-NEXT: imull $79, %eax, %edx
-; X86-NEXT: subb %dh, %al
-; X86-NEXT: shrb %al
-; X86-NEXT: addb %dh, %al
-; X86-NEXT: shrb $5, %al
-; X86-NEXT: shlb $3, %ch
-; X86-NEXT: orb %al, %ch
-; X86-NEXT: andb $-9, %ch
-; X86-NEXT: movb %ch, %al
+; X86-NEXT: imull $85, %eax, %ecx
+; X86-NEXT: imull $83, %eax, %eax
+; X86-NEXT: shlb $4, %ch
+; X86-NEXT: shrl $12, %eax
+; X86-NEXT: orb %ch, %al
+; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
;
; X64-LABEL: no_extract_udiv:
; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %ecx
-; X64-NEXT: imull $171, %ecx, %eax
+; X64-NEXT: imull $85, %ecx, %eax
; X64-NEXT: shrl $8, %eax
-; X64-NEXT: imull $79, %ecx, %edx
-; X64-NEXT: shrl $8, %edx
-; X64-NEXT: subb %dl, %cl
-; X64-NEXT: shrb %cl
-; X64-NEXT: addb %dl, %cl
-; X64-NEXT: shrb $5, %cl
-; X64-NEXT: shlb $3, %al
+; X64-NEXT: imull $83, %ecx, %ecx
+; X64-NEXT: shrl $12, %ecx
+; X64-NEXT: shlb $4, %al
; X64-NEXT: orb %cl, %al
-; X64-NEXT: andb $-9, %al
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%lhs_div = udiv i8 %i, 3
diff --git a/llvm/test/CodeGen/X86/urem-i8-constant.ll b/llvm/test/CodeGen/X86/urem-i8-constant.ll
index ae218405c0ef0..493b69fbf2937 100644
--- a/llvm/test/CodeGen/X86/urem-i8-constant.ll
+++ b/llvm/test/CodeGen/X86/urem-i8-constant.ll
@@ -7,8 +7,8 @@ define i8 @foo(i8 %tmp325) {
; CHECK-LABEL: foo:
; CHECK: # %bb.0:
; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: imull $111, %eax, %ecx
-; CHECK-NEXT: shrl $12, %ecx
+; CHECK-NEXT: imull $55, %eax, %ecx
+; CHECK-NEXT: shrl $11, %ecx
; CHECK-NEXT: leal (%ecx,%ecx,8), %edx
; CHECK-NEXT: leal (%ecx,%edx,4), %ecx
; CHECK-NEXT: subb %cl, %al
diff --git a/llvm/test/CodeGen/X86/urem-lkk.ll b/llvm/test/CodeGen/X86/urem-lkk.ll
index 573f875544cd4..00d7b09d013fe 100644
--- a/llvm/test/CodeGen/X86/urem-lkk.ll
+++ b/llvm/test/CodeGen/X86/urem-lkk.ll
@@ -6,13 +6,9 @@ define i32 @fold_urem_positive_odd(i32 %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: imulq $1491936009, %rcx, %rcx # imm = 0x58ED2309
-; CHECK-NEXT: shrq $32, %rcx
-; CHECK-NEXT: movl %edi, %edx
-; CHECK-NEXT: subl %ecx, %edx
-; CHECK-NEXT: shrl %edx
-; CHECK-NEXT: addl %ecx, %edx
-; CHECK-NEXT: shrl $6, %edx
+; CHECK-NEXT: movl $2893451651, %edx # imm = 0xAC769183
+; CHECK-NEXT: imulq %rcx, %rdx
+; CHECK-NEXT: shrq $38, %rdx
; CHECK-NEXT: imull $95, %edx, %ecx
; CHECK-NEXT: subl %ecx, %eax
; CHECK-NEXT: retq
@@ -26,7 +22,7 @@ define i32 @fold_urem_positive_even(i32 %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: movl $4149100483, %edx # imm = 0xF74E3FC3
+; CHECK-NEXT: movl $4149100481, %edx # imm = 0xF74E3FC1
; CHECK-NEXT: imulq %rcx, %rdx
; CHECK-NEXT: shrq $42, %rdx
; CHECK-NEXT: imull $1060, %edx, %ecx # imm = 0x424
@@ -41,17 +37,14 @@ define i32 @fold_urem_positive_even(i32 %x) {
define i32 @combine_urem_udiv(i32 %x) {
; CHECK-LABEL: combine_urem_udiv:
; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: imulq $1491936009, %rax, %rcx # imm = 0x58ED2309
-; CHECK-NEXT: shrq $32, %rcx
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: subl %ecx, %eax
-; CHECK-NEXT: shrl %eax
-; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: shrl $6, %eax
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: movl $2893451651, %eax # imm = 0xAC769183
+; CHECK-NEXT: imulq %rcx, %rax
+; CHECK-NEXT: shrq $38, %rax
; CHECK-NEXT: imull $95, %eax, %ecx
; CHECK-NEXT: subl %ecx, %edi
; CHECK-NEXT: addl %edi, %eax
+; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-NEXT: retq
%1 = urem i32 %x, 95
%2 = udiv i32 %x, 95
@@ -93,12 +86,14 @@ define i32 @dont_fold_urem_i32_umax(i32 %x) {
define i64 @dont_fold_urem_i64(i64 %x) {
; CHECK-LABEL: dont_fold_urem_i64:
; CHECK: # %bb.0:
+; CHECK-NEXT: movabsq $188232082384791343, %rcx # imm = 0x29CBC14E5E0A72F
; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: movabsq $6023426636313322977, %rcx # imm = 0x5397829CBC14E5E1
; CHECK-NEXT: mulq %rcx
-; CHECK-NEXT: shrq $4, %rdx
-; CHECK-NEXT: imulq $98, %rdx, %rax
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: subq %rdx, %rax
+; CHECK-NEXT: shrq %rax
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: imulq $98, %rax, %rax
; CHECK-NEXT: subq %rax, %rdi
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
index 2166e43fc4286..fb411b35e1b6a 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
@@ -246,21 +246,18 @@ define <4 x i32> @test_urem_even_neg100(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_odd_undef1:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [171798692,171798692,171798692,171798692]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psrld $3, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [25,25,25,25]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [25,25,25,25]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
@@ -271,12 +268,11 @@ define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-SSE41-LABEL: test_urem_odd_undef1:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [171798692,171798692,171798692,171798692]
; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: psrld $3, %xmm2
; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
@@ -287,12 +283,11 @@ define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: test_urem_odd_undef1:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [171798692,171798692,171798692,171798692]
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -303,12 +298,11 @@ define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX2-LABEL: test_urem_odd_undef1:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [171798692,171798692,171798692,171798692]
; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpsrld $3, %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [25,25,25,25]
; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -320,12 +314,11 @@ define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX512VL-LABEL: test_urem_odd_undef1:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [171798692,171798692,171798692,171798692]
; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX512VL-NEXT: vpsrld $3, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -341,7 +334,7 @@ define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind {
define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_undef1:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [687194767,687194767,687194767,687194767]
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
@@ -349,15 +342,19 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psrld $5, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [100,100,100,100]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psubd %xmm2, %xmm1
+; CHECK-SSE2-NEXT: psrld $1, %xmm1
+; CHECK-SSE2-NEXT: paddd %xmm2, %xmm1
+; CHECK-SSE2-NEXT: psrld $4, %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [100,100,100,100]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE2-NEXT: psrld $31, %xmm0
@@ -366,14 +363,18 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-SSE41-LABEL: test_urem_even_undef1:
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [687194767,687194767,687194767,687194767]
; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1
; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: psrld $5, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT: psubd %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrld $1, %xmm1
+; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1
+; CHECK-SSE41-NEXT: psrld $4, %xmm1
+; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
@@ -382,12 +383,15 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX1-LABEL: test_urem_even_undef1:
; CHECK-AVX1: # %bb.0:
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [687194767,687194767,687194767,687194767]
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; CHECK-AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
+; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX1-NEXT: vpsrld $4, %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -398,12 +402,15 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX2-LABEL: test_urem_even_undef1:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [687194767,687194767,687194767,687194767]
; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpsrld $5, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpsrld $1, %xmm2, %xmm2
+; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX2-NEXT: vpsrld $4, %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100]
; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -415,12 +422,15 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind {
; CHECK-AVX512VL-LABEL: test_urem_even_undef1:
; CHECK-AVX512VL: # %bb.0:
; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
+; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [687194767,687194767,687194767,687194767]
; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX512VL-NEXT: vpsrld $5, %xmm1, %xmm1
+; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpsrld $1, %xmm2, %xmm2
+; CHECK-AVX512VL-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; CHECK-AVX512VL-NEXT: vpsrld $4, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
index 94c7892795c2b..b825d1a6931c8 100644
--- a/llvm/test/CodeGen/X86/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
@@ -6,81 +6,77 @@
define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; SSE-LABEL: fold_urem_vec_1:
; SSE: # %bb.0:
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: shrl $2, %ecx
-; SSE-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211
-; SSE-NEXT: shrl $19, %ecx
-; SSE-NEXT: imull $124, %ecx, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: movd %xmm0, %ecx
-; SSE-NEXT: movzwl %cx, %edx
-; SSE-NEXT: imull $44151, %edx, %edx # imm = 0xAC77
-; SSE-NEXT: shrl $22, %edx
-; SSE-NEXT: imull $95, %edx, %edx
-; SSE-NEXT: subl %edx, %ecx
-; SSE-NEXT: movd %ecx, %xmm1
-; SSE-NEXT: pinsrw $1, %eax, %xmm1
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: shrl %ecx
-; SSE-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73
-; SSE-NEXT: shrl $17, %ecx
-; SSE-NEXT: imull $98, %ecx, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pinsrw $2, %eax, %xmm1
; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: imull $1373, %eax, %ecx # imm = 0x55D
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: shll $6, %ecx
+; SSE-NEXT: leal (%rcx,%rax,2), %ecx
; SSE-NEXT: shrl $16, %ecx
-; SSE-NEXT: movl %eax, %edx
-; SSE-NEXT: subl %ecx, %edx
-; SSE-NEXT: movzwl %dx, %edx
-; SSE-NEXT: shrl %edx
-; SSE-NEXT: addl %ecx, %edx
-; SSE-NEXT: shrl $9, %edx
-; SSE-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB
+; SSE-NEXT: imull $1003, %ecx, %ecx # imm = 0x3EB
; SSE-NEXT: subl %ecx, %eax
+; SSE-NEXT: pextrw $1, %xmm0, %ecx
+; SSE-NEXT: imull $1057, %ecx, %edx # imm = 0x421
+; SSE-NEXT: shrl $16, %edx
+; SSE-NEXT: movl %ecx, %esi
+; SSE-NEXT: subl %edx, %esi
+; SSE-NEXT: movzwl %si, %esi
+; SSE-NEXT: shrl %esi
+; SSE-NEXT: addl %edx, %esi
+; SSE-NEXT: shrl %esi
+; SSE-NEXT: imull $124, %esi, %edx
+; SSE-NEXT: subl %edx, %ecx
+; SSE-NEXT: movd %xmm0, %edx
+; SSE-NEXT: movzwl %dx, %esi
+; SSE-NEXT: imull $690, %esi, %esi # imm = 0x2B2
+; SSE-NEXT: shrl $16, %esi
+; SSE-NEXT: imull $95, %esi, %esi
+; SSE-NEXT: subl %esi, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: pinsrw $1, %ecx, %xmm1
+; SSE-NEXT: pextrw $2, %xmm0, %ecx
+; SSE-NEXT: imull $21399, %ecx, %edx # imm = 0x5397
+; SSE-NEXT: shrl $21, %edx
+; SSE-NEXT: imull $98, %edx, %edx
+; SSE-NEXT: subl %edx, %ecx
+; SSE-NEXT: pinsrw $2, %ecx, %xmm1
; SSE-NEXT: pinsrw $3, %eax, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: fold_urem_vec_1:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $2, %ecx
-; AVX-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211
-; AVX-NEXT: shrl $19, %ecx
-; AVX-NEXT: imull $124, %ecx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vmovd %xmm0, %ecx
-; AVX-NEXT: movzwl %cx, %edx
-; AVX-NEXT: imull $44151, %edx, %edx # imm = 0xAC77
-; AVX-NEXT: shrl $22, %edx
-; AVX-NEXT: imull $95, %edx, %edx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
-; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl %ecx
-; AVX-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73
-; AVX-NEXT: shrl $17, %ecx
-; AVX-NEXT: imull $98, %ecx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: imull $1373, %eax, %ecx # imm = 0x55D
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shll $6, %ecx
+; AVX-NEXT: leal (%rcx,%rax,2), %ecx
; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: movl %eax, %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: shrl %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: shrl $9, %edx
-; AVX-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB
+; AVX-NEXT: imull $1003, %ecx, %ecx # imm = 0x3EB
; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
+; AVX-NEXT: vpextrw $1, %xmm0, %ecx
+; AVX-NEXT: imull $1057, %ecx, %edx # imm = 0x421
+; AVX-NEXT: shrl $16, %edx
+; AVX-NEXT: movl %ecx, %esi
+; AVX-NEXT: subl %edx, %esi
+; AVX-NEXT: movzwl %si, %esi
+; AVX-NEXT: shrl %esi
+; AVX-NEXT: addl %edx, %esi
+; AVX-NEXT: shrl %esi
+; AVX-NEXT: imull $124, %esi, %edx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: vmovd %xmm0, %edx
+; AVX-NEXT: movzwl %dx, %esi
+; AVX-NEXT: imull $690, %esi, %esi # imm = 0x2B2
+; AVX-NEXT: shrl $16, %esi
+; AVX-NEXT: imull $95, %esi, %esi
+; AVX-NEXT: subl %esi, %edx
+; AVX-NEXT: vmovd %edx, %xmm1
+; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
+; AVX-NEXT: vpextrw $2, %xmm0, %ecx
+; AVX-NEXT: imull $21399, %ecx, %edx # imm = 0x5397
+; AVX-NEXT: shrl $21, %edx
+; AVX-NEXT: imull $98, %edx, %edx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0
+; AVX-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = urem <4 x i16> %x, <i16 95, i16 124, i16 98, i16 1003>
ret <4 x i16> %1
@@ -89,17 +85,15 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; SSE-LABEL: fold_urem_vec_2:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [690,690,690,690,690,690,690,690]
; SSE-NEXT: pmulhuw %xmm0, %xmm1
-; SSE-NEXT: psrlw $6, %xmm1
; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,95,95,95,95,95,95,95]
; SSE-NEXT: psubw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: fold_urem_vec_2:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,44151,44151,44151,44151]
-; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [690,690,690,690,690,690,690,690]
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,95,95,95,95,95,95,95]
; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -112,9 +106,8 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; SSE-LABEL: combine_urem_udiv:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [690,690,690,690,690,690,690,690]
; SSE-NEXT: pmulhuw %xmm0, %xmm1
-; SSE-NEXT: psrlw $6, %xmm1
; SSE-NEXT: pmovsxbw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
; SSE-NEXT: pmullw %xmm1, %xmm2
; SSE-NEXT: psubw %xmm2, %xmm0
@@ -123,8 +116,7 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
;
; AVX-LABEL: combine_urem_udiv:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,44151,44151,44151,44151]
-; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [690,690,690,690,690,690,690,690]
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [95,95,95,95,95,95,95,95]
; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -148,8 +140,8 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; SSE-NEXT: andl $7, %eax
; SSE-NEXT: pinsrw $2, %eax, %xmm1
; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
-; SSE-NEXT: shrl $22, %ecx
+; SSE-NEXT: imull $690, %eax, %ecx # imm = 0x2B2
+; SSE-NEXT: shrl $16, %ecx
; SSE-NEXT: imull $95, %ecx, %ecx
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: pinsrw $3, %eax, %xmm1
@@ -166,8 +158,8 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
; AVX1-NEXT: vpextrw $3, %xmm0, %eax
-; AVX1-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
-; AVX1-NEXT: shrl $22, %ecx
+; AVX1-NEXT: imull $690, %eax, %ecx # imm = 0x2B2
+; AVX1-NEXT: shrl $16, %ecx
; AVX1-NEXT: imull $95, %ecx, %ecx
; AVX1-NEXT: subl %ecx, %eax
; AVX1-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
@@ -184,8 +176,8 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
; AVX2-NEXT: vpextrw $3, %xmm0, %eax
-; AVX2-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
-; AVX2-NEXT: shrl $22, %ecx
+; AVX2-NEXT: imull $690, %eax, %ecx # imm = 0x2B2
+; AVX2-NEXT: shrl $16, %ecx
; AVX2-NEXT: imull $95, %ecx, %ecx
; AVX2-NEXT: subl %ecx, %eax
; AVX2-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
@@ -198,30 +190,24 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; SSE-LABEL: dont_fold_urem_one:
; SSE: # %bb.0:
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
-; SSE-NEXT: shrl $16, %ecx
-; SSE-NEXT: movl %eax, %edx
-; SSE-NEXT: subl %ecx, %edx
-; SSE-NEXT: movzwl %dx, %edx
-; SSE-NEXT: shrl %edx
-; SSE-NEXT: addl %ecx, %edx
-; SSE-NEXT: shrl $4, %edx
-; SSE-NEXT: leal (%rdx,%rdx,2), %ecx
-; SSE-NEXT: shll $3, %ecx
-; SSE-NEXT: subl %ecx, %edx
-; SSE-NEXT: addl %eax, %edx
; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B
+; SSE-NEXT: imull $51305, %eax, %ecx # imm = 0xC869
; SSE-NEXT: shrl $25, %ecx
; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pinsrw $1, %eax, %xmm1
-; SSE-NEXT: pinsrw $2, %edx, %xmm1
+; SSE-NEXT: pextrw $2, %xmm0, %eax
+; SSE-NEXT: imull $45589, %eax, %ecx # imm = 0xB215
+; SSE-NEXT: shrl $20, %ecx
+; SSE-NEXT: leal (%rcx,%rcx,2), %edx
+; SSE-NEXT: shll $3, %edx
+; SSE-NEXT: subl %edx, %ecx
+; SSE-NEXT: addl %eax, %ecx
+; SSE-NEXT: pinsrw $2, %ecx, %xmm1
; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
-; SSE-NEXT: shrl $26, %ecx
+; SSE-NEXT: imull $24749, %eax, %ecx # imm = 0x60AD
+; SSE-NEXT: shrl $27, %ecx
; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: pinsrw $3, %eax, %xmm1
@@ -230,30 +216,24 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
;
; AVX-LABEL: dont_fold_urem_one:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
-; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: movl %eax, %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: shrl %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: shrl $4, %edx
-; AVX-NEXT: leal (%rdx,%rdx,2), %ecx
-; AVX-NEXT: shll $3, %ecx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: addl %eax, %edx
; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B
+; AVX-NEXT: imull $51305, %eax, %ecx # imm = 0xC869
; AVX-NEXT: shrl $25, %ecx
; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
+; AVX-NEXT: vpextrw $2, %xmm0, %eax
+; AVX-NEXT: imull $45589, %eax, %ecx # imm = 0xB215
+; AVX-NEXT: shrl $20, %ecx
+; AVX-NEXT: leal (%rcx,%rcx,2), %edx
+; AVX-NEXT: shll $3, %edx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
-; AVX-NEXT: shrl $26, %ecx
+; AVX-NEXT: imull $24749, %eax, %ecx # imm = 0x60AD
+; AVX-NEXT: shrl $27, %ecx
; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
@@ -276,34 +256,29 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
; SSE-LABEL: dont_fold_urem_i64:
; SSE: # %bb.0:
; SSE-NEXT: movq %xmm1, %rcx
-; SSE-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9
+; SSE-NEXT: movabsq $-5614226457215950493, %rdx # imm = 0xB21642C8590B2163
; SSE-NEXT: movq %rcx, %rax
; SSE-NEXT: mulq %rdx
-; SSE-NEXT: movq %rcx, %rax
-; SSE-NEXT: subq %rdx, %rax
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: addq %rdx, %rax
-; SSE-NEXT: shrq $4, %rax
-; SSE-NEXT: leaq (%rax,%rax,2), %rdx
-; SSE-NEXT: shlq $3, %rdx
-; SSE-NEXT: subq %rdx, %rax
-; SSE-NEXT: addq %rcx, %rax
-; SSE-NEXT: movq %rax, %xmm2
+; SSE-NEXT: shrq $4, %rdx
+; SSE-NEXT: leaq (%rdx,%rdx,2), %rax
+; SSE-NEXT: shlq $3, %rax
+; SSE-NEXT: subq %rax, %rdx
+; SSE-NEXT: addq %rcx, %rdx
+; SSE-NEXT: movq %rdx, %xmm2
; SSE-NEXT: pextrq $1, %xmm1, %rcx
-; SSE-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D
+; SSE-NEXT: movabsq $3483213337908644819, %rdx # imm = 0x3056DC1372F28BD3
; SSE-NEXT: movq %rcx, %rax
; SSE-NEXT: mulq %rdx
-; SSE-NEXT: shrq $12, %rdx
+; SSE-NEXT: shrq $10, %rdx
; SSE-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
; SSE-NEXT: subq %rax, %rcx
; SSE-NEXT: movq %rcx, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: movabsq $1805185964399711473, %rdx # imm = 0x190D4F120190D4F1
; SSE-NEXT: movq %rcx, %rax
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
; SSE-NEXT: mulq %rdx
-; SSE-NEXT: shrq $7, %rdx
+; SSE-NEXT: shrq $6, %rdx
; SSE-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
; SSE-NEXT: subq %rax, %rcx
; SSE-NEXT: movq %rcx, %xmm0
@@ -315,34 +290,29 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovq %xmm1, %rcx
-; AVX1-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9
+; AVX1-NEXT: movabsq $-5614226457215950493, %rdx # imm = 0xB21642C8590B2163
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: mulq %rdx
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: addq %rdx, %rax
-; AVX1-NEXT: shrq $4, %rax
-; AVX1-NEXT: leaq (%rax,%rax,2), %rdx
-; AVX1-NEXT: shlq $3, %rdx
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: addq %rcx, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: shrq $4, %rdx
+; AVX1-NEXT: leaq (%rdx,%rdx,2), %rax
+; AVX1-NEXT: shlq $3, %rax
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: addq %rcx, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm2
; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX1-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D
+; AVX1-NEXT: movabsq $3483213337908644819, %rdx # imm = 0x3056DC1372F28BD3
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: mulq %rdx
-; AVX1-NEXT: shrq $12, %rdx
+; AVX1-NEXT: shrq $10, %rdx
; AVX1-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
; AVX1-NEXT: subq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT: movabsq $1805185964399711473, %rdx # imm = 0x190D4F120190D4F1
; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
; AVX1-NEXT: mulq %rdx
-; AVX1-NEXT: shrq $7, %rdx
+; AVX1-NEXT: shrq $6, %rdx
; AVX1-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
; AVX1-NEXT: subq %rax, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm0
@@ -354,34 +324,29 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovq %xmm1, %rcx
-; AVX2-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9
+; AVX2-NEXT: movabsq $-5614226457215950493, %rdx # imm = 0xB21642C8590B2163
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: mulq %rdx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: addq %rdx, %rax
-; AVX2-NEXT: shrq $4, %rax
-; AVX2-NEXT: leaq (%rax,%rax,2), %rdx
-; AVX2-NEXT: shlq $3, %rdx
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: shrq $4, %rdx
+; AVX2-NEXT: leaq (%rdx,%rdx,2), %rax
+; AVX2-NEXT: shlq $3, %rax
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D
+; AVX2-NEXT: movabsq $3483213337908644819, %rdx # imm = 0x3056DC1372F28BD3
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: mulq %rdx
-; AVX2-NEXT: shrq $12, %rdx
+; AVX2-NEXT: shrq $10, %rdx
; AVX2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
; AVX2-NEXT: subq %rax, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: movabsq $1805185964399711473, %rdx # imm = 0x190D4F120190D4F1
; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
; AVX2-NEXT: mulq %rdx
-; AVX2-NEXT: shrq $7, %rdx
+; AVX2-NEXT: shrq $6, %rdx
; AVX2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
; AVX2-NEXT: subq %rax, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
index a9427be39ca3e..84e245115f9c7 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -12,67 +12,40 @@
define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_div7_2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq %xmm0, %rcx
-; SSE2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
-; SSE2-NEXT: movq %rcx, %rax
-; SSE2-NEXT: mulq %rsi
-; SSE2-NEXT: subq %rdx, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: addq %rdx, %rcx
-; SSE2-NEXT: movq %rcx, %xmm1
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
+; SSE2-NEXT: mulq %rcx
+; SSE2-NEXT: movq %rdx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rcx
-; SSE2-NEXT: movq %rcx, %rax
-; SSE2-NEXT: mulq %rsi
-; SSE2-NEXT: subq %rdx, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: addq %rdx, %rcx
-; SSE2-NEXT: movq %rcx, %xmm0
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: mulq %rcx
+; SSE2-NEXT: movq %rdx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: psrlq $2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_div7_2i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: pextrq $1, %xmm0, %rcx
-; SSE41-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
-; SSE41-NEXT: movq %rcx, %rax
-; SSE41-NEXT: mulq %rsi
-; SSE41-NEXT: subq %rdx, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: addq %rdx, %rcx
-; SSE41-NEXT: movq %rcx, %xmm1
-; SSE41-NEXT: movq %xmm0, %rcx
-; SSE41-NEXT: movq %rcx, %rax
-; SSE41-NEXT: mulq %rsi
-; SSE41-NEXT: subq %rdx, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: addq %rdx, %rcx
-; SSE41-NEXT: movq %rcx, %xmm0
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
+; SSE41-NEXT: mulq %rcx
+; SSE41-NEXT: movq %rdx, %xmm1
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: mulq %rcx
+; SSE41-NEXT: movq %rdx, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: psrlq $2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_div7_2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm1
-; AVX-NEXT: vmovq %xmm0, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
+; AVX-NEXT: mulq %rcx
+; AVX-NEXT: vmovq %rdx, %xmm1
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: mulq %rcx
+; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: vpsrlq $2, %xmm0, %xmm0
; AVX-NEXT: retq
%res = udiv <2 x i64> %a, <i64 7, i64 7>
ret <2 x i64> %res
@@ -82,17 +55,12 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_div7_4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: psubd %xmm2, %xmm0
-; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT: psrld $2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_div7_4i32:
@@ -100,13 +68,9 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; SSE41-NEXT: pmuludq %xmm2, %xmm1
-; SSE41-NEXT: pmuludq %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; SSE41-NEXT: psubd %xmm2, %xmm0
-; SSE41-NEXT: psrld $1, %xmm0
-; SSE41-NEXT: paddd %xmm2, %xmm0
-; SSE41-NEXT: psrld $2, %xmm0
+; SSE41-NEXT: pmuludq %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_div7_4i32:
@@ -114,13 +78,9 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $2, %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_4i32:
@@ -128,13 +88,9 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $2, %xmm0, %xmm0
+; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: retq
%res = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
ret <4 x i32> %res
@@ -143,21 +99,12 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_div7_8i16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
-; SSE-NEXT: pmulhuw %xmm0, %xmm1
-; SSE-NEXT: psubw %xmm1, %xmm0
-; SSE-NEXT: psrlw $1, %xmm0
-; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: psrlw $2, %xmm0
+; SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [9363,9363,9363,9363,9363,9363,9363,9363]
; SSE-NEXT: retq
;
; AVX-LABEL: test_div7_8i16:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [9363,9363,9363,9363,9363,9363,9363,9363]
-; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [9363,9363,9363,9363,9363,9363,9363,9363]
; AVX-NEXT: retq
%res = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
ret <8 x i16> %res
@@ -172,37 +119,24 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [37,37,37,37,37,37,37,37]
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE2-NEXT: pmullw %xmm3, %xmm4
-; SSE2-NEXT: psrlw $8, %xmm4
-; SSE2-NEXT: packuswb %xmm2, %xmm4
-; SSE2-NEXT: psubb %xmm4, %xmm0
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: paddb %xmm4, %xmm0
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pmullw %xmm3, %xmm0
+; SSE2-NEXT: psrlw $8, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_div7_16i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37]
-; SSE41-NEXT: pmullw %xmm1, %xmm2
-; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pmullw %xmm1, %xmm3
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: packuswb %xmm2, %xmm3
-; SSE41-NEXT: psubb %xmm3, %xmm0
-; SSE41-NEXT: psrlw $1, %xmm0
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: paddb %xmm3, %xmm0
-; SSE41-NEXT: psrlw $2, %xmm0
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
+; SSE41-NEXT: pmullw %xmm2, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pmullw %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: packuswb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_div7_16i8:
@@ -212,46 +146,29 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2NOBW-LABEL: test_div7_16i8:
; AVX2NOBW: # %bb.0:
-; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX2NOBW-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm0
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2NOBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT: vzeroupper
; AVX2NOBW-NEXT: retq
;
; AVX512BW-LABEL: test_div7_16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%res = udiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
@@ -268,33 +185,29 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,256,256,256,256,256,256,256]
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [147,79,171,117,205,57,57,37]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [145,20,85,185,51,113,113,37]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,256,256,256,256,256,256,128]
-; SSE2-NEXT: psrlw $8, %xmm3
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,32,57,205,117,171,79,147]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,127,113,51,185,85,20,145]
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: packuswb %xmm2, %xmm3
; SSE2-NEXT: psubb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,128,0,0,0,128]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,0,0,0,128,0,0,0]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,128,0,0,0,0,0,128]
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: paddb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,64,32,32,32,128,128,64]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,256,64,32,128,64,64,256]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,256,128,32,32,32,64,64]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,64,64,128,32,64,256,32]
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -302,38 +215,28 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-LABEL: test_divconstant_16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; SSE41-NEXT: psllw $7, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3,4,5,6,7]
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [147,79,171,117,205,57,57,37]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [145,20,85,185,51,113,113,37]
+; SSE41-NEXT: psrlw $8, %xmm2
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,127,113,51,185,85,20,145]
; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: psllw $7, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,6],xmm4[7]
-; SSE41-NEXT: psrlw $8, %xmm4
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [37,32,57,205,117,171,79,147]
-; SSE41-NEXT: psrlw $8, %xmm4
-; SSE41-NEXT: packuswb %xmm3, %xmm4
-; SSE41-NEXT: psubb %xmm4, %xmm0
+; SSE41-NEXT: packuswb %xmm2, %xmm3
+; SSE41-NEXT: psubb %xmm3, %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,0,128,0,0,0,128]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,0,0,0,128,0,0,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,128,0,0,0,0,0,128]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm0, %xmm2
-; SSE41-NEXT: paddb %xmm4, %xmm2
+; SSE41-NEXT: paddb %xmm3, %xmm2
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,64,32,32,32,128,128,64]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,256,64,32,128,64,64,256]
; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,256,128,32,32,32,64,64]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,64,64,128,32,64,256,32]
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: packuswb %xmm2, %xmm0
; SSE41-NEXT: retq
@@ -341,35 +244,27 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-LABEL: test_divconstant_16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpsllw $7, %xmm3, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [147,79,171,117,205,57,57,37]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [145,20,85,185,51,113,113,37]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37,32,57,205,117,171,79,147]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37,127,113,51,185,85,20,145]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,0,0,128,0,0,0,128]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,0,0,0,128,0,0,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,128,0,0,0,0,0,128]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [64,64,32,32,32,128,128,64]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,256,64,32,128,64,64,256]
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,256,128,32,32,32,64,64]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,64,64,128,32,64,256,32]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -377,21 +272,19 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; AVX2NOBW-LABEL: test_divconstant_16i8:
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,256,256,256,256,256,128,128,256,256,256,256,256,256,256]
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,32,57,205,117,171,79,147,147,79,171,117,205,57,57,37]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,127,113,51,185,85,20,145,145,20,85,185,51,113,113,37]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,0,0,128,0,0,0,0,0,0,128,0,0,0,128]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,128,0,0,0,0,0,128,128,0,0,0,0,0,0,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,256,128,32,32,32,64,64,64,64,32,32,32,128,128,64]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,64,64,128,32,64,256,32,32,256,64,32,128,64,64,256]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2NOBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -400,20 +293,18 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
;
; AVX512BW-LABEL: test_divconstant_16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,32,57,205,117,171,79,147,147,79,171,117,205,57,57,37]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,127,113,51,185,85,20,145,145,20,85,185,51,113,113,37]
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,0,0,128,0,0,0,0,0,0,128,0,0,0,128]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,128,0,0,0,0,0,128,128,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,0,1,3,3,3,2,2,2,2,3,3,3,1,1,2]
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,2,2,1,3,2,0,3,3,0,2,3,1,2,2,0]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -434,28 +325,18 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: mulq %rsi
-; SSE2-NEXT: movq %rcx, %rax
-; SSE2-NEXT: subq %rdx, %rax
-; SSE2-NEXT: shrq %rax
-; SSE2-NEXT: addq %rdx, %rax
-; SSE2-NEXT: shrq $2, %rax
-; SSE2-NEXT: leaq (,%rax,8), %rdx
-; SSE2-NEXT: subq %rdx, %rax
-; SSE2-NEXT: addq %rcx, %rax
-; SSE2-NEXT: movq %rax, %xmm1
+; SSE2-NEXT: leaq (,%rdx,8), %rax
+; SSE2-NEXT: subq %rax, %rdx
+; SSE2-NEXT: addq %rcx, %rdx
+; SSE2-NEXT: movq %rdx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: mulq %rsi
-; SSE2-NEXT: movq %rcx, %rax
-; SSE2-NEXT: subq %rdx, %rax
-; SSE2-NEXT: shrq %rax
-; SSE2-NEXT: addq %rdx, %rax
-; SSE2-NEXT: shrq $2, %rax
-; SSE2-NEXT: leaq (,%rax,8), %rdx
-; SSE2-NEXT: subq %rdx, %rax
-; SSE2-NEXT: addq %rcx, %rax
-; SSE2-NEXT: movq %rax, %xmm0
+; SSE2-NEXT: leaq (,%rdx,8), %rax
+; SSE2-NEXT: subq %rax, %rdx
+; SSE2-NEXT: addq %rcx, %rdx
+; SSE2-NEXT: movq %rdx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -466,27 +347,17 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE41-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: mulq %rsi
-; SSE41-NEXT: movq %rcx, %rax
-; SSE41-NEXT: subq %rdx, %rax
-; SSE41-NEXT: shrq %rax
-; SSE41-NEXT: addq %rdx, %rax
-; SSE41-NEXT: shrq $2, %rax
-; SSE41-NEXT: leaq (,%rax,8), %rdx
-; SSE41-NEXT: subq %rdx, %rax
-; SSE41-NEXT: addq %rcx, %rax
-; SSE41-NEXT: movq %rax, %xmm1
+; SSE41-NEXT: leaq (,%rdx,8), %rax
+; SSE41-NEXT: subq %rax, %rdx
+; SSE41-NEXT: addq %rcx, %rdx
+; SSE41-NEXT: movq %rdx, %xmm1
; SSE41-NEXT: movq %xmm0, %rcx
; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: mulq %rsi
-; SSE41-NEXT: movq %rcx, %rax
-; SSE41-NEXT: subq %rdx, %rax
-; SSE41-NEXT: shrq %rax
-; SSE41-NEXT: addq %rdx, %rax
-; SSE41-NEXT: shrq $2, %rax
-; SSE41-NEXT: leaq (,%rax,8), %rdx
-; SSE41-NEXT: subq %rdx, %rax
-; SSE41-NEXT: addq %rcx, %rax
-; SSE41-NEXT: movq %rax, %xmm0
+; SSE41-NEXT: leaq (,%rdx,8), %rax
+; SSE41-NEXT: subq %rax, %rdx
+; SSE41-NEXT: addq %rcx, %rdx
+; SSE41-NEXT: movq %rdx, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
@@ -496,27 +367,17 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm1
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm1
; AVX-NEXT: vmovq %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%res = urem <2 x i64> %a, <i64 7, i64 7>
@@ -534,15 +395,10 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psubd %xmm2, %xmm1
-; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: paddd %xmm2, %xmm1
-; SSE2-NEXT: psrld $2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pslld $3, %xmm2
-; SSE2-NEXT: psubd %xmm2, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pslld $3, %xmm1
+; SSE2-NEXT: psubd %xmm1, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_4i32:
@@ -553,15 +409,10 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE41-NEXT: pmuludq %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psubd %xmm2, %xmm1
-; SSE41-NEXT: psrld $1, %xmm1
-; SSE41-NEXT: paddd %xmm2, %xmm1
-; SSE41-NEXT: psrld $2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: pslld $3, %xmm2
-; SSE41-NEXT: psubd %xmm2, %xmm1
-; SSE41-NEXT: paddd %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: pslld $3, %xmm1
+; SSE41-NEXT: psubd %xmm1, %xmm2
+; SSE41-NEXT: paddd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_rem7_4i32:
@@ -572,10 +423,6 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
; AVX1-NEXT: vpslld $3, %xmm1, %xmm2
; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -589,10 +436,6 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
; AVX2-NEXT: vpslld $3, %xmm1, %xmm2
; AVX2-NEXT: vpsubd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -606,24 +449,15 @@ define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
; SSE-NEXT: pmulhuw %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psubw %xmm1, %xmm2
-; SSE-NEXT: psrlw $1, %xmm2
-; SSE-NEXT: paddw %xmm1, %xmm2
-; SSE-NEXT: psrlw $2, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: psllw $3, %xmm1
-; SSE-NEXT: psubw %xmm1, %xmm2
-; SSE-NEXT: paddw %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psllw $3, %xmm2
+; SSE-NEXT: psubw %xmm2, %xmm1
+; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_rem7_8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [9363,9363,9363,9363,9363,9363,9363,9363]
-; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vpsrlw $2, %xmm1, %xmm1
; AVX-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX-NEXT: vpsubw %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -646,18 +480,11 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: pmullw %xmm3, %xmm4
; SSE2-NEXT: psrlw $8, %xmm4
; SSE2-NEXT: packuswb %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psubb %xmm4, %xmm1
-; SSE2-NEXT: psrlw $1, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: paddb %xmm4, %xmm1
-; SSE2-NEXT: psrlw $2, %xmm1
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: psllw $3, %xmm1
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psllw $3, %xmm2
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: psubb %xmm2, %xmm1
-; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: psubb %xmm1, %xmm4
+; SSE2-NEXT: paddb %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_16i8:
@@ -672,18 +499,11 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: pmullw %xmm1, %xmm3
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: packuswb %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psubb %xmm3, %xmm1
-; SSE41-NEXT: psrlw $1, %xmm1
+; SSE41-NEXT: movdqa %xmm3, %xmm1
+; SSE41-NEXT: psllw $3, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: paddb %xmm3, %xmm1
-; SSE41-NEXT: psrlw $2, %xmm1
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psllw $3, %xmm2
-; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE41-NEXT: psubb %xmm2, %xmm1
-; SSE41-NEXT: paddb %xmm1, %xmm0
+; SSE41-NEXT: psubb %xmm1, %xmm3
+; SSE41-NEXT: paddb %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_rem7_16i8:
@@ -697,12 +517,6 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
@@ -716,12 +530,6 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm2
-; AVX2NOBW-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
@@ -735,12 +543,6 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm2
-; AVX512BW-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
@@ -761,37 +563,33 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,256,256,256,256,256,256,256]
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [147,79,171,117,205,57,57,37]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [145,20,85,185,51,113,113,37]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,256,256,256,256,256,256,128]
-; SSE2-NEXT: psrlw $8, %xmm3
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,32,57,205,117,171,79,147]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,127,113,51,185,85,20,145]
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: packuswb %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psubb %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [0,0,0,128,0,0,0,128]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE2-NEXT: psrlw $8, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,0,0,0,128,0,0,0]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,128,0,0,0,0,0,128]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: packuswb %xmm4, %xmm2
; SSE2-NEXT: paddb %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [64,64,32,32,32,128,128,64]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [32,256,64,32,128,64,64,256]
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [14,13,12,11,10,9,9,7]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,256,128,32,32,32,64,64]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [256,64,64,128,32,64,256,32]
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,8,9,10,11,12,13,14]
; SSE2-NEXT: pand %xmm4, %xmm2
@@ -802,41 +600,31 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-LABEL: test_remconstant_16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; SSE41-NEXT: psllw $7, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3,4,5,6,7]
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [147,79,171,117,205,57,57,37]
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: psllw $7, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,6],xmm4[7]
-; SSE41-NEXT: psrlw $8, %xmm4
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [37,32,57,205,117,171,79,147]
-; SSE41-NEXT: psrlw $8, %xmm4
-; SSE41-NEXT: packuswb %xmm3, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psubb %xmm4, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,128,0,0,0,128]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [145,20,85,185,51,113,113,37]
; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [128,0,0,0,128,0,0,0]
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37,127,113,51,185,85,20,145]
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: packuswb %xmm2, %xmm3
-; SSE41-NEXT: paddb %xmm4, %xmm3
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [64,64,32,32,32,128,128,64]
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,256,128,32,32,32,64,64]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psubb %xmm3, %xmm2
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: packuswb %xmm3, %xmm2
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [0,128,0,0,0,0,0,128]
+; SSE41-NEXT: psrlw $8, %xmm4
+; SSE41-NEXT: packuswb %xmm2, %xmm4
+; SSE41-NEXT: paddb %xmm3, %xmm4
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [32,256,64,32,128,64,64,256]
+; SSE41-NEXT: psrlw $8, %xmm4
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [256,64,64,128,32,64,256,32]
+; SSE41-NEXT: psrlw $8, %xmm2
+; SSE41-NEXT: packuswb %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
; SSE41-NEXT: psllw $8, %xmm1
@@ -849,35 +637,27 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-LABEL: test_remconstant_16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpsllw $7, %xmm3, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [145,20,85,185,51,113,113,37]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [147,79,171,117,205,57,57,37]
-; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37,32,57,205,117,171,79,147]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37,127,113,51,185,85,20,145]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,0,128,0,0,0,128]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [128,0,0,0,128,0,0,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,128,0,0,0,0,0,128]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [64,64,32,32,32,128,128,64]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,256,64,32,128,64,64,256]
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [64,256,128,32,32,32,64,64]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [256,64,64,128,32,64,256,32]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
@@ -891,21 +671,19 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; AVX2NOBW-LABEL: test_remconstant_16i8:
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,256,256,256,256,256,256,128,128,256,256,256,256,256,256,256]
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,32,57,205,117,171,79,147,147,79,171,117,205,57,57,37]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,127,113,51,185,85,20,145,145,20,85,185,51,113,113,37]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm2
; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [128,0,0,0,128,0,0,0,0,0,0,128,0,0,0,128]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [0,128,0,0,0,0,0,128,128,0,0,0,0,0,0,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2NOBW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [64,256,128,32,32,32,64,64,64,64,32,32,32,128,128,64]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,64,64,128,32,64,256,32,32,256,64,32,128,64,64,256]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
@@ -917,20 +695,18 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
;
; AVX512BW-LABEL: test_remconstant_16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,32,57,205,117,171,79,147,147,79,171,117,205,57,57,37]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [37,127,113,51,185,85,20,145,145,20,85,185,51,113,113,37]
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm2
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [128,0,0,0,128,0,0,0,0,0,0,128,0,0,0,128]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [0,128,0,0,0,0,0,128,128,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [2,0,1,3,3,3,2,2,2,2,3,3,3,1,1,2]
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,2,1,3,2,0,3,3,0,2,3,1,2,2,0]
; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
index 3ed716881281d..4099351288a5e 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -10,79 +10,44 @@
define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: test_div7_4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX1-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: mulq %rsi
-; AVX1-NEXT: subq %rdx, %rcx
-; AVX1-NEXT: shrq %rcx
-; AVX1-NEXT: addq %rdx, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm1
-; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: mulq %rsi
-; AVX1-NEXT: subq %rdx, %rcx
-; AVX1-NEXT: shrq %rcx
-; AVX1-NEXT: addq %rdx, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: mulq %rsi
-; AVX1-NEXT: subq %rdx, %rcx
-; AVX1-NEXT: shrq %rcx
-; AVX1-NEXT: addq %rdx, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm2
-; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: mulq %rsi
-; AVX1-NEXT: subq %rdx, %rcx
-; AVX1-NEXT: shrq %rcx
-; AVX1-NEXT: addq %rdx, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpextrq $1, %xmm1, %rax
+; AVX1-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
+; AVX1-NEXT: mulq %rcx
+; AVX1-NEXT: vmovq %rdx, %xmm2
+; AVX1-NEXT: vmovq %xmm1, %rax
+; AVX1-NEXT: mulq %rcx
+; AVX1-NEXT: vmovq %rdx, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: mulq %rcx
+; AVX1-NEXT: vmovq %rdx, %xmm2
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: mulq %rcx
+; AVX1-NEXT: vmovq %rdx, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX1-NEXT: vpsrlq $2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: subq %rdx, %rcx
-; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm2
-; AVX2-NEXT: vmovq %xmm1, %rcx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: subq %rdx, %rcx
-; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rax
+; AVX2-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
+; AVX2-NEXT: mulq %rcx
+; AVX2-NEXT: vmovq %rdx, %xmm2
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: mulq %rcx
+; AVX2-NEXT: vmovq %rdx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: subq %rdx, %rcx
-; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm2
-; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: subq %rdx, %rcx
-; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: addq %rdx, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm0
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: mulq %rcx
+; AVX2-NEXT: vmovq %rdx, %xmm2
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: mulq %rcx
+; AVX2-NEXT: vmovq %rdx, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlq $2, %ymm0, %ymm0
; AVX2-NEXT: retq
%res = udiv <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
ret <4 x i64> %res
@@ -91,27 +56,19 @@ define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_div7_8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
-; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [613566757,613566757,613566757,613566757]
+; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_8i32:
@@ -119,13 +76,9 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
-; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
-; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $2, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
%res = udiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
ret <8 x i32> %res
@@ -134,28 +87,16 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_div7_16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
-; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
-; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
-; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm0
+; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
; AVX2-NEXT: retq
%res = udiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
ret <16 x i16> %res
@@ -170,31 +111,17 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [37,37,37,37,37,37,37,37]
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -205,30 +132,18 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
-; AVX2NOBW-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX2NOBW-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2NOBW-NEXT: retq
;
; AVX512BW-LABEL: test_div7_32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
%res = udiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
ret <32 x i8> %res
@@ -243,66 +158,50 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [147,79,171,117,205,57,32,37]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [145,20,85,185,51,113,127,37]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [187,135,205,27,57,241,16,137]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [185,97,51,107,113,15,127,17]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,0,128,0,0,0,128]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [128,0,0,0,0,0,128,0]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,128,0,0,0,0,0,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,0,0,0,128,0]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [64,64,32,32,32,128,256,64]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [32,256,64,32,128,64,64,256]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [32,16,16,128,64,16,256,32]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [16,32,64,32,32,256,32,256]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm4
-; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [17,127,15,113,107,51,97,185]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [137,16,241,57,27,205,135,187]
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [37,32,57,205,117,171,79,147]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [37,127,113,51,185,85,20,145]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,0,0,0,0,128,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,128,0,0,0,0,0,0]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,0,0,0,128,0,0,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,128,0,0,0,0,0,128]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,256,16,64,128,16,16,32]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [256,32,256,32,32,64,32,16]
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,256,128,32,32,32,64,64]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,64,64,128,32,64,256,32]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -312,31 +211,26 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [256,256,256,256,256,256,256,128,128,256,256,256,256,256,256,256]
-; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [137,16,241,57,27,205,135,187,147,79,171,117,205,57,32,37]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [17,127,15,113,107,51,97,185,145,20,85,185,51,113,127,37]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm4, %ymm3
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37,32,57,205,117,171,79,147,187,135,205,27,57,241,16,137]
+; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37,127,113,51,185,85,20,145,185,97,51,107,113,15,127,17]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm0, %ymm0
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,0,0,0,0,0,128,0,0,0,0,128,0,0,0,128]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,128,0,0,0,0,0,0,128,0,0,0,0,0,128,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,0,0,128,0,0,0,0,128,0,0,0,0,0,0]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,128,0,0,0,0,0,128,0,0,0,0,0,0,128,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2NOBW-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX2NOBW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [32,256,16,64,128,16,16,32,64,64,32,32,32,128,256,64]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [256,32,256,32,32,64,32,16,32,256,64,32,128,64,64,256]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,256,128,32,32,32,64,64,32,16,16,128,64,16,256,32]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,64,64,128,32,64,256,32,16,32,64,32,32,256,32,256]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2NOBW-NEXT: retq
@@ -344,13 +238,12 @@ define <32 x i8> @test_divconstant_32i8(<32 x i8> %a) nounwind {
; AVX512BW-LABEL: test_divconstant_32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [37,32,57,205,117,171,79,147,137,16,241,57,27,205,135,187,187,135,205,27,57,241,16,137,147,79,171,117,205,57,32,37]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [37,127,113,51,185,85,20,145,17,127,15,113,107,51,97,185,185,97,51,107,113,15,127,17,145,20,85,185,51,113,127,37]
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [128,0,0,0,128,0,0,0,0,0,0,0,0,0,128,0,0,128,0,0,0,0,0,0,0,0,0,128,0,0,0,128]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,128,0,0,0,0,0,128,0,128,0,0,0,0,0,0,0,0,0,0,0,0,128,0,128,0,0,0,0,0,128,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
@@ -374,52 +267,32 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: mulq %rsi
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: addq %rdx, %rax
-; AVX1-NEXT: shrq $2, %rax
-; AVX1-NEXT: leaq (,%rax,8), %rdx
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: addq %rcx, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: leaq (,%rdx,8), %rax
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: addq %rcx, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: mulq %rsi
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: addq %rdx, %rax
-; AVX1-NEXT: shrq $2, %rax
-; AVX1-NEXT: leaq (,%rax,8), %rdx
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: addq %rcx, %rax
-; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: leaq (,%rdx,8), %rax
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: addq %rcx, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: mulq %rsi
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: addq %rdx, %rax
-; AVX1-NEXT: shrq $2, %rax
-; AVX1-NEXT: leaq (,%rax,8), %rdx
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: addq %rcx, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: leaq (,%rdx,8), %rax
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: addq %rcx, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm2
; AVX1-NEXT: vmovq %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: mulq %rsi
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: shrq %rax
-; AVX1-NEXT: addq %rdx, %rax
-; AVX1-NEXT: shrq $2, %rax
-; AVX1-NEXT: leaq (,%rax,8), %rdx
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: addq %rcx, %rax
-; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: leaq (,%rdx,8), %rax
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: addq %rcx, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -431,52 +304,32 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: addq %rdx, %rax
-; AVX2-NEXT: shrq $2, %rax
-; AVX2-NEXT: leaq (,%rax,8), %rdx
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: leaq (,%rdx,8), %rax
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: addq %rdx, %rax
-; AVX2-NEXT: shrq $2, %rax
-; AVX2-NEXT: leaq (,%rax,8), %rdx
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vmovq %rax, %xmm1
+; AVX2-NEXT: leaq (,%rdx,8), %rax
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: addq %rdx, %rax
-; AVX2-NEXT: shrq $2, %rax
-; AVX2-NEXT: leaq (,%rax,8), %rdx
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: leaq (,%rdx,8), %rax
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: vmovq %xmm0, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: mulq %rsi
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: shrq %rax
-; AVX2-NEXT: addq %rdx, %rax
-; AVX2-NEXT: shrq $2, %rax
-; AVX2-NEXT: leaq (,%rax,8), %rdx
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: leaq (,%rdx,8), %rax
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -494,10 +347,6 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3],xmm4[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm4
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
; AVX1-NEXT: vpslld $3, %xmm2, %xmm4
; AVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
@@ -506,10 +355,6 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
; AVX1-NEXT: vpslld $3, %xmm2, %xmm3
; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
@@ -524,10 +369,6 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
-; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpsrld $2, %ymm1, %ymm1
; AVX2-NEXT: vpslld $3, %ymm1, %ymm2
; AVX2-NEXT: vpsubd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
@@ -542,18 +383,10 @@ define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm4
-; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4
-; AVX1-NEXT: vpaddw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
; AVX1-NEXT: vpsllw $3, %xmm3, %xmm4
; AVX1-NEXT: vpsubw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
-; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $3, %xmm2, %xmm3
; AVX1-NEXT: vpsubw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
@@ -563,10 +396,6 @@ define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
; AVX2-LABEL: test_rem7_16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
-; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $3, %ymm1, %ymm2
; AVX2-NEXT: vpsubw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
@@ -588,18 +417,10 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm5
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm5
-; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm5
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpsllw $3, %xmm3, %xmm5
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpsllw $3, %xmm3, %xmm7
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
-; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vpsubb %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpsubb %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
@@ -608,14 +429,8 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $3, %xmm2, %xmm3
-; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -632,12 +447,6 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm2
-; AVX2NOBW-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsllw $3, %ymm1, %ymm2
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
@@ -650,12 +459,6 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm2
-; AVX512BW-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm1
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpsllw $3, %ymm1, %ymm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512BW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
@@ -674,77 +477,61 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [147,79,171,117,205,57,32,37]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [145,20,85,185,51,113,127,37]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [187,135,205,27,57,241,16,137]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [185,97,51,107,113,15,127,17]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [0,0,0,128,0,0,0,128]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [128,0,0,0,0,0,128,0]
; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,128,0,0,0,0,0,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,0,0,0,0,0,128,0]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [64,64,32,32,32,128,256,64]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [32,256,64,32,128,64,64,256]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [32,16,16,128,64,16,256,32]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [16,32,64,32,32,256,32,256]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm5 # [22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm3, %xmm5, %xmm5
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
-; AVX1-NEXT: vpsllw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpsubb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm5
-; AVX1-NEXT: vpsllw $7, %xmm5, %xmm5
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
-; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [137,16,241,57,27,205,135,187]
+; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm4 # [22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
+; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [17,127,15,113,107,51,97,185]
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [37,127,113,51,185,85,20,145]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpsllw $7, %xmm6, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6],xmm6[7]
-; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [37,32,57,205,117,171,79,147]
-; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT: vpackuswb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpsubb %xmm4, %xmm0, %xmm5
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 # [0,0,0,0,0,0,128,0]
+; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm4
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 # [0,128,0,0,0,0,0,0]
; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [128,0,0,0,128,0,0,0]
-; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
-; AVX1-NEXT: vpackuswb %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,256,16,64,128,16,16,32]
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [64,256,128,32,32,32,64,64]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,128,0,0,0,0,0,128]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0]
-; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [256,32,256,32,32,64,32,16]
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [256,64,64,128,32,64,256,32]
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0]
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22]
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
@@ -756,31 +543,26 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW: # %bb.0:
; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [256,256,256,256,256,256,256,128,128,256,256,256,256,256,256,256]
-; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [137,16,241,57,27,205,135,187,147,79,171,117,205,57,32,37]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [17,127,15,113,107,51,97,185,145,20,85,185,51,113,127,37]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm4, %ymm3
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37,32,57,205,117,171,79,147,187,135,205,27,57,241,16,137]
+; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [37,127,113,51,185,85,20,145,185,97,51,107,113,15,127,17]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm0, %ymm3
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,128,0,0,0,0,128,0,0,0,128]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,128,0,0,0,0,0,0,128,0,0,0,0,0,128,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [128,0,0,0,128,0,0,0,0,128,0,0,0,0,0,0]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,128,0,0,0,0,0,128,0,0,0,0,0,0,128,0]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpackuswb %ymm4, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpaddb %ymm2, %ymm3, %ymm2
; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [32,256,16,64,128,16,16,32,64,64,32,32,32,128,256,64]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [256,32,256,32,32,64,32,16,32,256,64,32,128,64,64,256]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [64,256,128,32,32,32,64,64,32,16,16,128,64,16,256,32]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,64,64,128,32,64,256,32,16,32,64,32,32,256,32,256]
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
@@ -794,13 +576,12 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX512BW-LABEL: test_remconstant_32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [37,32,57,205,117,171,79,147,137,16,241,57,27,205,135,187,187,135,205,27,57,241,16,137,147,79,171,117,205,57,32,37]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [37,127,113,51,185,85,20,145,17,127,15,113,107,51,97,185,185,97,51,107,113,15,127,17,145,20,85,185,51,113,127,37]
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm2
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [128,0,0,0,128,0,0,0,0,0,0,0,0,0,128,0,0,128,0,0,0,0,0,0,0,0,0,128,0,0,0,128]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [0,128,0,0,0,0,0,128,0,128,0,0,0,0,0,0,0,0,0,0,0,0,128,0,128,0,0,0,0,0,128,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
index ef6129cc85889..fc7fcc684f5bf 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -10,73 +10,40 @@ define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind {
; AVX-LABEL: test_div7_8i64:
; AVX: # %bb.0:
; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm2
-; AVX-NEXT: vmovq %xmm1, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: vpextrq $1, %xmm1, %rax
+; AVX-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
+; AVX-NEXT: mulq %rcx
+; AVX-NEXT: vmovq %rdx, %xmm2
+; AVX-NEXT: vmovq %xmm1, %rax
+; AVX-NEXT: mulq %rcx
+; AVX-NEXT: vmovq %rdx, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX-NEXT: vpextrq $1, %xmm2, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm3
-; AVX-NEXT: vmovq %xmm2, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vpextrq $1, %xmm2, %rax
+; AVX-NEXT: mulq %rcx
+; AVX-NEXT: vmovq %rdx, %xmm3
+; AVX-NEXT: vmovq %xmm2, %rax
+; AVX-NEXT: mulq %rcx
+; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX-NEXT: vpextrq $1, %xmm2, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm3
-; AVX-NEXT: vmovq %xmm2, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vpextrq $1, %xmm2, %rax
+; AVX-NEXT: mulq %rcx
+; AVX-NEXT: vmovq %rdx, %xmm3
+; AVX-NEXT: vmovq %xmm2, %rax
+; AVX-NEXT: mulq %rcx
+; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX-NEXT: vpextrq $1, %xmm0, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm3
-; AVX-NEXT: vmovq %xmm0, %rcx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: mulq %rsi
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: shrq %rcx
-; AVX-NEXT: addq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: mulq %rcx
+; AVX-NEXT: vmovq %rdx, %xmm3
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: mulq %rcx
+; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX-NEXT: vpsrlq $2, %zmm0, %zmm0
; AVX-NEXT: retq
%res = udiv <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
ret <8 x i64> %res
@@ -87,14 +54,10 @@ define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
; AVX: # %bb.0:
; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
; AVX-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
-; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; AVX-NEXT: vpmuludq %zmm1, %zmm3, %zmm1
-; AVX-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
-; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; AVX-NEXT: vpsubd %zmm3, %zmm0, %zmm0
-; AVX-NEXT: vpsrld $1, %zmm0, %zmm0
-; AVX-NEXT: vpaddd %zmm3, %zmm0, %zmm0
-; AVX-NEXT: vpsrld $2, %zmm0, %zmm0
+; AVX-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; AVX-NEXT: vpmuludq %zmm1, %zmm0, %zmm1
+; AVX-NEXT: vpmovsxbd {{.*#+}} zmm0 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
+; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm0
; AVX-NEXT: retq
%res = udiv <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
ret <16 x i32> %res
@@ -103,28 +66,16 @@ define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: test_div7_32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
-; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm1
-; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_div7_32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
-; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
; AVX512BW-NEXT: retq
%res = udiv <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
ret <32 x i16> %res
@@ -133,36 +84,24 @@ define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: test_div7_64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
-; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpackuswb %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpsubb %ymm2, %ymm0, %ymm4
-; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4
-; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
-; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
+; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpsubb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_div7_64i8:
@@ -172,16 +111,10 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; AVX512BW-NEXT: vpmullw %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
-; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512BW-NEXT: vpmullw %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%res = udiv <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
ret <64 x i8> %res
@@ -197,57 +130,49 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [256,256,64,256,256,256,256,256,128,256,256,256,256,256,256,256]
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [137,27,37,19,79,41,171,101,147,79,171,117,205,57,32,37]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [17,9,145,151,39,163,85,177,145,20,85,185,51,113,127,37]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,256,128,256,256,256,256,256,256,256]
-; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [27,111,57,235,241,249,8,9,187,135,205,27,57,241,16,137]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [107,55,113,233,15,247,127,9,185,97,51,107,113,15,127,17]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,128,0,0,0,0,0,128,0,0,0,128,0,0,0,128]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,128,0,128,0,0,0,128,0,0,0,0,0,128,0]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [0,0,0,0,0,0,0,128,0,128,0,0,0,0,0,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [16,16,256,128,32,64,16,16,64,64,32,32,32,128,256,64]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [128,256,16,16,64,16,32,16,32,256,64,32,128,64,64,256]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [64,16,32,8,8,8,256,16,32,16,16,128,64,16,256,32]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [16,32,16,8,128,8,16,256,16,32,64,32,32,256,32,256]
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [256,256,256,256,256,256,256,128,256,256,256,256,256,256,256,256]
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [17,127,15,113,107,51,97,185,9,127,247,15,233,113,55,107]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,128,256,256,256,256,256,64,256,256]
-; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [37,127,113,51,185,85,20,145,177,85,163,39,151,145,9,17]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,128,0,128,0,0,0,0,0,0,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,0,0,128,0,0,0,128,0,0,0,0,0,128,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,128,0,0,0,0,0,128,0,0,0,128,0,128,0,0]
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [32,256,16,64,128,16,16,32,16,256,8,8,8,32,16,64]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [256,32,256,32,32,64,32,16,256,16,8,128,8,16,32,16]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,256,128,32,32,32,64,64,16,16,64,32,128,256,16,16]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,64,64,128,32,64,256,32,16,32,16,64,16,16,256,128]
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
@@ -257,22 +182,18 @@ define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27,137,27,37,19,79,41,171,101,147,79,171,117,205,57,32,37]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [17,127,15,113,107,51,97,185,9,127,247,15,233,113,55,107,17,9,145,151,39,163,85,177,145,20,85,185,51,113,127,37]
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
-; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137,27,111,57,235,241,249,8,9,187,135,205,27,57,241,16,137]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [37,127,113,51,185,85,20,145,177,85,163,39,151,145,9,17,107,55,113,233,15,247,127,9,185,97,51,107,113,15,127,17]
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [0,0,0,0,0,0,128,0,128,0,0,0,0,0,0,0,0,128,0,0,0,0,0,128,0,0,0,128,0,0,0,128]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,128,0,128,0,0,0,128,0,0,0,0,0,128,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [128,0,0,0,128,0,0,0,128,0,0,0,0,0,128,0,0,0,0,0,0,0,0,128,0,128,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,128,0,0,0,0,0,128,0,0,0,128,0,128,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
@@ -300,105 +221,65 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm2
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vmovq %xmm1, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm1
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; AVX-NEXT: vpextrq $1, %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm3
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm3
; AVX-NEXT: vmovq %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm2
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX-NEXT: vpextrq $1, %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm3
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm3
; AVX-NEXT: vmovq %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm2
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vpextrq $1, %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm3
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm3
; AVX-NEXT: vmovq %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: shrq %rax
-; AVX-NEXT: addq %rdx, %rax
-; AVX-NEXT: shrq $2, %rax
-; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: addq %rcx, %rax
-; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -416,12 +297,8 @@ define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind {
; AVX-NEXT: vpmuludq %zmm1, %zmm3, %zmm1
; AVX-NEXT: vpmovsxbd {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; AVX-NEXT: vpsubd %zmm3, %zmm0, %zmm1
-; AVX-NEXT: vpsrld $1, %zmm1, %zmm1
-; AVX-NEXT: vpaddd %zmm3, %zmm1, %zmm1
-; AVX-NEXT: vpsrld $2, %zmm1, %zmm1
-; AVX-NEXT: vpslld $3, %zmm1, %zmm2
-; AVX-NEXT: vpsubd %zmm2, %zmm1, %zmm1
+; AVX-NEXT: vpslld $3, %zmm3, %zmm1
+; AVX-NEXT: vpsubd %zmm1, %zmm3, %zmm1
; AVX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX-NEXT: retq
%res = urem <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
@@ -434,18 +311,10 @@ define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm4
-; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddw %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3
; AVX512F-NEXT: vpsllw $3, %ymm3, %ymm4
; AVX512F-NEXT: vpsubw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $3, %ymm2, %ymm3
; AVX512F-NEXT: vpsubw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
@@ -455,10 +324,6 @@ define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
; AVX512BW-LABEL: test_rem7_32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
-; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddw %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllw $3, %zmm1, %zmm2
; AVX512BW-NEXT: vpsubw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
@@ -480,18 +345,10 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3
-; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm5
-; AVX512F-NEXT: vpsrlw $1, %ymm5, %ymm5
-; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpsllw $3, %ymm3, %ymm5
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm3
-; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3
-; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpsllw $3, %ymm3, %ymm7
-; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
-; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm7
-; AVX512F-NEXT: vpsubb %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpsubb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
@@ -500,14 +357,8 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpsubb %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
-; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $3, %ymm2, %ymm3
-; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3
+; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -524,12 +375,6 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpmullw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512BW-NEXT: vpsllw $3, %zmm1, %zmm2
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1
@@ -549,68 +394,60 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [256,256,64,256,256,256,256,256,128,256,256,256,256,256,256,256]
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [137,27,37,19,79,41,171,101,147,79,171,117,205,57,32,37]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [17,9,145,151,39,163,85,177,145,20,85,185,51,113,127,37]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,256,128,256,256,256,256,256,256,256]
-; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [27,111,57,235,241,249,8,9,187,135,205,27,57,241,16,137]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [107,55,113,233,15,247,127,9,185,97,51,107,113,15,127,17]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm4
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15],ymm4[24],ymm1[24],ymm4[25],ymm1[25],ymm4[26],ymm1[26],ymm4[27],ymm1[27],ymm4[28],ymm1[28],ymm4[29],ymm1[29],ymm4[30],ymm1[30],ymm4[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [0,128,0,0,0,0,0,128,0,0,0,128,0,0,0,128]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [0,0,128,0,128,0,0,0,128,0,0,0,0,0,128,0]
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,0,128,0,128,0,0,0,0,0,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [16,16,256,128,32,64,16,16,64,64,32,32,32,128,256,64]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [128,256,16,16,64,16,32,16,32,256,64,32,128,64,64,256]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [64,16,32,8,8,8,256,16,32,16,16,128,64,16,256,32]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [16,32,16,8,128,8,16,256,16,32,64,32,32,256,32,256]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm4
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5 # [38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm5
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
-; AVX512F-NEXT: vpsllw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpor %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,256,256,256,256,256,256,128,256,256,256,256,256,256,256,256]
-; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27]
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 # [38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
+; AVX512F-NEXT: vpsllw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [17,127,15,113,107,51,97,185,9,127,247,15,233,113,55,107]
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [37,127,113,51,185,85,20,145,177,85,163,39,151,145,9,17]
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [256,256,256,256,256,256,256,128,256,256,256,256,256,64,256,256]
-; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137]
-; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpsubb %ymm4, %ymm0, %ymm5
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11],ymm5[12],ymm1[12],ymm5[13],ymm1[13],ymm5[14],ymm1[14],ymm5[15],ymm1[15],ymm5[24],ymm1[24],ymm5[25],ymm1[25],ymm5[26],ymm1[26],ymm5[27],ymm1[27],ymm5[28],ymm1[28],ymm5[29],ymm1[29],ymm5[30],ymm1[30],ymm5[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 # [0,0,0,0,0,0,128,0,128,0,0,0,0,0,0,0]
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm4
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15],ymm4[24],ymm1[24],ymm4[25],ymm1[25],ymm4[26],ymm1[26],ymm4[27],ymm1[27],ymm4[28],ymm1[28],ymm4[29],ymm1[29],ymm4[30],ymm1[30],ymm4[31],ymm1[31]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 # [0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0]
; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[4],ymm1[4],ymm5[5],ymm1[5],ymm5[6],ymm1[6],ymm5[7],ymm1[7],ymm5[16],ymm1[16],ymm5[17],ymm1[17],ymm5[18],ymm1[18],ymm5[19],ymm1[19],ymm5[20],ymm1[20],ymm5[21],ymm1[21],ymm5[22],ymm1[22],ymm5[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [128,0,0,0,128,0,0,0,128,0,0,0,0,0,128,0]
-; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vpackuswb %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT: vpaddb %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15],ymm4[24],ymm1[24],ymm4[25],ymm1[25],ymm4[26],ymm1[26],ymm4[27],ymm1[27],ymm4[28],ymm1[28],ymm4[29],ymm1[29],ymm4[30],ymm1[30],ymm4[31],ymm1[31]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 # [32,256,16,64,128,16,16,32,16,256,8,8,8,32,16,64]
-; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23]
-; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [64,256,128,32,32,32,64,64,16,16,64,32,128,256,16,16]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,128,0,0,0,0,0,128,0,0,0,128,0,128,0,0]
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpackuswb %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [256,32,256,32,32,64,32,16,256,16,8,128,8,16,32,16]
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,64,64,128,32,64,256,32,16,32,16,64,16,16,256,128]
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0]
-; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0]
+; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38]
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
@@ -622,22 +459,18 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [137,16,241,57,27,205,135,187,9,8,249,241,235,57,111,27,137,27,37,19,79,41,171,101,147,79,171,117,205,57,32,37]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 # [17,127,15,113,107,51,97,185,9,127,247,15,233,113,55,107,17,9,145,151,39,163,85,177,145,20,85,185,51,113,127,37]
; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3
-; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [37,32,57,205,117,171,79,147,101,171,41,79,19,37,27,137,27,111,57,235,241,249,8,9,187,135,205,27,57,241,16,137]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [37,127,113,51,185,85,20,145,177,85,163,39,151,145,9,17,107,55,113,233,15,247,127,9,185,97,51,107,113,15,127,17]
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm3
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm3[8],zmm1[8],zmm3[9],zmm1[9],zmm3[10],zmm1[10],zmm3[11],zmm1[11],zmm3[12],zmm1[12],zmm3[13],zmm1[13],zmm3[14],zmm1[14],zmm3[15],zmm1[15],zmm3[24],zmm1[24],zmm3[25],zmm1[25],zmm3[26],zmm1[26],zmm3[27],zmm1[27],zmm3[28],zmm1[28],zmm3[29],zmm1[29],zmm3[30],zmm1[30],zmm3[31],zmm1[31],zmm3[40],zmm1[40],zmm3[41],zmm1[41],zmm3[42],zmm1[42],zmm3[43],zmm1[43],zmm3[44],zmm1[44],zmm3[45],zmm1[45],zmm3[46],zmm1[46],zmm3[47],zmm1[47],zmm3[56],zmm1[56],zmm3[57],zmm1[57],zmm3[58],zmm1[58],zmm3[59],zmm1[59],zmm3[60],zmm1[60],zmm3[61],zmm1[61],zmm3[62],zmm1[62],zmm3[63],zmm1[63]
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm4 # [0,0,0,0,0,0,128,0,128,0,0,0,0,0,0,0,0,128,0,0,0,0,0,128,0,0,0,128,0,0,0,128]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm4 # [0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,128,0,128,0,0,0,128,0,0,0,0,0,128,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm4, %zmm4
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0],zmm1[0],zmm3[1],zmm1[1],zmm3[2],zmm1[2],zmm3[3],zmm1[3],zmm3[4],zmm1[4],zmm3[5],zmm1[5],zmm3[6],zmm1[6],zmm3[7],zmm1[7],zmm3[16],zmm1[16],zmm3[17],zmm1[17],zmm3[18],zmm1[18],zmm3[19],zmm1[19],zmm3[20],zmm1[20],zmm3[21],zmm1[21],zmm3[22],zmm1[22],zmm3[23],zmm1[23],zmm3[32],zmm1[32],zmm3[33],zmm1[33],zmm3[34],zmm1[34],zmm3[35],zmm1[35],zmm3[36],zmm1[36],zmm3[37],zmm1[37],zmm3[38],zmm1[38],zmm3[39],zmm1[39],zmm3[48],zmm1[48],zmm3[49],zmm1[49],zmm3[50],zmm1[50],zmm3[51],zmm1[51],zmm3[52],zmm1[52],zmm3[53],zmm1[53],zmm3[54],zmm1[54],zmm3[55],zmm1[55]
-; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [128,0,0,0,128,0,0,0,128,0,0,0,0,0,128,0,0,0,0,0,0,0,0,128,0,128,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 # [0,128,0,0,0,0,0,128,0,0,0,128,0,128,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0]
; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
; AVX512BW-NEXT: vpackuswb %zmm4, %zmm3, %zmm3
; AVX512BW-NEXT: vpaddb %zmm2, %zmm3, %zmm2
diff --git a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll
index 10a840218c864..27c7cd0c9be97 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll
@@ -7,17 +7,12 @@ define void @test_udiv7_v2i32(ptr %x, ptr %y) nounwind {
; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X64-NEXT: pmuludq %xmm1, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; X64-NEXT: pmuludq %xmm1, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; X64-NEXT: pmuludq %xmm1, %xmm3
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-NEXT: psubd %xmm2, %xmm0
-; X64-NEXT: psrld $1, %xmm0
-; X64-NEXT: paddd %xmm2, %xmm0
-; X64-NEXT: psrld $2, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: movq %xmm0, (%rsi)
; X64-NEXT: retq
;
@@ -30,16 +25,11 @@ define void @test_udiv7_v2i32(ptr %x, ptr %y) nounwind {
; X86-NEXT: movdqa %xmm0, %xmm2
; X86-NEXT: pmuludq %xmm1, %xmm2
; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X86-NEXT: movdqa %xmm0, %xmm3
-; X86-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
-; X86-NEXT: pmuludq %xmm1, %xmm3
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-NEXT: psubd %xmm2, %xmm0
-; X86-NEXT: psrld $1, %xmm0
-; X86-NEXT: paddd %xmm2, %xmm0
-; X86-NEXT: psrld $2, %xmm0
-; X86-NEXT: movq %xmm0, (%eax)
+; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT: pmuludq %xmm1, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X86-NEXT: movq %xmm2, (%eax)
; X86-NEXT: retl
%a = load <2 x i32>, ptr %x
%b = udiv <2 x i32> %a, <i32 7, i32 7>
@@ -59,16 +49,11 @@ define void @test_urem7_v2i32(ptr %x, ptr %y) nounwind {
; X64-NEXT: pmuludq %xmm1, %xmm3
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-NEXT: movdqa %xmm0, %xmm1
-; X64-NEXT: psubd %xmm2, %xmm1
-; X64-NEXT: psrld $1, %xmm1
-; X64-NEXT: paddd %xmm2, %xmm1
-; X64-NEXT: psrld $2, %xmm1
-; X64-NEXT: movdqa %xmm1, %xmm2
-; X64-NEXT: pslld $3, %xmm2
-; X64-NEXT: psubd %xmm2, %xmm1
-; X64-NEXT: paddd %xmm0, %xmm1
-; X64-NEXT: movq %xmm1, (%rsi)
+; X64-NEXT: movdqa %xmm2, %xmm1
+; X64-NEXT: pslld $3, %xmm1
+; X64-NEXT: psubd %xmm1, %xmm2
+; X64-NEXT: paddd %xmm0, %xmm2
+; X64-NEXT: movq %xmm2, (%rsi)
; X64-NEXT: retq
;
; X86-LABEL: test_urem7_v2i32:
@@ -85,16 +70,11 @@ define void @test_urem7_v2i32(ptr %x, ptr %y) nounwind {
; X86-NEXT: pmuludq %xmm1, %xmm3
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-NEXT: movdqa %xmm0, %xmm1
-; X86-NEXT: psubd %xmm2, %xmm1
-; X86-NEXT: psrld $1, %xmm1
-; X86-NEXT: paddd %xmm2, %xmm1
-; X86-NEXT: psrld $2, %xmm1
-; X86-NEXT: movdqa %xmm1, %xmm2
-; X86-NEXT: pslld $3, %xmm2
-; X86-NEXT: psubd %xmm2, %xmm1
-; X86-NEXT: paddd %xmm0, %xmm1
-; X86-NEXT: movq %xmm1, (%eax)
+; X86-NEXT: movdqa %xmm2, %xmm1
+; X86-NEXT: pslld $3, %xmm1
+; X86-NEXT: psubd %xmm1, %xmm2
+; X86-NEXT: paddd %xmm0, %xmm2
+; X86-NEXT: movq %xmm2, (%eax)
; X86-NEXT: retl
%a = load <2 x i32>, ptr %x
%b = urem <2 x i32> %a, <i32 7, i32 7>
diff --git a/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll b/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll
index 6007c4f0b0231..f12f525fd5995 100644
--- a/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll
@@ -8,21 +8,19 @@
define <8 x i8> @vshli_target_constant(<8 x i16> %arg, <8 x i32> %arg1) {
; CHECK-LABEL: vshli_target_constant:
; CHECK: # %bb.0: # %bb
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2863311531,2863311531,2863311531,2863311531]
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [1431655765,1431655765,1431655765,1431655765]
; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; CHECK-NEXT: pmuludq %xmm0, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; CHECK-NEXT: pmuludq %xmm0, %xmm3
; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; CHECK-NEXT: psrld $1, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; CHECK-NEXT: pmuludq %xmm0, %xmm2
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; CHECK-NEXT: pmuludq %xmm0, %xmm3
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; CHECK-NEXT: psrld $1, %xmm2
; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; CHECK-NEXT: pand %xmm3, %xmm2
; CHECK-NEXT: pand %xmm3, %xmm1
diff --git a/llvm/test/CodeGen/X86/x86_64-mul-by-const.ll b/llvm/test/CodeGen/X86/x86_64-mul-by-const.ll
index df48a29156caa..5cb0e7e08ea6d 100644
--- a/llvm/test/CodeGen/X86/x86_64-mul-by-const.ll
+++ b/llvm/test/CodeGen/X86/x86_64-mul-by-const.ll
@@ -1,9 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
; Formerly there were two shifts. rdar://8771012.
define i32 @f9188_mul365384439_shift27(i32 %A) nounwind {
-; CHECK: imulq $365384439,
-; CHECK: shrq $59, %rax
+; CHECK-LABEL: f9188_mul365384439_shift27:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: imulq $1461537755, %rax, %rax ## imm = 0x571D4BDB
+; CHECK-NEXT: shrq $61, %rax
+; CHECK-NEXT: ## kill: def $eax killed $eax killed $rax
+; CHECK-NEXT: retq
%tmp1 = udiv i32 %A, 1577682821 ; <i32> [#uses=1]
ret i32 %tmp1
}
diff --git a/llvm/unittests/Support/DivisionByConstantTest.cpp b/llvm/unittests/Support/DivisionByConstantTest.cpp
index 2b17f98bb75b2..260899d92c8fc 100644
--- a/llvm/unittests/Support/DivisionByConstantTest.cpp
+++ b/llvm/unittests/Support/DivisionByConstantTest.cpp
@@ -96,8 +96,7 @@ APInt MULHU(APInt X, APInt Y) {
}
APInt UnsignedDivideUsingMagic(const APInt &Numerator, const APInt &Divisor,
- bool LZOptimization,
- bool AllowEvenDivisorOptimization, bool ForceNPQ,
+ bool LZOptimization, bool ForceNPQ,
UnsignedDivisionByConstantInfo Magics) {
assert(!Divisor.isOne() && "Division by 1 is not supported using Magic.");
@@ -108,8 +107,7 @@ APInt UnsignedDivideUsingMagic(const APInt &Numerator, const APInt &Divisor,
// Clip to the number of leading zeros in the divisor.
LeadingZeros = std::min(LeadingZeros, Divisor.countl_zero());
if (LeadingZeros > 0) {
- Magics = UnsignedDivisionByConstantInfo::get(
- Divisor, LeadingZeros, AllowEvenDivisorOptimization);
+ Magics = UnsignedDivisionByConstantInfo::get(Divisor, LeadingZeros);
assert(!Magics.IsAdd && "Should use cheap fixup now");
}
}
@@ -166,21 +164,17 @@ TEST(UnsignedDivisionByConstantTest, Test) {
EnumerateAPInts(Bits, [Divisor, Magics, Bits](const APInt &Numerator) {
APInt NativeResult = Numerator.udiv(Divisor);
for (bool LZOptimization : {true, false}) {
- for (bool AllowEvenDivisorOptimization : {true, false}) {
- for (bool ForceNPQ : {false, true}) {
- APInt MagicResult = UnsignedDivideUsingMagic(
- Numerator, Divisor, LZOptimization,
- AllowEvenDivisorOptimization, ForceNPQ, Magics);
- ASSERT_EQ(MagicResult, NativeResult)
- << " ... given the operation: urem i" << Bits << " "
- << Numerator << ", " << Divisor
- << " (allow LZ optimization = "
- << LZOptimization << ", allow even divisior optimization = "
- << AllowEvenDivisorOptimization << ", force NPQ = "
- << ForceNPQ << ")";
- }
+ for (bool ForceNPQ : {false, true}) {
+ APInt MagicResult = UnsignedDivideUsingMagic(
+ Numerator, Divisor, LZOptimization, ForceNPQ, Magics);
+ ASSERT_EQ(MagicResult, NativeResult)
+ << " ... given the operation: urem i" << Bits << " "
+ << Numerator << ", " << Divisor
+ << " (allow LZ optimization = " << LZOptimization
+ << ", force NPQ = " << ForceNPQ << ")";
}
}
+ }
});
});
}
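
One standard way to make a floor-based (round-down) multiplier exact is to
increment the dividend before the widening multiply; for a non-power-of-two
divisor D it is valid whenever 2^(N+k) mod D <= 2^k. Below is a minimal
standalone sketch of that evaluation for 32-bit operands. The helper name,
the exact 64-bit increment, and the example pair for D = 7 are illustrative
only and are not taken from this patch.

#include <cassert>
#include <cstdint>

// floor(N / D) via a round-down pair (Magic, PostShift), where
// Magic = floor(2^(32 + PostShift) / D) and
// 1 <= 2^(32 + PostShift) mod D <= 2^PostShift.  The increment is done in
// 64 bits, so the N == UINT32_MAX wrap-around that a real lowering must
// handle (e.g. with a saturating increment) never arises here.
static uint32_t udivRoundDown(uint32_t N, uint32_t Magic, unsigned PostShift) {
  uint64_t Prod = ((uint64_t)N + 1) * Magic; // < 2^64 because Magic < 2^32
  return (uint32_t)(Prod >> (32 + PostShift));
}

int main() {
  // Illustrative pair for D = 7: Magic = floor(2^33 / 7) = 1227133513,
  // PostShift = 1, and 2^33 mod 7 = 1 <= 2, so the condition holds.
  for (uint64_t N = 0; N <= UINT32_MAX; N += 65537)
    assert(udivRoundDown((uint32_t)N, 1227133513u, 1) == (uint32_t)N / 7);
  return 0;
}
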
>From f81e0bb822b534bcb4758a9684dfae6750f241e1 Mon Sep 17 00:00:00 2001
From: Rose <gfunni234 at gmail.com>
Date: Fri, 19 Jul 2024 13:02:57 -0400
Subject: [PATCH 2/3] Fallback?
---
llvm/lib/Support/DivisionByConstantInfo.cpp | 86 +++++++++++++++++++++
1 file changed, 86 insertions(+)
diff --git a/llvm/lib/Support/DivisionByConstantInfo.cpp b/llvm/lib/Support/DivisionByConstantInfo.cpp
index 3af537cc39686..18576b66d9936 100644
--- a/llvm/lib/Support/DivisionByConstantInfo.cpp
+++ b/llvm/lib/Support/DivisionByConstantInfo.cpp
@@ -70,11 +70,96 @@ SignedDivisionByConstantInfo SignedDivisionByConstantInfo::get(const APInt &D) {
/// S. Warren, Jr., chapter 10.
/// LeadingZeros can be used to simplify the calculation if the upper bits
/// of the divided value are known zero.
+
+static UnsignedDivisionByConstantInfo get2(const APInt &D, unsigned LeadingZeros) {
+ assert(!D.isZero() && !D.isOne() && "Precondition violation.");
+ assert(D.getBitWidth() > 1 && "Does not work at smaller bitwidths.");
+
+ APInt Delta;
+ struct UnsignedDivisionByConstantInfo Retval;
+ Retval.IsAdd = false; // initialize "add" indicator
+ APInt AllOnes =
+ APInt::getLowBitsSet(D.getBitWidth(), D.getBitWidth() - LeadingZeros);
+ APInt SignedMin = APInt::getSignedMinValue(D.getBitWidth());
+ APInt SignedMax = APInt::getSignedMaxValue(D.getBitWidth());
+
+ // Calculate NC, the largest dividend such that NC.urem(D) == D-1.
+ APInt NC = AllOnes - (AllOnes + 1 - D).urem(D);
+ assert(NC.urem(D) == D - 1 && "Unexpected NC value");
+ unsigned P = D.getBitWidth() - 1; // initialize P
+ APInt Q1, R1, Q2, R2;
+ // initialize Q1 = 2P/NC; R1 = rem(2P,NC)
+ APInt::udivrem(SignedMin, NC, Q1, R1);
+ // initialize Q2 = (2P-1)/D; R2 = rem((2P-1),D)
+ APInt::udivrem(SignedMax, D, Q2, R2);
+ do {
+ P = P + 1;
+ if (R1.uge(NC - R1)) {
+ // update Q1
+ Q1 <<= 1;
+ ++Q1;
+ // update R1
+ R1 <<= 1;
+ R1 -= NC;
+ } else {
+ Q1 <<= 1; // update Q1
+ R1 <<= 1; // update R1
+ }
+ if ((R2 + 1).uge(D - R2)) {
+ if (Q2.uge(SignedMax))
+ Retval.IsAdd = true;
+ // update Q2
+ Q2 <<= 1;
+ ++Q2;
+ // update R2
+ R2 <<= 1;
+ ++R2;
+ R2 -= D;
+ } else {
+ if (Q2.uge(SignedMin))
+ Retval.IsAdd = true;
+ // update Q2
+ Q2 <<= 1;
+ // update R2
+ R2 <<= 1;
+ ++R2;
+ }
+ // Delta = D - 1 - R2
+ Delta = D;
+ --Delta;
+ Delta -= R2;
+ } while (P < D.getBitWidth() * 2 &&
+ (Q1.ult(Delta) || (Q1 == Delta && R1.isZero())));
+
+ if (Retval.IsAdd && !D[0]) {
+ unsigned PreShift = D.countr_zero();
+ APInt ShiftedD = D.lshr(PreShift);
+ Retval =
+ UnsignedDivisionByConstantInfo::get(ShiftedD, LeadingZeros + PreShift);
+ assert(Retval.IsAdd == 0 && Retval.PreShift == 0);
+ Retval.PreShift = PreShift;
+ return Retval;
+ }
+
+ Retval.Magic = std::move(Q2); // resulting magic number
+ ++Retval.Magic;
+ Retval.PostShift = P - D.getBitWidth(); // resulting shift
+ // Reduce shift amount for IsAdd.
+ if (Retval.IsAdd) {
+ assert(Retval.PostShift > 0 && "Unexpected shift");
+ Retval.PostShift -= 1;
+ }
+ Retval.PreShift = 0;
+ return Retval;
+}
+
UnsignedDivisionByConstantInfo
UnsignedDivisionByConstantInfo::get(const APInt &D, unsigned LeadingZeros) {
assert(!D.isZero() && !D.isOne() && "Precondition violation.");
assert(D.getBitWidth() > 1 && "Does not work at smaller bitwidths.");
+ if (D.isPowerOf2())
+ return get2(D, LeadingZeros);
struct UnsignedDivisionByConstantInfo Retval;
APInt SignedMax = APInt::getSignedMaxValue(D.getBitWidth());
@@ -138,3 +223,4 @@ UnsignedDivisionByConstantInfo::get(const APInt &D, unsigned LeadingZeros) {
return Retval;
}
+
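The round-down fallback trades the NPQ fixup for an increment of the dividend: for a non-power-of-two d, whenever rem(2^(N+p), d) <= 2^p, the N-bit multiplier floor(2^(N+p)/d) satisfies floor(m*(n+1)/2^(N+p)) == floor(n/d) for every N-bit dividend n. A hand-worked sketch of that idea for d = 7 at 32 bits (an illustration of the approach from the linked blog post, not the code added in this patch):

#include <cassert>
#include <cstdint>

static uint32_t udiv7_round_down(uint32_t N) {
  // m = floor(2^33 / 7) = 0x49249249 fits in 32 bits because
  // rem(2^33, 7) = 1 <= 2^1; the price is dividing N+1 instead of N.
  // Widening to 64 bits sidesteps the N+1 overflow that a pure 32-bit
  // expansion would have to handle (e.g. with a saturating increment).
  const uint64_t M = 0x49249249u;
  return (uint32_t)((M * ((uint64_t)N + 1)) >> 33);
}

int main() {
  for (uint64_t N = 0; N < (1ull << 32); N += 12345)
    assert(udiv7_round_down((uint32_t)N) == N / 7);
  assert(udiv7_round_down(0xFFFFFFFFu) == 0xFFFFFFFFu / 7); // increment edge case
}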
From b401c547d4442915022c39a041b49f6977a4fa72 Mon Sep 17 00:00:00 2001
From: Rose <gfunni234 at gmail.com>
Date: Fri, 19 Jul 2024 13:04:51 -0400
Subject: [PATCH 3/3] f
---
llvm/lib/Support/DivisionByConstantInfo.cpp | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Support/DivisionByConstantInfo.cpp b/llvm/lib/Support/DivisionByConstantInfo.cpp
index 18576b66d9936..6290cb91cbfe8 100644
--- a/llvm/lib/Support/DivisionByConstantInfo.cpp
+++ b/llvm/lib/Support/DivisionByConstantInfo.cpp
@@ -168,9 +168,9 @@ UnsignedDivisionByConstantInfo::get(const APInt &D, unsigned LeadingZeros) {
// initialize Q = (2P-1)/D; R2 = rem((2P-1),D)
APInt::udivrem(SignedMax, D, Q2, R2);
- APInt down_multiplier = APInt::getZero(D.getBitWidth());
- unsigned down_exponent = 0;
- bool hasMagicDown = false;
+ APInt MultiplierRoundDown = APInt::getZero(D.getBitWidth());
+ unsigned ExponentRoundDown = 0;
+ bool HasMagicDown = false;
unsigned Log2D = D.ceilLogBase2();
unsigned Exponent = 0;
@@ -193,10 +193,10 @@ UnsignedDivisionByConstantInfo::get(const APInt &D, unsigned LeadingZeros) {
// Set magic_down if we have not set it yet and this exponent works for the
// round_down algorithm
- if (!hasMagicDown && R2.ule(Ule)) {
- hasMagicDown = true;
- down_multiplier = Q2;
- down_exponent = Exponent;
+ if (!HasMagicDown && R2.ule(Ule)) {
+ HasMagicDown = true;
+ MultiplierRoundDown = Q2;
+ ExponentRoundDown = Exponent;
}
}
@@ -208,9 +208,9 @@ UnsignedDivisionByConstantInfo::get(const APInt &D, unsigned LeadingZeros) {
Retval.IsAdd = false;
} else if (!D[0]) {
//
- Retval.Magic = down_multiplier;
+ Retval.Magic = MultiplierRoundDown;
Retval.PreShift = 0;
- Retval.PostShift = down_exponent;
+ Retval.PostShift = ExponentRoundDown;
Retval.IsAdd = true;
} else {
unsigned PreShift = D.countr_zero();
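With the renamed fields the selection logic reads naturally: the loop records the first exponent whose remainder passes the round-down test, together with the corresponding multiplier. A schematic 32-bit version of just that search, assuming the rule above (exponent P is usable once rem(2^(32+P), D) <= 2^P, keeping 2^P < D so the multiplier stays within 32 bits) and using plain 64-bit arithmetic in place of APInt, purely as an illustration:

#include <cassert>
#include <cstdint>

struct RoundDownMagic {
  bool Found = false;
  uint32_t Multiplier = 0;
  unsigned Exponent = 0;
};

// Find the smallest exponent for which the round-down multiplier is exact
// and still fits in 32 bits. Illustration only; not the patch's code.
static RoundDownMagic roundDownMagic32(uint32_t D) {
  assert(D > 1 && (D & (D - 1)) != 0 && "expects a non-power-of-two divisor");
  for (unsigned P = 0; (1ull << P) < D; ++P) {
    uint64_t Pow = 1ull << (32 + P); // 2^(32+P); P < 32, so this fits
    if (Pow % D <= (1ull << P))
      return {true, (uint32_t)(Pow / D), P};
  }
  return {};
}

int main() {
  RoundDownMagic M7 = roundDownMagic32(7);
  assert(M7.Found && M7.Multiplier == 0x49249249u && M7.Exponent == 1);
  // Spot-check the quotients it produces for another divisor.
  RoundDownMagic M19 = roundDownMagic32(19);
  for (uint64_t N = 0; M19.Found && N < (1ull << 32); N += 99991)
    assert((uint32_t)(((uint64_t)M19.Multiplier * (N + 1)) >>
                      (32 + M19.Exponent)) == N / 19);
}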