[llvm] [CodeGen] Use round-down algorithm for uncooperative constants (PR #99666)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 19 10:31:05 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-globalisel
Author: AtariDreams (AtariDreams)
<details>
<summary>Changes</summary>
This teaches UnsignedDivisionByConstantInfo to use a round-down magic-number algorithm for uncooperative constants (divisors whose round-up multiplier needs one bit more than the word size), dropping the AllowEvenDivisorOptimization parameter. It is inspired by https://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html
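For readers who want the idea without reading the full blog post, here is a minimal, self-contained sketch of the two strategies, written from the blog's description rather than from this patch; the struct, function names, and 32-bit word size are my own illustration, not the in-tree API.

```cpp
#include <cassert>
#include <cstdint>

struct Magic {
  uint32_t Multiplier;
  unsigned Shift;
  bool RoundDown; // true -> the caller must evaluate (n + 1) * Multiplier
};

// Find a 32-bit multiplier m and shift k so that for every 32-bit n:
//   n / d == (n * m) >> (32 + k)        (round-up, "cooperative" d), or
//   n / d == ((n + 1) * m) >> (32 + k)  (round-down, "uncooperative" d).
Magic findMagic(uint32_t d) {
  assert(d > 1 && "0 and 1 are excluded, as in DivisionByConstantInfo");
  for (unsigned k = 0; k < 32; ++k) {
    uint64_t p2 = uint64_t(1) << (32 + k);
    uint64_t up = (p2 + d - 1) / d; // ceil(2^(32+k) / d)
    uint64_t dn = p2 / d;           // floor(2^(32+k) / d)
    // The round-up multiplier is exact whenever its error m*d - 2^(32+k)
    // is at most 2^k and m still fits in 32 bits.
    if (up <= UINT32_MAX && up * d - p2 <= (uint64_t(1) << k))
      return {uint32_t(up), k, false};
    // Otherwise try the round-down multiplier; its error 2^(32+k) - m*d
    // must be at most 2^k, and the caller compensates by adding 1 to n.
    if (dn <= UINT32_MAX && p2 - dn * d <= (uint64_t(1) << k))
      return {uint32_t(dn), k, true};
  }
  assert(false && "every d > 1 is handled by one of the two strategies");
  return {};
}

uint32_t divideByMagic(uint32_t n, Magic m) {
  uint64_t x = n;
  if (m.RoundDown)
    ++x; // the 64-bit intermediate keeps n + 1 from overflowing
  return uint32_t((x * m.Multiplier) >> 32 >> m.Shift);
}
```

In real lowering the literal n + 1 is avoided, since it can overflow the dividend's width; the compensation is instead encoded through the IsAdd/PreShift/PostShift fields that UnsignedDivisionByConstantInfo already exposes.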
---
Patch is 626.24 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/99666.diff
73 Files Affected:
- (modified) llvm/include/llvm/Support/DivisionByConstantInfo.h (+2-3)
- (modified) llvm/lib/Support/DivisionByConstantInfo.cpp (+74-13)
- (modified) llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll (+58-80)
- (modified) llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir (+89-102)
- (modified) llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir (+2-4)
- (modified) llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir (+4-30)
- (modified) llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll (+9-11)
- (modified) llvm/test/CodeGen/AArch64/rotate-extract.ll (+6-9)
- (modified) llvm/test/CodeGen/AArch64/sve-expand-div.ll (+4-13)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll (+441-503)
- (modified) llvm/test/CodeGen/AArch64/urem-lkk.ll (+16-22)
- (modified) llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll (+7-6)
- (modified) llvm/test/CodeGen/AArch64/urem-vector-lkk.ll (+55-80)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll (+5-14)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll (+8-28)
- (modified) llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/udiv.ll (+58-87)
- (modified) llvm/test/CodeGen/AMDGPU/urem.ll (-1)
- (modified) llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll (+3-3)
- (modified) llvm/test/CodeGen/PowerPC/urem-lkk.ll (+5-11)
- (modified) llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll (+103-119)
- (modified) llvm/test/CodeGen/RISCV/div-by-constant.ll (+113-128)
- (modified) llvm/test/CodeGen/RISCV/div.ll (+30-32)
- (modified) llvm/test/CodeGen/RISCV/pr51206.ll (+2-3)
- (modified) llvm/test/CodeGen/RISCV/rv64-legal-i32/div.ll (+9-11)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll (+6-12)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll (+10-16)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll (+139-232)
- (modified) llvm/test/CodeGen/RISCV/rvv/vdivu-sdnode.ll (+76-80)
- (modified) llvm/test/CodeGen/RISCV/rvv/vremu-sdnode.ll (+75-75)
- (modified) llvm/test/CodeGen/RISCV/select.ll (+25-24)
- (modified) llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll (+209-192)
- (modified) llvm/test/CodeGen/RISCV/split-urem-by-constant.ll (+62-80)
- (modified) llvm/test/CodeGen/RISCV/urem-lkk.ll (+16-29)
- (modified) llvm/test/CodeGen/RISCV/urem-vector-lkk.ll (+13-17)
- (modified) llvm/test/CodeGen/SystemZ/int-div-06.ll (+36-26)
- (modified) llvm/test/CodeGen/SystemZ/int-mul-13.ll (+3-3)
- (modified) llvm/test/CodeGen/Thumb2/mve-blockplacement.ll (+39-45)
- (modified) llvm/test/CodeGen/Thumb2/thumb2-select.ll (+90-29)
- (modified) llvm/test/CodeGen/VE/Scalar/div.ll (+117-9)
- (modified) llvm/test/CodeGen/VE/Scalar/rem.ll (+117-9)
- (modified) llvm/test/CodeGen/VE/Vector/vec_divrem.ll (+4-30)
- (modified) llvm/test/CodeGen/X86/and-encoding.ll (+4-5)
- (modified) llvm/test/CodeGen/X86/atomic-unordered.ll (+5-10)
- (modified) llvm/test/CodeGen/X86/bug80500.ll (+1-2)
- (modified) llvm/test/CodeGen/X86/combine-pmuldq.ll (+40-60)
- (modified) llvm/test/CodeGen/X86/combine-udiv.ll (+80-123)
- (modified) llvm/test/CodeGen/X86/divide-by-constant.ll (+134-166)
- (modified) llvm/test/CodeGen/X86/divmod128.ll (+108-132)
- (modified) llvm/test/CodeGen/X86/divrem-by-select.ll (+4-5)
- (modified) llvm/test/CodeGen/X86/freeze.ll (+4-5)
- (modified) llvm/test/CodeGen/X86/known-bits.ll (+8-6)
- (modified) llvm/test/CodeGen/X86/known-pow2.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/load-scalar-as-vector.ll (+10-20)
- (modified) llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll (+6-6)
- (modified) llvm/test/CodeGen/X86/pr35636.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/pr38217.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll (+12-30)
- (modified) llvm/test/CodeGen/X86/rem.ll (+1-2)
- (modified) llvm/test/CodeGen/X86/rotate-extract-vector.ll (+11-7)
- (modified) llvm/test/CodeGen/X86/rotate-extract.ll (+15-25)
- (modified) llvm/test/CodeGen/X86/urem-i8-constant.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/urem-lkk.ll (+15-20)
- (modified) llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll (+50-40)
- (modified) llvm/test/CodeGen/X86/urem-vector-lkk.ll (+124-159)
- (modified) llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll (+183-407)
- (modified) llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll (+174-405)
- (modified) llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll (+149-328)
- (modified) llvm/test/CodeGen/X86/vector-idiv-v2i32.ll (+20-40)
- (modified) llvm/test/CodeGen/X86/vshli-simplify-demanded-bits.ll (+1-3)
- (modified) llvm/test/CodeGen/X86/x86_64-mul-by-const.ll (+8-2)
- (modified) llvm/unittests/Support/DivisionByConstantTest.cpp (+11-17)
``````````diff
diff --git a/llvm/include/llvm/Support/DivisionByConstantInfo.h b/llvm/include/llvm/Support/DivisionByConstantInfo.h
index caa0b35e71447..fb0c1382ce821 100644
--- a/llvm/include/llvm/Support/DivisionByConstantInfo.h
+++ b/llvm/include/llvm/Support/DivisionByConstantInfo.h
@@ -26,9 +26,8 @@ struct SignedDivisionByConstantInfo {
/// Magic data for optimising unsigned division by a constant.
struct UnsignedDivisionByConstantInfo {
- static UnsignedDivisionByConstantInfo
- get(const APInt &D, unsigned LeadingZeros = 0,
- bool AllowEvenDivisorOptimization = true);
+ static UnsignedDivisionByConstantInfo get(const APInt &D,
+ unsigned LeadingZeros = 0);
APInt Magic; ///< magic number
bool IsAdd; ///< add indicator
unsigned PostShift; ///< post-shift amount
diff --git a/llvm/lib/Support/DivisionByConstantInfo.cpp b/llvm/lib/Support/DivisionByConstantInfo.cpp
index b0e503003a680..9f602d8c12bd8 100644
--- a/llvm/lib/Support/DivisionByConstantInfo.cpp
+++ b/llvm/lib/Support/DivisionByConstantInfo.cpp
@@ -70,9 +70,9 @@ SignedDivisionByConstantInfo SignedDivisionByConstantInfo::get(const APInt &D) {
/// S. Warren, Jr., chapter 10.
/// LeadingZeros can be used to simplify the calculation if the upper bits
/// of the divided value are known zero.
-UnsignedDivisionByConstantInfo
-UnsignedDivisionByConstantInfo::get(const APInt &D, unsigned LeadingZeros,
- bool AllowEvenDivisorOptimization) {
+
+static UnsignedDivisionByConstantInfo get2(const APInt &D,
+ unsigned LeadingZeros) {
assert(!D.isZero() && !D.isOne() && "Precondition violation.");
assert(D.getBitWidth() > 1 && "Does not work at smaller bitwidths.");
@@ -132,16 +132,6 @@ UnsignedDivisionByConstantInfo::get(const APInt &D, unsigned LeadingZeros,
} while (P < D.getBitWidth() * 2 &&
(Q1.ult(Delta) || (Q1 == Delta && R1.isZero())));
- if (Retval.IsAdd && !D[0] && AllowEvenDivisorOptimization) {
- unsigned PreShift = D.countr_zero();
- APInt ShiftedD = D.lshr(PreShift);
- Retval =
- UnsignedDivisionByConstantInfo::get(ShiftedD, LeadingZeros + PreShift);
- assert(Retval.IsAdd == 0 && Retval.PreShift == 0);
- Retval.PreShift = PreShift;
- return Retval;
- }
-
Retval.Magic = std::move(Q2); // resulting magic number
++Retval.Magic;
Retval.PostShift = P - D.getBitWidth(); // resulting shift
@@ -153,3 +143,74 @@ UnsignedDivisionByConstantInfo::get(const APInt &D, unsigned LeadingZeros,
Retval.PreShift = 0;
return Retval;
}
+
+UnsignedDivisionByConstantInfo
+UnsignedDivisionByConstantInfo::get(const APInt &D, unsigned LeadingZeros) {
+ assert(!D.isZero() && !D.isOne() && "Precondition violation.");
+ assert(D.getBitWidth() > 1 && "Does not work at smaller bitwidths.");
+
+ if (D.isPowerOf2())
+ return get2(D, LeadingZeros);
+ struct UnsignedDivisionByConstantInfo Retval;
+ APInt SignedMax = APInt::getSignedMaxValue(D.getBitWidth());
+
+ // Calculate NC, the largest dividend such that NC.urem(D) == D-1.
+ APInt Q2, R2;
+ // initialize Q = (2P-1)/D; R2 = rem((2P-1),D)
+ APInt::udivrem(SignedMax, D, Q2, R2);
+
+ APInt MultiplierRoundDown = APInt::getZero(D.getBitWidth());
+ unsigned ExponentRoundDown = 0;
+ bool HasMagicDown = false;
+
+ unsigned Log2D = D.ceilLogBase2();
+ unsigned Exponent = 0;
+
+ for (;; Exponent++) {
+ if (R2.uge(D - R2)) {
+ Q2 <<= 1;
+ ++Q2;
+ R2 <<= 1;
+ R2 -= D;
+ } else {
+ Q2 <<= 1;
+ R2 <<= 1;
+ }
+
+ APInt Ule = APInt::getOneBitSet(D.getBitWidth(), Exponent + LeadingZeros);
+
+ if (Exponent + LeadingZeros >= Log2D || (D - R2).ule(Ule))
+ break;
+
+ // Set magic_down if we have not set it yet and this exponent works for the
+ // round_down algorithm
+ if (!HasMagicDown && R2.ule(Ule)) {
+ HasMagicDown = true;
+ MultiplierRoundDown = Q2;
+ ExponentRoundDown = Exponent;
+ }
+ }
+
+ if (Exponent < Log2D) {
+ // Do the normal values
+ Retval.Magic = Q2 + 1;
+ Retval.PreShift = 0;
+ Retval.PostShift = Exponent;
+ Retval.IsAdd = false;
+ } else if (!D[0]) {
+ //
+ Retval.Magic = MultiplierRoundDown;
+ Retval.PreShift = 0;
+ Retval.PostShift = ExponentRoundDown;
+ Retval.IsAdd = true;
+ } else {
+ unsigned PreShift = D.countr_zero();
+ APInt ShiftedD = D.lshr(PreShift);
+ Retval = UnsignedDivisionByConstantInfo::get(
+ ShiftedD, D.getBitWidth() - LeadingZeros - PreShift);
+ assert(Retval.IsAdd == 0 && Retval.PreShift == 0);
+ Retval.PreShift = PreShift;
+ }
+
+ return Retval;
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
index d465e0237201b..49e96cc424c1d 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
@@ -6,14 +6,12 @@
define <8 x i16> @combine_vec_udiv_uniform(<8 x i16> %x) {
; SDAG-LABEL: combine_vec_udiv_uniform:
; SDAG: // %bb.0:
-; SDAG-NEXT: mov w8, #25645 // =0x642d
+; SDAG-NEXT: mov w8, #45589 // =0xb215
; SDAG-NEXT: dup v1.8h, w8
; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
-; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h
-; SDAG-NEXT: usra v1.8h, v0.8h, #1
-; SDAG-NEXT: ushr v0.8h, v1.8h, #4
+; SDAG-NEXT: umull v0.4s, v0.4h, v1.4h
+; SDAG-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; SDAG-NEXT: ushr v0.8h, v0.8h, #4
; SDAG-NEXT: ret
;
; GISEL-LABEL: combine_vec_udiv_uniform:
@@ -21,11 +19,9 @@ define <8 x i16> @combine_vec_udiv_uniform(<8 x i16> %x) {
; GISEL-NEXT: adrp x8, .LCPI0_0
; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
-; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; GISEL-NEXT: sub v0.8h, v0.8h, v1.8h
-; GISEL-NEXT: usra v1.8h, v0.8h, #1
-; GISEL-NEXT: ushr v0.8h, v1.8h, #4
+; GISEL-NEXT: umull v0.4s, v0.4h, v1.4h
+; GISEL-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; GISEL-NEXT: ushr v0.8h, v0.8h, #4
; GISEL-NEXT: ret
%1 = udiv <8 x i16> %x, <i16 23, i16 23, i16 23, i16 23, i16 23, i16 23, i16 23, i16 23>
ret <8 x i16> %1
@@ -37,37 +33,30 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
; SDAG-NEXT: adrp x8, .LCPI1_0
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
; SDAG-NEXT: adrp x8, .LCPI1_1
+; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
+; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
+; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI1_1]
; SDAG-NEXT: adrp x8, .LCPI1_2
-; SDAG-NEXT: ushl v1.8h, v0.8h, v1.8h
-; SDAG-NEXT: umull2 v3.4s, v1.8h, v2.8h
-; SDAG-NEXT: umull v1.4s, v1.4h, v2.4h
-; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI1_2]
-; SDAG-NEXT: adrp x8, .LCPI1_3
-; SDAG-NEXT: uzp2 v1.8h, v1.8h, v3.8h
; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h
; SDAG-NEXT: umull2 v3.4s, v0.8h, v2.8h
; SDAG-NEXT: umull v0.4s, v0.4h, v2.4h
; SDAG-NEXT: uzp2 v0.8h, v0.8h, v3.8h
; SDAG-NEXT: add v0.8h, v0.8h, v1.8h
-; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI1_3]
+; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI1_2]
; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h
; SDAG-NEXT: ret
;
; GISEL-LABEL: combine_vec_udiv_nonuniform:
; GISEL: // %bb.0:
-; GISEL-NEXT: adrp x8, .LCPI1_3
-; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI1_3]
; GISEL-NEXT: adrp x8, .LCPI1_2
-; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI1_2]
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI1_2]
; GISEL-NEXT: adrp x8, .LCPI1_1
-; GISEL-NEXT: neg v1.8h, v1.8h
-; GISEL-NEXT: ushl v1.8h, v0.8h, v1.8h
-; GISEL-NEXT: umull2 v3.4s, v1.8h, v2.8h
-; GISEL-NEXT: umull v1.4s, v1.4h, v2.4h
+; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
+; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
+; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI1_1]
; GISEL-NEXT: adrp x8, .LCPI1_0
-; GISEL-NEXT: uzp2 v1.8h, v1.8h, v3.8h
; GISEL-NEXT: sub v0.8h, v0.8h, v1.8h
; GISEL-NEXT: umull2 v3.4s, v0.8h, v2.8h
; GISEL-NEXT: umull v0.4s, v0.4h, v2.4h
@@ -87,13 +76,17 @@ define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
; SDAG-NEXT: adrp x8, .LCPI2_0
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
; SDAG-NEXT: adrp x8, .LCPI2_1
-; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h
-; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI2_1]
-; SDAG-NEXT: adrp x8, .LCPI2_2
; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; SDAG-NEXT: umull v0.4s, v0.4h, v1.4h
+; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
+; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
+; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI2_1]
+; SDAG-NEXT: adrp x8, .LCPI2_2
+; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h
+; SDAG-NEXT: umull2 v3.4s, v0.8h, v2.8h
+; SDAG-NEXT: umull v0.4s, v0.4h, v2.4h
+; SDAG-NEXT: uzp2 v0.8h, v0.8h, v3.8h
+; SDAG-NEXT: add v0.8h, v0.8h, v1.8h
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI2_2]
-; SDAG-NEXT: uzp2 v0.8h, v0.8h, v2.8h
; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h
; SDAG-NEXT: ret
;
@@ -102,15 +95,18 @@ define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
; GISEL-NEXT: adrp x8, .LCPI2_2
; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI2_2]
; GISEL-NEXT: adrp x8, .LCPI2_1
-; GISEL-NEXT: neg v1.8h, v1.8h
-; GISEL-NEXT: ushl v0.8h, v0.8h, v1.8h
-; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI2_1]
-; GISEL-NEXT: adrp x8, .LCPI2_0
; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; GISEL-NEXT: umull v0.4s, v0.4h, v1.4h
-; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
-; GISEL-NEXT: neg v1.8h, v1.8h
-; GISEL-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
+; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
+; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI2_1]
+; GISEL-NEXT: adrp x8, .LCPI2_0
+; GISEL-NEXT: sub v0.8h, v0.8h, v1.8h
+; GISEL-NEXT: umull2 v3.4s, v0.8h, v2.8h
+; GISEL-NEXT: umull v0.4s, v0.4h, v2.4h
+; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI2_0]
+; GISEL-NEXT: uzp2 v0.8h, v0.8h, v3.8h
+; GISEL-NEXT: add v0.8h, v0.8h, v1.8h
+; GISEL-NEXT: neg v1.8h, v2.8h
; GISEL-NEXT: ushl v0.8h, v0.8h, v1.8h
; GISEL-NEXT: ret
%1 = udiv <8 x i16> %x, <i16 -34, i16 35, i16 36, i16 -37, i16 38, i16 -39, i16 40, i16 -41>
@@ -124,12 +120,10 @@ define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
; SDAG-NEXT: adrp x8, .LCPI3_1
; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
-; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h
-; SDAG-NEXT: usra v1.8h, v0.8h, #1
-; SDAG-NEXT: ldr q0, [x8, :lo12:.LCPI3_1]
-; SDAG-NEXT: ushl v0.8h, v1.8h, v0.8h
+; SDAG-NEXT: umull v0.4s, v0.4h, v1.4h
+; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI3_1]
+; SDAG-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h
; SDAG-NEXT: ret
;
; GISEL-LABEL: combine_vec_udiv_nonuniform3:
@@ -138,13 +132,11 @@ define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI3_1]
; GISEL-NEXT: adrp x8, .LCPI3_0
; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
-; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI3_0]
-; GISEL-NEXT: sub v0.8h, v0.8h, v1.8h
-; GISEL-NEXT: usra v1.8h, v0.8h, #1
-; GISEL-NEXT: neg v0.8h, v2.8h
-; GISEL-NEXT: ushl v0.8h, v1.8h, v0.8h
+; GISEL-NEXT: umull v0.4s, v0.4h, v1.4h
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
+; GISEL-NEXT: neg v1.8h, v1.8h
+; GISEL-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; GISEL-NEXT: ushl v0.8h, v0.8h, v1.8h
; GISEL-NEXT: ret
%1 = udiv <8 x i16> %x, <i16 7, i16 23, i16 25, i16 27, i16 31, i16 47, i16 63, i16 127>
ret <8 x i16> %1
@@ -153,7 +145,7 @@ define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SDAG-LABEL: combine_vec_udiv_nonuniform4:
; SDAG: // %bb.0:
-; SDAG-NEXT: movi v1.16b, #171
+; SDAG-NEXT: movi v1.16b, #85
; SDAG-NEXT: adrp x8, .LCPI4_0
; SDAG-NEXT: adrp x9, .LCPI4_1
; SDAG-NEXT: ldr q3, [x9, :lo12:.LCPI4_1]
@@ -162,7 +154,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SDAG-NEXT: and v0.16b, v0.16b, v3.16b
; SDAG-NEXT: uzp2 v1.16b, v1.16b, v2.16b
; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI4_0]
-; SDAG-NEXT: ushr v1.16b, v1.16b, #7
+; SDAG-NEXT: ushr v1.16b, v1.16b, #6
; SDAG-NEXT: and v1.16b, v1.16b, v2.16b
; SDAG-NEXT: orr v0.16b, v0.16b, v1.16b
; SDAG-NEXT: ret
@@ -192,50 +184,36 @@ define <8 x i16> @pr38477(<8 x i16> %a0) {
; SDAG-LABEL: pr38477:
; SDAG: // %bb.0:
; SDAG-NEXT: adrp x8, .LCPI5_0
-; SDAG-NEXT: adrp x9, .LCPI5_4
+; SDAG-NEXT: adrp x9, .LCPI5_3
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI5_0]
; SDAG-NEXT: adrp x8, .LCPI5_1
-; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI5_1]
-; SDAG-NEXT: adrp x8, .LCPI5_2
+; SDAG-NEXT: ldr q3, [x9, :lo12:.LCPI5_3]
; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
-; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; SDAG-NEXT: sub v2.8h, v0.8h, v1.8h
-; SDAG-NEXT: umull2 v4.4s, v2.8h, v3.8h
-; SDAG-NEXT: umull v2.4s, v2.4h, v3.4h
-; SDAG-NEXT: ldr q3, [x9, :lo12:.LCPI5_4]
; SDAG-NEXT: and v0.16b, v0.16b, v3.16b
-; SDAG-NEXT: uzp2 v2.8h, v2.8h, v4.8h
-; SDAG-NEXT: add v1.8h, v2.8h, v1.8h
-; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI5_2]
-; SDAG-NEXT: adrp x8, .LCPI5_3
+; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
+; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI5_1]
+; SDAG-NEXT: adrp x8, .LCPI5_2
; SDAG-NEXT: ushl v1.8h, v1.8h, v2.8h
-; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI5_3]
+; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI5_2]
; SDAG-NEXT: and v1.16b, v1.16b, v2.16b
; SDAG-NEXT: orr v0.16b, v0.16b, v1.16b
; SDAG-NEXT: ret
;
; GISEL-LABEL: pr38477:
; GISEL: // %bb.0:
-; GISEL-NEXT: adrp x8, .LCPI5_3
-; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI5_3]
; GISEL-NEXT: adrp x8, .LCPI5_2
-; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI5_2]
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI5_2]
; GISEL-NEXT: adrp x8, .LCPI5_0
-; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
-; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
-; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
-; GISEL-NEXT: sub v2.8h, v0.8h, v1.8h
-; GISEL-NEXT: umull2 v4.4s, v2.8h, v3.8h
-; GISEL-NEXT: umull v2.4s, v2.4h, v3.4h
; GISEL-NEXT: ldr d3, [x8, :lo12:.LCPI5_0]
; GISEL-NEXT: adrp x8, .LCPI5_1
-; GISEL-NEXT: ushll v3.8h, v3.8b, #0
-; GISEL-NEXT: uzp2 v2.8h, v2.8h, v4.8h
+; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
+; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI5_1]
-; GISEL-NEXT: shl v3.8h, v3.8h, #15
-; GISEL-NEXT: add v1.8h, v2.8h, v1.8h
+; GISEL-NEXT: ushll v3.8h, v3.8b, #0
+; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
; GISEL-NEXT: neg v2.8h, v4.8h
+; GISEL-NEXT: shl v3.8h, v3.8h, #15
; GISEL-NEXT: ushl v1.8h, v1.8h, v2.8h
; GISEL-NEXT: sshr v2.8h, v3.8h, #15
; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
index f8578a694e2d4..8cd2d7e9fc17b 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir
@@ -9,13 +9,11 @@ body: |
; CHECK: liveins: $w0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 818089009
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
- ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[LSHR]], [[C1]]
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C2]](s32)
- ; CHECK-NEXT: $w0 = COPY [[LSHR1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1636178017
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+ ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[COPY]], [[C]]
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C1]](s32)
+ ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32)
%0:_(s32) = COPY $w0
%cst:_(s32) = G_CONSTANT i32 42
%2:_(s32) = G_UDIV %0(s32), %cst(s32)
@@ -35,18 +33,13 @@ body: |
; CHECK: liveins: $q0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 25645
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 -19947
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16)
; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
- ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]]
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
- ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16)
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[SUB]], [[BUILD_VECTOR2]](<8 x s16>)
- ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[LSHR]], [[UMULH]]
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR1]](<8 x s16>)
- ; CHECK-NEXT: $q0 = COPY [[LSHR1]](<8 x s16>)
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[UMULH]], [[BUILD_VECTOR1]](<8 x s16>)
+ ; CHECK-NEXT: $q0 = COPY [[LSHR]](<8 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<8 x s16>) = COPY $q0
%2:_(s16) = G_CONSTANT i16 23
@@ -72,30 +65,26 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 25645
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -19947
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
- ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -3855
- ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 5
- ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 8195
- ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 13
- ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 3
- ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 9363
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 3855
+ ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
+ ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32757
+ ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+ ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 4681
+ ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 512
- ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32767
- ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
- ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32639
- ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C8]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C4]](s16), [[C6]](s16), [[C9]](s16), [[C10]](s16), [[C11]](s16), [[C13]](s16), [[C14]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16)
- ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C5]](s16), [[C7]](s16), [[C1]](s16), [[C1]](s16), [[C12]](s16), [[C12]](s16...
[truncated]
``````````
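As an aside for readers decoding the before/after assembly: the `sub`/`usra #1` pairs that disappear in several tests implement the classic fixup for a magic number that needs one bit more than the element width. A rough scalar rendering of the old i16 divide-by-23 sequence, using the 0x642d constant visible in the old check lines (my paraphrase, not code from the patch):

```cpp
#include <cstdint>

// Scalar equivalent of the pre-patch i16 udiv-by-23 lowering:
//   umulh(n, 0x642D), then the sub/usra #1 fixup, then ushr #4.
// The fixup effectively evaluates the full 17-bit magic 0x1642D.
uint16_t udiv23_old_sequence(uint16_t n) {
  uint32_t q = (uint32_t(n) * 0x642Du) >> 16; // high half of the widening multiply
  uint32_t t = ((uint32_t(n) - q) >> 1) + q;  // sub + usra #1
  return uint16_t(t >> 4);                    // ushr #4
}
```

The new check lines drop that fixup and use a single high multiply followed by a shift.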
</details>
https://github.com/llvm/llvm-project/pull/99666
More information about the llvm-commits mailing list