[llvm] [GlobalISel] Allow expansion of srem by constant in prelegalizer (PR #148845)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 15 06:04:44 PDT 2025
https://github.com/jyli0116 created https://github.com/llvm/llvm-project/pull/148845
This patch allows srem by a constant to be expanded in the prelegalizer combiner into a multiply by a magic constant, avoiding the need for expensive sdiv instructions. This is the last of the patches that fix #118090.
>From aec14974c6bb91149f4129b8d6bb780abde7f3ed Mon Sep 17 00:00:00 2001
From: Yu Li <yu.li at arm.com>
Date: Tue, 15 Jul 2025 12:38:37 +0000
Subject: [PATCH] [GlobalISel] Allow expansion of srem by constant in
prelegalizer
---
.../llvm/CodeGen/GlobalISel/CombinerHelper.h | 23 +-
.../include/llvm/Target/GlobalISel/Combine.td | 20 +-
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 45 +-
llvm/test/CodeGen/AArch64/rem-by-const.ll | 819 ++++++++----------
.../CodeGen/AMDGPU/GlobalISel/srem.i32.ll | 158 +---
5 files changed, 449 insertions(+), 616 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 31f1197b9723b..da829046cc421 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -700,18 +700,19 @@ class CombinerHelper {
/// Given an G_UDIV \p MI or G_UREM \p MI expressing a divide by constant,
/// return an expression that implements it by multiplying by a magic number.
/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
- MachineInstr *buildUDivorURemUsingMul(MachineInstr &MI) const;
+ MachineInstr *buildUDivOrURemUsingMul(MachineInstr &MI) const;
/// Combine G_UDIV or G_UREM by constant into a multiply by magic constant.
- bool matchUDivorURemByConst(MachineInstr &MI) const;
- void applyUDivorURemByConst(MachineInstr &MI) const;
-
- /// Given an G_SDIV \p MI expressing a signed divide by constant, return an
- /// expression that implements it by multiplying by a magic number.
- /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
- MachineInstr *buildSDivUsingMul(MachineInstr &MI) const;
- /// Combine G_SDIV by constant into a multiply by magic constant.
- bool matchSDivByConst(MachineInstr &MI) const;
- void applySDivByConst(MachineInstr &MI) const;
+ bool matchUDivOrURemByConst(MachineInstr &MI) const;
+ void applyUDivOrURemByConst(MachineInstr &MI) const;
+
+ /// Given an G_SDIV \p MI or G_SREM \p MI expressing a signed divide by
+ /// constant, return an expression that implements it by multiplying by a
+ /// magic number. Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's
+ /// Guide".
+ MachineInstr *buildSDivOrSRemUsingMul(MachineInstr &MI) const;
+ /// Combine G_SDIV or G_SREM by constant into a multiply by magic constant.
+ bool matchSDivOrSRemByConst(MachineInstr &MI) const;
+ void applySDivOrSRemByConst(MachineInstr &MI) const;
/// Given an G_SDIV \p MI expressing a signed divided by a pow2 constant,
/// return expressions that implements it by shifting.
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 66051d756c808..fc81ab76dc72d 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1132,14 +1132,14 @@ def form_bitfield_extract : GICombineGroup<[bitfield_extract_from_sext_inreg,
def udiv_by_const : GICombineRule<
(defs root:$root),
(match (G_UDIV $dst, $x, $y):$root,
- [{ return Helper.matchUDivorURemByConst(*${root}); }]),
- (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>;
+ [{ return Helper.matchUDivOrURemByConst(*${root}); }]),
+ (apply [{ Helper.applyUDivOrURemByConst(*${root}); }])>;
def sdiv_by_const : GICombineRule<
(defs root:$root),
(match (G_SDIV $dst, $x, $y):$root,
- [{ return Helper.matchSDivByConst(*${root}); }]),
- (apply [{ Helper.applySDivByConst(*${root}); }])>;
+ [{ return Helper.matchSDivOrSRemByConst(*${root}); }]),
+ (apply [{ Helper.applySDivOrSRemByConst(*${root}); }])>;
def sdiv_by_pow2 : GICombineRule<
(defs root:$root),
@@ -1159,10 +1159,16 @@ def intdiv_combines : GICombineGroup<[udiv_by_pow2, sdiv_by_pow2,
def urem_by_const : GICombineRule<
(defs root:$root),
(match (G_UREM $dst, $x, $y):$root,
- [{ return Helper.matchUDivorURemByConst(*${root}); }]),
- (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>;
+ [{ return Helper.matchUDivOrURemByConst(*${root}); }]),
+ (apply [{ Helper.applyUDivOrURemByConst(*${root}); }])>;
-def intrem_combines : GICombineGroup<[urem_by_const]>;
+def srem_by_const : GICombineRule<
+ (defs root:$root),
+ (match (G_SREM $dst, $x, $y):$root,
+ [{ return Helper.matchSDivOrSRemByConst(*${root}); }]),
+ (apply [{ Helper.applySDivOrSRemByConst(*${root}); }])>;
+
+def intrem_combines : GICombineGroup<[urem_by_const, srem_by_const]>;
def reassoc_ptradd : GICombineRule<
(defs root:$root, build_fn_matchinfo:$matchinfo),
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 3922eba55e195..e8f513ad5a7a9 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5300,7 +5300,7 @@ bool CombinerHelper::matchSubAddSameReg(MachineInstr &MI,
return false;
}
-MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const {
+MachineInstr *CombinerHelper::buildUDivOrURemUsingMul(MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM);
auto &UDivorRem = cast<GenericMachineInstr>(MI);
@@ -5468,7 +5468,7 @@ MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const {
return ret;
}
-bool CombinerHelper::matchUDivorURemByConst(MachineInstr &MI) const {
+bool CombinerHelper::matchUDivOrURemByConst(MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM);
Register Dst = MI.getOperand(0).getReg();
@@ -5517,13 +5517,14 @@ bool CombinerHelper::matchUDivorURemByConst(MachineInstr &MI) const {
MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
}
-void CombinerHelper::applyUDivorURemByConst(MachineInstr &MI) const {
- auto *NewMI = buildUDivorURemUsingMul(MI);
+void CombinerHelper::applyUDivOrURemByConst(MachineInstr &MI) const {
+ auto *NewMI = buildUDivOrURemUsingMul(MI);
replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
}
-bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const {
- assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV");
+bool CombinerHelper::matchSDivOrSRemByConst(MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ assert(Opcode == TargetOpcode::G_SDIV || Opcode == TargetOpcode::G_SREM);
Register Dst = MI.getOperand(0).getReg();
Register RHS = MI.getOperand(2).getReg();
LLT DstTy = MRI.getType(Dst);
@@ -5543,7 +5544,8 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const {
return false;
// If the sdiv has an 'exact' flag we can use a simpler lowering.
- if (MI.getFlag(MachineInstr::MIFlag::IsExact)) {
+ if (Opcode == TargetOpcode::G_SDIV &&
+ MI.getFlag(MachineInstr::MIFlag::IsExact)) {
return matchUnaryPredicate(
MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
}
@@ -5559,23 +5561,28 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const {
if (!isLegal({TargetOpcode::G_SMULH, {DstTy}}) &&
!isLegalOrHasWidenScalar({TargetOpcode::G_MUL, {WideTy, WideTy}}))
return false;
+ if (Opcode == TargetOpcode::G_SREM &&
+ !isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy, DstTy}}))
+ return false;
}
return matchUnaryPredicate(
MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
}
-void CombinerHelper::applySDivByConst(MachineInstr &MI) const {
- auto *NewMI = buildSDivUsingMul(MI);
+void CombinerHelper::applySDivOrSRemByConst(MachineInstr &MI) const {
+ auto *NewMI = buildSDivOrSRemUsingMul(MI);
replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
}
-MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const {
- assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV");
- auto &SDiv = cast<GenericMachineInstr>(MI);
- Register Dst = SDiv.getReg(0);
- Register LHS = SDiv.getReg(1);
- Register RHS = SDiv.getReg(2);
+MachineInstr *CombinerHelper::buildSDivOrSRemUsingMul(MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ assert(MI.getOpcode() == TargetOpcode::G_SDIV ||
+ Opcode == TargetOpcode::G_SREM);
+ auto &SDivorRem = cast<GenericMachineInstr>(MI);
+ Register Dst = SDivorRem.getReg(0);
+ Register LHS = SDivorRem.getReg(1);
+ Register RHS = SDivorRem.getReg(2);
LLT Ty = MRI.getType(Dst);
LLT ScalarTy = Ty.getScalarType();
const unsigned EltBits = ScalarTy.getScalarSizeInBits();
@@ -5705,7 +5712,13 @@ MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const {
auto SignShift = MIB.buildConstant(ShiftAmtTy, EltBits - 1);
auto T = MIB.buildLShr(Ty, Q, SignShift);
T = MIB.buildAnd(Ty, T, ShiftMask);
- return MIB.buildAdd(Ty, Q, T);
+ auto ret = MIB.buildAdd(Ty, Q, T);
+
+ if (Opcode == TargetOpcode::G_SREM) {
+ auto Prod = MIB.buildMul(Ty, ret, RHS);
+ return MIB.buildSub(Ty, LHS, Prod);
+ }
+ return ret;
}
bool CombinerHelper::matchDivByPow2(MachineInstr &MI, bool IsSigned) const {
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index 1376f5d9a380d..b124042265d40 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -19,8 +19,13 @@ define i8 @si8_7(i8 %a, i8 %b) {
; CHECK-GI-LABEL: si8_7:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sxtb w8, w0
-; CHECK-GI-NEXT: mov w9, #7 // =0x7
-; CHECK-GI-NEXT: sdiv w8, w8, w9
+; CHECK-GI-NEXT: mov w9, #-109 // =0xffffff93
+; CHECK-GI-NEXT: mul w8, w8, w9
+; CHECK-GI-NEXT: sxth w8, w8
+; CHECK-GI-NEXT: add w8, w0, w8, asr #8
+; CHECK-GI-NEXT: sbfx w8, w8, #2, #6
+; CHECK-GI-NEXT: ubfx w9, w8, #7, #1
+; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: lsl w9, w8, #3
; CHECK-GI-NEXT: sub w8, w9, w8
; CHECK-GI-NEXT: sub w0, w0, w8
@@ -45,8 +50,14 @@ define i8 @si8_100(i8 %a, i8 %b) {
; CHECK-GI-LABEL: si8_100:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sxtb w8, w0
+; CHECK-GI-NEXT: mov w9, #41 // =0x29
+; CHECK-GI-NEXT: mul w8, w8, w9
+; CHECK-GI-NEXT: sxth w8, w8
+; CHECK-GI-NEXT: sbfx w8, w8, #8, #8
+; CHECK-GI-NEXT: asr w8, w8, #4
+; CHECK-GI-NEXT: ubfx w9, w8, #7, #1
+; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: mov w9, #100 // =0x64
-; CHECK-GI-NEXT: sdiv w8, w8, w9
; CHECK-GI-NEXT: msub w0, w8, w9, w0
; CHECK-GI-NEXT: ret
entry:
@@ -129,8 +140,12 @@ define i16 @si16_7(i16 %a, i16 %b) {
; CHECK-GI-LABEL: si16_7:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sxth w8, w0
-; CHECK-GI-NEXT: mov w9, #7 // =0x7
-; CHECK-GI-NEXT: sdiv w8, w8, w9
+; CHECK-GI-NEXT: mov w9, #18725 // =0x4925
+; CHECK-GI-NEXT: mul w8, w8, w9
+; CHECK-GI-NEXT: asr w8, w8, #16
+; CHECK-GI-NEXT: asr w8, w8, #1
+; CHECK-GI-NEXT: ubfx w9, w8, #15, #1
+; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: lsl w9, w8, #3
; CHECK-GI-NEXT: sub w8, w9, w8
; CHECK-GI-NEXT: sub w0, w0, w8
@@ -155,8 +170,13 @@ define i16 @si16_100(i16 %a, i16 %b) {
; CHECK-GI-LABEL: si16_100:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sxth w8, w0
+; CHECK-GI-NEXT: mov w9, #5243 // =0x147b
+; CHECK-GI-NEXT: mul w8, w8, w9
+; CHECK-GI-NEXT: asr w8, w8, #16
+; CHECK-GI-NEXT: asr w8, w8, #3
+; CHECK-GI-NEXT: ubfx w9, w8, #15, #1
+; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: mov w9, #100 // =0x64
-; CHECK-GI-NEXT: sdiv w8, w8, w9
; CHECK-GI-NEXT: msub w0, w8, w9, w0
; CHECK-GI-NEXT: ret
entry:
@@ -240,8 +260,13 @@ define i32 @si32_7(i32 %a, i32 %b) {
;
; CHECK-GI-LABEL: si32_7:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, #7 // =0x7
-; CHECK-GI-NEXT: sdiv w8, w0, w8
+; CHECK-GI-NEXT: mov w8, #9363 // =0x2493
+; CHECK-GI-NEXT: movk w8, #37449, lsl #16
+; CHECK-GI-NEXT: smull x8, w0, w8
+; CHECK-GI-NEXT: asr x8, x8, #32
+; CHECK-GI-NEXT: add w8, w8, w0
+; CHECK-GI-NEXT: asr w8, w8, #2
+; CHECK-GI-NEXT: add w8, w8, w8, lsr #31
; CHECK-GI-NEXT: lsl w9, w8, #3
; CHECK-GI-NEXT: sub w8, w9, w8
; CHECK-GI-NEXT: sub w0, w0, w8
@@ -265,9 +290,14 @@ define i32 @si32_100(i32 %a, i32 %b) {
;
; CHECK-GI-LABEL: si32_100:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, #100 // =0x64
-; CHECK-GI-NEXT: sdiv w9, w0, w8
-; CHECK-GI-NEXT: msub w0, w9, w8, w0
+; CHECK-GI-NEXT: mov w8, #34079 // =0x851f
+; CHECK-GI-NEXT: mov w9, #100 // =0x64
+; CHECK-GI-NEXT: movk w8, #20971, lsl #16
+; CHECK-GI-NEXT: smull x8, w0, w8
+; CHECK-GI-NEXT: asr x8, x8, #32
+; CHECK-GI-NEXT: asr w8, w8, #5
+; CHECK-GI-NEXT: add w8, w8, w8, lsr #31
+; CHECK-GI-NEXT: msub w0, w8, w9, w0
; CHECK-GI-NEXT: ret
entry:
%s = srem i32 %a, 100
@@ -348,8 +378,13 @@ define i64 @si64_7(i64 %a, i64 %b) {
;
; CHECK-GI-LABEL: si64_7:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, #7 // =0x7
-; CHECK-GI-NEXT: sdiv x8, x0, x8
+; CHECK-GI-NEXT: mov x8, #18725 // =0x4925
+; CHECK-GI-NEXT: movk x8, #9362, lsl #16
+; CHECK-GI-NEXT: movk x8, #37449, lsl #32
+; CHECK-GI-NEXT: movk x8, #18724, lsl #48
+; CHECK-GI-NEXT: smulh x8, x0, x8
+; CHECK-GI-NEXT: asr x8, x8, #1
+; CHECK-GI-NEXT: add x8, x8, x8, lsr #63
; CHECK-GI-NEXT: lsl x9, x8, #3
; CHECK-GI-NEXT: sub x8, x9, x8
; CHECK-GI-NEXT: sub x0, x0, x8
@@ -376,9 +411,16 @@ define i64 @si64_100(i64 %a, i64 %b) {
;
; CHECK-GI-LABEL: si64_100:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, #100 // =0x64
-; CHECK-GI-NEXT: sdiv x9, x0, x8
-; CHECK-GI-NEXT: msub x0, x9, x8, x0
+; CHECK-GI-NEXT: mov x8, #55051 // =0xd70b
+; CHECK-GI-NEXT: mov w9, #100 // =0x64
+; CHECK-GI-NEXT: movk x8, #28835, lsl #16
+; CHECK-GI-NEXT: movk x8, #2621, lsl #32
+; CHECK-GI-NEXT: movk x8, #41943, lsl #48
+; CHECK-GI-NEXT: smulh x8, x0, x8
+; CHECK-GI-NEXT: add x8, x8, x0
+; CHECK-GI-NEXT: asr x8, x8, #6
+; CHECK-GI-NEXT: add x8, x8, x8, lsr #63
+; CHECK-GI-NEXT: msub x0, x8, x9, x0
; CHECK-GI-NEXT: ret
entry:
%s = srem i64 %a, 100
@@ -644,25 +686,49 @@ define <2 x i8> @sv2i8_7(<2 x i8> %d, <2 x i8> %e) {
;
; CHECK-GI-LABEL: sv2i8_7:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, #7 // =0x7
-; CHECK-GI-NEXT: shl v0.2s, v0.2s, #24
+; CHECK-GI-NEXT: mov w8, #65427 // =0xff93
; CHECK-GI-NEXT: fmov s1, w8
-; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #24
; CHECK-GI-NEXT: mov v1.h[1], w8
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: mov w10, v0.s[1]
; CHECK-GI-NEXT: shl v1.4h, v1.4h, #8
-; CHECK-GI-NEXT: sdiv w9, w9, w8
; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #8
-; CHECK-GI-NEXT: smov w11, v1.h[1]
-; CHECK-GI-NEXT: sdiv w8, w10, w8
-; CHECK-GI-NEXT: smov w10, v1.h[0]
+; CHECK-GI-NEXT: smov w8, v1.h[0]
+; CHECK-GI-NEXT: smov w9, v1.h[1]
+; CHECK-GI-NEXT: shl v1.2s, v0.2s, #24
+; CHECK-GI-NEXT: sshr v1.2s, v1.2s, #24
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: mov w8, #8 // =0x8
+; CHECK-GI-NEXT: mov v2.s[1], w9
+; CHECK-GI-NEXT: mul v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: mov v2.h[1], w8
+; CHECK-GI-NEXT: mov w8, #2 // =0x2
+; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: neg v2.4h, v2.4h
+; CHECK-GI-NEXT: sshl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: mov v2.b[1], w8
+; CHECK-GI-NEXT: mov w8, #7 // =0x7
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: add v1.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT: mov v3.b[1], w8
+; CHECK-GI-NEXT: neg v2.8b, v2.8b
+; CHECK-GI-NEXT: mov w9, v1.s[1]
+; CHECK-GI-NEXT: mov v1.b[1], w9
+; CHECK-GI-NEXT: sshl v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT: neg v2.8b, v3.8b
+; CHECK-GI-NEXT: movi v3.2s, #7
+; CHECK-GI-NEXT: ushl v2.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT: umov w8, v1.b[0]
+; CHECK-GI-NEXT: umov w10, v1.b[1]
+; CHECK-GI-NEXT: umov w9, v2.b[0]
+; CHECK-GI-NEXT: umov w11, v2.b[1]
+; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: fmov s2, w9
-; CHECK-GI-NEXT: fmov s1, w10
-; CHECK-GI-NEXT: mov v1.s[1], w11
-; CHECK-GI-NEXT: mov v2.s[1], w8
-; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: mov v1.s[1], w10
+; CHECK-GI-NEXT: mov v2.s[1], w11
+; CHECK-GI-NEXT: add v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT: mls v0.2s, v1.2s, v3.2s
; CHECK-GI-NEXT: ret
entry:
%s = srem <2 x i8> %d, <i8 7, i8 7>
@@ -687,25 +753,46 @@ define <2 x i8> @sv2i8_100(<2 x i8> %d, <2 x i8> %e) {
;
; CHECK-GI-LABEL: sv2i8_100:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, #100 // =0x64
-; CHECK-GI-NEXT: shl v0.2s, v0.2s, #24
+; CHECK-GI-NEXT: mov w8, #41 // =0x29
; CHECK-GI-NEXT: fmov s1, w8
-; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #24
; CHECK-GI-NEXT: mov v1.h[1], w8
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: mov w10, v0.s[1]
; CHECK-GI-NEXT: shl v1.4h, v1.4h, #8
-; CHECK-GI-NEXT: sdiv w9, w9, w8
; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #8
-; CHECK-GI-NEXT: smov w11, v1.h[1]
-; CHECK-GI-NEXT: sdiv w8, w10, w8
-; CHECK-GI-NEXT: smov w10, v1.h[0]
+; CHECK-GI-NEXT: smov w8, v1.h[0]
+; CHECK-GI-NEXT: smov w9, v1.h[1]
+; CHECK-GI-NEXT: shl v1.2s, v0.2s, #24
+; CHECK-GI-NEXT: sshr v1.2s, v1.2s, #24
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: mov w8, #8 // =0x8
+; CHECK-GI-NEXT: mov v2.s[1], w9
+; CHECK-GI-NEXT: mul v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: mov v2.h[1], w8
+; CHECK-GI-NEXT: mov w8, #4 // =0x4
+; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: neg v2.4h, v2.4h
+; CHECK-GI-NEXT: mov v3.b[1], w8
+; CHECK-GI-NEXT: mov w8, #7 // =0x7
+; CHECK-GI-NEXT: sshl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: neg v3.8b, v3.8b
+; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: mov v2.b[1], w8
+; CHECK-GI-NEXT: sshl v1.8b, v1.8b, v3.8b
+; CHECK-GI-NEXT: neg v2.8b, v2.8b
+; CHECK-GI-NEXT: movi v3.2s, #100
+; CHECK-GI-NEXT: ushl v2.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT: umov w8, v1.b[0]
+; CHECK-GI-NEXT: umov w10, v1.b[1]
+; CHECK-GI-NEXT: umov w9, v2.b[0]
+; CHECK-GI-NEXT: umov w11, v2.b[1]
+; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: fmov s2, w9
-; CHECK-GI-NEXT: fmov s1, w10
-; CHECK-GI-NEXT: mov v1.s[1], w11
-; CHECK-GI-NEXT: mov v2.s[1], w8
-; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: mov v1.s[1], w10
+; CHECK-GI-NEXT: mov v2.s[1], w11
+; CHECK-GI-NEXT: add v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT: mls v0.2s, v1.2s, v3.2s
; CHECK-GI-NEXT: ret
entry:
%s = srem <2 x i8> %d, <i8 100, i8 100>
@@ -872,30 +959,37 @@ define <4 x i8> @sv4i8_7(<4 x i8> %d, <4 x i8> %e) {
;
; CHECK-GI-LABEL: sv4i8_7:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT: mov w8, #7 // =0x7
-; CHECK-GI-NEXT: movi v3.4h, #7
-; CHECK-GI-NEXT: fmov s2, w8
-; CHECK-GI-NEXT: shl v0.4s, v0.4s, #24
-; CHECK-GI-NEXT: mov v2.h[1], w8
-; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0
-; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #24
-; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: mov w10, v0.s[1]
-; CHECK-GI-NEXT: mov w11, v0.s[2]
-; CHECK-GI-NEXT: mov w12, v0.s[3]
-; CHECK-GI-NEXT: mov v3.d[1], v2.d[0]
-; CHECK-GI-NEXT: sdiv w9, w9, w8
-; CHECK-GI-NEXT: sdiv w10, w10, w8
-; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: sdiv w11, w11, w8
-; CHECK-GI-NEXT: mov v1.s[1], w10
-; CHECK-GI-NEXT: sdiv w9, w12, w8
-; CHECK-GI-NEXT: mov v1.s[2], w11
-; CHECK-GI-NEXT: mov v1.s[3], w9
-; CHECK-GI-NEXT: mls v0.4s, v1.4s, v3.4s
-; CHECK-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-GI-NEXT: mov w8, #147 // =0x93
+; CHECK-GI-NEXT: shl v2.4h, v0.4h, #8
+; CHECK-GI-NEXT: mov w9, #7 // =0x7
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: sshr v2.4h, v2.4h, #8
+; CHECK-GI-NEXT: mov v1.b[1], w8
+; CHECK-GI-NEXT: mov v4.b[1], w9
+; CHECK-GI-NEXT: mov v1.b[2], w8
+; CHECK-GI-NEXT: mov v4.b[2], w9
+; CHECK-GI-NEXT: mov v1.b[3], w8
+; CHECK-GI-NEXT: mov w8, #2 // =0x2
+; CHECK-GI-NEXT: mov v4.b[3], w9
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: mov v3.b[1], w8
+; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: mul v1.4h, v2.4h, v1.4h
+; CHECK-GI-NEXT: fmov d2, d0
+; CHECK-GI-NEXT: mov v3.b[2], w8
+; CHECK-GI-NEXT: ssra v2.4h, v1.4h, #8
+; CHECK-GI-NEXT: mov v3.b[3], w8
+; CHECK-GI-NEXT: uzp1 v1.8b, v2.8b, v0.8b
+; CHECK-GI-NEXT: neg v2.8b, v3.8b
+; CHECK-GI-NEXT: dup v3.4h, w9
+; CHECK-GI-NEXT: sshl v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT: neg v2.8b, v4.8b
+; CHECK-GI-NEXT: ushl v2.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-GI-NEXT: add v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT: mls v0.4h, v1.4h, v3.4h
; CHECK-GI-NEXT: ret
entry:
%s = srem <4 x i8> %d, <i8 7, i8 7, i8 7, i8 7>
@@ -943,30 +1037,37 @@ define <4 x i8> @sv4i8_100(<4 x i8> %d, <4 x i8> %e) {
;
; CHECK-GI-LABEL: sv4i8_100:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: mov w8, #41 // =0x29
+; CHECK-GI-NEXT: shl v2.4h, v0.4h, #8
+; CHECK-GI-NEXT: mov w9, #7 // =0x7
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: sshr v2.4h, v2.4h, #8
+; CHECK-GI-NEXT: mov v1.b[1], w8
+; CHECK-GI-NEXT: mov v4.b[1], w9
+; CHECK-GI-NEXT: mov v1.b[2], w8
+; CHECK-GI-NEXT: mov v4.b[2], w9
+; CHECK-GI-NEXT: mov v1.b[3], w8
+; CHECK-GI-NEXT: mov w8, #4 // =0x4
+; CHECK-GI-NEXT: mov v4.b[3], w9
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: mov v3.b[1], w8
+; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: mul v1.4h, v2.4h, v1.4h
+; CHECK-GI-NEXT: mov v3.b[2], w8
+; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #8
+; CHECK-GI-NEXT: mov v3.b[3], w8
; CHECK-GI-NEXT: mov w8, #100 // =0x64
-; CHECK-GI-NEXT: movi v3.4h, #100
-; CHECK-GI-NEXT: fmov s2, w8
-; CHECK-GI-NEXT: shl v0.4s, v0.4s, #24
-; CHECK-GI-NEXT: mov v2.h[1], w8
-; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0
-; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #24
-; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: mov w10, v0.s[1]
-; CHECK-GI-NEXT: mov w11, v0.s[2]
-; CHECK-GI-NEXT: mov w12, v0.s[3]
-; CHECK-GI-NEXT: mov v3.d[1], v2.d[0]
-; CHECK-GI-NEXT: sdiv w9, w9, w8
-; CHECK-GI-NEXT: sdiv w10, w10, w8
-; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: sdiv w11, w11, w8
-; CHECK-GI-NEXT: mov v1.s[1], w10
-; CHECK-GI-NEXT: sdiv w9, w12, w8
-; CHECK-GI-NEXT: mov v1.s[2], w11
-; CHECK-GI-NEXT: mov v1.s[3], w9
-; CHECK-GI-NEXT: mls v0.4s, v1.4s, v3.4s
-; CHECK-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: neg v2.8b, v3.8b
+; CHECK-GI-NEXT: dup v3.4h, w8
+; CHECK-GI-NEXT: sshl v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT: neg v2.8b, v4.8b
+; CHECK-GI-NEXT: ushl v2.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-GI-NEXT: add v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT: mls v0.4h, v1.4h, v3.4h
; CHECK-GI-NEXT: ret
entry:
%s = srem <4 x i8> %d, <i8 100, i8 100, i8 100, i8 100>
@@ -988,42 +1089,15 @@ define <8 x i8> @sv8i8_7(<8 x i8> %d, <8 x i8> %e) {
;
; CHECK-GI-LABEL: sv8i8_7:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT: mov w8, #7 // =0x7
-; CHECK-GI-NEXT: movi v4.8b, #7
-; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT: sshll v4.8h, v4.8b, #0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: fmov w13, s0
-; CHECK-GI-NEXT: mov w10, v1.s[1]
-; CHECK-GI-NEXT: mov w14, v0.s[1]
-; CHECK-GI-NEXT: mov w11, v1.s[2]
-; CHECK-GI-NEXT: mov w15, v0.s[2]
-; CHECK-GI-NEXT: mov w12, v1.s[3]
-; CHECK-GI-NEXT: mov w16, v0.s[3]
-; CHECK-GI-NEXT: sshll v5.4s, v4.4h, #0
-; CHECK-GI-NEXT: sdiv w9, w9, w8
-; CHECK-GI-NEXT: sshll2 v4.4s, v4.8h, #0
-; CHECK-GI-NEXT: sdiv w13, w13, w8
-; CHECK-GI-NEXT: fmov s2, w9
-; CHECK-GI-NEXT: sdiv w10, w10, w8
-; CHECK-GI-NEXT: fmov s3, w13
-; CHECK-GI-NEXT: sdiv w14, w14, w8
-; CHECK-GI-NEXT: mov v2.s[1], w10
-; CHECK-GI-NEXT: sdiv w11, w11, w8
-; CHECK-GI-NEXT: mov v3.s[1], w14
-; CHECK-GI-NEXT: sdiv w15, w15, w8
-; CHECK-GI-NEXT: mov v2.s[2], w11
-; CHECK-GI-NEXT: sdiv w12, w12, w8
-; CHECK-GI-NEXT: mov v3.s[2], w15
-; CHECK-GI-NEXT: sdiv w8, w16, w8
-; CHECK-GI-NEXT: mov v2.s[3], w12
-; CHECK-GI-NEXT: mls v1.4s, v2.4s, v5.4s
-; CHECK-GI-NEXT: mov v3.s[3], w8
-; CHECK-GI-NEXT: mls v0.4s, v3.4s, v4.4s
-; CHECK-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h
-; CHECK-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-GI-NEXT: movi v1.8b, #147
+; CHECK-GI-NEXT: movi v3.8b, #7
+; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8
+; CHECK-GI-NEXT: add v1.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: sshr v2.8b, v1.8b, #2
+; CHECK-GI-NEXT: ushr v2.8b, v2.8b, #7
+; CHECK-GI-NEXT: ssra v2.8b, v1.8b, #2
+; CHECK-GI-NEXT: mls v0.8b, v2.8b, v3.8b
; CHECK-GI-NEXT: ret
entry:
%s = srem <8 x i8> %d, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -1044,42 +1118,14 @@ define <8 x i8> @sv8i8_100(<8 x i8> %d, <8 x i8> %e) {
;
; CHECK-GI-LABEL: sv8i8_100:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT: mov w8, #100 // =0x64
-; CHECK-GI-NEXT: movi v4.8b, #100
-; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT: sshll v4.8h, v4.8b, #0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: fmov w13, s0
-; CHECK-GI-NEXT: mov w10, v1.s[1]
-; CHECK-GI-NEXT: mov w14, v0.s[1]
-; CHECK-GI-NEXT: mov w11, v1.s[2]
-; CHECK-GI-NEXT: mov w15, v0.s[2]
-; CHECK-GI-NEXT: mov w12, v1.s[3]
-; CHECK-GI-NEXT: mov w16, v0.s[3]
-; CHECK-GI-NEXT: sshll v5.4s, v4.4h, #0
-; CHECK-GI-NEXT: sdiv w9, w9, w8
-; CHECK-GI-NEXT: sshll2 v4.4s, v4.8h, #0
-; CHECK-GI-NEXT: sdiv w13, w13, w8
-; CHECK-GI-NEXT: fmov s2, w9
-; CHECK-GI-NEXT: sdiv w10, w10, w8
-; CHECK-GI-NEXT: fmov s3, w13
-; CHECK-GI-NEXT: sdiv w14, w14, w8
-; CHECK-GI-NEXT: mov v2.s[1], w10
-; CHECK-GI-NEXT: sdiv w11, w11, w8
-; CHECK-GI-NEXT: mov v3.s[1], w14
-; CHECK-GI-NEXT: sdiv w15, w15, w8
-; CHECK-GI-NEXT: mov v2.s[2], w11
-; CHECK-GI-NEXT: sdiv w12, w12, w8
-; CHECK-GI-NEXT: mov v3.s[2], w15
-; CHECK-GI-NEXT: sdiv w8, w16, w8
-; CHECK-GI-NEXT: mov v2.s[3], w12
-; CHECK-GI-NEXT: mls v1.4s, v2.4s, v5.4s
-; CHECK-GI-NEXT: mov v3.s[3], w8
-; CHECK-GI-NEXT: mls v0.4s, v3.4s, v4.4s
-; CHECK-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h
-; CHECK-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-GI-NEXT: movi v1.8b, #41
+; CHECK-GI-NEXT: movi v3.8b, #100
+; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8
+; CHECK-GI-NEXT: sshr v2.8b, v1.8b, #4
+; CHECK-GI-NEXT: ushr v2.8b, v2.8b, #7
+; CHECK-GI-NEXT: ssra v2.8b, v1.8b, #4
+; CHECK-GI-NEXT: mls v0.8b, v2.8b, v3.8b
; CHECK-GI-NEXT: ret
entry:
%s = srem <8 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
@@ -1102,72 +1148,16 @@ define <16 x i8> @sv16i8_7(<16 x i8> %d, <16 x i8> %e) {
;
; CHECK-GI-LABEL: sv16i8_7:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT: sshll2 v3.8h, v0.16b, #0
-; CHECK-GI-NEXT: mov w8, #7 // =0x7
-; CHECK-GI-NEXT: movi v16.8b, #7
-; CHECK-GI-NEXT: sshll v1.4s, v2.4h, #0
-; CHECK-GI-NEXT: sshll2 v2.4s, v2.8h, #0
-; CHECK-GI-NEXT: sshll v0.4s, v3.4h, #0
-; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0
-; CHECK-GI-NEXT: sshll v16.8h, v16.8b, #0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: fmov w13, s2
-; CHECK-GI-NEXT: fmov w17, s0
-; CHECK-GI-NEXT: fmov w2, s3
-; CHECK-GI-NEXT: mov w14, v2.s[1]
-; CHECK-GI-NEXT: mov w18, v0.s[1]
-; CHECK-GI-NEXT: mov w3, v3.s[1]
-; CHECK-GI-NEXT: mov w15, v2.s[2]
-; CHECK-GI-NEXT: mov w0, v0.s[2]
-; CHECK-GI-NEXT: sdiv w11, w9, w8
-; CHECK-GI-NEXT: mov w9, v1.s[1]
-; CHECK-GI-NEXT: mov w4, v3.s[2]
-; CHECK-GI-NEXT: mov w16, v2.s[3]
-; CHECK-GI-NEXT: mov w1, v0.s[3]
-; CHECK-GI-NEXT: mov w5, v3.s[3]
-; CHECK-GI-NEXT: sshll v17.4s, v16.4h, #0
-; CHECK-GI-NEXT: sshll2 v16.4s, v16.8h, #0
-; CHECK-GI-NEXT: sdiv w13, w13, w8
-; CHECK-GI-NEXT: fmov s4, w11
-; CHECK-GI-NEXT: sdiv w17, w17, w8
-; CHECK-GI-NEXT: fmov s5, w13
-; CHECK-GI-NEXT: sdiv w2, w2, w8
-; CHECK-GI-NEXT: fmov s6, w17
-; CHECK-GI-NEXT: sdiv w12, w9, w8
-; CHECK-GI-NEXT: mov w9, v1.s[2]
-; CHECK-GI-NEXT: fmov s7, w2
-; CHECK-GI-NEXT: sdiv w14, w14, w8
-; CHECK-GI-NEXT: mov v4.s[1], w12
-; CHECK-GI-NEXT: sdiv w18, w18, w8
-; CHECK-GI-NEXT: mov v5.s[1], w14
-; CHECK-GI-NEXT: sdiv w3, w3, w8
-; CHECK-GI-NEXT: mov v6.s[1], w18
-; CHECK-GI-NEXT: sdiv w10, w9, w8
-; CHECK-GI-NEXT: mov w9, v1.s[3]
-; CHECK-GI-NEXT: mov v7.s[1], w3
-; CHECK-GI-NEXT: sdiv w15, w15, w8
-; CHECK-GI-NEXT: mov v4.s[2], w10
-; CHECK-GI-NEXT: sdiv w0, w0, w8
-; CHECK-GI-NEXT: mov v5.s[2], w15
-; CHECK-GI-NEXT: sdiv w4, w4, w8
-; CHECK-GI-NEXT: mov v6.s[2], w0
-; CHECK-GI-NEXT: sdiv w9, w9, w8
-; CHECK-GI-NEXT: mov v7.s[2], w4
-; CHECK-GI-NEXT: sdiv w16, w16, w8
-; CHECK-GI-NEXT: mov v4.s[3], w9
-; CHECK-GI-NEXT: mls v1.4s, v4.4s, v17.4s
-; CHECK-GI-NEXT: sdiv w1, w1, w8
-; CHECK-GI-NEXT: mov v5.s[3], w16
-; CHECK-GI-NEXT: mls v2.4s, v5.4s, v16.4s
-; CHECK-GI-NEXT: sdiv w8, w5, w8
-; CHECK-GI-NEXT: mov v6.s[3], w1
-; CHECK-GI-NEXT: uzp1 v1.8h, v1.8h, v2.8h
-; CHECK-GI-NEXT: mls v0.4s, v6.4s, v17.4s
-; CHECK-GI-NEXT: mov v7.s[3], w8
-; CHECK-GI-NEXT: mls v3.4s, v7.4s, v16.4s
-; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v3.8h
-; CHECK-GI-NEXT: uzp1 v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: movi v1.16b, #147
+; CHECK-GI-NEXT: movi v3.16b, #7
+; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: uzp2 v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: add v1.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: sshr v2.16b, v1.16b, #2
+; CHECK-GI-NEXT: ushr v2.16b, v2.16b, #7
+; CHECK-GI-NEXT: ssra v2.16b, v1.16b, #2
+; CHECK-GI-NEXT: mls v0.16b, v2.16b, v3.16b
; CHECK-GI-NEXT: ret
entry:
%s = srem <16 x i8> %d, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -1189,72 +1179,15 @@ define <16 x i8> @sv16i8_100(<16 x i8> %d, <16 x i8> %e) {
;
; CHECK-GI-LABEL: sv16i8_100:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT: sshll2 v3.8h, v0.16b, #0
-; CHECK-GI-NEXT: mov w8, #100 // =0x64
-; CHECK-GI-NEXT: movi v16.8b, #100
-; CHECK-GI-NEXT: sshll v1.4s, v2.4h, #0
-; CHECK-GI-NEXT: sshll2 v2.4s, v2.8h, #0
-; CHECK-GI-NEXT: sshll v0.4s, v3.4h, #0
-; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0
-; CHECK-GI-NEXT: sshll v16.8h, v16.8b, #0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: fmov w13, s2
-; CHECK-GI-NEXT: fmov w17, s0
-; CHECK-GI-NEXT: fmov w2, s3
-; CHECK-GI-NEXT: mov w14, v2.s[1]
-; CHECK-GI-NEXT: mov w18, v0.s[1]
-; CHECK-GI-NEXT: mov w3, v3.s[1]
-; CHECK-GI-NEXT: mov w15, v2.s[2]
-; CHECK-GI-NEXT: mov w0, v0.s[2]
-; CHECK-GI-NEXT: sdiv w11, w9, w8
-; CHECK-GI-NEXT: mov w9, v1.s[1]
-; CHECK-GI-NEXT: mov w4, v3.s[2]
-; CHECK-GI-NEXT: mov w16, v2.s[3]
-; CHECK-GI-NEXT: mov w1, v0.s[3]
-; CHECK-GI-NEXT: mov w5, v3.s[3]
-; CHECK-GI-NEXT: sshll v17.4s, v16.4h, #0
-; CHECK-GI-NEXT: sshll2 v16.4s, v16.8h, #0
-; CHECK-GI-NEXT: sdiv w13, w13, w8
-; CHECK-GI-NEXT: fmov s4, w11
-; CHECK-GI-NEXT: sdiv w17, w17, w8
-; CHECK-GI-NEXT: fmov s5, w13
-; CHECK-GI-NEXT: sdiv w2, w2, w8
-; CHECK-GI-NEXT: fmov s6, w17
-; CHECK-GI-NEXT: sdiv w12, w9, w8
-; CHECK-GI-NEXT: mov w9, v1.s[2]
-; CHECK-GI-NEXT: fmov s7, w2
-; CHECK-GI-NEXT: sdiv w14, w14, w8
-; CHECK-GI-NEXT: mov v4.s[1], w12
-; CHECK-GI-NEXT: sdiv w18, w18, w8
-; CHECK-GI-NEXT: mov v5.s[1], w14
-; CHECK-GI-NEXT: sdiv w3, w3, w8
-; CHECK-GI-NEXT: mov v6.s[1], w18
-; CHECK-GI-NEXT: sdiv w10, w9, w8
-; CHECK-GI-NEXT: mov w9, v1.s[3]
-; CHECK-GI-NEXT: mov v7.s[1], w3
-; CHECK-GI-NEXT: sdiv w15, w15, w8
-; CHECK-GI-NEXT: mov v4.s[2], w10
-; CHECK-GI-NEXT: sdiv w0, w0, w8
-; CHECK-GI-NEXT: mov v5.s[2], w15
-; CHECK-GI-NEXT: sdiv w4, w4, w8
-; CHECK-GI-NEXT: mov v6.s[2], w0
-; CHECK-GI-NEXT: sdiv w9, w9, w8
-; CHECK-GI-NEXT: mov v7.s[2], w4
-; CHECK-GI-NEXT: sdiv w16, w16, w8
-; CHECK-GI-NEXT: mov v4.s[3], w9
-; CHECK-GI-NEXT: mls v1.4s, v4.4s, v17.4s
-; CHECK-GI-NEXT: sdiv w1, w1, w8
-; CHECK-GI-NEXT: mov v5.s[3], w16
-; CHECK-GI-NEXT: mls v2.4s, v5.4s, v16.4s
-; CHECK-GI-NEXT: sdiv w8, w5, w8
-; CHECK-GI-NEXT: mov v6.s[3], w1
-; CHECK-GI-NEXT: uzp1 v1.8h, v1.8h, v2.8h
-; CHECK-GI-NEXT: mls v0.4s, v6.4s, v17.4s
-; CHECK-GI-NEXT: mov v7.s[3], w8
-; CHECK-GI-NEXT: mls v3.4s, v7.4s, v16.4s
-; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v3.8h
-; CHECK-GI-NEXT: uzp1 v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT: movi v1.16b, #41
+; CHECK-GI-NEXT: movi v3.16b, #100
+; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT: uzp2 v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT: sshr v2.16b, v1.16b, #4
+; CHECK-GI-NEXT: ushr v2.16b, v2.16b, #7
+; CHECK-GI-NEXT: ssra v2.16b, v1.16b, #4
+; CHECK-GI-NEXT: mls v0.16b, v2.16b, v3.16b
; CHECK-GI-NEXT: ret
entry:
%s = srem <16 x i8> %d, <i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100, i8 100>
@@ -1754,20 +1687,31 @@ define <2 x i16> @sv2i16_7(<2 x i16> %d, <2 x i16> %e) {
;
; CHECK-GI-LABEL: sv2i16_7:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: shl v0.2s, v0.2s, #16
-; CHECK-GI-NEXT: mov w8, #7 // =0x7
+; CHECK-GI-NEXT: mov w8, #18725 // =0x4925
+; CHECK-GI-NEXT: shl v2.2s, v0.2s, #16
; CHECK-GI-NEXT: fmov s1, w8
-; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #16
+; CHECK-GI-NEXT: sshr v2.2s, v2.2s, #16
; CHECK-GI-NEXT: mov v1.h[1], w8
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: mov w10, v0.s[1]
+; CHECK-GI-NEXT: mov w8, #1 // =0x1
; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT: sdiv w9, w9, w8
-; CHECK-GI-NEXT: sdiv w10, w10, w8
-; CHECK-GI-NEXT: fmov s2, w9
-; CHECK-GI-NEXT: mov v2.s[1], w10
-; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: mul v1.2s, v2.2s, v1.2s
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: mov v2.h[1], w8
+; CHECK-GI-NEXT: mov w8, #15 // =0xf
+; CHECK-GI-NEXT: sshr v1.2s, v1.2s, #16
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: mov v3.h[1], w8
+; CHECK-GI-NEXT: neg v2.4h, v2.4h
+; CHECK-GI-NEXT: mov w8, #7 // =0x7
+; CHECK-GI-NEXT: sshl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT: neg v2.4h, v3.4h
+; CHECK-GI-NEXT: dup v3.2s, w8
+; CHECK-GI-NEXT: ushl v2.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT: add v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT: mls v0.2s, v1.2s, v3.2s
; CHECK-GI-NEXT: ret
entry:
%s = srem <2 x i16> %d, <i16 7, i16 7>
@@ -1792,20 +1736,31 @@ define <2 x i16> @sv2i16_100(<2 x i16> %d, <2 x i16> %e) {
;
; CHECK-GI-LABEL: sv2i16_100:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: shl v0.2s, v0.2s, #16
-; CHECK-GI-NEXT: mov w8, #100 // =0x64
+; CHECK-GI-NEXT: mov w8, #5243 // =0x147b
+; CHECK-GI-NEXT: shl v2.2s, v0.2s, #16
; CHECK-GI-NEXT: fmov s1, w8
-; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #16
+; CHECK-GI-NEXT: sshr v2.2s, v2.2s, #16
; CHECK-GI-NEXT: mov v1.h[1], w8
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: mov w10, v0.s[1]
+; CHECK-GI-NEXT: mov w8, #3 // =0x3
; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT: sdiv w9, w9, w8
-; CHECK-GI-NEXT: sdiv w10, w10, w8
-; CHECK-GI-NEXT: fmov s2, w9
-; CHECK-GI-NEXT: mov v2.s[1], w10
-; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: mul v1.2s, v2.2s, v1.2s
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: mov v2.h[1], w8
+; CHECK-GI-NEXT: mov w8, #15 // =0xf
+; CHECK-GI-NEXT: sshr v1.2s, v1.2s, #16
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: mov v3.h[1], w8
+; CHECK-GI-NEXT: neg v2.4h, v2.4h
+; CHECK-GI-NEXT: mov w8, #100 // =0x64
+; CHECK-GI-NEXT: sshl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT: neg v2.4h, v3.4h
+; CHECK-GI-NEXT: dup v3.2s, w8
+; CHECK-GI-NEXT: ushl v2.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT: add v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT: mls v0.2s, v1.2s, v3.2s
; CHECK-GI-NEXT: ret
entry:
%s = srem <2 x i16> %d, <i16 100, i16 100>
@@ -1949,24 +1904,15 @@ define <4 x i16> @sv4i16_7(<4 x i16> %d, <4 x i16> %e) {
;
; CHECK-GI-LABEL: sv4i16_7:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT: mov w8, #7 // =0x7
-; CHECK-GI-NEXT: movi v2.4h, #7
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: mov w10, v0.s[1]
-; CHECK-GI-NEXT: mov w11, v0.s[2]
-; CHECK-GI-NEXT: mov w12, v0.s[3]
-; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
-; CHECK-GI-NEXT: sdiv w9, w9, w8
-; CHECK-GI-NEXT: sdiv w10, w10, w8
-; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: sdiv w11, w11, w8
-; CHECK-GI-NEXT: mov v1.s[1], w10
-; CHECK-GI-NEXT: sdiv w8, w12, w8
-; CHECK-GI-NEXT: mov v1.s[2], w11
-; CHECK-GI-NEXT: mov v1.s[3], w8
-; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-GI-NEXT: adrp x8, .LCPI44_0
+; CHECK-GI-NEXT: movi v3.4h, #7
+; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI44_0]
+; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16
+; CHECK-GI-NEXT: sshr v2.4h, v1.4h, #1
+; CHECK-GI-NEXT: ushr v2.4h, v2.4h, #15
+; CHECK-GI-NEXT: ssra v2.4h, v1.4h, #1
+; CHECK-GI-NEXT: mls v0.4h, v2.4h, v3.4h
; CHECK-GI-NEXT: ret
entry:
%s = srem <4 x i16> %d, <i16 7, i16 7, i16 7, i16 7>
@@ -1988,24 +1934,15 @@ define <4 x i16> @sv4i16_100(<4 x i16> %d, <4 x i16> %e) {
;
; CHECK-GI-LABEL: sv4i16_100:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT: mov w8, #100 // =0x64
-; CHECK-GI-NEXT: movi v2.4h, #100
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: mov w10, v0.s[1]
-; CHECK-GI-NEXT: mov w11, v0.s[2]
-; CHECK-GI-NEXT: mov w12, v0.s[3]
-; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
-; CHECK-GI-NEXT: sdiv w9, w9, w8
-; CHECK-GI-NEXT: sdiv w10, w10, w8
-; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: sdiv w11, w11, w8
-; CHECK-GI-NEXT: mov v1.s[1], w10
-; CHECK-GI-NEXT: sdiv w8, w12, w8
-; CHECK-GI-NEXT: mov v1.s[2], w11
-; CHECK-GI-NEXT: mov v1.s[3], w8
-; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-GI-NEXT: adrp x8, .LCPI45_0
+; CHECK-GI-NEXT: movi v3.4h, #100
+; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI45_0]
+; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16
+; CHECK-GI-NEXT: sshr v2.4h, v1.4h, #3
+; CHECK-GI-NEXT: ushr v2.4h, v2.4h, #15
+; CHECK-GI-NEXT: ssra v2.4h, v1.4h, #3
+; CHECK-GI-NEXT: mls v0.4h, v2.4h, v3.4h
; CHECK-GI-NEXT: ret
entry:
%s = srem <4 x i16> %d, <i16 100, i16 100, i16 100, i16 100>
@@ -2028,38 +1965,16 @@ define <8 x i16> @sv8i16_7(<8 x i16> %d, <8 x i16> %e) {
;
; CHECK-GI-LABEL: sv8i16_7:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT: mov w8, #7 // =0x7
-; CHECK-GI-NEXT: movi v4.4h, #7
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: fmov w13, s0
-; CHECK-GI-NEXT: mov w10, v1.s[1]
-; CHECK-GI-NEXT: mov w14, v0.s[1]
-; CHECK-GI-NEXT: mov w11, v1.s[2]
-; CHECK-GI-NEXT: mov w15, v0.s[2]
-; CHECK-GI-NEXT: mov w12, v1.s[3]
-; CHECK-GI-NEXT: mov w16, v0.s[3]
-; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0
-; CHECK-GI-NEXT: sdiv w9, w9, w8
-; CHECK-GI-NEXT: sdiv w13, w13, w8
-; CHECK-GI-NEXT: fmov s2, w9
-; CHECK-GI-NEXT: sdiv w10, w10, w8
-; CHECK-GI-NEXT: fmov s3, w13
-; CHECK-GI-NEXT: sdiv w14, w14, w8
-; CHECK-GI-NEXT: mov v2.s[1], w10
-; CHECK-GI-NEXT: sdiv w11, w11, w8
-; CHECK-GI-NEXT: mov v3.s[1], w14
-; CHECK-GI-NEXT: sdiv w15, w15, w8
-; CHECK-GI-NEXT: mov v2.s[2], w11
-; CHECK-GI-NEXT: sdiv w12, w12, w8
-; CHECK-GI-NEXT: mov v3.s[2], w15
-; CHECK-GI-NEXT: sdiv w8, w16, w8
-; CHECK-GI-NEXT: mov v2.s[3], w12
-; CHECK-GI-NEXT: mls v1.4s, v2.4s, v4.4s
-; CHECK-GI-NEXT: mov v3.s[3], w8
-; CHECK-GI-NEXT: mls v0.4s, v3.4s, v4.4s
-; CHECK-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: adrp x8, .LCPI46_0
+; CHECK-GI-NEXT: movi v3.8h, #7
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI46_0]
+; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT: sshr v2.8h, v1.8h, #1
+; CHECK-GI-NEXT: ushr v2.8h, v2.8h, #15
+; CHECK-GI-NEXT: ssra v2.8h, v1.8h, #1
+; CHECK-GI-NEXT: mls v0.8h, v2.8h, v3.8h
; CHECK-GI-NEXT: ret
entry:
%s = srem <8 x i16> %d, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
@@ -2082,38 +1997,16 @@ define <8 x i16> @sv8i16_100(<8 x i16> %d, <8 x i16> %e) {
;
; CHECK-GI-LABEL: sv8i16_100:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT: mov w8, #100 // =0x64
-; CHECK-GI-NEXT: movi v4.4h, #100
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: fmov w13, s0
-; CHECK-GI-NEXT: mov w10, v1.s[1]
-; CHECK-GI-NEXT: mov w14, v0.s[1]
-; CHECK-GI-NEXT: mov w11, v1.s[2]
-; CHECK-GI-NEXT: mov w15, v0.s[2]
-; CHECK-GI-NEXT: mov w12, v1.s[3]
-; CHECK-GI-NEXT: mov w16, v0.s[3]
-; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0
-; CHECK-GI-NEXT: sdiv w9, w9, w8
-; CHECK-GI-NEXT: sdiv w13, w13, w8
-; CHECK-GI-NEXT: fmov s2, w9
-; CHECK-GI-NEXT: sdiv w10, w10, w8
-; CHECK-GI-NEXT: fmov s3, w13
-; CHECK-GI-NEXT: sdiv w14, w14, w8
-; CHECK-GI-NEXT: mov v2.s[1], w10
-; CHECK-GI-NEXT: sdiv w11, w11, w8
-; CHECK-GI-NEXT: mov v3.s[1], w14
-; CHECK-GI-NEXT: sdiv w15, w15, w8
-; CHECK-GI-NEXT: mov v2.s[2], w11
-; CHECK-GI-NEXT: sdiv w12, w12, w8
-; CHECK-GI-NEXT: mov v3.s[2], w15
-; CHECK-GI-NEXT: sdiv w8, w16, w8
-; CHECK-GI-NEXT: mov v2.s[3], w12
-; CHECK-GI-NEXT: mls v1.4s, v2.4s, v4.4s
-; CHECK-GI-NEXT: mov v3.s[3], w8
-; CHECK-GI-NEXT: mls v0.4s, v3.4s, v4.4s
-; CHECK-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT: adrp x8, .LCPI47_0
+; CHECK-GI-NEXT: movi v3.8h, #100
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI47_0]
+; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h
+; CHECK-GI-NEXT: sshr v2.8h, v1.8h, #3
+; CHECK-GI-NEXT: ushr v2.8h, v2.8h, #15
+; CHECK-GI-NEXT: ssra v2.8h, v1.8h, #3
+; CHECK-GI-NEXT: mls v0.8h, v2.8h, v3.8h
; CHECK-GI-NEXT: ret
entry:
%s = srem <8 x i16> %d, <i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100>
@@ -2499,17 +2392,16 @@ define <2 x i32> @sv2i32_7(<2 x i32> %d, <2 x i32> %e) {
;
; CHECK-GI-LABEL: sv2i32_7:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: mov w8, #7 // =0x7
-; CHECK-GI-NEXT: mov w10, v0.s[1]
-; CHECK-GI-NEXT: movi v2.2s, #7
-; CHECK-GI-NEXT: sdiv w9, w9, w8
-; CHECK-GI-NEXT: sdiv w8, w10, w8
-; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: mov v1.s[1], w8
-; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: adrp x8, .LCPI56_0
+; CHECK-GI-NEXT: movi v3.2s, #7
+; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI56_0]
+; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32
+; CHECK-GI-NEXT: add v1.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT: sshr v2.2s, v1.2s, #2
+; CHECK-GI-NEXT: ushr v2.2s, v2.2s, #31
+; CHECK-GI-NEXT: ssra v2.2s, v1.2s, #2
+; CHECK-GI-NEXT: mls v0.2s, v2.2s, v3.2s
; CHECK-GI-NEXT: ret
entry:
%s = srem <2 x i32> %d, <i32 7, i32 7>
@@ -2532,17 +2424,15 @@ define <2 x i32> @sv2i32_100(<2 x i32> %d, <2 x i32> %e) {
;
; CHECK-GI-LABEL: sv2i32_100:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: mov w8, #100 // =0x64
-; CHECK-GI-NEXT: mov w10, v0.s[1]
-; CHECK-GI-NEXT: movi v2.2s, #100
-; CHECK-GI-NEXT: sdiv w9, w9, w8
-; CHECK-GI-NEXT: sdiv w8, w10, w8
-; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: mov v1.s[1], w8
-; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: adrp x8, .LCPI57_0
+; CHECK-GI-NEXT: movi v3.2s, #100
+; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI57_0]
+; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32
+; CHECK-GI-NEXT: sshr v2.2s, v1.2s, #5
+; CHECK-GI-NEXT: ushr v2.2s, v2.2s, #31
+; CHECK-GI-NEXT: ssra v2.2s, v1.2s, #5
+; CHECK-GI-NEXT: mls v0.2s, v2.2s, v3.2s
; CHECK-GI-NEXT: ret
entry:
%s = srem <2 x i32> %d, <i32 100, i32 100>
@@ -2664,21 +2554,17 @@ define <4 x i32> @sv4i32_7(<4 x i32> %d, <4 x i32> %e) {
;
; CHECK-GI-LABEL: sv4i32_7:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: mov w8, #7 // =0x7
-; CHECK-GI-NEXT: mov w10, v0.s[1]
-; CHECK-GI-NEXT: mov w11, v0.s[2]
-; CHECK-GI-NEXT: mov w12, v0.s[3]
-; CHECK-GI-NEXT: movi v2.4s, #7
-; CHECK-GI-NEXT: sdiv w9, w9, w8
-; CHECK-GI-NEXT: sdiv w10, w10, w8
-; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: sdiv w11, w11, w8
-; CHECK-GI-NEXT: mov v1.s[1], w10
-; CHECK-GI-NEXT: sdiv w8, w12, w8
-; CHECK-GI-NEXT: mov v1.s[2], w11
-; CHECK-GI-NEXT: mov v1.s[3], w8
-; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: adrp x8, .LCPI60_0
+; CHECK-GI-NEXT: movi v3.4s, #7
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI60_0]
+; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: add v1.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: sshr v2.4s, v1.4s, #2
+; CHECK-GI-NEXT: ushr v2.4s, v2.4s, #31
+; CHECK-GI-NEXT: ssra v2.4s, v1.4s, #2
+; CHECK-GI-NEXT: mls v0.4s, v2.4s, v3.4s
; CHECK-GI-NEXT: ret
entry:
%s = srem <4 x i32> %d, <i32 7, i32 7, i32 7, i32 7>
@@ -2702,21 +2588,16 @@ define <4 x i32> @sv4i32_100(<4 x i32> %d, <4 x i32> %e) {
;
; CHECK-GI-LABEL: sv4i32_100:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: mov w8, #100 // =0x64
-; CHECK-GI-NEXT: mov w10, v0.s[1]
-; CHECK-GI-NEXT: mov w11, v0.s[2]
-; CHECK-GI-NEXT: mov w12, v0.s[3]
-; CHECK-GI-NEXT: movi v2.4s, #100
-; CHECK-GI-NEXT: sdiv w9, w9, w8
-; CHECK-GI-NEXT: sdiv w10, w10, w8
-; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: sdiv w11, w11, w8
-; CHECK-GI-NEXT: mov v1.s[1], w10
-; CHECK-GI-NEXT: sdiv w8, w12, w8
-; CHECK-GI-NEXT: mov v1.s[2], w11
-; CHECK-GI-NEXT: mov v1.s[3], w8
-; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: adrp x8, .LCPI61_0
+; CHECK-GI-NEXT: movi v3.4s, #100
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI61_0]
+; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: sshr v2.4s, v1.4s, #5
+; CHECK-GI-NEXT: ushr v2.4s, v2.4s, #31
+; CHECK-GI-NEXT: ssra v2.4s, v1.4s, #5
+; CHECK-GI-NEXT: mls v0.4s, v2.4s, v3.4s
; CHECK-GI-NEXT: ret
entry:
%s = srem <4 x i32> %d, <i32 100, i32 100, i32 100, i32 100>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
index 530f4cf53321e..1eb8457cd4a5d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
@@ -254,27 +254,13 @@ define i32 @v_srem_i32_pow2k_denom(i32 %num) {
; CHECK-LABEL: v_srem_i32_pow2k_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x45800000
-; CHECK-NEXT: v_mov_b32_e32 v3, 0xfffff000
-; CHECK-NEXT: v_mov_b32_e32 v4, 0x1000
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1
-; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2
-; CHECK-NEXT: v_lshlrev_b32_e32 v2, 12, v2
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xfffff000, v0
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xfffff000, v0
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1
+; CHECK-NEXT: v_mov_b32_e32 v1, 0x80000001
+; CHECK-NEXT: v_mul_hi_i32 v1, v0, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v0
+; CHECK-NEXT: v_ashrrev_i32_e32 v1, 11, v1
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 31, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT: v_lshlrev_b32_e32 v1, 12, v1
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = srem i32 %num, 4096
@@ -327,42 +313,21 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) {
; CGP-LABEL: v_srem_v2i32_pow2k_denom:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x45800000
-; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000
-; CGP-NEXT: v_mov_b32_e32 v5, 0x1000
-; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v2
-; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v6
-; CGP-NEXT: v_mul_lo_u32 v7, v3, v4
-; CGP-NEXT: v_mul_hi_u32 v7, v3, v7
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; CGP-NEXT: v_mul_hi_u32 v7, v0, v3
-; CGP-NEXT: v_mul_hi_u32 v3, v1, v3
-; CGP-NEXT: v_lshlrev_b32_e32 v7, 12, v7
+; CGP-NEXT: v_mov_b32_e32 v2, 0x80000001
+; CGP-NEXT: v_mul_hi_i32 v3, v0, v2
+; CGP-NEXT: v_mul_hi_i32 v2, v1, v2
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v0
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v1
+; CGP-NEXT: v_ashrrev_i32_e32 v3, 11, v3
+; CGP-NEXT: v_ashrrev_i32_e32 v2, 11, v2
+; CGP-NEXT: v_lshrrev_b32_e32 v4, 31, v3
+; CGP-NEXT: v_lshrrev_b32_e32 v5, 31, v2
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; CGP-NEXT: v_lshlrev_b32_e32 v3, 12, v3
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4
-; CGP-NEXT: v_add_i32_e32 v7, vcc, 0xfffff000, v1
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4
-; CGP-NEXT: v_add_i32_e32 v4, vcc, 0xfffff000, v1
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v2
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v6
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
+; CGP-NEXT: v_lshlrev_b32_e32 v2, 12, v2
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = srem <2 x i32> %num, <i32 4096, i32 4096>
ret <2 x i32> %result
@@ -372,27 +337,14 @@ define i32 @v_srem_i32_oddk_denom(i32 %num) {
; CHECK-LABEL: v_srem_i32_oddk_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8
-; CHECK-NEXT: v_mov_b32_e32 v3, 0xffed2705
-; CHECK-NEXT: v_mov_b32_e32 v4, 0x12d8fb
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1
-; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2
-; CHECK-NEXT: v_mul_lo_u32 v2, v2, v4
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xffed2705, v0
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xffed2705, v0
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1
+; CHECK-NEXT: v_mov_b32_e32 v1, 0xd9528441
+; CHECK-NEXT: v_mul_hi_i32 v1, v0, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v0
+; CHECK-NEXT: v_ashrrev_i32_e32 v1, 20, v1
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 31, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT: v_mov_b32_e32 v2, 0x12d8fb
+; CHECK-NEXT: v_mul_lo_u32 v1, v1, v2
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = srem i32 %num, 1235195
@@ -445,42 +397,22 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) {
; CGP-LABEL: v_srem_v2i32_oddk_denom:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x4996c7d8
-; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705
-; CGP-NEXT: v_mov_b32_e32 v5, 0x12d8fb
-; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v2
-; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v6
-; CGP-NEXT: v_mul_lo_u32 v7, v3, v4
-; CGP-NEXT: v_mul_hi_u32 v7, v3, v7
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; CGP-NEXT: v_mul_hi_u32 v7, v0, v3
-; CGP-NEXT: v_mul_hi_u32 v3, v1, v3
-; CGP-NEXT: v_mul_lo_u32 v7, v7, v5
-; CGP-NEXT: v_mul_lo_u32 v3, v3, v5
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4
-; CGP-NEXT: v_add_i32_e32 v7, vcc, 0xffed2705, v1
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4
-; CGP-NEXT: v_add_i32_e32 v4, vcc, 0xffed2705, v1
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v2
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v6
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
+; CGP-NEXT: v_mov_b32_e32 v2, 0xd9528441
+; CGP-NEXT: v_mov_b32_e32 v3, 0x12d8fb
+; CGP-NEXT: v_mul_hi_i32 v4, v0, v2
+; CGP-NEXT: v_mul_hi_i32 v2, v1, v2
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v0
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v1
+; CGP-NEXT: v_ashrrev_i32_e32 v4, 20, v4
+; CGP-NEXT: v_ashrrev_i32_e32 v2, 20, v2
+; CGP-NEXT: v_lshrrev_b32_e32 v5, 31, v4
+; CGP-NEXT: v_lshrrev_b32_e32 v6, 31, v2
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; CGP-NEXT: v_mul_lo_u32 v4, v4, v3
+; CGP-NEXT: v_mul_lo_u32 v2, v2, v3
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = srem <2 x i32> %num, <i32 1235195, i32 1235195>
ret <2 x i32> %result
More information about the llvm-commits
mailing list