[llvm] [GlobalISel] Allow expansion of srem by constant in prelegalizer (PR #148845)

Tue Jul 15 06:05:17 PDT 2025

llvmbot wrote:



@llvm/pr-subscribers-backend-aarch64

@llvm/pr-subscribers-llvm-globalisel

Author: None (jyli0116)

<details>
<summary>Changes</summary>

This patch allows `srem` by a constant to be expanded in the GlobalISel prelegalizer combiner into a multiply by a magic constant, avoiding the need for expensive `sdiv` instructions. This is the last in the series of patches that fix #118090.

---

Patch is 56.43 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/148845.diff


5 Files Affected:

- (modified) llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h (+12-11) 
- (modified) llvm/include/llvm/Target/GlobalISel/Combine.td (+13-7) 
- (modified) llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp (+29-16) 
- (modified) llvm/test/CodeGen/AArch64/rem-by-const.ll (+350-469) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll (+45-113) 


``````````diff

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 31f1197b9723b..da829046cc421 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -700,18 +700,19 @@ class CombinerHelper {
   /// Given an G_UDIV \p MI or G_UREM \p MI expressing a divide by constant,
   /// return an expression that implements it by multiplying by a magic number.
   /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
-  MachineInstr *buildUDivorURemUsingMul(MachineInstr &MI) const;
+  MachineInstr *buildUDivOrURemUsingMul(MachineInstr &MI) const;
   /// Combine G_UDIV or G_UREM by constant into a multiply by magic constant.
-  bool matchUDivorURemByConst(MachineInstr &MI) const;
-  void applyUDivorURemByConst(MachineInstr &MI) const;
-
-  /// Given an G_SDIV \p MI expressing a signed divide by constant, return an
-  /// expression that implements it by multiplying by a magic number.
-  /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
-  MachineInstr *buildSDivUsingMul(MachineInstr &MI) const;
-  /// Combine G_SDIV by constant into a multiply by magic constant.
-  bool matchSDivByConst(MachineInstr &MI) const;
-  void applySDivByConst(MachineInstr &MI) const;
+  bool matchUDivOrURemByConst(MachineInstr &MI) const;
+  void applyUDivOrURemByConst(MachineInstr &MI) const;
+
+  /// Given an G_SDIV \p MI or G_SREM \p MI expressing a signed divide by
+  /// constant, return an expression that implements it by multiplying by a
+  /// magic number. Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's
+  /// Guide".
+  MachineInstr *buildSDivOrSRemUsingMul(MachineInstr &MI) const;
+  /// Combine G_SDIV or G_SREM by constant into a multiply by magic constant.
+  bool matchSDivOrSRemByConst(MachineInstr &MI) const;
+  void applySDivOrSRemByConst(MachineInstr &MI) const;
 
   /// Given an G_SDIV \p MI expressing a signed divided by a pow2 constant,
   /// return expressions that implements it by shifting.
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 66051d756c808..fc81ab76dc72d 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1132,14 +1132,14 @@ def form_bitfield_extract : GICombineGroup<[bitfield_extract_from_sext_inreg,
 def udiv_by_const : GICombineRule<
   (defs root:$root),
   (match (G_UDIV $dst, $x, $y):$root,
-   [{ return Helper.matchUDivorURemByConst(*${root}); }]),
-  (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>;
+   [{ return Helper.matchUDivOrURemByConst(*${root}); }]),
+  (apply [{ Helper.applyUDivOrURemByConst(*${root}); }])>;
 
 def sdiv_by_const : GICombineRule<
   (defs root:$root),
   (match (G_SDIV $dst, $x, $y):$root,
-   [{ return Helper.matchSDivByConst(*${root}); }]),
-  (apply [{ Helper.applySDivByConst(*${root}); }])>;
+   [{ return Helper.matchSDivOrSRemByConst(*${root}); }]),
+  (apply [{ Helper.applySDivOrSRemByConst(*${root}); }])>;
 
 def sdiv_by_pow2 : GICombineRule<
   (defs root:$root),
@@ -1159,10 +1159,16 @@ def intdiv_combines : GICombineGroup<[udiv_by_pow2, sdiv_by_pow2,
 def urem_by_const : GICombineRule<
   (defs root:$root),
   (match (G_UREM $dst, $x, $y):$root,
-   [{ return Helper.matchUDivorURemByConst(*${root}); }]),
-  (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>;
+   [{ return Helper.matchUDivOrURemByConst(*${root}); }]),
+  (apply [{ Helper.applyUDivOrURemByConst(*${root}); }])>;
 
-def intrem_combines : GICombineGroup<[urem_by_const]>;
+def srem_by_const : GICombineRule<
+  (defs root:$root),
+  (match (G_SREM $dst, $x, $y):$root,
+   [{ return Helper.matchSDivOrSRemByConst(*${root}); }]),
+  (apply [{ Helper.applySDivOrSRemByConst(*${root}); }])>;
+
+def intrem_combines : GICombineGroup<[urem_by_const, srem_by_const]>;
 
 def reassoc_ptradd : GICombineRule<
   (defs root:$root, build_fn_matchinfo:$matchinfo),
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 3922eba55e195..e8f513ad5a7a9 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5300,7 +5300,7 @@ bool CombinerHelper::matchSubAddSameReg(MachineInstr &MI,
   return false;
 }
 
-MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const {
+MachineInstr *CombinerHelper::buildUDivOrURemUsingMul(MachineInstr &MI) const {
   unsigned Opcode = MI.getOpcode();
   assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM);
   auto &UDivorRem = cast<GenericMachineInstr>(MI);
@@ -5468,7 +5468,7 @@ MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const {
   return ret;
 }
 
-bool CombinerHelper::matchUDivorURemByConst(MachineInstr &MI) const {
+bool CombinerHelper::matchUDivOrURemByConst(MachineInstr &MI) const {
   unsigned Opcode = MI.getOpcode();
   assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM);
   Register Dst = MI.getOperand(0).getReg();
@@ -5517,13 +5517,14 @@ bool CombinerHelper::matchUDivorURemByConst(MachineInstr &MI) const {
       MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
 }
 
-void CombinerHelper::applyUDivorURemByConst(MachineInstr &MI) const {
-  auto *NewMI = buildUDivorURemUsingMul(MI);
+void CombinerHelper::applyUDivOrURemByConst(MachineInstr &MI) const {
+  auto *NewMI = buildUDivOrURemUsingMul(MI);
   replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
 }
 
-bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const {
-  assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV");
+bool CombinerHelper::matchSDivOrSRemByConst(MachineInstr &MI) const {
+  unsigned Opcode = MI.getOpcode();
+  assert(Opcode == TargetOpcode::G_SDIV || Opcode == TargetOpcode::G_SREM);
   Register Dst = MI.getOperand(0).getReg();
   Register RHS = MI.getOperand(2).getReg();
   LLT DstTy = MRI.getType(Dst);
@@ -5543,7 +5544,8 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const {
     return false;
 
   // If the sdiv has an 'exact' flag we can use a simpler lowering.
-  if (MI.getFlag(MachineInstr::MIFlag::IsExact)) {
+  if (Opcode == TargetOpcode::G_SDIV &&
+      MI.getFlag(MachineInstr::MIFlag::IsExact)) {
     return matchUnaryPredicate(
         MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
   }
@@ -5559,23 +5561,28 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const {
     if (!isLegal({TargetOpcode::G_SMULH, {DstTy}}) &&
         !isLegalOrHasWidenScalar({TargetOpcode::G_MUL, {WideTy, WideTy}}))
       return false;
+    if (Opcode == TargetOpcode::G_SREM &&
+        !isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy, DstTy}}))
+      return false;
   }
 
   return matchUnaryPredicate(
       MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
 }
 
-void CombinerHelper::applySDivByConst(MachineInstr &MI) const {
-  auto *NewMI = buildSDivUsingMul(MI);
+void CombinerHelper::applySDivOrSRemByConst(MachineInstr &MI) const {
+  auto *NewMI = buildSDivOrSRemUsingMul(MI);
   replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
 }
 
-MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const {
-  assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV");
-  auto &SDiv = cast<GenericMachineInstr>(MI);
-  Register Dst = SDiv.getReg(0);
-  Register LHS = SDiv.getReg(1);
-  Register RHS = SDiv.getReg(2);
+MachineInstr *CombinerHelper::buildSDivOrSRemUsingMul(MachineInstr &MI) const {
+  unsigned Opcode = MI.getOpcode();
+  assert(MI.getOpcode() == TargetOpcode::G_SDIV ||
+         Opcode == TargetOpcode::G_SREM);
+  auto &SDivorRem = cast<GenericMachineInstr>(MI);
+  Register Dst = SDivorRem.getReg(0);
+  Register LHS = SDivorRem.getReg(1);
+  Register RHS = SDivorRem.getReg(2);
   LLT Ty = MRI.getType(Dst);
   LLT ScalarTy = Ty.getScalarType();
   const unsigned EltBits = ScalarTy.getScalarSizeInBits();
@@ -5705,7 +5712,13 @@ MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const {
   auto SignShift = MIB.buildConstant(ShiftAmtTy, EltBits - 1);
   auto T = MIB.buildLShr(Ty, Q, SignShift);
   T = MIB.buildAnd(Ty, T, ShiftMask);
-  return MIB.buildAdd(Ty, Q, T);
+  auto ret = MIB.buildAdd(Ty, Q, T);
+
+  if (Opcode == TargetOpcode::G_SREM) {
+    auto Prod = MIB.buildMul(Ty, ret, RHS);
+    return MIB.buildSub(Ty, LHS, Prod);
+  }
+  return ret;
 }
 
 bool CombinerHelper::matchDivByPow2(MachineInstr &MI, bool IsSigned) const {
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index 1376f5d9a380d..b124042265d40 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -19,8 +19,13 @@ define i8 @si8_7(i8 %a, i8 %b) {
 ; CHECK-GI-LABEL: si8_7:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    sxtb w8, w0
-; CHECK-GI-NEXT:    mov w9, #7 // =0x7
-; CHECK-GI-NEXT:    sdiv w8, w8, w9
+; CHECK-GI-NEXT:    mov w9, #-109 // =0xffffff93
+; CHECK-GI-NEXT:    mul w8, w8, w9
+; CHECK-GI-NEXT:    sxth w8, w8
+; CHECK-GI-NEXT:    add w8, w0, w8, asr #8
+; CHECK-GI-NEXT:    sbfx w8, w8, #2, #6
+; CHECK-GI-NEXT:    ubfx w9, w8, #7, #1
+; CHECK-GI-NEXT:    add w8, w8, w9
 ; CHECK-GI-NEXT:    lsl w9, w8, #3
 ; CHECK-GI-NEXT:    sub w8, w9, w8
 ; CHECK-GI-NEXT:    sub w0, w0, w8
@@ -45,8 +50,14 @@ define i8 @si8_100(i8 %a, i8 %b) {
 ; CHECK-GI-LABEL: si8_100:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    sxtb w8, w0
+; CHECK-GI-NEXT:    mov w9, #41 // =0x29
+; CHECK-GI-NEXT:    mul w8, w8, w9
+; CHECK-GI-NEXT:    sxth w8, w8
+; CHECK-GI-NEXT:    sbfx w8, w8, #8, #8
+; CHECK-GI-NEXT:    asr w8, w8, #4
+; CHECK-GI-NEXT:    ubfx w9, w8, #7, #1
+; CHECK-GI-NEXT:    add w8, w8, w9
 ; CHECK-GI-NEXT:    mov w9, #100 // =0x64
-; CHECK-GI-NEXT:    sdiv w8, w8, w9
 ; CHECK-GI-NEXT:    msub w0, w8, w9, w0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -129,8 +140,12 @@ define i16 @si16_7(i16 %a, i16 %b) {
 ; CHECK-GI-LABEL: si16_7:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    sxth w8, w0
-; CHECK-GI-NEXT:    mov w9, #7 // =0x7
-; CHECK-GI-NEXT:    sdiv w8, w8, w9
+; CHECK-GI-NEXT:    mov w9, #18725 // =0x4925
+; CHECK-GI-NEXT:    mul w8, w8, w9
+; CHECK-GI-NEXT:    asr w8, w8, #16
+; CHECK-GI-NEXT:    asr w8, w8, #1
+; CHECK-GI-NEXT:    ubfx w9, w8, #15, #1
+; CHECK-GI-NEXT:    add w8, w8, w9
 ; CHECK-GI-NEXT:    lsl w9, w8, #3
 ; CHECK-GI-NEXT:    sub w8, w9, w8
 ; CHECK-GI-NEXT:    sub w0, w0, w8
@@ -155,8 +170,13 @@ define i16 @si16_100(i16 %a, i16 %b) {
 ; CHECK-GI-LABEL: si16_100:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    sxth w8, w0
+; CHECK-GI-NEXT:    mov w9, #5243 // =0x147b
+; CHECK-GI-NEXT:    mul w8, w8, w9
+; CHECK-GI-NEXT:    asr w8, w8, #16
+; CHECK-GI-NEXT:    asr w8, w8, #3
+; CHECK-GI-NEXT:    ubfx w9, w8, #15, #1
+; CHECK-GI-NEXT:    add w8, w8, w9
 ; CHECK-GI-NEXT:    mov w9, #100 // =0x64
-; CHECK-GI-NEXT:    sdiv w8, w8, w9
 ; CHECK-GI-NEXT:    msub w0, w8, w9, w0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -240,8 +260,13 @@ define i32 @si32_7(i32 %a, i32 %b) {
 ;
 ; CHECK-GI-LABEL: si32_7:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov w8, #7 // =0x7
-; CHECK-GI-NEXT:    sdiv w8, w0, w8
+; CHECK-GI-NEXT:    mov w8, #9363 // =0x2493
+; CHECK-GI-NEXT:    movk w8, #37449, lsl #16
+; CHECK-GI-NEXT:    smull x8, w0, w8
+; CHECK-GI-NEXT:    asr x8, x8, #32
+; CHECK-GI-NEXT:    add w8, w8, w0
+; CHECK-GI-NEXT:    asr w8, w8, #2
+; CHECK-GI-NEXT:    add w8, w8, w8, lsr #31
 ; CHECK-GI-NEXT:    lsl w9, w8, #3
 ; CHECK-GI-NEXT:    sub w8, w9, w8
 ; CHECK-GI-NEXT:    sub w0, w0, w8
@@ -265,9 +290,14 @@ define i32 @si32_100(i32 %a, i32 %b) {
 ;
 ; CHECK-GI-LABEL: si32_100:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov w8, #100 // =0x64
-; CHECK-GI-NEXT:    sdiv w9, w0, w8
-; CHECK-GI-NEXT:    msub w0, w9, w8, w0
+; CHECK-GI-NEXT:    mov w8, #34079 // =0x851f
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    movk w8, #20971, lsl #16
+; CHECK-GI-NEXT:    smull x8, w0, w8
+; CHECK-GI-NEXT:    asr x8, x8, #32
+; CHECK-GI-NEXT:    asr w8, w8, #5
+; CHECK-GI-NEXT:    add w8, w8, w8, lsr #31
+; CHECK-GI-NEXT:    msub w0, w8, w9, w0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s = srem i32 %a, 100
@@ -348,8 +378,13 @@ define i64 @si64_7(i64 %a, i64 %b) {
 ;
 ; CHECK-GI-LABEL: si64_7:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov w8, #7 // =0x7
-; CHECK-GI-NEXT:    sdiv x8, x0, x8
+; CHECK-GI-NEXT:    mov x8, #18725 // =0x4925
+; CHECK-GI-NEXT:    movk x8, #9362, lsl #16
+; CHECK-GI-NEXT:    movk x8, #37449, lsl #32
+; CHECK-GI-NEXT:    movk x8, #18724, lsl #48
+; CHECK-GI-NEXT:    smulh x8, x0, x8
+; CHECK-GI-NEXT:    asr x8, x8, #1
+; CHECK-GI-NEXT:    add x8, x8, x8, lsr #63
 ; CHECK-GI-NEXT:    lsl x9, x8, #3
 ; CHECK-GI-NEXT:    sub x8, x9, x8
 ; CHECK-GI-NEXT:    sub x0, x0, x8
@@ -376,9 +411,16 @@ define i64 @si64_100(i64 %a, i64 %b) {
 ;
 ; CHECK-GI-LABEL: si64_100:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov w8, #100 // =0x64
-; CHECK-GI-NEXT:    sdiv x9, x0, x8
-; CHECK-GI-NEXT:    msub x0, x9, x8, x0
+; CHECK-GI-NEXT:    mov x8, #55051 // =0xd70b
+; CHECK-GI-NEXT:    mov w9, #100 // =0x64
+; CHECK-GI-NEXT:    movk x8, #28835, lsl #16
+; CHECK-GI-NEXT:    movk x8, #2621, lsl #32
+; CHECK-GI-NEXT:    movk x8, #41943, lsl #48
+; CHECK-GI-NEXT:    smulh x8, x0, x8
+; CHECK-GI-NEXT:    add x8, x8, x0
+; CHECK-GI-NEXT:    asr x8, x8, #6
+; CHECK-GI-NEXT:    add x8, x8, x8, lsr #63
+; CHECK-GI-NEXT:    msub x0, x8, x9, x0
 ; CHECK-GI-NEXT:    ret
 entry:
   %s = srem i64 %a, 100
@@ -644,25 +686,49 @@ define <2 x i8> @sv2i8_7(<2 x i8> %d, <2 x i8> %e) {
 ;
 ; CHECK-GI-LABEL: sv2i8_7:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov w8, #7 // =0x7
-; CHECK-GI-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-GI-NEXT:    mov w8, #65427 // =0xff93
 ; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    sshr v0.2s, v0.2s, #24
 ; CHECK-GI-NEXT:    mov v1.h[1], w8
-; CHECK-GI-NEXT:    fmov w9, s0
-; CHECK-GI-NEXT:    mov w10, v0.s[1]
 ; CHECK-GI-NEXT:    shl v1.4h, v1.4h, #8
-; CHECK-GI-NEXT:    sdiv w9, w9, w8
 ; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #8
-; CHECK-GI-NEXT:    smov w11, v1.h[1]
-; CHECK-GI-NEXT:    sdiv w8, w10, w8
-; CHECK-GI-NEXT:    smov w10, v1.h[0]
+; CHECK-GI-NEXT:    smov w8, v1.h[0]
+; CHECK-GI-NEXT:    smov w9, v1.h[1]
+; CHECK-GI-NEXT:    shl v1.2s, v0.2s, #24
+; CHECK-GI-NEXT:    sshr v1.2s, v1.2s, #24
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov w8, #8 // =0x8
+; CHECK-GI-NEXT:    mov v2.s[1], w9
+; CHECK-GI-NEXT:    mul v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #2 // =0x2
+; CHECK-GI-NEXT:    uzp1 v1.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    neg v2.4h, v2.4h
+; CHECK-GI-NEXT:    sshl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mov v2.b[1], w8
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    add v1.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    neg v2.8b, v2.8b
+; CHECK-GI-NEXT:    mov w9, v1.s[1]
+; CHECK-GI-NEXT:    mov v1.b[1], w9
+; CHECK-GI-NEXT:    sshl v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    neg v2.8b, v3.8b
+; CHECK-GI-NEXT:    movi v3.2s, #7
+; CHECK-GI-NEXT:    ushl v2.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    umov w8, v1.b[0]
+; CHECK-GI-NEXT:    umov w10, v1.b[1]
+; CHECK-GI-NEXT:    umov w9, v2.b[0]
+; CHECK-GI-NEXT:    umov w11, v2.b[1]
+; CHECK-GI-NEXT:    fmov s1, w8
 ; CHECK-GI-NEXT:    fmov s2, w9
-; CHECK-GI-NEXT:    fmov s1, w10
-; CHECK-GI-NEXT:    mov v1.s[1], w11
-; CHECK-GI-NEXT:    mov v2.s[1], w8
-; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    mov v2.s[1], w11
+; CHECK-GI-NEXT:    add v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v3.2s
 ; CHECK-GI-NEXT:    ret
 entry:
   %s = srem <2 x i8> %d, <i8 7, i8 7>
@@ -687,25 +753,46 @@ define <2 x i8> @sv2i8_100(<2 x i8> %d, <2 x i8> %e) {
 ;
 ; CHECK-GI-LABEL: sv2i8_100:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov w8, #100 // =0x64
-; CHECK-GI-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-GI-NEXT:    mov w8, #41 // =0x29
 ; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    sshr v0.2s, v0.2s, #24
 ; CHECK-GI-NEXT:    mov v1.h[1], w8
-; CHECK-GI-NEXT:    fmov w9, s0
-; CHECK-GI-NEXT:    mov w10, v0.s[1]
 ; CHECK-GI-NEXT:    shl v1.4h, v1.4h, #8
-; CHECK-GI-NEXT:    sdiv w9, w9, w8
 ; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #8
-; CHECK-GI-NEXT:    smov w11, v1.h[1]
-; CHECK-GI-NEXT:    sdiv w8, w10, w8
-; CHECK-GI-NEXT:    smov w10, v1.h[0]
+; CHECK-GI-NEXT:    smov w8, v1.h[0]
+; CHECK-GI-NEXT:    smov w9, v1.h[1]
+; CHECK-GI-NEXT:    shl v1.2s, v0.2s, #24
+; CHECK-GI-NEXT:    sshr v1.2s, v1.2s, #24
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov w8, #8 // =0x8
+; CHECK-GI-NEXT:    mov v2.s[1], w9
+; CHECK-GI-NEXT:    mul v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    mov w8, #4 // =0x4
+; CHECK-GI-NEXT:    uzp1 v1.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    neg v2.4h, v2.4h
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    mov w8, #7 // =0x7
+; CHECK-GI-NEXT:    sshl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    neg v3.8b, v3.8b
+; CHECK-GI-NEXT:    uzp1 v1.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    mov v2.b[1], w8
+; CHECK-GI-NEXT:    sshl v1.8b, v1.8b, v3.8b
+; CHECK-GI-NEXT:    neg v2.8b, v2.8b
+; CHECK-GI-NEXT:    movi v3.2s, #100
+; CHECK-GI-NEXT:    ushl v2.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT:    umov w8, v1.b[0]
+; CHECK-GI-NEXT:    umov w10, v1.b[1]
+; CHECK-GI-NEXT:    umov w9, v2.b[0]
+; CHECK-GI-NEXT:    umov w11, v2.b[1]
+; CHECK-GI-NEXT:    fmov s1, w8
 ; CHECK-GI-NEXT:    fmov s2, w9
-; CHECK-GI-NEXT:    fmov s1, w10
-; CHECK-GI-NEXT:    mov v1.s[1], w11
-; CHECK-GI-NEXT:    mov v2.s[1], w8
-; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    mov v1.s[1], w10
+; CHECK-GI-NEXT:    mov v2.s[1], w11
+; CHECK-GI-NEXT:    add v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT:    mls v0.2s, v1.2s, v3.2s
 ; CHECK-GI-NEXT:    ret
 entry:
   %s = srem <2 x i8> %d, <i8 100, i8 100>
@@ -872,30 +959,37 @@ define <4 x i8> @sv4i8_7(<4 x i8> %d, <4 x i8> %e) {
 ;
 ; CHECK-GI-LABEL: sv4i8_7:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    mov w8, #7 // =0x7
-; CHECK-GI-NEXT:    movi v3.4h, #7
-; CHECK-GI-NEXT:    fmov s2, w8
-; CHECK-GI-NEXT:    shl v0.4s, v0.4s, #24
-; CHECK-GI-NEXT:    mov v2.h[1], w8
-; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
-; CHECK-GI-NEXT:    sshr v0.4s, v0.4s, #24
-; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
-; CHECK-GI-NEXT:    fmov w9, s0
-; CHECK-GI-NEXT:    mov w10, v0.s[1]
-; CHECK-GI-NEXT:    mov w11, v0.s[2]
-; CHECK-GI-NEXT:    mov w12, v0.s[3]
-; CHECK-GI-NEXT:    mov v3.d[1], v2.d[0]
-; CHECK-GI-NEXT:    sdiv w9, w9, w8
-; CHECK-GI-NEXT:    sdiv w10, w10, w8
-; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    sdiv w11, w11, w8
-; CHECK-GI-NEXT:    mov v1.s[1], w10
-; CHECK-GI-NEXT:    sdiv w9, w12, w8
-; CHECK-GI-NEXT:    mov v1.s[2], w11
-; CHECK-GI-NEXT:    mov v1.s[3], w9
-; CHECK-GI-NEXT:    mls v0.4s, v1.4s, v3.4s
-; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    mov w8, #147 // =0x93
+; CHECK-GI-NEXT:    shl v2.4h, v0.4h, #8
+; CHECK-GI-NEXT:    mov w9, #7 // =0x7
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    sshr v2.4h, v2.4h, #8
+; CHECK-GI-NEXT:    mov v1.b[1], w8
+; CHECK-GI-NEXT:    mov v4.b[1], w9
+; CHECK-GI-NEXT:    mov v1.b[2], w8
+; CHECK-GI-NEXT:    mov v4.b[2], w9
+; CHECK-GI-NEXT:    mov v1.b[3], w8
+; CHECK-GI-NEXT:    mov w8, #2 // =0x2
+; CHECK-GI-NEXT:    mov v4.b[3], w9
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v3.b[1], w8
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mul v1.4h, v2.4h, v1.4h
+; CHECK-GI-NEXT:    fmov d2, d0
+; CHECK-GI-NEXT:    mov v3.b[2], w8
+; CHECK-GI-NEXT:    ssra v2.4h, v1.4h...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/148845