[llvm] [DAGCombiner][LegalizeTypes] Fuse i128 sdiv+srem / udiv+urem into single __divmodti4 / __udivmodti4 call (PR #187908)
Takashi Idobe via llvm-commits
llvm-commits at lists.llvm.org
Sun Mar 22 06:05:08 PDT 2026
https://github.com/Takashiidobe updated https://github.com/llvm/llvm-project/pull/187908
>From 3a8e98ffb03c1c039323db38b4edbee32ec8a428 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Sat, 21 Mar 2026 23:34:25 -0400
Subject: [PATCH 1/5] [test] Add i128 sdiv+srem missed-optimization test for
divmod fusion
Add a lit test documenting the current (unoptimized) codegen for i128
sdiv+srem and udiv+urem pairs on x86_64 and AArch64. Both targets
currently emit two separate helper calls (__divti3 + __modti3 or
__udivti3 + __umodti3) rather than a single fused __divmodti4 /
__udivmodti4 call.
The test serves as a baseline and will be updated when the optimization
lands in a follow-up commit.
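For reference, this is the kind of source that produces the pattern under
test (an illustrative C++ snippet, not taken from the test itself):

  // Illustrative only: an i128 division whose quotient and remainder are
  // both needed, with identical operands. Today each operator lowers to
  // its own runtime call.
  struct QuotRem { __int128 q, r; };

  QuotRem divmod(__int128 n, __int128 d) {
    return { n / d,    // currently lowers to __divti3
             n % d };  // currently lowers to __modti3
  }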
---
llvm/test/CodeGen/X86/i128-divrem-libcall.ll | 170 +++++++++++++++++++
1 file changed, 170 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/i128-divrem-libcall.ll
diff --git a/llvm/test/CodeGen/X86/i128-divrem-libcall.ll b/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
new file mode 100644
index 0000000000000..c8258ec7873de
--- /dev/null
+++ b/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefix=A64
+
+; Test that sdiv+srem / udiv+urem on i128 with the same operands are candidates
+; for fusing into a single __divmodti4 / __udivmodti4 call.
+;
+; Currently this is a missed optimization: two separate helper calls are emitted
+; (__divti3 + __modti3, or __udivti3 + __umodti3) instead of one fused call.
+; See: DAGCombiner::useDivRem, ExpandIntRes_DIVREM, RuntimeLibcalls SDIVREM_I128.
+
+define void @sdivrem_i128(ptr %out, i128 %n, i128 %d) nounwind {
+; X64-LABEL: sdivrem_i128:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbp
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %r13
+; X64-NEXT: pushq %r12
+; X64-NEXT: pushq %rbx
+; X64-NEXT: pushq %rax
+; X64-NEXT: movq %r8, %rbx
+; X64-NEXT: movq %rcx, %r14
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rsi, %r12
+; X64-NEXT: movq %rdi, %r13
+; X64-NEXT: movq %rsi, %rdi
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rcx, %rdx
+; X64-NEXT: movq %r8, %rcx
+; X64-NEXT: callq __divti3@PLT
+; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill
+; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %r15, %rsi
+; X64-NEXT: movq %r14, %rdx
+; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: callq __modti3@PLT
+; X64-NEXT: movq %rbp, 8(%r13)
+; X64-NEXT: movq (%rsp), %rcx # 8-byte Reload
+; X64-NEXT: movq %rcx, (%r13)
+; X64-NEXT: movq %rdx, 24(%r13)
+; X64-NEXT: movq %rax, 16(%r13)
+; X64-NEXT: addq $8, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r12
+; X64-NEXT: popq %r13
+; X64-NEXT: popq %r14
+; X64-NEXT: popq %r15
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+;
+; A64-LABEL: sdivrem_i128:
+; A64: // %bb.0:
+; A64-NEXT: stp x30, x25, [sp, #-64]! // 16-byte Folded Spill
+; A64-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; A64-NEXT: mov x23, x0
+; A64-NEXT: mov x0, x2
+; A64-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; A64-NEXT: mov x21, x3
+; A64-NEXT: mov x22, x2
+; A64-NEXT: mov x1, x3
+; A64-NEXT: mov x2, x4
+; A64-NEXT: mov x3, x5
+; A64-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; A64-NEXT: mov x19, x5
+; A64-NEXT: mov x20, x4
+; A64-NEXT: bl __divti3
+; A64-NEXT: mov x24, x0
+; A64-NEXT: mov x25, x1
+; A64-NEXT: mov x0, x22
+; A64-NEXT: mov x1, x21
+; A64-NEXT: mov x2, x20
+; A64-NEXT: mov x3, x19
+; A64-NEXT: bl __modti3
+; A64-NEXT: stp x24, x25, [x23]
+; A64-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; A64-NEXT: stp x0, x1, [x23, #16]
+; A64-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; A64-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; A64-NEXT: ldp x30, x25, [sp], #64 // 16-byte Folded Reload
+; A64-NEXT: ret
+ %q = sdiv i128 %n, %d
+ %r = srem i128 %n, %d
+ %p0 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 0
+ %p1 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 1
+ store i128 %q, ptr %p0, align 16
+ store i128 %r, ptr %p1, align 16
+ ret void
+}
+
+define void @udivrem_i128(ptr %out, i128 %n, i128 %d) nounwind {
+; X64-LABEL: udivrem_i128:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbp
+; X64-NEXT: pushq %r15
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %r13
+; X64-NEXT: pushq %r12
+; X64-NEXT: pushq %rbx
+; X64-NEXT: pushq %rax
+; X64-NEXT: movq %r8, %rbx
+; X64-NEXT: movq %rcx, %r14
+; X64-NEXT: movq %rdx, %r15
+; X64-NEXT: movq %rsi, %r12
+; X64-NEXT: movq %rdi, %r13
+; X64-NEXT: movq %rsi, %rdi
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rcx, %rdx
+; X64-NEXT: movq %r8, %rcx
+; X64-NEXT: callq __udivti3@PLT
+; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill
+; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: movq %r12, %rdi
+; X64-NEXT: movq %r15, %rsi
+; X64-NEXT: movq %r14, %rdx
+; X64-NEXT: movq %rbx, %rcx
+; X64-NEXT: callq __umodti3@PLT
+; X64-NEXT: movq %rbp, 8(%r13)
+; X64-NEXT: movq (%rsp), %rcx # 8-byte Reload
+; X64-NEXT: movq %rcx, (%r13)
+; X64-NEXT: movq %rdx, 24(%r13)
+; X64-NEXT: movq %rax, 16(%r13)
+; X64-NEXT: addq $8, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r12
+; X64-NEXT: popq %r13
+; X64-NEXT: popq %r14
+; X64-NEXT: popq %r15
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
+;
+; A64-LABEL: udivrem_i128:
+; A64: // %bb.0:
+; A64-NEXT: stp x30, x25, [sp, #-64]! // 16-byte Folded Spill
+; A64-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; A64-NEXT: mov x23, x0
+; A64-NEXT: mov x0, x2
+; A64-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; A64-NEXT: mov x21, x3
+; A64-NEXT: mov x22, x2
+; A64-NEXT: mov x1, x3
+; A64-NEXT: mov x2, x4
+; A64-NEXT: mov x3, x5
+; A64-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; A64-NEXT: mov x19, x5
+; A64-NEXT: mov x20, x4
+; A64-NEXT: bl __udivti3
+; A64-NEXT: mov x24, x0
+; A64-NEXT: mov x25, x1
+; A64-NEXT: mov x0, x22
+; A64-NEXT: mov x1, x21
+; A64-NEXT: mov x2, x20
+; A64-NEXT: mov x3, x19
+; A64-NEXT: bl __umodti3
+; A64-NEXT: stp x24, x25, [x23]
+; A64-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; A64-NEXT: stp x0, x1, [x23, #16]
+; A64-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; A64-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; A64-NEXT: ldp x30, x25, [sp], #64 // 16-byte Folded Reload
+; A64-NEXT: ret
+ %q = udiv i128 %n, %d
+ %r = urem i128 %n, %d
+ %p0 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 0
+ %p1 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 1
+ store i128 %q, ptr %p0, align 16
+ store i128 %r, ptr %p1, align 16
+ ret void
+}
>From 1ccf77b06be1578c48f9304b71866501d072859a Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Sat, 21 Mar 2026 23:35:11 -0400
Subject: [PATCH 2/5] [DAGCombiner][LegalizeTypes] Fuse i128 sdiv+srem into
single __divmodti4 call
When both the quotient and remainder of a signed (or unsigned) 128-bit
division are needed, LLVM previously emitted two separate helper calls
(__divti3 + __modti3). This patch fuses them into a single call to
__divmodti4 / __udivmodti4, which is already provided by compiler-rt
and libgcc for 64-bit targets.
Three changes are required:
1. RuntimeLibcalls.td: Register __divmodti4 and __udivmodti4 as the
libcall implementations for SDIVREM_I128 and UDIVREM_I128 in
Int128RTLibcalls. This set is already gated to 64-bit targets
(AArch64 ILP64, RISC-V64, PPC64, x86_64, Wasm, etc.) following
the same pattern as __multi3.
2. DAGCombiner.cpp: Fix the early-exit guard in useDivRem() that
unconditionally bailed out for non-legal types (including i128).
The condition now allows the combine to proceed when a fused
divrem libcall is available, consistent with the comment above it
("DivMod lib calls can still work on non-legal types").
3. LegalizeIntegerTypes.cpp: Add ExpandIntRes_DIVREM() to handle
ISD::SDIVREM and ISD::UDIVREM during type legalization. Without
this handler the type legalizer would crash ("Do not know how to
expand the result of this operator!") because SDIVREM with an i128
result type had no expansion path. The new handler emits the fused
libcall (following the stack-temporary out-pointer ABI of
__divmodti4; see the sketch below) and falls
back to separate SDIV + SREM nodes when no fused libcall is
registered (e.g. on 32-bit targets).
Fixes the missed optimization tracked in llvm/llvm-project#46350.
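For reference, a sketch of the fused helper interface this patch targets
(signatures as commonly provided by compiler-rt/libgcc; the ti-mode
typedefs are spelled illustratively here, see compiler-rt's divmodti4.c /
udivmodti4.c for the authoritative definitions):

  // The quotient is the return value; the remainder is stored through
  // the third argument.
  extern "C" __int128 __divmodti4(__int128 a, __int128 b, __int128 *rem);
  extern "C" unsigned __int128 __udivmodti4(unsigned __int128 a,
                                            unsigned __int128 b,
                                            unsigned __int128 *rem);

The expansion therefore allocates a stack temporary, passes its address
as the final argument, and loads the remainder back after the call.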
---
llvm/include/llvm/IR/RuntimeLibcalls.td | 2 +
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3 +-
.../SelectionDAG/LegalizeIntegerTypes.cpp | 69 +++++++
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 +
llvm/test/CodeGen/X86/i128-divrem-libcall.ll | 171 ++++++------------
5 files changed, 127 insertions(+), 119 deletions(-)
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td
index e4a926d3cb1d3..668af2b175c6b 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -1786,6 +1786,8 @@ defset list<RuntimeLibcallImpl> Int128RTLibcalls = {
def __lshrti3 : RuntimeLibcallImpl<SRL_I128>;
def __ashrti3 : RuntimeLibcallImpl<SRA_I128>;
def __multi3 : RuntimeLibcallImpl<MUL_I128>;
+ def __divmodti4 : RuntimeLibcallImpl<SDIVREM_I128>;
+ def __udivmodti4 : RuntimeLibcallImpl<UDIVREM_I128>;
}
//--------------------------------------------------------------------
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 82f8fd572bf19..b0ecc7c1a4788 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5056,7 +5056,8 @@ SDValue DAGCombiner::useDivRem(SDNode *Node) {
if (VT.isVector() || !VT.isInteger())
return SDValue();
- if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT))
+ if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT) &&
+ !isDivRemLibcallAvailable(Node, isSigned, DAG))
return SDValue();
// If DIVREM is going to get expanded into a libcall,
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 4a27f804d6720..8c027318120ed 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3101,6 +3101,10 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::READCYCLECOUNTER:
case ISD::READSTEADYCOUNTER: ExpandIntRes_READCOUNTER(N, Lo, Hi); break;
case ISD::SDIV: ExpandIntRes_SDIV(N, Lo, Hi); break;
+ case ISD::SDIVREM:
+ case ISD::UDIVREM:
+ ExpandIntRes_DIVREM(N, Lo, Hi);
+ break;
case ISD::SIGN_EXTEND: ExpandIntRes_SIGN_EXTEND(N, Lo, Hi); break;
case ISD::SIGN_EXTEND_INREG: ExpandIntRes_SIGN_EXTEND_INREG(N, Lo, Hi); break;
case ISD::SREM: ExpandIntRes_SREM(N, Lo, Hi); break;
@@ -4888,6 +4892,71 @@ void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node,
ReplaceValueWith(SDValue(Node, 1), Ovf);
}
+void DAGTypeLegalizer::ExpandIntRes_DIVREM(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ bool isSigned = (N->getOpcode() == ISD::SDIVREM);
+ RTLIB::Libcall LC = isSigned ? RTLIB::SDIVREM_I128 : RTLIB::UDIVREM_I128;
+
+ // If no fused divrem libcall is available, fall back to separate div and rem
+ // nodes that the existing type-legalization handlers can expand
+ // independently.
+ if (DAG.getLibcalls().getLibcallImpl(LC) == RTLIB::Unsupported) {
+ unsigned DivOp = isSigned ? ISD::SDIV : ISD::UDIV;
+ unsigned RemOp = isSigned ? ISD::SREM : ISD::UREM;
+ SDValue Ops[2] = {N->getOperand(0), N->getOperand(1)};
+ SDValue Q = DAG.getNode(DivOp, dl, VT, Ops);
+ SDValue R = DAG.getNode(RemOp, dl, VT, Ops);
+ SplitInteger(Q, Lo, Hi);
+ ReplaceValueWith(SDValue(N, 1), R);
+ return;
+ }
+
+ // Emit __divmodti4 / __udivmodti4:
+ // RetTy libcall(RetTy a, RetTy b, RetTy *rem)
+ // The quotient is the return value; the remainder is written via the pointer.
+ Type *RetTy = VT.getTypeForEVT(*DAG.getContext());
+ TargetLowering::ArgListTy Args;
+ for (const SDValue &Op : N->op_values()) {
+ TargetLowering::ArgListEntry Entry(
+ Op, Op.getValueType().getTypeForEVT(*DAG.getContext()));
+ Entry.IsSExt = isSigned;
+ Entry.IsZExt = !isSigned;
+ Args.push_back(Entry);
+ }
+
+ SDValue FIPtr = DAG.CreateStackTemporary(VT);
+ TargetLowering::ArgListEntry PtrEntry(
+ FIPtr, PointerType::getUnqual(RetTy->getContext()));
+ PtrEntry.IsSExt = PtrEntry.IsZExt = false;
+ Args.push_back(PtrEntry);
+
+ RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl)
+ .setChain(DAG.getEntryNode())
+ .setLibCallee(
+ DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy,
+ DAG.getExternalSymbol(LCImpl, TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args))
+ .setSExtResult(isSigned)
+ .setZExtResult(!isSigned);
+
+ std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
+ // Quotient is the return value; split it into Lo/Hi for the expanded type.
+ SplitInteger(CallInfo.first, Lo, Hi);
+
+ // Remainder is written to the stack temporary; load it back and register
+ // it as the replacement for result 1 of the original SDIVREM/UDIVREM node.
+ int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex();
+ SDValue Rem = DAG.getLoad(
+ VT, dl, CallInfo.second, FIPtr,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ ReplaceValueWith(SDValue(N, 1), Rem);
+}
+
void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N,
SDValue &Lo, SDValue &Hi) {
EVT VT = N->getValueType(0);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 14f361f8bcaed..edf869780e560 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -472,6 +472,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
void ExpandIntRes_BSWAP (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_PARITY (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_MUL (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_DIVREM(SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_SDIV (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_SREM (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_UDIV (SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/test/CodeGen/X86/i128-divrem-libcall.ll b/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
index c8258ec7873de..ee40d39bbd0e0 100644
--- a/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
+++ b/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
@@ -2,83 +2,50 @@
; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefix=A64
-; Test that sdiv+srem / udiv+urem on i128 with the same operands are candidates
-; for fusing into a single __divmodti4 / __udivmodti4 call.
-;
-; Currently this is a missed optimization: two separate helper calls are emitted
-; (__divti3 + __modti3, or __udivti3 + __umodti3) instead of one fused call.
-; See: DAGCombiner::useDivRem, ExpandIntRes_DIVREM, RuntimeLibcalls SDIVREM_I128.
+; Verify that sdiv+srem / udiv+urem on i128 with the same operands lower to a
+; single __divmodti4 / __udivmodti4 call rather than two separate helper calls.
+; DAGCombiner::useDivRem fuses the pair into ISD::SDIVREM/UDIVREM, which is
+; then expanded to the fused libcall via ExpandIntRes_DIVREM in the type
+; legalizer.
define void @sdivrem_i128(ptr %out, i128 %n, i128 %d) nounwind {
; X64-LABEL: sdivrem_i128:
; X64: # %bb.0:
-; X64-NEXT: pushq %rbp
-; X64-NEXT: pushq %r15
-; X64-NEXT: pushq %r14
-; X64-NEXT: pushq %r13
-; X64-NEXT: pushq %r12
; X64-NEXT: pushq %rbx
-; X64-NEXT: pushq %rax
-; X64-NEXT: movq %r8, %rbx
-; X64-NEXT: movq %rcx, %r14
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rsi, %r12
-; X64-NEXT: movq %rdi, %r13
+; X64-NEXT: subq $16, %rsp
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: movq %rsp, %r8
; X64-NEXT: movq %rsi, %rdi
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rcx, %rdx
-; X64-NEXT: movq %r8, %rcx
-; X64-NEXT: callq __divti3@PLT
-; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r15, %rsi
-; X64-NEXT: movq %r14, %rdx
-; X64-NEXT: movq %rbx, %rcx
-; X64-NEXT: callq __modti3@PLT
-; X64-NEXT: movq %rbp, 8(%r13)
-; X64-NEXT: movq (%rsp), %rcx # 8-byte Reload
-; X64-NEXT: movq %rcx, (%r13)
-; X64-NEXT: movq %rdx, 24(%r13)
-; X64-NEXT: movq %rax, 16(%r13)
-; X64-NEXT: addq $8, %rsp
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: callq __divmodti4@PLT
+; X64-NEXT: movaps (%rsp), %xmm0
+; X64-NEXT: movq %rdx, 8(%rbx)
+; X64-NEXT: movq %rax, (%rbx)
+; X64-NEXT: movaps %xmm0, 16(%rbx)
+; X64-NEXT: addq $16, %rsp
; X64-NEXT: popq %rbx
-; X64-NEXT: popq %r12
-; X64-NEXT: popq %r13
-; X64-NEXT: popq %r14
-; X64-NEXT: popq %r15
-; X64-NEXT: popq %rbp
; X64-NEXT: retq
;
; A64-LABEL: sdivrem_i128:
; A64: // %bb.0:
-; A64-NEXT: stp x30, x25, [sp, #-64]! // 16-byte Folded Spill
-; A64-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill
-; A64-NEXT: mov x23, x0
-; A64-NEXT: mov x0, x2
-; A64-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; A64-NEXT: mov x21, x3
-; A64-NEXT: mov x22, x2
+; A64-NEXT: sub sp, sp, #32
+; A64-NEXT: mov x8, x4
+; A64-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
; A64-NEXT: mov x1, x3
-; A64-NEXT: mov x2, x4
+; A64-NEXT: mov x19, x0
+; A64-NEXT: mov x4, sp
+; A64-NEXT: mov x0, x2
+; A64-NEXT: mov x2, x8
; A64-NEXT: mov x3, x5
-; A64-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; A64-NEXT: mov x19, x5
-; A64-NEXT: mov x20, x4
-; A64-NEXT: bl __divti3
-; A64-NEXT: mov x24, x0
-; A64-NEXT: mov x25, x1
-; A64-NEXT: mov x0, x22
-; A64-NEXT: mov x1, x21
-; A64-NEXT: mov x2, x20
-; A64-NEXT: mov x3, x19
-; A64-NEXT: bl __modti3
-; A64-NEXT: stp x24, x25, [x23]
-; A64-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; A64-NEXT: stp x0, x1, [x23, #16]
-; A64-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; A64-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload
-; A64-NEXT: ldp x30, x25, [sp], #64 // 16-byte Folded Reload
+; A64-NEXT: bl __divmodti4
+; A64-NEXT: ldp x8, x9, [sp]
+; A64-NEXT: stp x0, x1, [x19]
+; A64-NEXT: stp x8, x9, [x19, #16]
+; A64-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; A64-NEXT: add sp, sp, #32
; A64-NEXT: ret
%q = sdiv i128 %n, %d
%r = srem i128 %n, %d
@@ -92,73 +59,41 @@ define void @sdivrem_i128(ptr %out, i128 %n, i128 %d) nounwind {
define void @udivrem_i128(ptr %out, i128 %n, i128 %d) nounwind {
; X64-LABEL: udivrem_i128:
; X64: # %bb.0:
-; X64-NEXT: pushq %rbp
-; X64-NEXT: pushq %r15
-; X64-NEXT: pushq %r14
-; X64-NEXT: pushq %r13
-; X64-NEXT: pushq %r12
; X64-NEXT: pushq %rbx
-; X64-NEXT: pushq %rax
-; X64-NEXT: movq %r8, %rbx
-; X64-NEXT: movq %rcx, %r14
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rsi, %r12
-; X64-NEXT: movq %rdi, %r13
+; X64-NEXT: subq $16, %rsp
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: movq %rsp, %r8
; X64-NEXT: movq %rsi, %rdi
; X64-NEXT: movq %rdx, %rsi
; X64-NEXT: movq %rcx, %rdx
-; X64-NEXT: movq %r8, %rcx
-; X64-NEXT: callq __udivti3@PLT
-; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill
-; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %r12, %rdi
-; X64-NEXT: movq %r15, %rsi
-; X64-NEXT: movq %r14, %rdx
-; X64-NEXT: movq %rbx, %rcx
-; X64-NEXT: callq __umodti3@PLT
-; X64-NEXT: movq %rbp, 8(%r13)
-; X64-NEXT: movq (%rsp), %rcx # 8-byte Reload
-; X64-NEXT: movq %rcx, (%r13)
-; X64-NEXT: movq %rdx, 24(%r13)
-; X64-NEXT: movq %rax, 16(%r13)
-; X64-NEXT: addq $8, %rsp
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: callq __udivmodti4@PLT
+; X64-NEXT: movaps (%rsp), %xmm0
+; X64-NEXT: movq %rdx, 8(%rbx)
+; X64-NEXT: movq %rax, (%rbx)
+; X64-NEXT: movaps %xmm0, 16(%rbx)
+; X64-NEXT: addq $16, %rsp
; X64-NEXT: popq %rbx
-; X64-NEXT: popq %r12
-; X64-NEXT: popq %r13
-; X64-NEXT: popq %r14
-; X64-NEXT: popq %r15
-; X64-NEXT: popq %rbp
; X64-NEXT: retq
;
; A64-LABEL: udivrem_i128:
; A64: // %bb.0:
-; A64-NEXT: stp x30, x25, [sp, #-64]! // 16-byte Folded Spill
-; A64-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill
-; A64-NEXT: mov x23, x0
-; A64-NEXT: mov x0, x2
-; A64-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; A64-NEXT: mov x21, x3
-; A64-NEXT: mov x22, x2
+; A64-NEXT: sub sp, sp, #32
+; A64-NEXT: mov x8, x4
+; A64-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
; A64-NEXT: mov x1, x3
-; A64-NEXT: mov x2, x4
+; A64-NEXT: mov x19, x0
+; A64-NEXT: mov x4, sp
+; A64-NEXT: mov x0, x2
+; A64-NEXT: mov x2, x8
; A64-NEXT: mov x3, x5
-; A64-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; A64-NEXT: mov x19, x5
-; A64-NEXT: mov x20, x4
-; A64-NEXT: bl __udivti3
-; A64-NEXT: mov x24, x0
-; A64-NEXT: mov x25, x1
-; A64-NEXT: mov x0, x22
-; A64-NEXT: mov x1, x21
-; A64-NEXT: mov x2, x20
-; A64-NEXT: mov x3, x19
-; A64-NEXT: bl __umodti3
-; A64-NEXT: stp x24, x25, [x23]
-; A64-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; A64-NEXT: stp x0, x1, [x23, #16]
-; A64-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; A64-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload
-; A64-NEXT: ldp x30, x25, [sp], #64 // 16-byte Folded Reload
+; A64-NEXT: bl __udivmodti4
+; A64-NEXT: ldp x8, x9, [sp]
+; A64-NEXT: stp x0, x1, [x19]
+; A64-NEXT: stp x8, x9, [x19, #16]
+; A64-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; A64-NEXT: add sp, sp, #32
; A64-NEXT: ret
%q = udiv i128 %n, %d
%r = urem i128 %n, %d
>From 0ba22730585431d3ebd35dc2194322fb5ce27194 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Sat, 21 Mar 2026 23:41:26 -0400
Subject: [PATCH 3/5] [test] Add i686 fallback check and Windows validation for
i128 divrem
Extend the i128-divrem-libcall.ll test with:
- A third RUN line for i686-linux-gnu using CHECK-NOT to verify that
__divmodti4/__udivmodti4 are never emitted on 32-bit targets where
Int128RTLibcalls (and therefore the SDIVREM_I128 libcall) is not
registered. This exercises the fallback path in ExpandIntRes_DIVREM
that replaces the SDIVREM node with separate SDIV + SREM.
- Confirmation that the x86_64-pc-windows-msvc and x86_64-win32 triples
emit __divmodti4 correctly: compiler-rt includes divmodti4.c in
GENERIC_SOURCES for all targets, and the existing divmod128.ll
Windows tests continue to pass without modification (a quick
host-side cross-check is sketched below).
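As an extra sanity check of the runtime contract, a hypothetical
standalone harness (assuming the usual compiler-rt signature and a
64-bit host whose runtime provides the symbol) can compare the helper
against the native operators:

  // Hypothetical harness, not part of this patch: checks that
  // __divmodti4 agrees with the native i128 operators.
  #include <cassert>

  extern "C" __int128 __divmodti4(__int128 a, __int128 b, __int128 *rem);

  int main() {
    __int128 n = (__int128)1 << 100;
    __int128 d = 12345;
    __int128 r = 0;
    __int128 q = __divmodti4(n, d, &r); // remainder written through &r
    assert(q == n / d && r == n % d);
    return 0;
  }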
---
llvm/test/CodeGen/X86/i128-divrem-libcall.ll | 57 ++++++++++++++++++++
1 file changed, 57 insertions(+)
diff --git a/llvm/test/CodeGen/X86/i128-divrem-libcall.ll b/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
index ee40d39bbd0e0..2486ddb6cbbde 100644
--- a/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
+++ b/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefix=A64
+; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefix=X86-32
; Verify that sdiv+srem / udiv+urem on i128 with the same operands lower to a
; single __divmodti4 / __udivmodti4 call rather than two separate helper calls.
@@ -103,3 +104,59 @@ define void @udivrem_i128(ptr %out, i128 %n, i128 %d) nounwind {
store i128 %r, ptr %p1, align 16
ret void
}
+
+; On 32-bit targets __divmodti4/__udivmodti4 are not registered (Int128RTLibcalls
+; is only added to 64-bit targets), so the fallback path in ExpandIntRes_DIVREM
+; fires: the SDIVREM node is replaced by separate SDIV + SREM, which then expand
+; inline. Verify no fused call is emitted and the code compiles without crashing.
+
+; X86-32-NOT: __divmodti4
+; X86-32-NOT: __udivmodti4
+
+define void @sdivrem_i128_fallback(ptr %out, i128 %n, i128 %d) nounwind {
+; X64-LABEL: sdivrem_i128_fallback:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbx
+; X64-NEXT: subq $16, %rsp
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: movq %rdi, %rbx
+; X64-NEXT: movq %rsp, %r8
+; X64-NEXT: movq %rsi, %rdi
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: movq %rcx, %rdx
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: callq __divmodti4@PLT
+; X64-NEXT: movaps (%rsp), %xmm0
+; X64-NEXT: movq %rdx, 8(%rbx)
+; X64-NEXT: movq %rax, (%rbx)
+; X64-NEXT: movaps %xmm0, 16(%rbx)
+; X64-NEXT: addq $16, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: retq
+;
+; A64-LABEL: sdivrem_i128_fallback:
+; A64: // %bb.0:
+; A64-NEXT: sub sp, sp, #32
+; A64-NEXT: mov x8, x4
+; A64-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; A64-NEXT: mov x1, x3
+; A64-NEXT: mov x19, x0
+; A64-NEXT: mov x4, sp
+; A64-NEXT: mov x0, x2
+; A64-NEXT: mov x2, x8
+; A64-NEXT: mov x3, x5
+; A64-NEXT: bl __divmodti4
+; A64-NEXT: ldp x8, x9, [sp]
+; A64-NEXT: stp x0, x1, [x19]
+; A64-NEXT: stp x8, x9, [x19, #16]
+; A64-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; A64-NEXT: add sp, sp, #32
+; A64-NEXT: ret
+ %q = sdiv i128 %n, %d
+ %r = srem i128 %n, %d
+ %p0 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 0
+ %p1 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 1
+ store i128 %q, ptr %p0, align 16
+ store i128 %r, ptr %p1, align 16
+ ret void
+}
>From f34c09176d6a5cab182a2beb05e190406fc9d26b Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Sun, 22 Mar 2026 00:01:07 -0400
Subject: [PATCH 4/5] [test] Add RISC-V64 coverage to i128 divrem libcall test
Add a riscv64-linux-gnu RUN line (-mattr=+m) and corresponding RV64
check blocks for sdivrem_i128, udivrem_i128, and sdivrem_i128_fallback.
This verifies that the divmod fusion fires on riscv64 just as on x86_64 and
AArch64: a single `call __divmodti4` / `call __udivmodti4` is emitted
instead of separate __divti3 + __modti3 calls.
---
llvm/test/CodeGen/X86/i128-divrem-libcall.ll | 79 +++++++++++++++++++-
1 file changed, 76 insertions(+), 3 deletions(-)
diff --git a/llvm/test/CodeGen/X86/i128-divrem-libcall.ll b/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
index 2486ddb6cbbde..c50c3dda5e02f 100644
--- a/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
+++ b/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefix=A64
-; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefix=X86-32
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefix=A64
+; RUN: llc < %s -mtriple=riscv64-linux-gnu -mattr=+m | FileCheck %s --check-prefix=RV64
+; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefix=X86-32
; Verify that sdiv+srem / udiv+urem on i128 with the same operands lower to a
; single __divmodti4 / __udivmodti4 call rather than two separate helper calls.
@@ -48,6 +49,30 @@ define void @sdivrem_i128(ptr %out, i128 %n, i128 %d) nounwind {
; A64-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
; A64-NEXT: add sp, sp, #32
; A64-NEXT: ret
+;
+; RV64-LABEL: sdivrem_i128:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -32
+; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT: mv a5, a4
+; RV64-NEXT: mv s0, a0
+; RV64-NEXT: mv a4, sp
+; RV64-NEXT: mv a0, a1
+; RV64-NEXT: mv a1, a2
+; RV64-NEXT: mv a2, a3
+; RV64-NEXT: mv a3, a5
+; RV64-NEXT: call __divmodti4
+; RV64-NEXT: ld a2, 0(sp)
+; RV64-NEXT: ld a3, 8(sp)
+; RV64-NEXT: sd a0, 0(s0)
+; RV64-NEXT: sd a1, 8(s0)
+; RV64-NEXT: sd a2, 16(s0)
+; RV64-NEXT: sd a3, 24(s0)
+; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 32
+; RV64-NEXT: ret
%q = sdiv i128 %n, %d
%r = srem i128 %n, %d
%p0 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 0
@@ -96,6 +121,30 @@ define void @udivrem_i128(ptr %out, i128 %n, i128 %d) nounwind {
; A64-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
; A64-NEXT: add sp, sp, #32
; A64-NEXT: ret
+;
+; RV64-LABEL: udivrem_i128:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -32
+; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT: mv a5, a4
+; RV64-NEXT: mv s0, a0
+; RV64-NEXT: mv a4, sp
+; RV64-NEXT: mv a0, a1
+; RV64-NEXT: mv a1, a2
+; RV64-NEXT: mv a2, a3
+; RV64-NEXT: mv a3, a5
+; RV64-NEXT: call __udivmodti4
+; RV64-NEXT: ld a2, 0(sp)
+; RV64-NEXT: ld a3, 8(sp)
+; RV64-NEXT: sd a0, 0(s0)
+; RV64-NEXT: sd a1, 8(s0)
+; RV64-NEXT: sd a2, 16(s0)
+; RV64-NEXT: sd a3, 24(s0)
+; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 32
+; RV64-NEXT: ret
%q = udiv i128 %n, %d
%r = urem i128 %n, %d
%p0 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 0
@@ -152,6 +201,30 @@ define void @sdivrem_i128_fallback(ptr %out, i128 %n, i128 %d) nounwind {
; A64-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
; A64-NEXT: add sp, sp, #32
; A64-NEXT: ret
+;
+; RV64-LABEL: sdivrem_i128_fallback:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -32
+; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT: mv a5, a4
+; RV64-NEXT: mv s0, a0
+; RV64-NEXT: mv a4, sp
+; RV64-NEXT: mv a0, a1
+; RV64-NEXT: mv a1, a2
+; RV64-NEXT: mv a2, a3
+; RV64-NEXT: mv a3, a5
+; RV64-NEXT: call __divmodti4
+; RV64-NEXT: ld a2, 0(sp)
+; RV64-NEXT: ld a3, 8(sp)
+; RV64-NEXT: sd a0, 0(s0)
+; RV64-NEXT: sd a1, 8(s0)
+; RV64-NEXT: sd a2, 16(s0)
+; RV64-NEXT: sd a3, 24(s0)
+; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 32
+; RV64-NEXT: ret
%q = sdiv i128 %n, %d
%r = srem i128 %n, %d
%p0 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 0
>From b3b82a3ccf9e6188dad49153d82b7c4e3a30adf3 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Sun, 22 Mar 2026 09:04:52 -0400
Subject: [PATCH 5/5] [test] Generalize i128 divrem coverage to more
targets and clean up ExpandIntRes_DIVREM
---
.../SelectionDAG/LegalizeIntegerTypes.cpp | 25 +-
.../CodeGen/Generic/i128-divrem-libcall.ll | 53 ++++
llvm/test/CodeGen/X86/i128-divrem-libcall.ll | 235 ------------------
3 files changed, 65 insertions(+), 248 deletions(-)
create mode 100644 llvm/test/CodeGen/Generic/i128-divrem-libcall.ll
delete mode 100644 llvm/test/CodeGen/X86/i128-divrem-libcall.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 8c027318120ed..98779a45a36d7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4896,15 +4896,13 @@ void DAGTypeLegalizer::ExpandIntRes_DIVREM(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
- bool isSigned = (N->getOpcode() == ISD::SDIVREM);
- RTLIB::Libcall LC = isSigned ? RTLIB::SDIVREM_I128 : RTLIB::UDIVREM_I128;
+ bool IsSigned = (N->getOpcode() == ISD::SDIVREM);
+ RTLIB::Libcall LC = IsSigned ? RTLIB::SDIVREM_I128 : RTLIB::UDIVREM_I128;
- // If no fused divrem libcall is available, fall back to separate div and rem
- // nodes that the existing type-legalization handlers can expand
- // independently.
+ // If no fused divrem libcall is available, fall back to separate div and rem.
if (DAG.getLibcalls().getLibcallImpl(LC) == RTLIB::Unsupported) {
- unsigned DivOp = isSigned ? ISD::SDIV : ISD::UDIV;
- unsigned RemOp = isSigned ? ISD::SREM : ISD::UREM;
+ unsigned DivOp = IsSigned ? ISD::SDIV : ISD::UDIV;
+ unsigned RemOp = IsSigned ? ISD::SREM : ISD::UREM;
SDValue Ops[2] = {N->getOperand(0), N->getOperand(1)};
SDValue Q = DAG.getNode(DivOp, dl, VT, Ops);
SDValue R = DAG.getNode(RemOp, dl, VT, Ops);
@@ -4915,21 +4913,22 @@ void DAGTypeLegalizer::ExpandIntRes_DIVREM(SDNode *N, SDValue &Lo,
// Emit __divmodti4 / __udivmodti4:
// RetTy libcall(RetTy a, RetTy b, RetTy *rem)
- // The quotient is the return value; the remainder is written via the pointer.
+ // The quotient is the return value; the remainder is written via pointer.
Type *RetTy = VT.getTypeForEVT(*DAG.getContext());
TargetLowering::ArgListTy Args;
for (const SDValue &Op : N->op_values()) {
TargetLowering::ArgListEntry Entry(
Op, Op.getValueType().getTypeForEVT(*DAG.getContext()));
- Entry.IsSExt = isSigned;
- Entry.IsZExt = !isSigned;
+ Entry.IsSExt = IsSigned;
+ Entry.IsZExt = !IsSigned;
Args.push_back(Entry);
}
+ // The libcall writes the remainder via a pointer argument; allocate a stack
+ // slot for it and pass its address as the third argument.
SDValue FIPtr = DAG.CreateStackTemporary(VT);
TargetLowering::ArgListEntry PtrEntry(
FIPtr, PointerType::getUnqual(RetTy->getContext()));
- PtrEntry.IsSExt = PtrEntry.IsZExt = false;
Args.push_back(PtrEntry);
RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
@@ -4940,8 +4939,8 @@ void DAGTypeLegalizer::ExpandIntRes_DIVREM(SDNode *N, SDValue &Lo,
DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy,
DAG.getExternalSymbol(LCImpl, TLI.getPointerTy(DAG.getDataLayout())),
std::move(Args))
- .setSExtResult(isSigned)
- .setZExtResult(!isSigned);
+ .setSExtResult(IsSigned)
+ .setZExtResult(!IsSigned);
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
diff --git a/llvm/test/CodeGen/Generic/i128-divrem-libcall.ll b/llvm/test/CodeGen/Generic/i128-divrem-libcall.ll
new file mode 100644
index 0000000000000..759ae41fa2226
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/i128-divrem-libcall.ll
@@ -0,0 +1,53 @@
+; 64-bit targets: fused __divmodti4 / __udivmodti4
+; RUN: %if x86-registered-target %{ llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if x86-registered-target %{ llc < %s -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if aarch64-registered-target %{ llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if riscv-registered-target %{ llc < %s -mtriple=riscv64-linux-gnu | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if riscv-registered-target %{ llc < %s -mtriple=riscv64-linux-gnu -mattr=+m | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if powerpc-registered-target %{ llc < %s -mtriple=powerpc64-linux-gnu | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if sparc-registered-target %{ llc < %s -mtriple=sparcv9-linux-gnu | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if loongarch-registered-target %{ llc < %s -mtriple=loongarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if webassembly-registered-target %{ llc < %s -mtriple=wasm32-unknown-unknown | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if webassembly-registered-target %{ llc < %s -mtriple=wasm64-unknown-unknown | FileCheck %s --check-prefixes=CHECK,FUSED %}
+
+; 32-bit / ILP32 targets: no fused libcall
+; RUN: %if x86-registered-target %{ llc < %s -mtriple=i386-linux-gnu | FileCheck %s --check-prefixes=CHECK,SPLIT %}
+; RUN: %if x86-registered-target %{ llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=CHECK,SPLIT %}
+; RUN: %if arm-registered-target %{ llc < %s -mtriple=armv6-linux-gnueabihf | FileCheck %s --check-prefixes=CHECK,SPLIT %}
+; RUN: %if arm-registered-target %{ llc < %s -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefixes=CHECK,SPLIT %}
+; RUN: %if aarch64-registered-target %{ llc < %s -mtriple=aarch64_32-apple-watchos | FileCheck %s --check-prefixes=CHECK,SPLIT %}
+; RUN: %if riscv-registered-target %{ llc < %s -mtriple=riscv32-linux-gnu | FileCheck %s --check-prefixes=CHECK,SPLIT %}
+; RUN: %if riscv-registered-target %{ llc < %s -mtriple=riscv32-linux-gnu -mattr=+m | FileCheck %s --check-prefixes=CHECK,SPLIT %}
+; RUN: %if arm-registered-target %{ llc < %s -mtriple=armv7-none-eabi | FileCheck %s --check-prefixes=CHECK,SPLIT %}
+
+; Verify that sdiv+srem / udiv+urem on i128 fuse into a single __divmodti4 /
+; __udivmodti4 call on targets where the libcall is available (64-bit targets
+; and wasm), and do not on targets where it is not (32-bit / ILP32).
+;
+; The lowering varies by target:
+; 64-bit targets and wasm: fused __divmodti4 / __udivmodti4
+; 32-bit targets that lack the fused call may lower to:
+; - separate __divti3 + __modti3 / __udivti3 + __umodti3 calls, or
+; - fully inline expansion (e.g. i686)
+
+define void @sdivrem_i128(ptr %q_out, ptr %r_out, i128 %n, i128 %d) {
+; CHECK-LABEL: sdivrem_i128:
+; FUSED: __divmodti4
+; SPLIT-NOT: __divmodti4
+ %q = sdiv i128 %n, %d
+ %r = srem i128 %n, %d
+ store i128 %q, ptr %q_out
+ store i128 %r, ptr %r_out
+ ret void
+}
+
+define void @udivrem_i128(ptr %q_out, ptr %r_out, i128 %n, i128 %d) {
+; CHECK-LABEL: udivrem_i128:
+; FUSED: __udivmodti4
+; SPLIT-NOT: __udivmodti4
+ %q = udiv i128 %n, %d
+ %r = urem i128 %n, %d
+ store i128 %q, ptr %q_out
+ store i128 %r, ptr %r_out
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/i128-divrem-libcall.ll b/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
deleted file mode 100644
index c50c3dda5e02f..0000000000000
--- a/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
+++ /dev/null
@@ -1,235 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefix=A64
-; RUN: llc < %s -mtriple=riscv64-linux-gnu -mattr=+m | FileCheck %s --check-prefix=RV64
-; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefix=X86-32
-
-; Verify that sdiv+srem / udiv+urem on i128 with the same operands lower to a
-; single __divmodti4 / __udivmodti4 call rather than two separate helper calls.
-; DAGCombiner::useDivRem fuses the pair into ISD::SDIVREM/UDIVREM, which is
-; then expanded to the fused libcall via ExpandIntRes_DIVREM in the type
-; legalizer.
-
-define void @sdivrem_i128(ptr %out, i128 %n, i128 %d) nounwind {
-; X64-LABEL: sdivrem_i128:
-; X64: # %bb.0:
-; X64-NEXT: pushq %rbx
-; X64-NEXT: subq $16, %rsp
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: movq %rsp, %r8
-; X64-NEXT: movq %rsi, %rdi
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rcx, %rdx
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: callq __divmodti4@PLT
-; X64-NEXT: movaps (%rsp), %xmm0
-; X64-NEXT: movq %rdx, 8(%rbx)
-; X64-NEXT: movq %rax, (%rbx)
-; X64-NEXT: movaps %xmm0, 16(%rbx)
-; X64-NEXT: addq $16, %rsp
-; X64-NEXT: popq %rbx
-; X64-NEXT: retq
-;
-; A64-LABEL: sdivrem_i128:
-; A64: // %bb.0:
-; A64-NEXT: sub sp, sp, #32
-; A64-NEXT: mov x8, x4
-; A64-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
-; A64-NEXT: mov x1, x3
-; A64-NEXT: mov x19, x0
-; A64-NEXT: mov x4, sp
-; A64-NEXT: mov x0, x2
-; A64-NEXT: mov x2, x8
-; A64-NEXT: mov x3, x5
-; A64-NEXT: bl __divmodti4
-; A64-NEXT: ldp x8, x9, [sp]
-; A64-NEXT: stp x0, x1, [x19]
-; A64-NEXT: stp x8, x9, [x19, #16]
-; A64-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; A64-NEXT: add sp, sp, #32
-; A64-NEXT: ret
-;
-; RV64-LABEL: sdivrem_i128:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -32
-; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT: mv a5, a4
-; RV64-NEXT: mv s0, a0
-; RV64-NEXT: mv a4, sp
-; RV64-NEXT: mv a0, a1
-; RV64-NEXT: mv a1, a2
-; RV64-NEXT: mv a2, a3
-; RV64-NEXT: mv a3, a5
-; RV64-NEXT: call __divmodti4
-; RV64-NEXT: ld a2, 0(sp)
-; RV64-NEXT: ld a3, 8(sp)
-; RV64-NEXT: sd a0, 0(s0)
-; RV64-NEXT: sd a1, 8(s0)
-; RV64-NEXT: sd a2, 16(s0)
-; RV64-NEXT: sd a3, 24(s0)
-; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 32
-; RV64-NEXT: ret
- %q = sdiv i128 %n, %d
- %r = srem i128 %n, %d
- %p0 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 0
- %p1 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 1
- store i128 %q, ptr %p0, align 16
- store i128 %r, ptr %p1, align 16
- ret void
-}
-
-define void @udivrem_i128(ptr %out, i128 %n, i128 %d) nounwind {
-; X64-LABEL: udivrem_i128:
-; X64: # %bb.0:
-; X64-NEXT: pushq %rbx
-; X64-NEXT: subq $16, %rsp
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: movq %rsp, %r8
-; X64-NEXT: movq %rsi, %rdi
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rcx, %rdx
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: callq __udivmodti4@PLT
-; X64-NEXT: movaps (%rsp), %xmm0
-; X64-NEXT: movq %rdx, 8(%rbx)
-; X64-NEXT: movq %rax, (%rbx)
-; X64-NEXT: movaps %xmm0, 16(%rbx)
-; X64-NEXT: addq $16, %rsp
-; X64-NEXT: popq %rbx
-; X64-NEXT: retq
-;
-; A64-LABEL: udivrem_i128:
-; A64: // %bb.0:
-; A64-NEXT: sub sp, sp, #32
-; A64-NEXT: mov x8, x4
-; A64-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
-; A64-NEXT: mov x1, x3
-; A64-NEXT: mov x19, x0
-; A64-NEXT: mov x4, sp
-; A64-NEXT: mov x0, x2
-; A64-NEXT: mov x2, x8
-; A64-NEXT: mov x3, x5
-; A64-NEXT: bl __udivmodti4
-; A64-NEXT: ldp x8, x9, [sp]
-; A64-NEXT: stp x0, x1, [x19]
-; A64-NEXT: stp x8, x9, [x19, #16]
-; A64-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; A64-NEXT: add sp, sp, #32
-; A64-NEXT: ret
-;
-; RV64-LABEL: udivrem_i128:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -32
-; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT: mv a5, a4
-; RV64-NEXT: mv s0, a0
-; RV64-NEXT: mv a4, sp
-; RV64-NEXT: mv a0, a1
-; RV64-NEXT: mv a1, a2
-; RV64-NEXT: mv a2, a3
-; RV64-NEXT: mv a3, a5
-; RV64-NEXT: call __udivmodti4
-; RV64-NEXT: ld a2, 0(sp)
-; RV64-NEXT: ld a3, 8(sp)
-; RV64-NEXT: sd a0, 0(s0)
-; RV64-NEXT: sd a1, 8(s0)
-; RV64-NEXT: sd a2, 16(s0)
-; RV64-NEXT: sd a3, 24(s0)
-; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 32
-; RV64-NEXT: ret
- %q = udiv i128 %n, %d
- %r = urem i128 %n, %d
- %p0 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 0
- %p1 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 1
- store i128 %q, ptr %p0, align 16
- store i128 %r, ptr %p1, align 16
- ret void
-}
-
-; On 32-bit targets __divmodti4/__udivmodti4 are not registered (Int128RTLibcalls
-; is only added to 64-bit targets), so the fallback path in ExpandIntRes_DIVREM
-; fires: the SDIVREM node is replaced by separate SDIV + SREM, which then expand
-; inline. Verify no fused call is emitted and the code compiles without crashing.
-
-; X86-32-NOT: __divmodti4
-; X86-32-NOT: __udivmodti4
-
-define void @sdivrem_i128_fallback(ptr %out, i128 %n, i128 %d) nounwind {
-; X64-LABEL: sdivrem_i128_fallback:
-; X64: # %bb.0:
-; X64-NEXT: pushq %rbx
-; X64-NEXT: subq $16, %rsp
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: movq %rsp, %r8
-; X64-NEXT: movq %rsi, %rdi
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: movq %rcx, %rdx
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: callq __divmodti4@PLT
-; X64-NEXT: movaps (%rsp), %xmm0
-; X64-NEXT: movq %rdx, 8(%rbx)
-; X64-NEXT: movq %rax, (%rbx)
-; X64-NEXT: movaps %xmm0, 16(%rbx)
-; X64-NEXT: addq $16, %rsp
-; X64-NEXT: popq %rbx
-; X64-NEXT: retq
-;
-; A64-LABEL: sdivrem_i128_fallback:
-; A64: // %bb.0:
-; A64-NEXT: sub sp, sp, #32
-; A64-NEXT: mov x8, x4
-; A64-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
-; A64-NEXT: mov x1, x3
-; A64-NEXT: mov x19, x0
-; A64-NEXT: mov x4, sp
-; A64-NEXT: mov x0, x2
-; A64-NEXT: mov x2, x8
-; A64-NEXT: mov x3, x5
-; A64-NEXT: bl __divmodti4
-; A64-NEXT: ldp x8, x9, [sp]
-; A64-NEXT: stp x0, x1, [x19]
-; A64-NEXT: stp x8, x9, [x19, #16]
-; A64-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; A64-NEXT: add sp, sp, #32
-; A64-NEXT: ret
-;
-; RV64-LABEL: sdivrem_i128_fallback:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -32
-; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT: mv a5, a4
-; RV64-NEXT: mv s0, a0
-; RV64-NEXT: mv a4, sp
-; RV64-NEXT: mv a0, a1
-; RV64-NEXT: mv a1, a2
-; RV64-NEXT: mv a2, a3
-; RV64-NEXT: mv a3, a5
-; RV64-NEXT: call __divmodti4
-; RV64-NEXT: ld a2, 0(sp)
-; RV64-NEXT: ld a3, 8(sp)
-; RV64-NEXT: sd a0, 0(s0)
-; RV64-NEXT: sd a1, 8(s0)
-; RV64-NEXT: sd a2, 16(s0)
-; RV64-NEXT: sd a3, 24(s0)
-; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 32
-; RV64-NEXT: ret
- %q = sdiv i128 %n, %d
- %r = srem i128 %n, %d
- %p0 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 0
- %p1 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 1
- store i128 %q, ptr %p0, align 16
- store i128 %r, ptr %p1, align 16
- ret void
-}