[llvm] [DAGCombiner][LegalizeTypes] Fuse i128 sdiv+srem / udiv+urem into single __divmodti4 / __udivmodti4 call (PR #187908)

Takashi Idobe via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 31 14:49:39 PDT 2026


https://github.com/Takashiidobe updated https://github.com/llvm/llvm-project/pull/187908

>From 80fc6b7971fef15346bf7aa0621822d02934cb1f Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Sat, 21 Mar 2026 23:34:25 -0400
Subject: [PATCH 01/12] [test] Add i128 sdiv+srem missed-optimization test for
 divmod fusion

Add a lit test documenting the current (unoptimized) codegen for i128
sdiv+srem and udiv+urem pairs on x86_64 and AArch64. Both targets
currently emit two separate helper calls (__divti3 + __modti3 or
__udivti3 + __umodti3) rather than a single fused __divmodti4 /
__udivmodti4 call.

The test serves as a baseline and will be updated when the optimization
lands in a follow-up commit.
---
 llvm/test/CodeGen/X86/i128-divrem-libcall.ll | 170 +++++++++++++++++++
 1 file changed, 170 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/i128-divrem-libcall.ll

diff --git a/llvm/test/CodeGen/X86/i128-divrem-libcall.ll b/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
new file mode 100644
index 0000000000000..c8258ec7873de
--- /dev/null
+++ b/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-linux-gnu   | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=aarch64-linux-gnu  | FileCheck %s --check-prefix=A64
+
+; Test that sdiv+srem / udiv+urem on i128 with the same operands are candidates
+; for fusing into a single __divmodti4 / __udivmodti4 call.
+;
+; Currently this is a missed optimization: two separate helper calls are emitted
+; (__divti3 + __modti3, or __udivti3 + __umodti3) instead of one fused call.
+; See: DAGCombiner::useDivRem, ExpandIntRes_DIVREM, RuntimeLibcalls SDIVREM_I128.
+
+define void @sdivrem_i128(ptr %out, i128 %n, i128 %d) nounwind {
+; X64-LABEL: sdivrem_i128:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    pushq %r12
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movq %r8, %rbx
+; X64-NEXT:    movq %rcx, %r14
+; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rsi, %r12
+; X64-NEXT:    movq %rdi, %r13
+; X64-NEXT:    movq %rsi, %rdi
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rcx, %rdx
+; X64-NEXT:    movq %r8, %rcx
+; X64-NEXT:    callq __divti3@PLT
+; X64-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rdx, %rbp
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    movq %r15, %rsi
+; X64-NEXT:    movq %r14, %rdx
+; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    callq __modti3@PLT
+; X64-NEXT:    movq %rbp, 8(%r13)
+; X64-NEXT:    movq (%rsp), %rcx # 8-byte Reload
+; X64-NEXT:    movq %rcx, (%r13)
+; X64-NEXT:    movq %rdx, 24(%r13)
+; X64-NEXT:    movq %rax, 16(%r13)
+; X64-NEXT:    addq $8, %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r12
+; X64-NEXT:    popq %r13
+; X64-NEXT:    popq %r14
+; X64-NEXT:    popq %r15
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+;
+; A64-LABEL: sdivrem_i128:
+; A64:       // %bb.0:
+; A64-NEXT:    stp x30, x25, [sp, #-64]! // 16-byte Folded Spill
+; A64-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; A64-NEXT:    mov x23, x0
+; A64-NEXT:    mov x0, x2
+; A64-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; A64-NEXT:    mov x21, x3
+; A64-NEXT:    mov x22, x2
+; A64-NEXT:    mov x1, x3
+; A64-NEXT:    mov x2, x4
+; A64-NEXT:    mov x3, x5
+; A64-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; A64-NEXT:    mov x19, x5
+; A64-NEXT:    mov x20, x4
+; A64-NEXT:    bl __divti3
+; A64-NEXT:    mov x24, x0
+; A64-NEXT:    mov x25, x1
+; A64-NEXT:    mov x0, x22
+; A64-NEXT:    mov x1, x21
+; A64-NEXT:    mov x2, x20
+; A64-NEXT:    mov x3, x19
+; A64-NEXT:    bl __modti3
+; A64-NEXT:    stp x24, x25, [x23]
+; A64-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; A64-NEXT:    stp x0, x1, [x23, #16]
+; A64-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; A64-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; A64-NEXT:    ldp x30, x25, [sp], #64 // 16-byte Folded Reload
+; A64-NEXT:    ret
+  %q = sdiv i128 %n, %d
+  %r = srem i128 %n, %d
+  %p0 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 0
+  %p1 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 1
+  store i128 %q, ptr %p0, align 16
+  store i128 %r, ptr %p1, align 16
+  ret void
+}
+
+define void @udivrem_i128(ptr %out, i128 %n, i128 %d) nounwind {
+; X64-LABEL: udivrem_i128:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    pushq %r12
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movq %r8, %rbx
+; X64-NEXT:    movq %rcx, %r14
+; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rsi, %r12
+; X64-NEXT:    movq %rdi, %r13
+; X64-NEXT:    movq %rsi, %rdi
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rcx, %rdx
+; X64-NEXT:    movq %r8, %rcx
+; X64-NEXT:    callq __udivti3@PLT
+; X64-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rdx, %rbp
+; X64-NEXT:    movq %r12, %rdi
+; X64-NEXT:    movq %r15, %rsi
+; X64-NEXT:    movq %r14, %rdx
+; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    callq __umodti3@PLT
+; X64-NEXT:    movq %rbp, 8(%r13)
+; X64-NEXT:    movq (%rsp), %rcx # 8-byte Reload
+; X64-NEXT:    movq %rcx, (%r13)
+; X64-NEXT:    movq %rdx, 24(%r13)
+; X64-NEXT:    movq %rax, 16(%r13)
+; X64-NEXT:    addq $8, %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r12
+; X64-NEXT:    popq %r13
+; X64-NEXT:    popq %r14
+; X64-NEXT:    popq %r15
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+;
+; A64-LABEL: udivrem_i128:
+; A64:       // %bb.0:
+; A64-NEXT:    stp x30, x25, [sp, #-64]! // 16-byte Folded Spill
+; A64-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; A64-NEXT:    mov x23, x0
+; A64-NEXT:    mov x0, x2
+; A64-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; A64-NEXT:    mov x21, x3
+; A64-NEXT:    mov x22, x2
+; A64-NEXT:    mov x1, x3
+; A64-NEXT:    mov x2, x4
+; A64-NEXT:    mov x3, x5
+; A64-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; A64-NEXT:    mov x19, x5
+; A64-NEXT:    mov x20, x4
+; A64-NEXT:    bl __udivti3
+; A64-NEXT:    mov x24, x0
+; A64-NEXT:    mov x25, x1
+; A64-NEXT:    mov x0, x22
+; A64-NEXT:    mov x1, x21
+; A64-NEXT:    mov x2, x20
+; A64-NEXT:    mov x3, x19
+; A64-NEXT:    bl __umodti3
+; A64-NEXT:    stp x24, x25, [x23]
+; A64-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; A64-NEXT:    stp x0, x1, [x23, #16]
+; A64-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; A64-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; A64-NEXT:    ldp x30, x25, [sp], #64 // 16-byte Folded Reload
+; A64-NEXT:    ret
+  %q = udiv i128 %n, %d
+  %r = urem i128 %n, %d
+  %p0 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 0
+  %p1 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 1
+  store i128 %q, ptr %p0, align 16
+  store i128 %r, ptr %p1, align 16
+  ret void
+}

>From 598f30d49dc9fb91ed213ac1248d7208b89d6c5e Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Sat, 21 Mar 2026 23:35:11 -0400
Subject: [PATCH 02/12] [DAGCombiner][LegalizeTypes] Fuse i128 sdiv+srem into
 single __divmodti4 call

When both the quotient and remainder of a signed (or unsigned) 128-bit
division are needed, LLVM previously emitted two separate helper calls
(__divti3 + __modti3). This patch fuses them into a single call to
__divmodti4 / __udivmodti4, which is already provided by compiler-rt
and libgcc for 64-bit targets.

Three changes are required:

1. RuntimeLibcalls.td: Register __divmodti4 and __udivmodti4 as the
   libcall implementations for SDIVREM_I128 and UDIVREM_I128 in
   Int128RTLibcalls. This set is already gated to 64-bit targets
   (AArch64 LP64, RISC-V64, PPC64, x86_64, Wasm, etc.) following
   the same pattern as __multi3.

2. DAGCombiner.cpp: Fix the early-exit guard in useDivRem() that
   unconditionally bailed for non-legal types (including i128).
   The condition now allows the combination to proceed when a fused
   divrem libcall is available, consistent with the comment above it
   ("DivMod lib calls can still work on non-legal types").

3. LegalizeIntegerTypes.cpp: Add ExpandIntRes_DIVREM() to handle
   ISD::SDIVREM and ISD::UDIVREM during type legalization. Without
   this handler the type legalizer would crash ("Do not know how to
   expand the result of this operator!") because SDIVREM with an i128
   result type had no expansion path. The new handler emits the fused
   libcall (using the stack-temp ABI expected by __divmodti4) and falls
   back to separate SDIV + SREM nodes when no fused libcall is
   registered (e.g. on 32-bit targets).

Fixes the missed optimization tracked in llvm/llvm-project#46350.
---
 llvm/include/llvm/IR/RuntimeLibcalls.td       |   2 +
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |   3 +-
 .../SelectionDAG/LegalizeIntegerTypes.cpp     |  69 +++++++
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |   1 +
 llvm/test/CodeGen/X86/i128-divrem-libcall.ll  | 171 ++++++------------
 5 files changed, 127 insertions(+), 119 deletions(-)

diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td
index a0f505f1fda2f..0e2f20a74aa23 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -1788,6 +1788,8 @@ defset list<RuntimeLibcallImpl> Int128RTLibcalls = {
   def __lshrti3 : RuntimeLibcallImpl<SRL_I128>;
   def __ashrti3 : RuntimeLibcallImpl<SRA_I128>;
   def __multi3 : RuntimeLibcallImpl<MUL_I128>;
+  def __divmodti4 : RuntimeLibcallImpl<SDIVREM_I128>;
+  def __udivmodti4 : RuntimeLibcallImpl<UDIVREM_I128>;
 }
 
 //--------------------------------------------------------------------
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 6ad5df3d3272c..49215a16cb66b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5061,7 +5061,8 @@ SDValue DAGCombiner::useDivRem(SDNode *Node) {
   if (VT.isVector() || !VT.isInteger())
     return SDValue();
 
-  if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT))
+  if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT) &&
+      !isDivRemLibcallAvailable(Node, isSigned, DAG))
     return SDValue();
 
   // If DIVREM is going to get expanded into a libcall,
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index c6a4fe0b64cd7..0b036ce546be6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3114,6 +3114,10 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::READCYCLECOUNTER:
   case ISD::READSTEADYCOUNTER: ExpandIntRes_READCOUNTER(N, Lo, Hi); break;
   case ISD::SDIV:        ExpandIntRes_SDIV(N, Lo, Hi); break;
+  case ISD::SDIVREM:
+  case ISD::UDIVREM:
+    ExpandIntRes_DIVREM(N, Lo, Hi);
+    break;
   case ISD::SIGN_EXTEND: ExpandIntRes_SIGN_EXTEND(N, Lo, Hi); break;
   case ISD::SIGN_EXTEND_INREG: ExpandIntRes_SIGN_EXTEND_INREG(N, Lo, Hi); break;
   case ISD::SREM:        ExpandIntRes_SREM(N, Lo, Hi); break;
@@ -4901,6 +4905,71 @@ void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node,
   ReplaceValueWith(SDValue(Node, 1), Ovf);
 }
 
+void DAGTypeLegalizer::ExpandIntRes_DIVREM(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  bool isSigned = (N->getOpcode() == ISD::SDIVREM);
+  RTLIB::Libcall LC = isSigned ? RTLIB::SDIVREM_I128 : RTLIB::UDIVREM_I128;
+
+  // If no fused divrem libcall is available, fall back to separate div and rem
+  // nodes that the existing type-legalization handlers can expand
+  // independently.
+  if (DAG.getLibcalls().getLibcallImpl(LC) == RTLIB::Unsupported) {
+    unsigned DivOp = isSigned ? ISD::SDIV : ISD::UDIV;
+    unsigned RemOp = isSigned ? ISD::SREM : ISD::UREM;
+    SDValue Ops[2] = {N->getOperand(0), N->getOperand(1)};
+    SDValue Q = DAG.getNode(DivOp, dl, VT, Ops);
+    SDValue R = DAG.getNode(RemOp, dl, VT, Ops);
+    SplitInteger(Q, Lo, Hi);
+    ReplaceValueWith(SDValue(N, 1), R);
+    return;
+  }
+
+  // Emit __divmodti4 / __udivmodti4:
+  //   RetTy libcall(RetTy a, RetTy b, RetTy *rem)
+  // The quotient is the return value; the remainder is written via the pointer.
+  Type *RetTy = VT.getTypeForEVT(*DAG.getContext());
+  TargetLowering::ArgListTy Args;
+  for (const SDValue &Op : N->op_values()) {
+    TargetLowering::ArgListEntry Entry(
+        Op, Op.getValueType().getTypeForEVT(*DAG.getContext()));
+    Entry.IsSExt = isSigned;
+    Entry.IsZExt = !isSigned;
+    Args.push_back(Entry);
+  }
+
+  SDValue FIPtr = DAG.CreateStackTemporary(VT);
+  TargetLowering::ArgListEntry PtrEntry(
+      FIPtr, PointerType::getUnqual(RetTy->getContext()));
+  PtrEntry.IsSExt = PtrEntry.IsZExt = false;
+  Args.push_back(PtrEntry);
+
+  RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl)
+      .setChain(DAG.getEntryNode())
+      .setLibCallee(
+          DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy,
+          DAG.getExternalSymbol(LCImpl, TLI.getPointerTy(DAG.getDataLayout())),
+          std::move(Args))
+      .setSExtResult(isSigned)
+      .setZExtResult(!isSigned);
+
+  std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
+  // Quotient is the return value; split it into Lo/Hi for the expanded type.
+  SplitInteger(CallInfo.first, Lo, Hi);
+
+  // Remainder is written to the stack temporary; load it back and register
+  // it as the replacement for result 1 of the original SDIVREM/UDIVREM node.
+  int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex();
+  SDValue Rem = DAG.getLoad(
+      VT, dl, CallInfo.second, FIPtr,
+      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+  ReplaceValueWith(SDValue(N, 1), Rem);
+}
+
 void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N,
                                          SDValue &Lo, SDValue &Hi) {
   EVT VT = N->getValueType(0);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 4362845450acf..e1c11e1c35a31 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -472,6 +472,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void ExpandIntRes_BSWAP             (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_PARITY            (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_MUL               (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_DIVREM(SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_SDIV              (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_SREM              (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_UDIV              (SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/test/CodeGen/X86/i128-divrem-libcall.ll b/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
index c8258ec7873de..ee40d39bbd0e0 100644
--- a/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
+++ b/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
@@ -2,83 +2,50 @@
 ; RUN: llc < %s -mtriple=x86_64-linux-gnu   | FileCheck %s --check-prefix=X64
 ; RUN: llc < %s -mtriple=aarch64-linux-gnu  | FileCheck %s --check-prefix=A64
 
-; Test that sdiv+srem / udiv+urem on i128 with the same operands are candidates
-; for fusing into a single __divmodti4 / __udivmodti4 call.
-;
-; Currently this is a missed optimization: two separate helper calls are emitted
-; (__divti3 + __modti3, or __udivti3 + __umodti3) instead of one fused call.
-; See: DAGCombiner::useDivRem, ExpandIntRes_DIVREM, RuntimeLibcalls SDIVREM_I128.
+; Verify that sdiv+srem / udiv+urem on i128 with the same operands lower to a
+; single __divmodti4 / __udivmodti4 call rather than two separate helper calls.
+; DAGCombiner::useDivRem fuses the pair into ISD::SDIVREM/UDIVREM, which is
+; then expanded to the fused libcall via ExpandIntRes_DIVREM in the type
+; legalizer.
 
 define void @sdivrem_i128(ptr %out, i128 %n, i128 %d) nounwind {
 ; X64-LABEL: sdivrem_i128:
 ; X64:       # %bb.0:
-; X64-NEXT:    pushq %rbp
-; X64-NEXT:    pushq %r15
-; X64-NEXT:    pushq %r14
-; X64-NEXT:    pushq %r13
-; X64-NEXT:    pushq %r12
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movq %r8, %rbx
-; X64-NEXT:    movq %rcx, %r14
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rsi, %r12
-; X64-NEXT:    movq %rdi, %r13
+; X64-NEXT:    subq $16, %rsp
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    movq %rsp, %r8
 ; X64-NEXT:    movq %rsi, %rdi
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rcx, %rdx
-; X64-NEXT:    movq %r8, %rcx
-; X64-NEXT:    callq __divti3@PLT
-; X64-NEXT:    movq %rax, (%rsp) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    movq %r12, %rdi
-; X64-NEXT:    movq %r15, %rsi
-; X64-NEXT:    movq %r14, %rdx
-; X64-NEXT:    movq %rbx, %rcx
-; X64-NEXT:    callq __modti3@PLT
-; X64-NEXT:    movq %rbp, 8(%r13)
-; X64-NEXT:    movq (%rsp), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, (%r13)
-; X64-NEXT:    movq %rdx, 24(%r13)
-; X64-NEXT:    movq %rax, 16(%r13)
-; X64-NEXT:    addq $8, %rsp
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    callq __divmodti4@PLT
+; X64-NEXT:    movaps (%rsp), %xmm0
+; X64-NEXT:    movq %rdx, 8(%rbx)
+; X64-NEXT:    movq %rax, (%rbx)
+; X64-NEXT:    movaps %xmm0, 16(%rbx)
+; X64-NEXT:    addq $16, %rsp
 ; X64-NEXT:    popq %rbx
-; X64-NEXT:    popq %r12
-; X64-NEXT:    popq %r13
-; X64-NEXT:    popq %r14
-; X64-NEXT:    popq %r15
-; X64-NEXT:    popq %rbp
 ; X64-NEXT:    retq
 ;
 ; A64-LABEL: sdivrem_i128:
 ; A64:       // %bb.0:
-; A64-NEXT:    stp x30, x25, [sp, #-64]! // 16-byte Folded Spill
-; A64-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
-; A64-NEXT:    mov x23, x0
-; A64-NEXT:    mov x0, x2
-; A64-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; A64-NEXT:    mov x21, x3
-; A64-NEXT:    mov x22, x2
+; A64-NEXT:    sub sp, sp, #32
+; A64-NEXT:    mov x8, x4
+; A64-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
 ; A64-NEXT:    mov x1, x3
-; A64-NEXT:    mov x2, x4
+; A64-NEXT:    mov x19, x0
+; A64-NEXT:    mov x4, sp
+; A64-NEXT:    mov x0, x2
+; A64-NEXT:    mov x2, x8
 ; A64-NEXT:    mov x3, x5
-; A64-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; A64-NEXT:    mov x19, x5
-; A64-NEXT:    mov x20, x4
-; A64-NEXT:    bl __divti3
-; A64-NEXT:    mov x24, x0
-; A64-NEXT:    mov x25, x1
-; A64-NEXT:    mov x0, x22
-; A64-NEXT:    mov x1, x21
-; A64-NEXT:    mov x2, x20
-; A64-NEXT:    mov x3, x19
-; A64-NEXT:    bl __modti3
-; A64-NEXT:    stp x24, x25, [x23]
-; A64-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; A64-NEXT:    stp x0, x1, [x23, #16]
-; A64-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; A64-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
-; A64-NEXT:    ldp x30, x25, [sp], #64 // 16-byte Folded Reload
+; A64-NEXT:    bl __divmodti4
+; A64-NEXT:    ldp x8, x9, [sp]
+; A64-NEXT:    stp x0, x1, [x19]
+; A64-NEXT:    stp x8, x9, [x19, #16]
+; A64-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; A64-NEXT:    add sp, sp, #32
 ; A64-NEXT:    ret
   %q = sdiv i128 %n, %d
   %r = srem i128 %n, %d
@@ -92,73 +59,41 @@ define void @sdivrem_i128(ptr %out, i128 %n, i128 %d) nounwind {
 define void @udivrem_i128(ptr %out, i128 %n, i128 %d) nounwind {
 ; X64-LABEL: udivrem_i128:
 ; X64:       # %bb.0:
-; X64-NEXT:    pushq %rbp
-; X64-NEXT:    pushq %r15
-; X64-NEXT:    pushq %r14
-; X64-NEXT:    pushq %r13
-; X64-NEXT:    pushq %r12
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movq %r8, %rbx
-; X64-NEXT:    movq %rcx, %r14
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq %rsi, %r12
-; X64-NEXT:    movq %rdi, %r13
+; X64-NEXT:    subq $16, %rsp
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    movq %rsp, %r8
 ; X64-NEXT:    movq %rsi, %rdi
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rcx, %rdx
-; X64-NEXT:    movq %r8, %rcx
-; X64-NEXT:    callq __udivti3@PLT
-; X64-NEXT:    movq %rax, (%rsp) # 8-byte Spill
-; X64-NEXT:    movq %rdx, %rbp
-; X64-NEXT:    movq %r12, %rdi
-; X64-NEXT:    movq %r15, %rsi
-; X64-NEXT:    movq %r14, %rdx
-; X64-NEXT:    movq %rbx, %rcx
-; X64-NEXT:    callq __umodti3@PLT
-; X64-NEXT:    movq %rbp, 8(%r13)
-; X64-NEXT:    movq (%rsp), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, (%r13)
-; X64-NEXT:    movq %rdx, 24(%r13)
-; X64-NEXT:    movq %rax, 16(%r13)
-; X64-NEXT:    addq $8, %rsp
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    callq __udivmodti4@PLT
+; X64-NEXT:    movaps (%rsp), %xmm0
+; X64-NEXT:    movq %rdx, 8(%rbx)
+; X64-NEXT:    movq %rax, (%rbx)
+; X64-NEXT:    movaps %xmm0, 16(%rbx)
+; X64-NEXT:    addq $16, %rsp
 ; X64-NEXT:    popq %rbx
-; X64-NEXT:    popq %r12
-; X64-NEXT:    popq %r13
-; X64-NEXT:    popq %r14
-; X64-NEXT:    popq %r15
-; X64-NEXT:    popq %rbp
 ; X64-NEXT:    retq
 ;
 ; A64-LABEL: udivrem_i128:
 ; A64:       // %bb.0:
-; A64-NEXT:    stp x30, x25, [sp, #-64]! // 16-byte Folded Spill
-; A64-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
-; A64-NEXT:    mov x23, x0
-; A64-NEXT:    mov x0, x2
-; A64-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; A64-NEXT:    mov x21, x3
-; A64-NEXT:    mov x22, x2
+; A64-NEXT:    sub sp, sp, #32
+; A64-NEXT:    mov x8, x4
+; A64-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
 ; A64-NEXT:    mov x1, x3
-; A64-NEXT:    mov x2, x4
+; A64-NEXT:    mov x19, x0
+; A64-NEXT:    mov x4, sp
+; A64-NEXT:    mov x0, x2
+; A64-NEXT:    mov x2, x8
 ; A64-NEXT:    mov x3, x5
-; A64-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; A64-NEXT:    mov x19, x5
-; A64-NEXT:    mov x20, x4
-; A64-NEXT:    bl __udivti3
-; A64-NEXT:    mov x24, x0
-; A64-NEXT:    mov x25, x1
-; A64-NEXT:    mov x0, x22
-; A64-NEXT:    mov x1, x21
-; A64-NEXT:    mov x2, x20
-; A64-NEXT:    mov x3, x19
-; A64-NEXT:    bl __umodti3
-; A64-NEXT:    stp x24, x25, [x23]
-; A64-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; A64-NEXT:    stp x0, x1, [x23, #16]
-; A64-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; A64-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
-; A64-NEXT:    ldp x30, x25, [sp], #64 // 16-byte Folded Reload
+; A64-NEXT:    bl __udivmodti4
+; A64-NEXT:    ldp x8, x9, [sp]
+; A64-NEXT:    stp x0, x1, [x19]
+; A64-NEXT:    stp x8, x9, [x19, #16]
+; A64-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; A64-NEXT:    add sp, sp, #32
 ; A64-NEXT:    ret
   %q = udiv i128 %n, %d
   %r = urem i128 %n, %d

>From fe653c83322e668a24ec512ed980d6f405cc6131 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Sat, 21 Mar 2026 23:41:26 -0400
Subject: [PATCH 03/12] [test] Add i686 fallback check and Windows validation
 for i128 divrem

Extend the i128-divrem-libcall.ll test with:

- A third RUN line for i686-linux-gnu using CHECK-NOT to verify that
  __divmodti4/__udivmodti4 are never emitted on 32-bit targets where
  Int128RTLibcalls (and therefore the SDIVREM_I128 libcall) is not
  registered. This exercises the fallback path in ExpandIntRes_DIVREM
  that replaces the SDIVREM node with separate SDIV + SREM.

- Confirmed that x86_64-pc-windows-msvc and x86_64-win32 triples emit
  __divmodti4 correctly: compiler-rt includes divmodti4.c in
  GENERIC_SOURCES for all targets, and the existing divmod128.ll
  Windows tests continue to pass without modification.
---
 llvm/test/CodeGen/X86/i128-divrem-libcall.ll | 57 ++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/llvm/test/CodeGen/X86/i128-divrem-libcall.ll b/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
index ee40d39bbd0e0..2486ddb6cbbde 100644
--- a/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
+++ b/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc < %s -mtriple=x86_64-linux-gnu   | FileCheck %s --check-prefix=X64
 ; RUN: llc < %s -mtriple=aarch64-linux-gnu  | FileCheck %s --check-prefix=A64
+; RUN: llc < %s -mtriple=i686-linux-gnu     | FileCheck %s --check-prefix=X86-32
 
 ; Verify that sdiv+srem / udiv+urem on i128 with the same operands lower to a
 ; single __divmodti4 / __udivmodti4 call rather than two separate helper calls.
@@ -103,3 +104,59 @@ define void @udivrem_i128(ptr %out, i128 %n, i128 %d) nounwind {
   store i128 %r, ptr %p1, align 16
   ret void
 }
+
+; On 32-bit targets __divmodti4/__udivmodti4 are not registered (Int128RTLibcalls
+; is only added to 64-bit targets), so the fallback path in ExpandIntRes_DIVREM
+; fires: the SDIVREM node is replaced by separate SDIV + SREM, which then expand
+; inline. Verify no fused call is emitted and the code compiles without crashing.
+
+; X86-32-NOT: __divmodti4
+; X86-32-NOT: __udivmodti4
+
+define void @sdivrem_i128_fallback(ptr %out, i128 %n, i128 %d) nounwind {
+; X64-LABEL: sdivrem_i128_fallback:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    subq $16, %rsp
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    movq %rsp, %r8
+; X64-NEXT:    movq %rsi, %rdi
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rcx, %rdx
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    callq __divmodti4@PLT
+; X64-NEXT:    movaps (%rsp), %xmm0
+; X64-NEXT:    movq %rdx, 8(%rbx)
+; X64-NEXT:    movq %rax, (%rbx)
+; X64-NEXT:    movaps %xmm0, 16(%rbx)
+; X64-NEXT:    addq $16, %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    retq
+;
+; A64-LABEL: sdivrem_i128_fallback:
+; A64:       // %bb.0:
+; A64-NEXT:    sub sp, sp, #32
+; A64-NEXT:    mov x8, x4
+; A64-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; A64-NEXT:    mov x1, x3
+; A64-NEXT:    mov x19, x0
+; A64-NEXT:    mov x4, sp
+; A64-NEXT:    mov x0, x2
+; A64-NEXT:    mov x2, x8
+; A64-NEXT:    mov x3, x5
+; A64-NEXT:    bl __divmodti4
+; A64-NEXT:    ldp x8, x9, [sp]
+; A64-NEXT:    stp x0, x1, [x19]
+; A64-NEXT:    stp x8, x9, [x19, #16]
+; A64-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; A64-NEXT:    add sp, sp, #32
+; A64-NEXT:    ret
+  %q = sdiv i128 %n, %d
+  %r = srem i128 %n, %d
+  %p0 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 0
+  %p1 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 1
+  store i128 %q, ptr %p0, align 16
+  store i128 %r, ptr %p1, align 16
+  ret void
+}

>From 7ada76be81c257f5f2c9e019de868a8895cd6281 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Sun, 22 Mar 2026 00:01:07 -0400
Subject: [PATCH 04/12] [test] Add RISC-V64 coverage to i128 divrem libcall
 test

Add a riscv64-linux-gnu RUN line (-mattr=+m) and corresponding RV64
check blocks for sdivrem_i128, udivrem_i128, and sdivrem_i128_fallback.

Verifies that the divmod fusion fires on RISC-V64 just as on x86_64 and
AArch64: a single `call __divmodti4` / `call __udivmodti4` is emitted
instead of separate __divti3 + __modti3 calls.
---
 llvm/test/CodeGen/X86/i128-divrem-libcall.ll | 79 +++++++++++++++++++-
 1 file changed, 76 insertions(+), 3 deletions(-)

diff --git a/llvm/test/CodeGen/X86/i128-divrem-libcall.ll b/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
index 2486ddb6cbbde..c50c3dda5e02f 100644
--- a/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
+++ b/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc < %s -mtriple=x86_64-linux-gnu   | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=aarch64-linux-gnu  | FileCheck %s --check-prefix=A64
-; RUN: llc < %s -mtriple=i686-linux-gnu     | FileCheck %s --check-prefix=X86-32
+; RUN: llc < %s -mtriple=x86_64-linux-gnu              | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=aarch64-linux-gnu             | FileCheck %s --check-prefix=A64
+; RUN: llc < %s -mtriple=riscv64-linux-gnu -mattr=+m   | FileCheck %s --check-prefix=RV64
+; RUN: llc < %s -mtriple=i686-linux-gnu                | FileCheck %s --check-prefix=X86-32
 
 ; Verify that sdiv+srem / udiv+urem on i128 with the same operands lower to a
 ; single __divmodti4 / __udivmodti4 call rather than two separate helper calls.
@@ -48,6 +49,30 @@ define void @sdivrem_i128(ptr %out, i128 %n, i128 %d) nounwind {
 ; A64-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
 ; A64-NEXT:    add sp, sp, #32
 ; A64-NEXT:    ret
+;
+; RV64-LABEL: sdivrem_i128:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -32
+; RV64-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    mv s0, a0
+; RV64-NEXT:    mv a4, sp
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:    mv a1, a2
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:    mv a3, a5
+; RV64-NEXT:    call __divmodti4
+; RV64-NEXT:    ld a2, 0(sp)
+; RV64-NEXT:    ld a3, 8(sp)
+; RV64-NEXT:    sd a0, 0(s0)
+; RV64-NEXT:    sd a1, 8(s0)
+; RV64-NEXT:    sd a2, 16(s0)
+; RV64-NEXT:    sd a3, 24(s0)
+; RV64-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 32
+; RV64-NEXT:    ret
   %q = sdiv i128 %n, %d
   %r = srem i128 %n, %d
   %p0 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 0
@@ -96,6 +121,30 @@ define void @udivrem_i128(ptr %out, i128 %n, i128 %d) nounwind {
 ; A64-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
 ; A64-NEXT:    add sp, sp, #32
 ; A64-NEXT:    ret
+;
+; RV64-LABEL: udivrem_i128:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -32
+; RV64-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    mv s0, a0
+; RV64-NEXT:    mv a4, sp
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:    mv a1, a2
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:    mv a3, a5
+; RV64-NEXT:    call __udivmodti4
+; RV64-NEXT:    ld a2, 0(sp)
+; RV64-NEXT:    ld a3, 8(sp)
+; RV64-NEXT:    sd a0, 0(s0)
+; RV64-NEXT:    sd a1, 8(s0)
+; RV64-NEXT:    sd a2, 16(s0)
+; RV64-NEXT:    sd a3, 24(s0)
+; RV64-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 32
+; RV64-NEXT:    ret
   %q = udiv i128 %n, %d
   %r = urem i128 %n, %d
   %p0 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 0
@@ -152,6 +201,30 @@ define void @sdivrem_i128_fallback(ptr %out, i128 %n, i128 %d) nounwind {
 ; A64-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
 ; A64-NEXT:    add sp, sp, #32
 ; A64-NEXT:    ret
+;
+; RV64-LABEL: sdivrem_i128_fallback:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -32
+; RV64-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT:    mv a5, a4
+; RV64-NEXT:    mv s0, a0
+; RV64-NEXT:    mv a4, sp
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:    mv a1, a2
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:    mv a3, a5
+; RV64-NEXT:    call __divmodti4
+; RV64-NEXT:    ld a2, 0(sp)
+; RV64-NEXT:    ld a3, 8(sp)
+; RV64-NEXT:    sd a0, 0(s0)
+; RV64-NEXT:    sd a1, 8(s0)
+; RV64-NEXT:    sd a2, 16(s0)
+; RV64-NEXT:    sd a3, 24(s0)
+; RV64-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 32
+; RV64-NEXT:    ret
   %q = sdiv i128 %n, %d
   %r = srem i128 %n, %d
   %p0 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 0

>From 6cf400b3d132c5acc2d2d4b8059949b770a7bb2f Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Sun, 22 Mar 2026 09:04:52 -0400
Subject: [PATCH 05/12] Update tests to handle more targets

---
 .../SelectionDAG/LegalizeIntegerTypes.cpp     |  25 +-
 .../CodeGen/Generic/i128-divrem-libcall.ll    |  53 ++++
 llvm/test/CodeGen/X86/i128-divrem-libcall.ll  | 235 ------------------
 3 files changed, 65 insertions(+), 248 deletions(-)
 create mode 100644 llvm/test/CodeGen/Generic/i128-divrem-libcall.ll
 delete mode 100644 llvm/test/CodeGen/X86/i128-divrem-libcall.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 0b036ce546be6..6318a077da6ed 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4909,15 +4909,13 @@ void DAGTypeLegalizer::ExpandIntRes_DIVREM(SDNode *N, SDValue &Lo,
                                            SDValue &Hi) {
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
-  bool isSigned = (N->getOpcode() == ISD::SDIVREM);
-  RTLIB::Libcall LC = isSigned ? RTLIB::SDIVREM_I128 : RTLIB::UDIVREM_I128;
+  bool IsSigned = (N->getOpcode() == ISD::SDIVREM);
+  RTLIB::Libcall LC = IsSigned ? RTLIB::SDIVREM_I128 : RTLIB::UDIVREM_I128;
 
-  // If no fused divrem libcall is available, fall back to separate div and rem
-  // nodes that the existing type-legalization handlers can expand
-  // independently.
+  // If no fused divrem libcall is available, fall back to separate div and rem.
   if (DAG.getLibcalls().getLibcallImpl(LC) == RTLIB::Unsupported) {
-    unsigned DivOp = isSigned ? ISD::SDIV : ISD::UDIV;
-    unsigned RemOp = isSigned ? ISD::SREM : ISD::UREM;
+    unsigned DivOp = IsSigned ? ISD::SDIV : ISD::UDIV;
+    unsigned RemOp = IsSigned ? ISD::SREM : ISD::UREM;
     SDValue Ops[2] = {N->getOperand(0), N->getOperand(1)};
     SDValue Q = DAG.getNode(DivOp, dl, VT, Ops);
     SDValue R = DAG.getNode(RemOp, dl, VT, Ops);
@@ -4928,21 +4926,22 @@ void DAGTypeLegalizer::ExpandIntRes_DIVREM(SDNode *N, SDValue &Lo,
 
   // Emit __divmodti4 / __udivmodti4:
   //   RetTy libcall(RetTy a, RetTy b, RetTy *rem)
-  // The quotient is the return value; the remainder is written via the pointer.
+  // The quotient is the return value; the remainder is written via pointer.
   Type *RetTy = VT.getTypeForEVT(*DAG.getContext());
   TargetLowering::ArgListTy Args;
   for (const SDValue &Op : N->op_values()) {
     TargetLowering::ArgListEntry Entry(
         Op, Op.getValueType().getTypeForEVT(*DAG.getContext()));
-    Entry.IsSExt = isSigned;
-    Entry.IsZExt = !isSigned;
+    Entry.IsSExt = IsSigned;
+    Entry.IsZExt = !IsSigned;
     Args.push_back(Entry);
   }
 
+  // The libcall writes the remainder via a pointer argument; allocate a stack
+  // slot for it and pass its address as the third argument.
   SDValue FIPtr = DAG.CreateStackTemporary(VT);
   TargetLowering::ArgListEntry PtrEntry(
       FIPtr, PointerType::getUnqual(RetTy->getContext()));
-  PtrEntry.IsSExt = PtrEntry.IsZExt = false;
   Args.push_back(PtrEntry);
 
   RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
@@ -4953,8 +4952,8 @@ void DAGTypeLegalizer::ExpandIntRes_DIVREM(SDNode *N, SDValue &Lo,
           DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy,
           DAG.getExternalSymbol(LCImpl, TLI.getPointerTy(DAG.getDataLayout())),
           std::move(Args))
-      .setSExtResult(isSigned)
-      .setZExtResult(!isSigned);
+      .setSExtResult(IsSigned)
+      .setZExtResult(!IsSigned);
 
   std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
 
diff --git a/llvm/test/CodeGen/Generic/i128-divrem-libcall.ll b/llvm/test/CodeGen/Generic/i128-divrem-libcall.ll
new file mode 100644
index 0000000000000..759ae41fa2226
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/i128-divrem-libcall.ll
@@ -0,0 +1,53 @@
+; 64-bit targets: fused __divmodti4 / __udivmodti4
+; RUN: %if x86-registered-target			   %{ llc < %s -mtriple=x86_64-linux-gnu              | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if x86-registered-target         %{ llc < %s -mtriple=x86_64-pc-windows-msvc        | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if aarch64-registered-target     %{ llc < %s -mtriple=aarch64-linux-gnu             | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if riscv-registered-target       %{ llc < %s -mtriple=riscv64-linux-gnu             | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if riscv-registered-target       %{ llc < %s -mtriple=riscv64-linux-gnu -mattr=+m   | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if powerpc-registered-target     %{ llc < %s -mtriple=powerpc64-linux-gnu           | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if sparc-registered-target       %{ llc < %s -mtriple=sparcv9-linux-gnu             | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if loongarch-registered-target   %{ llc < %s -mtriple=loongarch64-linux-gnu         | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if webassembly-registered-target %{ llc < %s -mtriple=wasm32-unknown-unknown        | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if webassembly-registered-target %{ llc < %s -mtriple=wasm64-unknown-unknown        | FileCheck %s --check-prefixes=CHECK,FUSED %}
+
+; 32-bit / ILP32 targets: no fused libcall 
+; RUN: %if x86-registered-target      %{ llc < %s -mtriple=i386-linux-gnu                | FileCheck %s --check-prefixes=CHECK,SPLIT %}
+; RUN: %if x86-registered-target      %{ llc < %s -mtriple=i686-linux-gnu                | FileCheck %s --check-prefixes=CHECK,SPLIT %}
+; RUN: %if arm-registered-target      %{ llc < %s -mtriple=armv6-linux-gnueabihf         | FileCheck %s --check-prefixes=CHECK,SPLIT %}
+; RUN: %if arm-registered-target      %{ llc < %s -mtriple=armv7-linux-gnueabi           | FileCheck %s --check-prefixes=CHECK,SPLIT %}
+; RUN: %if aarch64-registered-target  %{ llc < %s -mtriple=aarch64_32-apple-watchos      | FileCheck %s --check-prefixes=CHECK,SPLIT %}
+; RUN: %if riscv-registered-target    %{ llc < %s -mtriple=riscv32-linux-gnu             | FileCheck %s --check-prefixes=CHECK,SPLIT %}
+; RUN: %if riscv-registered-target    %{ llc < %s -mtriple=riscv32-linux-gnu -mattr=+m   | FileCheck %s --check-prefixes=CHECK,SPLIT %}
+; RUN: %if arm-registered-target      %{ llc < %s -mtriple=armv7-none-eabi               | FileCheck %s --check-prefixes=CHECK,SPLIT %}
+
+; Verify that sdiv+srem / udiv+urem on i128 fuse into a single __divmodti4 /
+; __udivmodti4 call on targets where the libcall is available (64-bit targets
+; and wasm), and do not on targets where it is not (32-bit / ILP32).
+;
+; The lowering varies by target:
+;   64-bit targets and wasm: fused __divmodti4 / __udivmodti4
+;   32-bit targets that lack the fused call may lower to:
+;     - separate __divti3 + __modti3 / __udivti3 + __umodti3 calls, or
+;     - fully inline expansion (e.g. i686)
+
+define void @sdivrem_i128(ptr %q_out, ptr %r_out, i128 %n, i128 %d) {
+; CHECK-LABEL: sdivrem_i128:
+; FUSED:           __divmodti4
+; SPLIT-NOT:       __divmodti4
+  %q = sdiv i128 %n, %d
+  %r = srem i128 %n, %d
+  store i128 %q, ptr %q_out
+  store i128 %r, ptr %r_out
+  ret void
+}
+
+define void @udivrem_i128(ptr %q_out, ptr %r_out, i128 %n, i128 %d) {
+; CHECK-LABEL: udivrem_i128:
+; FUSED:           __udivmodti4
+; SPLIT-NOT:       __udivmodti4
+  %q = udiv i128 %n, %d
+  %r = urem i128 %n, %d
+  store i128 %q, ptr %q_out
+  store i128 %r, ptr %r_out
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/i128-divrem-libcall.ll b/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
deleted file mode 100644
index c50c3dda5e02f..0000000000000
--- a/llvm/test/CodeGen/X86/i128-divrem-libcall.ll
+++ /dev/null
@@ -1,235 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc < %s -mtriple=x86_64-linux-gnu              | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=aarch64-linux-gnu             | FileCheck %s --check-prefix=A64
-; RUN: llc < %s -mtriple=riscv64-linux-gnu -mattr=+m   | FileCheck %s --check-prefix=RV64
-; RUN: llc < %s -mtriple=i686-linux-gnu                | FileCheck %s --check-prefix=X86-32
-
-; Verify that sdiv+srem / udiv+urem on i128 with the same operands lower to a
-; single __divmodti4 / __udivmodti4 call rather than two separate helper calls.
-; DAGCombiner::useDivRem fuses the pair into ISD::SDIVREM/UDIVREM, which is
-; then expanded to the fused libcall via ExpandIntRes_DIVREM in the type
-; legalizer.
-
-define void @sdivrem_i128(ptr %out, i128 %n, i128 %d) nounwind {
-; X64-LABEL: sdivrem_i128:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rbx
-; X64-NEXT:    subq $16, %rsp
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq %rdi, %rbx
-; X64-NEXT:    movq %rsp, %r8
-; X64-NEXT:    movq %rsi, %rdi
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rcx, %rdx
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    callq __divmodti4 at PLT
-; X64-NEXT:    movaps (%rsp), %xmm0
-; X64-NEXT:    movq %rdx, 8(%rbx)
-; X64-NEXT:    movq %rax, (%rbx)
-; X64-NEXT:    movaps %xmm0, 16(%rbx)
-; X64-NEXT:    addq $16, %rsp
-; X64-NEXT:    popq %rbx
-; X64-NEXT:    retq
-;
-; A64-LABEL: sdivrem_i128:
-; A64:       // %bb.0:
-; A64-NEXT:    sub sp, sp, #32
-; A64-NEXT:    mov x8, x4
-; A64-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
-; A64-NEXT:    mov x1, x3
-; A64-NEXT:    mov x19, x0
-; A64-NEXT:    mov x4, sp
-; A64-NEXT:    mov x0, x2
-; A64-NEXT:    mov x2, x8
-; A64-NEXT:    mov x3, x5
-; A64-NEXT:    bl __divmodti4
-; A64-NEXT:    ldp x8, x9, [sp]
-; A64-NEXT:    stp x0, x1, [x19]
-; A64-NEXT:    stp x8, x9, [x19, #16]
-; A64-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; A64-NEXT:    add sp, sp, #32
-; A64-NEXT:    ret
-;
-; RV64-LABEL: sdivrem_i128:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -32
-; RV64-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    mv s0, a0
-; RV64-NEXT:    mv a4, sp
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    mv a1, a2
-; RV64-NEXT:    mv a2, a3
-; RV64-NEXT:    mv a3, a5
-; RV64-NEXT:    call __divmodti4
-; RV64-NEXT:    ld a2, 0(sp)
-; RV64-NEXT:    ld a3, 8(sp)
-; RV64-NEXT:    sd a0, 0(s0)
-; RV64-NEXT:    sd a1, 8(s0)
-; RV64-NEXT:    sd a2, 16(s0)
-; RV64-NEXT:    sd a3, 24(s0)
-; RV64-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 32
-; RV64-NEXT:    ret
-  %q = sdiv i128 %n, %d
-  %r = srem i128 %n, %d
-  %p0 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 0
-  %p1 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 1
-  store i128 %q, ptr %p0, align 16
-  store i128 %r, ptr %p1, align 16
-  ret void
-}
-
-define void @udivrem_i128(ptr %out, i128 %n, i128 %d) nounwind {
-; X64-LABEL: udivrem_i128:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rbx
-; X64-NEXT:    subq $16, %rsp
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq %rdi, %rbx
-; X64-NEXT:    movq %rsp, %r8
-; X64-NEXT:    movq %rsi, %rdi
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rcx, %rdx
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    callq __udivmodti4 at PLT
-; X64-NEXT:    movaps (%rsp), %xmm0
-; X64-NEXT:    movq %rdx, 8(%rbx)
-; X64-NEXT:    movq %rax, (%rbx)
-; X64-NEXT:    movaps %xmm0, 16(%rbx)
-; X64-NEXT:    addq $16, %rsp
-; X64-NEXT:    popq %rbx
-; X64-NEXT:    retq
-;
-; A64-LABEL: udivrem_i128:
-; A64:       // %bb.0:
-; A64-NEXT:    sub sp, sp, #32
-; A64-NEXT:    mov x8, x4
-; A64-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
-; A64-NEXT:    mov x1, x3
-; A64-NEXT:    mov x19, x0
-; A64-NEXT:    mov x4, sp
-; A64-NEXT:    mov x0, x2
-; A64-NEXT:    mov x2, x8
-; A64-NEXT:    mov x3, x5
-; A64-NEXT:    bl __udivmodti4
-; A64-NEXT:    ldp x8, x9, [sp]
-; A64-NEXT:    stp x0, x1, [x19]
-; A64-NEXT:    stp x8, x9, [x19, #16]
-; A64-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; A64-NEXT:    add sp, sp, #32
-; A64-NEXT:    ret
-;
-; RV64-LABEL: udivrem_i128:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -32
-; RV64-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    mv s0, a0
-; RV64-NEXT:    mv a4, sp
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    mv a1, a2
-; RV64-NEXT:    mv a2, a3
-; RV64-NEXT:    mv a3, a5
-; RV64-NEXT:    call __udivmodti4
-; RV64-NEXT:    ld a2, 0(sp)
-; RV64-NEXT:    ld a3, 8(sp)
-; RV64-NEXT:    sd a0, 0(s0)
-; RV64-NEXT:    sd a1, 8(s0)
-; RV64-NEXT:    sd a2, 16(s0)
-; RV64-NEXT:    sd a3, 24(s0)
-; RV64-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 32
-; RV64-NEXT:    ret
-  %q = udiv i128 %n, %d
-  %r = urem i128 %n, %d
-  %p0 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 0
-  %p1 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 1
-  store i128 %q, ptr %p0, align 16
-  store i128 %r, ptr %p1, align 16
-  ret void
-}
-
-; On 32-bit targets __divmodti4/__udivmodti4 are not registered (Int128RTLibcalls
-; is only added to 64-bit targets), so the fallback path in ExpandIntRes_DIVREM
-; fires: the SDIVREM node is replaced by separate SDIV + SREM, which then expand
-; inline. Verify no fused call is emitted and the code compiles without crashing.
-
-; X86-32-NOT: __divmodti4
-; X86-32-NOT: __udivmodti4
-
-define void @sdivrem_i128_fallback(ptr %out, i128 %n, i128 %d) nounwind {
-; X64-LABEL: sdivrem_i128_fallback:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rbx
-; X64-NEXT:    subq $16, %rsp
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq %rdi, %rbx
-; X64-NEXT:    movq %rsp, %r8
-; X64-NEXT:    movq %rsi, %rdi
-; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rcx, %rdx
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    callq __divmodti4 at PLT
-; X64-NEXT:    movaps (%rsp), %xmm0
-; X64-NEXT:    movq %rdx, 8(%rbx)
-; X64-NEXT:    movq %rax, (%rbx)
-; X64-NEXT:    movaps %xmm0, 16(%rbx)
-; X64-NEXT:    addq $16, %rsp
-; X64-NEXT:    popq %rbx
-; X64-NEXT:    retq
-;
-; A64-LABEL: sdivrem_i128_fallback:
-; A64:       // %bb.0:
-; A64-NEXT:    sub sp, sp, #32
-; A64-NEXT:    mov x8, x4
-; A64-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
-; A64-NEXT:    mov x1, x3
-; A64-NEXT:    mov x19, x0
-; A64-NEXT:    mov x4, sp
-; A64-NEXT:    mov x0, x2
-; A64-NEXT:    mov x2, x8
-; A64-NEXT:    mov x3, x5
-; A64-NEXT:    bl __divmodti4
-; A64-NEXT:    ldp x8, x9, [sp]
-; A64-NEXT:    stp x0, x1, [x19]
-; A64-NEXT:    stp x8, x9, [x19, #16]
-; A64-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; A64-NEXT:    add sp, sp, #32
-; A64-NEXT:    ret
-;
-; RV64-LABEL: sdivrem_i128_fallback:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -32
-; RV64-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT:    mv a5, a4
-; RV64-NEXT:    mv s0, a0
-; RV64-NEXT:    mv a4, sp
-; RV64-NEXT:    mv a0, a1
-; RV64-NEXT:    mv a1, a2
-; RV64-NEXT:    mv a2, a3
-; RV64-NEXT:    mv a3, a5
-; RV64-NEXT:    call __divmodti4
-; RV64-NEXT:    ld a2, 0(sp)
-; RV64-NEXT:    ld a3, 8(sp)
-; RV64-NEXT:    sd a0, 0(s0)
-; RV64-NEXT:    sd a1, 8(s0)
-; RV64-NEXT:    sd a2, 16(s0)
-; RV64-NEXT:    sd a3, 24(s0)
-; RV64-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 32
-; RV64-NEXT:    ret
-  %q = sdiv i128 %n, %d
-  %r = srem i128 %n, %d
-  %p0 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 0
-  %p1 = getelementptr inbounds {i128, i128}, ptr %out, i32 0, i32 1
-  store i128 %q, ptr %p0, align 16
-  store i128 %r, ptr %p1, align 16
-  ret void
-}

>From 0f80b6d419c5008c1d3bc1a2a7724a05530311e4 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Sun, 22 Mar 2026 09:20:55 -0400
Subject: [PATCH 06/12] Add extra testing, especially for macOS

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |  2 +-
 .../CodeGen/Generic/i128-divrem-libcall.ll    | 44 +++++++++++++------
 2 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index e1c11e1c35a31..eabca009c84ea 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -472,7 +472,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void ExpandIntRes_BSWAP             (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_PARITY            (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_MUL               (SDNode *N, SDValue &Lo, SDValue &Hi);
-  void ExpandIntRes_DIVREM(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_DIVREM            (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_SDIV              (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_SREM              (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_UDIV              (SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/test/CodeGen/Generic/i128-divrem-libcall.ll b/llvm/test/CodeGen/Generic/i128-divrem-libcall.ll
index 759ae41fa2226..36ffdd5f7e563 100644
--- a/llvm/test/CodeGen/Generic/i128-divrem-libcall.ll
+++ b/llvm/test/CodeGen/Generic/i128-divrem-libcall.ll
@@ -1,6 +1,8 @@
 ; 64-bit targets: fused __divmodti4 / __udivmodti4
-; RUN: %if x86-registered-target			   %{ llc < %s -mtriple=x86_64-linux-gnu              | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if x86-registered-target         %{ llc < %s -mtriple=x86_64-linux-gnu              | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if x86-registered-target         %{ llc < %s -mtriple=x86_64-linux-gnux32           | FileCheck %s --check-prefixes=CHECK,FUSED %}
 ; RUN: %if x86-registered-target         %{ llc < %s -mtriple=x86_64-pc-windows-msvc        | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if x86-registered-target         %{ llc < %s -mtriple=x86_64-w64-mingw32            | FileCheck %s --check-prefixes=CHECK,FUSED %}
 ; RUN: %if aarch64-registered-target     %{ llc < %s -mtriple=aarch64-linux-gnu             | FileCheck %s --check-prefixes=CHECK,FUSED %}
 ; RUN: %if riscv-registered-target       %{ llc < %s -mtriple=riscv64-linux-gnu             | FileCheck %s --check-prefixes=CHECK,FUSED %}
 ; RUN: %if riscv-registered-target       %{ llc < %s -mtriple=riscv64-linux-gnu -mattr=+m   | FileCheck %s --check-prefixes=CHECK,FUSED %}
@@ -10,15 +12,21 @@
 ; RUN: %if webassembly-registered-target %{ llc < %s -mtriple=wasm32-unknown-unknown        | FileCheck %s --check-prefixes=CHECK,FUSED %}
 ; RUN: %if webassembly-registered-target %{ llc < %s -mtriple=wasm64-unknown-unknown        | FileCheck %s --check-prefixes=CHECK,FUSED %}
 
-; 32-bit / ILP32 targets: no fused libcall 
-; RUN: %if x86-registered-target      %{ llc < %s -mtriple=i386-linux-gnu                | FileCheck %s --check-prefixes=CHECK,SPLIT %}
-; RUN: %if x86-registered-target      %{ llc < %s -mtriple=i686-linux-gnu                | FileCheck %s --check-prefixes=CHECK,SPLIT %}
-; RUN: %if arm-registered-target      %{ llc < %s -mtriple=armv6-linux-gnueabihf         | FileCheck %s --check-prefixes=CHECK,SPLIT %}
-; RUN: %if arm-registered-target      %{ llc < %s -mtriple=armv7-linux-gnueabi           | FileCheck %s --check-prefixes=CHECK,SPLIT %}
-; RUN: %if aarch64-registered-target  %{ llc < %s -mtriple=aarch64_32-apple-watchos      | FileCheck %s --check-prefixes=CHECK,SPLIT %}
-; RUN: %if riscv-registered-target    %{ llc < %s -mtriple=riscv32-linux-gnu             | FileCheck %s --check-prefixes=CHECK,SPLIT %}
-; RUN: %if riscv-registered-target    %{ llc < %s -mtriple=riscv32-linux-gnu -mattr=+m   | FileCheck %s --check-prefixes=CHECK,SPLIT %}
-; RUN: %if arm-registered-target      %{ llc < %s -mtriple=armv7-none-eabi               | FileCheck %s --check-prefixes=CHECK,SPLIT %}
+; 32-bit / ILP32 targets that expand inline (no runtime library or no libcall)
+; RUN: %if x86-registered-target      %{ llc < %s -mtriple=i386-linux-gnu                | FileCheck %s --check-prefixes=CHECK,INLINE %}
+; RUN: %if x86-registered-target      %{ llc < %s -mtriple=i686-linux-gnu                | FileCheck %s --check-prefixes=CHECK,INLINE %}
+; RUN: %if riscv-registered-target    %{ llc < %s -mtriple=riscv32-linux-gnu             | FileCheck %s --check-prefixes=CHECK,INLINE %}
+; RUN: %if riscv-registered-target    %{ llc < %s -mtriple=riscv32-linux-gnu -mattr=+m   | FileCheck %s --check-prefixes=CHECK,INLINE %}
+; RUN: %if arm-registered-target      %{ llc < %s -mtriple=armv7-none-eabi               | FileCheck %s --check-prefixes=CHECK,INLINE %}
+
+; 32-bit / ILP32 targets that fall back to separate __divti3 + __modti3 calls
+; RUN: %if arm-registered-target      %{ llc < %s -mtriple=armv6-linux-gnueabihf         | FileCheck %s --check-prefixes=CHECK,DIVMOD %}
+; RUN: %if arm-registered-target      %{ llc < %s -mtriple=armv7-linux-gnueabi           | FileCheck %s --check-prefixes=CHECK,DIVMOD %}
+; RUN: %if aarch64-registered-target  %{ llc < %s -mtriple=aarch64_32-apple-watchos      | FileCheck %s --check-prefixes=CHECK,DIVMOD %}
+
+; 64-bit Mac OS: fused ___divmodti4 (extra underscore)
+; RUN: %if x86-registered-target         %{ llc < %s -mtriple=x86_64-apple-macosx           | FileCheck %s --check-prefixes=CHECK,FUSED-DARWIN %}
+; RUN: %if aarch64-registered-target     %{ llc < %s -mtriple=arm64-apple-macosx            | FileCheck %s --check-prefixes=CHECK,FUSED-DARWIN %}
 
 ; Verify that sdiv+srem / udiv+urem on i128 fuse into a single __divmodti4 /
 ; __udivmodti4 call on targets where the libcall is available (64-bit targets
@@ -28,12 +36,17 @@
 ;   64-bit targets and wasm: fused __divmodti4 / __udivmodti4
 ;   32-bit targets that lack the fused call may lower to:
 ;     - separate __divti3 + __modti3 / __udivti3 + __umodti3 calls, or
-;     - fully inline expansion (e.g. i686)
+;     - fully inline expansion (e.g. i686, bare metal)
 
 define void @sdivrem_i128(ptr %q_out, ptr %r_out, i128 %n, i128 %d) {
 ; CHECK-LABEL: sdivrem_i128:
 ; FUSED:           __divmodti4
-; SPLIT-NOT:       __divmodti4
+; FUSED-DARWIN:    ___divmodti4
+; DIVMOD:          __divti3
+; DIVMOD:          __modti3
+; INLINE-NOT:      __divmodti4
+; INLINE-NOT:      __divti3
+; INLINE-NOT:      __modti3
   %q = sdiv i128 %n, %d
   %r = srem i128 %n, %d
   store i128 %q, ptr %q_out
@@ -44,7 +57,12 @@ define void @sdivrem_i128(ptr %q_out, ptr %r_out, i128 %n, i128 %d) {
 define void @udivrem_i128(ptr %q_out, ptr %r_out, i128 %n, i128 %d) {
 ; CHECK-LABEL: udivrem_i128:
 ; FUSED:           __udivmodti4
-; SPLIT-NOT:       __udivmodti4
+; FUSED-DARWIN:    ___udivmodti4
+; DIVMOD:          __udivti3
+; DIVMOD:          __umodti3
+; INLINE-NOT:      __udivmodti4
+; INLINE-NOT:      __udivti3
+; INLINE-NOT:      __umodti3
   %q = udiv i128 %n, %d
   %r = urem i128 %n, %d
   store i128 %q, ptr %q_out

>From 938b07f5f8e97d6ec2b3f60b0856b9185f31878b Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Sun, 22 Mar 2026 10:08:40 -0400
Subject: [PATCH 07/12] Fix failing armv6 and armv7 tests

---
 llvm/test/CodeGen/Generic/i128-divrem-libcall.ll | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/test/CodeGen/Generic/i128-divrem-libcall.ll b/llvm/test/CodeGen/Generic/i128-divrem-libcall.ll
index 36ffdd5f7e563..057270d1aaf07 100644
--- a/llvm/test/CodeGen/Generic/i128-divrem-libcall.ll
+++ b/llvm/test/CodeGen/Generic/i128-divrem-libcall.ll
@@ -17,11 +17,11 @@
 ; RUN: %if x86-registered-target      %{ llc < %s -mtriple=i686-linux-gnu                | FileCheck %s --check-prefixes=CHECK,INLINE %}
 ; RUN: %if riscv-registered-target    %{ llc < %s -mtriple=riscv32-linux-gnu             | FileCheck %s --check-prefixes=CHECK,INLINE %}
 ; RUN: %if riscv-registered-target    %{ llc < %s -mtriple=riscv32-linux-gnu -mattr=+m   | FileCheck %s --check-prefixes=CHECK,INLINE %}
+; RUN: %if arm-registered-target      %{ llc < %s -mtriple=armv6-linux-gnueabihf         | FileCheck %s --check-prefixes=CHECK,INLINE %}
+; RUN: %if arm-registered-target      %{ llc < %s -mtriple=armv7-linux-gnueabi           | FileCheck %s --check-prefixes=CHECK,INLINE %}
 ; RUN: %if arm-registered-target      %{ llc < %s -mtriple=armv7-none-eabi               | FileCheck %s --check-prefixes=CHECK,INLINE %}
 
-; 32-bit / ILP32 targets that fall back to separate __divti3 + __modti3 calls
-; RUN: %if arm-registered-target      %{ llc < %s -mtriple=armv6-linux-gnueabihf         | FileCheck %s --check-prefixes=CHECK,DIVMOD %}
-; RUN: %if arm-registered-target      %{ llc < %s -mtriple=armv7-linux-gnueabi           | FileCheck %s --check-prefixes=CHECK,DIVMOD %}
+; ILP32 targets that fall back to separate __divti3 + __modti3 calls
 ; RUN: %if aarch64-registered-target  %{ llc < %s -mtriple=aarch64_32-apple-watchos      | FileCheck %s --check-prefixes=CHECK,DIVMOD %}
 
 ; 64-bit Mac OS: fused ___divmodti4 (extra underscore)

>From c92ee03f99fb1133781f6af2719371e51c8584f6 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Sun, 22 Mar 2026 19:25:00 -0400
Subject: [PATCH 08/12] Add support for the Win64 i128 ABI and add more testing

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  6 +-
 .../SelectionDAG/LegalizeIntegerTypes.cpp     | 11 ++-
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 77 +++++++++++++++++++
 llvm/lib/Target/X86/X86ISelLowering.h         |  2 +
 .../CodeGen/Generic/i128-divrem-libcall.ll    | 65 ++++++++++++++--
 5 files changed, 151 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 49215a16cb66b..ef6c168fcd306 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5061,8 +5061,12 @@ SDValue DAGCombiner::useDivRem(SDNode *Node) {
   if (VT.isVector() || !VT.isInteger())
     return SDValue();
 
+  // For non-legal types, only allow the DIVREM node to form when a fused
+  // libcall is available.  ExpandIntRes_DIVREM currently only handles i128;
+  // extending to other widths requires generalizing it to select the libcall
+  // by VT.
   if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT) &&
-      !isDivRemLibcallAvailable(Node, isSigned, DAG))
+      !(VT == MVT::i128 && isDivRemLibcallAvailable(Node, isSigned, DAG)))
     return SDValue();
 
   // If DIVREM is going to get expanded into a libcall,
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 6318a077da6ed..6838dfb746ebe 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4910,10 +4910,17 @@ void DAGTypeLegalizer::ExpandIntRes_DIVREM(SDNode *N, SDValue &Lo,
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
   bool IsSigned = (N->getOpcode() == ISD::SDIVREM);
+
+  // Only i128 is handled here; other widths require generalizing this
+  // function to select the libcall by VT.
   RTLIB::Libcall LC = IsSigned ? RTLIB::SDIVREM_I128 : RTLIB::UDIVREM_I128;
+  assert(VT == MVT::i128 &&
+         "ExpandIntRes_DIVREM only handles i128; generalize by VT first");
 
-  // If no fused divrem libcall is available, fall back to separate div and rem.
-  if (DAG.getLibcalls().getLibcallImpl(LC) == RTLIB::Unsupported) {
+  // If no fused divrem libcall is available (or VT is not i128 in release
+  // builds), fall back to separate div and rem.
+  if (VT != MVT::i128 ||
+      DAG.getLibcalls().getLibcallImpl(LC) == RTLIB::Unsupported) {
     unsigned DivOp = IsSigned ? ISD::SDIV : ISD::UDIV;
     unsigned RemOp = IsSigned ? ISD::SREM : ISD::UREM;
     SDValue Ops[2] = {N->getOperand(0), N->getOperand(1)};
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 32796c3e56781..e05f37b423c5a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2670,6 +2670,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::UDIV, MVT::i128, Custom);
     setOperationAction(ISD::SREM, MVT::i128, Custom);
     setOperationAction(ISD::UREM, MVT::i128, Custom);
+    setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
+    setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
     setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
     setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
     setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
@@ -30631,6 +30633,72 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
   return DAG.getBitcast(VT, CallInfo.first);
 }
 
+void X86TargetLowering::LowerWin64_i128DIVREM(SDNode *N, SelectionDAG &DAG,
+                                              SDValue &Quot,
+                                              SDValue &Rem) const {
+  assert(Subtarget.isTargetWin64() && "Unexpected target");
+  EVT VT = N->getValueType(0);
+  assert(VT == MVT::i128 && "Unexpected type");
+
+  bool isSigned = N->getOpcode() == ISD::SDIVREM;
+  RTLIB::Libcall LC = isSigned ? RTLIB::SDIVREM_I128 : RTLIB::UDIVREM_I128;
+  RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
+
+  SDLoc dl(N);
+
+  // If no fused divrem libcall is available, fall back to separate div and rem.
+  // This goes through LowerWin64_i128OP with the correct pointer-arg ABI.
+  if (LCImpl == RTLIB::Unsupported) {
+    unsigned DivOp = isSigned ? ISD::SDIV : ISD::UDIV;
+    unsigned RemOp = isSigned ? ISD::SREM : ISD::UREM;
+    Quot = DAG.getNode(DivOp, dl, VT, N->getOperand(0), N->getOperand(1));
+    Rem = DAG.getNode(RemOp, dl, VT, N->getOperand(0), N->getOperand(1));
+    return;
+  }
+  SDValue InChain = DAG.getEntryNode();
+
+  TargetLowering::ArgListTy Args;
+
+  // Spill both i128 inputs to stack temporaries and pass as pointers as per
+  // Win64 CC (Win64 has no calling convention for passing i128 by value).
+  for (unsigned i = 0; i < 2; ++i) {
+    EVT ArgVT = N->getOperand(i).getValueType();
+    assert(ArgVT == MVT::i128 && "Unexpected argument type");
+    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
+    int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+    MachinePointerInfo MPI =
+        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+    InChain =
+        DAG.getStore(InChain, dl, N->getOperand(i), StackPtr, MPI, Align(16));
+    Args.emplace_back(StackPtr, PointerType::get(*DAG.getContext(), 0));
+  }
+
+  // Allocate a stack slot for the remainder output pointer.
+  MachineFunction &MF = DAG.getMachineFunction();
+  int RemFI = MF.getFrameInfo().CreateStackObject(16, Align(16), false);
+  SDValue RemPtr = DAG.getFrameIndex(RemFI, getPointerTy(DAG.getDataLayout()));
+  Args.emplace_back(RemPtr, PointerType::get(*DAG.getContext(), 0));
+
+  SDValue Callee =
+      DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl)
+      .setChain(InChain)
+      .setLibCallee(
+          DAG.getLibcalls().getLibcallImplCallingConv(LCImpl),
+          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
+          std::move(Args))
+      .setInRegister()
+      .setSExtResult(isSigned)
+      .setZExtResult(!isSigned);
+
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+  Quot = DAG.getBitcast(VT, CallInfo.first);
+  Rem = DAG.getLoad(VT, dl, CallInfo.second, RemPtr,
+                    MachinePointerInfo::getFixedStack(MF, RemFI), Align(16));
+}
+
 SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
                                                    SelectionDAG &DAG,
                                                    SDValue &Chain) const {
@@ -34957,6 +35025,16 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     Results.push_back(V);
     return;
   }
+  case ISD::SDIVREM:
+  case ISD::UDIVREM: {
+    assert(N->getValueType(0) == MVT::i128 && Subtarget.isTargetWin64() &&
+           "Custom i128 DIVREM lowering only expected on Win64");
+    SDValue Q, R;
+    LowerWin64_i128DIVREM(N, DAG, Q, R);
+    Results.push_back(Q);
+    Results.push_back(R);
+    return;
+  }
   case ISD::TRUNCATE: {
     MVT VT = N->getSimpleValueType(0);
     if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 5c7c54cacd239..80781e9cac406 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -832,6 +832,8 @@ namespace llvm {
     SDValue LowerSET_FPENV_MEM(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerRESET_FPENV(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
+    void LowerWin64_i128DIVREM(SDNode *N, SelectionDAG &DAG, SDValue &Quot,
+                               SDValue &Rem) const;
     SDValue LowerWin64_FP_TO_INT128(SDValue Op, SelectionDAG &DAG,
                                     SDValue &Chain) const;
     SDValue LowerWin64_INT128_TO_FP(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/Generic/i128-divrem-libcall.ll b/llvm/test/CodeGen/Generic/i128-divrem-libcall.ll
index 057270d1aaf07..594b4c73d4b06 100644
--- a/llvm/test/CodeGen/Generic/i128-divrem-libcall.ll
+++ b/llvm/test/CodeGen/Generic/i128-divrem-libcall.ll
@@ -1,9 +1,9 @@
 ; 64-bit targets: fused __divmodti4 / __udivmodti4
-; RUN: %if x86-registered-target         %{ llc < %s -mtriple=x86_64-linux-gnu              | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if x86-registered-target         %{ llc < %s -mtriple=x86_64-linux-gnu              | FileCheck %s --check-prefixes=CHECK,FUSED,SYSV-X64 %}
 ; RUN: %if x86-registered-target         %{ llc < %s -mtriple=x86_64-linux-gnux32           | FileCheck %s --check-prefixes=CHECK,FUSED %}
-; RUN: %if x86-registered-target         %{ llc < %s -mtriple=x86_64-pc-windows-msvc        | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if x86-registered-target         %{ llc < %s -mtriple=x86_64-pc-windows-msvc        | FileCheck %s --check-prefixes=CHECK,WIN64 %}
 ; RUN: %if x86-registered-target         %{ llc < %s -mtriple=x86_64-w64-mingw32            | FileCheck %s --check-prefixes=CHECK,FUSED %}
-; RUN: %if aarch64-registered-target     %{ llc < %s -mtriple=aarch64-linux-gnu             | FileCheck %s --check-prefixes=CHECK,FUSED %}
+; RUN: %if aarch64-registered-target     %{ llc < %s -mtriple=aarch64-linux-gnu             | FileCheck %s --check-prefixes=CHECK,FUSED,SYSV-A64 %}
 ; RUN: %if riscv-registered-target       %{ llc < %s -mtriple=riscv64-linux-gnu             | FileCheck %s --check-prefixes=CHECK,FUSED %}
 ; RUN: %if riscv-registered-target       %{ llc < %s -mtriple=riscv64-linux-gnu -mattr=+m   | FileCheck %s --check-prefixes=CHECK,FUSED %}
 ; RUN: %if powerpc-registered-target     %{ llc < %s -mtriple=powerpc64-linux-gnu           | FileCheck %s --check-prefixes=CHECK,FUSED %}
@@ -21,27 +21,58 @@
 ; RUN: %if arm-registered-target      %{ llc < %s -mtriple=armv7-linux-gnueabi           | FileCheck %s --check-prefixes=CHECK,INLINE %}
 ; RUN: %if arm-registered-target      %{ llc < %s -mtriple=armv7-none-eabi               | FileCheck %s --check-prefixes=CHECK,INLINE %}
 
+; Win32: i128 fully inline-expanded, no libcalls registered
+; RUN: %if x86-registered-target      %{ llc < %s -mtriple=i686-pc-windows-msvc          | FileCheck %s --check-prefixes=CHECK,WIN32 %}
+
 ; ILP32 targets that fall back to separate __divti3 + __modti3 calls
 ; RUN: %if aarch64-registered-target  %{ llc < %s -mtriple=aarch64_32-apple-watchos      | FileCheck %s --check-prefixes=CHECK,DIVMOD %}
 
-; 64-bit Mac OS: fused ___divmodti4 (extra underscore)
+; 64-bit Mac OS: fused ___divmodti4 (extra underscore, same ABI as Linux AArch64)
 ; RUN: %if x86-registered-target         %{ llc < %s -mtriple=x86_64-apple-macosx           | FileCheck %s --check-prefixes=CHECK,FUSED-DARWIN %}
-; RUN: %if aarch64-registered-target     %{ llc < %s -mtriple=arm64-apple-macosx            | FileCheck %s --check-prefixes=CHECK,FUSED-DARWIN %}
+; RUN: %if aarch64-registered-target     %{ llc < %s -mtriple=arm64-apple-macosx            | FileCheck %s --check-prefixes=CHECK,DARWIN-A64 %}
 
 ; Verify that sdiv+srem / udiv+urem on i128 fuse into a single __divmodti4 /
 ; __udivmodti4 call on targets where the libcall is available (64-bit targets
 ; and wasm), and do not on targets where it is not (32-bit / ILP32).
 ;
-; The lowering varies by target:
-;   64-bit targets and wasm: fused __divmodti4 / __udivmodti4
+; Detailed ABI checks for the most common target configurations:
+;   WIN64     (x86_64 Windows): all args spilled to stack and passed as pointers
+;             in %rcx/%rdx/%r8, quotient returned in %xmm0.
+;   DARWIN-A64 (AArch64 macOS): identical to SYSV-A64 but symbol has an extra
+;             leading underscore (___divmodti4).
+;   SYSV-X64  (x86_64 Linux/BSD): i128 args in register pairs, rem pointer via
+;             %rsp in %r8, quotient returned in %rax:%rdx.
+;   SYSV-A64  (AArch64 Linux): i128 args in x0:x1/x2:x3, rem pointer via sp in
+;             x4, quotient returned in x0:x1.
+;   Win32 (i686-windows-msvc): no i128 libcalls registered, fully inline.
 ;   32-bit targets that lack the fused call may lower to:
 ;     - separate __divti3 + __modti3 / __udivti3 + __umodti3 calls, or
 ;     - fully inline expansion (e.g. i686, bare metal)
 
 define void @sdivrem_i128(ptr %q_out, ptr %r_out, i128 %n, i128 %d) {
 ; CHECK-LABEL: sdivrem_i128:
+; SYSV-X64:        movq    %rsp, %r8
+; SYSV-A64:        mov     x4, sp
 ; FUSED:           __divmodti4
 ; FUSED-DARWIN:    ___divmodti4
+; SYSV-X64:        movq    (%rsp),
+; SYSV-X64:        movq    %rax,
+; SYSV-X64:        movq    %rdx,
+; SYSV-A64:        ldp     {{.*}}, [sp]
+; SYSV-A64:        stp     x0, x1,
+; DARWIN-A64:      mov     x4, sp
+; DARWIN-A64:      bl      ___divmodti4
+; DARWIN-A64:      ldp     {{.*}}, [sp]
+; DARWIN-A64:      stp     x0, x1,
+; WIN64:           leaq    {{[0-9]+}}(%rsp), %rcx
+; WIN64:           leaq    {{[0-9]+}}(%rsp), %rdx
+; WIN64:           leaq    {{[0-9]+}}(%rsp), %r8
+; WIN64:           callq   __divmodti4
+; WIN64:           movaps  {{[0-9]+}}(%rsp), %xmm1
+; WIN64:           movaps  %xmm0,
+; WIN32-NOT:       __divmodti4
+; WIN32-NOT:       __divti3
+; WIN32-NOT:       __modti3
 ; DIVMOD:          __divti3
 ; DIVMOD:          __modti3
 ; INLINE-NOT:      __divmodti4
@@ -56,8 +87,28 @@ define void @sdivrem_i128(ptr %q_out, ptr %r_out, i128 %n, i128 %d) {
 
 define void @udivrem_i128(ptr %q_out, ptr %r_out, i128 %n, i128 %d) {
 ; CHECK-LABEL: udivrem_i128:
+; SYSV-X64:        movq    %rsp, %r8
+; SYSV-A64:        mov     x4, sp
 ; FUSED:           __udivmodti4
 ; FUSED-DARWIN:    ___udivmodti4
+; SYSV-X64:        movq    (%rsp),
+; SYSV-X64:        movq    %rax,
+; SYSV-X64:        movq    %rdx,
+; SYSV-A64:        ldp     {{.*}}, [sp]
+; SYSV-A64:        stp     x0, x1,
+; DARWIN-A64:      mov     x4, sp
+; DARWIN-A64:      bl      ___udivmodti4
+; DARWIN-A64:      ldp     {{.*}}, [sp]
+; DARWIN-A64:      stp     x0, x1,
+; WIN64:           leaq    {{[0-9]+}}(%rsp), %rcx
+; WIN64:           leaq    {{[0-9]+}}(%rsp), %rdx
+; WIN64:           leaq    {{[0-9]+}}(%rsp), %r8
+; WIN64:           callq   __udivmodti4
+; WIN64:           movaps  {{[0-9]+}}(%rsp), %xmm1
+; WIN64:           movaps  %xmm0,
+; WIN32-NOT:       __udivmodti4
+; WIN32-NOT:       __udivti3
+; WIN32-NOT:       __umodti3
 ; DIVMOD:          __udivti3
 ; DIVMOD:          __umodti3
 ; INLINE-NOT:      __udivmodti4

>From 7b43102ccecbf65231d486c5df086e145f8a2cb1 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Thu, 26 Mar 2026 21:34:16 -0400
Subject: [PATCH 09/12] add clang-format exemption lines around declarations
 that don't follow clang-format rules

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index eabca009c84ea..d8c0ce41871a6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -441,6 +441,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
 
   // Integer Result Expansion.
   void ExpandIntegerResult(SDNode *N, unsigned ResNo);
+  // clang-format off
   void ExpandIntRes_ANY_EXTEND        (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_AssertSext        (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_AssertZext        (SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -502,6 +503,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void ExpandIntRes_CLMUL(SDNode *N, SDValue &Lo, SDValue &Hi);
 
   void ExpandIntRes_VSCALE            (SDNode *N, SDValue &Lo, SDValue &Hi);
+  // clang-format on
   void ExpandIntRes_READ_REGISTER(SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_CTTZ_ELTS(SDNode *N, SDValue &Lo, SDValue &Hi);
 

>From 89d03f0b5bbf89c7860ed1aaa377331f4782dd67 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Sun, 29 Mar 2026 18:49:12 -0400
Subject: [PATCH 10/12] code review feedback

---
 .../SelectionDAG/LegalizeIntegerTypes.cpp     | 32 ++++++++++++-------
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |  2 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  9 +++---
 3 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 6838dfb746ebe..442243cf5fcd5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3116,7 +3116,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::SDIV:        ExpandIntRes_SDIV(N, Lo, Hi); break;
   case ISD::SDIVREM:
   case ISD::UDIVREM:
-    ExpandIntRes_DIVREM(N, Lo, Hi);
+    ExpandIntRes_DIVREM(N, ResNo, Lo, Hi);
     break;
   case ISD::SIGN_EXTEND: ExpandIntRes_SIGN_EXTEND(N, Lo, Hi); break;
   case ISD::SIGN_EXTEND_INREG: ExpandIntRes_SIGN_EXTEND_INREG(N, Lo, Hi); break;
@@ -4905,8 +4905,8 @@ void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node,
   ReplaceValueWith(SDValue(Node, 1), Ovf);
 }
 
-void DAGTypeLegalizer::ExpandIntRes_DIVREM(SDNode *N, SDValue &Lo,
-                                           SDValue &Hi) {
+void DAGTypeLegalizer::ExpandIntRes_DIVREM(SDNode *N, unsigned ResNo,
+                                           SDValue &Lo, SDValue &Hi) {
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
   bool IsSigned = (N->getOpcode() == ISD::SDIVREM);
@@ -4926,8 +4926,13 @@ void DAGTypeLegalizer::ExpandIntRes_DIVREM(SDNode *N, SDValue &Lo,
     SDValue Ops[2] = {N->getOperand(0), N->getOperand(1)};
     SDValue Q = DAG.getNode(DivOp, dl, VT, Ops);
     SDValue R = DAG.getNode(RemOp, dl, VT, Ops);
-    SplitInteger(Q, Lo, Hi);
-    ReplaceValueWith(SDValue(N, 1), R);
+    if (ResNo == 0) {
+      SplitInteger(Q, Lo, Hi);
+      ReplaceValueWith(SDValue(N, 1), R);
+    } else {
+      SplitInteger(R, Lo, Hi);
+      ReplaceValueWith(SDValue(N, 0), Q);
+    }
     return;
   }
 
@@ -4964,16 +4969,21 @@ void DAGTypeLegalizer::ExpandIntRes_DIVREM(SDNode *N, SDValue &Lo,
 
   std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
 
-  // Quotient is the return value; split it into Lo/Hi for the expanded type.
-  SplitInteger(CallInfo.first, Lo, Hi);
-
-  // Remainder is written to the stack temporary; load it back and register
-  // it as the replacement for result 1 of the original SDIVREM/UDIVREM node.
+  // Load the remainder from the stack temporary.
   int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex();
   SDValue Rem = DAG.getLoad(
       VT, dl, CallInfo.second, FIPtr,
       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
-  ReplaceValueWith(SDValue(N, 1), Rem);
+
+  // Split the requested result into Lo/Hi and register the other result as its
+  // replacement.
+  if (ResNo == 0) {
+    SplitInteger(CallInfo.first, Lo, Hi);
+    ReplaceValueWith(SDValue(N, 1), Rem);
+  } else {
+    SplitInteger(Rem, Lo, Hi);
+    ReplaceValueWith(SDValue(N, 0), CallInfo.first);
+  }
 }
 
 void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N,
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index d8c0ce41871a6..34ba1db4ebbc8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -473,7 +473,8 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void ExpandIntRes_BSWAP             (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_PARITY            (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_MUL               (SDNode *N, SDValue &Lo, SDValue &Hi);
-  void ExpandIntRes_DIVREM            (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_DIVREM            (SDNode *N, unsigned ResNo,
+                                       SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_SDIV              (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_SREM              (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_UDIV              (SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e05f37b423c5a..ad6b62c94ba17 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30622,9 +30622,10 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl)
       .setChain(InChain)
-      .setLibCallee(DAG.getLibcalls().getLibcallImplCallingConv(LCImpl),
-                    EVT(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
-                    std::move(Args))
+      .setLibCallee(
+          DAG.getLibcalls().getLibcallImplCallingConv(LCImpl),
+          EVT(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
+          std::move(Args))
       .setInRegister()
       .setSExtResult(isSigned)
       .setZExtResult(!isSigned);
@@ -30687,7 +30688,7 @@ void X86TargetLowering::LowerWin64_i128DIVREM(SDNode *N, SelectionDAG &DAG,
       .setChain(InChain)
       .setLibCallee(
           DAG.getLibcalls().getLibcallImplCallingConv(LCImpl),
-          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
+          EVT(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
           std::move(Args))
       .setInRegister()
       .setSExtResult(isSigned)

>From 496d2a61fcf1c1fda6164947cf2ed9961a972d40 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Mon, 30 Mar 2026 18:06:30 -0400
Subject: [PATCH 11/12] code review feedback: demorgan if condition

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ef6c168fcd306..bd9a8026ef807 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5065,8 +5065,8 @@ SDValue DAGCombiner::useDivRem(SDNode *Node) {
   // libcall is available.  ExpandIntRes_DIVREM currently only handles i128;
   // extending to other widths requires generalizing it to select the libcall
   // by VT.
-  if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT) &&
-      !(VT == MVT::i128 && isDivRemLibcallAvailable(Node, isSigned, DAG)))
+  if (!(TLI.isTypeLegal(VT) || TLI.isOperationCustom(DivRemOpc, VT) ||
+        (VT == MVT::i128 && isDivRemLibcallAvailable(Node, isSigned, DAG))))
     return SDValue();
 
   // If DIVREM is going to get expanded into a libcall,

>From 7527f1d8631a75e70f8d567ba3bd21b78934dcfe Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Tue, 31 Mar 2026 17:47:45 -0400
Subject: [PATCH 12/12] more feedback: demorgan another condition

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index bd9a8026ef807..55e53a582f7e9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5065,8 +5065,8 @@ SDValue DAGCombiner::useDivRem(SDNode *Node) {
   // libcall is available.  ExpandIntRes_DIVREM currently only handles i128;
   // extending to other widths requires generalizing it to select the libcall
   // by VT.
-  if (!(TLI.isTypeLegal(VT) || TLI.isOperationCustom(DivRemOpc, VT) ||
-        (VT == MVT::i128 && isDivRemLibcallAvailable(Node, isSigned, DAG))))
+  if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT) &&
+      (VT != MVT::i128 || !isDivRemLibcallAvailable(Node, isSigned, DAG)))
     return SDValue();
 
   // If DIVREM is going to get expanded into a libcall,



More information about the llvm-commits mailing list