[llvm] a3eb13b - [X86] remove unnecessary movs when %rdx is an input to mulx (#184462)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 4 07:19:22 PST 2026
Author: Takashi Idobe
Date: 2026-03-04T15:19:16Z
New Revision: a3eb13b5bfd1f216afcc128f9452bbb72daba0d6
URL: https://github.com/llvm/llvm-project/commit/a3eb13b5bfd1f216afcc128f9452bbb72daba0d6
DIFF: https://github.com/llvm/llvm-project/commit/a3eb13b5bfd1f216afcc128f9452bbb72daba0d6.diff
LOG: [X86] remove unnecessary movs when %rdx is an input to mulx (#184462)
Closes: https://github.com/llvm/llvm-project/issues/174912
When generating a `mulx` instruction for a widening multiplication, even
if one input is already placed in %rdx, LLVM won't use it as the implicit
first operand; instead it generates two unnecessary movs before the mulx
to swap the registers. GCC already performs this optimization
(as shown in the issue), so this change brings the two compilers closer
to each other on that front.
Co-authored-by: Aiden Grossman <aidengrossman at google.com>
Added:
llvm/test/CodeGen/X86/mulx64-no-implicit-copy.ll
Modified:
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 101ea3e231a5c..cc846e0d1492e 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -5882,6 +5882,24 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
std::swap(N0, N1);
}
+ // For MULX, the implicit source must be in RDX (LoReg). If N1 is
+ // already a CopyFromReg of LoReg and N0 is not, flip so that N0
+ // (which feeds the CopyToReg below) is the operand already in LoReg,
+ // avoiding an unnecessary register-to-register copy before the multiply.
+ if (UseMULX && !foldedLoad) {
+ MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
+ auto GetPhysReg = [&](SDValue V) -> Register {
+ if (V.getOpcode() != ISD::CopyFromReg)
+ return Register();
+ Register Reg = cast<RegisterSDNode>(V.getOperand(1))->getReg();
+ if (Reg.isVirtual())
+ return MRI.getLiveInPhysReg(Reg);
+ return Reg;
+ };
+ if (GetPhysReg(N1) == LoReg && GetPhysReg(N0) != LoReg)
+ std::swap(N0, N1);
+ }
+
SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
N0, SDValue()).getValue(1);
SDValue ResHi, ResLo;
diff --git a/llvm/test/CodeGen/X86/mulx64-no-implicit-copy.ll b/llvm/test/CodeGen/X86/mulx64-no-implicit-copy.ll
new file mode 100644
index 0000000000000..60f8309440f29
--- /dev/null
+++ b/llvm/test/CodeGen/X86/mulx64-no-implicit-copy.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+bmi2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=core-avx2 | FileCheck %s
+;
+; When a MULX operand already lives in RDX (the implicit source register) at
+; function entry, no register-copy preamble should be emitted before the MULX.
+; In SysV x86-64 ABI the 3rd integer argument arrives in %rdx.
+
+; Pure 64x64->128 multiply: arg 'a' is the 3rd argument, arriving in %rdx.
+; The mul i128 node is emitted with operands in (b, a) order (as clang does);
+define void @mul64_u128_a_in_rdx(ptr %hi, ptr %lo, i64 %a, i64 %b) {
+; CHECK-LABEL: mul64_u128_a_in_rdx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mulxq %rcx, %rcx, %rax
+; CHECK-NEXT: movq %rcx, (%rsi)
+; CHECK-NEXT: movq %rax, (%rdi)
+; CHECK-NEXT: retq
+ %za = zext i64 %a to i128
+ %zb = zext i64 %b to i128
+ %r = mul nuw i128 %zb, %za
+ %lo_val = trunc i128 %r to i64
+ store i64 %lo_val, ptr %lo
+ %hi_shr = lshr i128 %r, 64
+ %hi_val = trunc nuw i128 %hi_shr to i64
+ store i64 %hi_val, ptr %hi
+ ret void
+}
+
+; Same multiply with operands in natural (a, b) order.
+define void @mul64_u128_a_in_rdx_natural_order(ptr %hi, ptr %lo, i64 %a, i64 %b) {
+; CHECK-LABEL: mul64_u128_a_in_rdx_natural_order:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mulxq %rcx, %rcx, %rax
+; CHECK-NEXT: movq %rcx, (%rsi)
+; CHECK-NEXT: movq %rax, (%rdi)
+; CHECK-NEXT: retq
+ %za = zext i64 %a to i128
+ %zb = zext i64 %b to i128
+ %r = mul nuw i128 %za, %zb
+ %lo_val = trunc i128 %r to i64
+ store i64 %lo_val, ptr %lo
+ %hi_shr = lshr i128 %r, 64
+ %hi_val = trunc nuw i128 %hi_shr to i64
+ store i64 %hi_val, ptr %hi
+ ret void
+}
+
+; Multiply-add: hi:lo = a*b + c. 'a' is the 3rd arg in %rdx, 'c' is in %r8.
+define void @muladd64_a_in_rdx(ptr %hi, ptr %lo, i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: muladd64_a_in_rdx:
+; CHECK: # %bb.0:
+; CHECK-NEXT: mulxq %rcx, %rax, %rcx
+; CHECK-NEXT: addq %r8, %rax
+; CHECK-NEXT: adcq $0, %rcx
+; CHECK-NEXT: movq %rax, (%rsi)
+; CHECK-NEXT: movq %rcx, (%rdi)
+; CHECK-NEXT: retq
+ %za = zext i64 %a to i128
+ %zb = zext i64 %b to i128
+ %r = mul nuw i128 %zb, %za
+ %zc = zext i64 %c to i128
+ %r2 = add nuw i128 %r, %zc
+ %lo_val = trunc i128 %r2 to i64
+ store i64 %lo_val, ptr %lo
+ %hi_shr = lshr i128 %r2, 64
+ %hi_val = trunc nuw i128 %hi_shr to i64
+ store i64 %hi_val, ptr %hi
+ ret void
+}
More information about the llvm-commits
mailing list