[llvm] [X86] remove unnecessary movs when %rdx is an input to mulx (PR #184462)

Aiden Grossman via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 4 06:49:37 PST 2026


https://github.com/boomanaiden154 updated https://github.com/llvm/llvm-project/pull/184462

>From 6b18ed078ed7a9a1bdf54d751ac1afa34859c557 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Tue, 3 Mar 2026 17:49:52 -0500
Subject: [PATCH] [X86] remove unnecessary movs when %rdx is an argument to
 mulx

---
 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp       | 18 +++++
 .../CodeGen/X86/mulx64-no-implicit-copy.ll    | 69 +++++++++++++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/mulx64-no-implicit-copy.ll

diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 101ea3e231a5c..cc846e0d1492e 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -5882,6 +5882,24 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
         std::swap(N0, N1);
     }
 
+    // For MULX, the implicit source must be in RDX (LoReg). If N1 is
+    // already a CopyFromReg of LoReg and N0 is not, flip so that N0
+    // (which feeds the CopyToReg below) is the operand already in LoReg,
+    // avoiding an unnecessary register-to-register copy before the multiply.
+    if (UseMULX && !foldedLoad) {
+      MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
+      auto GetPhysReg = [&](SDValue V) -> Register {
+        if (V.getOpcode() != ISD::CopyFromReg)
+          return Register();
+        Register Reg = cast<RegisterSDNode>(V.getOperand(1))->getReg();
+        if (Reg.isVirtual())
+          return MRI.getLiveInPhysReg(Reg);
+        return Reg;
+      };
+      if (GetPhysReg(N1) == LoReg && GetPhysReg(N0) != LoReg)
+        std::swap(N0, N1);
+    }
+
     SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
                                           N0, SDValue()).getValue(1);
     SDValue ResHi, ResLo;
diff --git a/llvm/test/CodeGen/X86/mulx64-no-implicit-copy.ll b/llvm/test/CodeGen/X86/mulx64-no-implicit-copy.ll
new file mode 100644
index 0000000000000..60f8309440f29
--- /dev/null
+++ b/llvm/test/CodeGen/X86/mulx64-no-implicit-copy.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+bmi2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=core-avx2 | FileCheck %s
+;
+; When a MULX operand already lives in RDX (the implicit source register) at
+; function entry, no register-copy preamble should be emitted before the MULX.
+; In SysV x86-64 ABI the 3rd integer argument arrives in %rdx.
+
+; Pure 64x64->128 multiply: arg 'a' is the 3rd argument, arriving in %rdx.
+; The mul i128 node is emitted with operands in (b, a) order (as clang does).
+define void @mul64_u128_a_in_rdx(ptr %hi, ptr %lo, i64 %a, i64 %b) {
+; CHECK-LABEL: mul64_u128_a_in_rdx:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mulxq %rcx, %rcx, %rax
+; CHECK-NEXT:    movq %rcx, (%rsi)
+; CHECK-NEXT:    movq %rax, (%rdi)
+; CHECK-NEXT:    retq
+  %za = zext i64 %a to i128
+  %zb = zext i64 %b to i128
+  %r = mul nuw i128 %zb, %za
+  %lo_val = trunc i128 %r to i64
+  store i64 %lo_val, ptr %lo
+  %hi_shr = lshr i128 %r, 64
+  %hi_val = trunc nuw i128 %hi_shr to i64
+  store i64 %hi_val, ptr %hi
+  ret void
+}
+
+; Same multiply with operands in natural (a, b) order.
+define void @mul64_u128_a_in_rdx_natural_order(ptr %hi, ptr %lo, i64 %a, i64 %b) {
+; CHECK-LABEL: mul64_u128_a_in_rdx_natural_order:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mulxq %rcx, %rcx, %rax
+; CHECK-NEXT:    movq %rcx, (%rsi)
+; CHECK-NEXT:    movq %rax, (%rdi)
+; CHECK-NEXT:    retq
+  %za = zext i64 %a to i128
+  %zb = zext i64 %b to i128
+  %r = mul nuw i128 %za, %zb
+  %lo_val = trunc i128 %r to i64
+  store i64 %lo_val, ptr %lo
+  %hi_shr = lshr i128 %r, 64
+  %hi_val = trunc nuw i128 %hi_shr to i64
+  store i64 %hi_val, ptr %hi
+  ret void
+}
+
+; Multiply-add: hi:lo = a*b + c.  'a' is the 3rd arg in %rdx, 'c' is in %r8.
+define void @muladd64_a_in_rdx(ptr %hi, ptr %lo, i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: muladd64_a_in_rdx:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mulxq %rcx, %rax, %rcx
+; CHECK-NEXT:    addq %r8, %rax
+; CHECK-NEXT:    adcq $0, %rcx
+; CHECK-NEXT:    movq %rax, (%rsi)
+; CHECK-NEXT:    movq %rcx, (%rdi)
+; CHECK-NEXT:    retq
+  %za = zext i64 %a to i128
+  %zb = zext i64 %b to i128
+  %r = mul nuw i128 %zb, %za
+  %zc = zext i64 %c to i128
+  %r2 = add nuw i128 %r, %zc
+  %lo_val = trunc i128 %r2 to i64
+  store i64 %lo_val, ptr %lo
+  %hi_shr = lshr i128 %r2, 64
+  %hi_val = trunc nuw i128 %hi_shr to i64
+  store i64 %hi_val, ptr %hi
+  ret void
+}



More information about the llvm-commits mailing list