[llvm] 07e8a78 - [X86] Add pseudo instructions to use MULX with a single destination when the low result isn't used.

Sat May 30 16:01:46 PDT 2020

Author: Craig Topper
Date: 2020-05-30T16:01:01-07:00
New Revision: 07e8a780d81bb58a0c7bd4da6cc0b9beaec3c788

URL: https://github.com/llvm/llvm-project/commit/07e8a780d81bb58a0c7bd4da6cc0b9beaec3c788
DIFF: https://github.com/llvm/llvm-project/commit/07e8a780d81bb58a0c7bd4da6cc0b9beaec3c788.diff

LOG: [X86] Add pseudo instructions to use MULX with a single destination when the low result isn't used.

The instruction is defined to only produce high result if both
destinations are the same. We can exploit this to avoid
unnecessarily clobbering a register.

In order to hide this from register allocation we use a pseudo
instruction and expand the result during MCInst creation.

Differential Revision: https://reviews.llvm.org/D80500

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
    llvm/lib/Target/X86/X86InstrArithmetic.td
    llvm/lib/Target/X86/X86MCInstLower.cpp
    llvm/test/CodeGen/X86/atomic-unordered.ll
    llvm/test/CodeGen/X86/i128-mul.ll
    llvm/test/CodeGen/X86/pr35636.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index a5fa98ec8d92..efdea78e1db9 100644

--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4759,20 +4759,25 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     unsigned LoReg, HiReg;
     bool IsSigned = Opcode == ISD::SMUL_LOHI;
     bool UseMULX = !IsSigned && Subtarget->hasBMI2();
+    bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
     switch (NVT.SimpleTy) {
     default: llvm_unreachable("Unsupported VT!");
     case MVT::i32:
-      Opc  = UseMULX ? X86::MULX32rr :
+      Opc  = UseMULXHi ? X86::MULX32Hrr :
+             UseMULX ? X86::MULX32rr :
              IsSigned ? X86::IMUL32r : X86::MUL32r;
-      MOpc = UseMULX ? X86::MULX32rm :
+      MOpc = UseMULXHi ? X86::MULX32Hrm :
+             UseMULX ? X86::MULX32rm :
              IsSigned ? X86::IMUL32m : X86::MUL32m;
       LoReg = UseMULX ? X86::EDX : X86::EAX;
       HiReg = X86::EDX;
       break;
     case MVT::i64:
-      Opc  = UseMULX ? X86::MULX64rr :
+      Opc  = UseMULXHi ? X86::MULX64Hrr :
+             UseMULX ? X86::MULX64rr :
              IsSigned ? X86::IMUL64r : X86::MUL64r;
-      MOpc = UseMULX ? X86::MULX64rm :
+      MOpc = UseMULXHi ? X86::MULX64Hrm :
+             UseMULX ? X86::MULX64rm :
              IsSigned ? X86::IMUL64m : X86::MUL64m;
       LoReg = UseMULX ? X86::RDX : X86::RAX;
       HiReg = X86::RDX;
@@ -4796,7 +4801,12 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       MachineSDNode *CNode = nullptr;
       SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                         InFlag };
-      if (UseMULX) {
+      if (UseMULXHi) {
+        SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
+        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+        ResHi = SDValue(CNode, 0);
+        Chain = SDValue(CNode, 1);
+      } else if (UseMULX) {
         SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
         CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
         ResHi = SDValue(CNode, 0);
@@ -4815,7 +4825,11 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
     } else {
       SDValue Ops[] = { N1, InFlag };
-      if (UseMULX) {
+      if (UseMULXHi) {
+        SDVTList VTs = CurDAG->getVTList(NVT);
+        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+        ResHi = SDValue(CNode, 0);
+      } else if (UseMULX) {
         SDVTList VTs = CurDAG->getVTList(NVT, NVT);
         SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
         ResHi = SDValue(CNode, 0);

diff  --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td
index 57ad893402aa..8bb3b755135b 100644
--- a/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -1313,7 +1313,17 @@ let hasSideEffects = 0 in {
   let mayLoad = 1 in
   def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
              !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
+
              []>, T8XD, VEX_4V, Sched<[sched.Folded, WriteIMulH]>;
+
+  // Pseudo instructions to be used when the low result isn't used. The
+  // instruction is defined to keep the high if both destinations are the same.
+  def Hrr : PseudoI<(outs RC:$dst), (ins RC:$src),
+                    []>, Sched<[sched]>;
+
+  let mayLoad = 1 in
+  def Hrm : PseudoI<(outs RC:$dst), (ins x86memop:$src),
+                    []>, Sched<[sched.Folded]>;
 }
 }
 

diff  --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 1e5596b2a90a..1d0df17e1ceb 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -509,6 +509,26 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
            "LEA has segment specified!");
     break;
 
+  case X86::MULX32Hrr:
+  case X86::MULX32Hrm:
+  case X86::MULX64Hrr:
+  case X86::MULX64Hrm: {
+    // Turn into regular MULX by duplicating the destination.
+    unsigned NewOpc;
+    switch (OutMI.getOpcode()) {
+    default: llvm_unreachable("Invalid opcode");
+    case X86::MULX32Hrr: NewOpc = X86::MULX32rr; break;
+    case X86::MULX32Hrm: NewOpc = X86::MULX32rr; break;
+    case X86::MULX64Hrr: NewOpc = X86::MULX64rr; break;
+    case X86::MULX64Hrm: NewOpc = X86::MULX64rm; break;
+    }
+    OutMI.setOpcode(NewOpc);
+    // Duplicate the destination.
+    unsigned DestReg = OutMI.getOperand(0).getReg();
+    OutMI.insert(OutMI.begin(), MCOperand::createReg(DestReg));
+    break;
+  }
+
   // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
   // if one of the registers is extended, but other isn't.
   case X86::VMOVZPQILo2PQIrr:

diff  --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index b321820cf506..16fde4074ea0 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -839,14 +839,14 @@ define i64 @load_fold_udiv1(i64* %p) {
 ; CHECK-O3-CUR:       # %bb.0:
 ; CHECK-O3-CUR-NEXT:    movq (%rdi), %rdx
 ; CHECK-O3-CUR-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
-; CHECK-O3-CUR-NEXT:    mulxq %rax, %rcx, %rax
+; CHECK-O3-CUR-NEXT:    mulxq %rax, %rax, %rax
 ; CHECK-O3-CUR-NEXT:    shrq $3, %rax
 ; CHECK-O3-CUR-NEXT:    retq
 ;
 ; CHECK-O3-EX-LABEL: load_fold_udiv1:
 ; CHECK-O3-EX:       # %bb.0:
 ; CHECK-O3-EX-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
-; CHECK-O3-EX-NEXT:    mulxq (%rdi), %rcx, %rax
+; CHECK-O3-EX-NEXT:    mulxq (%rdi), %rax, %rax
 ; CHECK-O3-EX-NEXT:    shrq $3, %rax
 ; CHECK-O3-EX-NEXT:    retq
   %v = load atomic i64, i64* %p unordered, align 8
@@ -1034,9 +1034,9 @@ define i64 @load_fold_urem1(i64* %p) {
 ; CHECK-O3-NEXT:    movq (%rdi), %rax
 ; CHECK-O3-NEXT:    movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
 ; CHECK-O3-NEXT:    movq %rax, %rdx
-; CHECK-O3-NEXT:    mulxq %rcx, %rcx, %rdx
-; CHECK-O3-NEXT:    shrq $3, %rdx
-; CHECK-O3-NEXT:    leaq (%rdx,%rdx,4), %rcx
+; CHECK-O3-NEXT:    mulxq %rcx, %rcx, %rcx
+; CHECK-O3-NEXT:    shrq $3, %rcx
+; CHECK-O3-NEXT:    leaq (%rcx,%rcx,4), %rcx
 ; CHECK-O3-NEXT:    leaq (%rcx,%rcx,2), %rcx
 ; CHECK-O3-NEXT:    subq %rcx, %rax
 ; CHECK-O3-NEXT:    retq
@@ -1693,7 +1693,7 @@ define void @rmw_fold_udiv1(i64* %p, i64 %v) {
 ; CHECK-O0:       # %bb.0:
 ; CHECK-O0-NEXT:    movq (%rdi), %rdx
 ; CHECK-O0-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
-; CHECK-O0-NEXT:    mulxq %rax, %rcx, %rax
+; CHECK-O0-NEXT:    mulxq %rax, %rax, %rax
 ; CHECK-O0-NEXT:    shrq $3, %rax
 ; CHECK-O0-NEXT:    movq %rax, (%rdi)
 ; CHECK-O0-NEXT:    retq
@@ -1702,17 +1702,17 @@ define void @rmw_fold_udiv1(i64* %p, i64 %v) {
 ; CHECK-O3-CUR:       # %bb.0:
 ; CHECK-O3-CUR-NEXT:    movq (%rdi), %rdx
 ; CHECK-O3-CUR-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
-; CHECK-O3-CUR-NEXT:    mulxq %rax, %rax, %rcx
-; CHECK-O3-CUR-NEXT:    shrq $3, %rcx
-; CHECK-O3-CUR-NEXT:    movq %rcx, (%rdi)
+; CHECK-O3-CUR-NEXT:    mulxq %rax, %rax, %rax
+; CHECK-O3-CUR-NEXT:    shrq $3, %rax
+; CHECK-O3-CUR-NEXT:    movq %rax, (%rdi)
 ; CHECK-O3-CUR-NEXT:    retq
 ;
 ; CHECK-O3-EX-LABEL: rmw_fold_udiv1:
 ; CHECK-O3-EX:       # %bb.0:
 ; CHECK-O3-EX-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
-; CHECK-O3-EX-NEXT:    mulxq (%rdi), %rax, %rcx
-; CHECK-O3-EX-NEXT:    shrq $3, %rcx
-; CHECK-O3-EX-NEXT:    movq %rcx, (%rdi)
+; CHECK-O3-EX-NEXT:    mulxq (%rdi), %rax, %rax
+; CHECK-O3-EX-NEXT:    shrq $3, %rax
+; CHECK-O3-EX-NEXT:    movq %rax, (%rdi)
 ; CHECK-O3-EX-NEXT:    retq
   %prev = load atomic i64, i64* %p unordered, align 8
   %val = udiv i64 %prev, 15
@@ -1840,7 +1840,7 @@ define void @rmw_fold_urem1(i64* %p, i64 %v) {
 ; CHECK-O0-NEXT:    movq (%rdi), %rax
 ; CHECK-O0-NEXT:    movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
 ; CHECK-O0-NEXT:    movq %rax, %rdx
-; CHECK-O0-NEXT:    mulxq %rcx, %rdx, %rcx
+; CHECK-O0-NEXT:    mulxq %rcx, %rcx, %rcx
 ; CHECK-O0-NEXT:    shrq $3, %rcx
 ; CHECK-O0-NEXT:    leaq (%rcx,%rcx,4), %rcx
 ; CHECK-O0-NEXT:    leaq (%rcx,%rcx,2), %rcx
@@ -1852,9 +1852,9 @@ define void @rmw_fold_urem1(i64* %p, i64 %v) {
 ; CHECK-O3:       # %bb.0:
 ; CHECK-O3-NEXT:    movq (%rdi), %rdx
 ; CHECK-O3-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
-; CHECK-O3-NEXT:    mulxq %rax, %rax, %rcx
-; CHECK-O3-NEXT:    shrq $3, %rcx
-; CHECK-O3-NEXT:    leaq (%rcx,%rcx,4), %rax
+; CHECK-O3-NEXT:    mulxq %rax, %rax, %rax
+; CHECK-O3-NEXT:    shrq $3, %rax
+; CHECK-O3-NEXT:    leaq (%rax,%rax,4), %rax
 ; CHECK-O3-NEXT:    leaq (%rax,%rax,2), %rax
 ; CHECK-O3-NEXT:    subq %rax, %rdx
 ; CHECK-O3-NEXT:    movq %rdx, (%rdi)

diff  --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll
index 45834f2eeecd..118fd94342f5 100644
--- a/llvm/test/CodeGen/X86/i128-mul.ll
+++ b/llvm/test/CodeGen/X86/i128-mul.ll
@@ -54,7 +54,7 @@ define i64 @foo(i64 %x, i64 %y) nounwind {
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-BMI-NEXT:    movl %eax, %edx
-; X86-BMI-NEXT:    mulxl %esi, %edx, %ebx
+; X86-BMI-NEXT:    mulxl %esi, %ebx, %ebx
 ; X86-BMI-NEXT:    movl %ecx, %edx
 ; X86-BMI-NEXT:    mulxl %esi, %esi, %ebp
 ; X86-BMI-NEXT:    addl %ebx, %esi
@@ -85,7 +85,7 @@ define i64 @foo(i64 %x, i64 %y) nounwind {
 ; X64-BMI-LABEL: foo:
 ; X64-BMI:       # %bb.0:
 ; X64-BMI-NEXT:    movq %rdi, %rdx
-; X64-BMI-NEXT:    mulxq %rsi, %rcx, %rax
+; X64-BMI-NEXT:    mulxq %rsi, %rax, %rax
 ; X64-BMI-NEXT:    retq
   %tmp0 = zext i64 %x to i128
   %tmp1 = zext i64 %y to i128

diff  --git a/llvm/test/CodeGen/X86/pr35636.ll b/llvm/test/CodeGen/X86/pr35636.ll
index 07fb37f4b62a..ed4ef292c660 100644
--- a/llvm/test/CodeGen/X86/pr35636.ll
+++ b/llvm/test/CodeGen/X86/pr35636.ll
@@ -7,9 +7,9 @@ define void @_Z15uint64_to_asciimPc(i64 %arg) {
 ; HSW:       # %bb.0: # %bb
 ; HSW-NEXT:    movabsq $811296384146066817, %rax # imm = 0xB424DC35095CD81
 ; HSW-NEXT:    movq %rdi, %rdx
-; HSW-NEXT:    mulxq %rax, %rax, %rcx
-; HSW-NEXT:    shrq $42, %rcx
-; HSW-NEXT:    imulq $281474977, %rcx, %rax # imm = 0x10C6F7A1
+; HSW-NEXT:    mulxq %rax, %rax, %rax
+; HSW-NEXT:    shrq $42, %rax
+; HSW-NEXT:    imulq $281474977, %rax, %rax # imm = 0x10C6F7A1
 ; HSW-NEXT:    shrq $20, %rax
 ; HSW-NEXT:    leal (%rax,%rax,4), %eax
 ; HSW-NEXT:    addl $5, %eax
@@ -24,9 +24,9 @@ define void @_Z15uint64_to_asciimPc(i64 %arg) {
 ; ZN:       # %bb.0: # %bb
 ; ZN-NEXT:    movabsq $811296384146066817, %rax # imm = 0xB424DC35095CD81
 ; ZN-NEXT:    movq %rdi, %rdx
-; ZN-NEXT:    mulxq %rax, %rax, %rcx
-; ZN-NEXT:    shrq $42, %rcx
-; ZN-NEXT:    imulq $281474977, %rcx, %rax # imm = 0x10C6F7A1
+; ZN-NEXT:    mulxq %rax, %rax, %rax
+; ZN-NEXT:    shrq $42, %rax
+; ZN-NEXT:    imulq $281474977, %rax, %rax # imm = 0x10C6F7A1
 ; ZN-NEXT:    shrq $20, %rax
 ; ZN-NEXT:    leal 5(%rax,%rax,4), %eax
 ; ZN-NEXT:    andl $134217727, %eax # imm = 0x7FFFFFF