[llvm] 8e7e6a8 - [X86] Restore selection of MULX on BMI2 targets.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Wed May 27 12:01:37 PDT 2020
Author: Craig Topper
Date: 2020-05-27T12:01:18-07:00
New Revision: 8e7e6a8d6bae19c5a18e0d0daa0614272b85598c
URL: https://github.com/llvm/llvm-project/commit/8e7e6a8d6bae19c5a18e0d0daa0614272b85598c
DIFF: https://github.com/llvm/llvm-project/commit/8e7e6a8d6bae19c5a18e0d0daa0614272b85598c.diff
LOG: [X86] Restore selection of MULX on BMI2 targets.
Looking back over gcc and icc behavior, it looks like icc does
use mulx32 on 32-bit targets and mulx64 on 64-bit targets. It's
also used when dividing i32 by a constant on 32-bit targets and
i64 by a constant on 64-bit targets.
gcc uses it for multiplies producing a 64-bit result on 32-bit targets
and a 128-bit result on 64-bit targets. gcc does not appear to use
it for division by a constant.
After this patch clang is closer to the icc behavior. This
basically reverts d1c61861ddc94457b08a5a653d3908b7b38ebb22, but
there were no strong feelings at the time.
Fixes PR45518.
Differential Revision: https://reviews.llvm.org/D80498
Added:
Modified:
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
llvm/test/CodeGen/X86/atomic-unordered.ll
llvm/test/CodeGen/X86/bmi2-x86_64.ll
llvm/test/CodeGen/X86/bmi2.ll
llvm/test/CodeGen/X86/hoist-invariant-load.ll
llvm/test/CodeGen/X86/i128-mul.ll
llvm/test/CodeGen/X86/mulx32.ll
llvm/test/CodeGen/X86/mulx64.ll
llvm/test/CodeGen/X86/pr35636.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 91e0cdb80386..a5fa98ec8d92 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4758,17 +4758,24 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
unsigned Opc, MOpc;
unsigned LoReg, HiReg;
bool IsSigned = Opcode == ISD::SMUL_LOHI;
+ bool UseMULX = !IsSigned && Subtarget->hasBMI2();
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i32:
- Opc = IsSigned ? X86::IMUL32r : X86::MUL32r;
- MOpc = IsSigned ? X86::IMUL32m : X86::MUL32m;
- LoReg = X86::EAX; HiReg = X86::EDX;
+ Opc = UseMULX ? X86::MULX32rr :
+ IsSigned ? X86::IMUL32r : X86::MUL32r;
+ MOpc = UseMULX ? X86::MULX32rm :
+ IsSigned ? X86::IMUL32m : X86::MUL32m;
+ LoReg = UseMULX ? X86::EDX : X86::EAX;
+ HiReg = X86::EDX;
break;
case MVT::i64:
- Opc = IsSigned ? X86::IMUL64r : X86::MUL64r;
- MOpc = IsSigned ? X86::IMUL64m : X86::MUL64m;
- LoReg = X86::RAX; HiReg = X86::RDX;
+ Opc = UseMULX ? X86::MULX64rr :
+ IsSigned ? X86::IMUL64r : X86::MUL64r;
+ MOpc = UseMULX ? X86::MULX64rm :
+ IsSigned ? X86::IMUL64m : X86::MUL64m;
+ LoReg = UseMULX ? X86::RDX : X86::RAX;
+ HiReg = X86::RDX;
break;
}
@@ -4783,15 +4790,24 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
N0, SDValue()).getValue(1);
+ SDValue ResHi, ResLo;
if (foldedLoad) {
SDValue Chain;
MachineSDNode *CNode = nullptr;
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
InFlag };
- SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
- CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
- Chain = SDValue(CNode, 0);
- InFlag = SDValue(CNode, 1);
+ if (UseMULX) {
+ SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
+ CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ ResHi = SDValue(CNode, 0);
+ ResLo = SDValue(CNode, 1);
+ Chain = SDValue(CNode, 2);
+ } else {
+ SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
+ CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ Chain = SDValue(CNode, 0);
+ InFlag = SDValue(CNode, 1);
+ }
// Update the chain.
ReplaceUses(N1.getValue(1), Chain);
@@ -4799,27 +4815,38 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
} else {
SDValue Ops[] = { N1, InFlag };
- SDVTList VTs = CurDAG->getVTList(MVT::Glue);
- SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
- InFlag = SDValue(CNode, 0);
+ if (UseMULX) {
+ SDVTList VTs = CurDAG->getVTList(NVT, NVT);
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ ResHi = SDValue(CNode, 0);
+ ResLo = SDValue(CNode, 1);
+ } else {
+ SDVTList VTs = CurDAG->getVTList(MVT::Glue);
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ InFlag = SDValue(CNode, 0);
+ }
}
// Copy the low half of the result, if it is needed.
if (!SDValue(Node, 0).use_empty()) {
- assert(LoReg && "Register for low half is not defined!");
- SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
- NVT, InFlag);
- InFlag = ResLo.getValue(2);
+ if (!ResLo) {
+ assert(LoReg && "Register for low half is not defined!");
+ ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
+ NVT, InFlag);
+ InFlag = ResLo.getValue(2);
+ }
ReplaceUses(SDValue(Node, 0), ResLo);
LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
dbgs() << '\n');
}
// Copy the high half of the result, if it is needed.
if (!SDValue(Node, 1).use_empty()) {
- assert(HiReg && "Register for high half is not defined!");
- SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
- NVT, InFlag);
- InFlag = ResHi.getValue(2);
+ if (!ResHi) {
+ assert(HiReg && "Register for high half is not defined!");
+ ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
+ NVT, InFlag);
+ InFlag = ResHi.getValue(2);
+ }
ReplaceUses(SDValue(Node, 1), ResHi);
LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
dbgs() << '\n');
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index 9843bf81e905..b321820cf506 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -837,18 +837,16 @@ define i64 @load_fold_udiv1(i64* %p) {
;
; CHECK-O3-CUR-LABEL: load_fold_udiv1:
; CHECK-O3-CUR: # %bb.0:
-; CHECK-O3-CUR-NEXT: movq (%rdi), %rax
-; CHECK-O3-CUR-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
-; CHECK-O3-CUR-NEXT: mulq %rcx
-; CHECK-O3-CUR-NEXT: movq %rdx, %rax
+; CHECK-O3-CUR-NEXT: movq (%rdi), %rdx
+; CHECK-O3-CUR-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
+; CHECK-O3-CUR-NEXT: mulxq %rax, %rcx, %rax
; CHECK-O3-CUR-NEXT: shrq $3, %rax
; CHECK-O3-CUR-NEXT: retq
;
; CHECK-O3-EX-LABEL: load_fold_udiv1:
; CHECK-O3-EX: # %bb.0:
-; CHECK-O3-EX-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
-; CHECK-O3-EX-NEXT: mulq (%rdi)
-; CHECK-O3-EX-NEXT: movq %rdx, %rax
+; CHECK-O3-EX-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
+; CHECK-O3-EX-NEXT: mulxq (%rdi), %rcx, %rax
; CHECK-O3-EX-NEXT: shrq $3, %rax
; CHECK-O3-EX-NEXT: retq
%v = load atomic i64, i64* %p unordered, align 8
@@ -1033,15 +1031,14 @@ define i64 @load_fold_urem1(i64* %p) {
;
; CHECK-O3-LABEL: load_fold_urem1:
; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: movq (%rdi), %rcx
-; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
-; CHECK-O3-NEXT: movq %rcx, %rax
-; CHECK-O3-NEXT: mulq %rdx
+; CHECK-O3-NEXT: movq (%rdi), %rax
+; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
+; CHECK-O3-NEXT: movq %rax, %rdx
+; CHECK-O3-NEXT: mulxq %rcx, %rcx, %rdx
; CHECK-O3-NEXT: shrq $3, %rdx
-; CHECK-O3-NEXT: leaq (%rdx,%rdx,4), %rax
-; CHECK-O3-NEXT: leaq (%rax,%rax,2), %rax
-; CHECK-O3-NEXT: subq %rax, %rcx
-; CHECK-O3-NEXT: movq %rcx, %rax
+; CHECK-O3-NEXT: leaq (%rdx,%rdx,4), %rcx
+; CHECK-O3-NEXT: leaq (%rcx,%rcx,2), %rcx
+; CHECK-O3-NEXT: subq %rcx, %rax
; CHECK-O3-NEXT: retq
%v = load atomic i64, i64* %p unordered, align 8
%ret = urem i64 %v, 15
@@ -1694,28 +1691,28 @@ define void @rmw_fold_sdiv2(i64* %p, i64 %v) {
define void @rmw_fold_udiv1(i64* %p, i64 %v) {
; CHECK-O0-LABEL: rmw_fold_udiv1:
; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: movq (%rdi), %rax
-; CHECK-O0-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
-; CHECK-O0-NEXT: mulq %rcx
-; CHECK-O0-NEXT: shrq $3, %rdx
-; CHECK-O0-NEXT: movq %rdx, (%rdi)
+; CHECK-O0-NEXT: movq (%rdi), %rdx
+; CHECK-O0-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
+; CHECK-O0-NEXT: mulxq %rax, %rcx, %rax
+; CHECK-O0-NEXT: shrq $3, %rax
+; CHECK-O0-NEXT: movq %rax, (%rdi)
; CHECK-O0-NEXT: retq
;
; CHECK-O3-CUR-LABEL: rmw_fold_udiv1:
; CHECK-O3-CUR: # %bb.0:
-; CHECK-O3-CUR-NEXT: movq (%rdi), %rax
-; CHECK-O3-CUR-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
-; CHECK-O3-CUR-NEXT: mulq %rcx
-; CHECK-O3-CUR-NEXT: shrq $3, %rdx
-; CHECK-O3-CUR-NEXT: movq %rdx, (%rdi)
+; CHECK-O3-CUR-NEXT: movq (%rdi), %rdx
+; CHECK-O3-CUR-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
+; CHECK-O3-CUR-NEXT: mulxq %rax, %rax, %rcx
+; CHECK-O3-CUR-NEXT: shrq $3, %rcx
+; CHECK-O3-CUR-NEXT: movq %rcx, (%rdi)
; CHECK-O3-CUR-NEXT: retq
;
; CHECK-O3-EX-LABEL: rmw_fold_udiv1:
; CHECK-O3-EX: # %bb.0:
-; CHECK-O3-EX-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
-; CHECK-O3-EX-NEXT: mulq (%rdi)
-; CHECK-O3-EX-NEXT: shrq $3, %rdx
-; CHECK-O3-EX-NEXT: movq %rdx, (%rdi)
+; CHECK-O3-EX-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
+; CHECK-O3-EX-NEXT: mulxq (%rdi), %rax, %rcx
+; CHECK-O3-EX-NEXT: shrq $3, %rcx
+; CHECK-O3-EX-NEXT: movq %rcx, (%rdi)
; CHECK-O3-EX-NEXT: retq
%prev = load atomic i64, i64* %p unordered, align 8
%val = udiv i64 %prev, 15
@@ -1842,27 +1839,25 @@ define void @rmw_fold_urem1(i64* %p, i64 %v) {
; CHECK-O0: # %bb.0:
; CHECK-O0-NEXT: movq (%rdi), %rax
; CHECK-O0-NEXT: movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
-; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: mulq %rcx
-; CHECK-O0-NEXT: shrq $3, %rdx
-; CHECK-O0-NEXT: leaq (%rdx,%rdx,4), %rax
-; CHECK-O0-NEXT: leaq (%rax,%rax,2), %rax
-; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; CHECK-O0-NEXT: subq %rax, %rcx
-; CHECK-O0-NEXT: movq %rcx, (%rdi)
+; CHECK-O0-NEXT: movq %rax, %rdx
+; CHECK-O0-NEXT: mulxq %rcx, %rdx, %rcx
+; CHECK-O0-NEXT: shrq $3, %rcx
+; CHECK-O0-NEXT: leaq (%rcx,%rcx,4), %rcx
+; CHECK-O0-NEXT: leaq (%rcx,%rcx,2), %rcx
+; CHECK-O0-NEXT: subq %rcx, %rax
+; CHECK-O0-NEXT: movq %rax, (%rdi)
; CHECK-O0-NEXT: retq
;
; CHECK-O3-LABEL: rmw_fold_urem1:
; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: movq (%rdi), %rcx
-; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
-; CHECK-O3-NEXT: movq %rcx, %rax
-; CHECK-O3-NEXT: mulq %rdx
-; CHECK-O3-NEXT: shrq $3, %rdx
-; CHECK-O3-NEXT: leaq (%rdx,%rdx,4), %rax
+; CHECK-O3-NEXT: movq (%rdi), %rdx
+; CHECK-O3-NEXT: movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
+; CHECK-O3-NEXT: mulxq %rax, %rax, %rcx
+; CHECK-O3-NEXT: shrq $3, %rcx
+; CHECK-O3-NEXT: leaq (%rcx,%rcx,4), %rax
; CHECK-O3-NEXT: leaq (%rax,%rax,2), %rax
-; CHECK-O3-NEXT: subq %rax, %rcx
-; CHECK-O3-NEXT: movq %rcx, (%rdi)
+; CHECK-O3-NEXT: subq %rax, %rdx
+; CHECK-O3-NEXT: movq %rdx, (%rdi)
; CHECK-O3-NEXT: retq
%prev = load atomic i64, i64* %p unordered, align 8
%val = urem i64 %prev, 15
diff --git a/llvm/test/CodeGen/X86/bmi2-x86_64.ll b/llvm/test/CodeGen/X86/bmi2-x86_64.ll
index 6333732ae0f2..bb03138ccf76 100644
--- a/llvm/test/CodeGen/X86/bmi2-x86_64.ll
+++ b/llvm/test/CodeGen/X86/bmi2-x86_64.ll
@@ -68,8 +68,8 @@ define i64 @mulx64(i64 %x, i64 %y, i64* %p) {
; CHECK-LABEL: mulx64:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdx, %rcx
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: mulq %rsi
+; CHECK-NEXT: movq %rdi, %rdx
+; CHECK-NEXT: mulxq %rsi, %rax, %rdx
; CHECK-NEXT: movq %rdx, (%rcx)
; CHECK-NEXT: retq
%x1 = zext i64 %x to i128
@@ -86,8 +86,8 @@ define i64 @mulx64_load(i64 %x, i64* %y, i64* %p) {
; CHECK-LABEL: mulx64_load:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdx, %rcx
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: mulq (%rsi)
+; CHECK-NEXT: movq %rdi, %rdx
+; CHECK-NEXT: mulxq (%rsi), %rax, %rdx
; CHECK-NEXT: movq %rdx, (%rcx)
; CHECK-NEXT: retq
%y1 = load i64, i64* %y
diff --git a/llvm/test/CodeGen/X86/bmi2.ll b/llvm/test/CodeGen/X86/bmi2.ll
index 114f9ac5479a..bf78cb4f72ef 100644
--- a/llvm/test/CodeGen/X86/bmi2.ll
+++ b/llvm/test/CodeGen/X86/bmi2.ll
@@ -120,11 +120,11 @@ define i32 @mulx32(i32 %x, i32 %y, i32* %p) {
; X86-LABEL: mulx32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: addl %eax, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: addl %edx, %edx
-; X86-NEXT: mull %edx
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: mulxl %eax, %eax, %edx
; X86-NEXT: movl %edx, (%ecx)
; X86-NEXT: retl
;
@@ -156,10 +156,10 @@ define i32 @mulx32_load(i32 %x, i32* %y, i32* %p) {
; X86-LABEL: mulx32_load:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: addl %eax, %eax
-; X86-NEXT: mull (%edx)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: addl %edx, %edx
+; X86-NEXT: mulxl (%eax), %eax, %edx
; X86-NEXT: movl %edx, (%ecx)
; X86-NEXT: retl
;
diff --git a/llvm/test/CodeGen/X86/hoist-invariant-load.ll b/llvm/test/CodeGen/X86/hoist-invariant-load.ll
index 13b72bdfc6dc..73cf898223bc 100644
--- a/llvm/test/CodeGen/X86/hoist-invariant-load.ll
+++ b/llvm/test/CodeGen/X86/hoist-invariant-load.ll
@@ -215,22 +215,21 @@ declare i8* @objc_msgSend(i8*, i8*, ...) nonlazybind
define void @test_multi_def(i64* dereferenceable(8) %x1,
; CHECK-LABEL: test_multi_def:
; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: movq %rdx, %r8
-; CHECK-NEXT: xorl %r9d, %r9d
-; CHECK-NEXT: movq (%rdi), %rdi
-; CHECK-NEXT: movq (%rsi), %rsi
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: xorl %r8d, %r8d
+; CHECK-NEXT: movq (%rdi), %rdx
+; CHECK-NEXT: movq (%rsi), %r9
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB4_2: ## %for.body
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: mulq %rsi
-; CHECK-NEXT: addq %rax, (%r8)
-; CHECK-NEXT: adcq %rdx, 8(%r8)
+; CHECK-NEXT: mulxq %r9, %rsi, %rdi
+; CHECK-NEXT: addq %rsi, (%rax)
+; CHECK-NEXT: adcq %rdi, 8(%rax)
; CHECK-NEXT: ## %bb.1: ## %for.check
; CHECK-NEXT: ## in Loop: Header=BB4_2 Depth=1
-; CHECK-NEXT: incq %r9
-; CHECK-NEXT: addq $16, %r8
-; CHECK-NEXT: cmpq %rcx, %r9
+; CHECK-NEXT: incq %r8
+; CHECK-NEXT: addq $16, %rax
+; CHECK-NEXT: cmpq %rcx, %r8
; CHECK-NEXT: jl LBB4_2
; CHECK-NEXT: ## %bb.3: ## %exit
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll
index e40f10a67dd1..45834f2eeecd 100644
--- a/llvm/test/CodeGen/X86/i128-mul.ll
+++ b/llvm/test/CodeGen/X86/i128-mul.ll
@@ -7,48 +7,86 @@
; PR1198
define i64 @foo(i64 %x, i64 %y) nounwind {
-; X86-LABEL: foo:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %edi, %ebp
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: adcl %ebx, %ecx
-; X86-NEXT: setb %al
-; X86-NEXT: movzbl %al, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: adcl %edi, %edx
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl
+; X86-NOBMI-LABEL: foo:
+; X86-NOBMI: # %bb.0:
+; X86-NOBMI-NEXT: pushl %ebp
+; X86-NOBMI-NEXT: pushl %ebx
+; X86-NOBMI-NEXT: pushl %edi
+; X86-NOBMI-NEXT: pushl %esi
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT: movl %ecx, %eax
+; X86-NOBMI-NEXT: mull %ebx
+; X86-NOBMI-NEXT: movl %edx, %edi
+; X86-NOBMI-NEXT: movl %ebp, %eax
+; X86-NOBMI-NEXT: mull %ebx
+; X86-NOBMI-NEXT: movl %edx, %ebx
+; X86-NOBMI-NEXT: movl %eax, %ebp
+; X86-NOBMI-NEXT: addl %edi, %ebp
+; X86-NOBMI-NEXT: adcl $0, %ebx
+; X86-NOBMI-NEXT: movl %ecx, %eax
+; X86-NOBMI-NEXT: mull %esi
+; X86-NOBMI-NEXT: movl %edx, %ecx
+; X86-NOBMI-NEXT: addl %ebp, %eax
+; X86-NOBMI-NEXT: adcl %ebx, %ecx
+; X86-NOBMI-NEXT: setb %al
+; X86-NOBMI-NEXT: movzbl %al, %edi
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: mull %esi
+; X86-NOBMI-NEXT: addl %ecx, %eax
+; X86-NOBMI-NEXT: adcl %edi, %edx
+; X86-NOBMI-NEXT: popl %esi
+; X86-NOBMI-NEXT: popl %edi
+; X86-NOBMI-NEXT: popl %ebx
+; X86-NOBMI-NEXT: popl %ebp
+; X86-NOBMI-NEXT: retl
;
-; X64-LABEL: foo:
-; X64: # %bb.0:
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rax
-; X64-NEXT: retq
+; X86-BMI-LABEL: foo:
+; X86-BMI: # %bb.0:
+; X86-BMI-NEXT: pushl %ebp
+; X86-BMI-NEXT: pushl %ebx
+; X86-BMI-NEXT: pushl %edi
+; X86-BMI-NEXT: pushl %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: movl %eax, %edx
+; X86-BMI-NEXT: mulxl %esi, %edx, %ebx
+; X86-BMI-NEXT: movl %ecx, %edx
+; X86-BMI-NEXT: mulxl %esi, %esi, %ebp
+; X86-BMI-NEXT: addl %ebx, %esi
+; X86-BMI-NEXT: adcl $0, %ebp
+; X86-BMI-NEXT: movl %eax, %edx
+; X86-BMI-NEXT: mulxl %edi, %eax, %ebx
+; X86-BMI-NEXT: addl %esi, %eax
+; X86-BMI-NEXT: adcl %ebp, %ebx
+; X86-BMI-NEXT: setb %al
+; X86-BMI-NEXT: movzbl %al, %esi
+; X86-BMI-NEXT: movl %ecx, %edx
+; X86-BMI-NEXT: mulxl %edi, %eax, %edx
+; X86-BMI-NEXT: addl %ebx, %eax
+; X86-BMI-NEXT: adcl %esi, %edx
+; X86-BMI-NEXT: popl %esi
+; X86-BMI-NEXT: popl %edi
+; X86-BMI-NEXT: popl %ebx
+; X86-BMI-NEXT: popl %ebp
+; X86-BMI-NEXT: retl
+;
+; X64-NOBMI-LABEL: foo:
+; X64-NOBMI: # %bb.0:
+; X64-NOBMI-NEXT: movq %rdi, %rax
+; X64-NOBMI-NEXT: mulq %rsi
+; X64-NOBMI-NEXT: movq %rdx, %rax
+; X64-NOBMI-NEXT: retq
+;
+; X64-BMI-LABEL: foo:
+; X64-BMI: # %bb.0:
+; X64-BMI-NEXT: movq %rdi, %rdx
+; X64-BMI-NEXT: mulxq %rsi, %rcx, %rax
+; X64-BMI-NEXT: retq
%tmp0 = zext i64 %x to i128
%tmp1 = zext i64 %y to i128
%tmp2 = mul i128 %tmp0, %tmp1
@@ -62,107 +100,202 @@ define i64 @foo(i64 %x, i64 %y) nounwind {
; zero-extended value.
define i64 @mul1(i64 %n, i64* nocapture %z, i64* nocapture %x, i64 %y) nounwind {
-; X86-LABEL: mul1:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: je .LBB1_3
-; X86-NEXT: # %bb.1: # %for.body.preheader
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: xorl %ebp, %ebp
-; X86-NEXT: movl $0, (%esp) # 4-byte Folded Spill
-; X86-NEXT: .p2align 4, 0x90
-; X86-NEXT: .LBB1_2: # %for.body
-; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax,%ebp,8), %esi
-; X86-NEXT: movl 4(%eax,%ebp,8), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: mull %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ebx, %edi
-; X86-NEXT: adcl %ecx, %esi
-; X86-NEXT: setb %bl
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movzbl %bl, %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: adcl %esi, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %eax
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %ecx, (%esi,%ebp,8)
-; X86-NEXT: movl %edi, 4(%esi,%ebp,8)
-; X86-NEXT: addl $1, %ebp
-; X86-NEXT: movl (%esp), %edi # 4-byte Reload
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %ebp, %esi
-; X86-NEXT: xorl %ebx, %esi
-; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-NEXT: xorl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: jne .LBB1_2
-; X86-NEXT: .LBB1_3: # %for.end
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: addl $24, %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl
+; X86-NOBMI-LABEL: mul1:
+; X86-NOBMI: # %bb.0: # %entry
+; X86-NOBMI-NEXT: pushl %ebp
+; X86-NOBMI-NEXT: pushl %ebx
+; X86-NOBMI-NEXT: pushl %edi
+; X86-NOBMI-NEXT: pushl %esi
+; X86-NOBMI-NEXT: subl $24, %esp
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: orl %ecx, %eax
+; X86-NOBMI-NEXT: je .LBB1_3
+; X86-NOBMI-NEXT: # %bb.1: # %for.body.preheader
+; X86-NOBMI-NEXT: xorl %eax, %eax
+; X86-NOBMI-NEXT: xorl %edx, %edx
+; X86-NOBMI-NEXT: xorl %ebp, %ebp
+; X86-NOBMI-NEXT: movl $0, (%esp) # 4-byte Folded Spill
+; X86-NOBMI-NEXT: .p2align 4, 0x90
+; X86-NOBMI-NEXT: .LBB1_2: # %for.body
+; X86-NOBMI-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT: movl (%eax,%ebp,8), %esi
+; X86-NOBMI-NEXT: movl 4(%eax,%ebp,8), %ecx
+; X86-NOBMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOBMI-NEXT: movl %esi, %eax
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT: mull %edi
+; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOBMI-NEXT: movl %ecx, %eax
+; X86-NOBMI-NEXT: mull %edi
+; X86-NOBMI-NEXT: movl %edx, %ecx
+; X86-NOBMI-NEXT: movl %eax, %ebx
+; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NOBMI-NEXT: adcl $0, %ecx
+; X86-NOBMI-NEXT: movl %esi, %eax
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT: mull %edx
+; X86-NOBMI-NEXT: movl %edx, %esi
+; X86-NOBMI-NEXT: movl %eax, %edi
+; X86-NOBMI-NEXT: addl %ebx, %edi
+; X86-NOBMI-NEXT: adcl %ecx, %esi
+; X86-NOBMI-NEXT: setb %bl
+; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT: addl %esi, %eax
+; X86-NOBMI-NEXT: movzbl %bl, %esi
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT: adcl %esi, %edx
+; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NOBMI-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NOBMI-NEXT: adcl $0, %eax
+; X86-NOBMI-NEXT: adcl $0, %edx
+; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT: movl %ecx, (%esi,%ebp,8)
+; X86-NOBMI-NEXT: movl %edi, 4(%esi,%ebp,8)
+; X86-NOBMI-NEXT: addl $1, %ebp
+; X86-NOBMI-NEXT: movl (%esp), %edi # 4-byte Reload
+; X86-NOBMI-NEXT: adcl $0, %edi
+; X86-NOBMI-NEXT: movl %ebp, %esi
+; X86-NOBMI-NEXT: xorl %ebx, %esi
+; X86-NOBMI-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-NOBMI-NEXT: xorl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT: orl %esi, %edi
+; X86-NOBMI-NEXT: jne .LBB1_2
+; X86-NOBMI-NEXT: .LBB1_3: # %for.end
+; X86-NOBMI-NEXT: xorl %eax, %eax
+; X86-NOBMI-NEXT: xorl %edx, %edx
+; X86-NOBMI-NEXT: addl $24, %esp
+; X86-NOBMI-NEXT: popl %esi
+; X86-NOBMI-NEXT: popl %edi
+; X86-NOBMI-NEXT: popl %ebx
+; X86-NOBMI-NEXT: popl %ebp
+; X86-NOBMI-NEXT: retl
+;
+; X86-BMI-LABEL: mul1:
+; X86-BMI: # %bb.0: # %entry
+; X86-BMI-NEXT: pushl %ebp
+; X86-BMI-NEXT: pushl %ebx
+; X86-BMI-NEXT: pushl %edi
+; X86-BMI-NEXT: pushl %esi
+; X86-BMI-NEXT: subl $16, %esp
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: orl %ecx, %eax
+; X86-BMI-NEXT: je .LBB1_3
+; X86-BMI-NEXT: # %bb.1: # %for.body.preheader
+; X86-BMI-NEXT: xorl %ecx, %ecx
+; X86-BMI-NEXT: xorl %edx, %edx
+; X86-BMI-NEXT: xorl %ebx, %ebx
+; X86-BMI-NEXT: xorl %ebp, %ebp
+; X86-BMI-NEXT: .p2align 4, 0x90
+; X86-BMI-NEXT: .LBB1_2: # %for.body
+; X86-BMI-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-BMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: movl (%ecx,%ebx,8), %eax
+; X86-BMI-NEXT: movl 4(%ecx,%ebx,8), %esi
+; X86-BMI-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-BMI-NEXT: movl %eax, %edx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI-NEXT: mulxl %ecx, %edx, %edi
+; X86-BMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-BMI-NEXT: movl %esi, %edx
+; X86-BMI-NEXT: mulxl %ecx, %esi, %ecx
+; X86-BMI-NEXT: addl %edi, %esi
+; X86-BMI-NEXT: adcl $0, %ecx
+; X86-BMI-NEXT: movl %eax, %edx
+; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %edi, %eax
+; X86-BMI-NEXT: addl %esi, %edi
+; X86-BMI-NEXT: adcl %ecx, %eax
+; X86-BMI-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %ecx, %edx
+; X86-BMI-NEXT: setb (%esp) # 1-byte Folded Spill
+; X86-BMI-NEXT: addl %eax, %ecx
+; X86-BMI-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-BMI-NEXT: adcl %eax, %edx
+; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-BMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-BMI-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-BMI-NEXT: adcl $0, %ecx
+; X86-BMI-NEXT: adcl $0, %edx
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: movl %esi, (%eax,%ebx,8)
+; X86-BMI-NEXT: movl %edi, 4(%eax,%ebx,8)
+; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-BMI-NEXT: addl $1, %ebx
+; X86-BMI-NEXT: adcl $0, %ebp
+; X86-BMI-NEXT: movl %ebx, %eax
+; X86-BMI-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; X86-BMI-NEXT: movl %ebp, %esi
+; X86-BMI-NEXT: xorl %edi, %esi
+; X86-BMI-NEXT: orl %eax, %esi
+; X86-BMI-NEXT: jne .LBB1_2
+; X86-BMI-NEXT: .LBB1_3: # %for.end
+; X86-BMI-NEXT: xorl %eax, %eax
+; X86-BMI-NEXT: xorl %edx, %edx
+; X86-BMI-NEXT: addl $16, %esp
+; X86-BMI-NEXT: popl %esi
+; X86-BMI-NEXT: popl %edi
+; X86-BMI-NEXT: popl %ebx
+; X86-BMI-NEXT: popl %ebp
+; X86-BMI-NEXT: retl
+;
+; X64-NOBMI-LABEL: mul1:
+; X64-NOBMI: # %bb.0: # %entry
+; X64-NOBMI-NEXT: testq %rdi, %rdi
+; X64-NOBMI-NEXT: je .LBB1_3
+; X64-NOBMI-NEXT: # %bb.1: # %for.body.preheader
+; X64-NOBMI-NEXT: movq %rcx, %r8
+; X64-NOBMI-NEXT: movq %rdx, %r9
+; X64-NOBMI-NEXT: xorl %r10d, %r10d
+; X64-NOBMI-NEXT: xorl %ecx, %ecx
+; X64-NOBMI-NEXT: .p2align 4, 0x90
+; X64-NOBMI-NEXT: .LBB1_2: # %for.body
+; X64-NOBMI-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NOBMI-NEXT: movq %r8, %rax
+; X64-NOBMI-NEXT: mulq (%r9,%rcx,8)
+; X64-NOBMI-NEXT: addq %r10, %rax
+; X64-NOBMI-NEXT: adcq $0, %rdx
+; X64-NOBMI-NEXT: movq %rax, (%rsi,%rcx,8)
+; X64-NOBMI-NEXT: incq %rcx
+; X64-NOBMI-NEXT: cmpq %rcx, %rdi
+; X64-NOBMI-NEXT: movq %rdx, %r10
+; X64-NOBMI-NEXT: jne .LBB1_2
+; X64-NOBMI-NEXT: .LBB1_3: # %for.end
+; X64-NOBMI-NEXT: xorl %eax, %eax
+; X64-NOBMI-NEXT: retq
;
-; X64-LABEL: mul1:
-; X64: # %bb.0: # %entry
-; X64-NEXT: testq %rdi, %rdi
-; X64-NEXT: je .LBB1_3
-; X64-NEXT: # %bb.1: # %for.body.preheader
-; X64-NEXT: movq %rcx, %r8
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: xorl %r10d, %r10d
-; X64-NEXT: xorl %ecx, %ecx
-; X64-NEXT: .p2align 4, 0x90
-; X64-NEXT: .LBB1_2: # %for.body
-; X64-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq (%r9,%rcx,8)
-; X64-NEXT: addq %r10, %rax
-; X64-NEXT: adcq $0, %rdx
-; X64-NEXT: movq %rax, (%rsi,%rcx,8)
-; X64-NEXT: incq %rcx
-; X64-NEXT: cmpq %rcx, %rdi
-; X64-NEXT: movq %rdx, %r10
-; X64-NEXT: jne .LBB1_2
-; X64-NEXT: .LBB1_3: # %for.end
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: retq
+; X64-BMI-LABEL: mul1:
+; X64-BMI: # %bb.0: # %entry
+; X64-BMI-NEXT: testq %rdi, %rdi
+; X64-BMI-NEXT: je .LBB1_3
+; X64-BMI-NEXT: # %bb.1: # %for.body.preheader
+; X64-BMI-NEXT: movq %rcx, %r8
+; X64-BMI-NEXT: movq %rdx, %r9
+; X64-BMI-NEXT: xorl %r10d, %r10d
+; X64-BMI-NEXT: xorl %ecx, %ecx
+; X64-BMI-NEXT: .p2align 4, 0x90
+; X64-BMI-NEXT: .LBB1_2: # %for.body
+; X64-BMI-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-BMI-NEXT: movq %r8, %rdx
+; X64-BMI-NEXT: mulxq (%r9,%rcx,8), %rax, %rdx
+; X64-BMI-NEXT: addq %r10, %rax
+; X64-BMI-NEXT: adcq $0, %rdx
+; X64-BMI-NEXT: movq %rax, (%rsi,%rcx,8)
+; X64-BMI-NEXT: incq %rcx
+; X64-BMI-NEXT: cmpq %rcx, %rdi
+; X64-BMI-NEXT: movq %rdx, %r10
+; X64-BMI-NEXT: jne .LBB1_2
+; X64-BMI-NEXT: .LBB1_3: # %for.end
+; X64-BMI-NEXT: xorl %eax, %eax
+; X64-BMI-NEXT: retq
entry:
%conv = zext i64 %y to i128
%cmp11 = icmp eq i64 %n, 0
diff --git a/llvm/test/CodeGen/X86/mulx32.ll b/llvm/test/CodeGen/X86/mulx32.ll
index faf299f3a2df..872e72d503aa 100644
--- a/llvm/test/CodeGen/X86/mulx32.ll
+++ b/llvm/test/CodeGen/X86/mulx32.ll
@@ -5,8 +5,8 @@
define i64 @f1(i32 %a, i32 %b) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: mull {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: mulxl {{[0-9]+}}(%esp), %eax, %edx
; CHECK-NEXT: retl
%x = zext i32 %a to i64
%y = zext i32 %b to i64
@@ -17,9 +17,9 @@ define i64 @f1(i32 %a, i32 %b) {
define i64 @f2(i32 %a, i32* %p) {
; CHECK-LABEL: f2:
; CHECK: # %bb.0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: mull (%ecx)
+; CHECK-NEXT: mulxl (%eax), %eax, %edx
; CHECK-NEXT: retl
%b = load i32, i32* %p
%x = zext i32 %a to i64
diff --git a/llvm/test/CodeGen/X86/mulx64.ll b/llvm/test/CodeGen/X86/mulx64.ll
index 38f1d3ea5ab3..e038f3300093 100644
--- a/llvm/test/CodeGen/X86/mulx64.ll
+++ b/llvm/test/CodeGen/X86/mulx64.ll
@@ -5,8 +5,8 @@
define i128 @f1(i64 %a, i64 %b) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: mulq %rsi
+; CHECK-NEXT: movq %rdi, %rdx
+; CHECK-NEXT: mulxq %rsi, %rax, %rdx
; CHECK-NEXT: retq
%x = zext i64 %a to i128
%y = zext i64 %b to i128
@@ -17,8 +17,8 @@ define i128 @f1(i64 %a, i64 %b) {
define i128 @f2(i64 %a, i64* %p) {
; CHECK-LABEL: f2:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: mulq (%rsi)
+; CHECK-NEXT: movq %rdi, %rdx
+; CHECK-NEXT: mulxq (%rsi), %rax, %rdx
; CHECK-NEXT: retq
%b = load i64, i64* %p
%x = zext i64 %a to i128
diff --git a/llvm/test/CodeGen/X86/pr35636.ll b/llvm/test/CodeGen/X86/pr35636.ll
index a97af6a1ac67..07fb37f4b62a 100644
--- a/llvm/test/CodeGen/X86/pr35636.ll
+++ b/llvm/test/CodeGen/X86/pr35636.ll
@@ -5,11 +5,11 @@
define void @_Z15uint64_to_asciimPc(i64 %arg) {
; HSW-LABEL: _Z15uint64_to_asciimPc:
; HSW: # %bb.0: # %bb
-; HSW-NEXT: movq %rdi, %rax
-; HSW-NEXT: movabsq $811296384146066817, %rcx # imm = 0xB424DC35095CD81
-; HSW-NEXT: mulq %rcx
-; HSW-NEXT: shrq $42, %rdx
-; HSW-NEXT: imulq $281474977, %rdx, %rax # imm = 0x10C6F7A1
+; HSW-NEXT: movabsq $811296384146066817, %rax # imm = 0xB424DC35095CD81
+; HSW-NEXT: movq %rdi, %rdx
+; HSW-NEXT: mulxq %rax, %rax, %rcx
+; HSW-NEXT: shrq $42, %rcx
+; HSW-NEXT: imulq $281474977, %rcx, %rax # imm = 0x10C6F7A1
; HSW-NEXT: shrq $20, %rax
; HSW-NEXT: leal (%rax,%rax,4), %eax
; HSW-NEXT: addl $5, %eax
@@ -22,11 +22,11 @@ define void @_Z15uint64_to_asciimPc(i64 %arg) {
;
; ZN-LABEL: _Z15uint64_to_asciimPc:
; ZN: # %bb.0: # %bb
-; ZN-NEXT: movq %rdi, %rax
-; ZN-NEXT: movabsq $811296384146066817, %rcx # imm = 0xB424DC35095CD81
-; ZN-NEXT: mulq %rcx
-; ZN-NEXT: shrq $42, %rdx
-; ZN-NEXT: imulq $281474977, %rdx, %rax # imm = 0x10C6F7A1
+; ZN-NEXT: movabsq $811296384146066817, %rax # imm = 0xB424DC35095CD81
+; ZN-NEXT: movq %rdi, %rdx
+; ZN-NEXT: mulxq %rax, %rax, %rcx
+; ZN-NEXT: shrq $42, %rcx
+; ZN-NEXT: imulq $281474977, %rcx, %rax # imm = 0x10C6F7A1
; ZN-NEXT: shrq $20, %rax
; ZN-NEXT: leal 5(%rax,%rax,4), %eax
; ZN-NEXT: andl $134217727, %eax # imm = 0x7FFFFFF
More information about the llvm-commits
mailing list