[llvm] 2090e85 - [llvm/CodeGen] Enable the ExpandLargeDivRem pass for X86, Arm and AArch64
Matthias Gehre via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 6 07:32:17 PDT 2022
Author: Matthias Gehre
Date: 2022-09-06T15:32:04+01:00
New Revision: 2090e85fee9b2d2a1ca6402b5f44c7d41d1e353f
URL: https://github.com/llvm/llvm-project/commit/2090e85fee9b2d2a1ca6402b5f44c7d41d1e353f
DIFF: https://github.com/llvm/llvm-project/commit/2090e85fee9b2d2a1ca6402b5f44c7d41d1e353f.diff
LOG: [llvm/CodeGen] Enable the ExpandLargeDivRem pass for X86, Arm and AArch64
This adds the ExpandLargeDivRem to the default pass pipeline.
The limit at which it expands div/rem instructions is configured
via a new TargetTransformInfo hook (default: no expansion)
X86, Arm and AArch64 backends implement this hook to expand div/rem
instructions with more than 128 bits.
Differential Revision: https://reviews.llvm.org/D130076
Added:
llvm/test/CodeGen/AArch64/udivmodei5.ll
llvm/test/CodeGen/ARM/udivmodei5.ll
llvm/test/CodeGen/X86/udivmodei5.ll
Modified:
llvm/include/llvm/Analysis/TargetTransformInfo.h
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
llvm/lib/Analysis/TargetTransformInfo.cpp
llvm/lib/CodeGen/ExpandLargeDivRem.cpp
llvm/lib/CodeGen/TargetPassConfig.cpp
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
llvm/lib/Target/ARM/ARMTargetTransformInfo.h
llvm/lib/Target/X86/X86TargetTransformInfo.cpp
llvm/lib/Target/X86/X86TargetTransformInfo.h
llvm/test/CodeGen/AArch64/O0-pipeline.ll
llvm/test/CodeGen/AArch64/O3-pipeline.ll
llvm/test/CodeGen/ARM/O3-pipeline.ll
llvm/test/CodeGen/X86/O0-pipeline.ll
llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
llvm/test/CodeGen/X86/i128-sdiv.ll
llvm/test/CodeGen/X86/i128-udiv.ll
llvm/test/CodeGen/X86/opt-pipeline.ll
llvm/test/CodeGen/X86/pr38539.ll
Removed:
llvm/test/CodeGen/X86/libcall-sret.ll
################################################################################
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 525b5db10e35c..2b3c5a0a469fb 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -687,6 +687,9 @@ class TargetTransformInfo {
/// would typically be allowed using throughput or size cost models.
bool hasDivRemOp(Type *DataType, bool IsSigned) const;
+ /// Returns the maximum bitwidth of legal div and rem instructions.
+ unsigned maxLegalDivRemBitWidth() const;
+
/// Return true if the given instruction (assumed to be a memory access
/// instruction) has a volatile variant. If that's the case then we can avoid
/// addrspacecast to generic AS for volatile loads/stores. Default
@@ -1641,6 +1644,7 @@ class TargetTransformInfo::Concept {
const SmallBitVector &OpcodeMask) const = 0;
virtual bool enableOrderedReductions() = 0;
virtual bool hasDivRemOp(Type *DataType, bool IsSigned) = 0;
+ virtual unsigned maxLegalDivRemBitWidth() = 0;
virtual bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) = 0;
virtual bool prefersVectorizedAddressing() = 0;
virtual InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
@@ -2088,6 +2092,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
bool hasDivRemOp(Type *DataType, bool IsSigned) override {
return Impl.hasDivRemOp(DataType, IsSigned);
}
+ unsigned maxLegalDivRemBitWidth() override {
+ return Impl.maxLegalDivRemBitWidth();
+ }
bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) override {
return Impl.hasVolatileVariant(I, AddrSpace);
}
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 8342a82197ea8..487a439264433 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -291,6 +291,10 @@ class TargetTransformInfoImplBase {
bool hasDivRemOp(Type *DataType, bool IsSigned) const { return false; }
+ unsigned maxLegalDivRemBitWidth() const {
+ return llvm::IntegerType::MAX_INT_BITS;
+ }
+
bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) const {
return false;
}
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index d009f2fc0bdd0..46b39669daec7 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -451,6 +451,10 @@ bool TargetTransformInfo::hasDivRemOp(Type *DataType, bool IsSigned) const {
return TTIImpl->hasDivRemOp(DataType, IsSigned);
}
+unsigned TargetTransformInfo::maxLegalDivRemBitWidth() const {
+ return TTIImpl->maxLegalDivRemBitWidth();
+}
+
bool TargetTransformInfo::hasVolatileVariant(Instruction *I,
unsigned AddrSpace) const {
return TTIImpl->hasVolatileVariant(I, AddrSpace);
diff --git a/llvm/lib/CodeGen/ExpandLargeDivRem.cpp b/llvm/lib/CodeGen/ExpandLargeDivRem.cpp
index fa1288a287d3d..1fa2993fd5f90 100644
--- a/llvm/lib/CodeGen/ExpandLargeDivRem.cpp
+++ b/llvm/lib/CodeGen/ExpandLargeDivRem.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
@@ -30,14 +31,37 @@
using namespace llvm;
static cl::opt<unsigned>
- ExpandDivRemBits("expand-div-rem-bits", cl::Hidden, cl::init(128),
+ ExpandDivRemBits("expand-div-rem-bits", cl::Hidden,
+ cl::init(llvm::IntegerType::MAX_INT_BITS),
cl::desc("div and rem instructions on integers with "
"more than <N> bits are expanded."));
-static bool runImpl(Function &F) {
+static bool isConstantPowerOfTwo(llvm::Value *V, bool SignedOp) {
+ auto *C = dyn_cast<ConstantInt>(V);
+ if (!C)
+ return false;
+
+ APInt Val = C->getValue();
+ if (SignedOp && Val.isNegative())
+ Val = -Val;
+ return Val.isPowerOf2();
+}
+
+static bool isSigned(unsigned int Opcode) {
+ return Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
+}
+
+static bool runImpl(Function &F, const TargetTransformInfo &TTI) {
SmallVector<BinaryOperator *, 4> Replace;
bool Modified = false;
+ unsigned MaxLegalDivRemBitWidth = TTI.maxLegalDivRemBitWidth();
+ if (ExpandDivRemBits != llvm::IntegerType::MAX_INT_BITS)
+ MaxLegalDivRemBitWidth = ExpandDivRemBits;
+
+ if (MaxLegalDivRemBitWidth >= llvm::IntegerType::MAX_INT_BITS)
+ return false;
+
for (auto &I : instructions(F)) {
switch (I.getOpcode()) {
case Instruction::UDiv:
@@ -46,7 +70,11 @@ static bool runImpl(Function &F) {
case Instruction::SRem: {
// TODO: This doesn't handle vectors.
auto *IntTy = dyn_cast<IntegerType>(I.getType());
- if (!IntTy || IntTy->getIntegerBitWidth() <= ExpandDivRemBits)
+ if (!IntTy || IntTy->getIntegerBitWidth() <= MaxLegalDivRemBitWidth)
+ continue;
+
+ // The backend has peephole optimizations for powers of two.
+ if (isConstantPowerOfTwo(I.getOperand(1), isSigned(I.getOpcode())))
continue;
Replace.push_back(&cast<BinaryOperator>(I));
@@ -77,7 +105,8 @@ static bool runImpl(Function &F) {
PreservedAnalyses ExpandLargeDivRemPass::run(Function &F,
FunctionAnalysisManager &AM) {
- bool Changed = runImpl(F);
+ TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+ bool Changed = runImpl(F, TTI);
if (Changed)
return PreservedAnalyses::none();
@@ -93,9 +122,13 @@ class ExpandLargeDivRemLegacyPass : public FunctionPass {
initializeExpandLargeDivRemLegacyPassPass(*PassRegistry::getPassRegistry());
}
- bool runOnFunction(Function &F) override { return runImpl(F); }
+ bool runOnFunction(Function &F) override {
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return runImpl(F, TTI);
+ }
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index ab8a872699ed2..09cc3143a74e8 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1113,6 +1113,7 @@ bool TargetPassConfig::addISelPasses() {
addPass(createPreISelIntrinsicLoweringPass());
PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
+ addPass(createExpandLargeDivRemPass());
addIRPasses();
addCodeGenPrepare();
addPassesToHandleExceptions();
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 955e90e19ffa4..bf917b5b9d84b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -319,6 +319,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
bool enableOrderedReductions() const { return true; }
+ unsigned maxLegalDivRemBitWidth() const { return 128; }
+
InstructionCost getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index c0180923c240a..3466be0001a8f 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -207,6 +207,8 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
return isLegalMaskedGather(Ty, Alignment);
}
+ unsigned maxLegalDivRemBitWidth() const { return 64; }
+
InstructionCost getMemcpyCost(const Instruction *I);
int getNumMemOps(const IntrinsicInst *I) const;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 538e724ed28c6..2aaac9926c80a 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5734,6 +5734,10 @@ bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction* I) {
return BaseT::isExpensiveToSpeculativelyExecute(I);
}
+unsigned X86TTIImpl::maxLegalDivRemBitWidth() const {
+ return ST->is64Bit() ? 128 : 64;
+}
+
bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
return false;
}
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index f74433beb02af..18db9999a467a 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -255,6 +255,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
const SmallBitVector &OpcodeMask) const;
bool hasDivRemOp(Type *DataType, bool IsSigned);
bool isExpensiveToSpeculativelyExecute(const Instruction *I);
+ unsigned maxLegalDivRemBitWidth() const;
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty);
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
index a3e0943c70149..3c42d1adecd31 100644
--- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
@@ -15,6 +15,7 @@
; CHECK-NEXT: ModulePass Manager
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
+; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Module Verifier
; CHECK-NEXT: Lower Garbage Collection Instructions
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index 70ffa2640475d..5c016b7c9b38b 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -18,6 +18,7 @@
; CHECK-NEXT: ModulePass Manager
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
+; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: SVE intrinsics optimizations
; CHECK-NEXT: FunctionPass Manager
diff --git a/llvm/test/CodeGen/AArch64/udivmodei5.ll b/llvm/test/CodeGen/AArch64/udivmodei5.ll
new file mode 100644
index 0000000000000..61573779b587d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/udivmodei5.ll
@@ -0,0 +1,44 @@
+; RUN: llc -mtriple=aarch64-linux-gnuabi < %s | FileCheck %s
+
+define i65 @udiv65(i65 %a, i65 %b) nounwind {
+; CHECK-LABEL: udiv65:
+; CHECK-NOT: call
+ %res = udiv i65 %a, %b
+ ret i65 %res
+}
+
+define i129 @udiv129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: udiv129:
+; CHECK-NOT: call
+ %res = udiv i129 %a, %b
+ ret i129 %res
+}
+
+define i129 @urem129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: urem129:
+; CHECK-NOT: call
+ %res = urem i129 %a, %b
+ ret i129 %res
+}
+
+define i129 @sdiv129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: sdiv129:
+; CHECK-NOT: call
+ %res = sdiv i129 %a, %b
+ ret i129 %res
+}
+
+define i129 @srem129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: srem129:
+; CHECK-NOT: call
+ %res = srem i129 %a, %b
+ ret i129 %res
+}
+
+; Some higher sizes
+define i257 @sdiv257(i257 %a, i257 %b) nounwind {
+; CHECK-LABEL: sdiv257:
+; CHECK-NOT: call
+ %res = sdiv i257 %a, %b
+ ret i257 %res
+}
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index 098422a4a770d..f1a12254866c9 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -5,6 +5,7 @@
; CHECK: ModulePass Manager
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
+; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Simplify the CFG
; CHECK-NEXT: Dominator Tree Construction
diff --git a/llvm/test/CodeGen/ARM/udivmodei5.ll b/llvm/test/CodeGen/ARM/udivmodei5.ll
new file mode 100644
index 0000000000000..4f337f340d109
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/udivmodei5.ll
@@ -0,0 +1,44 @@
+; RUN: llc -mtriple=arm-eabi < %s | FileCheck %s
+
+define i65 @udiv65(i65 %a, i65 %b) nounwind {
+; CHECK-LABEL: udiv65:
+; CHECK-NOT: call
+ %res = udiv i65 %a, %b
+ ret i65 %res
+}
+
+define i129 @udiv129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: udiv129:
+; CHECK-NOT: call
+ %res = udiv i129 %a, %b
+ ret i129 %res
+}
+
+define i129 @urem129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: urem129:
+; CHECK-NOT: call
+ %res = urem i129 %a, %b
+ ret i129 %res
+}
+
+define i129 @sdiv129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: sdiv129:
+; CHECK-NOT: call
+ %res = sdiv i129 %a, %b
+ ret i129 %res
+}
+
+define i129 @srem129(i129 %a, i129 %b) nounwind {
+; CHECK-LABEL: srem129:
+; CHECK-NOT: call
+ %res = srem i129 %a, %b
+ ret i129 %res
+}
+
+; Some higher sizes
+define i257 @sdiv257(i257 %a, i257 %b) nounwind {
+; CHECK-LABEL: sdiv257:
+; CHECK-NOT: call
+ %res = sdiv i257 %a, %b
+ ret i257 %res
+}
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
index 1c80d677f78f6..d762a5212fd2c 100644
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -17,6 +17,7 @@
; CHECK-NEXT: ModulePass Manager
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
+; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Lower AMX intrinsics
; CHECK-NEXT: Lower AMX type for load/store
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 1b73acbcb6828..914a2f1032398 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -171,101 +171,8 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind {
define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-LABEL: scalar_i128:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $40, %esp
-; X86-NEXT: movl 44(%ebp), %edi
-; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl 40(%ebp)
-; X86-NEXT: pushl 36(%ebp)
-; X86-NEXT: pushl 32(%ebp)
-; X86-NEXT: pushl 28(%ebp)
-; X86-NEXT: pushl 24(%ebp)
-; X86-NEXT: pushl 20(%ebp)
-; X86-NEXT: pushl 16(%ebp)
-; X86-NEXT: pushl 12(%ebp)
-; X86-NEXT: pushl %eax
-; X86-NEXT: calll __divti3
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: movl %ecx, 12(%edi)
-; X86-NEXT: movl %esi, 8(%edi)
-; X86-NEXT: movl %eax, 4(%edi)
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movl %ebx, (%edx)
-; X86-NEXT: movl 28(%ebp), %eax
-; X86-NEXT: imull %eax, %ecx
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: imull 32(%ebp), %esi
-; X86-NEXT: addl %edx, %esi
-; X86-NEXT: movl 36(%ebp), %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: imull %edi, %ecx
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: movl 40(%ebp), %eax
-; X86-NEXT: imull %ebx, %eax
-; X86-NEXT: addl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl 28(%ebp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull 32(%ebp)
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebx
-; X86-NEXT: setb %cl
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull 32(%ebp)
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl 16(%ebp), %esi
-; X86-NEXT: sbbl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT: movl 20(%ebp), %edi
-; X86-NEXT: sbbl %eax, %edi
-; X86-NEXT: movl 24(%ebp), %ebx
-; X86-NEXT: sbbl %edx, %ebx
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %ebx, 12(%eax)
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl $4
+; X86 doesn't have __divti3, so the sdiv is expanded into a loop.
+; X86: udiv-do-while
;
; X64-LABEL: scalar_i128:
; X64: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 6643ada2f42b4..67650ec1a6e3f 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -171,101 +171,8 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind {
define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-LABEL: scalar_i128:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $40, %esp
-; X86-NEXT: movl 44(%ebp), %edi
-; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pushl 40(%ebp)
-; X86-NEXT: pushl 36(%ebp)
-; X86-NEXT: pushl 32(%ebp)
-; X86-NEXT: pushl 28(%ebp)
-; X86-NEXT: pushl 24(%ebp)
-; X86-NEXT: pushl 20(%ebp)
-; X86-NEXT: pushl 16(%ebp)
-; X86-NEXT: pushl 12(%ebp)
-; X86-NEXT: pushl %eax
-; X86-NEXT: calll __udivti3
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: movl %ecx, 12(%edi)
-; X86-NEXT: movl %esi, 8(%edi)
-; X86-NEXT: movl %eax, 4(%edi)
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movl %ebx, (%edx)
-; X86-NEXT: movl 28(%ebp), %eax
-; X86-NEXT: imull %eax, %ecx
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: imull 32(%ebp), %esi
-; X86-NEXT: addl %edx, %esi
-; X86-NEXT: movl 36(%ebp), %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: imull %edi, %ecx
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: movl 40(%ebp), %eax
-; X86-NEXT: imull %ebx, %eax
-; X86-NEXT: addl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl 28(%ebp), %ecx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull 32(%ebp)
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebx
-; X86-NEXT: setb %cl
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull 32(%ebp)
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl 16(%ebp), %esi
-; X86-NEXT: sbbl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT: movl 20(%ebp), %edi
-; X86-NEXT: sbbl %eax, %edi
-; X86-NEXT: movl 24(%ebp), %ebx
-; X86-NEXT: sbbl %edx, %ebx
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %ebx, 12(%eax)
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl $4
+; X86 doesn't have __udivti3, so the udiv is expanded into a loop.
+; X86: udiv-do-while
;
; X64-LABEL: scalar_i128:
; X64: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/i128-sdiv.ll b/llvm/test/CodeGen/X86/i128-sdiv.ll
index 5e0c79a229794..717f52f198ee8 100644
--- a/llvm/test/CodeGen/X86/i128-sdiv.ll
+++ b/llvm/test/CodeGen/X86/i128-sdiv.ll
@@ -107,40 +107,8 @@ define i128 @test2(i128 %x) nounwind {
define i128 @test3(i128 %x) nounwind {
; X86-LABEL: test3:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: movl %esp, %eax
-; X86-NEXT: pushl $-1
-; X86-NEXT: pushl $-5
-; X86-NEXT: pushl $-1
-; X86-NEXT: pushl $-3
-; X86-NEXT: pushl 24(%ebp)
-; X86-NEXT: pushl 20(%ebp)
-; X86-NEXT: pushl 16(%ebp)
-; X86-NEXT: pushl 12(%ebp)
-; X86-NEXT: pushl %eax
-; X86-NEXT: calll __divti3
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: movl (%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 12(%esi)
-; X86-NEXT: movl %edx, 8(%esi)
-; X86-NEXT: movl %ecx, 4(%esi)
-; X86-NEXT: movl %eax, (%esi)
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: leal -8(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl $4
+; X86 doesn't have __divti3, so the sdiv is expanded into a loop.
+; X86: udiv-do-while
;
; X64-LABEL: test3:
; X64: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/i128-udiv.ll b/llvm/test/CodeGen/X86/i128-udiv.ll
index 05049dc6254a6..3f890b7f2443a 100644
--- a/llvm/test/CodeGen/X86/i128-udiv.ll
+++ b/llvm/test/CodeGen/X86/i128-udiv.ll
@@ -31,40 +31,8 @@ define i128 @test1(i128 %x) nounwind {
define i128 @test2(i128 %x) nounwind {
; X86-LABEL: test2:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: movl %esp, %eax
-; X86-NEXT: pushl $-1
-; X86-NEXT: pushl $-4
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl 24(%ebp)
-; X86-NEXT: pushl 20(%ebp)
-; X86-NEXT: pushl 16(%ebp)
-; X86-NEXT: pushl 12(%ebp)
-; X86-NEXT: pushl %eax
-; X86-NEXT: calll __udivti3
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: movl (%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 12(%esi)
-; X86-NEXT: movl %edx, 8(%esi)
-; X86-NEXT: movl %ecx, 4(%esi)
-; X86-NEXT: movl %eax, (%esi)
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: leal -8(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl $4
+; X86 doesn't have __udivti3, so the udiv is expanded into a loop.
+; X86: udiv-do-while
;
; X64-LABEL: test2:
; X64: # %bb.0:
@@ -80,40 +48,8 @@ define i128 @test2(i128 %x) nounwind {
define i128 @test3(i128 %x) nounwind {
; X86-LABEL: test3:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: movl %esp, %eax
-; X86-NEXT: pushl $-1
-; X86-NEXT: pushl $-5
-; X86-NEXT: pushl $-1
-; X86-NEXT: pushl $-3
-; X86-NEXT: pushl 24(%ebp)
-; X86-NEXT: pushl 20(%ebp)
-; X86-NEXT: pushl 16(%ebp)
-; X86-NEXT: pushl 12(%ebp)
-; X86-NEXT: pushl %eax
-; X86-NEXT: calll __udivti3
-; X86-NEXT: addl $32, %esp
-; X86-NEXT: movl (%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 12(%esi)
-; X86-NEXT: movl %edx, 8(%esi)
-; X86-NEXT: movl %ecx, 4(%esi)
-; X86-NEXT: movl %eax, (%esi)
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: leal -8(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl $4
+; X86 doesn't have __udivti3, so the udiv is expanded into a loop.
+; X86: udiv-do-while
;
; X64-LABEL: test3:
; X64: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/libcall-sret.ll b/llvm/test/CodeGen/X86/libcall-sret.ll
deleted file mode 100644
index 661c631fc945e..0000000000000
--- a/llvm/test/CodeGen/X86/libcall-sret.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: llc -mtriple=i686-linux-gnu -o - %s | FileCheck %s
-
- at var = global i128 0
-
-; We were trying to convert the i128 operation into a libcall, but failing to
-; perform sret demotion when we couldn't return the result in registers. Make
-; sure we marshal the return properly:
-
-define void @test_sret_libcall(i128 %l, i128 %r) {
-; CHECK-LABEL: test_sret_libcall:
-
- ; Stack for call: 4(sret ptr), 16(i128 %l), 16(128 %r). So next logical
- ; (aligned) place for the actual sret data is %esp + 20.
-; CHECK: leal 20(%esp), [[SRET_ADDR:%[a-z]+]]
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl 72(%esp)
-; CHECK: pushl [[SRET_ADDR]]
-
-; CHECK: calll __udivti3
-
-; CHECK: addl $44, %esp
-; CHECK-DAG: movl 8(%esp), [[RES0:%[a-z]+]]
-; CHECK-DAG: movl 12(%esp), [[RES1:%[a-z]+]]
-; CHECK-DAG: movl 16(%esp), [[RES2:%[a-z]+]]
-; CHECK-DAG: movl 20(%esp), [[RES3:%[a-z]+]]
-; CHECK-DAG: movl [[RES0]], var
-; CHECK-DAG: movl [[RES1]], var+4
-; CHECK-DAG: movl [[RES2]], var+8
-; CHECK-DAG: movl [[RES3]], var+12
- %quot = udiv i128 %l, %r
- store i128 %quot, ptr @var
- ret void
-}
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 3f9acba27810f..f9952db9d4cb5 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -26,6 +26,7 @@
; CHECK-NEXT: ModulePass Manager
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
+; CHECK-NEXT: Expand large div/rem
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: Lower AMX intrinsics
; CHECK-NEXT: Lower AMX type for load/store
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index 8736d8e91e768..97f5985cf9092 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -13,26 +13,6 @@ define void @f() {
; X64-NEXT: movq %rax, (%rax)
; X64-NEXT: movb $0, (%rax)
; X64-NEXT: retq
-;
-; X86-LABEL: f:
-; X86: # %bb.0: # %BB
-; X86-NEXT: pushl %ebp
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %ebp, -8
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: .cfi_def_cfa_register %ebp
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movzbl (%eax), %eax
-; X86-NEXT: cmpb $0, (%eax)
-; X86-NEXT: setne (%eax)
-; X86-NEXT: leal -{{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, (%eax)
-; X86-NEXT: movb $0, (%eax)
-; X86-NEXT: movl %ebp, %esp
-; X86-NEXT: popl %ebp
-; X86-NEXT: .cfi_def_cfa %esp, 4
-; X86-NEXT: retl
BB:
%A30 = alloca i66
%L17 = load i66, ptr %A30
diff --git a/llvm/test/CodeGen/X86/udivmodei5.ll b/llvm/test/CodeGen/X86/udivmodei5.ll
new file mode 100644
index 0000000000000..2c30357180e40
--- /dev/null
+++ b/llvm/test/CodeGen/X86/udivmodei5.ll
@@ -0,0 +1,70 @@
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
+
+; On i686, this is expanded into a loop. On x86_64, this calls __udivti3.
+define i65 @udiv65(i65 %a, i65 %b) nounwind {
+; X86-LABEL: udiv65:
+; X86-NOT: call
+;
+; X64-LABEL: udiv65:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: andl $1, %esi
+; X64-NEXT: andl $1, %ecx
+; X64-NEXT: callq __udivti3 at PLT
+; X64-NEXT: popq %rcx
+; X64-NEXT: retq
+ %res = udiv i65 %a, %b
+ ret i65 %res
+}
+
+define i129 @udiv129(i129 %a, i129 %b) nounwind {
+; X86-LABEL: udiv129:
+; X86-NOT: call
+;
+; X64-LABEL: udiv129:
+; X64-NOT: call
+ %res = udiv i129 %a, %b
+ ret i129 %res
+}
+
+define i129 @urem129(i129 %a, i129 %b) nounwind {
+; X86-LABEL: urem129:
+; X86-NOT: call
+;
+; X64-LABEL: urem129:
+; X64-NOT: call
+ %res = urem i129 %a, %b
+ ret i129 %res
+}
+
+define i129 @sdiv129(i129 %a, i129 %b) nounwind {
+; X86-LABEL: sdiv129:
+; X86-NOT: call
+;
+; X64-LABEL: sdiv129:
+; X64-NOT: call
+ %res = sdiv i129 %a, %b
+ ret i129 %res
+}
+
+define i129 @srem129(i129 %a, i129 %b) nounwind {
+; X86-LABEL: srem129:
+; X86-NOT: call
+;
+; X64-LABEL: srem129:
+; X64-NOT: call
+ %res = srem i129 %a, %b
+ ret i129 %res
+}
+
+; Some higher sizes
+define i257 @sdiv257(i257 %a, i257 %b) nounwind {
+; X86-LABEL: sdiv257:
+; X86-NOT: call
+;
+; X64-LABEL: sdiv257:
+; X64-NOT: call
+ %res = sdiv i257 %a, %b
+ ret i257 %res
+}
More information about the llvm-commits
mailing list