[llvm] 3c84819 - CodeGen: Expand memory intrinsics in PreISelIntrinsicLowering
Author: Matt Arsenault
Date: 2023-06-09T21:04:37-04:00
New Revision: 3c848194f28decca41b7362f9dd35d4939797724
URL: https://github.com/llvm/llvm-project/commit/3c848194f28decca41b7362f9dd35d4939797724
DIFF: https://github.com/llvm/llvm-project/commit/3c848194f28decca41b7362f9dd35d4939797724.diff
LOG: CodeGen: Expand memory intrinsics in PreISelIntrinsicLowering
Expand large or unknown-size memory intrinsics into loops in the
default lowering pipeline if the target doesn't have the corresponding
libfunc. Previously, AMDGPU had a custom pass that existed only to call
the expansion utilities.
With a default no-libcall expansion path, we can remove the libfunc
checks in LoopIdiomRecognize for these intrinsics, which never made any
sense. This also provides a path to lifting the immarg restriction on
llvm.memcpy.inline.
There seems to be a bug where TLI reports functions as available if
you use -march and not -mtriple.
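For illustration (a hand-written sketch, not output captured from the
new pass), on a target where TLI does not report memcpy as available, a
statically sized call above the threshold such as
  declare void @llvm.memcpy.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1 immarg)
  define void @copy(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
    call void @llvm.memcpy.p1.p1.i32(ptr addrspace(1) %dst, ptr addrspace(1) %src, i32 4096, i1 false)
    ret void
  }
is now rewritten in PreISelIntrinsicLowering into a load/store loop
roughly of the form (preheader branch omitted):
  loadstoreloop:
    %idx = phi i32 [ 0, %entry ], [ %idx.next, %loadstoreloop ]
    %src.gep = getelementptr inbounds i8, ptr addrspace(1) %src, i32 %idx
    %val = load i8, ptr addrspace(1) %src.gep, align 1
    %dst.gep = getelementptr inbounds i8, ptr addrspace(1) %dst, i32 %idx
    store i8 %val, ptr addrspace(1) %dst.gep, align 1
    %idx.next = add i32 %idx, 1
    %cont = icmp ult i32 %idx.next, 4096
    br i1 %cont, label %loadstoreloop, label %split
The real expansion consults TTI::getMemcpyLoopLoweringType to pick a
wider access type, and calls of known size at or below
TTI::getMaxMemIntrinsicInlineSizeThreshold (or with the libcall
available) are left as intrinsics for the usual SelectionDAG lowering.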
Added:
Modified:
llvm/docs/ReleaseNotes.rst
llvm/include/llvm/Analysis/TargetTransformInfo.h
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
llvm/lib/Analysis/TargetTransformInfo.cpp
llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
llvm/lib/CodeGen/TargetPassConfig.cpp
llvm/lib/Target/AMDGPU/AMDGPU.h
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
llvm/lib/Target/AMDGPU/CMakeLists.txt
llvm/lib/Target/ARM/ARMTargetTransformInfo.h
llvm/lib/Target/X86/X86TargetTransformInfo.h
llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll
llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll
llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll
llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
Removed:
llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
################################################################################
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 6176439deb166..4dd483f68544a 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -268,14 +268,12 @@ Changes to the C API
* ``LLVMConstSelect``
-Changes to the FastISel infrastructure
---------------------------------------
-
-* ...
-
-Changes to the DAG infrastructure
----------------------------------
+Changes to the CodeGen infrastructure
+-------------------------------------
+* ``llvm.memcpy``, ``llvm.memmove`` and ``llvm.memset`` are now
+  expanded into loops by default for targets which do not report that
+  the corresponding library function is available.
Changes to the Metadata Info
---------------------------------
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 0e10162a6435d..2a5953f3e0b1e 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -356,6 +356,10 @@ class TargetTransformInfo {
/// source/destination type and alignment and the number of bytes copied.
InstructionCost getMemcpyCost(const Instruction *I) const;
+ /// Returns the maximum memset / memcpy size in bytes that still makes it
+ /// profitable to inline the call.
+ uint64_t getMaxMemIntrinsicInlineSizeThreshold() const;
+
/// \return The estimated number of case clusters when lowering \p 'SI'.
/// \p JTSize Set a jump table size only when \p SI is suitable for a jump
/// table.
@@ -1673,6 +1677,7 @@ class TargetTransformInfo::Concept {
virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0;
virtual int getInlinerVectorBonusPercent() const = 0;
virtual InstructionCost getMemcpyCost(const Instruction *I) = 0;
+ virtual uint64_t getMaxMemIntrinsicInlineSizeThreshold() const = 0;
virtual unsigned
getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JTSize,
ProfileSummaryInfo *PSI,
@@ -2044,6 +2049,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
InstructionCost getMemcpyCost(const Instruction *I) override {
return Impl.getMemcpyCost(I);
}
+
+ uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override {
+ return Impl.getMaxMemIntrinsicInlineSizeThreshold();
+ }
+
InstructionCost getInstructionCost(const User *U,
ArrayRef<const Value *> Operands,
TargetCostKind CostKind) override {
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index d7b1538d640eb..de94e33f1ad02 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -77,6 +77,10 @@ class TargetTransformInfoImplBase {
return TTI::TCC_Expensive;
}
+ uint64_t getMaxMemIntrinsicInlineSizeThreshold() const {
+ return 64;
+ }
+
// Although this default value is arbitrary, it is not random. It is assumed
// that a condition that evaluates the same way by a higher percentage than
// this is best represented as control flow. Therefore, the default value N
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index a7b89b2df4527..e1bb963fd4654 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1035,6 +1035,10 @@ InstructionCost TargetTransformInfo::getMemcpyCost(const Instruction *I) const {
return Cost;
}
+uint64_t TargetTransformInfo::getMaxMemIntrinsicInlineSizeThreshold() const {
+ return TTIImpl->getMaxMemIntrinsicInlineSizeThreshold();
+}
+
InstructionCost TargetTransformInfo::getArithmeticReductionCost(
unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF,
TTI::TargetCostKind CostKind) const {
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 228557b7a74ca..0bdb6b59d3ac5 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -6,14 +6,16 @@
//
//===----------------------------------------------------------------------===//
//
-// This pass implements IR lowering for the llvm.load.relative and llvm.objc.*
-// intrinsics.
+// This pass implements IR lowering for the llvm.memcpy, llvm.memmove,
+// llvm.memset, llvm.load.relative and llvm.objc.* intrinsics.
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/PreISelIntrinsicLowering.h"
#include "llvm/Analysis/ObjCARCInstKind.h"
#include "llvm/Analysis/ObjCARCUtil.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
@@ -24,9 +26,44 @@
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
using namespace llvm;
+/// Threshold for leaving statically sized memory intrinsic calls alone. Calls
+/// of known size larger than this, and calls of unknown size, are expanded by
+/// the pass. Calls of smaller known size are left for expansion in codegen.
+static cl::opt<int64_t> MemIntrinsicExpandSizeThresholdOpt(
+ "mem-intrinsic-expand-size",
+ cl::desc("Set minimum mem intrinsic size to expand in IR"), cl::init(-1),
+ cl::Hidden);
+
+namespace {
+
+struct PreISelIntrinsicLowering {
+ const function_ref<TargetTransformInfo &(Function &)> LookupTTI;
+ const function_ref<TargetLibraryInfo &(Function &)> LookupLibInfo;
+
+  /// If this is true, assume it's preferable to leave memory intrinsic calls
+ /// for replacement with a library call later. Otherwise this depends on
+ /// TargetLibraryInfo availability of the corresponding function.
+ const bool UseMemIntrinsicLibFunc;
+
+ explicit PreISelIntrinsicLowering(
+ function_ref<TargetTransformInfo &(Function &)> LookupTTI_,
+ function_ref<TargetLibraryInfo &(Function &)> LookupLibInfo_,
+ bool UseMemIntrinsicLibFunc_ = true)
+ : LookupTTI(LookupTTI_), LookupLibInfo(LookupLibInfo_),
+ UseMemIntrinsicLibFunc(UseMemIntrinsicLibFunc_) {}
+
+ static bool shouldExpandMemIntrinsicWithSize(Value *Size,
+ const TargetTransformInfo &TTI);
+ bool expandMemIntrinsicUses(Function &F) const;
+ bool lowerIntrinsics(Module &M) const;
+};
+
+} // namespace
+
static bool lowerLoadRelative(Function &F) {
if (F.use_empty())
return false;
@@ -133,12 +170,100 @@ static bool lowerObjCCall(Function &F, const char *NewFn,
return true;
}
-static bool lowerIntrinsics(Module &M) {
+// TODO: Should refine based on estimated number of accesses (e.g. does it
+// require splitting based on alignment)
+bool PreISelIntrinsicLowering::shouldExpandMemIntrinsicWithSize(
+ Value *Size, const TargetTransformInfo &TTI) {
+ ConstantInt *CI = dyn_cast<ConstantInt>(Size);
+ if (!CI)
+ return true;
+ uint64_t Threshold = MemIntrinsicExpandSizeThresholdOpt.getNumOccurrences()
+ ? MemIntrinsicExpandSizeThresholdOpt
+ : TTI.getMaxMemIntrinsicInlineSizeThreshold();
+ uint64_t SizeVal = CI->getZExtValue();
+
+ // Treat a threshold of 0 as a special case to force expansion of all
+ // intrinsics, including size 0.
+ return SizeVal > Threshold || Threshold == 0;
+}
+
+// TODO: Handle atomic memcpy and memcpy.inline
+// TODO: Pass ScalarEvolution
+bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
+ Intrinsic::ID ID = F.getIntrinsicID();
+ bool Changed = false;
+
+ for (User *U : llvm::make_early_inc_range(F.users())) {
+ Instruction *Inst = cast<Instruction>(U);
+
+ switch (ID) {
+ case Intrinsic::memcpy: {
+ auto *Memcpy = cast<MemCpyInst>(Inst);
+ Function *ParentFunc = Memcpy->getFunction();
+ const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
+ if (shouldExpandMemIntrinsicWithSize(Memcpy->getLength(), TTI)) {
+ if (UseMemIntrinsicLibFunc &&
+ LookupLibInfo(*ParentFunc).has(LibFunc_memcpy))
+ break;
+
+ expandMemCpyAsLoop(Memcpy, TTI);
+ Changed = true;
+ Memcpy->eraseFromParent();
+ }
+
+ break;
+ }
+ case Intrinsic::memmove: {
+ auto *Memmove = cast<MemMoveInst>(Inst);
+ Function *ParentFunc = Memmove->getFunction();
+ const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
+ if (shouldExpandMemIntrinsicWithSize(Memmove->getLength(), TTI)) {
+ if (UseMemIntrinsicLibFunc &&
+ LookupLibInfo(*ParentFunc).has(LibFunc_memmove))
+ break;
+
+ expandMemMoveAsLoop(Memmove);
+ Changed = true;
+ Memmove->eraseFromParent();
+ }
+
+ break;
+ }
+ case Intrinsic::memset: {
+ auto *Memset = cast<MemSetInst>(Inst);
+ Function *ParentFunc = Memset->getFunction();
+ const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
+ if (shouldExpandMemIntrinsicWithSize(Memset->getLength(), TTI)) {
+ if (UseMemIntrinsicLibFunc &&
+ LookupLibInfo(*Memset->getFunction()).has(LibFunc_memset))
+ break;
+
+ expandMemSetAsLoop(Memset);
+ Changed = true;
+ Memset->eraseFromParent();
+ }
+
+ break;
+ }
+ default:
+ llvm_unreachable("unhandled intrinsic");
+ }
+ }
+
+ return Changed;
+}
+
+bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
bool Changed = false;
for (Function &F : M) {
switch (F.getIntrinsicID()) {
default:
break;
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove:
+ case Intrinsic::memset:
+ Changed |= expandMemIntrinsicUses(F);
+ break;
case Intrinsic::load_relative:
Changed |= lowerLoadRelative(F);
break;
@@ -230,7 +355,23 @@ class PreISelIntrinsicLoweringLegacyPass : public ModulePass {
PreISelIntrinsicLoweringLegacyPass() : ModulePass(ID) {}
- bool runOnModule(Module &M) override { return lowerIntrinsics(M); }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ bool runOnModule(Module &M) override {
+ auto LookupTTI = [this](Function &F) -> TargetTransformInfo & {
+ return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ };
+
+ auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & {
+ return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
+
+ PreISelIntrinsicLowering Lowering(LookupTTI, LookupTLI);
+ return Lowering.lowerIntrinsics(M);
+ }
};
} // end anonymous namespace
@@ -247,7 +388,18 @@ ModulePass *llvm::createPreISelIntrinsicLoweringPass() {
PreservedAnalyses PreISelIntrinsicLoweringPass::run(Module &M,
ModuleAnalysisManager &AM) {
- if (!lowerIntrinsics(M))
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
+ auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+
+ auto LookupTTI = [&FAM](Function &F) -> TargetTransformInfo & {
+ return FAM.getResult<TargetIRAnalysis>(F);
+ };
+
+ PreISelIntrinsicLowering Lowering(LookupTTI, LookupTLI);
+ if (!Lowering.lowerIntrinsics(M))
return PreservedAnalyses::all();
else
return PreservedAnalyses::none();
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 86490c0d6417d..8ece4c764f619 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1088,8 +1088,8 @@ bool TargetPassConfig::addISelPasses() {
if (TM->useEmulatedTLS())
addPass(createLowerEmuTLSPass());
- addPass(createPreISelIntrinsicLoweringPass());
PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
+ addPass(createPreISelIntrinsicLoweringPass());
addPass(createExpandLargeDivRemPass());
addPass(createExpandLargeFpConvertPass());
addIRPasses();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 887aca9c2c095..3e15fc07c71d8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -90,10 +90,6 @@ FunctionPass *createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy);
void initializeAMDGPUAtomicOptimizerPass(PassRegistry &);
extern char &AMDGPUAtomicOptimizerID;
-ModulePass *createAMDGPULowerIntrinsicsPass();
-void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
-extern char &AMDGPULowerIntrinsicsID;
-
ModulePass *createAMDGPUCtorDtorLoweringLegacyPass();
void initializeAMDGPUCtorDtorLoweringLegacyPass(PassRegistry &);
extern char &AMDGPUCtorDtorLoweringLegacyPassID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
deleted file mode 100644
index f9b21e07ed6a7..0000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ /dev/null
@@ -1,144 +0,0 @@
-//===-- AMDGPULowerIntrinsics.cpp -----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IntrinsicsR600.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
-
-#define DEBUG_TYPE "amdgpu-lower-intrinsics"
-
-using namespace llvm;
-
-namespace {
-
-static int MaxStaticSize;
-
-static cl::opt<int, true> MemIntrinsicExpandSizeThresholdOpt(
- "amdgpu-mem-intrinsic-expand-size",
- cl::desc("Set minimum mem intrinsic size to expand in IR"),
- cl::location(MaxStaticSize),
- cl::init(1024),
- cl::Hidden);
-
-
-class AMDGPULowerIntrinsics : public ModulePass {
-public:
- static char ID;
-
- AMDGPULowerIntrinsics() : ModulePass(ID) {}
-
- bool runOnModule(Module &M) override;
- bool expandMemIntrinsicUses(Function &F);
- StringRef getPassName() const override {
- return "AMDGPU Lower Intrinsics";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetTransformInfoWrapperPass>();
- }
-};
-
-}
-
-char AMDGPULowerIntrinsics::ID = 0;
-
-char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID;
-
-INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, "Lower intrinsics", false,
- false)
-
-// TODO: Should refine based on estimated number of accesses (e.g. does it
-// require splitting based on alignment)
-static bool shouldExpandOperationWithSize(Value *Size) {
- ConstantInt *CI = dyn_cast<ConstantInt>(Size);
- return !CI || (CI->getSExtValue() > MaxStaticSize);
-}
-
-bool AMDGPULowerIntrinsics::expandMemIntrinsicUses(Function &F) {
- Intrinsic::ID ID = F.getIntrinsicID();
- bool Changed = false;
-
- for (User *U : llvm::make_early_inc_range(F.users())) {
- Instruction *Inst = cast<Instruction>(U);
-
- switch (ID) {
- case Intrinsic::memcpy: {
- auto *Memcpy = cast<MemCpyInst>(Inst);
- if (shouldExpandOperationWithSize(Memcpy->getLength())) {
- Function *ParentFunc = Memcpy->getParent()->getParent();
- const TargetTransformInfo &TTI =
- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*ParentFunc);
- expandMemCpyAsLoop(Memcpy, TTI);
- Changed = true;
- Memcpy->eraseFromParent();
- }
-
- break;
- }
- case Intrinsic::memmove: {
- auto *Memmove = cast<MemMoveInst>(Inst);
- if (shouldExpandOperationWithSize(Memmove->getLength())) {
- expandMemMoveAsLoop(Memmove);
- Changed = true;
- Memmove->eraseFromParent();
- }
-
- break;
- }
- case Intrinsic::memset: {
- auto *Memset = cast<MemSetInst>(Inst);
- if (shouldExpandOperationWithSize(Memset->getLength())) {
- expandMemSetAsLoop(Memset);
- Changed = true;
- Memset->eraseFromParent();
- }
-
- break;
- }
- default:
- break;
- }
- }
-
- return Changed;
-}
-
-bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
- bool Changed = false;
-
- for (Function &F : M) {
- if (!F.isDeclaration())
- continue;
-
- switch (F.getIntrinsicID()) {
- case Intrinsic::memcpy:
- case Intrinsic::memmove:
- case Intrinsic::memset:
- if (expandMemIntrinsicUses(F))
- Changed = true;
- break;
- default:
- break;
- }
- }
-
- return Changed;
-}
-
-ModulePass *llvm::createAMDGPULowerIntrinsicsPass() {
- return new AMDGPULowerIntrinsics();
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 26ab00ab29833..15373d0d2b58d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -388,7 +388,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPULowerKernelArgumentsPass(*PR);
initializeAMDGPUPromoteKernelArgumentsPass(*PR);
initializeAMDGPULowerKernelAttributesPass(*PR);
- initializeAMDGPULowerIntrinsicsPass(*PR);
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
initializeAMDGPUPostLegalizerCombinerPass(*PR);
initializeAMDGPUPreLegalizerCombinerPass(*PR);
@@ -998,8 +997,6 @@ void AMDGPUPassConfig::addIRPasses() {
// A call to propagate attributes pass in the backend in case opt was not run.
addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
- addPass(createAMDGPULowerIntrinsicsPass());
-
// Function calls are not supported, so make sure we inline everything.
addPass(createAMDGPUAlwaysInlinePass());
addPass(createAlwaysInlinerLegacyPass());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index c2fd67790d9b6..5a9e87deecc14 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -267,6 +267,10 @@ void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
BaseT::getPeelingPreferences(L, SE, PP);
}
+uint64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
+  return 1024;
+}
+
const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
// Codegen control options which don't matter.
AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
@@ -395,6 +399,10 @@ bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
+uint64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
+  return 1024;
+}
+
// FIXME: Really we would like to issue multiple 128-bit loads and stores per
// iteration. Should we report a larger size and let it legalize?
//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 630804f169bfc..27fb65154fc6c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -55,6 +55,8 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP);
+
+  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const;
};
class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
@@ -132,6 +134,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
unsigned AddrSpace) const;
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const;
+
+  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const;
Type *getMemcpyLoopLoweringType(
LLVMContext & Context, Value * Length, unsigned SrcAddrSpace,
unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 02a6d1f01333d..8df156f24dcb6 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -67,7 +67,6 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULegalizerInfo.cpp
AMDGPULibCalls.cpp
AMDGPULibFunc.cpp
- AMDGPULowerIntrinsics.cpp
AMDGPULowerKernelArguments.cpp
AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 1453450656ad5..f8dae8e5041aa 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -210,6 +210,10 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
InstructionCost getMemcpyCost(const Instruction *I);
+ uint64_t getMaxMemIntrinsicInlineSizeThreshold() const {
+ return ST->getMaxInlineSizeThreshold();
+ }
+
int getNumMemOps(const IntrinsicInst *I) const;
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 2c5105d2f03f1..857d95eb65839 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -273,6 +273,11 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
const Function *Callee) const;
bool areTypesABICompatible(const Function *Caller, const Function *Callee,
const ArrayRef<Type *> &Type) const;
+
+ uint64_t getMaxMemIntrinsicInlineSizeThreshold() const {
+ return ST->getMaxInlineSizeThreshold();
+ }
+
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const;
bool prefersVectorizedAddressing() const;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index 48c1ef46c1976..c7d82551530fc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -59,62 +59,40 @@ define amdgpu_kernel void @kernel_caller_stack() {
define amdgpu_kernel void @kernel_caller_byval() {
; MUBUF-LABEL: kernel_caller_byval:
-; MUBUF: ; %bb.0:
+; MUBUF: ; %bb.0: ; %loadstoreloop.preheader
; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; MUBUF-NEXT: s_add_u32 s0, s0, s7
+; MUBUF-NEXT: s_mov_b32 s5, 0
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; MUBUF-NEXT: s_movk_i32 s4, 0x80
; MUBUF-NEXT: v_mov_b32_e32 v0, 0
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:20
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:24
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:28
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:32
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:36
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:40
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:44
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:48
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:52
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:56
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:60
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:64
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:68
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:72
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:76
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:80
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:84
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:88
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:92
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:96
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:100
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:104
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:108
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:112
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:116
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:120
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:124
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128
-; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:132
-; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8
-; MUBUF-NEXT: s_nop 0
-; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:12
-; MUBUF-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:16
-; MUBUF-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:20
-; MUBUF-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:24
-; MUBUF-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:28
-; MUBUF-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:32
-; MUBUF-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:36
-; MUBUF-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:40
-; MUBUF-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:44
-; MUBUF-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:48
-; MUBUF-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:52
-; MUBUF-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:56
-; MUBUF-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:60
-; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:64
-; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:68
+; MUBUF-NEXT: v_mov_b32_e32 v1, s5
; MUBUF-NEXT: s_movk_i32 s32, 0x1400
+; MUBUF-NEXT: .LBB1_1: ; %loadstoreloop
+; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1
+; MUBUF-NEXT: v_add_u32_e32 v2, 4, v1
+; MUBUF-NEXT: v_add_u32_e32 v1, 1, v1
+; MUBUF-NEXT: v_cmp_gt_u32_e32 vcc, s4, v1
+; MUBUF-NEXT: buffer_store_byte v0, v2, s[0:3], 0 offen
+; MUBUF-NEXT: s_cbranch_vccnz .LBB1_1
+; MUBUF-NEXT: ; %bb.2: ; %split
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
+; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8
+; MUBUF-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:12
+; MUBUF-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:16
+; MUBUF-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:20
+; MUBUF-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:24
+; MUBUF-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:28
+; MUBUF-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:32
+; MUBUF-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:36
+; MUBUF-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:40
+; MUBUF-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:44
+; MUBUF-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:48
+; MUBUF-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:52
+; MUBUF-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:56
+; MUBUF-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:60
+; MUBUF-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:64
; MUBUF-NEXT: s_getpc_b64 s[4:5]
; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
@@ -154,38 +132,31 @@ define amdgpu_kernel void @kernel_caller_byval() {
; MUBUF-NEXT: s_endpgm
;
; FLATSCR-LABEL: kernel_caller_byval:
-; FLATSCR: ; %bb.0:
+; FLATSCR: ; %bb.0: ; %loadstoreloop.preheader
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
-; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
-; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
-; FLATSCR-NEXT: s_mov_b32 s0, 0
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:8
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:24
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:32
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:40
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:48
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:56
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:64
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:72
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:80
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:88
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:96
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:104
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:112
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:120
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:128
-; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:8
-; FLATSCR-NEXT: s_nop 0
-; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:16
-; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:24
-; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s0 offset:32
-; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:40
-; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:48
-; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s0 offset:56
-; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s0 offset:64
+; FLATSCR-NEXT: s_mov_b32 s1, 0
+; FLATSCR-NEXT: s_movk_i32 s0, 0x80
+; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
+; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
; FLATSCR-NEXT: s_movk_i32 s32, 0x50
+; FLATSCR-NEXT: .LBB1_1: ; %loadstoreloop
+; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
+; FLATSCR-NEXT: v_add_u32_e32 v2, 4, v1
+; FLATSCR-NEXT: v_add_u32_e32 v1, 1, v1
+; FLATSCR-NEXT: v_cmp_gt_u32_e32 vcc, s0, v1
+; FLATSCR-NEXT: scratch_store_byte v2, v0, off
+; FLATSCR-NEXT: s_cbranch_vccnz .LBB1_1
+; FLATSCR-NEXT: ; %bb.2: ; %split
+; FLATSCR-NEXT: s_mov_b32 s0, 0
+; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:4
+; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:12
+; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:20
+; FLATSCR-NEXT: scratch_load_dwordx2 v[6:7], off, s0 offset:28
+; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:36
+; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:44
+; FLATSCR-NEXT: scratch_load_dwordx2 v[12:13], off, s0 offset:52
+; FLATSCR-NEXT: scratch_load_dwordx2 v[14:15], off, s0 offset:60
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll
index 49502b3435609..bf956c3ca8239 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=GCN %s
declare void @llvm.memcpy.inline.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
index bb7770701a631..6b05455613515 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=19 %s -o - | FileCheck -check-prefix=LOOP %s
-; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=21 %s -o - | FileCheck -check-prefix=UNROLL %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=19 %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=21 %s -o - | FileCheck -check-prefix=UNROLL %s
declare void @llvm.memcpy.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
index c7a1e163c04aa..466147cac3439 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
-; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
declare void @llvm.memmove.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
index dd3f9f12111dc..7cd3babc70909 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
-; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
declare void @llvm.memset.p1.i32(ptr addrspace(1), i8, i32, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 20282ff2992b9..36e1476f7de8d 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -34,7 +34,6 @@
; GCN-O0-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O0-NEXT: FunctionPass Manager
; GCN-O0-NEXT: Early propagate attributes from kernels to functions
-; GCN-O0-NEXT: AMDGPU Lower Intrinsics
; GCN-O0-NEXT: AMDGPU Inline All Functions
; GCN-O0-NEXT: Inliner for always_inline functions
; GCN-O0-NEXT: FunctionPass Manager
@@ -182,7 +181,6 @@
; GCN-O1-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: Early propagate attributes from kernels to functions
-; GCN-O1-NEXT: AMDGPU Lower Intrinsics
; GCN-O1-NEXT: AMDGPU Inline All Functions
; GCN-O1-NEXT: Inliner for always_inline functions
; GCN-O1-NEXT: FunctionPass Manager
@@ -458,7 +456,6 @@
; GCN-O1-OPTS-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O1-OPTS-NEXT: FunctionPass Manager
; GCN-O1-OPTS-NEXT: Early propagate attributes from kernels to functions
-; GCN-O1-OPTS-NEXT: AMDGPU Lower Intrinsics
; GCN-O1-OPTS-NEXT: AMDGPU Inline All Functions
; GCN-O1-OPTS-NEXT: Inliner for always_inline functions
; GCN-O1-OPTS-NEXT: FunctionPass Manager
@@ -766,7 +763,6 @@
; GCN-O2-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: Early propagate attributes from kernels to functions
-; GCN-O2-NEXT: AMDGPU Lower Intrinsics
; GCN-O2-NEXT: AMDGPU Inline All Functions
; GCN-O2-NEXT: Inliner for always_inline functions
; GCN-O2-NEXT: FunctionPass Manager
@@ -1077,7 +1073,6 @@
; GCN-O3-NEXT: Lower ctors and dtors for AMDGPU
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: Early propagate attributes from kernels to functions
-; GCN-O3-NEXT: AMDGPU Lower Intrinsics
; GCN-O3-NEXT: AMDGPU Inline All Functions
; GCN-O3-NEXT: Inliner for always_inline functions
; GCN-O3-NEXT: FunctionPass Manager
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll
index cd720e93a48f2..e9d42dc70cbb9 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=8 %s | FileCheck -check-prefix=OPT8 %s
-; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=4 %s | FileCheck -check-prefix=OPT4 %s
-; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=0 %s | FileCheck -check-prefix=OPT0 %s
-; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefix=OPT_NEG %s
+; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=8 %s | FileCheck -check-prefix=OPT8 %s
+; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=4 %s | FileCheck -check-prefix=OPT4 %s
+; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefix=OPT0 %s
+; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefix=OPT_NEG %s
-; Test the -amdgpu-mem-intrinsic-expand-size flag works.
+; Test the -mem-intrinsic-expand-size flag works.
; Make sure we can always eliminate the intrinsic, even at 0.
define amdgpu_kernel void @memset_size_0(ptr addrspace(1) %dst, i8 %val) {
@@ -17,19 +17,19 @@ define amdgpu_kernel void @memset_size_0(ptr addrspace(1) %dst, i8 %val) {
; OPT4-NEXT: ret void
;
; OPT0-LABEL: @memset_size_0(
-; OPT0-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 0, i1 false)
+; OPT0-NEXT: br i1 true, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
+; OPT0: loadstoreloop:
+; OPT0-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
+; OPT0-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
+; OPT0-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
+; OPT0-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
+; OPT0-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 0
+; OPT0-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
+; OPT0: split:
; OPT0-NEXT: ret void
;
; OPT_NEG-LABEL: @memset_size_0(
-; OPT_NEG-NEXT: br i1 true, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
-; OPT_NEG: loadstoreloop:
-; OPT_NEG-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
-; OPT_NEG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
-; OPT_NEG-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
-; OPT_NEG-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
-; OPT_NEG-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 0
-; OPT_NEG-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
-; OPT_NEG: split:
+; OPT_NEG-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 0, i1 false)
; OPT_NEG-NEXT: ret void
;
call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 0, i1 false)
@@ -58,15 +58,7 @@ define amdgpu_kernel void @memset_size_4(ptr addrspace(1) %dst, i8 %val) {
; OPT0-NEXT: ret void
;
; OPT_NEG-LABEL: @memset_size_4(
-; OPT_NEG-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
-; OPT_NEG: loadstoreloop:
-; OPT_NEG-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
-; OPT_NEG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
-; OPT_NEG-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
-; OPT_NEG-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
-; OPT_NEG-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 4
-; OPT_NEG-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
-; OPT_NEG: split:
+; OPT_NEG-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 4, i1 false)
; OPT_NEG-NEXT: ret void
;
call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 4, i1 false)
@@ -103,15 +95,7 @@ define amdgpu_kernel void @memset_size_8(ptr addrspace(1) %dst, i8 %val) {
; OPT0-NEXT: ret void
;
; OPT_NEG-LABEL: @memset_size_8(
-; OPT_NEG-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
-; OPT_NEG: loadstoreloop:
-; OPT_NEG-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
-; OPT_NEG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
-; OPT_NEG-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
-; OPT_NEG-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
-; OPT_NEG-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8
-; OPT_NEG-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
-; OPT_NEG: split:
+; OPT_NEG-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 8, i1 false)
; OPT_NEG-NEXT: ret void
;
call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 8, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index d9891228e6e20..78280a971c35c 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefixes=OPT,ALL %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefixes=OPT,ALL %s
declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) nocapture, ptr addrspace(1) nocapture readonly, i64, i1) #1
declare void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll
index 1ca3e8f67eab5..48115f9e405cc 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=gfx1102 -mattr=+wavefrontsize32 -misched=gcn-max-occupancy -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: opt -S -passes=always-inline -o %t.bc %s
+; RUN: llc -march=amdgcn -mcpu=gfx1102 -mattr=+wavefrontsize32 -misched=gcn-max-occupancy -verify-machineinstrs < %t.bc | FileCheck %s --check-prefixes=CHECK
; Provide a long sequence of 32 vec4 load/store pairs that ought to be fully
; overlapped for latency hiding. Doing so requires using (at least) 128 VGPRs,
diff --git a/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll b/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll
index 6969811f672d3..b1a939d7aa991 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll
@@ -1,5 +1,5 @@
-; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-; RUN: not llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: not llc -mtriple=amdgcn-- < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: not llc -mtriple=amdgcn-- < %s | FileCheck -check-prefix=GCN %s
declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture, i8, i32, i32, i1) #1
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index 14cd3a5880e7b..36af931b713a9 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -153,7 +153,6 @@ static_library("LLVMAMDGPUCodeGen") {
"AMDGPULegalizerInfo.cpp",
"AMDGPULibCalls.cpp",
"AMDGPULibFunc.cpp",
- "AMDGPULowerIntrinsics.cpp",
"AMDGPULowerKernelArguments.cpp",
"AMDGPULowerKernelAttributes.cpp",
"AMDGPULowerModuleLDSPass.cpp",