[llvm] 3c84819 - CodeGen: Expand memory intrinsics in PreISelIntrinsicLowering

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 9 18:04:53 PDT 2023


Author: Matt Arsenault
Date: 2023-06-09T21:04:37-04:00
New Revision: 3c848194f28decca41b7362f9dd35d4939797724

URL: https://github.com/llvm/llvm-project/commit/3c848194f28decca41b7362f9dd35d4939797724
DIFF: https://github.com/llvm/llvm-project/commit/3c848194f28decca41b7362f9dd35d4939797724.diff

LOG: CodeGen: Expand memory intrinsics in PreISelIntrinsicLowering

Expand large or unknown-size memory intrinsics into loops in the
default lowering pipeline if the target doesn't have the corresponding
libfunc. Previously, AMDGPU had a custom pass that existed only to
call the expansion utilities.

With a default no-libcall option, we can remove the libfunc checks in
LoopIdiomRecognize for these intrinsics; those checks never made any
sense. This also provides a path to lifting the immarg restriction on
llvm.memcpy.inline.

There seems to be a bug where TLI reports functions as available if
you use -march and not -mtriple.
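
As a rough sketch of what the expansion produces (this is not the
literal pass output; the loadstoreloop/split block names match the
structure checked in the updated lower-mem-intrinsics-threshold.ll
test below, the value names are illustrative, and the exact IR depends
on the target):

  ; before: size is unknown, or known and above the TTI threshold
  call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 %len, i1 false)

  ; after expandMemSetAsLoop, roughly:
    %empty = icmp eq i64 %len, 0
    br i1 %empty, label %split, label %loadstoreloop
  loadstoreloop:
    %i = phi i64 [ 0, %entry ], [ %i.next, %loadstoreloop ]
    %p = getelementptr inbounds i8, ptr addrspace(1) %dst, i64 %i
    store i8 %val, ptr addrspace(1) %p, align 1
    %i.next = add i64 %i, 1
    %cont = icmp ult i64 %i.next, %len
    br i1 %cont, label %loadstoreloop, label %split
  split:
    ...

The threshold can also be forced with the new generic flag, e.g.
-mem-intrinsic-expand-size=4 expands any call with a known size above
4 bytes, and a threshold of 0 forces expansion of everything,
including size-0 calls, as the updated tests check.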

Added: 
    

Modified: 
    llvm/docs/ReleaseNotes.rst
    llvm/include/llvm/Analysis/TargetTransformInfo.h
    llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
    llvm/lib/Analysis/TargetTransformInfo.cpp
    llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
    llvm/lib/CodeGen/TargetPassConfig.cpp
    llvm/lib/Target/AMDGPU/AMDGPU.h
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
    llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
    llvm/lib/Target/AMDGPU/CMakeLists.txt
    llvm/lib/Target/ARM/ARMTargetTransformInfo.h
    llvm/lib/Target/X86/X86TargetTransformInfo.h
    llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
    llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
    llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll
    llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
    llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll
    llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll
    llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn

Removed: 
    llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp


################################################################################
diff  --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 6176439deb166..4dd483f68544a 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -268,14 +268,12 @@ Changes to the C API
 
   * ``LLVMConstSelect``
 
-Changes to the FastISel infrastructure
---------------------------------------
-
-* ...
-
-Changes to the DAG infrastructure
----------------------------------
+Changes to the CodeGen infrastructure
+-------------------------------------
 
+* ``llvm.memcpy``, ``llvm.memmove`` and ``llvm.memset`` are now
+  expanded into loops by default for targets which do not report that
+  the corresponding library function is available.
 
 Changes to the Metadata Info
 ---------------------------------

diff  --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 0e10162a6435d..2a5953f3e0b1e 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -356,6 +356,10 @@ class TargetTransformInfo {
   /// source/destination type and alignment and the number of bytes copied.
   InstructionCost getMemcpyCost(const Instruction *I) const;
 
+  /// Returns the maximum memset / memcpy size in bytes that still makes it
+  /// profitable to inline the call.
+  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const;
+
   /// \return The estimated number of case clusters when lowering \p 'SI'.
   /// \p JTSize Set a jump table size only when \p SI is suitable for a jump
   /// table.
@@ -1673,6 +1677,7 @@ class TargetTransformInfo::Concept {
   virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0;
   virtual int getInlinerVectorBonusPercent() const = 0;
   virtual InstructionCost getMemcpyCost(const Instruction *I) = 0;
+  virtual uint64_t getMaxMemIntrinsicInlineSizeThreshold() const = 0;
   virtual unsigned
   getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JTSize,
                                    ProfileSummaryInfo *PSI,
@@ -2044,6 +2049,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   InstructionCost getMemcpyCost(const Instruction *I) override {
     return Impl.getMemcpyCost(I);
   }
+
+  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override {
+    return Impl.getMaxMemIntrinsicInlineSizeThreshold();
+  }
+
   InstructionCost getInstructionCost(const User *U,
                                      ArrayRef<const Value *> Operands,
                                      TargetCostKind CostKind) override {

diff  --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index d7b1538d640eb..de94e33f1ad02 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -77,6 +77,10 @@ class TargetTransformInfoImplBase {
     return TTI::TCC_Expensive;
   }
 
+  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const {
+    return 64;
+  }
+
   // Although this default value is arbitrary, it is not random. It is assumed
   // that a condition that evaluates the same way by a higher percentage than
   // this is best represented as control flow. Therefore, the default value N

diff  --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index a7b89b2df4527..e1bb963fd4654 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1035,6 +1035,10 @@ InstructionCost TargetTransformInfo::getMemcpyCost(const Instruction *I) const {
   return Cost;
 }
 
+uint64_t TargetTransformInfo::getMaxMemIntrinsicInlineSizeThreshold() const {
+  return TTIImpl->getMaxMemIntrinsicInlineSizeThreshold();
+}
+
 InstructionCost TargetTransformInfo::getArithmeticReductionCost(
     unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF,
     TTI::TargetCostKind CostKind) const {

diff  --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 228557b7a74ca..0bdb6b59d3ac5 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -6,14 +6,16 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This pass implements IR lowering for the llvm.load.relative and llvm.objc.*
-// intrinsics.
+// This pass implements IR lowering for the llvm.memcpy, llvm.memmove,
+// llvm.memset, llvm.load.relative and llvm.objc.* intrinsics.
 //
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/PreISelIntrinsicLowering.h"
 #include "llvm/Analysis/ObjCARCInstKind.h"
 #include "llvm/Analysis/ObjCARCUtil.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
@@ -24,9 +26,44 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
 
 using namespace llvm;
 
+/// Threshold for expanding statically sized memory intrinsic calls. Calls of
+/// known size larger than this, and calls of unknown size, are expanded by
+/// the pass. Smaller known-size calls are left for lowering in codegen.
+static cl::opt<int64_t> MemIntrinsicExpandSizeThresholdOpt(
+    "mem-intrinsic-expand-size",
+    cl::desc("Set minimum mem intrinsic size to expand in IR"), cl::init(-1),
+    cl::Hidden);
+
+namespace {
+
+struct PreISelIntrinsicLowering {
+  const function_ref<TargetTransformInfo &(Function &)> LookupTTI;
+  const function_ref<TargetLibraryInfo &(Function &)> LookupLibInfo;
+
+  /// If this is true, leave memory intrinsic calls for later replacement with
+  /// a library call when TargetLibraryInfo reports the corresponding function
+  /// as available. Otherwise, always expand them in this pass.
+  const bool UseMemIntrinsicLibFunc;
+
+  explicit PreISelIntrinsicLowering(
+      function_ref<TargetTransformInfo &(Function &)> LookupTTI_,
+      function_ref<TargetLibraryInfo &(Function &)> LookupLibInfo_,
+      bool UseMemIntrinsicLibFunc_ = true)
+      : LookupTTI(LookupTTI_), LookupLibInfo(LookupLibInfo_),
+        UseMemIntrinsicLibFunc(UseMemIntrinsicLibFunc_) {}
+
+  static bool shouldExpandMemIntrinsicWithSize(Value *Size,
+                                               const TargetTransformInfo &TTI);
+  bool expandMemIntrinsicUses(Function &F) const;
+  bool lowerIntrinsics(Module &M) const;
+};
+
+} // namespace
+
 static bool lowerLoadRelative(Function &F) {
   if (F.use_empty())
     return false;
@@ -133,12 +170,100 @@ static bool lowerObjCCall(Function &F, const char *NewFn,
   return true;
 }
 
-static bool lowerIntrinsics(Module &M) {
+// TODO: Should refine based on estimated number of accesses (e.g. does it
+// require splitting based on alignment)
+bool PreISelIntrinsicLowering::shouldExpandMemIntrinsicWithSize(
+    Value *Size, const TargetTransformInfo &TTI) {
+  ConstantInt *CI = dyn_cast<ConstantInt>(Size);
+  if (!CI)
+    return true;
+  uint64_t Threshold = MemIntrinsicExpandSizeThresholdOpt.getNumOccurrences()
+                           ? MemIntrinsicExpandSizeThresholdOpt
+                           : TTI.getMaxMemIntrinsicInlineSizeThreshold();
+  uint64_t SizeVal = CI->getZExtValue();
+
+  // Treat a threshold of 0 as a special case to force expansion of all
+  // intrinsics, including size 0.
+  return SizeVal > Threshold || Threshold == 0;
+}
+
+// TODO: Handle atomic memcpy and memcpy.inline
+// TODO: Pass ScalarEvolution
+bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
+  Intrinsic::ID ID = F.getIntrinsicID();
+  bool Changed = false;
+
+  for (User *U : llvm::make_early_inc_range(F.users())) {
+    Instruction *Inst = cast<Instruction>(U);
+
+    switch (ID) {
+    case Intrinsic::memcpy: {
+      auto *Memcpy = cast<MemCpyInst>(Inst);
+      Function *ParentFunc = Memcpy->getFunction();
+      const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
+      if (shouldExpandMemIntrinsicWithSize(Memcpy->getLength(), TTI)) {
+        if (UseMemIntrinsicLibFunc &&
+            LookupLibInfo(*ParentFunc).has(LibFunc_memcpy))
+          break;
+
+        expandMemCpyAsLoop(Memcpy, TTI);
+        Changed = true;
+        Memcpy->eraseFromParent();
+      }
+
+      break;
+    }
+    case Intrinsic::memmove: {
+      auto *Memmove = cast<MemMoveInst>(Inst);
+      Function *ParentFunc = Memmove->getFunction();
+      const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
+      if (shouldExpandMemIntrinsicWithSize(Memmove->getLength(), TTI)) {
+        if (UseMemIntrinsicLibFunc &&
+            LookupLibInfo(*ParentFunc).has(LibFunc_memmove))
+          break;
+
+        expandMemMoveAsLoop(Memmove);
+        Changed = true;
+        Memmove->eraseFromParent();
+      }
+
+      break;
+    }
+    case Intrinsic::memset: {
+      auto *Memset = cast<MemSetInst>(Inst);
+      Function *ParentFunc = Memset->getFunction();
+      const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
+      if (shouldExpandMemIntrinsicWithSize(Memset->getLength(), TTI)) {
+        if (UseMemIntrinsicLibFunc &&
+            LookupLibInfo(*ParentFunc).has(LibFunc_memset))
+          break;
+
+        expandMemSetAsLoop(Memset);
+        Changed = true;
+        Memset->eraseFromParent();
+      }
+
+      break;
+    }
+    default:
+      llvm_unreachable("unhandled intrinsic");
+    }
+  }
+
+  return Changed;
+}
+
+bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
   bool Changed = false;
   for (Function &F : M) {
     switch (F.getIntrinsicID()) {
     default:
       break;
+    case Intrinsic::memcpy:
+    case Intrinsic::memmove:
+    case Intrinsic::memset:
+      Changed |= expandMemIntrinsicUses(F);
+      break;
     case Intrinsic::load_relative:
       Changed |= lowerLoadRelative(F);
       break;
@@ -230,7 +355,23 @@ class PreISelIntrinsicLoweringLegacyPass : public ModulePass {
 
   PreISelIntrinsicLoweringLegacyPass() : ModulePass(ID) {}
 
-  bool runOnModule(Module &M) override { return lowerIntrinsics(M); }
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+  bool runOnModule(Module &M) override {
+    auto LookupTTI = [this](Function &F) -> TargetTransformInfo & {
+      return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    };
+
+    auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & {
+      return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+    };
+
+    PreISelIntrinsicLowering Lowering(LookupTTI, LookupTLI);
+    return Lowering.lowerIntrinsics(M);
+  }
 };
 
 } // end anonymous namespace
@@ -247,7 +388,18 @@ ModulePass *llvm::createPreISelIntrinsicLoweringPass() {
 
 PreservedAnalyses PreISelIntrinsicLoweringPass::run(Module &M,
                                                     ModuleAnalysisManager &AM) {
-  if (!lowerIntrinsics(M))
+  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
+  auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+    return FAM.getResult<TargetLibraryAnalysis>(F);
+  };
+
+  auto LookupTTI = [&FAM](Function &F) -> TargetTransformInfo & {
+    return FAM.getResult<TargetIRAnalysis>(F);
+  };
+
+  PreISelIntrinsicLowering Lowering(LookupTTI, LookupTLI);
+  if (!Lowering.lowerIntrinsics(M))
     return PreservedAnalyses::all();
   else
     return PreservedAnalyses::none();

diff  --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 86490c0d6417d..8ece4c764f619 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1088,8 +1088,8 @@ bool TargetPassConfig::addISelPasses() {
   if (TM->useEmulatedTLS())
     addPass(createLowerEmuTLSPass());
 
-  addPass(createPreISelIntrinsicLoweringPass());
   PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
+  addPass(createPreISelIntrinsicLoweringPass());
   addPass(createExpandLargeDivRemPass());
   addPass(createExpandLargeFpConvertPass());
   addIRPasses();

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 887aca9c2c095..3e15fc07c71d8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -90,10 +90,6 @@ FunctionPass *createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy);
 void initializeAMDGPUAtomicOptimizerPass(PassRegistry &);
 extern char &AMDGPUAtomicOptimizerID;
 
-ModulePass *createAMDGPULowerIntrinsicsPass();
-void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
-extern char &AMDGPULowerIntrinsicsID;
-
 ModulePass *createAMDGPUCtorDtorLoweringLegacyPass();
 void initializeAMDGPUCtorDtorLoweringLegacyPass(PassRegistry &);
 extern char &AMDGPUCtorDtorLoweringLegacyPassID;

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
deleted file mode 100644
index f9b21e07ed6a7..0000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ /dev/null
@@ -1,144 +0,0 @@
-//===-- AMDGPULowerIntrinsics.cpp -----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IntrinsicsR600.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
-
-#define DEBUG_TYPE "amdgpu-lower-intrinsics"
-
-using namespace llvm;
-
-namespace {
-
-static int MaxStaticSize;
-
-static cl::opt<int, true> MemIntrinsicExpandSizeThresholdOpt(
-  "amdgpu-mem-intrinsic-expand-size",
-  cl::desc("Set minimum mem intrinsic size to expand in IR"),
-  cl::location(MaxStaticSize),
-  cl::init(1024),
-  cl::Hidden);
-
-
-class AMDGPULowerIntrinsics : public ModulePass {
-public:
-  static char ID;
-
-  AMDGPULowerIntrinsics() : ModulePass(ID) {}
-
-  bool runOnModule(Module &M) override;
-  bool expandMemIntrinsicUses(Function &F);
-  StringRef getPassName() const override {
-    return "AMDGPU Lower Intrinsics";
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<TargetTransformInfoWrapperPass>();
-  }
-};
-
-}
-
-char AMDGPULowerIntrinsics::ID = 0;
-
-char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID;
-
-INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, "Lower intrinsics", false,
-                false)
-
-// TODO: Should refine based on estimated number of accesses (e.g. does it
-// require splitting based on alignment)
-static bool shouldExpandOperationWithSize(Value *Size) {
-  ConstantInt *CI = dyn_cast<ConstantInt>(Size);
-  return !CI || (CI->getSExtValue() > MaxStaticSize);
-}
-
-bool AMDGPULowerIntrinsics::expandMemIntrinsicUses(Function &F) {
-  Intrinsic::ID ID = F.getIntrinsicID();
-  bool Changed = false;
-
-  for (User *U : llvm::make_early_inc_range(F.users())) {
-    Instruction *Inst = cast<Instruction>(U);
-
-    switch (ID) {
-    case Intrinsic::memcpy: {
-      auto *Memcpy = cast<MemCpyInst>(Inst);
-      if (shouldExpandOperationWithSize(Memcpy->getLength())) {
-        Function *ParentFunc = Memcpy->getParent()->getParent();
-        const TargetTransformInfo &TTI =
-            getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*ParentFunc);
-        expandMemCpyAsLoop(Memcpy, TTI);
-        Changed = true;
-        Memcpy->eraseFromParent();
-      }
-
-      break;
-    }
-    case Intrinsic::memmove: {
-      auto *Memmove = cast<MemMoveInst>(Inst);
-      if (shouldExpandOperationWithSize(Memmove->getLength())) {
-        expandMemMoveAsLoop(Memmove);
-        Changed = true;
-        Memmove->eraseFromParent();
-      }
-
-      break;
-    }
-    case Intrinsic::memset: {
-      auto *Memset = cast<MemSetInst>(Inst);
-      if (shouldExpandOperationWithSize(Memset->getLength())) {
-        expandMemSetAsLoop(Memset);
-        Changed = true;
-        Memset->eraseFromParent();
-      }
-
-      break;
-    }
-    default:
-      break;
-    }
-  }
-
-  return Changed;
-}
-
-bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
-  bool Changed = false;
-
-  for (Function &F : M) {
-    if (!F.isDeclaration())
-      continue;
-
-    switch (F.getIntrinsicID()) {
-    case Intrinsic::memcpy:
-    case Intrinsic::memmove:
-    case Intrinsic::memset:
-      if (expandMemIntrinsicUses(F))
-        Changed = true;
-      break;
-    default:
-      break;
-    }
-  }
-
-  return Changed;
-}
-
-ModulePass *llvm::createAMDGPULowerIntrinsicsPass() {
-  return new AMDGPULowerIntrinsics();
-}

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 26ab00ab29833..15373d0d2b58d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -388,7 +388,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPULowerKernelArgumentsPass(*PR);
   initializeAMDGPUPromoteKernelArgumentsPass(*PR);
   initializeAMDGPULowerKernelAttributesPass(*PR);
-  initializeAMDGPULowerIntrinsicsPass(*PR);
   initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
   initializeAMDGPUPostLegalizerCombinerPass(*PR);
   initializeAMDGPUPreLegalizerCombinerPass(*PR);
@@ -998,8 +997,6 @@ void AMDGPUPassConfig::addIRPasses() {
   // A call to propagate attributes pass in the backend in case opt was not run.
   addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
 
-  addPass(createAMDGPULowerIntrinsicsPass());
-
   // Function calls are not supported, so make sure we inline everything.
   addPass(createAMDGPUAlwaysInlinePass());
   addPass(createAlwaysInlinerLegacyPass());

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index c2fd67790d9b6..5a9e87deecc14 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -267,6 +267,10 @@ void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
   BaseT::getPeelingPreferences(L, SE, PP);
 }
 
+uint64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
+  return 1024;
+}
+
 const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
     // Codegen control options which don't matter.
     AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
@@ -395,6 +399,10 @@ bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
 }
 
+uint64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
+  return 1024;
+}
+
 // FIXME: Really we would like to issue multiple 128-bit loads and stores per
 // iteration. Should we report a larger size and let it legalize?
 //

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 630804f169bfc..27fb65154fc6c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -55,6 +55,8 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
 
   void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                              TTI::PeelingPreferences &PP);
+
+  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const;
 };
 
 class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
@@ -132,6 +134,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
                                    unsigned AddrSpace) const;
   bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                     unsigned AddrSpace) const;
+
+  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const;
   Type *getMemcpyLoopLoweringType(
       LLVMContext & Context, Value * Length, unsigned SrcAddrSpace,
       unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,

diff  --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 02a6d1f01333d..8df156f24dcb6 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -67,7 +67,6 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPULegalizerInfo.cpp
   AMDGPULibCalls.cpp
   AMDGPULibFunc.cpp
-  AMDGPULowerIntrinsics.cpp
   AMDGPULowerKernelArguments.cpp
   AMDGPULowerKernelAttributes.cpp
   AMDGPULowerModuleLDSPass.cpp

diff  --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 1453450656ad5..f8dae8e5041aa 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -210,6 +210,10 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
 
   InstructionCost getMemcpyCost(const Instruction *I);
 
+  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const {
+    return ST->getMaxInlineSizeThreshold();
+  }
+
   int getNumMemOps(const IntrinsicInst *I) const;
 
   InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,

diff  --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 2c5105d2f03f1..857d95eb65839 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -273,6 +273,11 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
                            const Function *Callee) const;
   bool areTypesABICompatible(const Function *Caller, const Function *Callee,
                              const ArrayRef<Type *> &Type) const;
+
+  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const {
+    return ST->getMaxInlineSizeThreshold();
+  }
+
   TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
                                                     bool IsZeroCmp) const;
   bool prefersVectorizedAddressing() const;

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index 48c1ef46c1976..c7d82551530fc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -59,62 +59,40 @@ define amdgpu_kernel void @kernel_caller_stack() {
 
 define amdgpu_kernel void @kernel_caller_byval() {
 ; MUBUF-LABEL: kernel_caller_byval:
-; MUBUF:       ; %bb.0:
+; MUBUF:       ; %bb.0: ; %loadstoreloop.preheader
 ; MUBUF-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
 ; MUBUF-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
 ; MUBUF-NEXT:    s_add_u32 s0, s0, s7
+; MUBUF-NEXT:    s_mov_b32 s5, 0
 ; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
+; MUBUF-NEXT:    s_movk_i32 s4, 0x80
 ; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:12
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:16
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:20
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:24
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:28
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:32
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:36
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:40
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:44
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:48
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:52
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:56
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:60
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:64
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:68
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:72
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:76
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:80
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:84
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:88
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:92
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:96
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:100
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:104
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:108
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:112
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:116
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:120
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:124
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:128
-; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:132
-; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
-; MUBUF-NEXT:    s_nop 0
-; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:12
-; MUBUF-NEXT:    buffer_load_dword v2, off, s[0:3], 0 offset:16
-; MUBUF-NEXT:    buffer_load_dword v3, off, s[0:3], 0 offset:20
-; MUBUF-NEXT:    buffer_load_dword v4, off, s[0:3], 0 offset:24
-; MUBUF-NEXT:    buffer_load_dword v5, off, s[0:3], 0 offset:28
-; MUBUF-NEXT:    buffer_load_dword v6, off, s[0:3], 0 offset:32
-; MUBUF-NEXT:    buffer_load_dword v7, off, s[0:3], 0 offset:36
-; MUBUF-NEXT:    buffer_load_dword v8, off, s[0:3], 0 offset:40
-; MUBUF-NEXT:    buffer_load_dword v9, off, s[0:3], 0 offset:44
-; MUBUF-NEXT:    buffer_load_dword v10, off, s[0:3], 0 offset:48
-; MUBUF-NEXT:    buffer_load_dword v11, off, s[0:3], 0 offset:52
-; MUBUF-NEXT:    buffer_load_dword v12, off, s[0:3], 0 offset:56
-; MUBUF-NEXT:    buffer_load_dword v13, off, s[0:3], 0 offset:60
-; MUBUF-NEXT:    buffer_load_dword v14, off, s[0:3], 0 offset:64
-; MUBUF-NEXT:    buffer_load_dword v15, off, s[0:3], 0 offset:68
+; MUBUF-NEXT:    v_mov_b32_e32 v1, s5
 ; MUBUF-NEXT:    s_movk_i32 s32, 0x1400
+; MUBUF-NEXT:  .LBB1_1: ; %loadstoreloop
+; MUBUF-NEXT:    ; =>This Inner Loop Header: Depth=1
+; MUBUF-NEXT:    v_add_u32_e32 v2, 4, v1
+; MUBUF-NEXT:    v_add_u32_e32 v1, 1, v1
+; MUBUF-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v1
+; MUBUF-NEXT:    buffer_store_byte v0, v2, s[0:3], 0 offen
+; MUBUF-NEXT:    s_cbranch_vccnz .LBB1_1
+; MUBUF-NEXT:  ; %bb.2: ; %split
+; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4
+; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:8
+; MUBUF-NEXT:    buffer_load_dword v2, off, s[0:3], 0 offset:12
+; MUBUF-NEXT:    buffer_load_dword v3, off, s[0:3], 0 offset:16
+; MUBUF-NEXT:    buffer_load_dword v4, off, s[0:3], 0 offset:20
+; MUBUF-NEXT:    buffer_load_dword v5, off, s[0:3], 0 offset:24
+; MUBUF-NEXT:    buffer_load_dword v6, off, s[0:3], 0 offset:28
+; MUBUF-NEXT:    buffer_load_dword v7, off, s[0:3], 0 offset:32
+; MUBUF-NEXT:    buffer_load_dword v8, off, s[0:3], 0 offset:36
+; MUBUF-NEXT:    buffer_load_dword v9, off, s[0:3], 0 offset:40
+; MUBUF-NEXT:    buffer_load_dword v10, off, s[0:3], 0 offset:44
+; MUBUF-NEXT:    buffer_load_dword v11, off, s[0:3], 0 offset:48
+; MUBUF-NEXT:    buffer_load_dword v12, off, s[0:3], 0 offset:52
+; MUBUF-NEXT:    buffer_load_dword v13, off, s[0:3], 0 offset:56
+; MUBUF-NEXT:    buffer_load_dword v14, off, s[0:3], 0 offset:60
+; MUBUF-NEXT:    buffer_load_dword v15, off, s[0:3], 0 offset:64
 ; MUBUF-NEXT:    s_getpc_b64 s[4:5]
 ; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
 ; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
@@ -154,38 +132,31 @@ define amdgpu_kernel void @kernel_caller_byval() {
 ; MUBUF-NEXT:    s_endpgm
 ;
 ; FLATSCR-LABEL: kernel_caller_byval:
-; FLATSCR:       ; %bb.0:
+; FLATSCR:       ; %bb.0: ; %loadstoreloop.preheader
 ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
-; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
-; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
-; FLATSCR-NEXT:    s_mov_b32 s0, 0
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:8
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:24
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:32
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:40
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:48
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:56
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:64
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:72
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:80
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:88
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:96
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:104
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:112
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:120
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:128
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 offset:8
-; FLATSCR-NEXT:    s_nop 0
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[2:3], off, s0 offset:16
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[4:5], off, s0 offset:24
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[6:7], off, s0 offset:32
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[8:9], off, s0 offset:40
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[10:11], off, s0 offset:48
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[12:13], off, s0 offset:56
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[14:15], off, s0 offset:64
+; FLATSCR-NEXT:    s_mov_b32 s1, 0
+; FLATSCR-NEXT:    s_movk_i32 s0, 0x80
+; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
+; FLATSCR-NEXT:    v_mov_b32_e32 v1, s1
 ; FLATSCR-NEXT:    s_movk_i32 s32, 0x50
+; FLATSCR-NEXT:  .LBB1_1: ; %loadstoreloop
+; FLATSCR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; FLATSCR-NEXT:    v_add_u32_e32 v2, 4, v1
+; FLATSCR-NEXT:    v_add_u32_e32 v1, 1, v1
+; FLATSCR-NEXT:    v_cmp_gt_u32_e32 vcc, s0, v1
+; FLATSCR-NEXT:    scratch_store_byte v2, v0, off
+; FLATSCR-NEXT:    s_cbranch_vccnz .LBB1_1
+; FLATSCR-NEXT:  ; %bb.2: ; %split
+; FLATSCR-NEXT:    s_mov_b32 s0, 0
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s0 offset:4
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[2:3], off, s0 offset:12
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[4:5], off, s0 offset:20
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[6:7], off, s0 offset:28
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[8:9], off, s0 offset:36
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[10:11], off, s0 offset:44
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[12:13], off, s0 offset:52
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[14:15], off, s0 offset:60
 ; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
 ; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
 ; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll
index 49502b3435609..bf956c3ca8239 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=GCN %s
 
 declare void @llvm.memcpy.inline.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1 immarg)
 

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
index bb7770701a631..6b05455613515 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=19 %s -o - | FileCheck -check-prefix=LOOP %s
-; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=21 %s -o - | FileCheck -check-prefix=UNROLL %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=19 %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=21 %s -o - | FileCheck -check-prefix=UNROLL %s
 
 declare void @llvm.memcpy.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1 immarg)
 

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
index c7a1e163c04aa..466147cac3439 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
-; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
 
 declare void @llvm.memmove.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1)
 

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
index dd3f9f12111dc..7cd3babc70909 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
-; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs -amdgpu-mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
 
 declare void @llvm.memset.p1.i32(ptr addrspace(1), i8, i32, i1)
 

diff  --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 20282ff2992b9..36e1476f7de8d 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -34,7 +34,6 @@
 ; GCN-O0-NEXT:    Lower ctors and dtors for AMDGPU
 ; GCN-O0-NEXT:    FunctionPass Manager
 ; GCN-O0-NEXT:      Early propagate attributes from kernels to functions
-; GCN-O0-NEXT:    AMDGPU Lower Intrinsics
 ; GCN-O0-NEXT:    AMDGPU Inline All Functions
 ; GCN-O0-NEXT:    Inliner for always_inline functions
 ; GCN-O0-NEXT:      FunctionPass Manager
@@ -182,7 +181,6 @@
 ; GCN-O1-NEXT:    Lower ctors and dtors for AMDGPU
 ; GCN-O1-NEXT:    FunctionPass Manager
 ; GCN-O1-NEXT:      Early propagate attributes from kernels to functions
-; GCN-O1-NEXT:    AMDGPU Lower Intrinsics
 ; GCN-O1-NEXT:    AMDGPU Inline All Functions
 ; GCN-O1-NEXT:    Inliner for always_inline functions
 ; GCN-O1-NEXT:      FunctionPass Manager
@@ -458,7 +456,6 @@
 ; GCN-O1-OPTS-NEXT:    Lower ctors and dtors for AMDGPU
 ; GCN-O1-OPTS-NEXT:    FunctionPass Manager
 ; GCN-O1-OPTS-NEXT:      Early propagate attributes from kernels to functions
-; GCN-O1-OPTS-NEXT:    AMDGPU Lower Intrinsics
 ; GCN-O1-OPTS-NEXT:    AMDGPU Inline All Functions
 ; GCN-O1-OPTS-NEXT:    Inliner for always_inline functions
 ; GCN-O1-OPTS-NEXT:      FunctionPass Manager
@@ -766,7 +763,6 @@
 ; GCN-O2-NEXT:    Lower ctors and dtors for AMDGPU
 ; GCN-O2-NEXT:    FunctionPass Manager
 ; GCN-O2-NEXT:      Early propagate attributes from kernels to functions
-; GCN-O2-NEXT:    AMDGPU Lower Intrinsics
 ; GCN-O2-NEXT:    AMDGPU Inline All Functions
 ; GCN-O2-NEXT:    Inliner for always_inline functions
 ; GCN-O2-NEXT:      FunctionPass Manager
@@ -1077,7 +1073,6 @@
 ; GCN-O3-NEXT:    Lower ctors and dtors for AMDGPU
 ; GCN-O3-NEXT:    FunctionPass Manager
 ; GCN-O3-NEXT:      Early propagate attributes from kernels to functions
-; GCN-O3-NEXT:    AMDGPU Lower Intrinsics
 ; GCN-O3-NEXT:    AMDGPU Inline All Functions
 ; GCN-O3-NEXT:    Inliner for always_inline functions
 ; GCN-O3-NEXT:      FunctionPass Manager

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll
index cd720e93a48f2..e9d42dc70cbb9 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=8 %s | FileCheck -check-prefix=OPT8 %s
-; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=4 %s | FileCheck -check-prefix=OPT4 %s
-; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=0 %s | FileCheck -check-prefix=OPT0 %s
-; RUN: opt -S -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefix=OPT_NEG %s
+; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=8 %s | FileCheck -check-prefix=OPT8 %s
+; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=4 %s | FileCheck -check-prefix=OPT4 %s
+; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefix=OPT0 %s
+; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefix=OPT_NEG %s
 
-; Test the -amdgpu-mem-intrinsic-expand-size flag works.
+; Test that the -mem-intrinsic-expand-size flag works.
 
 ; Make sure we can always eliminate the intrinsic, even at 0.
 define amdgpu_kernel void @memset_size_0(ptr addrspace(1) %dst, i8 %val) {
@@ -17,19 +17,19 @@ define amdgpu_kernel void @memset_size_0(ptr addrspace(1) %dst, i8 %val) {
 ; OPT4-NEXT:    ret void
 ;
 ; OPT0-LABEL: @memset_size_0(
-; OPT0-NEXT:    call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 0, i1 false)
+; OPT0-NEXT:    br i1 true, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
+; OPT0:       loadstoreloop:
+; OPT0-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
+; OPT0-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
+; OPT0-NEXT:    store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
+; OPT0-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1
+; OPT0-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 0
+; OPT0-NEXT:    br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
+; OPT0:       split:
 ; OPT0-NEXT:    ret void
 ;
 ; OPT_NEG-LABEL: @memset_size_0(
-; OPT_NEG-NEXT:    br i1 true, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
-; OPT_NEG:       loadstoreloop:
-; OPT_NEG-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
-; OPT_NEG-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
-; OPT_NEG-NEXT:    store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
-; OPT_NEG-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1
-; OPT_NEG-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 0
-; OPT_NEG-NEXT:    br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
-; OPT_NEG:       split:
+; OPT_NEG-NEXT:    call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 0, i1 false)
 ; OPT_NEG-NEXT:    ret void
 ;
   call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 0, i1 false)
@@ -58,15 +58,7 @@ define amdgpu_kernel void @memset_size_4(ptr addrspace(1) %dst, i8 %val) {
 ; OPT0-NEXT:    ret void
 ;
 ; OPT_NEG-LABEL: @memset_size_4(
-; OPT_NEG-NEXT:    br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
-; OPT_NEG:       loadstoreloop:
-; OPT_NEG-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
-; OPT_NEG-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
-; OPT_NEG-NEXT:    store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
-; OPT_NEG-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1
-; OPT_NEG-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 4
-; OPT_NEG-NEXT:    br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
-; OPT_NEG:       split:
+; OPT_NEG-NEXT:    call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 4, i1 false)
 ; OPT_NEG-NEXT:    ret void
 ;
   call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 4, i1 false)
@@ -103,15 +95,7 @@ define amdgpu_kernel void @memset_size_8(ptr addrspace(1) %dst, i8 %val) {
 ; OPT0-NEXT:    ret void
 ;
 ; OPT_NEG-LABEL: @memset_size_8(
-; OPT_NEG-NEXT:    br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
-; OPT_NEG:       loadstoreloop:
-; OPT_NEG-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
-; OPT_NEG-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
-; OPT_NEG-NEXT:    store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
-; OPT_NEG-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1
-; OPT_NEG-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8
-; OPT_NEG-NEXT:    br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
-; OPT_NEG:       split:
+; OPT_NEG-NEXT:    call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 8, i1 false)
 ; OPT_NEG-NEXT:    ret void
 ;
   call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 8, i1 false)

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index d9891228e6e20..78280a971c35c 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefixes=OPT,ALL %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefixes=OPT,ALL %s
 
 declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) nocapture, ptr addrspace(1) nocapture readonly, i64, i1) #1
 declare void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1

diff  --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll
index 1ca3e8f67eab5..48115f9e405cc 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=gfx1102 -mattr=+wavefrontsize32 -misched=gcn-max-occupancy -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: opt -S -passes=always-inline -o %t.bc %s
+; RUN: llc -march=amdgcn -mcpu=gfx1102 -mattr=+wavefrontsize32 -misched=gcn-max-occupancy -verify-machineinstrs < %t.bc | FileCheck %s --check-prefixes=CHECK
 
 ; Provide a long sequence of 32 vec4 load/store pairs that ought to be fully
 ; overlapped for latency hiding. Doing so requires using (at least) 128 VGPRs,

diff  --git a/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll b/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll
index 6969811f672d3..b1a939d7aa991 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-size-overflow.ll
@@ -1,5 +1,5 @@
-; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-; RUN: not llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: not llc -mtriple=amdgcn-- < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: not llc -mtriple=amdgcn-- < %s | FileCheck -check-prefix=GCN %s
 
 declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture, i8, i32, i32, i1) #1
 

diff  --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index 14cd3a5880e7b..36af931b713a9 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -153,7 +153,6 @@ static_library("LLVMAMDGPUCodeGen") {
     "AMDGPULegalizerInfo.cpp",
     "AMDGPULibCalls.cpp",
     "AMDGPULibFunc.cpp",
-    "AMDGPULowerIntrinsics.cpp",
     "AMDGPULowerKernelArguments.cpp",
     "AMDGPULowerKernelAttributes.cpp",
     "AMDGPULowerModuleLDSPass.cpp",


        

