[llvm] 46c3d5c - [amdgpu] Add the late codegen preparation pass.
Michael Liao via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 27 11:08:19 PDT 2020
Author: Michael Liao
Date: 2020-10-27T14:07:59-04:00
New Revision: 46c3d5cb05d63ed7ee1935aa3fd0d96307a9dcac
URL: https://github.com/llvm/llvm-project/commit/46c3d5cb05d63ed7ee1935aa3fd0d96307a9dcac
DIFF: https://github.com/llvm/llvm-project/commit/46c3d5cb05d63ed7ee1935aa3fd0d96307a9dcac.diff
LOG: [amdgpu] Add the late codegen preparation pass.
Summary:
- Teach the new pass to widen sub-DWORD loads that are naturally aligned but
not DWORD aligned.
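As an illustration (a minimal sketch; the value names and the offset are
hypothetical, not taken from this patch), a naturally aligned i16 load at a
DWORD-misaligned offset from a DWORD-aligned constant-address-space base is
roughly rewritten from

  %p = getelementptr i8, i8 addrspace(4)* %base, i64 6
  %q = bitcast i8 addrspace(4)* %p to i16 addrspace(4)*
  %v = load i16, i16 addrspace(4)* %q, align 2

into something like

  %wide.gep = getelementptr i8, i8 addrspace(4)* %base, i64 4
  %wide.ptr = bitcast i8 addrspace(4)* %wide.gep to i32 addrspace(4)*
  %wide     = load i32, i32 addrspace(4)* %wide.ptr, align 4
  %shifted  = lshr i32 %wide, 16
  %v        = trunc i32 %shifted to i16

which lets the load be selected as an s_load_dword plus an s_lshr_b32 instead
of a sub-DWORD load (see the updated llvm.amdgcn.dispatch.ptr.ll test below).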
Reviewers: rampitec, arsenm
Subscribers:
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D80364
Added:
llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
Modified:
llvm/lib/Target/AMDGPU/AMDGPU.h
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/lib/Target/AMDGPU/CMakeLists.txt
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
llvm/test/CodeGen/AMDGPU/vectorize-loads.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 2a8d5a7dfd26..132036fbcfd0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -68,6 +68,7 @@ FunctionPass *createSIPostRABundlerPass();
FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetMachine *);
FunctionPass *createAMDGPUUseNativeCallsPass();
FunctionPass *createAMDGPUCodeGenPreparePass();
+FunctionPass *createAMDGPULateCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
@@ -223,6 +224,9 @@ extern char &AMDGPUAnnotateUniformValuesPassID;
void initializeAMDGPUCodeGenPreparePass(PassRegistry&);
extern char &AMDGPUCodeGenPrepareID;
+void initializeAMDGPULateCodeGenPreparePass(PassRegistry &);
+extern char &AMDGPULateCodeGenPrepareID;
+
void initializeSIAnnotateControlFlowPass(PassRegistry&);
extern char &SIAnnotateControlFlowPassID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
new file mode 100644
index 000000000000..255cb76b7a26
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -0,0 +1,198 @@
+//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
+/// selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <iterator>
+
+#define DEBUG_TYPE "amdgpu-late-codegenprepare"
+
+using namespace llvm;
+
+// Scalar load widening needs to run after the load-store-vectorizer, as that
+// pass does not handle overlapping cases. In addition, this pass extends the
+// widening to handle scalar sub-DWORD loads that are naturally aligned but
+// not DWORD aligned.
+static cl::opt<bool>
+ WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
+ cl::desc("Widen sub-dword constant address space loads in "
+ "AMDGPULateCodeGenPrepare"),
+ cl::ReallyHidden, cl::init(true));
+
+namespace {
+
+class AMDGPULateCodeGenPrepare
+ : public FunctionPass,
+ public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
+ Module *Mod = nullptr;
+ const DataLayout *DL = nullptr;
+
+ AssumptionCache *AC = nullptr;
+ LegacyDivergenceAnalysis *DA = nullptr;
+
+public:
+ static char ID;
+
+ AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "AMDGPU IR late optimizations";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<LegacyDivergenceAnalysis>();
+ AU.setPreservesAll();
+ }
+
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+
+ bool visitInstruction(Instruction &) { return false; }
+
+ // Check if the specified value is at least DWORD aligned.
+ bool isDWORDAligned(const Value *V) const {
+ KnownBits Known = computeKnownBits(V, *DL, 0, AC);
+ return Known.countMinTrailingZeros() >= 2;
+ }
+
+ bool canWidenScalarExtLoad(LoadInst &LI) const;
+ bool visitLoadInst(LoadInst &LI);
+};
+
+} // end anonymous namespace
+
+bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
+ Mod = &M;
+ DL = &Mod->getDataLayout();
+ return false;
+}
+
+bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ DA = &getAnalysis<LegacyDivergenceAnalysis>();
+
+ bool Changed = false;
+ for (auto &BB : F)
+ for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) {
+ Instruction *I = &*BI++;
+ Changed |= visit(*I);
+ }
+
+ return Changed;
+}
+
+bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
+ unsigned AS = LI.getPointerAddressSpace();
+ // Skip non-constant address space.
+ if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
+ AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+ return false;
+ // Skip non-simple loads.
+ if (!LI.isSimple())
+ return false;
+ auto *Ty = LI.getType();
+ // Skip aggregate types.
+ if (Ty->isAggregateType())
+ return false;
+ unsigned TySize = DL->getTypeStoreSize(Ty);
+ // Only handle sub-DWORD loads.
+ if (TySize >= 4)
+ return false;
+ // That load must be at least naturally aligned.
+ if (LI.getAlign() < DL->getABITypeAlign(Ty))
+ return false;
+ // It should be uniform, i.e. a scalar load.
+ return DA->isUniform(&LI);
+}
+
+bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
+ if (!WidenLoads)
+ return false;
+
+ // Skip if the load is already at least DWORD aligned; that case is handled
+ // in SDAG.
+ if (LI.getAlign() >= 4)
+ return false;
+
+ if (!canWidenScalarExtLoad(LI))
+ return false;
+
+ int64_t Offset = 0;
+ auto *Base =
+ GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
+ // If the base is not DWORD aligned, it's not safe to perform the following
+ // transform.
+ if (!isDWORDAligned(Base))
+ return false;
+
+ int64_t Adjust = Offset & 0x3;
+ if (Adjust == 0) {
+ // With a zero adjust, the original alignment can simply be promoted to
+ // DWORD alignment.
+ LI.setAlignment(Align(4));
+ return true;
+ }
+
+ IRBuilder<> IRB(&LI);
+ IRB.SetCurrentDebugLocation(LI.getDebugLoc());
+
+ unsigned AS = LI.getPointerAddressSpace();
+ unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
+ auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);
+
+ PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
+ PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
+ auto *NewPtr = IRB.CreateBitCast(
+ IRB.CreateConstGEP1_64(IRB.CreateBitCast(Base, Int8PtrTy),
+ Offset - Adjust),
+ Int32PtrTy);
+ LoadInst *NewLd = IRB.CreateAlignedLoad(NewPtr, Align(4));
+ NewLd->copyMetadata(LI);
+ NewLd->setMetadata(LLVMContext::MD_range, nullptr);
+
+ unsigned ShAmt = Adjust * 8;
+ auto *NewVal = IRB.CreateBitCast(
+ IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
+ LI.replaceAllUsesWith(NewVal);
+ RecursivelyDeleteTriviallyDeadInstructions(&LI);
+
+ return true;
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
+ "AMDGPU IR late optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
+ "AMDGPU IR late optimizations", false, false)
+
+char AMDGPULateCodeGenPrepare::ID = 0;
+
+FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
+ return new AMDGPULateCodeGenPrepare();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 29199d4ee98e..7081364fde77 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -236,6 +236,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUPromoteAllocaToVectorPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
+ initializeAMDGPULateCodeGenPreparePass(*PR);
initializeAMDGPUPropagateAttributesEarlyPass(*PR);
initializeAMDGPUPropagateAttributesLatePass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
@@ -865,6 +866,7 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
bool GCNPassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();
+ addPass(createAMDGPULateCodeGenPreparePass());
if (EnableAtomicOptimizations) {
addPass(createAMDGPUAtomicOptimizerPass());
}
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 0e13efe7e359..b1f6bfdd7be3 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -58,6 +58,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUISelDAGToDAG.cpp
AMDGPUISelLowering.cpp
AMDGPUGlobalISelUtils.cpp
+ AMDGPULateCodeGenPrepare.cpp
AMDGPULegalizerInfo.cpp
AMDGPULibCalls.cpp
AMDGPULibFunc.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
index 8575ca872ebf..42826b7466f9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
@@ -14,6 +14,22 @@ define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
ret void
}
+; GCN-LABEL: {{^}}test2
+; GCN: enable_sgpr_dispatch_ptr = 1
+; GCN: s_load_dword s[[REG:[0-9]+]], s[4:5], 0x1
+; GCN: s_lshr_b32 s{{[0-9]+}}, s[[REG]], 16
+; GCN-NOT: load_ushort
+; GCN: s_endpgm
+define amdgpu_kernel void @test2(i32 addrspace(1)* %out) {
+ %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+ %d1 = getelementptr inbounds i8, i8 addrspace(4)* %dispatch_ptr, i64 6
+ %h1 = bitcast i8 addrspace(4)* %d1 to i16 addrspace(4)*
+ %v1 = load i16, i16 addrspace(4)* %h1
+ %e1 = zext i16 %v1 to i32
+ store i32 %e1, i32 addrspace(1)* %out
+ ret void
+}
+
declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
attributes #0 = { readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll b/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll
index e3126d138c92..65207900f1e6 100644
--- a/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/vectorize-loads.ll
@@ -22,6 +22,37 @@ entry:
ret void
}
+; A little more complicated case where more sub-dword loads could be coalesced
+; if they are not widened earlier.
+; GCN-LABEL: {{^}}load_4i16:
+; GCN: s_load_dwordx2 s{{\[}}[[D0:[0-9]+]]:[[D1:[0-9]+]]{{\]}}, s[4:5], 0x4
+; GCN-NOT: s_load_dword {{s[0-9]+}}, s[4:5], 0x4
+; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s[[D0]], 16
+; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s[[D1]], 16
+; GCN: s_endpgm
+define protected amdgpu_kernel void @load_4i16(i32 addrspace(1)* %out) {
+entry:
+ %disp = tail call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+ %gep_x = getelementptr i8, i8 addrspace(4)* %disp, i64 4
+ %gep_x.cast = bitcast i8 addrspace(4)* %gep_x to i16 addrspace(4)*
+ %id_x = load i16, i16 addrspace(4)* %gep_x.cast, align 4, !invariant.load !0 ; load workgroup size x
+ %gep_y = getelementptr i8, i8 addrspace(4)* %disp, i64 6
+ %gep_y.cast = bitcast i8 addrspace(4)* %gep_y to i16 addrspace(4)*
+ %id_y = load i16, i16 addrspace(4)* %gep_y.cast, align 2, !invariant.load !0 ; load workgroup size y
+ %gep_z = getelementptr i8, i8 addrspace(4)* %disp, i64 8
+ %gep_z.cast = bitcast i8 addrspace(4)* %gep_z to i16 addrspace(4)*
+ %id_z = load i16, i16 addrspace(4)* %gep_z.cast, align 4, !invariant.load !0 ; load workgroup size z
+ %gep_w = getelementptr i8, i8 addrspace(4)* %disp, i64 10
+ %gep_w.cast = bitcast i8 addrspace(4)* %gep_w to i16 addrspace(4)*
+ %id_w = load i16, i16 addrspace(4)* %gep_w.cast, align 2, !invariant.load !0 ; load workgroup size w
+ %add = add nuw nsw i16 %id_y, %id_x
+ %add2 = add nuw nsw i16 %id_z, %id_w
+ %add3 = add nuw nsw i16 %add, %add2
+ %conv = zext i16 %add3 to i32
+ store i32 %conv, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
!0 = !{!0}