[llvm] 8d7d89b - [AMDGPU] Add alias.scope metadata to lowered LDS struct
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 19 11:40:39 PDT 2021
Author: Stanislav Mekhanoshin
Date: 2021-08-19T11:40:30-07:00
New Revision: 8d7d89b0811da55c9f4bf21682b563b0ce521f97
URL: https://github.com/llvm/llvm-project/commit/8d7d89b0811da55c9f4bf21682b563b0ce521f97
DIFF: https://github.com/llvm/llvm-project/commit/8d7d89b0811da55c9f4bf21682b563b0ce521f97.diff
LOG: [AMDGPU] Add alias.scope metadata to lowered LDS struct
Alias analysis is unable to disambiguate accesses to the structure
fields without it, unlike distinct variables. As a result we cannot
combine ds_read and ds_write operations when there is any store in
between, which is always considered clobbering.
Differential Revision: https://reviews.llvm.org/D108315
Added:
llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 70ecea8dbc3e2..26e2b5ff4d4bd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -42,6 +42,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -282,6 +283,21 @@ class AMDGPULowerModuleLDS : public ModulePass {
// so remove the variables from these lists before replaceAllUsesWith
removeFromUsedLists(M, LocalVars);
+ // Create alias.scope and their lists. Each field in the new structure
+ // does not alias with all other fields.
+ SmallVector<MDNode *> AliasScopes;
+ SmallVector<Metadata *> NoAliasList;
+ if (LocalVars.size() > 1) {
+ MDBuilder MDB(Ctx);
+ AliasScopes.reserve(LocalVars.size());
+ MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
+ for (size_t I = 0; I < LocalVars.size(); I++) {
+ MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
+ AliasScopes.push_back(Scope);
+ }
+ NoAliasList.append(&AliasScopes[1], AliasScopes.end());
+ }
+
// Replace uses of ith variable with a constantexpr to the ith field of the
// instance that will be allocated by AMDGPUMachineFunction
Type *I32 = Type::getInt32Ty(Ctx);
@@ -313,7 +329,15 @@ class AMDGPULowerModuleLDS : public ModulePass {
uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I);
Align A = commonAlignment(StructAlign, Off);
- refineUsesAlignment(GEP, A, DL);
+
+ if (I)
+ NoAliasList[I - 1] = AliasScopes[I - 1];
+ MDNode *NoAlias =
+ NoAliasList.empty() ? nullptr : MDNode::get(Ctx, NoAliasList);
+ MDNode *AliasScope =
+ AliasScopes.empty() ? nullptr : MDNode::get(Ctx, {AliasScopes[I]});
+
+ refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
}
// Mark kernels with asm that reads the address of the allocated structure
@@ -334,12 +358,25 @@ class AMDGPULowerModuleLDS : public ModulePass {
return true;
}
- void refineUsesAlignment(Value *Ptr, Align A, const DataLayout &DL,
- unsigned MaxDepth = 5) {
- if (!MaxDepth || A == 1)
+ void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
+ MDNode *AliasScope, MDNode *NoAlias,
+ unsigned MaxDepth = 5) {
+ if (!MaxDepth || (A == 1 && !AliasScope))
return;
for (User *U : Ptr->users()) {
+ if (auto *I = dyn_cast<Instruction>(U)) {
+ if (AliasScope && I->mayReadOrWriteMemory()) {
+ MDNode *AS = I->getMetadata(LLVMContext::MD_alias_scope);
+ AS = MDNode::concatenate(AS, AliasScope);
+ I->setMetadata(LLVMContext::MD_alias_scope, AS);
+
+ MDNode *NA = I->getMetadata(LLVMContext::MD_noalias);
+ NA = MDNode::concatenate(NA, NoAlias);
+ I->setMetadata(LLVMContext::MD_noalias, NA);
+ }
+ }
+
if (auto *LI = dyn_cast<LoadInst>(U)) {
LI->setAlignment(std::max(A, LI->getAlign()));
continue;
@@ -364,17 +401,19 @@ class AMDGPULowerModuleLDS : public ModulePass {
if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
APInt Off(BitWidth, 0);
- if (GEP->getPointerOperand() == Ptr &&
- GEP->accumulateConstantOffset(DL, Off)) {
- Align GA = commonAlignment(A, Off.getLimitedValue());
- refineUsesAlignment(GEP, GA, DL, MaxDepth - 1);
+ if (GEP->getPointerOperand() == Ptr) {
+ Align GA;
+ if (GEP->accumulateConstantOffset(DL, Off))
+ GA = commonAlignment(A, Off.getLimitedValue());
+ refineUsesAlignmentAndAA(GEP, GA, DL, AliasScope, NoAlias,
+ MaxDepth - 1);
}
continue;
}
if (auto *I = dyn_cast<Instruction>(U)) {
if (I->getOpcode() == Instruction::BitCast ||
I->getOpcode() == Instruction::AddrSpaceCast)
- refineUsesAlignment(I, A, DL, MaxDepth - 1);
+ refineUsesAlignmentAndAA(I, A, DL, AliasScope, NoAlias, MaxDepth - 1);
}
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
new file mode 100644
index 0000000000000..701cfc99cf3d9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
@@ -0,0 +1,49 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+ at a = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+ at b = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+
+; CHECK-LABEL: @no_clobber_ds_load_stores_x2_preexisting_aa
+; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !tbaa !0, !alias.scope !5, !noalias !10
+; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !tbaa !0, !alias.scope !5, !noalias !10
+; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !tbaa !0, !alias.scope !10, !noalias !5
+; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !tbaa !0, !alias.scope !10, !noalias !5
+
+define amdgpu_kernel void @no_clobber_ds_load_stores_x2_preexisting_aa(i32 addrspace(1)* %arg, i32 %i) {
+bb:
+ store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4, !alias.scope !0, !noalias !3, !tbaa !5
+ %gep.a = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 %i
+ %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !0, !noalias !3, !tbaa !5
+ store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4, !alias.scope !3, !noalias !0, !tbaa !5
+ %gep.b = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 %i
+ %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !3, !noalias !0, !tbaa !5
+ %val = add i32 %val.a, %val.b
+ store i32 %val, i32 addrspace(1)* %arg, align 4
+ ret void
+}
+
+!0 = !{!1}
+!1 = distinct !{!1, !2}
+!2 = distinct !{!2}
+!3 = !{!4}
+!4 = distinct !{!4, !2}
+!5 = !{!6, !7, i64 0}
+!6 = !{!"no_clobber_ds_load_stores_x2_preexisting_aa", !7, i64 0}
+!7 = !{!"int", !8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C++ TBAA"}
+
+; CHECK:!0 = !{!1, !2, i64 0}
+; CHECK:!1 = !{!"no_clobber_ds_load_stores_x2_preexisting_aa", !2, i64 0}
+; CHECK:!2 = !{!"int", !3, i64 0}
+; CHECK:!3 = !{!"omnipotent char", !4, i64 0}
+; CHECK:!4 = !{!"Simple C++ TBAA"}
+; CHECK:!5 = !{!6, !8}
+; CHECK:!6 = distinct !{!6, !7}
+; CHECK:!7 = distinct !{!7}
+; CHECK:!8 = distinct !{!8, !9}
+; CHECK:!9 = distinct !{!9}
+; CHECK:!10 = !{!11, !12}
+; CHECK:!11 = distinct !{!11, !7}
+; CHECK:!12 = distinct !{!12, !9}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
new file mode 100644
index 0000000000000..6b24510fc9253
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
@@ -0,0 +1,77 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -O3 < %s | FileCheck -check-prefix=GCN %s
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+ at a = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+ at b = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+ at c = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+
+; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x2:
+; GCN: ds_write2st64_b32
+; GCN: ds_read2st64_b32
+
+; CHECK-LABEL: @no_clobber_ds_load_stores_x2
+; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !alias.scope !0, !noalias !3
+; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !0, !noalias !3
+; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !alias.scope !3, !noalias !0
+; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !3, !noalias !0
+
+define amdgpu_kernel void @no_clobber_ds_load_stores_x2(i32 addrspace(1)* %arg, i32 %i) {
+bb:
+ store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4
+ %gep.a = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 %i
+ %val.a = load i32, i32 addrspace(3)* %gep.a, align 4
+ store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4
+ %gep.b = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 %i
+ %val.b = load i32, i32 addrspace(3)* %gep.b, align 4
+ %val = add i32 %val.a, %val.b
+ store i32 %val, i32 addrspace(1)* %arg, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x3:
+; GCN-DAG: ds_write2st64_b32
+; GCN-DAG: ds_write_b32
+; GCN-DAG: ds_read2st64_b32
+; GCN-DAG: ds_read_b32
+
+; CHECK-LABEL: @no_clobber_ds_load_stores_x3
+; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !alias.scope !5, !noalias !8
+; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !5, !noalias !8
+; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !alias.scope !11, !noalias !12
+; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !11, !noalias !12
+; CHECK: store i32 3, i32 addrspace(3)* %2, align 16, !alias.scope !13, !noalias !14
+; CHECK: %val.c = load i32, i32 addrspace(3)* %gep.c, align 4, !alias.scope !13, !noalias !14
+
+define amdgpu_kernel void @no_clobber_ds_load_stores_x3(i32 addrspace(1)* %arg, i32 %i) {
+bb:
+ store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4
+ %gep.a = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 %i
+ %val.a = load i32, i32 addrspace(3)* %gep.a, align 4
+ store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4
+ %gep.b = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 %i
+ %val.b = load i32, i32 addrspace(3)* %gep.b, align 4
+ store i32 3, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @c, i32 0, i32 0), align 4
+ %gep.c = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @c, i32 0, i32 %i
+ %val.c = load i32, i32 addrspace(3)* %gep.c, align 4
+ %val.1 = add i32 %val.a, %val.b
+ %val = add i32 %val.1, %val.c
+ store i32 %val, i32 addrspace(1)* %arg, align 4
+ ret void
+}
+
+; CHECK: !0 = !{!1}
+; CHECK: !1 = distinct !{!1, !2}
+; CHECK: !2 = distinct !{!2}
+; CHECK: !3 = !{!4}
+; CHECK: !4 = distinct !{!4, !2}
+; CHECK: !5 = !{!6}
+; CHECK: !6 = distinct !{!6, !7}
+; CHECK: !7 = distinct !{!7}
+; CHECK: !8 = !{!9, !10}
+; CHECK: !9 = distinct !{!9, !7}
+; CHECK: !10 = distinct !{!10, !7}
+; CHECK: !11 = !{!9}
+; CHECK: !12 = !{!6, !10}
+; CHECK: !13 = !{!10}
+; CHECK: !14 = !{!6, !9}
More information about the llvm-commits
mailing list