[llvm] 8d7d89b - [AMDGPU] Add alias.scope metadata to lowered LDS struct
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 19 11:40:39 PDT 2021
Author: Stanislav Mekhanoshin
Date: 2021-08-19T11:40:30-07:00
New Revision: 8d7d89b0811da55c9f4bf21682b563b0ce521f97
URL: https://github.com/llvm/llvm-project/commit/8d7d89b0811da55c9f4bf21682b563b0ce521f97
DIFF: https://github.com/llvm/llvm-project/commit/8d7d89b0811da55c9f4bf21682b563b0ce521f97.diff
LOG: [AMDGPU] Add alias.scope metadata to lowered LDS struct
Alias analysis is unable to disambiguate accesses to the structure
fields without it, unlike distinct variables. As a result we cannot
combine ds_read and ds_write operations when there is any store in
between, which is always considered clobbering.
Differential Revision: https://reviews.llvm.org/D108315
Added:
llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 70ecea8dbc3e2..26e2b5ff4d4bd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -42,6 +42,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -282,6 +283,21 @@ class AMDGPULowerModuleLDS : public ModulePass {
// so remove the variables from these lists before replaceAllUsesWith
removeFromUsedLists(M, LocalVars);
+ // Create alias.scope and their lists. Each field in the new structure
+ // does not alias with all other fields.
+ SmallVector<MDNode *> AliasScopes;
+ SmallVector<Metadata *> NoAliasList;
+ if (LocalVars.size() > 1) {
+ MDBuilder MDB(Ctx);
+ AliasScopes.reserve(LocalVars.size());
+ MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
+ for (size_t I = 0; I < LocalVars.size(); I++) {
+ MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
+ AliasScopes.push_back(Scope);
+ }
+ NoAliasList.append(&AliasScopes[1], AliasScopes.end());
+ }
+
// Replace uses of ith variable with a constantexpr to the ith field of the
// instance that will be allocated by AMDGPUMachineFunction
Type *I32 = Type::getInt32Ty(Ctx);
@@ -313,7 +329,15 @@ class AMDGPULowerModuleLDS : public ModulePass {
uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I);
Align A = commonAlignment(StructAlign, Off);
- refineUsesAlignment(GEP, A, DL);
+
+ if (I)
+ NoAliasList[I - 1] = AliasScopes[I - 1];
+ MDNode *NoAlias =
+ NoAliasList.empty() ? nullptr : MDNode::get(Ctx, NoAliasList);
+ MDNode *AliasScope =
+ AliasScopes.empty() ? nullptr : MDNode::get(Ctx, {AliasScopes[I]});
+
+ refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
}
// Mark kernels with asm that reads the address of the allocated structure
@@ -334,12 +358,25 @@ class AMDGPULowerModuleLDS : public ModulePass {
return true;
}
- void refineUsesAlignment(Value *Ptr, Align A, const DataLayout &DL,
- unsigned MaxDepth = 5) {
- if (!MaxDepth || A == 1)
+ void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
+ MDNode *AliasScope, MDNode *NoAlias,
+ unsigned MaxDepth = 5) {
+ if (!MaxDepth || (A == 1 && !AliasScope))
return;
for (User *U : Ptr->users()) {
+ if (auto *I = dyn_cast<Instruction>(U)) {
+ if (AliasScope && I->mayReadOrWriteMemory()) {
+ MDNode *AS = I->getMetadata(LLVMContext::MD_alias_scope);
+ AS = MDNode::concatenate(AS, AliasScope);
+ I->setMetadata(LLVMContext::MD_alias_scope, AS);
+
+ MDNode *NA = I->getMetadata(LLVMContext::MD_noalias);
+ NA = MDNode::concatenate(NA, NoAlias);
+ I->setMetadata(LLVMContext::MD_noalias, NA);
+ }
+ }
+
if (auto *LI = dyn_cast<LoadInst>(U)) {
LI->setAlignment(std::max(A, LI->getAlign()));
continue;
@@ -364,17 +401,19 @@ class AMDGPULowerModuleLDS : public ModulePass {
if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
APInt Off(BitWidth, 0);
- if (GEP->getPointerOperand() == Ptr &&
- GEP->accumulateConstantOffset(DL, Off)) {
- Align GA = commonAlignment(A, Off.getLimitedValue());
- refineUsesAlignment(GEP, GA, DL, MaxDepth - 1);
+ if (GEP->getPointerOperand() == Ptr) {
+ Align GA;
+ if (GEP->accumulateConstantOffset(DL, Off))
+ GA = commonAlignment(A, Off.getLimitedValue());
+ refineUsesAlignmentAndAA(GEP, GA, DL, AliasScope, NoAlias,
+ MaxDepth - 1);
}
continue;
}
if (auto *I = dyn_cast<Instruction>(U)) {
if (I->getOpcode() == Instruction::BitCast ||
I->getOpcode() == Instruction::AddrSpaceCast)
- refineUsesAlignment(I, A, DL, MaxDepth - 1);
+ refineUsesAlignmentAndAA(I, A, DL, AliasScope, NoAlias, MaxDepth - 1);
}
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
new file mode 100644
index 0000000000000..701cfc99cf3d9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
@@ -0,0 +1,49 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+ at a = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+ at b = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+
+; CHECK-LABEL: @no_clobber_ds_load_stores_x2_preexisting_aa
+; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !tbaa !0, !alias.scope !5, !noalias !10
+; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !tbaa !0, !alias.scope !5, !noalias !10
+; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !tbaa !0, !alias.scope !10, !noalias !5
+; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !tbaa !0, !alias.scope !10, !noalias !5
+
+define amdgpu_kernel void @no_clobber_ds_load_stores_x2_preexisting_aa(i32 addrspace(1)* %arg, i32 %i) {
+bb:
+ store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4, !alias.scope !0, !noalias !3, !tbaa !5
+ %gep.a = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 %i
+ %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !0, !noalias !3, !tbaa !5
+ store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4, !alias.scope !3, !noalias !0, !tbaa !5
+ %gep.b = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 %i
+ %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !3, !noalias !0, !tbaa !5
+ %val = add i32 %val.a, %val.b
+ store i32 %val, i32 addrspace(1)* %arg, align 4
+ ret void
+}
+
+!0 = !{!1}
+!1 = distinct !{!1, !2}
+!2 = distinct !{!2}
+!3 = !{!4}
+!4 = distinct !{!4, !2}
+!5 = !{!6, !7, i64 0}
+!6 = !{!"no_clobber_ds_load_stores_x2_preexisting_aa", !7, i64 0}
+!7 = !{!"int", !8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C++ TBAA"}
+
+; CHECK:!0 = !{!1, !2, i64 0}
+; CHECK:!1 = !{!"no_clobber_ds_load_stores_x2_preexisting_aa", !2, i64 0}
+; CHECK:!2 = !{!"int", !3, i64 0}
+; CHECK:!3 = !{!"omnipotent char", !4, i64 0}
+; CHECK:!4 = !{!"Simple C++ TBAA"}
+; CHECK:!5 = !{!6, !8}
+; CHECK:!6 = distinct !{!6, !7}
+; CHECK:!7 = distinct !{!7}
+; CHECK:!8 = distinct !{!8, !9}
+; CHECK:!9 = distinct !{!9}
+; CHECK:!10 = !{!11, !12}
+; CHECK:!11 = distinct !{!11, !7}
+; CHECK:!12 = distinct !{!12, !9}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
new file mode 100644
index 0000000000000..6b24510fc9253
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
@@ -0,0 +1,77 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -O3 < %s | FileCheck -check-prefix=GCN %s
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+ at a = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+ at b = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+ at c = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+
+; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x2:
+; GCN: ds_write2st64_b32
+; GCN: ds_read2st64_b32
+
+; CHECK-LABEL: @no_clobber_ds_load_stores_x2
+; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !alias.scope !0, !noalias !3
+; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !0, !noalias !3
+; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !alias.scope !3, !noalias !0
+; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !3, !noalias !0
+
+define amdgpu_kernel void @no_clobber_ds_load_stores_x2(i32 addrspace(1)* %arg, i32 %i) {
+bb:
+ store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4
+ %gep.a = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 %i
+ %val.a = load i32, i32 addrspace(3)* %gep.a, align 4
+ store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4
+ %gep.b = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 %i
+ %val.b = load i32, i32 addrspace(3)* %gep.b, align 4
+ %val = add i32 %val.a, %val.b
+ store i32 %val, i32 addrspace(1)* %arg, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x3:
+; GCN-DAG: ds_write2st64_b32
+; GCN-DAG: ds_write_b32
+; GCN-DAG: ds_read2st64_b32
+; GCN-DAG: ds_read_b32
+
+; CHECK-LABEL: @no_clobber_ds_load_stores_x3
+; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !alias.scope !5, !noalias !8
+; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !5, !noalias !8
+; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !alias.scope !11, !noalias !12
+; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !11, !noalias !12
+; CHECK: store i32 3, i32 addrspace(3)* %2, align 16, !alias.scope !13, !noalias !14
+; CHECK: %val.c = load i32, i32 addrspace(3)* %gep.c, align 4, !alias.scope !13, !noalias !14
+
+define amdgpu_kernel void @no_clobber_ds_load_stores_x3(i32 addrspace(1)* %arg, i32 %i) {
+bb:
+ store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4
+ %gep.a = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 %i
+ %val.a = load i32, i32 addrspace(3)* %gep.a, align 4
+ store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4
+ %gep.b = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 %i
+ %val.b = load i32, i32 addrspace(3)* %gep.b, align 4
+ store i32 3, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @c, i32 0, i32 0), align 4
+ %gep.c = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @c, i32 0, i32 %i
+ %val.c = load i32, i32 addrspace(3)* %gep.c, align 4
+ %val.1 = add i32 %val.a, %val.b
+ %val = add i32 %val.1, %val.c
+ store i32 %val, i32 addrspace(1)* %arg, align 4
+ ret void
+}
+
+; CHECK: !0 = !{!1}
+; CHECK: !1 = distinct !{!1, !2}
+; CHECK: !2 = distinct !{!2}
+; CHECK: !3 = !{!4}
+; CHECK: !4 = distinct !{!4, !2}
+; CHECK: !5 = !{!6}
+; CHECK: !6 = distinct !{!6, !7}
+; CHECK: !7 = distinct !{!7}
+; CHECK: !8 = !{!9, !10}
+; CHECK: !9 = distinct !{!9, !7}
+; CHECK: !10 = distinct !{!10, !7}
+; CHECK: !11 = !{!9}
+; CHECK: !12 = !{!6, !10}
+; CHECK: !13 = !{!10}
+; CHECK: !14 = !{!6, !9}
More information about the llvm-commits
mailing list