[llvm] cbdf624 - [AMDGPU] Correctly merge alias.scope and noalias metadata for memops

Tue Sep 21 11:06:47 PDT 2021

Author: Brendon Cahoon
Date: 2021-09-21T13:02:01-05:00
New Revision: cbdf624bb82b57dc9372353459cb04e615418697

URL: https://github.com/llvm/llvm-project/commit/cbdf624bb82b57dc9372353459cb04e615418697
DIFF: https://github.com/llvm/llvm-project/commit/cbdf624bb82b57dc9372353459cb04e615418697.diff

LOG: [AMDGPU] Correctly merge alias.scope and noalias metadata for memops

When adding alias.scope and noalias metadata to a memcpy function,
the alias.scope and noalias metadata from the operands are merged.
The rule for merging alias.scope is to take the intersection of
the domains and the union of the scopes within those domains.
The rule for merging noalias is to take the intersection.

The bug is that AMDGPULowerModuleLDS was using concatenation for
both alias.scope and noalias. For example, suppose f1 and f2 are added
to the LDS structure and there is a memcpy(f2, f1, sizeof(f1)).
Then concatenation creates noalias metadata for the memcpy that
includes both {f1, f2}. That means that the memcpy is assumed
not to alias a prior load of f2, which enables the optimizer to
remove a load of f2 that occurs after the memcpy.

The function MDNode::getMostGenericAliasScope defines the semantics
for alias.scope. There is a function, combineMetadata in Local.cpp,
that uses intersect for noalias.

Differential Revision: https://reviews.llvm.org/D110049

Added: 
    llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
    llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 3a7ca9834efc..12d6d35a6917 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -373,11 +373,12 @@ class AMDGPULowerModuleLDS : public ModulePass {
       if (auto *I = dyn_cast<Instruction>(U)) {
         if (AliasScope && I->mayReadOrWriteMemory()) {
           MDNode *AS = I->getMetadata(LLVMContext::MD_alias_scope);
-          AS = MDNode::concatenate(AS, AliasScope);
+          AS = (AS ? MDNode::getMostGenericAliasScope(AS, AliasScope)
+                   : AliasScope);
           I->setMetadata(LLVMContext::MD_alias_scope, AS);
 
           MDNode *NA = I->getMetadata(LLVMContext::MD_noalias);
-          NA = MDNode::concatenate(NA, NoAlias);
+          NA = (NA ? MDNode::intersect(NA, NoAlias) : NoAlias);
           I->setMetadata(LLVMContext::MD_noalias, NA);
         }
       }

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
new file mode 100644
index 000000000000..7ba370596d31
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
@@ -0,0 +1,53 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -O3 < %s | FileCheck -check-prefix=GCN %s
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+%vec_type = type { %vec_base }
+%vec_base = type { %union.anon }
+%union.anon = type { %"vec_base<char, 3>::n_vec_" }
+%"vec_base<char, 3>::n_vec_" = type { [3 x i8] }
+
+$_f1 = comdat any
+$_f2 = comdat any
+@_f1 = linkonce_odr hidden local_unnamed_addr addrspace(3) global %vec_type undef, comdat, align 1
+@_f2 = linkonce_odr hidden local_unnamed_addr addrspace(3) global %vec_type undef, comdat, align 1
+
+; GCN-LABEL: @test
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1
+; GCN-NEXT: global_store_byte v{{[0-9]+}}, [[REG]]
+
+; CHECK-LABEL: @test
+; CHECK: store i8 3, i8 addrspace(3)* %0, align 4, !alias.scope !0, !noalias !3
+; CHECK: tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* noundef align 1 dereferenceable(3) %2, i8 addrspace(3)* noundef align 1 dereferenceable(3) %1, i64 3, i1 false), !alias.scope !6, !noalias !7
+; CHECK: %4 = load i8, i8 addrspace(3)* %3, align 4, !alias.scope !8, !noalias !9
+; CHECK: tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* noundef align 1 dereferenceable(3) %7, i8 addrspace(3)* noundef align 1 dereferenceable(3) %6, i64 3, i1 false), !alias.scope !6, !noalias !7
+; CHECK: %9 = load i8, i8 addrspace(3)* %8, align 4, !alias.scope !8, !noalias !9
+
+define protected amdgpu_kernel void @test(i8 addrspace(1)* nocapture %ptr.coerce) local_unnamed_addr #0 {
+entry:
+  store i8 3, i8 addrspace(3)* getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), align 1
+  tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* noundef align 1 dereferenceable(3) getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), i8 addrspace(3)* noundef align 1 dereferenceable(3) getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), i64 3, i1 false)
+  %0 = load i8, i8 addrspace(3)* getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), align 1
+  %cmp.i.i = icmp eq i8 %0, 3
+  store i8 2, i8 addrspace(3)* getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), align 1
+  tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* noundef align 1 dereferenceable(3) getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), i8 addrspace(3)* noundef align 1 dereferenceable(3) getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), i64 3, i1 false)
+  %1 = load i8, i8 addrspace(3)* getelementptr inbounds (%vec_type, %vec_type addrspace(3)* @_f2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0), align 1
+  %cmp.i.i19 = icmp eq i8 %1, 2
+  %2 = and i1 %cmp.i.i19, %cmp.i.i
+  %frombool8 = zext i1 %2 to i8
+  store i8 %frombool8, i8 addrspace(1)* %ptr.coerce, align 1
+  ret void
+}
+
+declare void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* noalias nocapture writeonly, i8 addrspace(3)* noalias nocapture readonly, i64, i1 immarg) #1
+
+; CHECK:!0 = !{!1}
+; CHECK:!1 = distinct !{!1, !2}
+; CHECK:!2 = distinct !{!2}
+; CHECK:!3 = !{!4, !5}
+; CHECK:!4 = distinct !{!4, !2}
+; CHECK:!5 = distinct !{!5, !2}
+; CHECK:!6 = !{!5, !1}
+; CHECK:!7 = !{!4}
+; CHECK:!8 = !{!5}
+; CHECK:!9 = !{!1, !4}

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
index 701cfc99cf3d..8543088da4ec 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
@@ -5,10 +5,10 @@
 @b = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
 
 ; CHECK-LABEL: @no_clobber_ds_load_stores_x2_preexisting_aa
-; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !tbaa !0, !alias.scope !5, !noalias !10
-; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !tbaa !0, !alias.scope !5, !noalias !10
-; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !tbaa !0, !alias.scope !10, !noalias !5
-; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !tbaa !0, !alias.scope !10, !noalias !5
+; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !tbaa !0, !noalias !5
+; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !tbaa !0, !noalias !5
+; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !tbaa !0, !noalias !5
+; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !tbaa !0, !noalias !5
 
 define amdgpu_kernel void @no_clobber_ds_load_stores_x2_preexisting_aa(i32 addrspace(1)* %arg, i32 %i) {
 bb:
@@ -39,11 +39,4 @@ bb:
 ; CHECK:!2 = !{!"int", !3, i64 0}
 ; CHECK:!3 = !{!"omnipotent char", !4, i64 0}
 ; CHECK:!4 = !{!"Simple C++ TBAA"}
-; CHECK:!5 = !{!6, !8}
-; CHECK:!6 = distinct !{!6, !7}
-; CHECK:!7 = distinct !{!7}
-; CHECK:!8 = distinct !{!8, !9}
-; CHECK:!9 = distinct !{!9}
-; CHECK:!10 = !{!11, !12}
-; CHECK:!11 = distinct !{!11, !7}
-; CHECK:!12 = distinct !{!12, !9}
+; CHECK:!5 = !{}