[llvm] cb41ee9 - [AMDGPU] Fix promote alloca with double use in the same insn

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 11 11:57:53 PST 2021


Author: Stanislav Mekhanoshin
Date: 2021-02-11T11:42:25-08:00
New Revision: cb41ee92dab809b3389de286a51127723a35834d

URL: https://github.com/llvm/llvm-project/commit/cb41ee92dab809b3389de286a51127723a35834d
DIFF: https://github.com/llvm/llvm-project/commit/cb41ee92dab809b3389de286a51127723a35834d.diff

LOG: [AMDGPU] Fix promote alloca with double use in the same insn

If an instruction has more than one pointer operand derived from
the same promoted alloca, we would fix it for one argument only
and skip the second use, considering this user already done.

Fix this by deferring processing of memory intrinsics until all
potential operands are replaced.
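
For illustration, a minimal sketch of the offending pattern (value
names are illustrative; it mirrors the new test added below). Both
pointer arguments of a single memcpy trace back to the same alloca,
so previously only one of the two uses was rewritten to the promoted
LDS pointer before the call was considered done:

    %r = alloca double, align 8
    %arrayidx1 = getelementptr inbounds double, double* %r, i32 1
    %dst = bitcast double* %arrayidx1 to i8*
    %arrayidx2 = getelementptr inbounds double, double* %r, i32 %c
    %src = bitcast double* %arrayidx2 to i8*
    ; both %dst and %src derive from %r and must be retargeted
    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %dst, i8* align 8 %src, i64 16, i1 false)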

Fixes: SWDEV-271358

Differential Revision: https://reviews.llvm.org/D96386

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
    llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 2a6ea838efc0..d104ddc1bf99 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -943,6 +943,8 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
   I.replaceAllUsesWith(Offset);
   I.eraseFromParent();
 
+  SmallVector<IntrinsicInst *> DeferredIntrs;
+
   for (Value *V : WorkList) {
     CallInst *Call = dyn_cast<CallInst>(V);
     if (!Call) {
@@ -997,22 +999,13 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
       // These intrinsics are for address space 0 only
       Intr->eraseFromParent();
       continue;
-    case Intrinsic::memcpy: {
-      MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
-      Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlign(),
-                           MemCpy->getRawSource(), MemCpy->getSourceAlign(),
-                           MemCpy->getLength(), MemCpy->isVolatile());
-      Intr->eraseFromParent();
+    case Intrinsic::memcpy:
+    case Intrinsic::memmove:
+      // These have 2 pointer operands. In case the second pointer also needs
+      // to be replaced, we defer processing of these intrinsics until all
+      // other values are processed.
+      DeferredIntrs.push_back(Intr);
       continue;
-    }
-    case Intrinsic::memmove: {
-      MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
-      Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlign(),
-                            MemMove->getRawSource(), MemMove->getSourceAlign(),
-                            MemMove->getLength(), MemMove->isVolatile());
-      Intr->eraseFromParent();
-      continue;
-    }
     case Intrinsic::memset: {
       MemSetInst *MemSet = cast<MemSetInst>(Intr);
       Builder.CreateMemSet(
@@ -1050,6 +1043,27 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
       llvm_unreachable("Don't know how to promote alloca intrinsic use.");
     }
   }
+
+  for (IntrinsicInst *Intr : DeferredIntrs) {
+    Builder.SetInsertPoint(Intr);
+    Intrinsic::ID ID = Intr->getIntrinsicID();
+    assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
+
+    MemTransferInst *MI = cast<MemTransferInst>(Intr);
+    auto *B =
+      Builder.CreateMemTransferInst(ID, MI->getRawDest(), MI->getDestAlign(),
+                                    MI->getRawSource(), MI->getSourceAlign(),
+                                    MI->getLength(), MI->isVolatile());
+
+    for (unsigned I = 1; I != 3; ++I) {
+      if (uint64_t Bytes = Intr->getDereferenceableBytes(I)) {
+        B->addDereferenceableAttr(I, Bytes);
+      }
+    }
+
+    Intr->eraseFromParent();
+  }
+
   return true;
 }
 

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
index 4515447b7497..e283029344bd 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
@@ -2,9 +2,11 @@
 
 declare void @llvm.memcpy.p0i8.p1i8.i32(i8* nocapture, i8 addrspace(1)* nocapture, i32, i1) #0
 declare void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture, i32, i1) #0
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) #0
 
 declare void @llvm.memmove.p0i8.p1i8.i32(i8* nocapture, i8 addrspace(1)* nocapture, i32, i1) #0
 declare void @llvm.memmove.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture, i32, i1) #0
+declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) #0
 
 declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i1) #0
 
@@ -61,5 +63,35 @@ define amdgpu_kernel void @promote_with_objectsize(i32 addrspace(1)* %out) #0 {
   ret void
 }
 
+; CHECK-LABEL: @promote_alloca_used_twice_in_memcpy(
+; CHECK: %i = bitcast double addrspace(3)* %arrayidx1 to i8 addrspace(3)*
+; CHECK: %i1 = bitcast double addrspace(3)* %arrayidx2 to i8 addrspace(3)*
+; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* align 8 dereferenceable(16) %i, i8 addrspace(3)* align 8 dereferenceable(16) %i1, i64 16, i1 false)
+define amdgpu_kernel void @promote_alloca_used_twice_in_memcpy(i32 %c) {
+entry:
+  %r = alloca double, align 8
+  %arrayidx1 = getelementptr inbounds double, double* %r, i32 1
+  %i = bitcast double* %arrayidx1 to i8*
+  %arrayidx2 = getelementptr inbounds double, double* %r, i32 %c
+  %i1 = bitcast double* %arrayidx2 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 dereferenceable(16) %i, i8* align 8 dereferenceable(16) %i1, i64 16, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: @promote_alloca_used_twice_in_memmove(
+; CHECK: %i = bitcast double addrspace(3)* %arrayidx1 to i8 addrspace(3)*
+; CHECK: %i1 = bitcast double addrspace(3)* %arrayidx2 to i8 addrspace(3)*
+; CHECK: call void @llvm.memmove.p3i8.p3i8.i64(i8 addrspace(3)* align 8 dereferenceable(16) %i, i8 addrspace(3)* align 8 dereferenceable(16) %i1, i64 16, i1 false)
+define amdgpu_kernel void @promote_alloca_used_twice_in_memmove(i32 %c) {
+entry:
+  %r = alloca double, align 8
+  %arrayidx1 = getelementptr inbounds double, double* %r, i32 1
+  %i = bitcast double* %arrayidx1 to i8*
+  %arrayidx2 = getelementptr inbounds double, double* %r, i32 %c
+  %i1 = bitcast double* %arrayidx2 to i8*
+  call void @llvm.memmove.p0i8.p0i8.i64(i8* align 8 dereferenceable(16) %i, i8* align 8 dereferenceable(16) %i1, i64 16, i1 false)
+  ret void
+}
+
 attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" }
 attributes #1 = { nounwind readnone }

