[llvm] [MemCpyOptimizer] Fix scalable vector crash calling performStackMoveOptzn. (PR #67632)
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 28 09:54:04 PDT 2023
https://github.com/topperc updated https://github.com/llvm/llvm-project/pull/67632
From 4f705251a58b52c19429568a06e05dd629bffbe5 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Wed, 27 Sep 2023 21:32:03 -0700
Subject: [PATCH 1/3] [MemCpyOptimizer] Fix scalable vector crash calling
performStackMoveOptzn.
This changes performStackMoveOptzn to take a TypeSize instead of
uint64_t to avoid an implicit conversion when called from processStoreOfLoad.
performStackMoveOptzn will return false if the TypeSize is scalable.
---
.../llvm/Transforms/Scalar/MemCpyOptimizer.h | 2 +-
llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 10 +++++++---
llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll | 16 ++++++++++++++++
3 files changed, 24 insertions(+), 4 deletions(-)
diff --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
index 3e8a5bf6a5bd56e..6c809bc881d050d 100644
--- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -83,7 +83,7 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
bool moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI);
bool performStackMoveOptzn(Instruction *Load, Instruction *Store,
AllocaInst *DestAlloca, AllocaInst *SrcAlloca,
- uint64_t Size, BatchAAResults &BAA);
+ TypeSize Size, BatchAAResults &BAA);
void eraseInstruction(Instruction *I);
bool iterateOnFunction(Function &F);
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 4db9d1b6d309afd..f1d0864477586a1 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1428,8 +1428,12 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
// allocas that aren't captured.
bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
AllocaInst *DestAlloca,
- AllocaInst *SrcAlloca, uint64_t Size,
+ AllocaInst *SrcAlloca, TypeSize Size,
BatchAAResults &BAA) {
+ // We can't optimize scalable types.
+ if (Size.isScalable())
+ return false;
+
LLVM_DEBUG(dbgs() << "Stack Move: Attempting to optimize:\n"
<< *Store << "\n");
@@ -1766,8 +1770,8 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
ConstantInt *Len = dyn_cast<ConstantInt>(M->getLength());
if (Len == nullptr)
return false;
- if (performStackMoveOptzn(M, M, DestAlloca, SrcAlloca, Len->getZExtValue(),
- BAA)) {
+ if (performStackMoveOptzn(M, M, DestAlloca, SrcAlloca,
+ TypeSize::getFixed(Len->getZExtValue()), BAA)) {
// Avoid invalidating the iterator.
BBI = M->getNextNonDebugInstruction()->getIterator();
eraseInstruction(M);
diff --git a/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll b/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll
index 84b06f6071ff69b..821da24d44e73b3 100644
--- a/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll
+++ b/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll
@@ -87,3 +87,19 @@ define void @callslotoptzn(<vscale x 4 x float> %val, ptr %out) {
declare <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
declare void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> , <vscale x 4 x ptr> , i32, <vscale x 4 x i1>)
+
+; Make sure we don't crash calling performStackMoveOptzn from processStoreOfLoad.
+define void @load_store(<vscale x 4 x i32> %x) {
+ %src = alloca <vscale x 4 x i32>
+ %dest = alloca <vscale x 4 x i32>
+ store <vscale x 4 x i32> %x, ptr %src
+ %1 = call i32 @use_nocapture(ptr nocapture %src)
+
+ %src.val = load <vscale x 4 x i32>, ptr %src
+ store <vscale x 4 x i32> %src.val, ptr %dest
+
+ %2 = call i32 @use_nocapture(ptr nocapture %dest)
+ ret void
+}
+
+declare i32 @use_nocapture(ptr nocapture)
From 393e496a98f1f7116038a2a8b255dc7517a1a5a3 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Thu, 28 Sep 2023 09:48:27 -0700
Subject: [PATCH 2/3] !fixup add CHECK lines to new test case.
---
llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll b/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll
index 821da24d44e73b3..ad36be46d4a73f2 100644
--- a/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll
+++ b/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll
@@ -90,6 +90,16 @@ declare void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> , <vsca
; Make sure we don't crash calling performStackMoveOptzn from processStoreOfLoad.
define void @load_store(<vscale x 4 x i32> %x) {
+; CHECK-LABEL: @load_store(
+; CHECK-NEXT: [[SRC:%.*]] = alloca <vscale x 4 x i32>, align 16
+; CHECK-NEXT: [[DEST:%.*]] = alloca <vscale x 4 x i32>, align 16
+; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[SRC]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture [[SRC]])
+; CHECK-NEXT: [[SRC_VAL:%.*]] = load <vscale x 4 x i32>, ptr [[SRC]], align 16
+; CHECK-NEXT: store <vscale x 4 x i32> [[SRC_VAL]], ptr [[DEST]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture [[DEST]])
+; CHECK-NEXT: ret void
+;
%src = alloca <vscale x 4 x i32>
%dest = alloca <vscale x 4 x i32>
store <vscale x 4 x i32> %x, ptr %src
From 77332f6de99b33c8d6839078cb7b828675461a37 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Thu, 28 Sep 2023 09:53:25 -0700
Subject: [PATCH 3/3] [MemCpyOptimizer] Support scalable TypeSize in
performStackMoveOptzn.
---
.../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 9 ++-----
llvm/test/Transforms/MemCpyOpt/stack-move.ll | 27 +++++++++++++++++++
.../Transforms/MemCpyOpt/vscale-crashes.ll | 26 ------------------
3 files changed, 29 insertions(+), 33 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index f1d0864477586a1..9c19c2973438c9d 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1430,10 +1430,6 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
AllocaInst *DestAlloca,
AllocaInst *SrcAlloca, TypeSize Size,
BatchAAResults &BAA) {
- // We can't optimize scalable types.
- if (Size.isScalable())
- return false;
-
LLVM_DEBUG(dbgs() << "Stack Move: Attempting to optimize:\n"
<< *Store << "\n");
@@ -1446,13 +1442,12 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
// Check that copy is full with static size.
const DataLayout &DL = DestAlloca->getModule()->getDataLayout();
std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSize(DL);
- if (!SrcSize || SrcSize->isScalable() || Size != SrcSize->getFixedValue()) {
+ if (!SrcSize || Size != *SrcSize) {
LLVM_DEBUG(dbgs() << "Stack Move: Source alloca size mismatch\n");
return false;
}
std::optional<TypeSize> DestSize = DestAlloca->getAllocationSize(DL);
- if (!DestSize || DestSize->isScalable() ||
- Size != DestSize->getFixedValue()) {
+ if (!DestSize || Size != *DestSize) {
LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n");
return false;
}
diff --git a/llvm/test/Transforms/MemCpyOpt/stack-move.ll b/llvm/test/Transforms/MemCpyOpt/stack-move.ll
index dee630f470d0053..6089c0a4d7cf507 100644
--- a/llvm/test/Transforms/MemCpyOpt/stack-move.ll
+++ b/llvm/test/Transforms/MemCpyOpt/stack-move.ll
@@ -113,6 +113,33 @@ define void @load_store() {
ret void
}
+; Test scalable vectors.
+define void @load_store_scalable(<vscale x 4 x i32> %x) {
+; CHECK-LABEL: define void @load_store_scalable
+; CHECK-SAME: (<vscale x 4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[SRC:%.*]] = alloca <vscale x 4 x i32>, align 16
+; CHECK-NEXT: store <vscale x 4 x i32> [[X]], ptr [[SRC]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture [[SRC]])
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture [[SRC]])
+; CHECK-NEXT: ret void
+;
+ %src = alloca <vscale x 4 x i32>
+ %dest = alloca <vscale x 4 x i32>
+ call void @llvm.lifetime.start.p0(i64 -1, ptr nocapture %src)
+ call void @llvm.lifetime.start.p0(i64 -1, ptr nocapture %dest)
+ store <vscale x 4 x i32> %x, ptr %src
+ %1 = call i32 @use_nocapture(ptr nocapture %src)
+
+ %src.val = load <vscale x 4 x i32>, ptr %src
+ store <vscale x 4 x i32> %src.val, ptr %dest
+
+ %2 = call i32 @use_nocapture(ptr nocapture %dest)
+
+ call void @llvm.lifetime.end.p0(i64 -1, ptr nocapture %src)
+ call void @llvm.lifetime.end.p0(i64 -1, ptr nocapture %dest)
+ ret void
+}
+
; Tests that merging two allocas shouldn't be more poisonous, smaller aligned src is valid.
define void @align_up() {
; CHECK-LABEL: define void @align_up() {
diff --git a/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll b/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll
index ad36be46d4a73f2..84b06f6071ff69b 100644
--- a/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll
+++ b/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll
@@ -87,29 +87,3 @@ define void @callslotoptzn(<vscale x 4 x float> %val, ptr %out) {
declare <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
declare void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> , <vscale x 4 x ptr> , i32, <vscale x 4 x i1>)
-
-; Make sure we don't crash calling performStackMoveOptzn from processStoreOfLoad.
-define void @load_store(<vscale x 4 x i32> %x) {
-; CHECK-LABEL: @load_store(
-; CHECK-NEXT: [[SRC:%.*]] = alloca <vscale x 4 x i32>, align 16
-; CHECK-NEXT: [[DEST:%.*]] = alloca <vscale x 4 x i32>, align 16
-; CHECK-NEXT: store <vscale x 4 x i32> [[X:%.*]], ptr [[SRC]], align 16
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @use_nocapture(ptr nocapture [[SRC]])
-; CHECK-NEXT: [[SRC_VAL:%.*]] = load <vscale x 4 x i32>, ptr [[SRC]], align 16
-; CHECK-NEXT: store <vscale x 4 x i32> [[SRC_VAL]], ptr [[DEST]], align 16
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @use_nocapture(ptr nocapture [[DEST]])
-; CHECK-NEXT: ret void
-;
- %src = alloca <vscale x 4 x i32>
- %dest = alloca <vscale x 4 x i32>
- store <vscale x 4 x i32> %x, ptr %src
- %1 = call i32 @use_nocapture(ptr nocapture %src)
-
- %src.val = load <vscale x 4 x i32>, ptr %src
- store <vscale x 4 x i32> %src.val, ptr %dest
-
- %2 = call i32 @use_nocapture(ptr nocapture %dest)
- ret void
-}
-
-declare i32 @use_nocapture(ptr nocapture)
More information about the llvm-commits
mailing list