[llvm] [SROA] Optimize reloaded values in allocas that escape into readonly nocapture calls. (PR #116645)

David Green via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 28 02:28:57 PST 2024


https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/116645

>From eedffc94a106dc34546a645cc5fac75dff91e6be Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Mon, 18 Nov 2024 13:58:31 +0000
Subject: [PATCH 1/5] [SROA] Escaping readonly nocapture tests. NFC

---
 .../SROA/non-capturing-call-readonly.ll       |   8 +-
 .../test/Transforms/SROA/readonlynocapture.ll | 323 ++++++++++++++++++
 2 files changed, 327 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/Transforms/SROA/readonlynocapture.ll

diff --git a/llvm/test/Transforms/SROA/non-capturing-call-readonly.ll b/llvm/test/Transforms/SROA/non-capturing-call-readonly.ll
index 3756afadbf884a..87862b929a7511 100644
--- a/llvm/test/Transforms/SROA/non-capturing-call-readonly.ll
+++ b/llvm/test/Transforms/SROA/non-capturing-call-readonly.ll
@@ -364,12 +364,12 @@ define i32 @alloca_used_in_maybe_throwing_call(ptr %data, i64 %n) personality pt
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[I0:%.*]] = invoke i32 @user_of_alloca(ptr [[RETVAL]])
-; CHECK-NEXT:    to label [[CONT:%.*]] unwind label [[UW:%.*]]
+; CHECK-NEXT:            to label [[CONT:%.*]] unwind label [[UW:%.*]]
 ; CHECK:       cont:
 ; CHECK-NEXT:    br label [[END:%.*]]
 ; CHECK:       uw:
 ; CHECK-NEXT:    [[I1:%.*]] = landingpad { ptr, i32 }
-; CHECK-NEXT:    catch ptr null
+; CHECK-NEXT:            catch ptr null
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
 ; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr [[RETVAL]], align 4
@@ -424,10 +424,10 @@ define i32 @alloca_used_in_maybe_throwing_call_with_same_dests(ptr %data, i64 %n
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[I0:%.*]] = invoke i32 @user_of_alloca(ptr [[RETVAL]])
-; CHECK-NEXT:    to label [[END:%.*]] unwind label [[UW:%.*]]
+; CHECK-NEXT:            to label [[END:%.*]] unwind label [[UW:%.*]]
 ; CHECK:       uw:
 ; CHECK-NEXT:    [[I1:%.*]] = landingpad { ptr, i32 }
-; CHECK-NEXT:    catch ptr null
+; CHECK-NEXT:            catch ptr null
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
 ; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr [[RETVAL]], align 4
diff --git a/llvm/test/Transforms/SROA/readonlynocapture.ll b/llvm/test/Transforms/SROA/readonlynocapture.ll
new file mode 100644
index 00000000000000..6a48402afcc517
--- /dev/null
+++ b/llvm/test/Transforms/SROA/readonlynocapture.ll
@@ -0,0 +1,323 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=sroa -S | FileCheck %s
+
+declare void @callee(ptr nocapture readonly %p)
+
+define i32 @simple() {
+; CHECK-LABEL: @simple(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 0, ptr [[A]], align 4
+; CHECK-NEXT:    call void @callee(ptr [[A]])
+; CHECK-NEXT:    [[L1:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    ret i32 [[L1]]
+;
+  %a = alloca i32
+  store i32 0, ptr %a
+  call void @callee(ptr %a)
+  %l1 = load i32, ptr %a
+  ret i32 %l1
+}
+
+define i32 @smallbig() {
+; CHECK-LABEL: @smallbig(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i8 0, ptr [[A]], align 1
+; CHECK-NEXT:    call void @callee(ptr [[A]])
+; CHECK-NEXT:    [[L1:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    ret i32 [[L1]]
+;
+  %a = alloca i32
+  store i8 0, ptr %a
+  call void @callee(ptr %a)
+  %l1 = load i32, ptr %a
+  ret i32 %l1
+}
+
+define i32 @twoalloc() {
+; CHECK-LABEL: @twoalloc(
+; CHECK-NEXT:    [[A:%.*]] = alloca { i32, i32 }, align 8
+; CHECK-NEXT:    store i32 0, ptr [[A]], align 4
+; CHECK-NEXT:    [[B:%.*]] = getelementptr i32, ptr [[A]], i32 1
+; CHECK-NEXT:    store i32 1, ptr [[B]], align 4
+; CHECK-NEXT:    call void @callee(ptr [[A]])
+; CHECK-NEXT:    [[L1:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    [[L2:%.*]] = load i32, ptr [[B]], align 4
+; CHECK-NEXT:    ret i32 [[L2]]
+;
+  %a = alloca {i32, i32}
+  store i32 0, ptr %a
+  %b = getelementptr i32, ptr %a, i32 1
+  store i32 1, ptr %b
+  call void @callee(ptr %a)
+  %l1 = load i32, ptr %a
+  %l2 = load i32, ptr %b
+  ret i32 %l2
+}
+
+define i32 @twostore() {
+; CHECK-LABEL: @twostore(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 1, ptr [[A]], align 4
+; CHECK-NEXT:    call void @callee(ptr [[A]])
+; CHECK-NEXT:    store i32 2, ptr [[A]], align 4
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    ret i32 [[L]]
+;
+  %a = alloca i32
+  store i32 1, ptr %a
+  call void @callee(ptr %a)
+  store i32 2, ptr %a
+  %l = load i32, ptr %a
+  ret i32 %l
+}
+
+define float @differenttype() {
+; CHECK-LABEL: @differenttype(
+; CHECK-NEXT:    [[A:%.*]] = alloca { i32, i32 }, align 8
+; CHECK-NEXT:    [[B:%.*]] = getelementptr i32, ptr [[A]], i32 1
+; CHECK-NEXT:    store i32 1, ptr [[B]], align 4
+; CHECK-NEXT:    call void @callee(ptr [[A]])
+; CHECK-NEXT:    [[L2:%.*]] = load float, ptr [[B]], align 4
+; CHECK-NEXT:    ret float [[L2]]
+;
+  %a = alloca {i32, i32}
+  %b = getelementptr i32, ptr %a, i32 1
+  store i32 1, ptr %b
+  call void @callee(ptr %a)
+  %l2 = load float, ptr %b
+  ret float %l2
+}
+
+define i32 @twoalloc_store64(i64 %x) {
+; CHECK-LABEL: @twoalloc_store64(
+; CHECK-NEXT:    [[A:%.*]] = alloca i64, align 8
+; CHECK-NEXT:    store i64 [[X:%.*]], ptr [[A]], align 4
+; CHECK-NEXT:    call void @callee(ptr [[A]])
+; CHECK-NEXT:    [[L1:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    [[B:%.*]] = getelementptr i32, ptr [[A]], i32 1
+; CHECK-NEXT:    [[L2:%.*]] = load i32, ptr [[B]], align 4
+; CHECK-NEXT:    ret i32 [[L2]]
+;
+  %a = alloca i64
+  store i64 %x, ptr %a
+  call void @callee(ptr %a)
+  %l1 = load i32, ptr %a
+  %b = getelementptr i32, ptr %a, i32 1
+  %l2 = load i32, ptr %b
+  ret i32 %l2
+}
+
+define i32 @twocalls() {
+; CHECK-LABEL: @twocalls(
+; CHECK-NEXT:    [[A:%.*]] = alloca { i32, i32 }, align 8
+; CHECK-NEXT:    store i32 0, ptr [[A]], align 4
+; CHECK-NEXT:    [[B:%.*]] = getelementptr i32, ptr [[A]], i32 1
+; CHECK-NEXT:    store i32 1, ptr [[B]], align 4
+; CHECK-NEXT:    call void @callee(ptr [[A]])
+; CHECK-NEXT:    [[L1:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    call void @callee(ptr [[A]])
+; CHECK-NEXT:    [[L2:%.*]] = load i32, ptr [[B]], align 4
+; CHECK-NEXT:    ret i32 [[L2]]
+;
+  %a = alloca {i32, i32}
+  store i32 0, ptr %a
+  %b = getelementptr i32, ptr %a, i32 1
+  store i32 1, ptr %b
+  call void @callee(ptr %a)
+  %l1 = load i32, ptr %a
+  call void @callee(ptr %a)
+  %l2 = load i32, ptr %b
+  ret i32 %l2
+}
+
+define i32 @volatile() {
+; CHECK-LABEL: @volatile(
+; CHECK-NEXT:    [[A:%.*]] = alloca { i32, i32 }, align 8
+; CHECK-NEXT:    store i32 0, ptr [[A]], align 4
+; CHECK-NEXT:    [[B:%.*]] = getelementptr i32, ptr [[A]], i32 1
+; CHECK-NEXT:    store i32 1, ptr [[B]], align 4
+; CHECK-NEXT:    call void @callee(ptr [[A]])
+; CHECK-NEXT:    [[L1:%.*]] = load volatile i32, ptr [[A]], align 4
+; CHECK-NEXT:    [[L2:%.*]] = load volatile i32, ptr [[B]], align 4
+; CHECK-NEXT:    ret i32 [[L2]]
+;
+  %a = alloca {i32, i32}
+  store i32 0, ptr %a
+  %b = getelementptr i32, ptr %a, i32 1
+  store i32 1, ptr %b
+  call void @callee(ptr %a)
+  %l1 = load volatile i32, ptr %a
+  %l2 = load volatile i32, ptr %b
+  ret i32 %l2
+}
+
+define i32 @notdominating() {
+; CHECK-LABEL: @notdominating(
+; CHECK-NEXT:    [[A:%.*]] = alloca { i32, i32 }, align 8
+; CHECK-NEXT:    [[B:%.*]] = getelementptr i32, ptr [[A]], i32 1
+; CHECK-NEXT:    [[L1:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    [[L2:%.*]] = load i32, ptr [[B]], align 4
+; CHECK-NEXT:    store i32 0, ptr [[A]], align 4
+; CHECK-NEXT:    store i32 1, ptr [[B]], align 4
+; CHECK-NEXT:    call void @callee(ptr [[A]])
+; CHECK-NEXT:    ret i32 [[L2]]
+;
+  %a = alloca {i32, i32}
+  %b = getelementptr i32, ptr %a, i32 1
+  %l1 = load i32, ptr %a
+  %l2 = load i32, ptr %b
+  store i32 0, ptr %a
+  store i32 1, ptr %b
+  call void @callee(ptr %a)
+  ret i32 %l2
+}
+
+declare void @callee_notreadonly(ptr %p)
+define i32 @notreadonly() {
+; CHECK-LABEL: @notreadonly(
+; CHECK-NEXT:    [[A:%.*]] = alloca { i32, i32 }, align 8
+; CHECK-NEXT:    store i32 0, ptr [[A]], align 4
+; CHECK-NEXT:    [[B:%.*]] = getelementptr i32, ptr [[A]], i32 1
+; CHECK-NEXT:    store i32 1, ptr [[B]], align 4
+; CHECK-NEXT:    call void @callee_notreadonly(ptr [[A]])
+; CHECK-NEXT:    [[L1:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    [[L2:%.*]] = load i32, ptr [[B]], align 4
+; CHECK-NEXT:    ret i32 [[L2]]
+;
+  %a = alloca {i32, i32}
+  store i32 0, ptr %a
+  %b = getelementptr i32, ptr %a, i32 1
+  store i32 1, ptr %b
+  call void @callee_notreadonly(ptr %a)
+  %l1 = load i32, ptr %a
+  %l2 = load i32, ptr %b
+  ret i32 %l2
+}
+
+declare void @callee_multiuse(ptr nocapture readonly %p, ptr nocapture readonly %q)
+define i32 @multiuse() {
+; CHECK-LABEL: @multiuse(
+; CHECK-NEXT:    [[A:%.*]] = alloca { i32, i32 }, align 8
+; CHECK-NEXT:    store i32 0, ptr [[A]], align 4
+; CHECK-NEXT:    [[B:%.*]] = getelementptr i32, ptr [[A]], i32 1
+; CHECK-NEXT:    store i32 1, ptr [[B]], align 4
+; CHECK-NEXT:    call void @callee_multiuse(ptr [[A]], ptr [[A]])
+; CHECK-NEXT:    [[L1:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    [[L2:%.*]] = load i32, ptr [[B]], align 4
+; CHECK-NEXT:    ret i32 [[L2]]
+;
+  %a = alloca {i32, i32}
+  store i32 0, ptr %a
+  %b = getelementptr i32, ptr %a, i32 1
+  store i32 1, ptr %b
+  call void @callee_multiuse(ptr %a, ptr %a)
+  %l1 = load i32, ptr %a
+  %l2 = load i32, ptr %b
+  ret i32 %l2
+}
+
+define i32 @memcpyed(ptr %src) {
+; CHECK-LABEL: @memcpyed(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 0, ptr [[A]], align 4
+; CHECK-NEXT:    call void @callee(ptr [[A]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[A]], ptr [[SRC:%.*]], i64 4, i1 false)
+; CHECK-NEXT:    [[L1:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    ret i32 [[L1]]
+;
+  %a = alloca i32
+  store i32 0, ptr %a
+  call void @callee(ptr %a)
+  call void @llvm.memcpy.p0.p0.i64(ptr %a, ptr %src, i64 4, i1 false)
+  %l1 = load i32, ptr %a
+  ret i32 %l1
+}
+
+define ptr @memcpyedsplit(ptr %src) {
+; CHECK-LABEL: @memcpyedsplit(
+; CHECK-NEXT:    [[A:%.*]] = alloca { i64, i64 }, align 8
+; CHECK-NEXT:    store i8 1, ptr [[A]], align 1
+; CHECK-NEXT:    [[B:%.*]] = getelementptr i64, ptr [[A]], i32 1
+; CHECK-NEXT:    store ptr null, ptr [[B]], align 8
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[A]], ptr [[SRC:%.*]], i64 16, i1 false)
+; CHECK-NEXT:    call void @callee(ptr [[A]])
+; CHECK-NEXT:    [[L1:%.*]] = load ptr, ptr [[B]], align 8
+; CHECK-NEXT:    ret ptr [[L1]]
+;
+  %a = alloca { i64, i64 }
+  store i8 1, ptr %a
+  %b = getelementptr i64, ptr %a, i32 1
+  store ptr null, ptr %b
+  call void @llvm.memcpy.p0.p0.i64(ptr %a, ptr %src, i64 16, i1 false)
+  call void @callee(ptr %a)
+  %l1 = load ptr, ptr %b
+  ret ptr %l1
+}
+
+; This struct contains padding bits. The load should not be replaced by poison.
+%struct.LoadImmediateInfo = type { i32 }
+define void @incompletestruct(i1 %b, i1 %c) {
+; CHECK-LABEL: @incompletestruct(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LII:%.*]] = alloca [[STRUCT_LOADIMMEDIATEINFO:%.*]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[LII]])
+; CHECK-NEXT:    [[BF_LOAD:%.*]] = load i32, ptr [[LII]], align 4
+; CHECK-NEXT:    [[BF_CLEAR4:%.*]] = and i32 [[BF_LOAD]], -262144
+; CHECK-NEXT:    [[BF_SET5:%.*]] = select i1 [[B:%.*]], i32 196608, i32 131072
+; CHECK-NEXT:    [[BF_SET12:%.*]] = or disjoint i32 [[BF_SET5]], [[BF_CLEAR4]]
+; CHECK-NEXT:    store i32 [[BF_SET12]], ptr [[LII]], align 4
+; CHECK-NEXT:    call void @callee(ptr [[LII]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[LII]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %LII = alloca %struct.LoadImmediateInfo, align 4
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %LII)
+  %bf.load = load i32, ptr %LII, align 4
+  %bf.clear4 = and i32 %bf.load, -262144
+  %bf.set5 = select i1 %b, i32 196608, i32 131072
+  %bf.set12 = or disjoint i32 %bf.set5, %bf.clear4
+  store i32 %bf.set12, ptr %LII, align 4
+  call void @callee(ptr %LII)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %LII)
+  ret void
+}
+
+define void @incompletestruct_bb(i1 %b, i1 %c) {
+; CHECK-LABEL: @incompletestruct_bb(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LII:%.*]] = alloca [[STRUCT_LOADIMMEDIATEINFO:%.*]], align 4
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[LII]])
+; CHECK-NEXT:    [[BF_LOAD:%.*]] = load i32, ptr [[LII]], align 4
+; CHECK-NEXT:    [[BF_CLEAR4:%.*]] = and i32 [[BF_LOAD]], -262144
+; CHECK-NEXT:    [[BF_SET5:%.*]] = select i1 [[B:%.*]], i32 196608, i32 131072
+; CHECK-NEXT:    [[BF_SET12:%.*]] = or disjoint i32 [[BF_SET5]], [[BF_CLEAR4]]
+; CHECK-NEXT:    store i32 [[BF_SET12]], ptr [[LII]], align 4
+; CHECK-NEXT:    call void @callee(ptr [[LII]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[LII]])
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %LII = alloca %struct.LoadImmediateInfo, align 4
+  br i1 %c, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %LII)
+  %bf.load = load i32, ptr %LII, align 4
+  %bf.clear4 = and i32 %bf.load, -262144
+  %bf.set5 = select i1 %b, i32 196608, i32 131072
+  %bf.set12 = or disjoint i32 %bf.set5, %bf.clear4
+  store i32 %bf.set12, ptr %LII, align 4
+  call void @callee(ptr %LII)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %LII)
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)

>From cf3bbdc29be1a9c66284d6d629dd0b2b40140920 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Mon, 18 Nov 2024 15:58:22 +0000
Subject: [PATCH 2/5] [SROA] Optimize reloaded values in allocas that escape
 into readonly nocapture calls.

Given an alloca that potentially has many uses in complex code and escapes into
a call that is readonly+nocapture, we cannot easily split up the alloca.  There
are several optimizations that will attempt to take a value that is stored and
a reload, and replace the load with the original stored value.  Instcombine has
some simple heuristics, GVN can sometimes do it, as can early CSE in limited
situations. They all suffer from the same issue with complex code - they start
from a load/store and need to prove no-alias for all code between, which in
complex cases might be a lot to look through, especially if the ptr is an
alloca with many uses that is over the normal escape capture limits.

The pass that does do well with allocas is SROA, as it has a complete view of
the alloca and all of its uses. This patch adds a case to SROA where it can
detect allocas that are passed into calls that are no-capture readonly. It can
then optimize the reloaded values inside the alloca slice with the stored
value knowing that it is valid no matter the location of the loads/stores from
the no-escaping nature of the alloca.
---
 llvm/include/llvm/Analysis/PtrUseVisitor.h    | 14 ++++
 llvm/lib/Transforms/Scalar/SROA.cpp           | 80 ++++++++++++++++++-
 .../SROA/non-capturing-call-readonly.ll       | 12 +--
 .../test/Transforms/SROA/readonlynocapture.ll | 12 +--
 4 files changed, 105 insertions(+), 13 deletions(-)

diff --git a/llvm/include/llvm/Analysis/PtrUseVisitor.h b/llvm/include/llvm/Analysis/PtrUseVisitor.h
index bbe2741f44fc3d..c9d3874e7dd961 100644
--- a/llvm/include/llvm/Analysis/PtrUseVisitor.h
+++ b/llvm/include/llvm/Analysis/PtrUseVisitor.h
@@ -64,6 +64,9 @@ class PtrUseVisitorBase {
     /// Is the pointer escaped at some point?
     bool isEscaped() const { return EscapedInfo != nullptr; }
 
+    /// Is the pointer escaped into a read-only nocapture call at some point?
+    bool isEscapedReadOnly() const { return EscapedReadOnly != nullptr; }
+
     /// Get the instruction causing the visit to abort.
     /// \returns a pointer to the instruction causing the abort if one is
     /// available; otherwise returns null.
@@ -74,6 +77,10 @@ class PtrUseVisitorBase {
     /// is available; otherwise returns null.
     Instruction *getEscapingInst() const { return EscapedInfo; }
 
+    /// Get the instruction causing the pointer to escape which is a read-only
+    /// nocapture call.
+    Instruction *getEscapedReadOnlyInst() const { return EscapedReadOnly; }
+
     /// Mark the visit as aborted. Intended for use in a void return.
     /// \param I The instruction which caused the visit to abort, if available.
     void setAborted(Instruction *I) {
@@ -88,6 +95,12 @@ class PtrUseVisitorBase {
       EscapedInfo = I;
     }
 
+    /// Mark the pointer as escaped into a readonly-nocapture call.
+    void setEscapedReadOnly(Instruction *I) {
+      assert(I && "Expected a valid pointer in setEscapedReadOnly");
+      EscapedReadOnly = I;
+    }
+
     /// Mark the pointer as escaped, and the visit as aborted. Intended
     /// for use in a void return.
     /// \param I The instruction which both escapes the pointer and aborts the
@@ -100,6 +113,7 @@ class PtrUseVisitorBase {
   private:
     Instruction *AbortedInfo = nullptr;
     Instruction *EscapedInfo = nullptr;
+    Instruction *EscapedReadOnly = nullptr;
   };
 
 protected:
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index d80af26451ac75..1404ab58fc829b 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -43,6 +43,7 @@
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/PtrUseVisitor.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constant.h"
@@ -246,6 +247,7 @@ class SROA {
   bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
   AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
   bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
+  bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
   std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
   void clobberUse(Use &U);
   bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
@@ -598,6 +600,7 @@ class AllocaSlices {
   /// If this is true, the slices are never fully built and should be
   /// ignored.
   bool isEscaped() const { return PointerEscapingInstr; }
+  bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; }
 
   /// Support for iterating over the slices.
   /// @{
@@ -680,6 +683,7 @@ class AllocaSlices {
   /// store a pointer to that here and abort trying to form slices of the
   /// alloca. This will be null if the alloca slices are analyzed successfully.
   Instruction *PointerEscapingInstr;
+  Instruction *PointerEscapingInstrReadOnly;
 
   /// The slices of the alloca.
   ///
@@ -1390,6 +1394,23 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
 
   /// Disable SROA entirely if there are unhandled users of the alloca.
   void visitInstruction(Instruction &I) { PI.setAborted(&I); }
+
+  void visitCallBase(CallBase &CB) {
+    // If the operands that are U are NoCapture ReadOnly, then we mark it as
+    // EscapedReadOnly.
+    Function *Callee = CB.getCalledFunction();
+    if (Callee && CB.arg_size() == Callee->arg_size() &&
+        !CB.hasOperandBundles() && all_of(enumerate(CB.args()), [&](auto V) {
+          return V.value() != *U ||
+                 (Callee->getArg(V.index())->hasNoCaptureAttr() &&
+                  Callee->getArg(V.index())->onlyReadsMemory());
+        })) {
+      PI.setEscapedReadOnly(&CB);
+      return;
+    }
+
+    Base::visitCallBase(CB);
+  }
 };
 
 AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
@@ -1397,7 +1418,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
       AI(AI),
 #endif
-      PointerEscapingInstr(nullptr) {
+      PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) {
   SliceBuilder PB(DL, AI, *this);
   SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
   if (PtrI.isEscaped() || PtrI.isAborted()) {
@@ -1408,6 +1429,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
     assert(PointerEscapingInstr && "Did not track a bad instruction");
     return;
   }
+  PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst();
 
   llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });
 
@@ -1445,6 +1467,9 @@ void AllocaSlices::print(raw_ostream &OS) const {
     return;
   }
 
+  if (PointerEscapingInstrReadOnly)
+    OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n";
+
   OS << "Slices of alloca: " << AI << "\n";
   for (const_iterator I = begin(), E = end(); I != E; ++I)
     print(OS, I);
@@ -5454,6 +5479,54 @@ void SROA::clobberUse(Use &U) {
     }
 }
 
+bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
+  for (auto &P : AS.partitions()) {
+    StoreInst *Store = nullptr;
+    // Make sure all the slices inside the partition are the full width.
+    if (any_of(P, [&P](Slice &S) {
+          return S.beginOffset() != P.beginOffset() ||
+                 S.beginOffset() != P.beginOffset();
+        }))
+      continue;
+
+    // Check there is a single store and nothing else other than loads.
+    for (Slice &S : P) {
+      if (S.isDead())
+        continue;
+      if (auto *St = dyn_cast<StoreInst>(S.getUse()->getUser())) {
+        if (Store) {
+          Store = nullptr;
+          break;
+        }
+        Store = St;
+      } else if (!isa<LoadInst>(S.getUse()->getUser()) &&
+                 !isAssumeLikeIntrinsic(
+                     cast<Instruction>(S.getUse()->getUser()))) {
+        Store = nullptr;
+        break;
+      }
+    }
+
+    if (!Store)
+      continue;
+
+    // Replace loads by the value that was stored.
+    for (Slice &S : P) {
+      if (auto *Ld = dyn_cast<LoadInst>(S.getUse()->getUser())) {
+        if (DTU->getDomTree().dominates(Store, Ld)) {
+          if (Store->getValueOperand()->getType() == Ld->getType()) {
+            LLVM_DEBUG(dbgs() << "    Replacing " << *Ld << " with "
+                              << *Store->getValueOperand() << "\n");
+            Ld->replaceAllUsesWith(Store->getValueOperand());
+          }
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
 /// Analyze an alloca for SROA.
 ///
 /// This analyzes the alloca to ensure we can reason about it, builds
@@ -5494,6 +5567,11 @@ SROA::runOnAlloca(AllocaInst &AI) {
   if (AS.isEscaped())
     return {Changed, CFGChanged};
 
+  if (AS.isEscapedReadOnly()) {
+    Changed |= propagateStoredValuesToLoads(AI, AS);
+    return {Changed, CFGChanged};
+  }
+
   // Delete all the dead users of this alloca before splitting and rewriting it.
   for (Instruction *DeadUser : AS.getDeadUsers()) {
     // Free up everything used by this instruction.
diff --git a/llvm/test/Transforms/SROA/non-capturing-call-readonly.ll b/llvm/test/Transforms/SROA/non-capturing-call-readonly.ll
index 87862b929a7511..65f819122a6951 100644
--- a/llvm/test/Transforms/SROA/non-capturing-call-readonly.ll
+++ b/llvm/test/Transforms/SROA/non-capturing-call-readonly.ll
@@ -485,7 +485,7 @@ define [2 x i32] @part_of_alloca_used_in_call(ptr %data, i64 %n) {
 ; CHECK-NEXT:    [[I0:%.*]] = call i32 @user_of_alloca(ptr [[RETVAL]])
 ; CHECK-NEXT:    [[I1_FCA_0_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 0
 ; CHECK-NEXT:    [[I1_FCA_0_LOAD:%.*]] = load i32, ptr [[I1_FCA_0_GEP]], align 4
-; CHECK-NEXT:    [[I1_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 [[I1_FCA_0_LOAD]], 0
+; CHECK-NEXT:    [[I1_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 0, 0
 ; CHECK-NEXT:    [[I1_FCA_1_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 1
 ; CHECK-NEXT:    [[I1_FCA_1_LOAD:%.*]] = load i32, ptr [[I1_FCA_1_GEP]], align 4
 ; CHECK-NEXT:    [[I1_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I1_FCA_0_INSERT]], i32 [[I1_FCA_1_LOAD]], 1
@@ -538,7 +538,7 @@ define [2 x i32] @all_parts_of_alloca_used_in_call_with_multiple_args(ptr %data,
 ; CHECK-NEXT:    [[I0:%.*]] = call i32 @user_of_alloca_with_multiple_args(ptr [[RETVAL]], ptr [[RETVAL_FULL]])
 ; CHECK-NEXT:    [[I1_FCA_0_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 0
 ; CHECK-NEXT:    [[I1_FCA_0_LOAD:%.*]] = load i32, ptr [[I1_FCA_0_GEP]], align 4
-; CHECK-NEXT:    [[I1_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 [[I1_FCA_0_LOAD]], 0
+; CHECK-NEXT:    [[I1_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 0, 0
 ; CHECK-NEXT:    [[I1_FCA_1_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 1
 ; CHECK-NEXT:    [[I1_FCA_1_LOAD:%.*]] = load i32, ptr [[I1_FCA_1_GEP]], align 4
 ; CHECK-NEXT:    [[I1_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I1_FCA_0_INSERT]], i32 [[I1_FCA_1_LOAD]], 1
@@ -701,7 +701,7 @@ define [2 x i32] @part_of_alloca_used_in_call_with_multiple_args(ptr %data, i64
 ; CHECK-NEXT:    [[I0:%.*]] = call i32 @user_of_alloca_with_multiple_args(ptr [[RETVAL]], ptr [[RETVAL]])
 ; CHECK-NEXT:    [[I1_FCA_0_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 0
 ; CHECK-NEXT:    [[I1_FCA_0_LOAD:%.*]] = load i32, ptr [[I1_FCA_0_GEP]], align 4
-; CHECK-NEXT:    [[I1_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 [[I1_FCA_0_LOAD]], 0
+; CHECK-NEXT:    [[I1_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 0, 0
 ; CHECK-NEXT:    [[I1_FCA_1_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 1
 ; CHECK-NEXT:    [[I1_FCA_1_LOAD:%.*]] = load i32, ptr [[I1_FCA_1_GEP]], align 4
 ; CHECK-NEXT:    [[I1_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I1_FCA_0_INSERT]], i32 [[I1_FCA_1_LOAD]], 1
@@ -757,7 +757,7 @@ define [2 x i32] @all_parts_of_alloca_used_in_calls_with_multiple_args(ptr %data
 ; CHECK-NEXT:    [[I2:%.*]] = call i32 @capture_of_alloca(ptr [[SOME_ANOTHER_ALLOCA_FULL]])
 ; CHECK-NEXT:    [[I3_FCA_0_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 0
 ; CHECK-NEXT:    [[I3_FCA_0_LOAD:%.*]] = load i32, ptr [[I3_FCA_0_GEP]], align 4
-; CHECK-NEXT:    [[I3_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 [[I3_FCA_0_LOAD]], 0
+; CHECK-NEXT:    [[I3_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 0, 0
 ; CHECK-NEXT:    [[I3_FCA_1_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 1
 ; CHECK-NEXT:    [[I3_FCA_1_LOAD:%.*]] = load i32, ptr [[I3_FCA_1_GEP]], align 4
 ; CHECK-NEXT:    [[I3_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I3_FCA_0_INSERT]], i32 [[I3_FCA_1_LOAD]], 1
@@ -817,7 +817,7 @@ define i64 @do_schedule_instrs_for_dce_after_fixups() {
 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @user_of_alloca(ptr [[ADD_PTR]])
 ; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[C]], align 4
-; CHECK-NEXT:    ret i64 [[LD]]
+; CHECK-NEXT:    ret i64 0
 ;
 entry:
   %c = alloca i64, align 2
@@ -867,7 +867,7 @@ define i8 @transform_load_and_store() {
 ; CHECK-NEXT:    store i8 0, ptr [[A]], align 1
 ; CHECK-NEXT:    call void @byte_user_of_alloca(ptr [[A]])
 ; CHECK-NEXT:    [[R:%.*]] = load i8, ptr [[A]], align 1
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    ret i8 0
 ;
 entry:
   %a = alloca i8
diff --git a/llvm/test/Transforms/SROA/readonlynocapture.ll b/llvm/test/Transforms/SROA/readonlynocapture.ll
index 6a48402afcc517..d1bc94e5ff085d 100644
--- a/llvm/test/Transforms/SROA/readonlynocapture.ll
+++ b/llvm/test/Transforms/SROA/readonlynocapture.ll
@@ -9,7 +9,7 @@ define i32 @simple() {
 ; CHECK-NEXT:    store i32 0, ptr [[A]], align 4
 ; CHECK-NEXT:    call void @callee(ptr [[A]])
 ; CHECK-NEXT:    [[L1:%.*]] = load i32, ptr [[A]], align 4
-; CHECK-NEXT:    ret i32 [[L1]]
+; CHECK-NEXT:    ret i32 0
 ;
   %a = alloca i32
   store i32 0, ptr %a
@@ -42,7 +42,7 @@ define i32 @twoalloc() {
 ; CHECK-NEXT:    call void @callee(ptr [[A]])
 ; CHECK-NEXT:    [[L1:%.*]] = load i32, ptr [[A]], align 4
 ; CHECK-NEXT:    [[L2:%.*]] = load i32, ptr [[B]], align 4
-; CHECK-NEXT:    ret i32 [[L2]]
+; CHECK-NEXT:    ret i32 1
 ;
   %a = alloca {i32, i32}
   store i32 0, ptr %a
@@ -117,7 +117,7 @@ define i32 @twocalls() {
 ; CHECK-NEXT:    [[L1:%.*]] = load i32, ptr [[A]], align 4
 ; CHECK-NEXT:    call void @callee(ptr [[A]])
 ; CHECK-NEXT:    [[L2:%.*]] = load i32, ptr [[B]], align 4
-; CHECK-NEXT:    ret i32 [[L2]]
+; CHECK-NEXT:    ret i32 1
 ;
   %a = alloca {i32, i32}
   store i32 0, ptr %a
@@ -139,7 +139,7 @@ define i32 @volatile() {
 ; CHECK-NEXT:    call void @callee(ptr [[A]])
 ; CHECK-NEXT:    [[L1:%.*]] = load volatile i32, ptr [[A]], align 4
 ; CHECK-NEXT:    [[L2:%.*]] = load volatile i32, ptr [[B]], align 4
-; CHECK-NEXT:    ret i32 [[L2]]
+; CHECK-NEXT:    ret i32 1
 ;
   %a = alloca {i32, i32}
   store i32 0, ptr %a
@@ -204,7 +204,7 @@ define i32 @multiuse() {
 ; CHECK-NEXT:    call void @callee_multiuse(ptr [[A]], ptr [[A]])
 ; CHECK-NEXT:    [[L1:%.*]] = load i32, ptr [[A]], align 4
 ; CHECK-NEXT:    [[L2:%.*]] = load i32, ptr [[B]], align 4
-; CHECK-NEXT:    ret i32 [[L2]]
+; CHECK-NEXT:    ret i32 1
 ;
   %a = alloca {i32, i32}
   store i32 0, ptr %a
@@ -242,7 +242,7 @@ define ptr @memcpyedsplit(ptr %src) {
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[A]], ptr [[SRC:%.*]], i64 16, i1 false)
 ; CHECK-NEXT:    call void @callee(ptr [[A]])
 ; CHECK-NEXT:    [[L1:%.*]] = load ptr, ptr [[B]], align 8
-; CHECK-NEXT:    ret ptr [[L1]]
+; CHECK-NEXT:    ret ptr null
 ;
   %a = alloca { i64, i64 }
   store i8 1, ptr %a

>From 62625a7e0d145f7861b230abc280cfc191238a98 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Thu, 21 Nov 2024 07:01:54 +0000
Subject: [PATCH 3/5] Rewrite to use SSAUpdater

---
 llvm/lib/Transforms/Scalar/SROA.cpp           | 104 +++++++++++-------
 .../SROA/non-capturing-call-readonly.ll       |  91 +++++++--------
 .../test/Transforms/SROA/readonlynocapture.ll |  24 +---
 3 files changed, 105 insertions(+), 114 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 1404ab58fc829b..ffd393f42051e6 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -84,6 +84,7 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@@ -1401,9 +1402,8 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
     Function *Callee = CB.getCalledFunction();
     if (Callee && CB.arg_size() == Callee->arg_size() &&
         !CB.hasOperandBundles() && all_of(enumerate(CB.args()), [&](auto V) {
-          return V.value() != *U ||
-                 (Callee->getArg(V.index())->hasNoCaptureAttr() &&
-                  Callee->getArg(V.index())->onlyReadsMemory());
+          return V.value() != *U || (CB.doesNotCapture(V.index()) &&
+                                     CB.onlyReadsMemory(V.index()));
         })) {
       PI.setEscapedReadOnly(&CB);
       return;
@@ -5479,51 +5479,71 @@ void SROA::clobberUse(Use &U) {
     }
 }
 
-bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
-  for (auto &P : AS.partitions()) {
-    StoreInst *Store = nullptr;
-    // Make sure all the slices inside the partition are the full width.
-    if (any_of(P, [&P](Slice &S) {
-          return S.beginOffset() != P.beginOffset() ||
-                 S.beginOffset() != P.beginOffset();
-        }))
-      continue;
+// A basic LoadAndStorePromoter that does not remove store nodes.
+class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
+public:
+  BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S)
+      : LoadAndStorePromoter(Insts, S) {}
+  bool shouldDelete(Instruction *I) const override {
+    return !isa<StoreInst>(I);
+  }
+};
 
-    // Check there is a single store and nothing else other than loads.
-    for (Slice &S : P) {
-      if (S.isDead())
-        continue;
-      if (auto *St = dyn_cast<StoreInst>(S.getUse()->getUser())) {
-        if (Store) {
-          Store = nullptr;
-          break;
-        }
-        Store = St;
-      } else if (!isa<LoadInst>(S.getUse()->getUser()) &&
-                 !isAssumeLikeIntrinsic(
-                     cast<Instruction>(S.getUse()->getUser()))) {
-        Store = nullptr;
-        break;
+bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
+  // Look through each "partition", looking for slices with the same start/end
+  // that do not overlap with any before them. The slices are sorted by
+  // increasing beginOffset. We don't use AS.partitions(), as it will use a more
+  // sophisticated algorithm that takes splittable slices into account.
+  auto PartitionBegin = AS.begin();
+  auto PartitionEnd = PartitionBegin;
+  uint64_t BeginOffset = PartitionBegin->beginOffset();
+  uint64_t EndOffset = PartitionBegin->endOffset();
+  while (PartitionBegin != AS.end()) {
+    bool AllSameAndValid = true;
+    SmallVector<Instruction *> Insts;
+    Type *PartitionType = nullptr;
+    while (PartitionEnd != AS.end() &&
+           (PartitionEnd->beginOffset() < EndOffset ||
+            PartitionEnd->endOffset() <= EndOffset)) {
+      EndOffset = std::max(EndOffset, PartitionEnd->endOffset());
+      if (AllSameAndValid) {
+        AllSameAndValid &= PartitionEnd->beginOffset() == BeginOffset &&
+                           PartitionEnd->endOffset() == EndOffset;
+        Instruction *User =
+            cast<Instruction>(PartitionEnd->getUse()->getUser());
+        if (isa<LoadInst>(User) || isa<StoreInst>(User)) {
+          // LoadAndStorePromoter requires all the types are the same.
+          Type *UserTy = getLoadStoreType(User);
+          if (PartitionType && UserTy != PartitionType)
+            AllSameAndValid = false;
+          PartitionType = UserTy;
+          Insts.push_back(User);
+        } else if (!isAssumeLikeIntrinsic(User))
+          AllSameAndValid = false;
       }
+      ++PartitionEnd;
     }
 
-    if (!Store)
-      continue;
-
-    // Replace loads by the value that was stored.
-    for (Slice &S : P) {
-      if (auto *Ld = dyn_cast<LoadInst>(S.getUse()->getUser())) {
-        if (DTU->getDomTree().dominates(Store, Ld)) {
-          if (Store->getValueOperand()->getType() == Ld->getType()) {
-            LLVM_DEBUG(dbgs() << "    Replacing " << *Ld << " with "
-                              << *Store->getValueOperand() << "\n");
-            Ld->replaceAllUsesWith(Store->getValueOperand());
-          }
-        }
-      }
+    // So long as all the slices start and end offsets matched, update loads to
+    // the values stored in the partition.
+    if (AllSameAndValid && !Insts.empty()) {
+      LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", "
+                        << EndOffset << ")\n");
+      SmallVector<PHINode *, 4> NewPHIs;
+      SSAUpdater SSA(&NewPHIs);
+      BasicLoadAndStorePromoter Promoter(Insts, SSA);
+      // Add a zero value at the point of the alloca, to prevent the SSA updater
+      // replacing loads with poison which would not be valid for padded loads.
+      SSA.AddAvailableValue(AI.getParent(),
+                            Constant::getNullValue(PartitionType));
+      Promoter.run(Insts);
     }
-  }
 
+    // Step on to the next partition.
+    PartitionBegin = PartitionEnd;
+    BeginOffset = PartitionBegin->beginOffset();
+    EndOffset = PartitionBegin->endOffset();
+  }
   return true;
 }
 
diff --git a/llvm/test/Transforms/SROA/non-capturing-call-readonly.ll b/llvm/test/Transforms/SROA/non-capturing-call-readonly.ll
index 65f819122a6951..202e8b5ba33a9d 100644
--- a/llvm/test/Transforms/SROA/non-capturing-call-readonly.ll
+++ b/llvm/test/Transforms/SROA/non-capturing-call-readonly.ll
@@ -9,19 +9,18 @@ define i32 @alloca_used_in_call(ptr %data, i64 %n) {
 ; CHECK-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RDX_INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[RDX:%.*]] = load i32, ptr [[RETVAL]], align 4
-; CHECK-NEXT:    [[RDX_INC:%.*]] = add nsw i32 [[RDX]], [[LD]]
+; CHECK-NEXT:    [[RDX_INC]] = add nsw i32 [[RDX]], [[LD]]
 ; CHECK-NEXT:    store i32 [[RDX_INC]], ptr [[RETVAL]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[N:%.*]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[I0:%.*]] = call i32 @user_of_alloca(ptr [[RETVAL]])
-; CHECK-NEXT:    [[I1:%.*]] = load i32, ptr [[RETVAL]], align 4
-; CHECK-NEXT:    ret i32 [[I1]]
+; CHECK-NEXT:    ret i32 [[RDX_INC]]
 ;
 entry:
   %retval = alloca i32, align 4
@@ -138,19 +137,18 @@ define i32 @alloca_not_captured_and_readonly_as_per_operand_attr(ptr %data, i64
 ; CHECK-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RDX_INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[RDX:%.*]] = load i32, ptr [[RETVAL]], align 4
-; CHECK-NEXT:    [[RDX_INC:%.*]] = add nsw i32 [[RDX]], [[LD]]
+; CHECK-NEXT:    [[RDX_INC]] = add nsw i32 [[RDX]], [[LD]]
 ; CHECK-NEXT:    store i32 [[RDX_INC]], ptr [[RETVAL]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[N:%.*]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[I0:%.*]] = call i32 @capture_of_alloca(ptr nocapture readonly [[RETVAL]])
-; CHECK-NEXT:    [[I1:%.*]] = load i32, ptr [[RETVAL]], align 4
-; CHECK-NEXT:    ret i32 [[I1]]
+; CHECK-NEXT:    ret i32 [[RDX_INC]]
 ;
 entry:
   %retval = alloca i32, align 4
@@ -267,19 +265,18 @@ define i32 @alloca_with_gep_used_in_call(ptr %data, i64 %n) {
 ; CHECK-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RDX_INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[RDX:%.*]] = load i32, ptr [[RETVAL]], align 4
-; CHECK-NEXT:    [[RDX_INC:%.*]] = add nsw i32 [[RDX]], [[LD]]
+; CHECK-NEXT:    [[RDX_INC]] = add nsw i32 [[RDX]], [[LD]]
 ; CHECK-NEXT:    store i32 [[RDX_INC]], ptr [[RETVAL]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[N:%.*]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[I0:%.*]] = call i32 @user_of_alloca(ptr [[RETVAL]])
-; CHECK-NEXT:    [[I1:%.*]] = load i32, ptr [[RETVAL]], align 4
-; CHECK-NEXT:    ret i32 [[I1]]
+; CHECK-NEXT:    ret i32 [[RDX_INC]]
 ;
 entry:
   %retval = alloca i32, align 4
@@ -353,11 +350,11 @@ define i32 @alloca_used_in_maybe_throwing_call(ptr %data, i64 %n) personality pt
 ; CHECK-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RDX_INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[RDX:%.*]] = load i32, ptr [[RETVAL]], align 4
-; CHECK-NEXT:    [[RDX_INC:%.*]] = add nsw i32 [[RDX]], [[LD]]
+; CHECK-NEXT:    [[RDX_INC]] = add nsw i32 [[RDX]], [[LD]]
 ; CHECK-NEXT:    store i32 [[RDX_INC]], ptr [[RETVAL]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[N:%.*]]
@@ -372,8 +369,7 @@ define i32 @alloca_used_in_maybe_throwing_call(ptr %data, i64 %n) personality pt
 ; CHECK-NEXT:            catch ptr null
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr [[RETVAL]], align 4
-; CHECK-NEXT:    ret i32 [[I2]]
+; CHECK-NEXT:    ret i32 [[RDX_INC]]
 ;
 entry:
   %retval = alloca i32, align 4
@@ -413,11 +409,11 @@ define i32 @alloca_used_in_maybe_throwing_call_with_same_dests(ptr %data, i64 %n
 ; CHECK-NEXT:    store i32 0, ptr [[RETVAL]], align 4
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RDX_INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[RDX:%.*]] = load i32, ptr [[RETVAL]], align 4
-; CHECK-NEXT:    [[RDX_INC:%.*]] = add nsw i32 [[RDX]], [[LD]]
+; CHECK-NEXT:    [[RDX_INC]] = add nsw i32 [[RDX]], [[LD]]
 ; CHECK-NEXT:    store i32 [[RDX_INC]], ptr [[RETVAL]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[N:%.*]]
@@ -430,8 +426,7 @@ define i32 @alloca_used_in_maybe_throwing_call_with_same_dests(ptr %data, i64 %n
 ; CHECK-NEXT:            catch ptr null
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr [[RETVAL]], align 4
-; CHECK-NEXT:    ret i32 [[I2]]
+; CHECK-NEXT:    ret i32 [[RDX_INC]]
 ;
 entry:
   %retval = alloca i32, align 4
@@ -472,11 +467,11 @@ define [2 x i32] @part_of_alloca_used_in_call(ptr %data, i64 %n) {
 ; CHECK-NEXT:    [[RETVAL:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i64 0, i64 1
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RDX_INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[RDX:%.*]] = load i32, ptr [[RETVAL]], align 4
-; CHECK-NEXT:    [[RDX_INC:%.*]] = add nsw i32 [[RDX]], [[LD]]
+; CHECK-NEXT:    [[RDX_INC]] = add nsw i32 [[RDX]], [[LD]]
 ; CHECK-NEXT:    store i32 [[RDX_INC]], ptr [[RETVAL]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[N:%.*]]
@@ -484,11 +479,9 @@ define [2 x i32] @part_of_alloca_used_in_call(ptr %data, i64 %n) {
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[I0:%.*]] = call i32 @user_of_alloca(ptr [[RETVAL]])
 ; CHECK-NEXT:    [[I1_FCA_0_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 0
-; CHECK-NEXT:    [[I1_FCA_0_LOAD:%.*]] = load i32, ptr [[I1_FCA_0_GEP]], align 4
 ; CHECK-NEXT:    [[I1_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 0, 0
 ; CHECK-NEXT:    [[I1_FCA_1_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 1
-; CHECK-NEXT:    [[I1_FCA_1_LOAD:%.*]] = load i32, ptr [[I1_FCA_1_GEP]], align 4
-; CHECK-NEXT:    [[I1_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I1_FCA_0_INSERT]], i32 [[I1_FCA_1_LOAD]], 1
+; CHECK-NEXT:    [[I1_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I1_FCA_0_INSERT]], i32 [[RDX_INC]], 1
 ; CHECK-NEXT:    ret [2 x i32] [[I1_FCA_1_INSERT]]
 ;
 entry:
@@ -525,11 +518,11 @@ define [2 x i32] @all_parts_of_alloca_used_in_call_with_multiple_args(ptr %data,
 ; CHECK-NEXT:    [[RETVAL:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i64 0, i64 1
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RDX_INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[RDX:%.*]] = load i32, ptr [[RETVAL]], align 4
-; CHECK-NEXT:    [[RDX_INC:%.*]] = add nsw i32 [[RDX]], [[LD]]
+; CHECK-NEXT:    [[RDX_INC]] = add nsw i32 [[RDX]], [[LD]]
 ; CHECK-NEXT:    store i32 [[RDX_INC]], ptr [[RETVAL]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[N:%.*]]
@@ -537,11 +530,9 @@ define [2 x i32] @all_parts_of_alloca_used_in_call_with_multiple_args(ptr %data,
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[I0:%.*]] = call i32 @user_of_alloca_with_multiple_args(ptr [[RETVAL]], ptr [[RETVAL_FULL]])
 ; CHECK-NEXT:    [[I1_FCA_0_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 0
-; CHECK-NEXT:    [[I1_FCA_0_LOAD:%.*]] = load i32, ptr [[I1_FCA_0_GEP]], align 4
 ; CHECK-NEXT:    [[I1_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 0, 0
 ; CHECK-NEXT:    [[I1_FCA_1_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 1
-; CHECK-NEXT:    [[I1_FCA_1_LOAD:%.*]] = load i32, ptr [[I1_FCA_1_GEP]], align 4
-; CHECK-NEXT:    [[I1_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I1_FCA_0_INSERT]], i32 [[I1_FCA_1_LOAD]], 1
+; CHECK-NEXT:    [[I1_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I1_FCA_0_INSERT]], i32 [[RDX_INC]], 1
 ; CHECK-NEXT:    ret [2 x i32] [[I1_FCA_1_INSERT]]
 ;
 entry:
@@ -688,11 +679,11 @@ define [2 x i32] @part_of_alloca_used_in_call_with_multiple_args(ptr %data, i64
 ; CHECK-NEXT:    [[RETVAL:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i64 0, i64 1
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RDX_INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[RDX:%.*]] = load i32, ptr [[RETVAL]], align 4
-; CHECK-NEXT:    [[RDX_INC:%.*]] = add nsw i32 [[RDX]], [[LD]]
+; CHECK-NEXT:    [[RDX_INC]] = add nsw i32 [[RDX]], [[LD]]
 ; CHECK-NEXT:    store i32 [[RDX_INC]], ptr [[RETVAL]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[N:%.*]]
@@ -700,11 +691,9 @@ define [2 x i32] @part_of_alloca_used_in_call_with_multiple_args(ptr %data, i64
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[I0:%.*]] = call i32 @user_of_alloca_with_multiple_args(ptr [[RETVAL]], ptr [[RETVAL]])
 ; CHECK-NEXT:    [[I1_FCA_0_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 0
-; CHECK-NEXT:    [[I1_FCA_0_LOAD:%.*]] = load i32, ptr [[I1_FCA_0_GEP]], align 4
 ; CHECK-NEXT:    [[I1_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 0, 0
 ; CHECK-NEXT:    [[I1_FCA_1_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 1
-; CHECK-NEXT:    [[I1_FCA_1_LOAD:%.*]] = load i32, ptr [[I1_FCA_1_GEP]], align 4
-; CHECK-NEXT:    [[I1_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I1_FCA_0_INSERT]], i32 [[I1_FCA_1_LOAD]], 1
+; CHECK-NEXT:    [[I1_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I1_FCA_0_INSERT]], i32 [[RDX_INC]], 1
 ; CHECK-NEXT:    ret [2 x i32] [[I1_FCA_1_INSERT]]
 ;
 entry:
@@ -742,11 +731,11 @@ define [2 x i32] @all_parts_of_alloca_used_in_calls_with_multiple_args(ptr %data
 ; CHECK-NEXT:    [[RETVAL:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i64 0, i64 1
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RDX_INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[LD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[RDX:%.*]] = load i32, ptr [[RETVAL]], align 4
-; CHECK-NEXT:    [[RDX_INC:%.*]] = add nsw i32 [[RDX]], [[LD]]
+; CHECK-NEXT:    [[RDX_INC]] = add nsw i32 [[RDX]], [[LD]]
 ; CHECK-NEXT:    store i32 [[RDX_INC]], ptr [[RETVAL]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[N:%.*]]
@@ -756,11 +745,9 @@ define [2 x i32] @all_parts_of_alloca_used_in_calls_with_multiple_args(ptr %data
 ; CHECK-NEXT:    [[I1:%.*]] = call i32 @user_of_alloca_with_multiple_args(ptr [[RETVAL_FULL]], ptr [[RETVAL]])
 ; CHECK-NEXT:    [[I2:%.*]] = call i32 @capture_of_alloca(ptr [[SOME_ANOTHER_ALLOCA_FULL]])
 ; CHECK-NEXT:    [[I3_FCA_0_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 0
-; CHECK-NEXT:    [[I3_FCA_0_LOAD:%.*]] = load i32, ptr [[I3_FCA_0_GEP]], align 4
 ; CHECK-NEXT:    [[I3_FCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 0, 0
 ; CHECK-NEXT:    [[I3_FCA_1_GEP:%.*]] = getelementptr inbounds [2 x i32], ptr [[RETVAL_FULL]], i32 0, i32 1
-; CHECK-NEXT:    [[I3_FCA_1_LOAD:%.*]] = load i32, ptr [[I3_FCA_1_GEP]], align 4
-; CHECK-NEXT:    [[I3_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I3_FCA_0_INSERT]], i32 [[I3_FCA_1_LOAD]], 1
+; CHECK-NEXT:    [[I3_FCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[I3_FCA_0_INSERT]], i32 [[RDX_INC]], 1
 ; CHECK-NEXT:    ret [2 x i32] [[I3_FCA_1_INSERT]]
 ;
 entry:
@@ -817,7 +804,7 @@ define i64 @do_schedule_instrs_for_dce_after_fixups() {
 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @user_of_alloca(ptr [[ADD_PTR]])
 ; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[C]], align 4
-; CHECK-NEXT:    ret i64 0
+; CHECK-NEXT:    ret i64 [[LD]]
 ;
 entry:
   %c = alloca i64, align 2
@@ -851,8 +838,7 @@ define i8 @dont_transform_load_only() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A:%.*]] = alloca i8, align 1
 ; CHECK-NEXT:    call void @byte_user_of_alloca(ptr [[A]])
-; CHECK-NEXT:    [[R:%.*]] = load i8, ptr [[A]], align 1
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    ret i8 poison
 ;
 entry:
   %a = alloca i8
@@ -866,7 +852,6 @@ define i8 @transform_load_and_store() {
 ; CHECK-NEXT:    [[A:%.*]] = alloca i8, align 1
 ; CHECK-NEXT:    store i8 0, ptr [[A]], align 1
 ; CHECK-NEXT:    call void @byte_user_of_alloca(ptr [[A]])
-; CHECK-NEXT:    [[R:%.*]] = load i8, ptr [[A]], align 1
 ; CHECK-NEXT:    ret i8 0
 ;
 entry:
diff --git a/llvm/test/Transforms/SROA/readonlynocapture.ll b/llvm/test/Transforms/SROA/readonlynocapture.ll
index d1bc94e5ff085d..da57ff13d3245b 100644
--- a/llvm/test/Transforms/SROA/readonlynocapture.ll
+++ b/llvm/test/Transforms/SROA/readonlynocapture.ll
@@ -8,7 +8,6 @@ define i32 @simple() {
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    store i32 0, ptr [[A]], align 4
 ; CHECK-NEXT:    call void @callee(ptr [[A]])
-; CHECK-NEXT:    [[L1:%.*]] = load i32, ptr [[A]], align 4
 ; CHECK-NEXT:    ret i32 0
 ;
   %a = alloca i32
@@ -40,8 +39,6 @@ define i32 @twoalloc() {
 ; CHECK-NEXT:    [[B:%.*]] = getelementptr i32, ptr [[A]], i32 1
 ; CHECK-NEXT:    store i32 1, ptr [[B]], align 4
 ; CHECK-NEXT:    call void @callee(ptr [[A]])
-; CHECK-NEXT:    [[L1:%.*]] = load i32, ptr [[A]], align 4
-; CHECK-NEXT:    [[L2:%.*]] = load i32, ptr [[B]], align 4
 ; CHECK-NEXT:    ret i32 1
 ;
   %a = alloca {i32, i32}
@@ -60,8 +57,7 @@ define i32 @twostore() {
 ; CHECK-NEXT:    store i32 1, ptr [[A]], align 4
 ; CHECK-NEXT:    call void @callee(ptr [[A]])
 ; CHECK-NEXT:    store i32 2, ptr [[A]], align 4
-; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[A]], align 4
-; CHECK-NEXT:    ret i32 [[L]]
+; CHECK-NEXT:    ret i32 2
 ;
   %a = alloca i32
   store i32 1, ptr %a
@@ -114,9 +110,7 @@ define i32 @twocalls() {
 ; CHECK-NEXT:    [[B:%.*]] = getelementptr i32, ptr [[A]], i32 1
 ; CHECK-NEXT:    store i32 1, ptr [[B]], align 4
 ; CHECK-NEXT:    call void @callee(ptr [[A]])
-; CHECK-NEXT:    [[L1:%.*]] = load i32, ptr [[A]], align 4
 ; CHECK-NEXT:    call void @callee(ptr [[A]])
-; CHECK-NEXT:    [[L2:%.*]] = load i32, ptr [[B]], align 4
 ; CHECK-NEXT:    ret i32 1
 ;
   %a = alloca {i32, i32}
@@ -137,8 +131,6 @@ define i32 @volatile() {
 ; CHECK-NEXT:    [[B:%.*]] = getelementptr i32, ptr [[A]], i32 1
 ; CHECK-NEXT:    store i32 1, ptr [[B]], align 4
 ; CHECK-NEXT:    call void @callee(ptr [[A]])
-; CHECK-NEXT:    [[L1:%.*]] = load volatile i32, ptr [[A]], align 4
-; CHECK-NEXT:    [[L2:%.*]] = load volatile i32, ptr [[B]], align 4
 ; CHECK-NEXT:    ret i32 1
 ;
   %a = alloca {i32, i32}
@@ -155,12 +147,10 @@ define i32 @notdominating() {
 ; CHECK-LABEL: @notdominating(
 ; CHECK-NEXT:    [[A:%.*]] = alloca { i32, i32 }, align 8
 ; CHECK-NEXT:    [[B:%.*]] = getelementptr i32, ptr [[A]], i32 1
-; CHECK-NEXT:    [[L1:%.*]] = load i32, ptr [[A]], align 4
-; CHECK-NEXT:    [[L2:%.*]] = load i32, ptr [[B]], align 4
 ; CHECK-NEXT:    store i32 0, ptr [[A]], align 4
 ; CHECK-NEXT:    store i32 1, ptr [[B]], align 4
 ; CHECK-NEXT:    call void @callee(ptr [[A]])
-; CHECK-NEXT:    ret i32 [[L2]]
+; CHECK-NEXT:    ret i32 poison
 ;
   %a = alloca {i32, i32}
   %b = getelementptr i32, ptr %a, i32 1
@@ -202,8 +192,6 @@ define i32 @multiuse() {
 ; CHECK-NEXT:    [[B:%.*]] = getelementptr i32, ptr [[A]], i32 1
 ; CHECK-NEXT:    store i32 1, ptr [[B]], align 4
 ; CHECK-NEXT:    call void @callee_multiuse(ptr [[A]], ptr [[A]])
-; CHECK-NEXT:    [[L1:%.*]] = load i32, ptr [[A]], align 4
-; CHECK-NEXT:    [[L2:%.*]] = load i32, ptr [[B]], align 4
 ; CHECK-NEXT:    ret i32 1
 ;
   %a = alloca {i32, i32}
@@ -242,7 +230,7 @@ define ptr @memcpyedsplit(ptr %src) {
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[A]], ptr [[SRC:%.*]], i64 16, i1 false)
 ; CHECK-NEXT:    call void @callee(ptr [[A]])
 ; CHECK-NEXT:    [[L1:%.*]] = load ptr, ptr [[B]], align 8
-; CHECK-NEXT:    ret ptr null
+; CHECK-NEXT:    ret ptr [[L1]]
 ;
   %a = alloca { i64, i64 }
   store i8 1, ptr %a
@@ -261,8 +249,7 @@ define void @incompletestruct(i1 %b, i1 %c) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[LII:%.*]] = alloca [[STRUCT_LOADIMMEDIATEINFO:%.*]], align 4
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[LII]])
-; CHECK-NEXT:    [[BF_LOAD:%.*]] = load i32, ptr [[LII]], align 4
-; CHECK-NEXT:    [[BF_CLEAR4:%.*]] = and i32 [[BF_LOAD]], -262144
+; CHECK-NEXT:    [[BF_CLEAR4:%.*]] = and i32 poison, -262144
 ; CHECK-NEXT:    [[BF_SET5:%.*]] = select i1 [[B:%.*]], i32 196608, i32 131072
 ; CHECK-NEXT:    [[BF_SET12:%.*]] = or disjoint i32 [[BF_SET5]], [[BF_CLEAR4]]
 ; CHECK-NEXT:    store i32 [[BF_SET12]], ptr [[LII]], align 4
@@ -290,8 +277,7 @@ define void @incompletestruct_bb(i1 %b, i1 %c) {
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[LII]])
-; CHECK-NEXT:    [[BF_LOAD:%.*]] = load i32, ptr [[LII]], align 4
-; CHECK-NEXT:    [[BF_CLEAR4:%.*]] = and i32 [[BF_LOAD]], -262144
+; CHECK-NEXT:    [[BF_CLEAR4:%.*]] = and i32 0, -262144
 ; CHECK-NEXT:    [[BF_SET5:%.*]] = select i1 [[B:%.*]], i32 196608, i32 131072
 ; CHECK-NEXT:    [[BF_SET12:%.*]] = or disjoint i32 [[BF_SET5]], [[BF_CLEAR4]]
 ; CHECK-NEXT:    store i32 [[BF_SET12]], ptr [[LII]], align 4

>From a306a3f56b7162587002e91d1d5852f6457630bd Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Wed, 27 Nov 2024 13:10:56 +0000
Subject: [PATCH 4/5] Add a UndefinedVal to SSAUpdater, to use instead of
 Poison

---
 .../llvm/Transforms/Utils/SSAUpdater.h        | 11 ++++++++++
 .../llvm/Transforms/Utils/SSAUpdaterImpl.h    | 13 ++++++++----
 llvm/lib/Transforms/Scalar/SROA.cpp           |  9 ++++-----
 llvm/lib/Transforms/Utils/SSAUpdater.cpp      | 20 +++++++++++++++----
 .../SROA/non-capturing-call-readonly.ll       |  2 +-
 .../test/Transforms/SROA/readonlynocapture.ll |  4 ++--
 6 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h
index 73649766a95388..fea37466ec9594 100644
--- a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h
+++ b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h
@@ -55,6 +55,10 @@ class SSAUpdater {
   /// the vector.
   SmallVectorImpl<PHINode *> *InsertedPHIs;
 
+  /// The Undefined value (usually poison) to use for loads with no available
+  /// values or blocks considered unreachable.
+  Value *UndefinedVal = nullptr;
+
 public:
   /// If InsertedPHIs is specified, it will be filled
   /// in with all PHI Nodes created by rewriting.
@@ -134,6 +138,13 @@ class SSAUpdater {
   /// inserted values.
   void RewriteUseAfterInsertions(Use &U);
 
+  /// Set the UndefinedVal used for loads with no available values or blocks
+  /// considered unreachable.
+  void SetUndefinedVal(Value *V);
+
+  /// Get the UndefinedVal or Poison if it is unset.
+  Value *GetUndefinedVal(Type *Ty);
+
 private:
   Value *GetValueAtEndOfBlockInternal(BasicBlock *BB);
   void UpdateDebugValue(Instruction *I, DbgValueInst *DbgValue);
diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
index 746926e5bee331..1ee8b6be40deb0 100644
--- a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
+++ b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
@@ -81,11 +81,14 @@ class SSAUpdaterImpl {
 
   BBMapTy BBMap;
   BumpPtrAllocator Allocator;
+  std::optional<ValT> UndefinedVal;
 
 public:
   explicit SSAUpdaterImpl(UpdaterT *U, AvailableValsTy *A,
-                          SmallVectorImpl<PhiT *> *Ins) :
-    Updater(U), AvailableVals(A), InsertedPHIs(Ins) {}
+                          SmallVectorImpl<PhiT *> *Ins,
+                          std::optional<ValT> UndefinedVal = std::nullopt)
+      : Updater(U), AvailableVals(A), InsertedPHIs(Ins),
+        UndefinedVal(UndefinedVal) {}
 
   /// GetValue - Check to see if AvailableVals has an entry for the specified
   /// BB and if so, return it.  If not, construct SSA form by first
@@ -97,7 +100,7 @@ class SSAUpdaterImpl {
 
     // Special case: bail out if BB is unreachable.
     if (BlockList.size() == 0) {
-      ValT V = Traits::GetPoisonVal(BB, Updater);
+      ValT V = UndefinedVal ? *UndefinedVal : Traits::GetPoisonVal(BB, Updater);
       (*AvailableVals)[BB] = V;
       return V;
     }
@@ -253,7 +256,9 @@ class SSAUpdaterImpl {
 
           // Treat an unreachable predecessor as a definition with 'poison'.
           if (Pred->BlkNum == 0) {
-            Pred->AvailableVal = Traits::GetPoisonVal(Pred->BB, Updater);
+            Pred->AvailableVal = UndefinedVal
+                                     ? *UndefinedVal
+                                     : Traits::GetPoisonVal(Pred->BB, Updater);
             (*AvailableVals)[Pred->BB] = Pred->AvailableVal;
             Pred->DefBB = Pred;
             Pred->BlkNum = PseudoEntry->BlkNum;
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index ffd393f42051e6..7e325f48c601e0 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -5479,7 +5479,7 @@ void SROA::clobberUse(Use &U) {
     }
 }
 
-// A basic LoadAndStorePromoter that does not remove store nodes.
+/// A basic LoadAndStorePromoter that does not remove store nodes.
 class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
 public:
   BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S)
@@ -5505,14 +5505,13 @@ bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
     while (PartitionEnd != AS.end() &&
            (PartitionEnd->beginOffset() < EndOffset ||
             PartitionEnd->endOffset() <= EndOffset)) {
-      EndOffset = std::max(EndOffset, PartitionEnd->endOffset());
       if (AllSameAndValid) {
         AllSameAndValid &= PartitionEnd->beginOffset() == BeginOffset &&
                            PartitionEnd->endOffset() == EndOffset;
         Instruction *User =
             cast<Instruction>(PartitionEnd->getUse()->getUser());
         if (isa<LoadInst>(User) || isa<StoreInst>(User)) {
-          // LoadAndStorePromoter requires all the types are the same.
+          // LoadAndStorePromoter requires all the types to be the same.
           Type *UserTy = getLoadStoreType(User);
           if (PartitionType && UserTy != PartitionType)
             AllSameAndValid = false;
@@ -5521,6 +5520,7 @@ bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
         } else if (!isAssumeLikeIntrinsic(User))
           AllSameAndValid = false;
       }
+      EndOffset = std::max(EndOffset, PartitionEnd->endOffset());
       ++PartitionEnd;
     }
 
@@ -5534,8 +5534,7 @@ bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
       BasicLoadAndStorePromoter Promoter(Insts, SSA);
       // Add a zero value at the point of the alloca, to prevent the SSA updater
       // replacing loads with poison which would not be valid for padded loads.
-      SSA.AddAvailableValue(AI.getParent(),
-                            Constant::getNullValue(PartitionType));
+      SSA.SetUndefinedVal(Constant::getNullValue(PartitionType));
       Promoter.run(Insts);
     }
 
diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
index 597d470f18ff32..6dabcfd36be34c 100644
--- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
@@ -135,9 +135,10 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
     }
   }
 
-  // If there are no predecessors, just return poison.
+  // If there are no predecessors, just return poison / the undefined input
+  // value.
   if (PredValues.empty())
-    return PoisonValue::get(ProtoType);
+    return GetUndefinedVal(ProtoType);
 
   // Otherwise, if all the merged values are the same, just use it.
   if (SingularValue)
@@ -359,10 +360,20 @@ Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
   if (Value *V = AvailableVals[BB])
     return V;
 
-  SSAUpdaterImpl<SSAUpdater> Impl(this, &AvailableVals, InsertedPHIs);
+  SSAUpdaterImpl<SSAUpdater> Impl(this, &AvailableVals, InsertedPHIs,
+                                  UndefinedVal ? std::optional(UndefinedVal)
+                                               : std::nullopt);
   return Impl.GetValue(BB);
 }
 
+void SSAUpdater::SetUndefinedVal(Value *V) { UndefinedVal = V; }
+
+Value *SSAUpdater::GetUndefinedVal(Type *Ty) {
+  if (!UndefinedVal)
+    UndefinedVal = PoisonValue::get(Ty);
+  return UndefinedVal;
+}
+
 //===----------------------------------------------------------------------===//
 // LoadAndStorePromoter Implementation
 //===----------------------------------------------------------------------===//
@@ -484,7 +495,8 @@ void LoadAndStorePromoter::run(const SmallVectorImpl<Instruction *> &Insts) {
     replaceLoadWithValue(ALoad, NewVal);
 
     // Avoid assertions in unreachable code.
-    if (NewVal == ALoad) NewVal = PoisonValue::get(NewVal->getType());
+    if (NewVal == ALoad)
+      NewVal = SSA.GetUndefinedVal(NewVal->getType());
     ALoad->replaceAllUsesWith(NewVal);
     ReplacedLoads[ALoad] = NewVal;
   }
diff --git a/llvm/test/Transforms/SROA/non-capturing-call-readonly.ll b/llvm/test/Transforms/SROA/non-capturing-call-readonly.ll
index 202e8b5ba33a9d..7ba9eac10b4852 100644
--- a/llvm/test/Transforms/SROA/non-capturing-call-readonly.ll
+++ b/llvm/test/Transforms/SROA/non-capturing-call-readonly.ll
@@ -838,7 +838,7 @@ define i8 @dont_transform_load_only() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A:%.*]] = alloca i8, align 1
 ; CHECK-NEXT:    call void @byte_user_of_alloca(ptr [[A]])
-; CHECK-NEXT:    ret i8 poison
+; CHECK-NEXT:    ret i8 0
 ;
 entry:
   %a = alloca i8
diff --git a/llvm/test/Transforms/SROA/readonlynocapture.ll b/llvm/test/Transforms/SROA/readonlynocapture.ll
index da57ff13d3245b..46279ac46f140a 100644
--- a/llvm/test/Transforms/SROA/readonlynocapture.ll
+++ b/llvm/test/Transforms/SROA/readonlynocapture.ll
@@ -150,7 +150,7 @@ define i32 @notdominating() {
 ; CHECK-NEXT:    store i32 0, ptr [[A]], align 4
 ; CHECK-NEXT:    store i32 1, ptr [[B]], align 4
 ; CHECK-NEXT:    call void @callee(ptr [[A]])
-; CHECK-NEXT:    ret i32 poison
+; CHECK-NEXT:    ret i32 0
 ;
   %a = alloca {i32, i32}
   %b = getelementptr i32, ptr %a, i32 1
@@ -249,7 +249,7 @@ define void @incompletestruct(i1 %b, i1 %c) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[LII:%.*]] = alloca [[STRUCT_LOADIMMEDIATEINFO:%.*]], align 4
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[LII]])
-; CHECK-NEXT:    [[BF_CLEAR4:%.*]] = and i32 poison, -262144
+; CHECK-NEXT:    [[BF_CLEAR4:%.*]] = and i32 0, -262144
 ; CHECK-NEXT:    [[BF_SET5:%.*]] = select i1 [[B:%.*]], i32 196608, i32 131072
 ; CHECK-NEXT:    [[BF_SET12:%.*]] = or disjoint i32 [[BF_SET5]], [[BF_CLEAR4]]
 ; CHECK-NEXT:    store i32 [[BF_SET12]], ptr [[LII]], align 4

>From ea8426dab46e7195497e772cb1f1c08eb6750c09 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Thu, 28 Nov 2024 10:27:05 +0000
Subject: [PATCH 5/5] Remove UndefinedVal, have LoadAndStorePromoter keep track
 of the alloca instead.

---
 .../llvm/Transforms/Utils/SSAUpdater.h        | 18 ++++------
 .../llvm/Transforms/Utils/SSAUpdaterImpl.h    | 13 +++----
 llvm/lib/Transforms/Scalar/SROA.cpp           | 30 ++++++++--------
 llvm/lib/Transforms/Utils/SSAUpdater.cpp      | 34 +++++++++----------
 4 files changed, 43 insertions(+), 52 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h
index fea37466ec9594..989cf0b2d0e7b4 100644
--- a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h
+++ b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h
@@ -55,10 +55,6 @@ class SSAUpdater {
   /// the vector.
   SmallVectorImpl<PHINode *> *InsertedPHIs;
 
-  /// The Undefined value (usually poison) to use for loads with no available
-  /// values or blocks considered unreachable.
-  Value *UndefinedVal = nullptr;
-
 public:
   /// If InsertedPHIs is specified, it will be filled
   /// in with all PHI Nodes created by rewriting.
@@ -138,13 +134,6 @@ class SSAUpdater {
   /// inserted values.
   void RewriteUseAfterInsertions(Use &U);
 
-  /// Set the UndefinedVal used for for loads with no available values or blocks
-  /// considered unreachable.
-  void SetUndefinedVal(Value *V);
-
-  /// Get the UndefinedVal or Poison if it is unset.
-  Value *GetUndefinedVal(Type *Ty);
-
 private:
   Value *GetValueAtEndOfBlockInternal(BasicBlock *BB);
   void UpdateDebugValue(Instruction *I, DbgValueInst *DbgValue);
@@ -199,6 +188,13 @@ class LoadAndStorePromoter {
   /// Return false if a sub-class wants to keep one of the loads/stores
   /// after the SSA construction.
   virtual bool shouldDelete(Instruction *I) const { return true; }
+
+  /// Return the value to use for the point in the code that the alloca is
+  /// positioned. This will only be used if an Alloca is included in Insts,
+  /// otherwise the value of an uninitialized load will be assumed to be poison.
+  virtual Value *getValueToUseForAlloca(Instruction *AI) const {
+    return nullptr;
+  }
 };
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
index 1ee8b6be40deb0..746926e5bee331 100644
--- a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
+++ b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
@@ -81,14 +81,11 @@ class SSAUpdaterImpl {
 
   BBMapTy BBMap;
   BumpPtrAllocator Allocator;
-  std::optional<ValT> UndefinedVal;
 
 public:
   explicit SSAUpdaterImpl(UpdaterT *U, AvailableValsTy *A,
-                          SmallVectorImpl<PhiT *> *Ins,
-                          std::optional<ValT> UndefinedVal = std::nullopt)
-      : Updater(U), AvailableVals(A), InsertedPHIs(Ins),
-        UndefinedVal(UndefinedVal) {}
+                          SmallVectorImpl<PhiT *> *Ins) :
+    Updater(U), AvailableVals(A), InsertedPHIs(Ins) {}
 
   /// GetValue - Check to see if AvailableVals has an entry for the specified
   /// BB and if so, return it.  If not, construct SSA form by first
@@ -100,7 +97,7 @@ class SSAUpdaterImpl {
 
     // Special case: bail out if BB is unreachable.
     if (BlockList.size() == 0) {
-      ValT V = UndefinedVal ? *UndefinedVal : Traits::GetPoisonVal(BB, Updater);
+      ValT V = Traits::GetPoisonVal(BB, Updater);
       (*AvailableVals)[BB] = V;
       return V;
     }
@@ -256,9 +253,7 @@ class SSAUpdaterImpl {
 
           // Treat an unreachable predecessor as a definition with 'poison'.
           if (Pred->BlkNum == 0) {
-            Pred->AvailableVal = UndefinedVal
-                                     ? *UndefinedVal
-                                     : Traits::GetPoisonVal(Pred->BB, Updater);
+            Pred->AvailableVal = Traits::GetPoisonVal(Pred->BB, Updater);
             (*AvailableVals)[Pred->BB] = Pred->AvailableVal;
             Pred->DefBB = Pred;
             Pred->BlkNum = PseudoEntry->BlkNum;
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 7e325f48c601e0..bf85cd312f069a 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1397,14 +1397,10 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
   void visitInstruction(Instruction &I) { PI.setAborted(&I); }
 
   void visitCallBase(CallBase &CB) {
-    // If the operands that are U are NoCapture ReadOnly, then we mark it as
+    // If the call operand is NoCapture ReadOnly, then we mark it as
     // EscapedReadOnly.
-    Function *Callee = CB.getCalledFunction();
-    if (Callee && CB.arg_size() == Callee->arg_size() &&
-        !CB.hasOperandBundles() && all_of(enumerate(CB.args()), [&](auto V) {
-          return V.value() != *U || (CB.doesNotCapture(V.index()) &&
-                                     CB.onlyReadsMemory(V.index()));
-        })) {
+    if (CB.doesNotCapture(U->getOperandNo()) &&
+        CB.onlyReadsMemory(U->getOperandNo())) {
       PI.setEscapedReadOnly(&CB);
       return;
     }
@@ -5482,11 +5478,19 @@ void SROA::clobberUse(Use &U) {
 /// A basic LoadAndStorePromoter that does not remove store nodes.
 class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
 public:
-  BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S)
-      : LoadAndStorePromoter(Insts, S) {}
+  BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S,
+                            Type *ZeroType)
+      : LoadAndStorePromoter(Insts, S), ZeroType(ZeroType) {}
   bool shouldDelete(Instruction *I) const override {
-    return !isa<StoreInst>(I);
+    return !isa<StoreInst>(I) && !isa<AllocaInst>(I);
   }
+
+  Value *getValueToUseForAlloca(Instruction *I) const override {
+    return Constant::getNullValue(ZeroType);
+  }
+
+private:
+  Type *ZeroType;
 };
 
 bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
@@ -5531,10 +5535,8 @@ bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
                         << EndOffset << ")\n");
       SmallVector<PHINode *, 4> NewPHIs;
       SSAUpdater SSA(&NewPHIs);
-      BasicLoadAndStorePromoter Promoter(Insts, SSA);
-      // Add a zero value at the point of the alloca, to prevent the SSA updater
-      // replacing loads with poison which would not be valid for padded loads.
-      SSA.SetUndefinedVal(Constant::getNullValue(PartitionType));
+      Insts.push_back(&AI);
+      BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType);
       Promoter.run(Insts);
     }
 
diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
index 6dabcfd36be34c..4b7e007011396b 100644
--- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
@@ -135,10 +135,9 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
     }
   }
 
-  // If there are no predecessors, just return poison / the undefined input
-  // value.
+  // If there are no predecessors, just return poison.
   if (PredValues.empty())
-    return GetUndefinedVal(ProtoType);
+    return PoisonValue::get(ProtoType);
 
   // Otherwise, if all the merged values are the same, just use it.
   if (SingularValue)
@@ -360,20 +359,10 @@ Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
   if (Value *V = AvailableVals[BB])
     return V;
 
-  SSAUpdaterImpl<SSAUpdater> Impl(this, &AvailableVals, InsertedPHIs,
-                                  UndefinedVal ? std::optional(UndefinedVal)
-                                               : std::nullopt);
+  SSAUpdaterImpl<SSAUpdater> Impl(this, &AvailableVals, InsertedPHIs);
   return Impl.GetValue(BB);
 }
 
-void SSAUpdater::SetUndefinedVal(Value *V) { UndefinedVal = V; }
-
-Value *SSAUpdater::GetUndefinedVal(Type *Ty) {
-  if (!UndefinedVal)
-    UndefinedVal = PoisonValue::get(Ty);
-  return UndefinedVal;
-}
-
 //===----------------------------------------------------------------------===//
 // LoadAndStorePromoter Implementation
 //===----------------------------------------------------------------------===//
@@ -423,9 +412,13 @@ void LoadAndStorePromoter::run(const SmallVectorImpl<Instruction *> &Insts) {
       if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
         updateDebugInfo(SI);
         SSA.AddAvailableValue(BB, SI->getOperand(0));
-      } else
+      } else if (auto *AI = dyn_cast<AllocaInst>(User)) {
+        // We treat AllocaInst as a store of a getValueToUseForAlloca value.
+        SSA.AddAvailableValue(BB, getValueToUseForAlloca(AI));
+      } else {
         // Otherwise it is a load, queue it to rewrite as a live-in load.
         LiveInLoads.push_back(cast<LoadInst>(User));
+      }
       BlockUses.clear();
       continue;
     }
@@ -433,7 +426,7 @@ void LoadAndStorePromoter::run(const SmallVectorImpl<Instruction *> &Insts) {
     // Otherwise, check to see if this block is all loads.
     bool HasStore = false;
     for (Instruction *I : BlockUses) {
-      if (isa<StoreInst>(I)) {
+      if (isa<StoreInst>(I) || isa<AllocaInst>(I)) {
         HasStore = true;
         break;
       }
@@ -479,6 +472,12 @@ void LoadAndStorePromoter::run(const SmallVectorImpl<Instruction *> &Insts) {
 
         // Remember that this is the active value in the block.
         StoredValue = SI->getOperand(0);
+      } else if (auto *AI = dyn_cast<AllocaInst>(&I)) {
+        // Check if this is an alloca, in which case we treat it as a store of
+        // getValueToUseForAlloca.
+        if (!isInstInList(AI, Insts))
+          continue;
+        StoredValue = getValueToUseForAlloca(AI);
       }
     }
 
@@ -495,8 +494,7 @@ void LoadAndStorePromoter::run(const SmallVectorImpl<Instruction *> &Insts) {
     replaceLoadWithValue(ALoad, NewVal);
 
     // Avoid assertions in unreachable code.
-    if (NewVal == ALoad)
-      NewVal = SSA.GetUndefinedVal(NewVal->getType());
+    if (NewVal == ALoad) NewVal = PoisonValue::get(NewVal->getType());
     ALoad->replaceAllUsesWith(NewVal);
     ReplacedLoads[ALoad] = NewVal;
   }



More information about the llvm-commits mailing list