[llvm] [LoopUnroll] Add CSE to remove redundant loads after unrolling. (PR #83860)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Fri Apr 5 05:16:02 PDT 2024


https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/83860

>From 7184cd28333e9ef1ada958c2faf3692ba0a9600f Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 4 Mar 2024 14:39:47 +0000
Subject: [PATCH 1/2] [LoopUnroll] Add tests for load CSE as part of
 loop-unroll.

---
 .../Transforms/LoopUnroll/unroll-loads-cse.ll | 480 ++++++++++++++++++
 .../AArch64/extra-unroll-simplifications.ll   |  81 +++
 2 files changed, 561 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopUnroll/unroll-loads-cse.ll

diff --git a/llvm/test/Transforms/LoopUnroll/unroll-loads-cse.ll b/llvm/test/Transforms/LoopUnroll/unroll-loads-cse.ll
new file mode 100644
index 00000000000000..109a1834c3029f
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/unroll-loads-cse.ll
@@ -0,0 +1,480 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -p loop-unroll -S %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+define void @cse_matching_load_from_previous_unrolled_iteration(ptr %src, ptr noalias %dst, i64 %N) {
+; CHECK-LABEL: define void @cse_matching_load_from_previous_unrolled_iteration(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SRC_4:%.*]] = getelementptr i8, ptr [[SRC]], i64 4
+; CHECK-NEXT:    [[SRC_12:%.*]] = getelementptr i8, ptr [[SRC]], i64 12
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[N]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label [[EXIT_UNR_LCSSA:%.*]], label [[ENTRY_NEW:%.*]]
+; CHECK:       entry.new:
+; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[NITER:%.*]] = phi i64 [ 0, [[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC_12:%.*]] = getelementptr i64, ptr [[SRC_12]], i64 [[IV]]
+; CHECK-NEXT:    [[L_12:%.*]] = load i64, ptr [[GEP_SRC_12]], align 8
+; CHECK-NEXT:    [[GEP_SRC_4:%.*]] = getelementptr i64, ptr [[SRC_4]], i64 [[IV]]
+; CHECK-NEXT:    [[L_4:%.*]] = load i64, ptr [[GEP_SRC_4]], align 8
+; CHECK-NEXT:    [[MUL:%.*]] = mul i64 [[L_12]], [[L_4]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i64 [[MUL]], ptr [[GEP_DST]], align 8
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_SRC_12_1:%.*]] = getelementptr i64, ptr [[SRC_12]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[L_12_1:%.*]] = load i64, ptr [[GEP_SRC_12_1]], align 8
+; CHECK-NEXT:    [[GEP_SRC_4_1:%.*]] = getelementptr i64, ptr [[SRC_4]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[L_4_1:%.*]] = load i64, ptr [[GEP_SRC_4_1]], align 8
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_12_1]], [[L_4_1]]
+; CHECK-NEXT:    [[GEP_DST_1:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    store i64 [[MUL_1]], ptr [[GEP_DST_1]], align 8
+; CHECK-NEXT:    [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT:    [[NITER_NEXT_1]] = add i64 [[NITER]], 2
+; CHECK-NEXT:    [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
+; CHECK-NEXT:    br i1 [[NITER_NCMP_1]], label [[EXIT_UNR_LCSSA_LOOPEXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       exit.unr-lcssa.loopexit:
+; CHECK-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_1]], [[LOOP]] ]
+; CHECK-NEXT:    br label [[EXIT_UNR_LCSSA]]
+; CHECK:       exit.unr-lcssa:
+; CHECK-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_UNR_PH]], [[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CHECK-NEXT:    br i1 [[LCMP_MOD]], label [[LOOP_EPIL_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:       loop.epil.preheader:
+; CHECK-NEXT:    br label [[LOOP_EPIL:%.*]]
+; CHECK:       loop.epil:
+; CHECK-NEXT:    [[GEP_SRC_12_EPIL:%.*]] = getelementptr i64, ptr [[SRC_12]], i64 [[IV_UNR]]
+; CHECK-NEXT:    [[L_12_EPIL:%.*]] = load i64, ptr [[GEP_SRC_12_EPIL]], align 8
+; CHECK-NEXT:    [[GEP_SRC_4_EPIL:%.*]] = getelementptr i64, ptr [[SRC_4]], i64 [[IV_UNR]]
+; CHECK-NEXT:    [[L_4_EPIL:%.*]] = load i64, ptr [[GEP_SRC_4_EPIL]], align 8
+; CHECK-NEXT:    [[MUL_EPIL:%.*]] = mul i64 [[L_12_EPIL]], [[L_4_EPIL]]
+; CHECK-NEXT:    [[GEP_DST_EPIL:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IV_UNR]]
+; CHECK-NEXT:    store i64 [[MUL_EPIL]], ptr [[GEP_DST_EPIL]], align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %src.4 = getelementptr i8, ptr %src, i64 4
+  %src.12 = getelementptr i8, ptr %src, i64 12
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.src.12 = getelementptr i64, ptr %src.12, i64 %iv
+  %l.12 = load i64, ptr %gep.src.12, align 8
+  %gep.src.4 = getelementptr i64, ptr %src.4, i64 %iv
+  %l.4 = load i64, ptr %gep.src.4, align 8
+  %mul = mul i64 %l.12, %l.4
+  %gep.dst = getelementptr i64, ptr %dst, i64 %iv
+  store i64 %mul, ptr %gep.dst
+  %iv.next = add nuw nsw i64 %iv, 1
+  %c = icmp eq i64 %iv.next, %N
+  br i1 %c, label %exit, label %loop, !llvm.loop !1
+
+exit:
+  ret void
+}
+
+define void @cse_different_load_types(ptr %src, ptr noalias %dst, i64 %N) {
+; CHECK-LABEL: define void @cse_different_load_types(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SRC_4:%.*]] = getelementptr i8, ptr [[SRC]], i64 4
+; CHECK-NEXT:    [[SRC_12:%.*]] = getelementptr i8, ptr [[SRC]], i64 12
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[N]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label [[EXIT_UNR_LCSSA:%.*]], label [[ENTRY_NEW:%.*]]
+; CHECK:       entry.new:
+; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[NITER:%.*]] = phi i64 [ 0, [[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC_12:%.*]] = getelementptr i64, ptr [[SRC_12]], i64 [[IV]]
+; CHECK-NEXT:    [[L_12:%.*]] = load i32, ptr [[GEP_SRC_12]], align 8
+; CHECK-NEXT:    [[L_12_EXT:%.*]] = zext i32 [[L_12]] to i64
+; CHECK-NEXT:    [[GEP_SRC_4:%.*]] = getelementptr i64, ptr [[SRC_4]], i64 [[IV]]
+; CHECK-NEXT:    [[L_4:%.*]] = load i64, ptr [[GEP_SRC_4]], align 8
+; CHECK-NEXT:    [[MUL:%.*]] = mul i64 [[L_12_EXT]], [[L_4]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i64 [[MUL]], ptr [[GEP_DST]], align 8
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_SRC_12_1:%.*]] = getelementptr i64, ptr [[SRC_12]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[L_12_1:%.*]] = load i32, ptr [[GEP_SRC_12_1]], align 8
+; CHECK-NEXT:    [[L_12_EXT_1:%.*]] = zext i32 [[L_12_1]] to i64
+; CHECK-NEXT:    [[GEP_SRC_4_1:%.*]] = getelementptr i64, ptr [[SRC_4]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[L_4_1:%.*]] = load i64, ptr [[GEP_SRC_4_1]], align 8
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_12_EXT_1]], [[L_4_1]]
+; CHECK-NEXT:    [[GEP_DST_1:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    store i64 [[MUL_1]], ptr [[GEP_DST_1]], align 8
+; CHECK-NEXT:    [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT:    [[NITER_NEXT_1]] = add i64 [[NITER]], 2
+; CHECK-NEXT:    [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
+; CHECK-NEXT:    br i1 [[NITER_NCMP_1]], label [[EXIT_UNR_LCSSA_LOOPEXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       exit.unr-lcssa.loopexit:
+; CHECK-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_1]], [[LOOP]] ]
+; CHECK-NEXT:    br label [[EXIT_UNR_LCSSA]]
+; CHECK:       exit.unr-lcssa:
+; CHECK-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_UNR_PH]], [[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CHECK-NEXT:    br i1 [[LCMP_MOD]], label [[LOOP_EPIL_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:       loop.epil.preheader:
+; CHECK-NEXT:    br label [[LOOP_EPIL:%.*]]
+; CHECK:       loop.epil:
+; CHECK-NEXT:    [[GEP_SRC_12_EPIL:%.*]] = getelementptr i64, ptr [[SRC_12]], i64 [[IV_UNR]]
+; CHECK-NEXT:    [[L_12_EPIL:%.*]] = load i32, ptr [[GEP_SRC_12_EPIL]], align 8
+; CHECK-NEXT:    [[L_12_EXT_EPIL:%.*]] = zext i32 [[L_12_EPIL]] to i64
+; CHECK-NEXT:    [[GEP_SRC_4_EPIL:%.*]] = getelementptr i64, ptr [[SRC_4]], i64 [[IV_UNR]]
+; CHECK-NEXT:    [[L_4_EPIL:%.*]] = load i64, ptr [[GEP_SRC_4_EPIL]], align 8
+; CHECK-NEXT:    [[MUL_EPIL:%.*]] = mul i64 [[L_12_EXT_EPIL]], [[L_4_EPIL]]
+; CHECK-NEXT:    [[GEP_DST_EPIL:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IV_UNR]]
+; CHECK-NEXT:    store i64 [[MUL_EPIL]], ptr [[GEP_DST_EPIL]], align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %src.4 = getelementptr i8, ptr %src, i64 4
+  %src.12 = getelementptr i8, ptr %src, i64 12
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.src.12 = getelementptr i64, ptr %src.12, i64 %iv
+  %l.12 = load i32, ptr %gep.src.12, align 8
+  %l.12.ext = zext i32 %l.12 to i64
+  %gep.src.4 = getelementptr i64, ptr %src.4, i64 %iv
+  %l.4 = load i64, ptr %gep.src.4, align 8
+  %mul = mul i64 %l.12.ext, %l.4
+  %gep.dst = getelementptr i64, ptr %dst, i64 %iv
+  store i64 %mul, ptr %gep.dst
+  %iv.next = add nuw nsw i64 %iv, 1
+  %c = icmp eq i64 %iv.next, %N
+  br i1 %c, label %exit, label %loop, !llvm.loop !1
+
+exit:
+  ret void
+}
+
+define void @cse_volatile_loads(ptr %src, ptr noalias %dst, i64 %N) {
+; CHECK-LABEL: define void @cse_volatile_loads(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SRC_4:%.*]] = getelementptr i8, ptr [[SRC]], i64 4
+; CHECK-NEXT:    [[SRC_12:%.*]] = getelementptr i8, ptr [[SRC]], i64 12
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[N]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label [[EXIT_UNR_LCSSA:%.*]], label [[ENTRY_NEW:%.*]]
+; CHECK:       entry.new:
+; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[NITER:%.*]] = phi i64 [ 0, [[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC_12:%.*]] = getelementptr i64, ptr [[SRC_12]], i64 [[IV]]
+; CHECK-NEXT:    [[L_12:%.*]] = load i64, ptr [[GEP_SRC_12]], align 8
+; CHECK-NEXT:    [[GEP_SRC_4:%.*]] = getelementptr i64, ptr [[SRC_4]], i64 [[IV]]
+; CHECK-NEXT:    [[L_4:%.*]] = load volatile i64, ptr [[GEP_SRC_4]], align 8
+; CHECK-NEXT:    [[MUL:%.*]] = mul i64 [[L_12]], [[L_4]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i64 [[MUL]], ptr [[GEP_DST]], align 8
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_SRC_12_1:%.*]] = getelementptr i64, ptr [[SRC_12]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[L_12_1:%.*]] = load i64, ptr [[GEP_SRC_12_1]], align 8
+; CHECK-NEXT:    [[GEP_SRC_4_1:%.*]] = getelementptr i64, ptr [[SRC_4]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[L_4_1:%.*]] = load volatile i64, ptr [[GEP_SRC_4_1]], align 8
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_12_1]], [[L_4_1]]
+; CHECK-NEXT:    [[GEP_DST_1:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    store i64 [[MUL_1]], ptr [[GEP_DST_1]], align 8
+; CHECK-NEXT:    [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT:    [[NITER_NEXT_1]] = add i64 [[NITER]], 2
+; CHECK-NEXT:    [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
+; CHECK-NEXT:    br i1 [[NITER_NCMP_1]], label [[EXIT_UNR_LCSSA_LOOPEXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       exit.unr-lcssa.loopexit:
+; CHECK-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_1]], [[LOOP]] ]
+; CHECK-NEXT:    br label [[EXIT_UNR_LCSSA]]
+; CHECK:       exit.unr-lcssa:
+; CHECK-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_UNR_PH]], [[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CHECK-NEXT:    br i1 [[LCMP_MOD]], label [[LOOP_EPIL_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:       loop.epil.preheader:
+; CHECK-NEXT:    br label [[LOOP_EPIL:%.*]]
+; CHECK:       loop.epil:
+; CHECK-NEXT:    [[GEP_SRC_12_EPIL:%.*]] = getelementptr i64, ptr [[SRC_12]], i64 [[IV_UNR]]
+; CHECK-NEXT:    [[L_12_EPIL:%.*]] = load i64, ptr [[GEP_SRC_12_EPIL]], align 8
+; CHECK-NEXT:    [[GEP_SRC_4_EPIL:%.*]] = getelementptr i64, ptr [[SRC_4]], i64 [[IV_UNR]]
+; CHECK-NEXT:    [[L_4_EPIL:%.*]] = load volatile i64, ptr [[GEP_SRC_4_EPIL]], align 8
+; CHECK-NEXT:    [[MUL_EPIL:%.*]] = mul i64 [[L_12_EPIL]], [[L_4_EPIL]]
+; CHECK-NEXT:    [[GEP_DST_EPIL:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IV_UNR]]
+; CHECK-NEXT:    store i64 [[MUL_EPIL]], ptr [[GEP_DST_EPIL]], align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %src.4 = getelementptr i8, ptr %src, i64 4
+  %src.12 = getelementptr i8, ptr %src, i64 12
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.src.12 = getelementptr i64, ptr %src.12, i64 %iv
+  %l.12 = load i64, ptr %gep.src.12, align 8
+  %gep.src.4 = getelementptr i64, ptr %src.4, i64 %iv
+  %l.4 = load volatile i64, ptr %gep.src.4, align 8
+  %mul = mul i64 %l.12, %l.4
+  %gep.dst = getelementptr i64, ptr %dst, i64 %iv
+  store i64 %mul, ptr %gep.dst
+  %iv.next = add nuw nsw i64 %iv, 1
+  %c = icmp eq i64 %iv.next, %N
+  br i1 %c, label %exit, label %loop, !llvm.loop !1
+
+exit:
+  ret void
+}
+
+define void @cse_atomic_loads(ptr %src, ptr noalias %dst, i64 %N) {
+; CHECK-LABEL: define void @cse_atomic_loads(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SRC_4:%.*]] = getelementptr i8, ptr [[SRC]], i64 4
+; CHECK-NEXT:    [[SRC_12:%.*]] = getelementptr i8, ptr [[SRC]], i64 12
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[N]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label [[EXIT_UNR_LCSSA:%.*]], label [[ENTRY_NEW:%.*]]
+; CHECK:       entry.new:
+; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[NITER:%.*]] = phi i64 [ 0, [[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC_12:%.*]] = getelementptr i64, ptr [[SRC_12]], i64 [[IV]]
+; CHECK-NEXT:    [[L_12:%.*]] = load i64, ptr [[GEP_SRC_12]], align 8
+; CHECK-NEXT:    [[GEP_SRC_4:%.*]] = getelementptr i64, ptr [[SRC_4]], i64 [[IV]]
+; CHECK-NEXT:    [[L_4:%.*]] = load atomic i64, ptr [[GEP_SRC_4]] unordered, align 8
+; CHECK-NEXT:    [[MUL:%.*]] = mul i64 [[L_12]], [[L_4]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i64 [[MUL]], ptr [[GEP_DST]], align 8
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_SRC_12_1:%.*]] = getelementptr i64, ptr [[SRC_12]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[L_12_1:%.*]] = load i64, ptr [[GEP_SRC_12_1]], align 8
+; CHECK-NEXT:    [[GEP_SRC_4_1:%.*]] = getelementptr i64, ptr [[SRC_4]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[L_4_1:%.*]] = load atomic i64, ptr [[GEP_SRC_4_1]] unordered, align 8
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_12_1]], [[L_4_1]]
+; CHECK-NEXT:    [[GEP_DST_1:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    store i64 [[MUL_1]], ptr [[GEP_DST_1]], align 8
+; CHECK-NEXT:    [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT:    [[NITER_NEXT_1]] = add i64 [[NITER]], 2
+; CHECK-NEXT:    [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
+; CHECK-NEXT:    br i1 [[NITER_NCMP_1]], label [[EXIT_UNR_LCSSA_LOOPEXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       exit.unr-lcssa.loopexit:
+; CHECK-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_1]], [[LOOP]] ]
+; CHECK-NEXT:    br label [[EXIT_UNR_LCSSA]]
+; CHECK:       exit.unr-lcssa:
+; CHECK-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_UNR_PH]], [[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CHECK-NEXT:    br i1 [[LCMP_MOD]], label [[LOOP_EPIL_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:       loop.epil.preheader:
+; CHECK-NEXT:    br label [[LOOP_EPIL:%.*]]
+; CHECK:       loop.epil:
+; CHECK-NEXT:    [[GEP_SRC_12_EPIL:%.*]] = getelementptr i64, ptr [[SRC_12]], i64 [[IV_UNR]]
+; CHECK-NEXT:    [[L_12_EPIL:%.*]] = load i64, ptr [[GEP_SRC_12_EPIL]], align 8
+; CHECK-NEXT:    [[GEP_SRC_4_EPIL:%.*]] = getelementptr i64, ptr [[SRC_4]], i64 [[IV_UNR]]
+; CHECK-NEXT:    [[L_4_EPIL:%.*]] = load atomic i64, ptr [[GEP_SRC_4_EPIL]] unordered, align 8
+; CHECK-NEXT:    [[MUL_EPIL:%.*]] = mul i64 [[L_12_EPIL]], [[L_4_EPIL]]
+; CHECK-NEXT:    [[GEP_DST_EPIL:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IV_UNR]]
+; CHECK-NEXT:    store i64 [[MUL_EPIL]], ptr [[GEP_DST_EPIL]], align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %src.4 = getelementptr i8, ptr %src, i64 4
+  %src.12 = getelementptr i8, ptr %src, i64 12
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.src.12 = getelementptr i64, ptr %src.12, i64 %iv
+  %l.12 = load i64, ptr %gep.src.12, align 8
+  %gep.src.4 = getelementptr i64, ptr %src.4, i64 %iv
+  %l.4 = load atomic i64, ptr %gep.src.4 unordered, align 8
+  %mul = mul i64 %l.12, %l.4
+  %gep.dst = getelementptr i64, ptr %dst, i64 %iv
+  store i64 %mul, ptr %gep.dst
+  %iv.next = add nuw nsw i64 %iv, 1
+  %c = icmp eq i64 %iv.next, %N
+  br i1 %c, label %exit, label %loop, !llvm.loop !1
+
+exit:
+  ret void
+}
+
+define void @cse_load_may_be_clobbered(ptr %src, ptr %dst, i64 %N) {
+; CHECK-LABEL: define void @cse_load_may_be_clobbered(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SRC_4:%.*]] = getelementptr i8, ptr [[SRC]], i64 4
+; CHECK-NEXT:    [[SRC_12:%.*]] = getelementptr i8, ptr [[SRC]], i64 12
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[N]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label [[EXIT_UNR_LCSSA:%.*]], label [[ENTRY_NEW:%.*]]
+; CHECK:       entry.new:
+; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[NITER:%.*]] = phi i64 [ 0, [[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC_12:%.*]] = getelementptr i64, ptr [[SRC_12]], i64 [[IV]]
+; CHECK-NEXT:    [[L_12:%.*]] = load i64, ptr [[GEP_SRC_12]], align 8
+; CHECK-NEXT:    [[GEP_SRC_4:%.*]] = getelementptr i64, ptr [[SRC_4]], i64 [[IV]]
+; CHECK-NEXT:    [[L_4:%.*]] = load i64, ptr [[GEP_SRC_4]], align 8
+; CHECK-NEXT:    [[MUL:%.*]] = mul i64 [[L_12]], [[L_4]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i64 [[MUL]], ptr [[GEP_DST]], align 8
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_SRC_12_1:%.*]] = getelementptr i64, ptr [[SRC_12]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[L_12_1:%.*]] = load i64, ptr [[GEP_SRC_12_1]], align 8
+; CHECK-NEXT:    [[GEP_SRC_4_1:%.*]] = getelementptr i64, ptr [[SRC_4]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[L_4_1:%.*]] = load i64, ptr [[GEP_SRC_4_1]], align 8
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_12_1]], [[L_4_1]]
+; CHECK-NEXT:    [[GEP_DST_1:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    store i64 [[MUL_1]], ptr [[GEP_DST_1]], align 8
+; CHECK-NEXT:    [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
+; CHECK-NEXT:    [[NITER_NEXT_1]] = add i64 [[NITER]], 2
+; CHECK-NEXT:    [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
+; CHECK-NEXT:    br i1 [[NITER_NCMP_1]], label [[EXIT_UNR_LCSSA_LOOPEXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       exit.unr-lcssa.loopexit:
+; CHECK-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_1]], [[LOOP]] ]
+; CHECK-NEXT:    br label [[EXIT_UNR_LCSSA]]
+; CHECK:       exit.unr-lcssa:
+; CHECK-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_UNR_PH]], [[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; CHECK-NEXT:    br i1 [[LCMP_MOD]], label [[LOOP_EPIL_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:       loop.epil.preheader:
+; CHECK-NEXT:    br label [[LOOP_EPIL:%.*]]
+; CHECK:       loop.epil:
+; CHECK-NEXT:    [[GEP_SRC_12_EPIL:%.*]] = getelementptr i64, ptr [[SRC_12]], i64 [[IV_UNR]]
+; CHECK-NEXT:    [[L_12_EPIL:%.*]] = load i64, ptr [[GEP_SRC_12_EPIL]], align 8
+; CHECK-NEXT:    [[GEP_SRC_4_EPIL:%.*]] = getelementptr i64, ptr [[SRC_4]], i64 [[IV_UNR]]
+; CHECK-NEXT:    [[L_4_EPIL:%.*]] = load i64, ptr [[GEP_SRC_4_EPIL]], align 8
+; CHECK-NEXT:    [[MUL_EPIL:%.*]] = mul i64 [[L_12_EPIL]], [[L_4_EPIL]]
+; CHECK-NEXT:    [[GEP_DST_EPIL:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IV_UNR]]
+; CHECK-NEXT:    store i64 [[MUL_EPIL]], ptr [[GEP_DST_EPIL]], align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %src.4 = getelementptr i8, ptr %src, i64 4
+  %src.12 = getelementptr i8, ptr %src, i64 12
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.src.12 = getelementptr i64, ptr %src.12, i64 %iv
+  %l.12 = load i64, ptr %gep.src.12, align 8
+  %gep.src.4 = getelementptr i64, ptr %src.4, i64 %iv
+  %l.4 = load i64, ptr %gep.src.4, align 8
+  %mul = mul i64 %l.12, %l.4
+  %gep.dst = getelementptr i64, ptr %dst, i64 %iv
+  store i64 %mul, ptr %gep.dst
+  %iv.next = add nuw nsw i64 %iv, 1
+  %c = icmp eq i64 %iv.next, %N
+  br i1 %c, label %exit, label %loop, !llvm.loop !1
+
+exit:
+  ret void
+}
+
+
+declare void @foo()
+
+define void @loop_body_with_dead_blocks(ptr %src) {
+; CHECK-LABEL: define void @loop_body_with_dead_blocks(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer.header.loopexit:
+; CHECK-NEXT:    br label [[OUTER_HEADER]]
+; CHECK:       outer.header:
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    br label [[LOOP_BB:%.*]]
+; CHECK:       loop.bb.dead:
+; CHECK-NEXT:    unreachable
+; CHECK:       loop.bb:
+; CHECK-NEXT:    [[L_1:%.*]] = load i32, ptr [[SRC]], align 8
+; CHECK-NEXT:    [[C_1:%.*]] = icmp eq i32 [[L_1]], 0
+; CHECK-NEXT:    br i1 [[C_1]], label [[OUTER_HEADER_LOOPEXIT:%.*]], label [[LOOP_LATCH:%.*]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    call void @foo()
+; CHECK-NEXT:    [[L_2:%.*]] = load i32, ptr [[SRC]], align 8
+; CHECK-NEXT:    [[C_2:%.*]] = icmp eq i32 [[L_2]], 1
+; CHECK-NEXT:    br i1 [[C_2]], label [[EXIT:%.*]], label [[LOOP_HEADER_1:%.*]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       loop.header.1:
+; CHECK-NEXT:    br label [[LOOP_BB_1:%.*]]
+; CHECK:       loop.bb.1:
+; CHECK-NEXT:    [[L_1_1:%.*]] = load i32, ptr [[SRC]], align 8
+; CHECK-NEXT:    [[C_1_1:%.*]] = icmp eq i32 [[L_1_1]], 0
+; CHECK-NEXT:    br i1 [[C_1_1]], label [[OUTER_HEADER_LOOPEXIT]], label [[LOOP_LATCH_1:%.*]]
+; CHECK:       loop.latch.1:
+; CHECK-NEXT:    call void @foo()
+; CHECK-NEXT:    [[L_2_1:%.*]] = load i32, ptr [[SRC]], align 8
+; CHECK-NEXT:    [[C_2_1:%.*]] = icmp eq i32 [[L_2_1]], 1
+; CHECK-NEXT:    br i1 [[C_2_1]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  br label %loop.header
+
+loop.header:
+  br label %loop.bb
+
+loop.bb.dead:
+  br label %loop.bb
+
+loop.bb:
+  %l.1 = load i32, ptr %src, align 8
+  %c.1 = icmp eq i32 %l.1, 0
+  br i1 %c.1, label %outer.header, label %loop.latch
+
+loop.latch:
+  call void @foo()
+  %l.2 = load i32, ptr %src, align 8
+  %c.2 = icmp eq i32 %l.2, 1
+  br i1 %c.2, label %exit, label %loop.header, !llvm.loop !1
+
+exit:
+  ret void
+}
+
+!0 = !{!"llvm.loop.mustprogress"}
+!1 = distinct !{!1, !0, !2}
+!2 = !{!"llvm.loop.unroll.count", i32 2}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.mustprogress"}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META8:![0-9]+]]}
+; CHECK: [[META8]] = !{!"llvm.loop.unroll.count", i32 2}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]}
+;.
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
index 6132c35c96ca32..b32f4e2a258cd7 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
@@ -72,6 +72,86 @@ exit:
   ret void
 }
 
+define void @cse_matching_load_from_previous_unrolled_iteration(i32 %N, ptr %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @cse_matching_load_from_previous_unrolled_iteration(
+; CHECK-SAME: i32 [[N:%.*]], ptr nocapture readonly [[SRC:%.*]], ptr noalias nocapture writeonly [[DST:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SRC_4:%.*]] = getelementptr i8, ptr [[SRC]], i64 4
+; CHECK-NEXT:    [[SRC_12:%.*]] = getelementptr i8, ptr [[SRC]], i64 12
+; CHECK-NEXT:    [[CMP141:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP141]], label [[LOOP_LATCH_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:       loop.latch.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[N]], 1
+; CHECK-NEXT:    br i1 [[TMP0]], label [[EXIT_LOOPEXIT_UNR_LCSSA:%.*]], label [[LOOP_LATCH_PREHEADER_NEW:%.*]]
+; CHECK:       loop.latch.preheader.new:
+; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483646
+; CHECK-NEXT:    br label [[LOOP_LATCH:%.*]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[LOOP_LATCH]] ]
+; CHECK-NEXT:    [[NITER:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[LOOP_LATCH]] ]
+; CHECK-NEXT:    [[GEP_SRC_12:%.*]] = getelementptr <2 x i32>, ptr [[SRC_12]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[L_12:%.*]] = load <2 x i32>, ptr [[GEP_SRC_12]], align 8
+; CHECK-NEXT:    [[GEP_SRC_4:%.*]] = getelementptr <2 x i32>, ptr [[SRC_4]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[L_4:%.*]] = load <2 x i32>, ptr [[GEP_SRC_4]], align 8
+; CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i32> [[L_4]], [[L_12]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr <2 x i32>, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store <2 x i32> [[MUL]], ptr [[GEP_DST]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = or disjoint i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[GEP_SRC_12_1:%.*]] = getelementptr <2 x i32>, ptr [[SRC_12]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    [[L_12_1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_12_1]], align 8
+; CHECK-NEXT:    [[GEP_SRC_4_1:%.*]] = getelementptr <2 x i32>, ptr [[SRC_4]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    [[L_4_1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_4_1]], align 8
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul <2 x i32> [[L_4_1]], [[L_12_1]]
+; CHECK-NEXT:    [[GEP_DST_1:%.*]] = getelementptr <2 x i32>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    store <2 x i32> [[MUL_1]], ptr [[GEP_DST_1]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT:    [[NITER_NEXT_1]] = add i64 [[NITER]], 2
+; CHECK-NEXT:    [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
+; CHECK-NEXT:    br i1 [[NITER_NCMP_1]], label [[EXIT_LOOPEXIT_UNR_LCSSA]], label [[LOOP_LATCH]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       exit.loopexit.unr-lcssa:
+; CHECK-NEXT:    [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, [[LOOP_LATCH_PREHEADER]] ], [ [[INDVARS_IV_NEXT_1]], [[LOOP_LATCH]] ]
+; CHECK-NEXT:    [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
+; CHECK-NEXT:    br i1 [[LCMP_MOD_NOT]], label [[EXIT]], label [[LOOP_LATCH_EPIL:%.*]]
+; CHECK:       loop.latch.epil:
+; CHECK-NEXT:    [[GEP_SRC_12_EPIL:%.*]] = getelementptr <2 x i32>, ptr [[SRC_12]], i64 [[INDVARS_IV_UNR]]
+; CHECK-NEXT:    [[L_12_EPIL:%.*]] = load <2 x i32>, ptr [[GEP_SRC_12_EPIL]], align 8
+; CHECK-NEXT:    [[GEP_SRC_4_EPIL:%.*]] = getelementptr <2 x i32>, ptr [[SRC_4]], i64 [[INDVARS_IV_UNR]]
+; CHECK-NEXT:    [[L_4_EPIL:%.*]] = load <2 x i32>, ptr [[GEP_SRC_4_EPIL]], align 8
+; CHECK-NEXT:    [[MUL_EPIL:%.*]] = mul <2 x i32> [[L_4_EPIL]], [[L_12_EPIL]]
+; CHECK-NEXT:    [[GEP_DST_EPIL:%.*]] = getelementptr <2 x i32>, ptr [[DST]], i64 [[INDVARS_IV_UNR]]
+; CHECK-NEXT:    store <2 x i32> [[MUL_EPIL]], ptr [[GEP_DST_EPIL]], align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %src.4 = getelementptr i8, ptr %src, i64 4
+  %src.12 = getelementptr i8, ptr %src, i64 12
+  br label %loop.header
+
+loop.header:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %cmp14 = icmp slt i32 %iv, %N
+  br i1 %cmp14, label %loop.latch, label %exit
+
+loop.latch:
+  %iv.ext = zext i32 %iv to i64
+  %gep.src.12 = getelementptr <2 x i32>, ptr %src.12, i64 %iv.ext
+  %l.12 = load <2 x i32>, ptr %gep.src.12, align 8
+  %gep.src.4 = getelementptr <2 x i32>, ptr %src.4, i64 %iv.ext
+  %l.4 = load <2 x i32>, ptr %gep.src.4, align 8
+  %mul = mul <2 x i32> %l.12, %l.4
+  %gep.dst = getelementptr <2 x i32>, ptr %dst, i64 %iv.ext
+  store <2 x i32> %mul, ptr %gep.dst
+  %iv.next = add nuw nsw i32 %iv, 1
+  br label %loop.header, !llvm.loop !0
+
+exit:
+  ret void
+}
+
 !0 = distinct !{!0, !1, !2}
 !1 = !{!"llvm.loop.mustprogress"}
 !2 = !{!"llvm.loop.unroll.count", i32 2}
@@ -79,4 +159,5 @@ exit:
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK: [[META1]] = !{!"llvm.loop.mustprogress"}
 ; CHECK: [[META2]] = !{!"llvm.loop.unroll.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
 ;.

>From a8e73370f086d07c6f03e4fbd23be06d89ca29ba Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 29 Feb 2024 20:17:11 +0000
Subject: [PATCH 2/2] [LoopUnroll] Add CSE to remove redundant loads after
 unrolling.

This patch adds loadCSE support to simplifyLoopAfterUnroll. It is based
on EarlyCSE's implementation using ScopedHashTable and uses SCEV for the
accessed pointers to find redundant loads after unrolling.
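
As a rough illustration (simplified from the unroll-loads-cse.ll test added
in the first patch), after unrolling by 2 the second copy's load from
%src + 4 at iv+1 reads the same address as the first copy's load from
%src + 12 at iv, so it can be reused:

  %l.12  = load i64, ptr %gep.src.12    ; %src + 12 + 8 * %iv
  ...
  %l.4.1 = load i64, ptr %gep.src.4.1   ; %src + 4 + 8 * (%iv + 1), same address
  ; loads are keyed by the SCEV of their pointer operand, so %l.4.1 is
  ; replaced by %l.12 when no clobber is seen in between.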

This applies to the late unroll pass only; for full unrolling, those
redundant loads will be cleaned up by the regular pipeline.

If we agree to move forward with this approach, there may be potential
to share some logic with EarlyCSE and to split off the MemorySSA changes
separately.

The current approach constructs MemorySSA on-demand per loop, but there is
still a small but notable compile-time impact:

stage1-O3  +0.04%
stage1-ReleaseThinLTO +0.06%
stage1-ReleaseLTO-g +0.05%
stage1-O0-g +0.02%
stage2-O3 +0.09%
stage2-O0-g +0.04%
stage2-clang +0.02%

https://llvm-compile-time-tracker.com/compare.php?from=c089fa5a729e217d0c0d4647656386dac1a1b135&to=ec7c0f27cb5c12b600d9adfc8543d131765ec7be&stat=instructions:u

This benefits some workloads with runtime unrolling disabled, where users
use pragmas to force unrolling, as well as workloads with runtime
unrolling enabled.

On SPEC/MultiSource, this removes a number of loads after unrolling
on AArch64 with runtime unrolling enabled.

External/S...te/526.blender_r/526.blender_r    96
MultiSourc...rks/mediabench/gsm/toast/toast    39
SingleSource/Benchmarks/Misc/ffbench            4
External/SPEC/CINT2006/403.gcc/403.gcc         18
MultiSourc.../Applications/JM/ldecod/ldecod     4
MultiSourc.../mediabench/jpeg/jpeg-6a/cjpeg     6
MultiSourc...OE-ProxyApps-C/miniGMG/miniGMG     9
MultiSourc...e/Applications/ClamAV/clamscan     4
MultiSourc.../MallocBench/espresso/espresso     3
MultiSourc...dence-flt/LinearDependence-flt     2
MultiSourc...ch/office-ispell/office-ispell     4
MultiSourc...ch/consumer-jpeg/consumer-jpeg     6
MultiSourc...ench/security-sha/security-sha    11
MultiSourc...chmarks/McCat/04-bisect/bisect     3
SingleSour...tTests/2020-01-06-coverage-009    12
MultiSourc...ench/telecomm-gsm/telecomm-gsm    39
MultiSourc...lds-flt/CrossingThresholds-flt    24
MultiSourc...dence-dbl/LinearDependence-dbl     2
External/S...C/CINT2006/445.gobmk/445.gobmk     6
MultiSourc...enchmarks/mafft/pairlocalalign    53
External/S...31.deepsjeng_r/531.deepsjeng_r     3
External/S...rate/510.parest_r/510.parest_r    58
External/S...NT2006/464.h264ref/464.h264ref    29
External/S...NT2017rate/502.gcc_r/502.gcc_r    45
External/S...C/CINT2006/456.hmmer/456.hmmer     6
External/S...te/538.imagick_r/538.imagick_r    18
External/S.../CFP2006/447.dealII/447.dealII     4
MultiSourc...OE-ProxyApps-C++/miniFE/miniFE    12
External/S...2017rate/525.x264_r/525.x264_r    36
MultiSourc...Benchmarks/7zip/7zip-benchmark    33
MultiSourc...hmarks/ASC_Sequoia/AMGmk/AMGmk     2
MultiSourc...chmarks/VersaBench/8b10b/8b10b     1
MultiSourc.../Applications/JM/lencod/lencod   116
MultiSourc...lds-dbl/CrossingThresholds-dbl    24
MultiSource/Benchmarks/McCat/05-eks/eks        15
---
 llvm/include/llvm/Analysis/MemorySSA.h        |  15 +-
 .../llvm/Transforms/Utils/UnrollLoop.h        |   7 +-
 llvm/lib/Analysis/MemorySSA.cpp               |  97 ++++++++---
 llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp |  14 +-
 llvm/lib/Transforms/Utils/LoopUnroll.cpp      | 157 +++++++++++++++++-
 .../Transforms/LoopUnroll/unroll-loads-cse.ll |   7 +-
 .../AArch64/extra-unroll-simplifications.ll   |   4 +-
 7 files changed, 254 insertions(+), 47 deletions(-)

diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h
index caf0e31fd37d6c..2ca5c281166cad 100644
--- a/llvm/include/llvm/Analysis/MemorySSA.h
+++ b/llvm/include/llvm/Analysis/MemorySSA.h
@@ -110,6 +110,7 @@ namespace llvm {
 template <class GraphType> struct GraphTraits;
 class BasicBlock;
 class Function;
+class Loop;
 class Instruction;
 class LLVMContext;
 class MemoryAccess;
@@ -700,6 +701,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryPhi, MemoryAccess)
 class MemorySSA {
 public:
   MemorySSA(Function &, AliasAnalysis *, DominatorTree *);
+  MemorySSA(Loop &, AliasAnalysis *, DominatorTree *);
 
   // MemorySSA must remain where it's constructed; Walkers it creates store
   // pointers to it.
@@ -800,10 +802,11 @@ class MemorySSA {
   // Used by Memory SSA dumpers and wrapper pass
   friend class MemorySSAUpdater;
 
+  template <typename IterT>
   void verifyOrderingDominationAndDefUses(
-      Function &F, VerificationLevel = VerificationLevel::Fast) const;
-  void verifyDominationNumbers(const Function &F) const;
-  void verifyPrevDefInPhis(Function &F) const;
+      IterT Blocks, VerificationLevel = VerificationLevel::Fast) const;
+  template <typename IterT> void verifyDominationNumbers(IterT Blocks) const;
+  template <typename IterT> void verifyPrevDefInPhis(IterT Blocks) const;
 
   // This is used by the use optimizer and updater.
   AccessList *getWritableBlockAccesses(const BasicBlock *BB) const {
@@ -847,7 +850,8 @@ class MemorySSA {
   class OptimizeUses;
 
   CachingWalker *getWalkerImpl();
-  void buildMemorySSA(BatchAAResults &BAA);
+  template <typename IterT>
+  void buildMemorySSA(BatchAAResults &BAA, IterT Blocks);
 
   void prepareForMoveTo(MemoryAccess *, BasicBlock *);
   void verifyUseInDefs(MemoryAccess *, MemoryAccess *) const;
@@ -871,7 +875,8 @@ class MemorySSA {
   void renumberBlock(const BasicBlock *) const;
   AliasAnalysis *AA = nullptr;
   DominatorTree *DT;
-  Function &F;
+  Function *F = nullptr;
+  Loop *L = nullptr;
 
   // Memory SSA mappings
   DenseMap<const Value *, MemoryAccess *> ValueToMemoryAccess;
diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
index e8b03f81b34830..bd804dc1126624 100644
--- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -22,6 +22,7 @@
 namespace llvm {
 
 class AssumptionCache;
+class AAResults;
 class BasicBlock;
 class BlockFrequencyInfo;
 class DependenceInfo;
@@ -79,7 +80,8 @@ LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
                             AssumptionCache *AC,
                             const llvm::TargetTransformInfo *TTI,
                             OptimizationRemarkEmitter *ORE, bool PreserveLCSSA,
-                            Loop **RemainderLoop = nullptr);
+                            Loop **RemainderLoop = nullptr,
+                            AAResults *AA = nullptr);
 
 bool UnrollRuntimeLoopRemainder(
     Loop *L, unsigned Count, bool AllowExpensiveTripCount,
@@ -102,7 +104,8 @@ bool isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
 void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
                              ScalarEvolution *SE, DominatorTree *DT,
                              AssumptionCache *AC,
-                             const TargetTransformInfo *TTI);
+                             const TargetTransformInfo *TTI,
+                             AAResults *AA = nullptr);
 
 MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name);
 
diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp
index 82a6c470650cc9..d88eaceca1a2e7 100644
--- a/llvm/lib/Analysis/MemorySSA.cpp
+++ b/llvm/lib/Analysis/MemorySSA.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CFGPrinter.h"
 #include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/AssemblyAnnotationWriter.h"
@@ -1228,7 +1229,7 @@ void MemorySSA::markUnreachableAsLiveOnEntry(BasicBlock *BB) {
 }
 
 MemorySSA::MemorySSA(Function &Func, AliasAnalysis *AA, DominatorTree *DT)
-    : DT(DT), F(Func), LiveOnEntryDef(nullptr), Walker(nullptr),
+    : DT(DT), F(&Func), LiveOnEntryDef(nullptr), Walker(nullptr),
       SkipWalker(nullptr) {
   // Build MemorySSA using a batch alias analysis. This reuses the internal
   // state that AA collects during an alias()/getModRefInfo() call. This is
@@ -1237,7 +1238,28 @@ MemorySSA::MemorySSA(Function &Func, AliasAnalysis *AA, DominatorTree *DT)
   // make queries about all the instructions in the Function.
   assert(AA && "No alias analysis?");
   BatchAAResults BatchAA(*AA);
-  buildMemorySSA(BatchAA);
+  buildMemorySSA(BatchAA, iterator_range(F->begin(), F->end()));
+  // Intentionally leave AA to nullptr while building so we don't accidently
+  // use non-batch AliasAnalysis.
+  this->AA = AA;
+  // Also create the walker here.
+  getWalker();
+}
+
+MemorySSA::MemorySSA(Loop &L, AliasAnalysis *AA, DominatorTree *DT)
+    : DT(DT), L(&L), LiveOnEntryDef(nullptr), Walker(nullptr),
+      SkipWalker(nullptr) {
+  // Build MemorySSA using a batch alias analysis. This reuses the internal
+  // state that AA collects during an alias()/getModRefInfo() call. This is
+  // safe because there are no CFG changes while building MemorySSA and can
+  // significantly reduce the time spent by the compiler in AA, because we will
+  // make queries about all the instructions in the Function.
+  assert(AA && "No alias analysis?");
+  BatchAAResults BatchAA(*AA);
+  buildMemorySSA(
+      BatchAA, map_range(L.blocks(), [](const BasicBlock *BB) -> BasicBlock & {
+        return *const_cast<BasicBlock *>(BB);
+      }));
   // Intentionally leave AA to nullptr while building so we don't accidently
   // use non-batch AliasAnalysis.
   this->AA = AA;
@@ -1491,16 +1513,17 @@ void MemorySSA::placePHINodes(
     createMemoryPhi(BB);
 }
 
-void MemorySSA::buildMemorySSA(BatchAAResults &BAA) {
+template <typename IterT>
+void MemorySSA::buildMemorySSA(BatchAAResults &BAA, IterT Blocks) {
   // We create an access to represent "live on entry", for things like
   // arguments or users of globals, where the memory they use is defined before
   // the beginning of the function. We do not actually insert it into the IR.
   // We do not define a live on exit for the immediate uses, and thus our
   // semantics do *not* imply that something with no immediate uses can simply
   // be removed.
-  BasicBlock &StartingPoint = F.getEntryBlock();
-  LiveOnEntryDef.reset(new MemoryDef(F.getContext(), nullptr, nullptr,
-                                     &StartingPoint, NextID++));
+  BasicBlock &StartingPoint = *Blocks.begin();
+  LiveOnEntryDef.reset(new MemoryDef(StartingPoint.getContext(), nullptr,
+                                     nullptr, &StartingPoint, NextID++));
 
   // We maintain lists of memory accesses per-block, trading memory for time. We
   // could just look up the memory access for every possible instruction in the
@@ -1508,7 +1531,7 @@ void MemorySSA::buildMemorySSA(BatchAAResults &BAA) {
   SmallPtrSet<BasicBlock *, 32> DefiningBlocks;
   // Go through each block, figure out where defs occur, and chain together all
   // the accesses.
-  for (BasicBlock &B : F) {
+  for (BasicBlock &B : Blocks) {
     bool InsertIntoDef = false;
     AccessList *Accesses = nullptr;
     DefsList *Defs = nullptr;
@@ -1535,11 +1558,26 @@ void MemorySSA::buildMemorySSA(BatchAAResults &BAA) {
   // Now do regular SSA renaming on the MemoryDef/MemoryUse. Visited will get
   // filled in with all blocks.
   SmallPtrSet<BasicBlock *, 16> Visited;
-  renamePass(DT->getRootNode(), LiveOnEntryDef.get(), Visited);
+  if (L) {
+    if (auto *P = getMemoryAccess(L->getLoopPreheader())) {
+      for (Use &U : make_early_inc_range(P->uses())) {
+        U.set(LiveOnEntryDef.get());
+      }
+      removeFromLists(P);
+    }
+    SmallVector<BasicBlock *> ExitBlocks;
+    L->getExitBlocks(ExitBlocks);
+    Visited.insert(ExitBlocks.begin(), ExitBlocks.end());
+    renamePass(DT->getNode(L->getLoopPreheader()), LiveOnEntryDef.get(),
+               Visited);
+
+  } else {
+    renamePass(DT->getRootNode(), LiveOnEntryDef.get(), Visited);
+  }
 
   // Mark the uses in unreachable blocks as live on entry, so that they go
   // somewhere.
-  for (auto &BB : F)
+  for (auto &BB : Blocks)
     if (!Visited.count(&BB))
       markUnreachableAsLiveOnEntry(&BB);
 }
@@ -1847,7 +1885,10 @@ void MemorySSA::removeFromLists(MemoryAccess *MA, bool ShouldDelete) {
 
 void MemorySSA::print(raw_ostream &OS) const {
   MemorySSAAnnotatedWriter Writer(this);
-  F.print(OS, &Writer);
+  Function *F = this->F;
+  if (L)
+    F = L->getHeader()->getParent();
+  F->print(OS, &Writer);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1860,10 +1901,23 @@ void MemorySSA::verifyMemorySSA(VerificationLevel VL) const {
 #endif
 
 #ifndef NDEBUG
-  verifyOrderingDominationAndDefUses(F, VL);
-  verifyDominationNumbers(F);
-  if (VL == VerificationLevel::Full)
-    verifyPrevDefInPhis(F);
+  if (F) {
+    auto Blocks = iterator_range(F->begin(), F->end());
+    verifyOrderingDominationAndDefUses(Blocks, VL);
+    verifyDominationNumbers(Blocks);
+    if (VL == VerificationLevel::Full)
+      verifyPrevDefInPhis(Blocks);
+  } else {
+    assert(L && "must either have loop or function");
+    auto Blocks =
+        map_range(L->blocks(), [](const BasicBlock *BB) -> BasicBlock & {
+          return *const_cast<BasicBlock *>(BB);
+        });
+    verifyOrderingDominationAndDefUses(Blocks, VL);
+    verifyDominationNumbers(Blocks);
+    if (VL == VerificationLevel::Full)
+      verifyPrevDefInPhis(Blocks);
+  }
 #endif
   // Previously, the verification used to also verify that the clobberingAccess
   // cached by MemorySSA is the same as the clobberingAccess found at a later
@@ -1877,8 +1931,9 @@ void MemorySSA::verifyMemorySSA(VerificationLevel VL) const {
   // example, see test4 added in D51960.
 }
 
-void MemorySSA::verifyPrevDefInPhis(Function &F) const {
-  for (const BasicBlock &BB : F) {
+template <typename IterT>
+void MemorySSA::verifyPrevDefInPhis(IterT Blocks) const {
+  for (const BasicBlock &BB : Blocks) {
     if (MemoryPhi *Phi = getMemoryAccess(&BB)) {
       for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
         auto *Pred = Phi->getIncomingBlock(I);
@@ -1913,12 +1968,13 @@ void MemorySSA::verifyPrevDefInPhis(Function &F) const {
 
 /// Verify that all of the blocks we believe to have valid domination numbers
 /// actually have valid domination numbers.
-void MemorySSA::verifyDominationNumbers(const Function &F) const {
+template <typename IterT>
+void MemorySSA::verifyDominationNumbers(IterT Blocks) const {
   if (BlockNumberingValid.empty())
     return;
 
   SmallPtrSet<const BasicBlock *, 16> ValidBlocks = BlockNumberingValid;
-  for (const BasicBlock &BB : F) {
+  for (const BasicBlock &BB : Blocks) {
     if (!ValidBlocks.count(&BB))
       continue;
 
@@ -1954,14 +2010,15 @@ void MemorySSA::verifyDominationNumbers(const Function &F) const {
 /// Verify def-uses: the immediate use information - walk all the memory
 /// accesses and verifying that, for each use, it appears in the appropriate
 /// def's use list
-void MemorySSA::verifyOrderingDominationAndDefUses(Function &F,
+template <typename IterT>
+void MemorySSA::verifyOrderingDominationAndDefUses(IterT Blocks,
                                                    VerificationLevel VL) const {
   // Walk all the blocks, comparing what the lookups think and what the access
   // lists think, as well as the order in the blocks vs the order in the access
   // lists.
   SmallVector<MemoryAccess *, 32> ActualAccesses;
   SmallVector<MemoryAccess *, 32> ActualDefs;
-  for (BasicBlock &B : F) {
+  for (BasicBlock &B : Blocks) {
     const AccessList *AL = getBlockAccesses(&B);
     const auto *DL = getBlockDefs(&B);
     MemoryPhi *Phi = getMemoryAccess(&B);
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 75fb8765061edf..5f38e64873084c 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/DenseMapInfo.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopedHashTable.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
@@ -27,6 +28,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/LoopUnrollAnalyzer.h"
+#include "llvm/Analysis/MemorySSA.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
@@ -1140,7 +1142,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
                 std::optional<bool> ProvidedUpperBound,
                 std::optional<bool> ProvidedAllowPeeling,
                 std::optional<bool> ProvidedAllowProfileBasedPeeling,
-                std::optional<unsigned> ProvidedFullUnrollMaxCount) {
+                std::optional<unsigned> ProvidedFullUnrollMaxCount,
+                AAResults *AA = nullptr) {
 
   LLVM_DEBUG(dbgs() << "Loop Unroll: F["
                     << L->getHeader()->getParent()->getName() << "] Loop %"
@@ -1292,7 +1295,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
 
     ValueToValueMapTy VMap;
     if (peelLoop(L, PP.PeelCount, LI, &SE, DT, &AC, PreserveLCSSA, VMap)) {
-      simplifyLoopAfterUnroll(L, true, LI, &SE, &DT, &AC, &TTI);
+      simplifyLoopAfterUnroll(L, true, LI, &SE, &DT, &AC, &TTI, nullptr);
       // If the loop was peeled, we already "used up" the profile information
       // we had, so we don't want to unroll or peel again.
       if (PP.PeelProfiledIterations)
@@ -1325,7 +1328,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
       L,
       {UP.Count, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
        UP.UnrollRemainder, ForgetAllSCEV},
-      LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop);
+      LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
   if (UnrollResult == LoopUnrollResult::Unmodified)
     return LoopUnrollResult::Unmodified;
 
@@ -1572,6 +1575,7 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
   auto &AC = AM.getResult<AssumptionAnalysis>(F);
   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+  AAResults &AA = AM.getResult<AAManager>(F);
 
   LoopAnalysisManager *LAM = nullptr;
   if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
@@ -1601,6 +1605,7 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
   SmallPriorityWorklist<Loop *, 4> Worklist;
   appendLoopsToWorklist(LI, Worklist);
 
+  SmallVector<Loop *> LoopsForCSE;
   while (!Worklist.empty()) {
     // Because the LoopInfo stores the loops in RPO, we walk the worklist
     // from back to front so that we work forward across the CFG, which
@@ -1627,7 +1632,8 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
         /*Count*/ std::nullopt,
         /*Threshold*/ std::nullopt, UnrollOpts.AllowPartial,
         UnrollOpts.AllowRuntime, UnrollOpts.AllowUpperBound, LocalAllowPeeling,
-        UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount);
+        UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount,
+        &AA);
     Changed |= Result != LoopUnrollResult::Unmodified;
 
     // The parent must not be damaged by unrolling!
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 6f0d000815726e..3aa3630c8a824e 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -18,17 +18,20 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopedHashTable.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/ADT/ilist_iterator.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/MemorySSA.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/IR/BasicBlock.h"
@@ -209,13 +212,143 @@ static bool isEpilogProfitable(Loop *L) {
   return false;
 }
 
+struct LoadValue {
+  Instruction *DefI = nullptr;
+  unsigned Generation = 0;
+  LoadValue() = default;
+  LoadValue(Instruction *Inst, unsigned Generation)
+      : DefI(Inst), Generation(Generation) {}
+};
+
+class StackNode {
+  ScopedHashTable<const SCEV *, LoadValue>::ScopeTy LoadScope;
+  unsigned CurrentGeneration;
+  unsigned ChildGeneration;
+  DomTreeNode *Node;
+  DomTreeNode::const_iterator ChildIter;
+  DomTreeNode::const_iterator EndIter;
+  bool Processed = false;
+
+public:
+  StackNode(ScopedHashTable<const SCEV *, LoadValue> &AvailableLoads,
+            unsigned cg, DomTreeNode *N, DomTreeNode::const_iterator Child,
+            DomTreeNode::const_iterator End)
+      : LoadScope(AvailableLoads), CurrentGeneration(cg), ChildGeneration(cg),
+        Node(N), ChildIter(Child), EndIter(End) {}
+  // Accessors.
+  unsigned currentGeneration() const { return CurrentGeneration; }
+  unsigned childGeneration() const { return ChildGeneration; }
+  void childGeneration(unsigned generation) { ChildGeneration = generation; }
+  DomTreeNode *node() { return Node; }
+  DomTreeNode::const_iterator childIter() const { return ChildIter; }
+
+  DomTreeNode *nextChild() {
+    DomTreeNode *child = *ChildIter;
+    ++ChildIter;
+    return child;
+  }
+
+  DomTreeNode::const_iterator end() const { return EndIter; }
+  bool isProcessed() const { return Processed; }
+  void process() { Processed = true; }
+};
+
+Value *getMatchingValue(LoadValue LV, LoadInst *LI, unsigned CurrentGeneration,
+                        MemorySSA *MSSA) {
+  if (!LV.DefI)
+    return nullptr;
+  if (LV.Generation != CurrentGeneration) {
+    if (!MSSA)
+      return nullptr;
+    auto *EarlierMA = MSSA->getMemoryAccess(LV.DefI);
+    MemoryAccess *LaterDef;
+    LaterDef = MSSA->getWalker()->getClobberingMemoryAccess(LI);
+    if (!MSSA->dominates(LaterDef, EarlierMA))
+      return nullptr;
+  }
+
+  if (LV.DefI->getType() != LI->getType())
+    return nullptr;
+  return LV.DefI;
+}
+
+void loadCSE(Loop *L, DominatorTree &DT, ScalarEvolution &SE, LoopInfo &LI,
+             function_ref<MemorySSA *()> GetMSSA) {
+  ScopedHashTable<const SCEV *, LoadValue> AvailableLoads;
+  SmallVector<std::unique_ptr<StackNode>> NodesToProcess;
+  DomTreeNode *HeaderD = DT.getNode(L->getHeader());
+  NodesToProcess.emplace_back(new StackNode(AvailableLoads, 0, HeaderD,
+                                            HeaderD->begin(), HeaderD->end()));
+
+  unsigned CurrentGeneration = 0;
+  while (!NodesToProcess.empty()) {
+    // Grab the first item off the stack. Set the current generation, remove
+    // the node from the stack, and process it.
+    StackNode *NodeToProcess = &*NodesToProcess.back();
+
+    // Initialize class members.
+    CurrentGeneration = NodeToProcess->currentGeneration();
+
+    if (!NodeToProcess->isProcessed()) {
+      // Process the node.
+
+      // If this block has a single predecessor, then the predecessor is the
+      // parent
+      // of the domtree node and all of the live out memory values are still
+      // current in this block.  If this block has multiple predecessors, then
+      // they could have invalidated the live-out memory values of our parent
+      // value.  For now, just be conservative and invalidate memory if this
+      // block has multiple predecessors.
+      if (!NodeToProcess->node()->getBlock()->getSinglePredecessor())
+        ++CurrentGeneration;
+      for (auto &I : make_early_inc_range(*NodeToProcess->node()->getBlock())) {
+
+        auto *Load = dyn_cast<LoadInst>(&I);
+        if (!Load || !Load->isSimple()) {
+          if (I.mayWriteToMemory())
+            CurrentGeneration++;
+          continue;
+        }
+
+        const SCEV *PtrSCEV = SE.getSCEV(Load->getPointerOperand());
+        LoadValue LV = AvailableLoads.lookup(PtrSCEV);
+        if (Value *M =
+                getMatchingValue(LV, Load, CurrentGeneration, GetMSSA())) {
+
+          if (LI.replacementPreservesLCSSAForm(Load, M)) {
+            Load->replaceAllUsesWith(M);
+            Load->eraseFromParent();
+          }
+        } else {
+          AvailableLoads.insert(PtrSCEV, LoadValue(Load, CurrentGeneration));
+        }
+      }
+      NodeToProcess->childGeneration(CurrentGeneration);
+      NodeToProcess->process();
+    } else if (NodeToProcess->childIter() != NodeToProcess->end()) {
+      // Push the next child onto the stack.
+      DomTreeNode *child = NodeToProcess->nextChild();
+      if (!L->contains(child->getBlock()))
+        continue;
+      NodesToProcess.emplace_back(
+          new StackNode(AvailableLoads, NodeToProcess->childGeneration(), child,
+                        child->begin(), child->end()));
+    } else {
+      // It has been processed, and there are no more children to process,
+      // so delete it and pop it off the stack.
+      NodesToProcess.pop_back();
+    }
+  }
+}
+
 /// Perform some cleanup and simplifications on loops after unrolling. It is
 /// useful to simplify the IV's in the new loop, as well as do a quick
 /// simplify/dce pass of the instructions.
 void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
                                    ScalarEvolution *SE, DominatorTree *DT,
                                    AssumptionCache *AC,
-                                   const TargetTransformInfo *TTI) {
+                                   const TargetTransformInfo *TTI,
+                                   AAResults *AA) {
   using namespace llvm::PatternMatch;
 
   // Simplify any new induction variables in the partially unrolled loop.
@@ -230,6 +363,15 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
       if (Instruction *Inst = dyn_cast_or_null<Instruction>(V))
         RecursivelyDeleteTriviallyDeadInstructions(Inst);
     }
+
+    if (AA) {
+      std::unique_ptr<MemorySSA> MSSA = nullptr;
+      loadCSE(L, *DT, *SE, *LI, [L, AA, DT, &MSSA]() -> MemorySSA * {
+        if (!MSSA)
+          MSSA.reset(new MemorySSA(*L, AA, DT));
+        return &*MSSA;
+      });
+    }
   }
 
   // At this point, the code is well formed.  Perform constprop, instsimplify,
@@ -292,12 +434,11 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
 ///
 /// If RemainderLoop is non-null, it will receive the remainder loop (if
 /// required and not fully unrolled).
-LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
-                                  ScalarEvolution *SE, DominatorTree *DT,
-                                  AssumptionCache *AC,
-                                  const TargetTransformInfo *TTI,
-                                  OptimizationRemarkEmitter *ORE,
-                                  bool PreserveLCSSA, Loop **RemainderLoop) {
+LoopUnrollResult
+llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
+                 ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
+                 const TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE,
+                 bool PreserveLCSSA, Loop **RemainderLoop, AAResults *AA) {
   assert(DT && "DomTree is required");
 
   if (!L->getLoopPreheader()) {
@@ -852,7 +993,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
   // At this point, the code is well formed.  We now simplify the unrolled loop,
   // doing constant propagation and dead code elimination as we go.
   simplifyLoopAfterUnroll(L, !CompletelyUnroll && ULO.Count > 1, LI, SE, DT, AC,
-                          TTI);
+                          TTI, AA);
 
   NumCompletelyUnrolled += CompletelyUnroll;
   ++NumUnrolled;
diff --git a/llvm/test/Transforms/LoopUnroll/unroll-loads-cse.ll b/llvm/test/Transforms/LoopUnroll/unroll-loads-cse.ll
index 109a1834c3029f..d4105254e53146 100644
--- a/llvm/test/Transforms/LoopUnroll/unroll-loads-cse.ll
+++ b/llvm/test/Transforms/LoopUnroll/unroll-loads-cse.ll
@@ -29,9 +29,7 @@ define void @cse_matching_load_from_previous_unrolled_iteration(ptr %src, ptr no
 ; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[GEP_SRC_12_1:%.*]] = getelementptr i64, ptr [[SRC_12]], i64 [[IV_NEXT]]
 ; CHECK-NEXT:    [[L_12_1:%.*]] = load i64, ptr [[GEP_SRC_12_1]], align 8
-; CHECK-NEXT:    [[GEP_SRC_4_1:%.*]] = getelementptr i64, ptr [[SRC_4]], i64 [[IV_NEXT]]
-; CHECK-NEXT:    [[L_4_1:%.*]] = load i64, ptr [[GEP_SRC_4_1]], align 8
-; CHECK-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_12_1]], [[L_4_1]]
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i64 [[L_12_1]], [[L_12]]
 ; CHECK-NEXT:    [[GEP_DST_1:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IV_NEXT]]
 ; CHECK-NEXT:    store i64 [[MUL_1]], ptr [[GEP_DST_1]], align 8
 ; CHECK-NEXT:    [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
@@ -425,8 +423,7 @@ define void @loop_body_with_dead_blocks(ptr %src) {
 ; CHECK:       loop.header.1:
 ; CHECK-NEXT:    br label [[LOOP_BB_1:%.*]]
 ; CHECK:       loop.bb.1:
-; CHECK-NEXT:    [[L_1_1:%.*]] = load i32, ptr [[SRC]], align 8
-; CHECK-NEXT:    [[C_1_1:%.*]] = icmp eq i32 [[L_1_1]], 0
+; CHECK-NEXT:    [[C_1_1:%.*]] = icmp eq i32 [[L_2]], 0
 ; CHECK-NEXT:    br i1 [[C_1_1]], label [[OUTER_HEADER_LOOPEXIT]], label [[LOOP_LATCH_1:%.*]]
 ; CHECK:       loop.latch.1:
 ; CHECK-NEXT:    call void @foo()
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
index b32f4e2a258cd7..6c45442bdcd3c3 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/extra-unroll-simplifications.ll
@@ -101,9 +101,7 @@ define void @cse_matching_load_from_previous_unrolled_iteration(i32 %N, ptr %src
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = or disjoint i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[GEP_SRC_12_1:%.*]] = getelementptr <2 x i32>, ptr [[SRC_12]], i64 [[INDVARS_IV_NEXT]]
 ; CHECK-NEXT:    [[L_12_1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_12_1]], align 8
-; CHECK-NEXT:    [[GEP_SRC_4_1:%.*]] = getelementptr <2 x i32>, ptr [[SRC_4]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT:    [[L_4_1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_4_1]], align 8
-; CHECK-NEXT:    [[MUL_1:%.*]] = mul <2 x i32> [[L_4_1]], [[L_12_1]]
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul <2 x i32> [[L_12]], [[L_12_1]]
 ; CHECK-NEXT:    [[GEP_DST_1:%.*]] = getelementptr <2 x i32>, ptr [[DST]], i64 [[INDVARS_IV_NEXT]]
 ; CHECK-NEXT:    store <2 x i32> [[MUL_1]], ptr [[GEP_DST_1]], align 8
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
