[llvm] [AArch64] Tune unrolling prefs for more patterns on Apple CPUs (PR #149358)

Ahmad Yasin via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 4 10:07:20 PDT 2025


https://github.com/ayasin-a updated https://github.com/llvm/llvm-project/pull/149358

From 7b46981ceca0054efdedf6c52ebb784018471ef0 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Wed, 9 Jul 2025 16:27:30 +0300
Subject: [PATCH 1/8] Tune the size budget for runtime unrolling

---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 90d3d92d6bbf5..1eb6589ed8ca9 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4808,7 +4808,7 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
   if (Header == L->getLoopLatch()) {
     // Estimate the size of the loop.
     unsigned Size;
-    if (!isLoopSizeWithinBudget(L, TTI, 8, &Size))
+    if (!isLoopSizeWithinBudget(L, TTI, 9, &Size))
       return;
 
     SmallPtrSet<Value *, 8> LoadedValues;

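For reference, the raised budget of 9 admits loop bodies one instruction
larger than before. Illustrative only, not part of the patch: the shape being
targeted is roughly the C loop below (it matches the @load_op_store_loop test
added in PATCH 3), where the scaled-index mul together with the fadd appears
to push the body past the old budget of 8.

    // Roughly the C analogue of the @load_op_store_loop test in PATCH 3
    // (an illustrative sketch; names mirror the test's parameters).
    void load_op_store_loop(float *src, float *dst, long N, long scale,
                            float k) {
      for (long i = 0; i < N; i++)
        dst[i] = src[i * scale] + k;
    }
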
From eb21acc1bca702f935f408e64a0fb34f4870ea37 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Wed, 9 Jul 2025 22:21:59 +0300
Subject: [PATCH 2/8] Include immediate users of loaded values in the
 load/store dependencies predicate

---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 1eb6589ed8ca9..6d97ae7c8c5e7 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4811,7 +4811,7 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
     if (!isLoopSizeWithinBudget(L, TTI, 9, &Size))
       return;
 
-    SmallPtrSet<Value *, 8> LoadedValues;
+    SmallPtrSet<Value *, 8> LoadedValuesPlus;
     SmallVector<StoreInst *> Stores;
     for (auto *BB : L->blocks()) {
       for (auto &I : *BB) {
@@ -4821,9 +4821,16 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
         const SCEV *PtrSCEV = SE.getSCEV(Ptr);
         if (SE.isLoopInvariant(PtrSCEV, L))
           continue;
-        if (isa<LoadInst>(&I))
-          LoadedValues.insert(&I);
-        else
+        if (isa<LoadInst>(&I)) {
+          LoadedValuesPlus.insert(&I);
+          // Include 1st users of loaded values
+          for (auto *U : I.users()) {
+            auto *Inst = dyn_cast<Instruction>(U);
+            if (!Inst || Inst->getParent() != BB)
+              continue;
+            LoadedValuesPlus.insert(U);
+          }
+        } else
           Stores.push_back(cast<StoreInst>(&I));
       }
     }
@@ -4846,8 +4853,8 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
       UC++;
     }
 
-    if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
-          return LoadedValues.contains(SI->getOperand(0));
+    if (BestUC == 1 || none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
+          return LoadedValuesPlus.contains(SI->getOperand(0));
         }))
       return;
 

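To see what the widened set buys: with the old LoadedValues set, a
load -> fadd -> store chain was rejected, because the store's value operand is
the fadd, not the load itself. Below is a minimal standalone sketch of the
patched predicate; the plain structs stand in for llvm::Value and none of this
is LLVM API.

    #include <cstdio>
    #include <unordered_set>
    #include <vector>

    struct Node { std::vector<Node *> Users; }; // stand-in for llvm::Value

    int main() {
      Node Load, FAdd;               // models a load -> fadd -> store chain
      Load.Users = {&FAdd};
      Node *StoredOperand = &FAdd;   // the store's value operand

      std::unordered_set<Node *> LoadedValuesPlus;
      LoadedValuesPlus.insert(&Load);
      for (Node *U : Load.Users)     // the new part: also record 1st users
        LoadedValuesPlus.insert(U);

      // With the old set the lookup below misses and the loop is not
      // unrolled; with LoadedValuesPlus it hits.
      std::printf("heuristic fires: %s\n",
                  LoadedValuesPlus.count(StoredOperand) ? "yes" : "no");
      return 0;
    }
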
From f20211fb4c15a4e5a47e531bbb776beb605ae104 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Wed, 23 Jul 2025 12:51:07 +0300
Subject: [PATCH 3/8] Add a test for the load-op-store scenario permitted by
 this PR

---
 .../LoopUnroll/AArch64/apple-unrolling.ll     | 120 ++++++++++++++++++
 1 file changed, 120 insertions(+)

diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
index 1a091e847ca34..5304b43601910 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
@@ -165,6 +165,126 @@ exit:
   ret void
 }
 
+define void @load_op_store_loop(ptr %src, ptr %dst, i64 %N, i64 %scale, float %k) {
+; APPLE-LABEL: define void @load_op_store_loop(
+; APPLE-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]], float [[K:%.*]]) #[[ATTR0]] {
+; APPLE-NEXT:  [[ENTRY:.*]]:
+; APPLE-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; APPLE-NEXT:    [[XTRAITER:%.*]] = and i64 [[N]], 1
+; APPLE-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1
+; APPLE-NEXT:    br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; APPLE:       [[ENTRY_NEW]]:
+; APPLE-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
+; APPLE-NEXT:    br label %[[LOOP:.*]]
+; APPLE:       [[LOOP]]:
+; APPLE-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ]
+; APPLE-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ]
+; APPLE-NEXT:    [[SCALED_IV:%.*]] = mul nuw nsw i64 [[IV]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV]]
+; APPLE-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; APPLE-NEXT:    [[O:%.*]] = fadd float [[L]], [[K]]
+; APPLE-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV]]
+; APPLE-NEXT:    store float [[O]], ptr [[GEP_DST]], align 4
+; APPLE-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; APPLE-NEXT:    [[SCALED_IV_1:%.*]] = mul nuw nsw i64 [[IV_NEXT]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_1]]
+; APPLE-NEXT:    [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
+; APPLE-NEXT:    [[O_1:%.*]] = fadd float [[L_1]], [[K]]
+; APPLE-NEXT:    [[GEP_DST_1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT]]
+; APPLE-NEXT:    store float [[O_1]], ptr [[GEP_DST_1]], align 4
+; APPLE-NEXT:    [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
+; APPLE-NEXT:    [[NITER_NEXT_1]] = add i64 [[NITER]], 2
+; APPLE-NEXT:    [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
+; APPLE-NEXT:    br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]]
+; APPLE:       [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; APPLE-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ]
+; APPLE-NEXT:    br label %[[EXIT_UNR_LCSSA]]
+; APPLE:       [[EXIT_UNR_LCSSA]]:
+; APPLE-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; APPLE-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; APPLE-NEXT:    br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
+; APPLE:       [[LOOP_EPIL_PREHEADER]]:
+; APPLE-NEXT:    br label %[[LOOP_EPIL:.*]]
+; APPLE:       [[LOOP_EPIL]]:
+; APPLE-NEXT:    [[SCALED_IV_EPIL:%.*]] = mul nuw nsw i64 [[IV_UNR]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_EPIL:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL]]
+; APPLE-NEXT:    [[L_EPIL:%.*]] = load float, ptr [[GEP_SRC_EPIL]], align 4
+; APPLE-NEXT:    [[O_EPIL:%.*]] = fadd float [[L_EPIL]], [[K]]
+; APPLE-NEXT:    [[GEP_DST_EPIL:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_UNR]]
+; APPLE-NEXT:    store float [[O_EPIL]], ptr [[GEP_DST_EPIL]], align 4
+; APPLE-NEXT:    br label %[[EXIT]]
+; APPLE:       [[EXIT]]:
+; APPLE-NEXT:    ret void
+;
+; OTHER-LABEL: define void @load_op_store_loop(
+; OTHER-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]], float [[K:%.*]]) #[[ATTR0]] {
+; OTHER-NEXT:  [[ENTRY:.*]]:
+; OTHER-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; OTHER-NEXT:    [[XTRAITER:%.*]] = and i64 [[N]], 1
+; OTHER-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1
+; OTHER-NEXT:    br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; OTHER:       [[ENTRY_NEW]]:
+; OTHER-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
+; OTHER-NEXT:    br label %[[LOOP:.*]]
+; OTHER:       [[LOOP]]:
+; OTHER-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ]
+; OTHER-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ]
+; OTHER-NEXT:    [[SCALED_IV:%.*]] = mul nuw nsw i64 [[IV]], [[SCALE]]
+; OTHER-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV]]
+; OTHER-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; OTHER-NEXT:    [[O:%.*]] = fadd float [[L]], [[K]]
+; OTHER-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV]]
+; OTHER-NEXT:    store float [[O]], ptr [[GEP_DST]], align 4
+; OTHER-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; OTHER-NEXT:    [[SCALED_IV_1:%.*]] = mul nuw nsw i64 [[IV_NEXT]], [[SCALE]]
+; OTHER-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_1]]
+; OTHER-NEXT:    [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
+; OTHER-NEXT:    [[O_1:%.*]] = fadd float [[L_1]], [[K]]
+; OTHER-NEXT:    [[GEP_DST_1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT]]
+; OTHER-NEXT:    store float [[O_1]], ptr [[GEP_DST_1]], align 4
+; OTHER-NEXT:    [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
+; OTHER-NEXT:    [[NITER_NEXT_1]] = add i64 [[NITER]], 2
+; OTHER-NEXT:    [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
+; OTHER-NEXT:    br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]]
+; OTHER:       [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; OTHER-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ]
+; OTHER-NEXT:    br label %[[EXIT_UNR_LCSSA]]
+; OTHER:       [[EXIT_UNR_LCSSA]]:
+; OTHER-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; OTHER-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; OTHER-NEXT:    br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
+; OTHER:       [[LOOP_EPIL_PREHEADER]]:
+; OTHER-NEXT:    br label %[[LOOP_EPIL:.*]]
+; OTHER:       [[LOOP_EPIL]]:
+; OTHER-NEXT:    [[SCALED_IV_EPIL:%.*]] = mul nuw nsw i64 [[IV_UNR]], [[SCALE]]
+; OTHER-NEXT:    [[GEP_SRC_EPIL:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL]]
+; OTHER-NEXT:    [[L_EPIL:%.*]] = load float, ptr [[GEP_SRC_EPIL]], align 4
+; OTHER-NEXT:    [[O_EPIL:%.*]] = fadd float [[L_EPIL]], [[K]]
+; OTHER-NEXT:    [[GEP_DST_EPIL:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_UNR]]
+; OTHER-NEXT:    store float [[O_EPIL]], ptr [[GEP_DST_EPIL]], align 4
+; OTHER-NEXT:    br label %[[EXIT]]
+; OTHER:       [[EXIT]]:
+; OTHER-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %scaled.iv = mul nuw nsw i64 %iv, %scale
+  %gep.src = getelementptr inbounds float, ptr %src, i64 %scaled.iv
+  %l = load float, ptr %gep.src, align 4
+  %o = fadd float %l, %k
+  %gep.dst = getelementptr inbounds float, ptr %dst, i64 %iv
+  store float %o, ptr %gep.dst, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
 @A = external constant [9 x i8], align 1
 @B = external constant [8 x i32], align 4
 @C = external constant [8 x i32], align 4

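The CHECK lines above encode a by-2 runtime unroll with an epilogue for the
odd remainder iteration. As an approximation (ignoring the exact guard
computations in the IR), the unrolled output corresponds to the following C:

    // Approximate C rendering of the unrolled-by-2 output checked above.
    void load_op_store_loop_unrolled(float *src, float *dst, long N,
                                     long scale, float k) {
      long xtraiter = N & 1;                  // remainder iteration count
      long i = 0;
      for (; i + 2 <= N - xtraiter; i += 2) { // main loop: two body copies
        dst[i]     = src[i * scale] + k;
        dst[i + 1] = src[(i + 1) * scale] + k;
      }
      if (xtraiter)                           // epilogue: leftover iteration
        dst[i] = src[i * scale] + k;
    }
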
From 18cd58b0c0b3a7b6763f693c972221ea116821e2 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Thu, 24 Jul 2025 00:06:56 +0300
Subject: [PATCH 4/8] Update comments with the newly covered pattern

---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 6d97ae7c8c5e7..8290d87e1df87 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4802,8 +4802,9 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
   // Limit to loops with trip counts that are cheap to expand.
   UP.SCEVExpansionBudget = 1;
 
-  // Try to unroll small, single block loops, if they have load/store
-  // dependencies, to expose more parallel memory access streams.
+  // Try to unroll small loops with few blocks and a low size budget, if they
+  // have load/store dependencies, to expose more parallel memory access
+  // streams, or if they do little work in a block (i.e. load -> X -> store).
   BasicBlock *Header = L->getHeader();
   if (Header == L->getLoopLatch()) {
     // Estimate the size of the loop.

From 5f715615407c67dda3b611fd599ce4549290af76 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Tue, 29 Jul 2025 15:17:01 +0300
Subject: [PATCH 5/8] Avoid unnecessary dyn_cast of Instruction

Co-authored-by: Florian Hahn <flo at fhahn.com>
---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 8290d87e1df87..0d9fe59e92607 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4826,7 +4826,7 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
           LoadedValuesPlus.insert(&I);
           // Include 1st users of loaded values
           for (auto *U : I.users()) {
-            auto *Inst = dyn_cast<Instruction>(U);
+            auto *Inst = cast<Instruction>(U);
             if (!Inst || Inst->getParent() != BB)
               continue;
             LoadedValuesPlus.insert(U);

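The rationale: every user of an instruction's result is itself an Instruction,
so the null-returning dyn_cast is unnecessary here; cast asserts the type
instead. A standalone analogue of the llvm::cast / llvm::dyn_cast contract
(illustrative only; LLVM uses its own lightweight RTTI, not dynamic_cast):

    #include <cassert>

    struct Value { virtual ~Value() = default; };
    struct Instruction : Value {};

    // dyn_cast-like: returns nullptr when the type does not match.
    Instruction *dynCastInst(Value *V) {
      return dynamic_cast<Instruction *>(V);
    }

    // cast-like: asserts the type holds, so callers need no null check.
    Instruction *castInst(Value *V) {
      auto *I = dynamic_cast<Instruction *>(V);
      assert(I && "castInst on a non-Instruction");
      return I;
    }

    int main() {
      Instruction Inst;
      Value *V = &Inst;
      assert(dynCastInst(V) == castInst(V));
      return 0;
    }
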
From 8022590a4a37ec9f600d7bc44f87b7f888173005 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Wed, 30 Jul 2025 22:25:00 +0300
Subject: [PATCH 6/8] Allow load users from other blocks of the same loop

---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0d9fe59e92607..f7a4959875712 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4827,7 +4827,7 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
           // Include 1st users of loaded values
           for (auto *U : I.users()) {
             auto *Inst = cast<Instruction>(U);
-            if (!Inst || Inst->getParent() != BB)
+            if (!Inst)
               continue;
             LoadedValuesPlus.insert(U);
           }

From 344f098a04425fe01bfbb14828a6bbc78f9ecbff Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Mon, 4 Aug 2025 18:53:47 +0300
Subject: [PATCH 7/8] Skip load users outside the loop

---
 .../lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index f7a4959875712..64943b2adf9b8 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4824,13 +4824,10 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
           continue;
         if (isa<LoadInst>(&I)) {
           LoadedValuesPlus.insert(&I);
-          // Include 1st users of loaded values
-          for (auto *U : I.users()) {
-            auto *Inst = cast<Instruction>(U);
-            if (!Inst)
-              continue;
-            LoadedValuesPlus.insert(U);
-          }
+          // Include in-loop 1st users of loaded values.
+          for (auto *U : I.users())
+            if (L->contains(cast<Instruction>(U)))
+              LoadedValuesPlus.insert(U);
         } else
           Stores.push_back(cast<StoreInst>(&I));
       }

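L->contains(cast<Instruction>(U)) checks that the user's parent block belongs
to the loop, which keeps users in other in-loop blocks (the point of PATCH 6)
while filtering out users past the loop, e.g. LCSSA phis in exit blocks. A
standalone sketch of that filtering, with stand-in types rather than LLVM API:

    #include <cassert>
    #include <unordered_set>

    struct Block {};
    struct Inst { const Block *Parent; }; // stand-ins for the IR types

    struct LoopModel { // models llvm::Loop::contains(const Instruction *)
      std::unordered_set<const Block *> Blocks;
      bool contains(const Inst *I) const {
        return Blocks.count(I->Parent) != 0;
      }
    };

    int main() {
      Block Header, Latch, Exit;
      LoopModel L{{&Header, &Latch}};
      Inst SameBlockUser{&Header}, OtherBlockUser{&Latch}, ExitUser{&Exit};
      assert(L.contains(&SameBlockUser));  // recorded
      assert(L.contains(&OtherBlockUser)); // still recorded (PATCH 6)
      assert(!L.contains(&ExitUser));      // skipped by this patch
      return 0;
    }
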
From e6466424291e85b1d0f97bffbb5db67ad6800765 Mon Sep 17 00:00:00 2001
From: Ahmad Yasin <ahmad.yasin at apple.com>
Date: Mon, 4 Aug 2025 20:07:01 +0300
Subject: [PATCH 8/8] Add a test case with users from different blocks of the
 loop

---
 .../LoopUnroll/AArch64/apple-unrolling.ll     | 78 +++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
index 5304b43601910..94760a817e1d7 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
@@ -285,6 +285,84 @@ exit:
   ret void
 }
 
+define void @load_op_store_loop_multiblock(ptr %src, ptr %dst, i64 %N, i64 %scale, float %k) {
+; APPLE-LABEL: define void @load_op_store_loop_multiblock(
+; APPLE-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]], float [[K:%.*]]) #[[ATTR0]] {
+; APPLE-NEXT:  [[ENTRY:.*]]:
+; APPLE-NEXT:    br label %[[LOOP:.*]]
+; APPLE:       [[LOOP]]:
+; APPLE-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOPCONT:.*]] ]
+; APPLE-NEXT:    [[SCALED_IV:%.*]] = mul nuw nsw i64 [[IV]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV]]
+; APPLE-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; APPLE-NEXT:    [[AND:%.*]] = and i64 [[IV]], 1
+; APPLE-NEXT:    [[ODD:%.*]] = icmp eq i64 [[AND]], 1
+; APPLE-NEXT:    br i1 [[ODD]], label %[[LOOPODD:.*]], label %[[LOOPCONT]]
+; APPLE:       [[LOOPCONT]]:
+; APPLE-NEXT:    [[D:%.*]] = phi float [ [[L2:%.*]], %[[LOOPODD]] ], [ [[L]], %[[LOOP]] ]
+; APPLE-NEXT:    [[O:%.*]] = fadd float [[D]], [[K]]
+; APPLE-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV]]
+; APPLE-NEXT:    store float [[O]], ptr [[GEP_DST]], align 4
+; APPLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; APPLE-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; APPLE-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; APPLE:       [[LOOPODD]]:
+; APPLE-NEXT:    [[L2]] = fneg float [[L]]
+; APPLE-NEXT:    br label %[[LOOPCONT]]
+; APPLE:       [[EXIT]]:
+; APPLE-NEXT:    ret void
+;
+; OTHER-LABEL: define void @load_op_store_loop_multiblock(
+; OTHER-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]], float [[K:%.*]]) #[[ATTR0]] {
+; OTHER-NEXT:  [[ENTRY:.*]]:
+; OTHER-NEXT:    br label %[[LOOP:.*]]
+; OTHER:       [[LOOP]]:
+; OTHER-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOPCONT:.*]] ]
+; OTHER-NEXT:    [[SCALED_IV:%.*]] = mul nuw nsw i64 [[IV]], [[SCALE]]
+; OTHER-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV]]
+; OTHER-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; OTHER-NEXT:    [[AND:%.*]] = and i64 [[IV]], 1
+; OTHER-NEXT:    [[ODD:%.*]] = icmp eq i64 [[AND]], 1
+; OTHER-NEXT:    br i1 [[ODD]], label %[[LOOPODD:.*]], label %[[LOOPCONT]]
+; OTHER:       [[LOOPCONT]]:
+; OTHER-NEXT:    [[D:%.*]] = phi float [ [[L2:%.*]], %[[LOOPODD]] ], [ [[L]], %[[LOOP]] ]
+; OTHER-NEXT:    [[O:%.*]] = fadd float [[D]], [[K]]
+; OTHER-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV]]
+; OTHER-NEXT:    store float [[O]], ptr [[GEP_DST]], align 4
+; OTHER-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; OTHER-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; OTHER-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; OTHER:       [[LOOPODD]]:
+; OTHER-NEXT:    [[L2]] = fneg float [[L]]
+; OTHER-NEXT:    br label %[[LOOPCONT]]
+; OTHER:       [[EXIT]]:
+; OTHER-NEXT:    ret void
+;
+entry:
+  br label %loop
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loopcont ]
+  %scaled.iv = mul nuw nsw i64 %iv, %scale
+  %gep.src = getelementptr inbounds float, ptr %src, i64 %scaled.iv
+  %l1 = load float, ptr %gep.src, align 4
+  %and = and i64 %iv, 1
+  %odd = icmp eq i64 %and, 1
+  br i1 %odd, label %loopodd, label %loopcont
+loopcont:
+  %d = phi float [ %l2, %loopodd ], [ %l1, %loop ]
+  %o = fadd float %d, %k
+  %gep.dst = getelementptr inbounds float, ptr %dst, i64 %iv
+  store float %o, ptr %gep.dst, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+loopodd:
+  %l2 = fneg float %l1
+  br label %loopcont
+exit:
+  ret void
+}
+
 @A = external constant [9 x i8], align 1
 @B = external constant [8 x i32], align 4
 @C = external constant [8 x i32], align 4

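Note that the APPLE and OTHER bodies above match the input: this loop has more
than one block, so the heuristic does not fire here, presumably because the
header == latch guard in getAppleRuntimeUnrollPreferences does not apply, and
the test pins that behavior down. Roughly the C analogue (illustrative only),
showing how the load's users span two blocks (the fneg in loopodd and the phi
in loopcont):

    // Roughly the C analogue of @load_op_store_loop_multiblock above.
    void load_op_store_loop_multiblock(float *src, float *dst, long N,
                                       long scale, float k) {
      for (long i = 0; i < N; i++) {
        float l = src[i * scale];
        if (i & 1)
          l = -l;         // loopodd: a first user of the load
        dst[i] = l + k;   // loopcont: fadd and store
      }
    }
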

