[llvm] [AArch64] Unroll some loops with early-continues on Apple Silicon. (PR #118499)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 3 07:26:04 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Florian Hahn (fhahn)
<details>
<summary>Changes</summary>
Try to runtime-unroll loops with early-continues depending on loop-varying
loads; this helps with branch-prediction for the early-continues and can
significantly improve performance for such loops
Builds on top of https://github.com/llvm/llvm-project/pull/118317
---
Patch is 31.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118499.diff
2 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+142-3)
- (modified) llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll (+311-9)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5b333d33cffd52..7e7b18ffcc15b7 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3989,6 +3989,133 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
}
+/// For Apple CPUs, we want to runtime-unroll loops to make better use if the
+/// OOO engine's wide instruction window and various predictors.
+static void
+getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
+ TargetTransformInfo::UnrollingPreferences &UP,
+ AArch64TTIImpl &TTI) {
+ // Limit loops with structure that is highly likely to benefit from runtime
+ // unrolling; that is we exclude outer loops, loops with multiple exits and
+ // many blocks (i.e. likely with complex control flow). Note that the
+ // heuristics here may be overly conservative and we err on the side of
+ // avoiding runtime unrolling rather than unroll excessively. They are all
+ // subject to further refinement.
+ if (!L->isInnermost() || !L->getExitBlock() || L->getNumBlocks() > 8)
+ return;
+
+ const SCEV *BTC = SE.getBackedgeTakenCount(L);
+ if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
+ (SE.getSmallConstantMaxTripCount(L) > 0 &&
+ SE.getSmallConstantMaxTripCount(L) <= 32))
+ return;
+ if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
+ return;
+
+ int64_t Size = 0;
+ for (auto *BB : L->getBlocks()) {
+ for (auto &I : *BB) {
+ if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
+ return;
+ SmallVector<const Value *, 4> Operands(I.operand_values());
+ Size +=
+ *TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize).getValue();
+ }
+ }
+
+ // Limit to loops with trip counts that are cheap to expand.
+ UP.SCEVExpansionBudget = 1;
+
+ // Try to unroll small, single block loops, if they have load/store
+ // dependencies, to expose more parallel memory access streams.
+ BasicBlock *Header = L->getHeader();
+ if (Header == L->getLoopLatch()) {
+ if (Size > 8)
+ return;
+
+ SmallPtrSet<Value *, 8> LoadedValues;
+ SmallVector<StoreInst *> Stores;
+ for (auto *BB : L->blocks()) {
+ for (auto &I : *BB) {
+ Value *Ptr = getLoadStorePointerOperand(&I);
+ if (!Ptr)
+ continue;
+ const SCEV *PtrSCEV = SE.getSCEV(Ptr);
+ if (SE.isLoopInvariant(PtrSCEV, L))
+ continue;
+ if (isa<LoadInst>(&I))
+ LoadedValues.insert(&I);
+ else
+ Stores.push_back(cast<StoreInst>(&I));
+ }
+ }
+
+ // Try to find an unroll count that maximizes the use of the instruction
+ // window, i.e. trying to fetch as many instructions per cycle as possible.
+ unsigned MaxInstsPerLine = 16;
+ unsigned UC = 1;
+ unsigned BestUC = 1;
+ unsigned SizeWithBestUC = BestUC * Size;
+ while (UC <= 8) {
+ unsigned SizeWithUC = UC * Size;
+ if (SizeWithUC > 48)
+ break;
+ if ((SizeWithUC % MaxInstsPerLine) == 0 ||
+ (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
+ BestUC = UC;
+ SizeWithBestUC = BestUC * Size;
+ }
+ UC++;
+ }
+
+ if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
+ return LoadedValues.contains(SI->getOperand(0));
+ }))
+ return;
+
+ UP.Runtime = true;
+ UP.DefaultUnrollRuntimeCount = BestUC;
+ return;
+ }
+
+ // Try to runtime-unroll loops with early-continues depending on loop-varying
+ // loads; this helps with branch-prediction for the early-continues.
+ auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
+ auto *Latch = L->getLoopLatch();
+ SmallVector<BasicBlock *> Preds(predecessors(Latch));
+ if (!Term || !Term->isConditional() || Preds.size() == 1 ||
+ none_of(Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) ||
+ none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
+ return;
+
+ std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
+ [&](Instruction *I, unsigned Depth) -> bool {
+ if (isa<PHINode>(I))
+ return false;
+
+ if (L->isLoopInvariant(I))
+ return false;
+
+ if (Depth > 8)
+ return false;
+
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return true;
+
+ return any_of(I->operands(), [&](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ return I && DependsOnLoopLoad(I, Depth + 1);
+ });
+ };
+ CmpInst::Predicate Pred;
+ Instruction *I;
+ if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
+ m_Value())) &&
+ DependsOnLoopLoad(I, 0)) {
+ UP.Runtime = true;
+ }
+}
+
void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP,
OptimizationRemarkEmitter *ORE) {
@@ -4006,9 +4133,21 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
// Disable partial & runtime unrolling on -Os.
UP.PartialOptSizeThreshold = 0;
- if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
- EnableFalkorHWPFUnrollFix)
- getFalkorUnrollingPreferences(L, SE, UP);
+ // Apply subtarget-specific unrolling preferences.
+ switch (ST->getProcFamily()) {
+ case AArch64Subtarget::AppleA14:
+ case AArch64Subtarget::AppleA15:
+ case AArch64Subtarget::AppleA16:
+ case AArch64Subtarget::AppleM4:
+ getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
+ break;
+ case AArch64Subtarget::Falkor:
+ if (EnableFalkorHWPFUnrollFix)
+ getFalkorUnrollingPreferences(L, SE, UP);
+ break;
+ default:
+ break;
+ }
// Scan the loop: don't unroll loops with calls as this could prevent
// inlining. Don't unroll vector loops either, as they don't benefit much from
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
index deacec795fb03a..1a091e847ca345 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
@@ -12,17 +12,91 @@ define void @small_load_store_loop(ptr %src, ptr %dst, i64 %N, i64 %scale) {
; APPLE-LABEL: define void @small_load_store_loop(
; APPLE-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]]) #[[ATTR0:[0-9]+]] {
; APPLE-NEXT: [[ENTRY:.*]]:
+; APPLE-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
+; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 7
+; APPLE-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
+; APPLE-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; APPLE: [[ENTRY_NEW]]:
+; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
; APPLE-NEXT: br label %[[LOOP:.*]]
; APPLE: [[LOOP]]:
-; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP]] ]
+; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[LOOP]] ]
+; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[SCALED_IV_EPIL:%.*]] = mul nuw nsw i64 [[IV_EPIL]], [[SCALE]]
; APPLE-NEXT: [[GEP_SRC_EPIL:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL]]
; APPLE-NEXT: [[L_EPIL:%.*]] = load float, ptr [[GEP_SRC_EPIL]], align 4
; APPLE-NEXT: [[GEP_DST_EPIL:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_EPIL]]
; APPLE-NEXT: store float [[L_EPIL]], ptr [[GEP_DST_EPIL]], align 4
-; APPLE-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
-; APPLE-NEXT: [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]]
-; APPLE-NEXT: br i1 [[EC_EPIL]], label %[[EXIT:.*]], label %[[LOOP]]
+; APPLE-NEXT: [[IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[IV_EPIL]], 1
+; APPLE-NEXT: [[SCALED_IV_1:%.*]] = mul nuw nsw i64 [[IV_NEXT_EPIL]], [[SCALE]]
+; APPLE-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_1]]
+; APPLE-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
+; APPLE-NEXT: [[GEP_DST_1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_EPIL]]
+; APPLE-NEXT: store float [[L_1]], ptr [[GEP_DST_1]], align 4
+; APPLE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV_EPIL]], 2
+; APPLE-NEXT: [[SCALED_IV_2:%.*]] = mul nuw nsw i64 [[IV_NEXT_1]], [[SCALE]]
+; APPLE-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_2]]
+; APPLE-NEXT: [[L_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
+; APPLE-NEXT: [[GEP_DST_2:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_1]]
+; APPLE-NEXT: store float [[L_2]], ptr [[GEP_DST_2]], align 4
+; APPLE-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV_EPIL]], 3
+; APPLE-NEXT: [[SCALED_IV_3:%.*]] = mul nuw nsw i64 [[IV_NEXT_2]], [[SCALE]]
+; APPLE-NEXT: [[GEP_SRC_3:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_3]]
+; APPLE-NEXT: [[L_3:%.*]] = load float, ptr [[GEP_SRC_3]], align 4
+; APPLE-NEXT: [[GEP_DST_3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_2]]
+; APPLE-NEXT: store float [[L_3]], ptr [[GEP_DST_3]], align 4
+; APPLE-NEXT: [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV_EPIL]], 4
+; APPLE-NEXT: [[SCALED_IV_4:%.*]] = mul nuw nsw i64 [[IV_NEXT_3]], [[SCALE]]
+; APPLE-NEXT: [[GEP_SRC_4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_4]]
+; APPLE-NEXT: [[L_4:%.*]] = load float, ptr [[GEP_SRC_4]], align 4
+; APPLE-NEXT: [[GEP_DST_4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_3]]
+; APPLE-NEXT: store float [[L_4]], ptr [[GEP_DST_4]], align 4
+; APPLE-NEXT: [[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV_EPIL]], 5
+; APPLE-NEXT: [[SCALED_IV_5:%.*]] = mul nuw nsw i64 [[IV_NEXT_4]], [[SCALE]]
+; APPLE-NEXT: [[GEP_SRC_5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_5]]
+; APPLE-NEXT: [[L_5:%.*]] = load float, ptr [[GEP_SRC_5]], align 4
+; APPLE-NEXT: [[GEP_DST_5:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_4]]
+; APPLE-NEXT: store float [[L_5]], ptr [[GEP_DST_5]], align 4
+; APPLE-NEXT: [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV_EPIL]], 6
+; APPLE-NEXT: [[SCALED_IV_6:%.*]] = mul nuw nsw i64 [[IV_NEXT_5]], [[SCALE]]
+; APPLE-NEXT: [[GEP_SRC_6:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_6]]
+; APPLE-NEXT: [[L_6:%.*]] = load float, ptr [[GEP_SRC_6]], align 4
+; APPLE-NEXT: [[GEP_DST_6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_5]]
+; APPLE-NEXT: store float [[L_6]], ptr [[GEP_DST_6]], align 4
+; APPLE-NEXT: [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV_EPIL]], 7
+; APPLE-NEXT: [[SCALED_IV_7:%.*]] = mul nuw nsw i64 [[IV_NEXT_6]], [[SCALE]]
+; APPLE-NEXT: [[GEP_SRC_7:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_7]]
+; APPLE-NEXT: [[L_7:%.*]] = load float, ptr [[GEP_SRC_7]], align 4
+; APPLE-NEXT: [[GEP_DST_7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_6]]
+; APPLE-NEXT: store float [[L_7]], ptr [[GEP_DST_7]], align 4
+; APPLE-NEXT: [[IV_NEXT_7]] = add nuw nsw i64 [[IV_EPIL]], 8
+; APPLE-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8
+; APPLE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
+; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]]
+; APPLE: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[LOOP]] ]
+; APPLE-NEXT: br label %[[EXIT_UNR_LCSSA]]
+; APPLE: [[EXIT_UNR_LCSSA]]:
+; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
+; APPLE: [[LOOP_EPIL_PREHEADER]]:
+; APPLE-NEXT: br label %[[LOOP_EPIL:.*]]
+; APPLE: [[LOOP_EPIL]]:
+; APPLE-NEXT: [[IV_EPIL1:%.*]] = phi i64 [ [[IV_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL1:%.*]], %[[LOOP_EPIL]] ]
+; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[LOOP_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[LOOP_EPIL]] ]
+; APPLE-NEXT: [[SCALED_IV_EPIL1:%.*]] = mul nuw nsw i64 [[IV_EPIL1]], [[SCALE]]
+; APPLE-NEXT: [[GEP_SRC_EPIL1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL1]]
+; APPLE-NEXT: [[L_EPIL1:%.*]] = load float, ptr [[GEP_SRC_EPIL1]], align 4
+; APPLE-NEXT: [[GEP_DST_EPIL1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_EPIL1]]
+; APPLE-NEXT: store float [[L_EPIL1]], ptr [[GEP_DST_EPIL1]], align 4
+; APPLE-NEXT: [[IV_NEXT_EPIL1]] = add nuw nsw i64 [[IV_EPIL1]], 1
+; APPLE-NEXT: [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL1]], [[N]]
+; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
+; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
+; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[LOOP_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; APPLE: [[EXIT_EPILOG_LCSSA]]:
+; APPLE-NEXT: br label %[[EXIT]]
; APPLE: [[EXIT]]:
; APPLE-NEXT: ret void
;
@@ -99,13 +173,21 @@ define void @early_continue_dep_on_load_large(ptr %p.1, ptr %p.2, i64 %N, i32 %x
; APPLE-LABEL: define void @early_continue_dep_on_load_large(
; APPLE-SAME: ptr [[P_1:%.*]], ptr [[P_2:%.*]], i64 [[N:%.*]], i32 [[X:%.*]], i32 [[WIDTH:%.*]], i32 [[T_1:%.*]], i32 [[T_2:%.*]]) #[[ATTR0]] {
; APPLE-NEXT: [[ENTRY:.*]]:
+; APPLE-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
+; APPLE-NEXT: [[TMP1:%.*]] = add i64 [[N]], -2
+; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP0]], 3
+; APPLE-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 3
+; APPLE-NEXT: br i1 [[TMP2]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; APPLE: [[ENTRY_NEW]]:
+; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[TMP0]], [[XTRAITER]]
; APPLE-NEXT: br label %[[LOOP_HEADER:.*]]
; APPLE: [[LOOP_HEADER]]:
-; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 1, %[[ENTRY]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP_LATCH:.*]] ]
+; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 1, %[[ENTRY_NEW]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP_LATCH_3:.*]] ]
+; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[LOOP_LATCH_3]] ]
; APPLE-NEXT: [[GEP_EPIL:%.*]] = getelementptr { i32, i8, i8, [2 x i8] }, ptr [[P_1]], i64 [[IV_EPIL]]
; APPLE-NEXT: [[L_1_EPIL:%.*]] = load i32, ptr [[GEP_EPIL]], align 4
; APPLE-NEXT: [[CMP6_NOT_EPIL:%.*]] = icmp sgt i32 [[L_1_EPIL]], [[T_1]]
-; APPLE-NEXT: br i1 [[CMP6_NOT_EPIL]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
+; APPLE-NEXT: br i1 [[CMP6_NOT_EPIL]], label %[[THEN:.*]], label %[[LOOP_LATCH:.*]]
; APPLE: [[THEN]]:
; APPLE-NEXT: [[GEP_4_EPIL:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_EPIL]], i64 4
; APPLE-NEXT: [[L_2_EPIL:%.*]] = load i8, ptr [[GEP_4_EPIL]], align 4
@@ -150,9 +232,224 @@ define void @early_continue_dep_on_load_large(ptr %p.1, ptr %p.2, i64 %N, i32 %x
; APPLE-NEXT: store i8 [[RES_EPIL]], ptr [[GEP_5_EPIL]], align 1
; APPLE-NEXT: br label %[[LOOP_LATCH]]
; APPLE: [[LOOP_LATCH]]:
-; APPLE-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
-; APPLE-NEXT: [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]]
-; APPLE-NEXT: br i1 [[EC_EPIL]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; APPLE-NEXT: [[IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[IV_EPIL]], 1
+; APPLE-NEXT: [[GEP_1:%.*]] = getelementptr { i32, i8, i8, [2 x i8] }, ptr [[P_1]], i64 [[IV_NEXT_EPIL]]
+; APPLE-NEXT: [[L_1_1:%.*]] = load i32, ptr [[GEP_1]], align 4
+; APPLE-NEXT: [[C_1_1:%.*]] = icmp sgt i32 [[L_1_1]], [[T_1]]
+; APPLE-NEXT: br i1 [[C_1_1]], label %[[THEN_1:.*]], label %[[LOOP_LATCH_1:.*]]
+; APPLE: [[THEN_1]]:
+; APPLE-NEXT: [[GEP_4_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_1]], i64 4
+; APPLE-NEXT: [[L_2_1:%.*]] = load i8, ptr [[GEP_4_1]], align 4
+; APPLE-NEXT: [[C_2_1:%.*]] = icmp ugt i8 [[L_2_1]], 7
+; APPLE-NEXT: br i1 [[C_2_1]], label %[[MERGE_11:.*]], label %[[ELSE_1:.*]]
+; APPLE: [[ELSE_1]]:
+; APPLE-NEXT: [[CONV_I_1:%.*]] = zext nneg i8 [[L_2_1]] to i64
+; APPLE-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds [9 x i8], ptr @A, i64 0, i64 [[CONV_I_1]]
+; APPLE-NEXT: [[L_3_1:%.*]] = load i8, ptr [[GEP_A_1]], align 1
+; APPLE-NEXT: [[IDXPROM_I_1:%.*]] = sext i8 [[L_3_1]] to i64
+; APPLE-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds [8 x i32], ptr @B, i64 0, i64 [[IDXPROM_I_1]]
+; APPLE-NEXT: [[L_4_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
+; APPLE-NEXT: [[GEP_C_1:%.*]] = getelementptr inbounds [8 x i32], ptr @C, i64 0, i64 [[IDXPROM_I_1]]
+; APPLE-NEXT: [[L_5_1:%.*]] = load i32, ptr [[GEP_C_1]], align 4
+; APPLE-NEXT: br label %[[MERGE_11]]
+; APPLE: [[MERGE_11]]:
+; APPLE-NEXT: [[MERGE_1_1:%.*]] = phi i32 [ 0, %[[THEN_1]] ], [ [[L_4_1]], %[[ELSE_1]] ]
+; APPLE-NEXT: [[MERGE_2_1:%.*]] = phi i32 [ 0, %[[THEN_1]] ], [ [[L_5_1]], %[[ELSE_1]] ]
+; APPLE-NEXT: [[ADD14_1:%.*]] = add nsw i32 [[MERGE_2_1]], [[X]]
+; APPLE-NEXT: [[MUL15_1:%.*]] = mul nsw i32 [[ADD14_1]], [[WIDTH]]
+; APPLE-NEXT: [[TMP4:%.*]] = trunc nuw nsw i64 [[IV_NEXT_EPIL]] to i32
+; APPLE-NEXT: [[ADD16_1:%.*]] = add nsw i32 [[MERGE_1_1]], [[TMP4]]
+; APPLE-NEXT: [[ADD17_1:%.*]] = add nsw i32 [[ADD16_1]], [[MUL15_1]]
+; APPLE-NEXT: [[IDXPROM18_1:%.*]] = sext i32 [[ADD17_1]] to i64
+; APPLE-NEXT: [[GEP_P_2_1:%.*]] = getelementptr inbounds { i32, i8, i8, [2 x i8] }, ptr [[P_2]], i64 [[IDXPROM18_1]]
+; APPLE-NEXT: [[L_6_1:%.*]] = load i32, ptr [[GEP_P_2_1]], align 4
+; APPLE-NEXT: [[SUB_1:%.*]] = sub nsw i32 [[X]], [[MERGE_2_1]]
+; APPLE-NEXT: [[MUL21_1:%.*]] = mul nsw i32 [[SUB_1]], [[WIDTH]]
+; APPLE-NEXT: [[SUB22_1:%.*]] = sub i32 [[TMP4]], [[MERGE_1_1]]
+; APPLE-NEXT: [[ADD23_1:%.*]] = add nsw i32 [[SUB22_1]], [[MUL21_1]]
+; APPLE-NEXT: [[IDXPROM24_1:%.*]] = sext i32 [[ADD23_1]] to i64
+; APPLE-NEXT: [[GEP_P2_1_1:%.*]] = getelementptr inbounds { i32, i8, i8, [2 x i8] }, ptr [[P_2]], i64 [[IDXPROM24_1]]
+; APPLE-NEXT: [[L_7_1:%.*]] = load i32, ptr [[GEP_P2_1_1]], align 4
+; APPLE-NEXT: [[C_3_1:%.*]] = icmp sgt i32 [[L_1_1]], [[L_6_1]]
+; APPLE-NEXT: [[C_4_1:%.*]] = icmp sgt i32 [[L_1_1]], [[L_7_1]]
+; APPLE-NEXT: [[AND34_1:%.*]] = and i1 [[C_3_1]], [[C_4_1]]
+; APPLE-NEXT: br i1 [[AND34_1]], label %[[STORE_RES_1:.*]], label %[[LOOP_LATCH_1]]
+; APPLE: [[STORE_RES_1]]:
+; APPLE-NEXT: [[C_5_1:%.*]] = icmp sgt i32 [[L_1_1]], [[T_2]]
+; APPLE-NEXT: [[GEP_5_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_1]], i64 5
+; APPLE-NEXT: [[RES_1:%.*]] = select i1 [[C_5_1]], i8 1, i8 2
+; APPLE-NEXT: store i8 [[RES_1]], ptr [[GEP_5_1]], align 1
+; APPLE-NEXT: br label %[[LOOP_LATCH_1]]
+; APPLE: [[LOOP_LATCH_1]]:
+; APPLE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV_EPIL]], 2
+; APPLE-NEXT: [[GEP_2:%.*]] = getelementptr { i32, i8, i8, [2 x i8] }, ptr [[P_1]], i64 [[IV_NEXT_1]]
+; APPLE-NEXT: [[L_1_2:%.*]] = load i32, ptr [[GEP_2]], align 4
+; APPLE-NEXT: [[C_1_2:%.*]] = icmp sgt i32 [[L_1_2]], [[T_1]]
+; APPLE-NEXT: br i1 [[C_1_2]], label %[[THEN_2:.*]], label %[[LOOP_LATCH_2:.*]]
+; APPLE: [[THEN_2]]:
+; APPLE-NEXT: [[GEP_4_2:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_2]], i64 4
+; APPLE-NEXT: [[L_2_2:%.*]] = load i8, ptr [[GEP_4_2]], align 4
+; APPLE-NEXT: [[C_2_2:%.*]] = icmp ugt i8 [[L_2_2]], 7
+; APPLE-NEXT: br i1 [[C_2_2]], label %[[MERGE_22:.*]], label %[[ELSE_2:.*]]
+; APPLE: [[ELSE_2]]:
+; APPLE-NEXT: [[CONV_I_2:%.*]] = zext nneg i8 [[L_2_2]] to i64
+; APPLE-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds [9 x i8], ptr @A, i64 0, i64 [[CONV_I_2]]
+; APPLE-NEXT: [[L_3_2:%.*]] = load i8, ptr [[GEP_A_2]], align 1
+; APPLE-NEXT: [[IDXPROM_I_2:%.*]] = sext i8 [[L_3_2]] to i64
+; APPLE-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds [8 x i32], ptr @B, i64 0, i64 [[IDXPROM_I_2]]
+; ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/118499
More information about the llvm-commits
mailing list