[llvm] 75d4812 - [AArch64] Turn MaxInterleaveFactor into a subtarget feature (#171088)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 2 07:45:32 PST 2026
Author: David Green
Date: 2026-01-02T15:45:27Z
New Revision: 75d4812532b0bb471b2d55d76c68a619d58e8f46
URL: https://github.com/llvm/llvm-project/commit/75d4812532b0bb471b2d55d76c68a619d58e8f46
DIFF: https://github.com/llvm/llvm-project/commit/75d4812532b0bb471b2d55d76c68a619d58e8f46.diff
LOG: [AArch64] Turn MaxInterleaveFactor into a subtarget feature (#171088)
The default value for MaxInterleaveFactor is 2, but some CPUs prefer a
wider factor of 4. This adds a subtarget feature so that CPUs can
override the default in their tuning features, keeping more of the
options together in one place.
Added:
Modified:
llvm/lib/Target/AArch64/AArch64Features.td
llvm/lib/Target/AArch64/AArch64Processors.td
llvm/lib/Target/AArch64/AArch64Subtarget.cpp
llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 5d9878aac507c..47b518dab3a08 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -923,6 +923,10 @@ def FeatureAggressiveInterleaving : SubtargetFeature<"aggressive-interleaving",
"AggressiveInterleaving", "true",
"Make use of aggressive interleaving during vectorization">;
+def FeatureMaxInterleaveFactor4 : SubtargetFeature<
+ "max-interleave-factor-4", "MaxInterleaveFactor", "4",
+ "Set the MaxInterleaveFactor to 4 (from the default 2)">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index fe763a1ff0460..0fff083ee9db9 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -83,7 +83,8 @@ def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
FeatureAddrLSLSlow14,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
- FeaturePredictableSelectIsExpensive]>;
+ FeaturePredictableSelectIsExpensive,
+ FeatureMaxInterleaveFactor4]>;
def TuneA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65",
"Cortex-A65 ARM processors", [
@@ -354,7 +355,8 @@ def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX",
FeatureArithmeticBccFusion,
FeatureStorePairSuppress,
FeaturePredictableSelectIsExpensive,
- FeatureDisableUnpredicatedLdStLower]>;
+ FeatureDisableUnpredicatedLdStLower,
+ FeatureMaxInterleaveFactor4]>;
def TuneMONAKA : SubtargetFeature<"fujitsu-monaka", "ARMProcFamily", "MONAKA",
"Fujitsu FUJITSU-MONAKA processors", [
@@ -376,7 +378,8 @@ def TuneOlympus : SubtargetFeature<"olympus", "ARMProcFamily", "Olympus",
FeatureFuseAdrpAdd,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
- FeatureUseFixedOverScalableIfEqualCost]>;
+ FeatureUseFixedOverScalableIfEqualCost,
+ FeatureMaxInterleaveFactor4]>;
// Note that cyclone does not fuse AES instructions, but newer apple chips do
// perform the fusion and cyclone is used by default when targeting apple OSes.
@@ -479,7 +482,8 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
- FeatureZCZeroingFPR128]>;
+ FeatureZCZeroingFPR128,
+ FeatureMaxInterleaveFactor4]>;
def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
"Apple A15", [
@@ -500,7 +504,8 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
- FeatureZCZeroingFPR128]>;
+ FeatureZCZeroingFPR128,
+ FeatureMaxInterleaveFactor4]>;
def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
"Apple A16", [
@@ -521,7 +526,8 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
- FeatureZCZeroingFPR128]>;
+ FeatureZCZeroingFPR128,
+ FeatureMaxInterleaveFactor4]>;
def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
"Apple A17", [
@@ -542,7 +548,8 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
- FeatureZCZeroingFPR128]>;
+ FeatureZCZeroingFPR128,
+ FeatureMaxInterleaveFactor4]>;
def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
"Apple M4", [
@@ -562,7 +569,8 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
- FeatureZCZeroingFPR128]>;
+ FeatureZCZeroingFPR128,
+ FeatureMaxInterleaveFactor4]>;
def TuneAppleM5 : SubtargetFeature<"apple-m5", "ARMProcFamily", "AppleM5",
"Apple M5", [
@@ -582,7 +590,8 @@ def TuneAppleM5 : SubtargetFeature<"apple-m5", "ARMProcFamily", "AppleM5",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureNoZCZeroingFPR64,
- FeatureZCZeroingFPR128]>;
+ FeatureZCZeroingFPR128,
+ FeatureMaxInterleaveFactor4]>;
def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
"Samsung Exynos-M3 processors",
@@ -596,7 +605,8 @@ def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
FeatureStorePairSuppress,
FeatureALULSLFast,
FeaturePostRAScheduler,
- FeaturePredictableSelectIsExpensive]>;
+ FeaturePredictableSelectIsExpensive,
+ FeatureMaxInterleaveFactor4]>;
// Re-uses some scheduling and tunings from the ExynosM3 proc family.
def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
@@ -615,7 +625,8 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureZCZeroingGPR32,
- FeatureZCZeroingGPR64]>;
+ FeatureZCZeroingGPR64,
+ FeatureMaxInterleaveFactor4]>;
def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
"Qualcomm Kryo processors", [
@@ -624,7 +635,8 @@ def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureALULSLFast,
- FeatureStorePairSuppress]>;
+ FeatureStorePairSuppress,
+ FeatureMaxInterleaveFactor4]>;
def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
"Qualcomm Falkor processors", [
@@ -634,7 +646,8 @@ def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
FeatureZCZeroingGPR64,
FeatureStorePairSuppress,
FeatureALULSLFast,
- FeatureSlowSTRQro]>;
+ FeatureSlowSTRQro,
+ FeatureMaxInterleaveFactor4]>;
def TuneNeoverseE1 : SubtargetFeature<"neoversee1", "ARMProcFamily", "NeoverseE1",
"Neoverse E1 ARM processors", [
@@ -682,7 +695,8 @@ def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Ne
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
- FeaturePredictableSelectIsExpensive]>;
+ FeaturePredictableSelectIsExpensive,
+ FeatureMaxInterleaveFactor4]>;
def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1",
"Neoverse V1 ARM processors", [
@@ -711,7 +725,8 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
FeatureUseFixedOverScalableIfEqualCost,
FeatureAvoidLDAPUR,
FeaturePredictableSelectIsExpensive,
- FeatureDisableLatencySchedHeuristic]>;
+ FeatureDisableLatencySchedHeuristic,
+ FeatureMaxInterleaveFactor4]>;
def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3",
"Neoverse V3 ARM processors", [
@@ -723,7 +738,8 @@ def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeatureAvoidLDAPUR,
- FeaturePredictableSelectIsExpensive]>;
+ FeaturePredictableSelectIsExpensive,
+ FeatureMaxInterleaveFactor4]>;
def TuneNeoverseV3AE : SubtargetFeature<"neoversev3AE", "ARMProcFamily", "NeoverseV3",
"Neoverse V3AE ARM processors", [
@@ -735,7 +751,8 @@ def TuneNeoverseV3AE : SubtargetFeature<"neoversev3AE", "ARMProcFamily", "Neover
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeatureAvoidLDAPUR,
- FeaturePredictableSelectIsExpensive]>;
+ FeaturePredictableSelectIsExpensive,
+ FeatureMaxInterleaveFactor4]>;
def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
"Qualcomm Saphira processors", [
@@ -744,7 +761,8 @@ def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureStorePairSuppress,
- FeatureALULSLFast]>;
+ FeatureALULSLFast,
+ FeatureMaxInterleaveFactor4]>;
def TuneThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "ThunderX2T99",
"Cavium ThunderX2 processors", [
@@ -752,7 +770,8 @@ def TuneThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "Thund
FeatureArithmeticBccFusion,
FeaturePostRAScheduler,
FeatureStorePairSuppress,
- FeaturePredictableSelectIsExpensive]>;
+ FeaturePredictableSelectIsExpensive,
+ FeatureMaxInterleaveFactor4]>;
def TuneThunderX3T110 : SubtargetFeature<"thunderx3t110", "ARMProcFamily",
"ThunderX3T110",
@@ -763,7 +782,8 @@ def TuneThunderX3T110 : SubtargetFeature<"thunderx3t110", "ARMProcFamily",
FeaturePredictableSelectIsExpensive,
FeatureBalanceFPOps,
FeatureStorePairSuppress,
- FeatureStrictAlign]>;
+ FeatureStrictAlign,
+ FeatureMaxInterleaveFactor4]>;
def TuneThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX",
"Cavium ThunderX processors", [
@@ -811,7 +831,8 @@ def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureLdpAlignedOnly,
- FeatureStpAlignedOnly]>;
+ FeatureStpAlignedOnly,
+ FeatureMaxInterleaveFactor4]>;
def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
"Ampere Computing Ampere-1A processors", [
@@ -827,7 +848,8 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
FeatureFuseAddSub2RegAndConstOne,
FeatureStorePairSuppress,
FeatureLdpAlignedOnly,
- FeatureStpAlignedOnly]>;
+ FeatureStpAlignedOnly,
+ FeatureMaxInterleaveFactor4]>;
def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B",
"Ampere Computing Ampere-1B processors", [
@@ -844,7 +866,8 @@ def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B",
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive,
FeatureLdpAlignedOnly,
- FeatureStpAlignedOnly]>;
+ FeatureStpAlignedOnly,
+ FeatureMaxInterleaveFactor4]>;
def TuneOryon : SubtargetFeature<"oryon-1", "ARMProcFamily",
"Oryon",
@@ -867,7 +890,8 @@ def TuneOryon : SubtargetFeature<"oryon-1", "ARMProcFamily",
FeaturePerfMon,
FeatureSPE,
FeaturePostRAScheduler,
- HasV8_6aOps]>;
+ HasV8_6aOps,
+ FeatureMaxInterleaveFactor4]>;
def ProcessorFeatures {
list<SubtargetFeature> A320 = [HasV9_2aOps, FeatureNEON, FeatureMTE,
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 911eedbe36185..f3640239809dd 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -149,7 +149,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
MaxBytesForLoopAlignment = 8;
break;
case CortexA57:
- MaxInterleaveFactor = 4;
PrefFunctionAlignment = Align(16);
PrefLoopAlignment = Align(16);
MaxBytesForLoopAlignment = 8;
@@ -203,7 +202,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
CacheLineSize = 256;
PrefFunctionAlignment = Align(8);
PrefLoopAlignment = Align(4);
- MaxInterleaveFactor = 4;
PrefetchDistance = 128;
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 4;
@@ -227,17 +225,13 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
PrefetchDistance = 280;
MinPrefetchStride = 2048;
MaxPrefetchIterationsAhead = 3;
- if (isAppleMLike())
- MaxInterleaveFactor = 4;
break;
case ExynosM3:
- MaxInterleaveFactor = 4;
MaxJumpTableSize = 20;
PrefFunctionAlignment = Align(32);
PrefLoopAlignment = Align(16);
break;
case Falkor:
- MaxInterleaveFactor = 4;
// FIXME: remove this to enable 64-bit SLP if performance looks good.
MinVectorRegisterBitWidth = 128;
CacheLineSize = 128;
@@ -246,7 +240,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
MaxPrefetchIterationsAhead = 8;
break;
case Kryo:
- MaxInterleaveFactor = 4;
VectorInsertExtractBaseCost = 2;
CacheLineSize = 128;
PrefetchDistance = 740;
@@ -267,7 +260,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
case NeoverseV3:
CacheLineSize = 64;
EpilogueVectorizationMinVF = 8;
- MaxInterleaveFactor = 4;
ScatterOverhead = 13;
[[fallthrough]];
case NeoverseN2:
@@ -287,10 +279,8 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
case Neoverse512TVB:
PrefFunctionAlignment = Align(16);
VScaleForTuning = 1;
- MaxInterleaveFactor = 4;
break;
case Saphira:
- MaxInterleaveFactor = 4;
// FIXME: remove this to enable 64-bit SLP if performance looks good.
MinVectorRegisterBitWidth = 128;
break;
@@ -298,7 +288,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
CacheLineSize = 64;
PrefFunctionAlignment = Align(8);
PrefLoopAlignment = Align(4);
- MaxInterleaveFactor = 4;
PrefetchDistance = 128;
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 4;
@@ -324,7 +313,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
CacheLineSize = 64;
PrefFunctionAlignment = Align(16);
PrefLoopAlignment = Align(4);
- MaxInterleaveFactor = 4;
PrefetchDistance = 128;
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 4;
@@ -337,18 +325,15 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
CacheLineSize = 64;
PrefFunctionAlignment = Align(64);
PrefLoopAlignment = Align(64);
- MaxInterleaveFactor = 4;
break;
case Oryon:
CacheLineSize = 64;
PrefFunctionAlignment = Align(16);
- MaxInterleaveFactor = 4;
PrefetchDistance = 128;
MinPrefetchStride = 1024;
break;
case Olympus:
EpilogueVectorizationMinVF = 8;
- MaxInterleaveFactor = 4;
ScatterOverhead = 13;
PrefFunctionAlignment = Align(16);
PrefLoopAlignment = Align(32);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll
index f7060ec3512ac..c2224858049b7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll
@@ -1,16 +1,110 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize -mtriple=arm64 -S %s | FileCheck --check-prefix=INTERLEAVE-2 %s
+; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mattr=+max-interleave-factor-4 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=cortex-a57 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=cortex-a75 -S %s | FileCheck --check-prefix=INTERLEAVE-2 %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-m1 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a14 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a15 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a16 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
+; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a17 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
+; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a18 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v2 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v3 -S %s | FileCheck --check-prefix=INTERLEAVE-4-VLA %s
+; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v3ae -S %s | FileCheck --check-prefix=INTERLEAVE-4-VLA %s
+; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=exynos-m5 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
; Tests for selecting interleave counts for loops with loads and stores.
define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8 %b) {
+; INTERLEAVE-2-LABEL: @interleave_single_load_store(
+; INTERLEAVE-2-NEXT: iter.check:
+; INTERLEAVE-2-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
+; INTERLEAVE-2-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
+; INTERLEAVE-2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
+; INTERLEAVE-2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; INTERLEAVE-2: vector.memcheck:
+; INTERLEAVE-2-NEXT: [[TMP0:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; INTERLEAVE-2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
+; INTERLEAVE-2-NEXT: br i1 [[DIFF_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; INTERLEAVE-2: vector.main.loop.iter.check:
+; INTERLEAVE-2-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[N]], 32
+; INTERLEAVE-2-NEXT: br i1 [[MIN_ITERS_CHECK3]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; INTERLEAVE-2: vector.ph:
+; INTERLEAVE-2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
+; INTERLEAVE-2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; INTERLEAVE-2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B:%.*]], i64 0
+; INTERLEAVE-2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
+; INTERLEAVE-2-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i64 0
+; INTERLEAVE-2-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT5]], <16 x i8> poison, <16 x i32> zeroinitializer
+; INTERLEAVE-2-NEXT: br label [[VECTOR_BODY:%.*]]
+; INTERLEAVE-2: vector.body:
+; INTERLEAVE-2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
+; INTERLEAVE-2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 16
+; INTERLEAVE-2-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; INTERLEAVE-2-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; INTERLEAVE-2-NEXT: [[TMP7:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; INTERLEAVE-2-NEXT: [[TMP8:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD4]], [[BROADCAST_SPLAT]]
+; INTERLEAVE-2-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT6]])
+; INTERLEAVE-2-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[WIDE_LOAD4]], <16 x i8> [[BROADCAST_SPLAT6]])
+; INTERLEAVE-2-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP7]], <16 x i8> [[BROADCAST_SPLAT]], <16 x i8> [[TMP9]]
+; INTERLEAVE-2-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP8]], <16 x i8> [[BROADCAST_SPLAT]], <16 x i8> [[TMP10]]
+; INTERLEAVE-2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
+; INTERLEAVE-2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i64 16
+; INTERLEAVE-2-NEXT: store <16 x i8> [[TMP11]], ptr [[TMP13]], align 1
+; INTERLEAVE-2-NEXT: store <16 x i8> [[TMP12]], ptr [[TMP16]], align 1
+; INTERLEAVE-2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; INTERLEAVE-2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; INTERLEAVE-2: middle.block:
+; INTERLEAVE-2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; INTERLEAVE-2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; INTERLEAVE-2: vec.epilog.iter.check:
+; INTERLEAVE-2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
+; INTERLEAVE-2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
+; INTERLEAVE-2: vec.epilog.ph:
+; INTERLEAVE-2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; INTERLEAVE-2-NEXT: [[N_MOD_VF7:%.*]] = urem i64 [[N]], 8
+; INTERLEAVE-2-NEXT: [[N_VEC8:%.*]] = sub i64 [[N]], [[N_MOD_VF7]]
+; INTERLEAVE-2-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+; INTERLEAVE-2-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT12]], <8 x i8> poison, <8 x i32> zeroinitializer
+; INTERLEAVE-2-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+; INTERLEAVE-2-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT14]], <8 x i8> poison, <8 x i32> zeroinitializer
+; INTERLEAVE-2-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; INTERLEAVE-2: vec.epilog.vector.body:
+; INTERLEAVE-2-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; INTERLEAVE-2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX10]]
+; INTERLEAVE-2-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, ptr [[TMP19]], align 1
+; INTERLEAVE-2-NEXT: [[TMP21:%.*]] = icmp sgt <8 x i8> [[WIDE_LOAD11]], [[BROADCAST_SPLAT13]]
+; INTERLEAVE-2-NEXT: [[TMP22:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[WIDE_LOAD11]], <8 x i8> [[BROADCAST_SPLAT15]])
+; INTERLEAVE-2-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP21]], <8 x i8> [[BROADCAST_SPLAT13]], <8 x i8> [[TMP22]]
+; INTERLEAVE-2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX10]]
+; INTERLEAVE-2-NEXT: store <8 x i8> [[TMP23]], ptr [[TMP24]], align 1
+; INTERLEAVE-2-NEXT: [[INDEX_NEXT16]] = add nuw i64 [[INDEX10]], 8
+; INTERLEAVE-2-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT16]], [[N_VEC8]]
+; INTERLEAVE-2-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; INTERLEAVE-2: vec.epilog.middle.block:
+; INTERLEAVE-2-NEXT: [[CMP_N9:%.*]] = icmp eq i64 [[N]], [[N_VEC8]]
+; INTERLEAVE-2-NEXT: br i1 [[CMP_N9]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; INTERLEAVE-2: vec.epilog.scalar.ph:
+; INTERLEAVE-2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; INTERLEAVE-2-NEXT: br label [[LOOP:%.*]]
+; INTERLEAVE-2: loop:
+; INTERLEAVE-2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; INTERLEAVE-2-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[IV]]
+; INTERLEAVE-2-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; INTERLEAVE-2-NEXT: [[CMP:%.*]] = icmp sgt i8 [[L]], [[B]]
+; INTERLEAVE-2-NEXT: [[MAX:%.*]] = tail call i8 @llvm.smax.i8(i8 [[L]], i8 [[A]])
+; INTERLEAVE-2-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i8 [[B]], i8 [[MAX]]
+; INTERLEAVE-2-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]]
+; INTERLEAVE-2-NEXT: store i8 [[SEL]], ptr [[GEP_DST]], align 1
+; INTERLEAVE-2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; INTERLEAVE-2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; INTERLEAVE-2-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; INTERLEAVE-2: exit:
+; INTERLEAVE-2-NEXT: ret void
+;
; INTERLEAVE-4-LABEL: @interleave_single_load_store(
; INTERLEAVE-4-NEXT: iter.check:
; INTERLEAVE-4-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
@@ -113,99 +207,117 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8
; INTERLEAVE-4: exit:
; INTERLEAVE-4-NEXT: ret void
;
-; INTERLEAVE-2-LABEL: @interleave_single_load_store(
-; INTERLEAVE-2-NEXT: iter.check:
-; INTERLEAVE-2-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
-; INTERLEAVE-2-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
-; INTERLEAVE-2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; INTERLEAVE-2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; INTERLEAVE-2: vector.memcheck:
-; INTERLEAVE-2-NEXT: [[TMP0:%.*]] = sub i64 [[DST1]], [[SRC2]]
-; INTERLEAVE-2-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
-; INTERLEAVE-2-NEXT: br i1 [[DIFF_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; INTERLEAVE-2: vector.main.loop.iter.check:
-; INTERLEAVE-2-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[N]], 32
-; INTERLEAVE-2-NEXT: br i1 [[MIN_ITERS_CHECK3]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; INTERLEAVE-2: vector.ph:
-; INTERLEAVE-2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
-; INTERLEAVE-2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; INTERLEAVE-2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B:%.*]], i64 0
-; INTERLEAVE-2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
-; INTERLEAVE-2-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i64 0
-; INTERLEAVE-2-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT5]], <16 x i8> poison, <16 x i32> zeroinitializer
-; INTERLEAVE-2-NEXT: br label [[VECTOR_BODY:%.*]]
-; INTERLEAVE-2: vector.body:
-; INTERLEAVE-2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
-; INTERLEAVE-2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 16
-; INTERLEAVE-2-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; INTERLEAVE-2-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
-; INTERLEAVE-2-NEXT: [[TMP7:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; INTERLEAVE-2-NEXT: [[TMP8:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD4]], [[BROADCAST_SPLAT]]
-; INTERLEAVE-2-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT6]])
-; INTERLEAVE-2-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[WIDE_LOAD4]], <16 x i8> [[BROADCAST_SPLAT6]])
-; INTERLEAVE-2-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP7]], <16 x i8> [[BROADCAST_SPLAT]], <16 x i8> [[TMP9]]
-; INTERLEAVE-2-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP8]], <16 x i8> [[BROADCAST_SPLAT]], <16 x i8> [[TMP10]]
-; INTERLEAVE-2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
-; INTERLEAVE-2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i64 16
-; INTERLEAVE-2-NEXT: store <16 x i8> [[TMP11]], ptr [[TMP13]], align 1
-; INTERLEAVE-2-NEXT: store <16 x i8> [[TMP12]], ptr [[TMP16]], align 1
-; INTERLEAVE-2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; INTERLEAVE-2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; INTERLEAVE-2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; INTERLEAVE-2: middle.block:
-; INTERLEAVE-2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; INTERLEAVE-2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; INTERLEAVE-2: vec.epilog.iter.check:
-; INTERLEAVE-2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
-; INTERLEAVE-2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
-; INTERLEAVE-2: vec.epilog.ph:
-; INTERLEAVE-2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; INTERLEAVE-2-NEXT: [[N_MOD_VF7:%.*]] = urem i64 [[N]], 8
-; INTERLEAVE-2-NEXT: [[N_VEC8:%.*]] = sub i64 [[N]], [[N_MOD_VF7]]
-; INTERLEAVE-2-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
-; INTERLEAVE-2-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT12]], <8 x i8> poison, <8 x i32> zeroinitializer
-; INTERLEAVE-2-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
-; INTERLEAVE-2-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT14]], <8 x i8> poison, <8 x i32> zeroinitializer
-; INTERLEAVE-2-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; INTERLEAVE-2: vec.epilog.vector.body:
-; INTERLEAVE-2-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; INTERLEAVE-2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX10]]
-; INTERLEAVE-2-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, ptr [[TMP19]], align 1
-; INTERLEAVE-2-NEXT: [[TMP21:%.*]] = icmp sgt <8 x i8> [[WIDE_LOAD11]], [[BROADCAST_SPLAT13]]
-; INTERLEAVE-2-NEXT: [[TMP22:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[WIDE_LOAD11]], <8 x i8> [[BROADCAST_SPLAT15]])
-; INTERLEAVE-2-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP21]], <8 x i8> [[BROADCAST_SPLAT13]], <8 x i8> [[TMP22]]
-; INTERLEAVE-2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX10]]
-; INTERLEAVE-2-NEXT: store <8 x i8> [[TMP23]], ptr [[TMP24]], align 1
-; INTERLEAVE-2-NEXT: [[INDEX_NEXT16]] = add nuw i64 [[INDEX10]], 8
-; INTERLEAVE-2-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT16]], [[N_VEC8]]
-; INTERLEAVE-2-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; INTERLEAVE-2: vec.epilog.middle.block:
-; INTERLEAVE-2-NEXT: [[CMP_N9:%.*]] = icmp eq i64 [[N]], [[N_VEC8]]
-; INTERLEAVE-2-NEXT: br i1 [[CMP_N9]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
-; INTERLEAVE-2: vec.epilog.scalar.ph:
-; INTERLEAVE-2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; INTERLEAVE-2-NEXT: br label [[LOOP:%.*]]
-; INTERLEAVE-2: loop:
-; INTERLEAVE-2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; INTERLEAVE-2-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[IV]]
-; INTERLEAVE-2-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
-; INTERLEAVE-2-NEXT: [[CMP:%.*]] = icmp sgt i8 [[L]], [[B]]
-; INTERLEAVE-2-NEXT: [[MAX:%.*]] = tail call i8 @llvm.smax.i8(i8 [[L]], i8 [[A]])
-; INTERLEAVE-2-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i8 [[B]], i8 [[MAX]]
-; INTERLEAVE-2-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]]
-; INTERLEAVE-2-NEXT: store i8 [[SEL]], ptr [[GEP_DST]], align 1
-; INTERLEAVE-2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; INTERLEAVE-2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; INTERLEAVE-2-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
-; INTERLEAVE-2: exit:
-; INTERLEAVE-2-NEXT: ret void
-;
; INTERLEAVE-4-VLA-LABEL: @interleave_single_load_store(
-; INTERLEAVE-4-VLA: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
-; INTERLEAVE-4-VLA-NEXT: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
-; INTERLEAVE-4-VLA-NEXT: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
-; INTERLEAVE-4-VLA-NEXT: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
+; INTERLEAVE-4-VLA-NEXT: iter.check:
+; INTERLEAVE-4-VLA-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64
+; INTERLEAVE-4-VLA-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
+; INTERLEAVE-4-VLA-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
+; INTERLEAVE-4-VLA-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; INTERLEAVE-4-VLA: vector.memcheck:
+; INTERLEAVE-4-VLA-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-4-VLA-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
+; INTERLEAVE-4-VLA-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; INTERLEAVE-4-VLA-NEXT: [[TMP3:%.*]] = sub i64 [[DST1]], [[SRC2]]
+; INTERLEAVE-4-VLA-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
+; INTERLEAVE-4-VLA-NEXT: br i1 [[DIFF_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; INTERLEAVE-4-VLA: vector.main.loop.iter.check:
+; INTERLEAVE-4-VLA-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-4-VLA-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 6
+; INTERLEAVE-4-VLA-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[N]], [[TMP5]]
+; INTERLEAVE-4-VLA-NEXT: br i1 [[MIN_ITERS_CHECK3]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; INTERLEAVE-4-VLA: vector.ph:
+; INTERLEAVE-4-VLA-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; INTERLEAVE-4-VLA-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP6]], 16
+; INTERLEAVE-4-VLA-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP10]], 4
+; INTERLEAVE-4-VLA-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP7]]
+; INTERLEAVE-4-VLA-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; INTERLEAVE-4-VLA-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[B:%.*]], i64 0
+; INTERLEAVE-4-VLA-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; INTERLEAVE-4-VLA-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[A:%.*]], i64 0
+; INTERLEAVE-4-VLA-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT4]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; INTERLEAVE-4-VLA-NEXT: br label [[VECTOR_BODY:%.*]]
+; INTERLEAVE-4-VLA: vector.body:
+; INTERLEAVE-4-VLA-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-4-VLA-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP13:%.*]] = mul nuw nsw i64 [[TMP10]], 2
+; INTERLEAVE-4-VLA-NEXT: [[TMP16:%.*]] = mul nuw nsw i64 [[TMP10]], 3
+; INTERLEAVE-4-VLA-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[TMP10]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[TMP13]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[TMP16]]
+; INTERLEAVE-4-VLA-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
+; INTERLEAVE-4-VLA-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; INTERLEAVE-4-VLA-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; INTERLEAVE-4-VLA-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1
+; INTERLEAVE-4-VLA-NEXT: [[TMP18:%.*]] = icmp sgt <vscale x 16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP19:%.*]] = icmp sgt <vscale x 16 x i8> [[WIDE_LOAD6]], [[BROADCAST_SPLAT]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP20:%.*]] = icmp sgt <vscale x 16 x i8> [[WIDE_LOAD7]], [[BROADCAST_SPLAT]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP21:%.*]] = icmp sgt <vscale x 16 x i8> [[WIDE_LOAD8]], [[BROADCAST_SPLAT]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP22:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_LOAD]], <vscale x 16 x i8> [[BROADCAST_SPLAT5]])
+; INTERLEAVE-4-VLA-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_LOAD6]], <vscale x 16 x i8> [[BROADCAST_SPLAT5]])
+; INTERLEAVE-4-VLA-NEXT: [[TMP24:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_LOAD7]], <vscale x 16 x i8> [[BROADCAST_SPLAT5]])
+; INTERLEAVE-4-VLA-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_LOAD8]], <vscale x 16 x i8> [[BROADCAST_SPLAT5]])
+; INTERLEAVE-4-VLA-NEXT: [[TMP26:%.*]] = select <vscale x 16 x i1> [[TMP18]], <vscale x 16 x i8> [[BROADCAST_SPLAT]], <vscale x 16 x i8> [[TMP22]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[TMP19]], <vscale x 16 x i8> [[BROADCAST_SPLAT]], <vscale x 16 x i8> [[TMP23]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP28:%.*]] = select <vscale x 16 x i1> [[TMP20]], <vscale x 16 x i8> [[BROADCAST_SPLAT]], <vscale x 16 x i8> [[TMP24]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP29:%.*]] = select <vscale x 16 x i1> [[TMP21]], <vscale x 16 x i8> [[BROADCAST_SPLAT]], <vscale x 16 x i8> [[TMP25]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i64 [[TMP10]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i64 [[TMP13]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[TMP30]], i64 [[TMP16]]
+; INTERLEAVE-4-VLA-NEXT: store <vscale x 16 x i8> [[TMP26]], ptr [[TMP30]], align 1
+; INTERLEAVE-4-VLA-NEXT: store <vscale x 16 x i8> [[TMP27]], ptr [[TMP33]], align 1
+; INTERLEAVE-4-VLA-NEXT: store <vscale x 16 x i8> [[TMP28]], ptr [[TMP36]], align 1
+; INTERLEAVE-4-VLA-NEXT: store <vscale x 16 x i8> [[TMP29]], ptr [[TMP39]], align 1
+; INTERLEAVE-4-VLA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; INTERLEAVE-4-VLA-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; INTERLEAVE-4-VLA: middle.block:
+; INTERLEAVE-4-VLA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; INTERLEAVE-4-VLA-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; INTERLEAVE-4-VLA: vec.epilog.iter.check:
+; INTERLEAVE-4-VLA-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
+; INTERLEAVE-4-VLA-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
+; INTERLEAVE-4-VLA: vec.epilog.ph:
+; INTERLEAVE-4-VLA-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; INTERLEAVE-4-VLA-NEXT: [[N_MOD_VF9:%.*]] = urem i64 [[N]], 8
+; INTERLEAVE-4-VLA-NEXT: [[N_VEC10:%.*]] = sub i64 [[N]], [[N_MOD_VF9]]
+; INTERLEAVE-4-VLA-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+; INTERLEAVE-4-VLA-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT11]], <8 x i8> poison, <8 x i32> zeroinitializer
+; INTERLEAVE-4-VLA-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+; INTERLEAVE-4-VLA-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT13]], <8 x i8> poison, <8 x i32> zeroinitializer
+; INTERLEAVE-4-VLA-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; INTERLEAVE-4-VLA: vec.epilog.vector.body:
+; INTERLEAVE-4-VLA-NEXT: [[INDEX15:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; INTERLEAVE-4-VLA-NEXT: [[TMP41:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX15]]
+; INTERLEAVE-4-VLA-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i8>, ptr [[TMP41]], align 1
+; INTERLEAVE-4-VLA-NEXT: [[TMP42:%.*]] = icmp sgt <8 x i8> [[WIDE_LOAD16]], [[BROADCAST_SPLAT12]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP43:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[WIDE_LOAD16]], <8 x i8> [[BROADCAST_SPLAT14]])
+; INTERLEAVE-4-VLA-NEXT: [[TMP44:%.*]] = select <8 x i1> [[TMP42]], <8 x i8> [[BROADCAST_SPLAT12]], <8 x i8> [[TMP43]]
+; INTERLEAVE-4-VLA-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX15]]
+; INTERLEAVE-4-VLA-NEXT: store <8 x i8> [[TMP44]], ptr [[TMP45]], align 1
+; INTERLEAVE-4-VLA-NEXT: [[INDEX_NEXT17]] = add nuw i64 [[INDEX15]], 8
+; INTERLEAVE-4-VLA-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT17]], [[N_VEC10]]
+; INTERLEAVE-4-VLA-NEXT: br i1 [[TMP46]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; INTERLEAVE-4-VLA: vec.epilog.middle.block:
+; INTERLEAVE-4-VLA-NEXT: [[CMP_N18:%.*]] = icmp eq i64 [[N]], [[N_VEC10]]
+; INTERLEAVE-4-VLA-NEXT: br i1 [[CMP_N18]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; INTERLEAVE-4-VLA: vec.epilog.scalar.ph:
+; INTERLEAVE-4-VLA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; INTERLEAVE-4-VLA-NEXT: br label [[LOOP:%.*]]
+; INTERLEAVE-4-VLA: loop:
+; INTERLEAVE-4-VLA-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; INTERLEAVE-4-VLA-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[IV]]
+; INTERLEAVE-4-VLA-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
+; INTERLEAVE-4-VLA-NEXT: [[CMP:%.*]] = icmp sgt i8 [[L]], [[B]]
+; INTERLEAVE-4-VLA-NEXT: [[MAX:%.*]] = tail call i8 @llvm.smax.i8(i8 [[L]], i8 [[A]])
+; INTERLEAVE-4-VLA-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i8 [[B]], i8 [[MAX]]
+; INTERLEAVE-4-VLA-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]]
+; INTERLEAVE-4-VLA-NEXT: store i8 [[SEL]], ptr [[GEP_DST]], align 1
+; INTERLEAVE-4-VLA-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; INTERLEAVE-4-VLA-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; INTERLEAVE-4-VLA-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; INTERLEAVE-4-VLA: exit:
+; INTERLEAVE-4-VLA-NEXT: ret void
;
entry:
br label %loop
More information about the llvm-commits
mailing list