[llvm] [AArch64] - Improve costing for Identity shuffles for SVE targets. (PR #165375)

Pawan Nirpal via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 18 09:41:16 PST 2025


https://github.com/pawan-nirpal-031 updated https://github.com/llvm/llvm-project/pull/165375

>From 70bf624b5a1c129b027ed0892b567349c44d57fb Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pnirpal at qti.qualcomm.com>
Date: Tue, 28 Oct 2025 17:22:03 +0530
Subject: [PATCH 1/6] [AArch64] - Improve costing for Identity shuffles for SVE
 targets.

Identity shuffle masks on fixed-length vectors can be treated as free when SVE
(or streaming SVE) is available. This allows more aggressive vector combines
for shuffles with identity masks.
---
 .../AArch64/AArch64TargetTransformInfo.cpp    |   9 +
 .../AArch64/identity-shuffle-sve.ll           | 322 ++++++++++++++++++
 2 files changed, 331 insertions(+)
 create mode 100644 llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 490f6391c15a0..2f0bf300f8a8f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5782,6 +5782,15 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
         VTy->getPrimitiveSizeInBits() / AArch64::SVEBitsPerBlock;
     unsigned SegmentElts = VTy->getNumElements() / Segments;
 
+    // Check for identity masks when SVE is available, which we can treat as
+    // free.
+    if (LT.second.isFixedLengthVector() && ST->isSVEorStreamingSVEAvailable() &&
+        (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
+        all_of(enumerate(Mask), [](const auto &M) {
+          return M.value() < 0 || M.value() == (int)M.index();
+        }))
+      return 0;
+
     // dupq zd.t, zn.t[idx]
     if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
         ST->isSVEorStreamingSVEAvailable() &&
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll b/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll
new file mode 100644
index 0000000000000..f46a10906feb9
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll
@@ -0,0 +1,322 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=vector-combine -mtriple=aarch64-unknown-linux-gnu -S %s | FileCheck %s
+; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output | FileCheck %s --check-prefix=COST
+target triple = "aarch64-unknown-linux-gnu"
+
+ at global = external hidden global [64 x i8], align 1
+ at global.1 = external local_unnamed_addr global ptr, align 8
+ at global.2 = external hidden unnamed_addr constant [10 x i8], align 1
+ at global.3 = external hidden unnamed_addr constant [30 x i8], align 1
+ at global.4 = external hidden unnamed_addr constant [80 x i8], align 1
+ at global.5 = external global i32, align 4
+
+; Function Attrs: nounwind uwtable vscale_range(1,16)
+define dso_local i32 @ham(ptr noundef %arg, i32 noundef %arg1, ptr noundef %arg2) local_unnamed_addr #0 {
+  ; CHECK-LABEL: define dso_local i32 @ham(
+  ; CHECK-SAME: ptr noundef [[ARG:%.*]], i32 noundef [[ARG1:%.*]], ptr noundef [[ARG2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+  ; CHECK-NEXT:  [[BB:.*:]]
+  ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4
+  ; CHECK-NEXT:    [[ALLOCA3:%.*]] = alloca i32, align 4
+  ; CHECK-NEXT:    [[ALLOCA4:%.*]] = alloca i32, align 4
+  ; CHECK-NEXT:    [[ALLOCA5:%.*]] = alloca i32, align 4
+  ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr nonnull [[ALLOCA]]) #[[ATTR4:[0-9]+]]
+  ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr nonnull [[ALLOCA3]]) #[[ATTR4]]
+  ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr nonnull [[ALLOCA4]]) #[[ATTR4]]
+  ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr nonnull [[ALLOCA5]]) #[[ATTR4]]
+  ; CHECK-NEXT:    tail call void @zot() #[[ATTR4]]
+  ; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @pluto(i32 noundef [[ARG1]], ptr noundef [[ARG2]]) #[[ATTR4]]
+  ; CHECK-NEXT:    [[LOAD:%.*]] = load ptr, ptr @global.1, align 8, !tbaa [[TBAA5:![0-9]+]]
+  ; CHECK-NEXT:    [[ICMP:%.*]] = icmp eq ptr [[LOAD]], null
+  ; CHECK-NEXT:    br i1 [[ICMP]], label %[[BB9:.*]], label %[[BB6:.*]]
+  ; CHECK:       [[BB6]]:
+  ; CHECK-NEXT:    [[LOAD7:%.*]] = load i8, ptr [[LOAD]], align 1, !tbaa [[TBAA10:![0-9]+]]
+  ; CHECK-NEXT:    [[ICMP8:%.*]] = icmp eq i8 [[LOAD7]], 0
+  ; CHECK-NEXT:    br i1 [[ICMP8]], label %[[BB9]], label %[[BB11:.*]]
+  ; CHECK:       [[BB9]]:
+  ; CHECK-NEXT:    [[CALL10:%.*]] = tail call ptr @pluto.9(ptr noundef nonnull @global.2, ptr noundef nonnull @global.3, i32 noundef 218) #[[ATTR4]]
+  ; CHECK-NEXT:    store ptr [[CALL10]], ptr @global.1, align 8, !tbaa [[TBAA5]]
+  ; CHECK-NEXT:    br label %[[BB11]]
+  ; CHECK:       [[BB11]]:
+  ; CHECK-NEXT:    [[PHI:%.*]] = phi ptr [ [[CALL10]], %[[BB9]] ], [ [[LOAD]], %[[BB6]] ]
+  ; CHECK-NEXT:    [[CALL12:%.*]] = call ptr @baz(ptr noundef [[PHI]], ptr noundef nonnull [[ALLOCA]], ptr noundef nonnull [[ALLOCA3]], ptr noundef nonnull [[ALLOCA4]], ptr noundef nonnull [[ALLOCA5]]) #[[ATTR4]]
+  ; CHECK-NEXT:    [[LOAD13:%.*]] = load i32, ptr [[ALLOCA5]], align 4, !tbaa [[TBAA11:![0-9]+]]
+  ; CHECK-NEXT:    [[ICMP14:%.*]] = icmp eq i32 [[LOAD13]], 3
+  ; CHECK-NEXT:    br i1 [[ICMP14]], label %[[BB17:.*]], label %[[BB15:.*]]
+  ; CHECK:       [[BB15]]:
+  ; CHECK:          br label %[[BB17]]
+  ; CHECK:         [[BB17]]:
+  ; CHECK-NEXT:    [[LOAD18:%.*]] = load i32, ptr [[ALLOCA]], align 4, !tbaa [[TBAA11]]
+  ; CHECK-NEXT:    [[LOAD19:%.*]] = load i32, ptr [[ALLOCA3]], align 4, !tbaa [[TBAA11]]
+  ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[LOAD19]], [[LOAD18]]
+  ; CHECK-NEXT:    [[MUL20:%.*]] = mul i32 [[MUL]], 3
+  ; CHECK-NEXT:    [[ZEXT:%.*]] = zext i32 [[MUL20]] to i64
+  ; CHECK-NEXT:    [[CALL21:%.*]] = call ptr @zot.11(i64 noundef [[ZEXT]], ptr noundef nonnull @global.3, i32 noundef 228) #[[ATTR4]]
+  ; CHECK-NEXT:    call void @wombat() #[[ATTR4]]
+  ; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG]], i64 144
+  ; CHECK-NEXT:    [[LOAD22:%.*]] = load i32, ptr [[GETELEMENTPTR]], align 8, !tbaa [[TBAA13:![0-9]+]]
+  ; CHECK-NEXT:    [[ICMP23:%.*]] = icmp eq i32 [[LOAD22]], 0
+  ; CHECK-NEXT:    br i1 [[ICMP23]], label %[[BB224:.*]], label %[[BB24:.*]]
+  ; CHECK:       [[BB24]]:
+  ; CHECK-NEXT:    [[ICMP25:%.*]] = icmp eq i32 [[MUL]], 0
+  ; CHECK-NEXT:    br i1 [[ICMP25]], label %[[BB205:.*]], label %[[BB26:.*]]
+  ; CHECK:       [[BB26]]:
+  ; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL]], -1
+  ; CHECK-NEXT:    [[ZEXT27:%.*]] = zext i32 [[ADD]] to i64
+  ; CHECK-NEXT:    [[MUL28:%.*]] = mul nuw nsw i64 [[ZEXT27]], 3
+  ; CHECK-NEXT:    [[ADD29:%.*]] = add nuw nsw i64 [[MUL28]], 3
+  ; CHECK-NEXT:    [[GETELEMENTPTR30:%.*]] = getelementptr i8, ptr [[CALL21]], i64 [[ADD29]]
+  ; CHECK-NEXT:    [[GETELEMENTPTR31:%.*]] = getelementptr i8, ptr [[CALL12]], i64 [[ADD29]]
+  ; CHECK-NEXT:    [[ZEXT32:%.*]] = zext i32 [[MUL]] to i64
+  ; CHECK-NEXT:    br label %[[BB33:.*]]
+  ; CHECK:       [[BB33]]:
+  ; CHECK-NEXT:    [[PHI34:%.*]] = phi i64 [ [[ADD201:%.*]], %[[BB191:.*]] ], [ 0, %[[BB26]] ]
+  ; CHECK-NEXT:    [[LOAD35:%.*]] = load volatile i32, ptr @global.5, align 4, !tbaa [[TBAA11]]
+  ; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[LOAD35]] to i8
+  ; CHECK-NEXT:    [[LOAD36:%.*]] = load volatile i32, ptr @global.5, align 4, !tbaa [[TBAA11]]
+  ; CHECK-NEXT:    [[SEXT:%.*]] = sext i32 [[LOAD36]] to i64
+  ; CHECK-NEXT:    [[GETELEMENTPTR37:%.*]] = getelementptr inbounds i8, ptr [[CALL21]], i64 [[SEXT]]
+  ; CHECK-NEXT:    store i8 [[TRUNC]], ptr [[GETELEMENTPTR37]], align 1, !tbaa [[TBAA10]]
+  ; CHECK-NEXT:    [[ICMP38:%.*]] = icmp ult i32 [[MUL]], 8
+  ; CHECK-NEXT:    br i1 [[ICMP38]], label %[[BB149:.*]], label %[[BB39:.*]]
+  ; CHECK:       [[BB39]]:
+  ; CHECK-NEXT:    [[ICMP40:%.*]] = icmp ult ptr [[CALL21]], [[GETELEMENTPTR31]]
+  ; CHECK-NEXT:    [[ICMP41:%.*]] = icmp ult ptr [[CALL12]], [[GETELEMENTPTR30]]
+  ; CHECK-NEXT:    [[AND42:%.*]] = and i1 [[ICMP40]], [[ICMP41]]
+  ; CHECK-NEXT:    br i1 [[AND42]], label %[[BB149]], label %[[BB43:.*]]
+  ; CHECK:       [[BB43]]:
+  ; CHECK-NEXT:    [[ICMP44:%.*]] = icmp ult i32 [[MUL]], 16
+  ; CHECK-NEXT:    br i1 [[ICMP44]], label %[[BB97:.*]], label %[[BB45:.*]]
+  ; CHECK:       [[BB45]]:
+  ; CHECK:         br label %[[BB48:.*]]
+  ; CHECK:       [[BB48]]:
+  ; CHECK-NEXT:    [[PHI49:%.*]] = phi i64 [ 0, %[[BB45]] ], [ [[ADD86:%.*]], %[[BB48]] ]
+  ; CHECK-NEXT:    [[MUL50:%.*]] = mul i64 [[PHI49]], 3
+  ; CHECK-NEXT:    [[GETELEMENTPTR51:%.*]] = getelementptr i8, ptr [[CALL21]], i64 [[MUL50]]
+  ; CHECK-NEXT:    [[MUL52:%.*]] = mul i64 [[PHI49]], 3
+  ; CHECK-NEXT:    [[GETELEMENTPTR53:%.*]] = getelementptr i8, ptr [[CALL12]], i64 [[MUL52]]
+  ; CHECK-NEXT:    [[LOAD54:%.*]] = load <48 x i8>, ptr [[GETELEMENTPTR53]], align 1, !tbaa [[TBAA10]], !alias.scope [[META18:![0-9]+]]
+  ; CHECK-NEXT:    [[SHUFFLEVECTOR:%.*]] = shufflevector <48 x i8> [[LOAD54]], <48 x i8> poison, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+  ; CHECK-NEXT:    [[SHUFFLEVECTOR55:%.*]] = shufflevector <48 x i8> [[LOAD54]], <48 x i8> poison, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+  ; CHECK-NEXT:    [[SHUFFLEVECTOR56:%.*]] = shufflevector <48 x i8> [[LOAD54]], <48 x i8> poison, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+  ; CHECK-NEXT:    [[ZEXT57:%.*]] = zext <16 x i8> [[SHUFFLEVECTOR]] to <16 x i32>
+  ; CHECK-NEXT:    [[ZEXT59:%.*]] = zext <16 x i8> [[SHUFFLEVECTOR55]] to <16 x i32>
+  ; CHECK-NEXT:    [[ZEXT61:%.*]] = zext <16 x i8> [[SHUFFLEVECTOR56]] to <16 x i32>
+  ; CHECK-NEXT:    [[MUL75:%.*]] = mul nuw nsw <16 x i32> [[ZEXT57]], splat (i32 13282)
+  ; CHECK-NEXT:    [[MUL76:%.*]] = mul nuw <16 x i32> [[ZEXT59]], splat (i32 16744449)
+  ; CHECK-NEXT:    [[MUL77:%.*]] = mul nuw nsw <16 x i32> [[ZEXT61]], splat (i32 19485)
+  ; CHECK-NEXT:    [[ADD78:%.*]] = add nuw nsw <16 x i32> [[MUL75]], splat (i32 32768)
+  ; CHECK-NEXT:    [[ADD79:%.*]] = add nuw <16 x i32> [[ADD78]], [[MUL76]]
+  ; CHECK-NEXT:    [[ADD80:%.*]] = add nuw <16 x i32> [[ADD79]], [[MUL77]]
+  ; CHECK-NEXT:    [[LSHR81:%.*]] = lshr <16 x i32> [[ADD80]], splat (i32 16)
+  ; CHECK-NEXT:    [[TRUNC82:%.*]] = trunc <16 x i32> [[LSHR81]] to <16 x i8>
+  ; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[ZEXT57]], <16 x i32> [[ZEXT57]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw <32 x i32> [[TMP0]], <i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw <32 x i32> [[TMP1]], splat (i32 32768)
+  ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[ZEXT59]], <16 x i32> [[ZEXT59]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw <32 x i32> [[TMP3]], <i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097>
+  ; CHECK-NEXT:    [[TMP5:%.*]] = add nuw <32 x i32> [[TMP2]], [[TMP4]]
+  ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i32> [[ZEXT61]], <16 x i32> [[ZEXT61]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw <32 x i32> [[TMP6]], <i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568>
+  
+  ; COST: Cost Model: Found costs of 0 for:   %shufflevector84 = shufflevector <16 x i8> %trunc82, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+bb:
+  %alloca = alloca i32, align 4
+  %alloca3 = alloca i32, align 4
+  %alloca4 = alloca i32, align 4
+  %alloca5 = alloca i32, align 4
+  call void @llvm.lifetime.start.p0(ptr nonnull %alloca) #4
+  call void @llvm.lifetime.start.p0(ptr nonnull %alloca3) #4
+  call void @llvm.lifetime.start.p0(ptr nonnull %alloca4) #4
+  call void @llvm.lifetime.start.p0(ptr nonnull %alloca5) #4
+  tail call void @zot() #4
+  %call = tail call i32 @pluto(i32 noundef %arg1, ptr noundef %arg2) #4
+  %load = load ptr, ptr @global.1, align 8, !tbaa !5
+  %icmp = icmp eq ptr %load, null
+  br i1 %icmp, label %bb9, label %bb6
+
+bb6:                                              ; preds = %bb
+  %load7 = load i8, ptr %load, align 1, !tbaa !10
+  %icmp8 = icmp eq i8 %load7, 0
+  br i1 %icmp8, label %bb9, label %bb11
+
+bb9:                                              ; preds = %bb6, %bb
+  %call10 = tail call ptr @pluto.9(ptr noundef nonnull @global.2, ptr noundef nonnull @global.3, i32 noundef 218) #4
+  store ptr %call10, ptr @global.1, align 8, !tbaa !5
+  br label %bb11
+
+bb11:                                             ; preds = %bb9, %bb6
+  %phi = phi ptr [ %call10, %bb9 ], [ %load, %bb6 ]
+  %call12 = call ptr @baz(ptr noundef %phi, ptr noundef nonnull %alloca, ptr noundef nonnull %alloca3, ptr noundef nonnull %alloca4, ptr noundef nonnull %alloca5) #4
+  %load13 = load i32, ptr %alloca5, align 4, !tbaa !11
+  %icmp14 = icmp eq i32 %load13, 3
+  br i1 %icmp14, label %bb17, label %bb15
+
+bb15:                                             ; preds = %bb11
+  %load16 = load ptr, ptr @global.1, align 8, !tbaa !5
+  call void (i32, ptr, ...) @ham.10(i32 noundef 1, ptr noundef nonnull @global.4, ptr noundef %load16, i32 noundef %load13) #4
+  br label %bb17
+
+bb17:                                             ; preds = %bb15, %bb11
+  %load18 = load i32, ptr %alloca, align 4, !tbaa !11
+  %load19 = load i32, ptr %alloca3, align 4, !tbaa !11
+  %mul = mul nsw i32 %load19, %load18
+  %mul20 = mul i32 %mul, 3
+  %zext = zext i32 %mul20 to i64
+  %call21 = call ptr @zot.11(i64 noundef %zext, ptr noundef nonnull @global.3, i32 noundef 228) #4
+  call void @wombat() #4
+  %getelementptr = getelementptr inbounds nuw i8, ptr %arg, i64 144
+  %load22 = load i32, ptr %getelementptr, align 8, !tbaa !13
+  %icmp23 = icmp eq i32 %load22, 0
+  br i1 %icmp23, label %bb205, label %bb24
+
+bb24:                                             ; preds = %bb17
+  %icmp25 = icmp eq i32 %mul, 0
+  br i1 %icmp25, label %bb205, label %bb26
+
+bb26:                                             ; preds = %bb24
+  %add = add i32 %mul, -1
+  %zext27 = zext i32 %add to i64
+  %mul28 = mul nuw nsw i64 %zext27, 3
+  %add29 = add nuw nsw i64 %mul28, 3
+  %getelementptr30 = getelementptr i8, ptr %call21, i64 %add29
+  %getelementptr31 = getelementptr i8, ptr %call12, i64 %add29
+  %zext32 = zext i32 %mul to i64
+  br label %bb33
+
+bb33:                                             ; preds = %bb191, %bb26
+  %phi34 = phi i64 [ %add201, %bb191 ], [ 0, %bb26 ]
+  %load35 = load volatile i32, ptr @global.5, align 4, !tbaa !11
+  %trunc = trunc i32 %load35 to i8
+  %load36 = load volatile i32, ptr @global.5, align 4, !tbaa !11
+  %sext = sext i32 %load36 to i64
+  %getelementptr37 = getelementptr inbounds i8, ptr %call21, i64 %sext
+  store i8 %trunc, ptr %getelementptr37, align 1, !tbaa !10
+  %icmp38 = icmp ult i32 %mul, 8
+  br i1 %icmp38, label %bb191, label %bb39
+
+bb39:                                             ; preds = %bb33
+  %icmp40 = icmp ult ptr %call21, %getelementptr31
+  %icmp41 = icmp ult ptr %call12, %getelementptr30
+  %and42 = and i1 %icmp40, %icmp41
+  br i1 %and42, label %bb191, label %bb43
+
+bb43:                                             ; preds = %bb39
+  %icmp44 = icmp ult i32 %mul, 16
+  br i1 %icmp44, label %bb191, label %bb45
+
+bb45:                                             ; preds = %bb43
+  %and47 = and i64 %zext32, 4294967280
+  br label %bb48
+
+bb48:                                             ; preds = %bb48, %bb45
+  %phi49 = phi i64 [ 0, %bb45 ], [ %add86, %bb48 ]
+  %mul50 = mul i64 %phi49, 3
+  %getelementptr51 = getelementptr i8, ptr %call21, i64 %mul50
+  %mul52 = mul i64 %phi49, 3
+  %getelementptr53 = getelementptr i8, ptr %call12, i64 %mul52
+  %load54 = load <48 x i8>, ptr %getelementptr53, align 1, !tbaa !10, !alias.scope !18
+  %shufflevector = shufflevector <48 x i8> %load54, <48 x i8> poison, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+  %shufflevector55 = shufflevector <48 x i8> %load54, <48 x i8> poison, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+  %shufflevector56 = shufflevector <48 x i8> %load54, <48 x i8> poison, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+  %zext57 = zext <16 x i8> %shufflevector to <16 x i32>
+  %mul58 = mul nuw nsw <16 x i32> %zext57, splat (i32 19595)
+  %zext59 = zext <16 x i8> %shufflevector55 to <16 x i32>
+  %mul60 = mul nuw nsw <16 x i32> %zext59, splat (i32 38470)
+  %zext61 = zext <16 x i8> %shufflevector56 to <16 x i32>
+  %mul62 = mul nuw nsw <16 x i32> %zext61, splat (i32 7471)
+  %add63 = add nuw nsw <16 x i32> %mul58, splat (i32 32768)
+  %add64 = add nuw nsw <16 x i32> %add63, %mul60
+  %add65 = add nuw nsw <16 x i32> %add64, %mul62
+  %lshr = lshr <16 x i32> %add65, splat (i32 16)
+  %trunc66 = trunc nuw <16 x i32> %lshr to <16 x i8>
+  %mul67 = mul nuw nsw <16 x i32> %zext57, splat (i32 32767)
+  %mul68 = mul nuw <16 x i32> %zext59, splat (i32 16762097)
+  %mul69 = mul nuw <16 x i32> %zext61, splat (i32 16759568)
+  %add70 = add nuw nsw <16 x i32> %mul67, splat (i32 32768)
+  %add71 = add nuw <16 x i32> %add70, %mul68
+  %add72 = add <16 x i32> %add71, %mul69
+  %lshr73 = lshr <16 x i32> %add72, splat (i32 16)
+  %trunc74 = trunc <16 x i32> %lshr73 to <16 x i8>
+  %mul75 = mul nuw nsw <16 x i32> %zext57, splat (i32 13282)
+  %mul76 = mul nuw <16 x i32> %zext59, splat (i32 16744449)
+  %mul77 = mul nuw nsw <16 x i32> %zext61, splat (i32 19485)
+  %add78 = add nuw nsw <16 x i32> %mul75, splat (i32 32768)
+  %add79 = add nuw <16 x i32> %add78, %mul76
+  %add80 = add nuw <16 x i32> %add79, %mul77
+  %lshr81 = lshr <16 x i32> %add80, splat (i32 16)
+  %trunc82 = trunc <16 x i32> %lshr81 to <16 x i8>
+  %shufflevector83 = shufflevector <16 x i8> %trunc66, <16 x i8> %trunc74, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %shufflevector84 = shufflevector <16 x i8> %trunc82, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %shufflevector85 = shufflevector <32 x i8> %shufflevector83, <32 x i8> %shufflevector84, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+  store <48 x i8> %shufflevector85, ptr %getelementptr51, align 1, !tbaa !10, !alias.scope !21, !noalias !18
+  %add86 = add nuw i64 %phi49, 16
+  %icmp87 = icmp eq i64 %add86, %and47
+  br i1 %icmp87, label %bb88, label %bb48, !llvm.loop !23
+
+bb88:                                             ; preds = %bb48
+  %icmp89 = icmp eq i64 %and47, %zext32
+  br label %bb191
+
+bb191:                                            ; preds = %bb88
+  %add201 = add nuw nsw i64 %phi34, 1
+  %load202 = load i32, ptr %getelementptr, align 8, !tbaa !13
+  %zext203 = zext i32 %load202 to i64
+  %icmp204 = icmp samesign ult i64 %add201, %zext203
+  br i1 %icmp204, label %bb33, label %bb205
+
+bb205:                                            ; preds = %bb205, %bb24
+  ret i32 0
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.start.p0(ptr captures(none)) #1
+
+declare void @zot() local_unnamed_addr #2
+
+declare i32 @pluto(i32 noundef, ptr noundef) local_unnamed_addr #2
+
+declare ptr @pluto.9(ptr noundef, ptr noundef, i32 noundef) local_unnamed_addr #2
+
+declare ptr @baz(ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef) local_unnamed_addr #2
+
+declare void @ham.10(i32 noundef, ptr noundef, ...) local_unnamed_addr #2
+
+declare ptr @zot.11(i64 noundef, ptr noundef, i32 noundef) local_unnamed_addr #2
+
+declare void @wombat() local_unnamed_addr #2
+
+
+attributes #0 = { nounwind uwtable vscale_range(1,16) "target-cpu"="cortex-a57" "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a,-fmv"}
+attributes #2 = { "target-cpu"="cortex-a57" "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a,-fmv" }
+attributes #4 = { nounwind }
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"PIE Level", i32 2}
+!3 = !{i32 7, !"uwtable", i32 2}
+!4 = !{i32 7, !"frame-pointer", i32 1}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"p1 omnipotent char", !7, i64 0}
+!7 = !{!"any pointer", !8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C/C++ TBAA"}
+!10 = !{!8, !8, i64 0}
+!11 = !{!12, !12, i64 0}
+!12 = !{!"int", !8, i64 0}
+!13 = !{!14, !12, i64 144}
+!14 = !{!"TCDef", !8, i64 0, !8, i64 16, !8, i64 32, !8, i64 48, !8, i64 64, !15, i64 128, !16, i64 130, !16, i64 134, !16, i64 138, !12, i64 144, !12, i64 148, !12, i64 152, !15, i64 156, !17, i64 160, !17, i64 168, !17, i64 176, !17, i64 184}
+!15 = !{!"short", !8, i64 0}
+!16 = !{!"", !8, i64 0, !8, i64 1, !8, i64 2, !8, i64 3}
+!17 = !{!"long", !8, i64 0}
+!18 = !{!19}
+!19 = distinct !{!19, !20}
+!20 = distinct !{!20, !"LVerDomain"}
+!21 = !{!22}
+!22 = distinct !{!22, !20}
+!23 = distinct !{!23, !24, !25}
+!24 = !{!"llvm.loop.isvectorized", i32 1}
+!25 = !{!"llvm.loop.unroll.runtime.disable"}
+!26 = !{!"branch_weights", i32 8, i32 8}
+!27 = distinct !{!27, !24, !25}
+!28 = distinct !{!28, !24}
+

>From d2f6cc3c5540b38a58bc594987c4d728f7a8a173 Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pawannirpal at gmail.com>
Date: Mon, 3 Nov 2025 22:23:40 +0530
Subject: [PATCH 2/6] Update identity-shuffle-sve.ll

---
 .../AArch64/identity-shuffle-sve.ll           | 287 ++----------------
 1 file changed, 18 insertions(+), 269 deletions(-)

diff --git a/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll b/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll
index f46a10906feb9..45413591bbc96 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll
@@ -3,226 +3,30 @@
 ; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output | FileCheck %s --check-prefix=COST
 target triple = "aarch64-unknown-linux-gnu"
 
- at global = external hidden global [64 x i8], align 1
- at global.1 = external local_unnamed_addr global ptr, align 8
- at global.2 = external hidden unnamed_addr constant [10 x i8], align 1
- at global.3 = external hidden unnamed_addr constant [30 x i8], align 1
- at global.4 = external hidden unnamed_addr constant [80 x i8], align 1
- at global.5 = external global i32, align 4
+define i32 @ham(ptr %call12) local_unnamed_addr #0 {
+; CHECK-LABEL: define i32 @ham(
+; CHECK-SAME: ptr [[CALL12:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK:    [[TMP2:%.*]] = mul nuw nsw <32 x i32> [[TMP1:%.*]], <i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
 
-; Function Attrs: nounwind uwtable vscale_range(1,16)
-define dso_local i32 @ham(ptr noundef %arg, i32 noundef %arg1, ptr noundef %arg2) local_unnamed_addr #0 {
-  ; CHECK-LABEL: define dso_local i32 @ham(
-  ; CHECK-SAME: ptr noundef [[ARG:%.*]], i32 noundef [[ARG1:%.*]], ptr noundef [[ARG2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-  ; CHECK-NEXT:  [[BB:.*:]]
-  ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4
-  ; CHECK-NEXT:    [[ALLOCA3:%.*]] = alloca i32, align 4
-  ; CHECK-NEXT:    [[ALLOCA4:%.*]] = alloca i32, align 4
-  ; CHECK-NEXT:    [[ALLOCA5:%.*]] = alloca i32, align 4
-  ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr nonnull [[ALLOCA]]) #[[ATTR4:[0-9]+]]
-  ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr nonnull [[ALLOCA3]]) #[[ATTR4]]
-  ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr nonnull [[ALLOCA4]]) #[[ATTR4]]
-  ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr nonnull [[ALLOCA5]]) #[[ATTR4]]
-  ; CHECK-NEXT:    tail call void @zot() #[[ATTR4]]
-  ; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @pluto(i32 noundef [[ARG1]], ptr noundef [[ARG2]]) #[[ATTR4]]
-  ; CHECK-NEXT:    [[LOAD:%.*]] = load ptr, ptr @global.1, align 8, !tbaa [[TBAA5:![0-9]+]]
-  ; CHECK-NEXT:    [[ICMP:%.*]] = icmp eq ptr [[LOAD]], null
-  ; CHECK-NEXT:    br i1 [[ICMP]], label %[[BB9:.*]], label %[[BB6:.*]]
-  ; CHECK:       [[BB6]]:
-  ; CHECK-NEXT:    [[LOAD7:%.*]] = load i8, ptr [[LOAD]], align 1, !tbaa [[TBAA10:![0-9]+]]
-  ; CHECK-NEXT:    [[ICMP8:%.*]] = icmp eq i8 [[LOAD7]], 0
-  ; CHECK-NEXT:    br i1 [[ICMP8]], label %[[BB9]], label %[[BB11:.*]]
-  ; CHECK:       [[BB9]]:
-  ; CHECK-NEXT:    [[CALL10:%.*]] = tail call ptr @pluto.9(ptr noundef nonnull @global.2, ptr noundef nonnull @global.3, i32 noundef 218) #[[ATTR4]]
-  ; CHECK-NEXT:    store ptr [[CALL10]], ptr @global.1, align 8, !tbaa [[TBAA5]]
-  ; CHECK-NEXT:    br label %[[BB11]]
-  ; CHECK:       [[BB11]]:
-  ; CHECK-NEXT:    [[PHI:%.*]] = phi ptr [ [[CALL10]], %[[BB9]] ], [ [[LOAD]], %[[BB6]] ]
-  ; CHECK-NEXT:    [[CALL12:%.*]] = call ptr @baz(ptr noundef [[PHI]], ptr noundef nonnull [[ALLOCA]], ptr noundef nonnull [[ALLOCA3]], ptr noundef nonnull [[ALLOCA4]], ptr noundef nonnull [[ALLOCA5]]) #[[ATTR4]]
-  ; CHECK-NEXT:    [[LOAD13:%.*]] = load i32, ptr [[ALLOCA5]], align 4, !tbaa [[TBAA11:![0-9]+]]
-  ; CHECK-NEXT:    [[ICMP14:%.*]] = icmp eq i32 [[LOAD13]], 3
-  ; CHECK-NEXT:    br i1 [[ICMP14]], label %[[BB17:.*]], label %[[BB15:.*]]
-  ; CHECK:       [[BB15]]:
-  ; CHECK:          br label %[[BB17]]
-  ; CHECK:         [[BB17]]:
-  ; CHECK-NEXT:    [[LOAD18:%.*]] = load i32, ptr [[ALLOCA]], align 4, !tbaa [[TBAA11]]
-  ; CHECK-NEXT:    [[LOAD19:%.*]] = load i32, ptr [[ALLOCA3]], align 4, !tbaa [[TBAA11]]
-  ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[LOAD19]], [[LOAD18]]
-  ; CHECK-NEXT:    [[MUL20:%.*]] = mul i32 [[MUL]], 3
-  ; CHECK-NEXT:    [[ZEXT:%.*]] = zext i32 [[MUL20]] to i64
-  ; CHECK-NEXT:    [[CALL21:%.*]] = call ptr @zot.11(i64 noundef [[ZEXT]], ptr noundef nonnull @global.3, i32 noundef 228) #[[ATTR4]]
-  ; CHECK-NEXT:    call void @wombat() #[[ATTR4]]
-  ; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG]], i64 144
-  ; CHECK-NEXT:    [[LOAD22:%.*]] = load i32, ptr [[GETELEMENTPTR]], align 8, !tbaa [[TBAA13:![0-9]+]]
-  ; CHECK-NEXT:    [[ICMP23:%.*]] = icmp eq i32 [[LOAD22]], 0
-  ; CHECK-NEXT:    br i1 [[ICMP23]], label %[[BB224:.*]], label %[[BB24:.*]]
-  ; CHECK:       [[BB24]]:
-  ; CHECK-NEXT:    [[ICMP25:%.*]] = icmp eq i32 [[MUL]], 0
-  ; CHECK-NEXT:    br i1 [[ICMP25]], label %[[BB205:.*]], label %[[BB26:.*]]
-  ; CHECK:       [[BB26]]:
-  ; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL]], -1
-  ; CHECK-NEXT:    [[ZEXT27:%.*]] = zext i32 [[ADD]] to i64
-  ; CHECK-NEXT:    [[MUL28:%.*]] = mul nuw nsw i64 [[ZEXT27]], 3
-  ; CHECK-NEXT:    [[ADD29:%.*]] = add nuw nsw i64 [[MUL28]], 3
-  ; CHECK-NEXT:    [[GETELEMENTPTR30:%.*]] = getelementptr i8, ptr [[CALL21]], i64 [[ADD29]]
-  ; CHECK-NEXT:    [[GETELEMENTPTR31:%.*]] = getelementptr i8, ptr [[CALL12]], i64 [[ADD29]]
-  ; CHECK-NEXT:    [[ZEXT32:%.*]] = zext i32 [[MUL]] to i64
-  ; CHECK-NEXT:    br label %[[BB33:.*]]
-  ; CHECK:       [[BB33]]:
-  ; CHECK-NEXT:    [[PHI34:%.*]] = phi i64 [ [[ADD201:%.*]], %[[BB191:.*]] ], [ 0, %[[BB26]] ]
-  ; CHECK-NEXT:    [[LOAD35:%.*]] = load volatile i32, ptr @global.5, align 4, !tbaa [[TBAA11]]
-  ; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i32 [[LOAD35]] to i8
-  ; CHECK-NEXT:    [[LOAD36:%.*]] = load volatile i32, ptr @global.5, align 4, !tbaa [[TBAA11]]
-  ; CHECK-NEXT:    [[SEXT:%.*]] = sext i32 [[LOAD36]] to i64
-  ; CHECK-NEXT:    [[GETELEMENTPTR37:%.*]] = getelementptr inbounds i8, ptr [[CALL21]], i64 [[SEXT]]
-  ; CHECK-NEXT:    store i8 [[TRUNC]], ptr [[GETELEMENTPTR37]], align 1, !tbaa [[TBAA10]]
-  ; CHECK-NEXT:    [[ICMP38:%.*]] = icmp ult i32 [[MUL]], 8
-  ; CHECK-NEXT:    br i1 [[ICMP38]], label %[[BB149:.*]], label %[[BB39:.*]]
-  ; CHECK:       [[BB39]]:
-  ; CHECK-NEXT:    [[ICMP40:%.*]] = icmp ult ptr [[CALL21]], [[GETELEMENTPTR31]]
-  ; CHECK-NEXT:    [[ICMP41:%.*]] = icmp ult ptr [[CALL12]], [[GETELEMENTPTR30]]
-  ; CHECK-NEXT:    [[AND42:%.*]] = and i1 [[ICMP40]], [[ICMP41]]
-  ; CHECK-NEXT:    br i1 [[AND42]], label %[[BB149]], label %[[BB43:.*]]
-  ; CHECK:       [[BB43]]:
-  ; CHECK-NEXT:    [[ICMP44:%.*]] = icmp ult i32 [[MUL]], 16
-  ; CHECK-NEXT:    br i1 [[ICMP44]], label %[[BB97:.*]], label %[[BB45:.*]]
-  ; CHECK:       [[BB45]]:
-  ; CHECK:         br label %[[BB48:.*]]
-  ; CHECK:       [[BB48]]:
-  ; CHECK-NEXT:    [[PHI49:%.*]] = phi i64 [ 0, %[[BB45]] ], [ [[ADD86:%.*]], %[[BB48]] ]
-  ; CHECK-NEXT:    [[MUL50:%.*]] = mul i64 [[PHI49]], 3
-  ; CHECK-NEXT:    [[GETELEMENTPTR51:%.*]] = getelementptr i8, ptr [[CALL21]], i64 [[MUL50]]
-  ; CHECK-NEXT:    [[MUL52:%.*]] = mul i64 [[PHI49]], 3
-  ; CHECK-NEXT:    [[GETELEMENTPTR53:%.*]] = getelementptr i8, ptr [[CALL12]], i64 [[MUL52]]
-  ; CHECK-NEXT:    [[LOAD54:%.*]] = load <48 x i8>, ptr [[GETELEMENTPTR53]], align 1, !tbaa [[TBAA10]], !alias.scope [[META18:![0-9]+]]
-  ; CHECK-NEXT:    [[SHUFFLEVECTOR:%.*]] = shufflevector <48 x i8> [[LOAD54]], <48 x i8> poison, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
-  ; CHECK-NEXT:    [[SHUFFLEVECTOR55:%.*]] = shufflevector <48 x i8> [[LOAD54]], <48 x i8> poison, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
-  ; CHECK-NEXT:    [[SHUFFLEVECTOR56:%.*]] = shufflevector <48 x i8> [[LOAD54]], <48 x i8> poison, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
-  ; CHECK-NEXT:    [[ZEXT57:%.*]] = zext <16 x i8> [[SHUFFLEVECTOR]] to <16 x i32>
-  ; CHECK-NEXT:    [[ZEXT59:%.*]] = zext <16 x i8> [[SHUFFLEVECTOR55]] to <16 x i32>
-  ; CHECK-NEXT:    [[ZEXT61:%.*]] = zext <16 x i8> [[SHUFFLEVECTOR56]] to <16 x i32>
-  ; CHECK-NEXT:    [[MUL75:%.*]] = mul nuw nsw <16 x i32> [[ZEXT57]], splat (i32 13282)
-  ; CHECK-NEXT:    [[MUL76:%.*]] = mul nuw <16 x i32> [[ZEXT59]], splat (i32 16744449)
-  ; CHECK-NEXT:    [[MUL77:%.*]] = mul nuw nsw <16 x i32> [[ZEXT61]], splat (i32 19485)
-  ; CHECK-NEXT:    [[ADD78:%.*]] = add nuw nsw <16 x i32> [[MUL75]], splat (i32 32768)
-  ; CHECK-NEXT:    [[ADD79:%.*]] = add nuw <16 x i32> [[ADD78]], [[MUL76]]
-  ; CHECK-NEXT:    [[ADD80:%.*]] = add nuw <16 x i32> [[ADD79]], [[MUL77]]
-  ; CHECK-NEXT:    [[LSHR81:%.*]] = lshr <16 x i32> [[ADD80]], splat (i32 16)
-  ; CHECK-NEXT:    [[TRUNC82:%.*]] = trunc <16 x i32> [[LSHR81]] to <16 x i8>
-  ; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i32> [[ZEXT57]], <16 x i32> [[ZEXT57]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-  ; CHECK-NEXT:    [[TMP1:%.*]] = mul nuw nsw <32 x i32> [[TMP0]], <i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
-  ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw <32 x i32> [[TMP1]], splat (i32 32768)
-  ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[ZEXT59]], <16 x i32> [[ZEXT59]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-  ; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw <32 x i32> [[TMP3]], <i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097>
-  ; CHECK-NEXT:    [[TMP5:%.*]] = add nuw <32 x i32> [[TMP2]], [[TMP4]]
-  ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i32> [[ZEXT61]], <16 x i32> [[ZEXT61]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-  ; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw <32 x i32> [[TMP6]], <i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568>
-  
-  ; COST: Cost Model: Found costs of 0 for:   %shufflevector84 = shufflevector <16 x i8> %trunc82, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-bb:
-  %alloca = alloca i32, align 4
-  %alloca3 = alloca i32, align 4
-  %alloca4 = alloca i32, align 4
-  %alloca5 = alloca i32, align 4
-  call void @llvm.lifetime.start.p0(ptr nonnull %alloca) #4
-  call void @llvm.lifetime.start.p0(ptr nonnull %alloca3) #4
-  call void @llvm.lifetime.start.p0(ptr nonnull %alloca4) #4
-  call void @llvm.lifetime.start.p0(ptr nonnull %alloca5) #4
-  tail call void @zot() #4
-  %call = tail call i32 @pluto(i32 noundef %arg1, ptr noundef %arg2) #4
-  %load = load ptr, ptr @global.1, align 8, !tbaa !5
-  %icmp = icmp eq ptr %load, null
-  br i1 %icmp, label %bb9, label %bb6
-
-bb6:                                              ; preds = %bb
-  %load7 = load i8, ptr %load, align 1, !tbaa !10
-  %icmp8 = icmp eq i8 %load7, 0
-  br i1 %icmp8, label %bb9, label %bb11
-
-bb9:                                              ; preds = %bb6, %bb
-  %call10 = tail call ptr @pluto.9(ptr noundef nonnull @global.2, ptr noundef nonnull @global.3, i32 noundef 218) #4
-  store ptr %call10, ptr @global.1, align 8, !tbaa !5
-  br label %bb11
-
-bb11:                                             ; preds = %bb9, %bb6
-  %phi = phi ptr [ %call10, %bb9 ], [ %load, %bb6 ]
-  %call12 = call ptr @baz(ptr noundef %phi, ptr noundef nonnull %alloca, ptr noundef nonnull %alloca3, ptr noundef nonnull %alloca4, ptr noundef nonnull %alloca5) #4
-  %load13 = load i32, ptr %alloca5, align 4, !tbaa !11
-  %icmp14 = icmp eq i32 %load13, 3
-  br i1 %icmp14, label %bb17, label %bb15
-
-bb15:                                             ; preds = %bb11
-  %load16 = load ptr, ptr @global.1, align 8, !tbaa !5
-  call void (i32, ptr, ...) @ham.10(i32 noundef 1, ptr noundef nonnull @global.4, ptr noundef %load16, i32 noundef %load13) #4
-  br label %bb17
-
-bb17:                                             ; preds = %bb15, %bb11
-  %load18 = load i32, ptr %alloca, align 4, !tbaa !11
-  %load19 = load i32, ptr %alloca3, align 4, !tbaa !11
-  %mul = mul nsw i32 %load19, %load18
-  %mul20 = mul i32 %mul, 3
-  %zext = zext i32 %mul20 to i64
-  %call21 = call ptr @zot.11(i64 noundef %zext, ptr noundef nonnull @global.3, i32 noundef 228) #4
-  call void @wombat() #4
-  %getelementptr = getelementptr inbounds nuw i8, ptr %arg, i64 144
-  %load22 = load i32, ptr %getelementptr, align 8, !tbaa !13
-  %icmp23 = icmp eq i32 %load22, 0
-  br i1 %icmp23, label %bb205, label %bb24
+; CHECK:    [[TMP5:%.*]] = mul nuw <32 x i32> [[TMP4:%.*]], <i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097>
 
-bb24:                                             ; preds = %bb17
-  %icmp25 = icmp eq i32 %mul, 0
-  br i1 %icmp25, label %bb205, label %bb26
+; CHECK:    [[TMP8:%.*]] = mul nuw <32 x i32> [[TMP7:%.*]], <i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568>
 
-bb26:                                             ; preds = %bb24
-  %add = add i32 %mul, -1
-  %zext27 = zext i32 %add to i64
-  %mul28 = mul nuw nsw i64 %zext27, 3
-  %add29 = add nuw nsw i64 %mul28, 3
-  %getelementptr30 = getelementptr i8, ptr %call21, i64 %add29
-  %getelementptr31 = getelementptr i8, ptr %call12, i64 %add29
-  %zext32 = zext i32 %mul to i64
-  br label %bb33
-
-bb33:                                             ; preds = %bb191, %bb26
-  %phi34 = phi i64 [ %add201, %bb191 ], [ 0, %bb26 ]
-  %load35 = load volatile i32, ptr @global.5, align 4, !tbaa !11
-  %trunc = trunc i32 %load35 to i8
-  %load36 = load volatile i32, ptr @global.5, align 4, !tbaa !11
-  %sext = sext i32 %load36 to i64
-  %getelementptr37 = getelementptr inbounds i8, ptr %call21, i64 %sext
-  store i8 %trunc, ptr %getelementptr37, align 1, !tbaa !10
-  %icmp38 = icmp ult i32 %mul, 8
-  br i1 %icmp38, label %bb191, label %bb39
-
-bb39:                                             ; preds = %bb33
-  %icmp40 = icmp ult ptr %call21, %getelementptr31
-  %icmp41 = icmp ult ptr %call12, %getelementptr30
-  %and42 = and i1 %icmp40, %icmp41
-  br i1 %and42, label %bb191, label %bb43
-
-bb43:                                             ; preds = %bb39
-  %icmp44 = icmp ult i32 %mul, 16
-  br i1 %icmp44, label %bb191, label %bb45
-
-bb45:                                             ; preds = %bb43
-  %and47 = and i64 %zext32, 4294967280
+; COST: Cost Model: Found costs of 0 for:   %shufflevector84 = shufflevector <16 x i8> %trunc82, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+bb:
   br label %bb48
 
-bb48:                                             ; preds = %bb48, %bb45
-  %phi49 = phi i64 [ 0, %bb45 ], [ %add86, %bb48 ]
+bb48:                                             ; preds = %bb48, %bb
+  %phi49 = phi i64 [ 0, %bb ], [ %add86, %bb48 ]
   %mul50 = mul i64 %phi49, 3
-  %getelementptr51 = getelementptr i8, ptr %call21, i64 %mul50
-  %mul52 = mul i64 %phi49, 3
-  %getelementptr53 = getelementptr i8, ptr %call12, i64 %mul52
+  %getelementptr53 = getelementptr i8, ptr %call12, i64 %mul50
   %load54 = load <48 x i8>, ptr %getelementptr53, align 1, !tbaa !10, !alias.scope !18
   %shufflevector = shufflevector <48 x i8> %load54, <48 x i8> poison, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
-  %shufflevector55 = shufflevector <48 x i8> %load54, <48 x i8> poison, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
-  %shufflevector56 = shufflevector <48 x i8> %load54, <48 x i8> poison, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
   %zext57 = zext <16 x i8> %shufflevector to <16 x i32>
   %mul58 = mul nuw nsw <16 x i32> %zext57, splat (i32 19595)
-  %zext59 = zext <16 x i8> %shufflevector55 to <16 x i32>
+  %zext59 = zext <16 x i8> %shufflevector to <16 x i32>
   %mul60 = mul nuw nsw <16 x i32> %zext59, splat (i32 38470)
-  %zext61 = zext <16 x i8> %shufflevector56 to <16 x i32>
+  %zext61 = zext <16 x i8> %shufflevector to <16 x i32>
   %mul62 = mul nuw nsw <16 x i32> %zext61, splat (i32 7471)
   %add63 = add nuw nsw <16 x i32> %mul58, splat (i32 32768)
   %add64 = add nuw nsw <16 x i32> %add63, %mul60
@@ -247,76 +51,21 @@ bb48:                                             ; preds = %bb48, %bb45
   %trunc82 = trunc <16 x i32> %lshr81 to <16 x i8>
   %shufflevector83 = shufflevector <16 x i8> %trunc66, <16 x i8> %trunc74, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %shufflevector84 = shufflevector <16 x i8> %trunc82, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %shufflevector85 = shufflevector <32 x i8> %shufflevector83, <32 x i8> %shufflevector84, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
-  store <48 x i8> %shufflevector85, ptr %getelementptr51, align 1, !tbaa !10, !alias.scope !21, !noalias !18
+  store <32 x i8> %shufflevector83, ptr %getelementptr53, align 1, !tbaa !10,  !noalias !18
   %add86 = add nuw i64 %phi49, 16
-  %icmp87 = icmp eq i64 %add86, %and47
-  br i1 %icmp87, label %bb88, label %bb48, !llvm.loop !23
-
-bb88:                                             ; preds = %bb48
-  %icmp89 = icmp eq i64 %and47, %zext32
-  br label %bb191
+  %icmp87 = icmp eq i64 %add86, %mul50
+  br i1 %icmp87, label %bb205, label %bb48
 
-bb191:                                            ; preds = %bb88
-  %add201 = add nuw nsw i64 %phi34, 1
-  %load202 = load i32, ptr %getelementptr, align 8, !tbaa !13
-  %zext203 = zext i32 %load202 to i64
-  %icmp204 = icmp samesign ult i64 %add201, %zext203
-  br i1 %icmp204, label %bb33, label %bb205
-
-bb205:                                            ; preds = %bb205, %bb24
+bb205:                                            ; preds = %bb48, %bb
   ret i32 0
 }
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
-declare void @llvm.lifetime.start.p0(ptr captures(none)) #1
-
-declare void @zot() local_unnamed_addr #2
-
-declare i32 @pluto(i32 noundef, ptr noundef) local_unnamed_addr #2
-
-declare ptr @pluto.9(ptr noundef, ptr noundef, i32 noundef) local_unnamed_addr #2
-
-declare ptr @baz(ptr noundef, ptr noundef, ptr noundef, ptr noundef, ptr noundef) local_unnamed_addr #2
-
-declare void @ham.10(i32 noundef, ptr noundef, ...) local_unnamed_addr #2
+attributes #0 = { nounwind uwtable vscale_range(1,16) "target-cpu"="cortex-a57" "target-features"="+sve"}
 
-declare ptr @zot.11(i64 noundef, ptr noundef, i32 noundef) local_unnamed_addr #2
-
-declare void @wombat() local_unnamed_addr #2
-
-
-attributes #0 = { nounwind uwtable vscale_range(1,16) "target-cpu"="cortex-a57" "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a,-fmv"}
-attributes #2 = { "target-cpu"="cortex-a57" "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a,-fmv" }
-attributes #4 = { nounwind }
-
-!0 = !{i32 1, !"wchar_size", i32 4}
-!1 = !{i32 8, !"PIC Level", i32 2}
-!2 = !{i32 7, !"PIE Level", i32 2}
-!3 = !{i32 7, !"uwtable", i32 2}
-!4 = !{i32 7, !"frame-pointer", i32 1}
-!5 = !{!6, !6, i64 0}
-!6 = !{!"p1 omnipotent char", !7, i64 0}
-!7 = !{!"any pointer", !8, i64 0}
 !8 = !{!"omnipotent char", !9, i64 0}
 !9 = !{!"Simple C/C++ TBAA"}
 !10 = !{!8, !8, i64 0}
-!11 = !{!12, !12, i64 0}
 !12 = !{!"int", !8, i64 0}
-!13 = !{!14, !12, i64 144}
-!14 = !{!"TCDef", !8, i64 0, !8, i64 16, !8, i64 32, !8, i64 48, !8, i64 64, !15, i64 128, !16, i64 130, !16, i64 134, !16, i64 138, !12, i64 144, !12, i64 148, !12, i64 152, !15, i64 156, !17, i64 160, !17, i64 168, !17, i64 176, !17, i64 184}
-!15 = !{!"short", !8, i64 0}
-!16 = !{!"", !8, i64 0, !8, i64 1, !8, i64 2, !8, i64 3}
-!17 = !{!"long", !8, i64 0}
 !18 = !{!19}
 !19 = distinct !{!19, !20}
 !20 = distinct !{!20, !"LVerDomain"}
-!21 = !{!22}
-!22 = distinct !{!22, !20}
-!23 = distinct !{!23, !24, !25}
-!24 = !{!"llvm.loop.isvectorized", i32 1}
-!25 = !{!"llvm.loop.unroll.runtime.disable"}
-!26 = !{!"branch_weights", i32 8, i32 8}
-!27 = distinct !{!27, !24, !25}
-!28 = distinct !{!28, !24}
-

>From cae9a19c91985f77f057a594d6521c37bc424619 Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pnirpal at qti.qualcomm.com>
Date: Thu, 6 Nov 2025 17:55:05 +0530
Subject: [PATCH 3/6] fix tests

---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 26 +++++++------------
 .../CostModel/AArch64/shuffle-other.ll        | 12 +++++++++
 .../AArch64/identity-shuffle-sve.ll           |  4 +--
 3 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 39b9ed1e6df52..280cc11dbdafe 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5895,6 +5895,15 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
     SrcTy = DstTy;
   }
 
+  // Check for identity masks on fixed-length vectors, which we can treat as
+  // free whether or not SVE is available.
+  if (!Mask.empty() && LT.second.isFixedLengthVector() &&
+      (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
+      all_of(enumerate(Mask), [](const auto &M) {
+        return M.value() < 0 || M.value() == (int)M.index();
+      }))
+    return 0;
+
   // Segmented shuffle matching.
   if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
       !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
@@ -5906,15 +5915,6 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
         VTy->getPrimitiveSizeInBits() / AArch64::SVEBitsPerBlock;
     unsigned SegmentElts = VTy->getNumElements() / Segments;
 
-    // Check for identity masks when SVE is available, which we can treat as
-    // free.
-    if (LT.second.isFixedLengthVector() && ST->isSVEorStreamingSVEAvailable() &&
-        (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
-        all_of(enumerate(Mask), [](const auto &M) {
-          return M.value() < 0 || M.value() == (int)M.index();
-        }))
-      return 0;
-
     // dupq zd.t, zn.t[idx]
     if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
         ST->isSVEorStreamingSVEAvailable() &&
@@ -5951,14 +5951,6 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
       all_of(Mask, [](int E) { return E < 8; }))
     return getPerfectShuffleCost(Mask);
 
-  // Check for identity masks, which we can treat as free.
-  if (!Mask.empty() && LT.second.isFixedLengthVector() &&
-      (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
-      all_of(enumerate(Mask), [](const auto &M) {
-        return M.value() < 0 || M.value() == (int)M.index();
-      }))
-    return 0;
-
   // Check for other shuffles that are not SK_ kinds but we have native
   // instructions for, for example ZIP and UZP.
   unsigned Unused;
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
index 4579acb9b3555..76be4dc4b19fb 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
@@ -490,3 +490,15 @@ define void @vst4(ptr %p) {
 
   ret void
 }
+
+define <16 x i8> @identity_shuffle_costs() #0 {
+bb:
+  ; CHECK-LABEL: 'identity_shuffle_costs'
+  ; CHECK: Cost Model: Found costs of 0 for: %shufflevector142 = shufflevector <16 x i8> %trunc125, <16 x i8> %trunc133, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %trunc125 = trunc <16 x i32> zeroinitializer to <16 x i8>
+  %trunc133 = trunc <16 x i32> zeroinitializer to <16 x i8>
+  %shufflevector142 = shufflevector <16 x i8> %trunc125, <16 x i8> %trunc133, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shufflevector142
+}
+
+attributes #0 = { "target-features"="+sve,+neon" }
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll b/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll
index 45413591bbc96..0a668f856fd3e 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -passes=vector-combine -mtriple=aarch64-unknown-linux-gnu -S %s | FileCheck %s
-; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output | FileCheck %s --check-prefix=COST
 target triple = "aarch64-unknown-linux-gnu"
 
 define i32 @ham(ptr %call12) local_unnamed_addr #0 {
@@ -12,7 +11,6 @@ define i32 @ham(ptr %call12) local_unnamed_addr #0 {
 
 ; CHECK:    [[TMP8:%.*]] = mul nuw <32 x i32> [[TMP7:%.*]], <i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 7471, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568, i32 16759568>
 
-; COST: Cost Model: Found costs of 0 for:   %shufflevector84 = shufflevector <16 x i8> %trunc82, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 bb:
   br label %bb48
 
@@ -60,7 +58,7 @@ bb205:                                            ; preds = %bb48, %bb
   ret i32 0
 }
 
-attributes #0 = { nounwind uwtable vscale_range(1,16) "target-cpu"="cortex-a57" "target-features"="+sve"}
+attributes #0 = { nounwind uwtable vscale_range(1,16) "target-cpu"="cortex-a57" "target-features"="+sve,+neon"}
 
 !8 = !{!"omnipotent char", !9, i64 0}
 !9 = !{!"Simple C/C++ TBAA"}

>From 83ed2d5e0b0e87c8e5b2d4ca9a587b26e411ff48 Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pnirpal at qti.qualcomm.com>
Date: Fri, 14 Nov 2025 10:59:10 +0530
Subject: [PATCH 4/6] test refactors

---
 .../CostModel/AArch64/shuffle-other.ll        | 12 +++++-----
 .../AArch64/identity-shuffle-sve.ll           | 22 ++++++-------------
 2 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
index 76be4dc4b19fb..255877fcdca5e 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
@@ -491,14 +491,14 @@ define void @vst4(ptr %p) {
   ret void
 }
 
-define <16 x i8> @identity_shuffle_costs() #0 {
+define void @identity_shuffle_costs() #0 {
 bb:
   ; CHECK-LABEL: 'identity_shuffle_costs'
-  ; CHECK: Cost Model: Found costs of 0 for: %shufflevector142 = shufflevector <16 x i8> %trunc125, <16 x i8> %trunc133, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %trunc125 = trunc <16 x i32> zeroinitializer to <16 x i8>
-  %trunc133 = trunc <16 x i32> zeroinitializer to <16 x i8>
-  %shufflevector142 = shufflevector <16 x i8> %trunc125, <16 x i8> %trunc133, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  ret <16 x i8> %shufflevector142
+  ; CHECK: Cost Model: Found costs of 0 for: %shufflevector142 = shufflevector <16 x i8> zeroinitializer, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ; CHECK: Cost Model: Found costs of 0 for:   %shufflevector84 = shufflevector <16 x i8> zeroinitializer, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %shufflevector142 = shufflevector <16 x i8> zeroinitializer, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %shufflevector84 = shufflevector <16 x i8> zeroinitializer, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret void
 }
 
 attributes #0 = { "target-features"="+sve,+neon" }
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll b/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll
index 0a668f856fd3e..514847978d739 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll
@@ -1,10 +1,10 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=vector-combine -mtriple=aarch64-unknown-linux-gnu -S %s | FileCheck %s
+; NOTE: This test is expected to test the Identity shuffle costs as zero, regardless of target ISA in AArch64, scalable or fixed width shuffle vectors.
+; RUN: opt -passes=vector-combine -S %s | FileCheck %s
 target triple = "aarch64-unknown-linux-gnu"
 
-define i32 @ham(ptr %call12) local_unnamed_addr #0 {
+define i32 @ham(ptr %call12) #0 {
 ; CHECK-LABEL: define i32 @ham(
-; CHECK-SAME: ptr [[CALL12:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ptr [[CALL12:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK:    [[TMP2:%.*]] = mul nuw nsw <32 x i32> [[TMP1:%.*]], <i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 19595, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
 
 ; CHECK:    [[TMP5:%.*]] = mul nuw <32 x i32> [[TMP4:%.*]], <i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 38470, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097, i32 16762097>
@@ -18,7 +18,7 @@ bb48:                                             ; preds = %bb48, %bb
   %phi49 = phi i64 [ 0, %bb ], [ %add86, %bb48 ]
   %mul50 = mul i64 %phi49, 3
   %getelementptr53 = getelementptr i8, ptr %call12, i64 %mul50
-  %load54 = load <48 x i8>, ptr %getelementptr53, align 1, !tbaa !10, !alias.scope !18
+  %load54 = load <48 x i8>, ptr %getelementptr53, align 1
   %shufflevector = shufflevector <48 x i8> %load54, <48 x i8> poison, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
   %zext57 = zext <16 x i8> %shufflevector to <16 x i32>
   %mul58 = mul nuw nsw <16 x i32> %zext57, splat (i32 19595)
@@ -49,7 +49,7 @@ bb48:                                             ; preds = %bb48, %bb
   %trunc82 = trunc <16 x i32> %lshr81 to <16 x i8>
   %shufflevector83 = shufflevector <16 x i8> %trunc66, <16 x i8> %trunc74, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %shufflevector84 = shufflevector <16 x i8> %trunc82, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  store <32 x i8> %shufflevector83, ptr %getelementptr53, align 1, !tbaa !10,  !noalias !18
+  store <32 x i8> %shufflevector83, ptr %getelementptr53, align 1
   %add86 = add nuw i64 %phi49, 16
   %icmp87 = icmp eq i64 %add86, %mul50
   br i1 %icmp87, label %bb205, label %bb48
@@ -58,12 +58,4 @@ bb205:                                            ; preds = %bb48, %bb
   ret i32 0
 }
 
-attributes #0 = { nounwind uwtable vscale_range(1,16) "target-cpu"="cortex-a57" "target-features"="+sve,+neon"}
-
-!8 = !{!"omnipotent char", !9, i64 0}
-!9 = !{!"Simple C/C++ TBAA"}
-!10 = !{!8, !8, i64 0}
-!12 = !{!"int", !8, i64 0}
-!18 = !{!19}
-!19 = distinct !{!19, !20}
-!20 = distinct !{!20, !"LVerDomain"}
+attributes #0 = { vscale_range(1,16) "target-features"="+sve,+neon"}

>From 37c4e86191d4ce552a0212630fb24a9ee179e0b3 Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pnirpal at qti.qualcomm.com>
Date: Fri, 14 Nov 2025 11:01:23 +0530
Subject: [PATCH 5/6] note in test

---
 .../Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll b/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll
index 514847978d739..4e398cf1f8ccf 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll
@@ -1,4 +1,4 @@
-; NOTE: This test is expected to test the Identity shuffle costs as zero, regardless of target ISA in AArch64, scalable or fixed width shuffle vectors.
+; NOTE: This test is expected to test the Identity shuffle costs as zero, regardless of target ISA in AArch64, scalable or fixed width shuffle vectors, As a result enabling aggressive vector-combine transforms.
 ; RUN: opt -passes=vector-combine -S %s | FileCheck %s
 target triple = "aarch64-unknown-linux-gnu"
 

>From 1ed5db7b77272138a6e735bc26fcc6757e3c998f Mon Sep 17 00:00:00 2001
From: Pawan Nirpal <pnirpal at qti.qualcomm.com>
Date: Tue, 18 Nov 2025 23:05:22 +0530
Subject: [PATCH 6/6] test note update

---
 .../Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll b/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll
index 4e398cf1f8ccf..f499ea9a20c6f 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/identity-shuffle-sve.ll
@@ -1,4 +1,4 @@
-; NOTE: This test is expected to test the Identity shuffle costs as zero, regardless of target ISA in AArch64, scalable or fixed width shuffle vectors, As a result enabling aggressive vector-combine transforms.
+; NOTE: This test checks that identity shuffles are costed as zero, regardless of whether the shuffle vectors are scalable or fixed width, thereby enabling aggressive vector-combine transforms.
 ; RUN: opt -passes=vector-combine -S %s | FileCheck %s
 target triple = "aarch64-unknown-linux-gnu"
 



More information about the llvm-commits mailing list