[llvm] [AArch64] use `isTRNMask` to calculate shuffle costs (PR #171524)

Fri Dec 12 09:00:14 PST 2025

https://github.com/ginsbach updated https://github.com/llvm/llvm-project/pull/171524

>From 657317cf8d0e7b93f377a7803209404b4a44a848 Mon Sep 17 00:00:00 2001
From: Philip Ginsbach-Chen <philip.ginsbach at cantab.net>
Date: Sun, 23 Nov 2025 21:54:40 +0000
Subject: [PATCH 1/5] [AArch64] add slp vectorizer test for new isTRNMask

---
 .../AArch64/transpose-with-constants.ll       | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/AArch64/transpose-with-constants.ll

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-with-constants.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-with-constants.ll
new file mode 100644
index 0000000000000..5eee1d7709aa1
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-with-constants.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=slp-vectorizer -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+define dso_local <16 x i8> @transpose_splat_constants(i8 noundef %x) {
+; CHECK-LABEL: @transpose_splat_constants(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <16 x i32> <i32 0, i32 poison, i32 1, i32 poison, i32 2, i32 poison, i32 3, i32 poison, i32 4, i32 poison, i32 5, i32 poison, i32 6, i32 poison, i32 7, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> <i8 poison, i8 0, i8 poison, i8 1, i8 poison, i8 2, i8 poison, i8 3, i8 poison, i8 4, i8 poison, i8 5, i8 poison, i8 6, i8 poison, i8 7>, <16 x i8> [[TMP2]], <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
+; CHECK-NEXT:    ret <16 x i8> [[TMP3]]
+;
+entry:
+  %0 = insertelement <16 x i8> <i8 poison, i8 0, i8 poison, i8 1, i8 poison, i8 2, i8 poison, i8 3, i8 poison, i8 4, i8 poison, i8 5, i8 poison, i8 6, i8 poison, i8 7>, i8 %x, i64 0
+  %1 = insertelement <16 x i8> %0, i8 %x, i64 2
+  %2 = insertelement <16 x i8> %1, i8 %x, i64 4
+  %3 = insertelement <16 x i8> %2, i8 %x, i64 6
+  %4 = insertelement <16 x i8> %3, i8 %x, i64 8
+  %5 = insertelement <16 x i8> %4, i8 %x, i64 10
+  %6 = insertelement <16 x i8> %5, i8 %x, i64 12
+  %7 = insertelement <16 x i8> %6, i8 %x, i64 14
+  ret <16 x i8> %7
+}
+
+define dso_local <16 x i8> @transpose_constants_splat(i8 noundef %x) {
+; CHECK-LABEL: @transpose_constants_splat(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <16 x i8> <i8 0, i8 poison, i8 1, i8 poison, i8 2, i8 poison, i8 3, i8 poison, i8 4, i8 poison, i8 5, i8 poison, i8 6, i8 poison, i8 7, i8 poison>, i8 [[X:%.*]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> [[TMP0]], i8 [[X]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[X]], i64 5
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[X]], i64 7
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> [[TMP7]], i8 [[X]], i64 9
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[X]], i64 11
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[X]], i64 13
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[TMP6]], i8 [[X]], i64 15
+; CHECK-NEXT:    ret <16 x i8> [[TMP3]]
+;
+entry:
+  %0 = insertelement <16 x i8> <i8 0, i8 poison, i8 1, i8 poison, i8 2, i8 poison, i8 3, i8 poison, i8 4, i8 poison, i8 5, i8 poison, i8 6, i8 poison, i8 7, i8 poison>, i8 %x, i64 1
+  %1 = insertelement <16 x i8> %0, i8 %x, i64 3
+  %2 = insertelement <16 x i8> %1, i8 %x, i64 5
+  %3 = insertelement <16 x i8> %2, i8 %x, i64 7
+  %4 = insertelement <16 x i8> %3, i8 %x, i64 9
+  %5 = insertelement <16 x i8> %4, i8 %x, i64 11
+  %6 = insertelement <16 x i8> %5, i8 %x, i64 13
+  %7 = insertelement <16 x i8> %6, i8 %x, i64 15
+  ret <16 x i8> %7
+}

>From 17d3d3934af9ce95990f078bfa6932a7340fe657 Mon Sep 17 00:00:00 2001
From: Philip Ginsbach-Chen <philip.ginsbach at cantab.net>
Date: Fri, 14 Nov 2025 21:39:19 +0000
Subject: [PATCH 2/5] [AArch64] use isTRNMask to calculate shuffle costs

---
 .../Target/AArch64/AArch64TargetTransformInfo.cpp   | 13 +++++++++++++
 .../lib/Target/AArch64/AArch64TargetTransformInfo.h |  5 +++++
 .../AArch64/transpose-with-constants.ll             | 12 ++++--------
 3 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 043be554f8441..0e667e6ecea86 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5918,6 +5918,19 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
   return Cost + 2;
 }
 
+TTI::ShuffleKind AArch64TTIImpl::improveShuffleKindFromMask(
+    TTI::ShuffleKind Kind, ArrayRef<int> Mask, VectorType *SrcTy, int &Index,
+    VectorType *&SubTy) const {
+  TTI::ShuffleKind SuperResult = BasicTTIImplBase::improveShuffleKindFromMask(
+      Kind, Mask, SrcTy, Index, SubTy);
+  if (SuperResult == TTI::ShuffleKind::SK_PermuteTwoSrc && !Mask.empty()) {
+    unsigned DummyUnsigned;
+    if (isTRNMask(Mask, Mask.size(), DummyUnsigned, DummyUnsigned))
+      return TTI::ShuffleKind::SK_Transpose;
+  }
+  return SuperResult;
+}
+
 InstructionCost
 AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
                                VectorType *SrcTy, ArrayRef<int> Mask,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index ecefe2a7f1380..084c1af4298a1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -494,6 +494,11 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
       bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const override;
 
+  TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
+                                              ArrayRef<int> Mask,
+                                              VectorType *SrcTy, int &Index,
+                                              VectorType *&SubTy) const;
+
   InstructionCost
   getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
                  ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-with-constants.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-with-constants.ll
index 5eee1d7709aa1..66461c9d91064 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-with-constants.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-with-constants.ll
@@ -28,14 +28,10 @@ entry:
 define dso_local <16 x i8> @transpose_constants_splat(i8 noundef %x) {
 ; CHECK-LABEL: @transpose_constants_splat(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <16 x i8> <i8 0, i8 poison, i8 1, i8 poison, i8 2, i8 poison, i8 3, i8 poison, i8 4, i8 poison, i8 5, i8 poison, i8 6, i8 poison, i8 7, i8 poison>, i8 [[X:%.*]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> [[TMP0]], i8 [[X]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[X]], i64 5
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[X]], i64 7
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> [[TMP7]], i8 [[X]], i64 9
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[X]], i64 11
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[X]], i64 13
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[TMP6]], i8 [[X]], i64 15
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <16 x i32> <i32 0, i32 poison, i32 1, i32 poison, i32 2, i32 poison, i32 3, i32 poison, i32 4, i32 poison, i32 5, i32 poison, i32 6, i32 poison, i32 7, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> <i8 0, i8 poison, i8 1, i8 poison, i8 2, i8 poison, i8 3, i8 poison, i8 4, i8 poison, i8 5, i8 poison, i8 6, i8 poison, i8 7, i8 poison>, <16 x i8> [[TMP2]], <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
 ; CHECK-NEXT:    ret <16 x i8> [[TMP3]]
 ;
 entry:

>From f0a23dccb2787311932b8a7801cf775dc9ac515e Mon Sep 17 00:00:00 2001
From: Philip Ginsbach-Chen <philip.ginsbach at cantab.net>
Date: Fri, 12 Dec 2025 14:59:19 +0000
Subject: [PATCH 3/5] add cost model tests for trn with flipped inputs

---
 .../CostModel/AArch64/shuffle-transpose.ll    | 252 ++++++++++++++++++
 1 file changed, 252 insertions(+)

diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll
index 4c4843088551a..a1e9ee345c5d9 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll
@@ -12,6 +12,15 @@ define <8 x i8> @trn1.v8i8(<8 x i8> %v0, <8 x i8> %v1) {
   ret <8 x i8> %tmp0
 }
 
+define <8 x i8> @trn1.v8i8_flipped(<8 x i8> %v0, <8 x i8> %v1) {
+; CHECK-LABEL: 'trn1.v8i8_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 8, i32 0, i32 10, i32 2, i32 12, i32 4, i32 14, i32 6>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %tmp0
+;
+  %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 8, i32 0, i32 10, i32 2, i32 12, i32 4, i32 14, i32 6>
+  ret <8 x i8> %tmp0
+}
+
 define <8 x i8> @trn2.v8i8(<8 x i8> %v0, <8 x i8> %v1) {
 ; CHECK-LABEL: 'trn2.v8i8'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -21,6 +30,15 @@ define <8 x i8> @trn2.v8i8(<8 x i8> %v0, <8 x i8> %v1) {
   ret <8 x i8> %tmp0
 }
 
+define <8 x i8> @trn2.v8i8_flipped(<8 x i8> %v0, <8 x i8> %v1) {
+; CHECK-LABEL: 'trn2.v8i8_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 9, i32 1, i32 11, i32 3, i32 13, i32 5, i32 15, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %tmp0
+;
+  %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 9, i32 1, i32 11, i32 3, i32 13, i32 5, i32 15, i32 7>
+  ret <8 x i8> %tmp0
+}
+
 define <16 x i8> @trn1.v16i8(<16 x i8> %v0, <16 x i8> %v1) {
 ; CHECK-LABEL: 'trn1.v16i8'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -30,6 +48,15 @@ define <16 x i8> @trn1.v16i8(<16 x i8> %v0, <16 x i8> %v1) {
   ret <16 x i8> %tmp0
 }
 
+define <16 x i8> @trn1.v16i8_flipped(<16 x i8> %v0, <16 x i8> %v1) {
+; CHECK-LABEL: 'trn1.v16i8_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 16, i32 0, i32 18, i32 2, i32 20, i32 4, i32 22, i32 6, i32 24, i32 8, i32 26, i32 10, i32 28, i32 12, i32 30, i32 14>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %tmp0
+;
+  %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 16, i32 0, i32 18, i32 2, i32 20, i32 4, i32 22, i32 6, i32 24, i32 8, i32 26, i32 10, i32 28, i32 12, i32 30, i32 14>
+  ret <16 x i8> %tmp0
+}
+
 define <16 x i8> @trn2.v16i8(<16 x i8> %v0, <16 x i8> %v1) {
 ; CHECK-LABEL: 'trn2.v16i8'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
@@ -39,6 +66,15 @@ define <16 x i8> @trn2.v16i8(<16 x i8> %v0, <16 x i8> %v1) {
   ret <16 x i8> %tmp0
 }
 
+define <16 x i8> @trn2.v16i8_flipped(<16 x i8> %v0, <16 x i8> %v1) {
+; CHECK-LABEL: 'trn2.v16i8_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 17, i32 1, i32 19, i32 3, i32 21, i32 5, i32 23, i32 7, i32 25, i32 9, i32 27, i32 11, i32 29, i32 13, i32 31, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %tmp0
+;
+  %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 17, i32 1, i32 19, i32 3, i32 21, i32 5, i32 23, i32 7, i32 25, i32 9, i32 27, i32 11, i32 29, i32 13, i32 31, i32 15>
+  ret <16 x i8> %tmp0
+}
+
 define <4 x i16> @trn1.v4i16(<4 x i16> %v0, <4 x i16> %v1) {
 ; CHECK-LABEL: 'trn1.v4i16'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -48,6 +84,15 @@ define <4 x i16> @trn1.v4i16(<4 x i16> %v0, <4 x i16> %v1) {
   ret <4 x i16> %tmp0
 }
 
+define <4 x i16> @trn1.v4i16_flipped(<4 x i16> %v0, <4 x i16> %v1) {
+; CHECK-LABEL: 'trn1.v4i16_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> <i32 4, i32 0, i32 6, i32 2>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i16> %tmp0
+;
+  %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> <i32 4, i32 0, i32 6, i32 2>
+  ret <4 x i16> %tmp0
+}
+
 define <4 x i16> @trn2.v4i16(<4 x i16> %v0, <4 x i16> %v1) {
 ; CHECK-LABEL: 'trn2.v4i16'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -57,6 +102,15 @@ define <4 x i16> @trn2.v4i16(<4 x i16> %v0, <4 x i16> %v1) {
   ret <4 x i16> %tmp0
 }
 
+define <4 x i16> @trn2.v4i16_flipped(<4 x i16> %v0, <4 x i16> %v1) {
+; CHECK-LABEL: 'trn2.v4i16_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> <i32 5, i32 1, i32 7, i32 3>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i16> %tmp0
+;
+  %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> <i32 5, i32 1, i32 7, i32 3>
+  ret <4 x i16> %tmp0
+}
+
 define <8 x i16> @trn1.v8i16(<8 x i16> %v0, <8 x i16> %v1) {
 ; CHECK-LABEL: 'trn1.v8i16'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -66,6 +120,15 @@ define <8 x i16> @trn1.v8i16(<8 x i16> %v0, <8 x i16> %v1) {
   ret <8 x i16> %tmp0
 }
 
+define <8 x i16> @trn1.v8i16_flipped(<8 x i16> %v0, <8 x i16> %v1) {
+; CHECK-LABEL: 'trn1.v8i16_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 8, i32 0, i32 10, i32 2, i32 12, i32 4, i32 14, i32 6>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %tmp0
+;
+  %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 8, i32 0, i32 10, i32 2, i32 12, i32 4, i32 14, i32 6>
+  ret <8 x i16> %tmp0
+}
+
 define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) {
 ; CHECK-LABEL: 'trn2.v8i16'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -75,6 +138,15 @@ define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) {
   ret <8 x i16> %tmp0
 }
 
+define <8 x i16> @trn2.v8i16_flipped(<8 x i16> %v0, <8 x i16> %v1) {
+; CHECK-LABEL: 'trn2.v8i16_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 9, i32 1, i32 11, i32 3, i32 13, i32 5, i32 15, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %tmp0
+;
+  %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 9, i32 1, i32 11, i32 3, i32 13, i32 5, i32 15, i32 7>
+  ret <8 x i16> %tmp0
+}
+
 define <2 x i32> @trn1.v2i32(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: 'trn1.v2i32'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 0, i32 2>
@@ -84,6 +156,15 @@ define <2 x i32> @trn1.v2i32(<2 x i32> %v0, <2 x i32> %v1) {
   ret <2 x i32> %tmp0
 }
 
+define <2 x i32> @trn1.v2i32_flipped(<2 x i32> %v0, <2 x i32> %v1) {
+; CHECK-LABEL: 'trn1.v2i32_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %tmp0
+;
+  %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 2, i32 0>
+  ret <2 x i32> %tmp0
+}
+
 define <2 x i32> @trn2.v2i32(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: 'trn2.v2i32'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 1, i32 3>
@@ -93,6 +174,15 @@ define <2 x i32> @trn2.v2i32(<2 x i32> %v0, <2 x i32> %v1) {
   ret <2 x i32> %tmp0
 }
 
+define <2 x i32> @trn2.v2i32_flipped(<2 x i32> %v0, <2 x i32> %v1) {
+; CHECK-LABEL: 'trn2.v2i32_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 3, i32 1>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %tmp0
+;
+  %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 3, i32 1>
+  ret <2 x i32> %tmp0
+}
+
 define <4 x i32> @trn1.v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: 'trn1.v4i32'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -102,6 +192,15 @@ define <4 x i32> @trn1.v4i32(<4 x i32> %v0, <4 x i32> %v1) {
   ret <4 x i32> %tmp0
 }
 
+define <4 x i32> @trn1.v4i32_flipped(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: 'trn1.v4i32_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 4, i32 0, i32 6, i32 2>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %tmp0
+;
+  %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 4, i32 0, i32 6, i32 2>
+  ret <4 x i32> %tmp0
+}
+
 define <4 x i32> @trn2.v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: 'trn2.v4i32'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -111,6 +210,15 @@ define <4 x i32> @trn2.v4i32(<4 x i32> %v0, <4 x i32> %v1) {
   ret <4 x i32> %tmp0
 }
 
+define <4 x i32> @trn2.v4i32_flipped(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: 'trn2.v4i32_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 5, i32 1, i32 7, i32 3>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %tmp0
+;
+  %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> <i32 5, i32 1, i32 7, i32 3>
+  ret <4 x i32> %tmp0
+}
+
 define <2 x i64> @trn1.v2i64(<2 x i64> %v0, <2 x i64> %v1) {
 ; CHECK-LABEL: 'trn1.v2i64'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> <i32 0, i32 2>
@@ -120,6 +228,15 @@ define <2 x i64> @trn1.v2i64(<2 x i64> %v0, <2 x i64> %v1) {
   ret <2 x i64> %tmp0
 }
 
+define <2 x i64> @trn1.v2i64_flipped(<2 x i64> %v0, <2 x i64> %v1) {
+; CHECK-LABEL: 'trn1.v2i64_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %tmp0
+;
+  %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> <i32 2, i32 0>
+  ret <2 x i64> %tmp0
+}
+
 define <2 x i64> @trn2.v2i64(<2 x i64> %v0, <2 x i64> %v1) {
 ; CHECK-LABEL: 'trn2.v2i64'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> <i32 1, i32 3>
@@ -129,6 +246,15 @@ define <2 x i64> @trn2.v2i64(<2 x i64> %v0, <2 x i64> %v1) {
   ret <2 x i64> %tmp0
 }
 
+define <2 x i64> @trn2.v2i64_flipped(<2 x i64> %v0, <2 x i64> %v1) {
+; CHECK-LABEL: 'trn2.v2i64_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> <i32 3, i32 1>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %tmp0
+;
+  %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> <i32 3, i32 1>
+  ret <2 x i64> %tmp0
+}
+
 define <2 x float> @trn1.v2f32(<2 x float> %v0, <2 x float> %v1) {
 ; CHECK-LABEL: 'trn1.v2f32'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 0, i32 2>
@@ -138,6 +264,15 @@ define <2 x float> @trn1.v2f32(<2 x float> %v0, <2 x float> %v1) {
   ret <2 x float> %tmp0
 }
 
+define <2 x float> @trn1.v2f32_flipped(<2 x float> %v0, <2 x float> %v1) {
+; CHECK-LABEL: 'trn1.v2f32_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %tmp0
+;
+  %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 2, i32 0>
+  ret <2 x float> %tmp0
+}
+
 define <2 x float> @trn2.v2f32(<2 x float> %v0, <2 x float> %v1) {
 ; CHECK-LABEL: 'trn2.v2f32'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 1, i32 3>
@@ -147,6 +282,15 @@ define <2 x float> @trn2.v2f32(<2 x float> %v0, <2 x float> %v1) {
   ret <2 x float> %tmp0
 }
 
+define <2 x float> @trn2.v2f32_flipped(<2 x float> %v0, <2 x float> %v1) {
+; CHECK-LABEL: 'trn2.v2f32_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 3, i32 1>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %tmp0
+;
+  %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 3, i32 1>
+  ret <2 x float> %tmp0
+}
+
 define <4 x float> @trn1.v4f32(<4 x float> %v0, <4 x float> %v1) {
 ; CHECK-LABEL: 'trn1.v4f32'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -156,6 +300,15 @@ define <4 x float> @trn1.v4f32(<4 x float> %v0, <4 x float> %v1) {
   ret <4 x float> %tmp0
 }
 
+define <4 x float> @trn1.v4f32_flipped(<4 x float> %v0, <4 x float> %v1) {
+; CHECK-LABEL: 'trn1.v4f32_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> <i32 4, i32 0, i32 6, i32 2>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %tmp0
+;
+  %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> <i32 4, i32 0, i32 6, i32 2>
+  ret <4 x float> %tmp0
+}
+
 define <4 x float> @trn2.v4f32(<4 x float> %v0, <4 x float> %v1) {
 ; CHECK-LABEL: 'trn2.v4f32'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -165,6 +318,15 @@ define <4 x float> @trn2.v4f32(<4 x float> %v0, <4 x float> %v1) {
   ret <4 x float> %tmp0
 }
 
+define <4 x float> @trn2.v4f32_flipped(<4 x float> %v0, <4 x float> %v1) {
+; CHECK-LABEL: 'trn2.v4f32_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> <i32 5, i32 1, i32 7, i32 3>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %tmp0
+;
+  %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> <i32 5, i32 1, i32 7, i32 3>
+  ret <4 x float> %tmp0
+}
+
 define <2 x double> @trn1.v2f64(<2 x double> %v0, <2 x double> %v1) {
 ; CHECK-LABEL: 'trn1.v2f64'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> <i32 0, i32 2>
@@ -174,6 +336,15 @@ define <2 x double> @trn1.v2f64(<2 x double> %v0, <2 x double> %v1) {
   ret <2 x double> %tmp0
 }
 
+define <2 x double> @trn1.v2f64_flipped(<2 x double> %v0, <2 x double> %v1) {
+; CHECK-LABEL: 'trn1.v2f64_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %tmp0
+;
+  %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> <i32 2, i32 0>
+  ret <2 x double> %tmp0
+}
+
 define <2 x double> @trn2.v2f64(<2 x double> %v0, <2 x double> %v1) {
 ; CHECK-LABEL: 'trn2.v2f64'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> <i32 1, i32 3>
@@ -183,6 +354,15 @@ define <2 x double> @trn2.v2f64(<2 x double> %v0, <2 x double> %v1) {
   ret <2 x double> %tmp0
 }
 
+define <2 x double> @trn2.v2f64_flipped(<2 x double> %v0, <2 x double> %v1) {
+; CHECK-LABEL: 'trn2.v2f64_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> <i32 3, i32 1>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %tmp0
+;
+  %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> <i32 3, i32 1>
+  ret <2 x double> %tmp0
+}
+
 define <4 x half> @trn1.v4f16(<4 x half> %v0, <4 x half> %v1) {
 ; CHECK-LABEL: 'trn1.v4f16'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -192,6 +372,15 @@ define <4 x half> @trn1.v4f16(<4 x half> %v0, <4 x half> %v1) {
   ret <4 x half> %tmp0
 }
 
+define <4 x half> @trn1.v4f16_flipped(<4 x half> %v0, <4 x half> %v1) {
+; CHECK-LABEL: 'trn1.v4f16_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> <i32 4, i32 0, i32 6, i32 2>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x half> %tmp0
+;
+  %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> <i32 4, i32 0, i32 6, i32 2>
+  ret <4 x half> %tmp0
+}
+
 define <4 x half> @trn2.v4f16(<4 x half> %v0, <4 x half> %v1) {
 ; CHECK-LABEL: 'trn2.v4f16'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -201,6 +390,15 @@ define <4 x half> @trn2.v4f16(<4 x half> %v0, <4 x half> %v1) {
   ret <4 x half> %tmp0
 }
 
+define <4 x half> @trn2.v4f16_flipped(<4 x half> %v0, <4 x half> %v1) {
+; CHECK-LABEL: 'trn2.v4f16_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> <i32 5, i32 1, i32 7, i32 3>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x half> %tmp0
+;
+  %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> <i32 5, i32 1, i32 7, i32 3>
+  ret <4 x half> %tmp0
+}
+
 define <8 x half> @trn1.v8f16(<8 x half> %v0, <8 x half> %v1) {
 ; CHECK-LABEL: 'trn1.v8f16'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -210,6 +408,15 @@ define <8 x half> @trn1.v8f16(<8 x half> %v0, <8 x half> %v1) {
   ret <8 x half> %tmp0
 }
 
+define <8 x half> @trn1.v8f16_flipped(<8 x half> %v0, <8 x half> %v1) {
+; CHECK-LABEL: 'trn1.v8f16_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> <i32 8, i32 0, i32 10, i32 2, i32 12, i32 4, i32 14, i32 6>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %tmp0
+;
+  %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> <i32 8, i32 0, i32 10, i32 2, i32 12, i32 4, i32 14, i32 6>
+  ret <8 x half> %tmp0
+}
+
 define <8 x half> @trn2.v8f16(<8 x half> %v0, <8 x half> %v1) {
 ; CHECK-LABEL: 'trn2.v8f16'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -219,6 +426,15 @@ define <8 x half> @trn2.v8f16(<8 x half> %v0, <8 x half> %v1) {
   ret <8 x half> %tmp0
 }
 
+define <8 x half> @trn2.v8f16_flipped(<8 x half> %v0, <8 x half> %v1) {
+; CHECK-LABEL: 'trn2.v8f16_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> <i32 9, i32 1, i32 11, i32 3, i32 13, i32 5, i32 15, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %tmp0
+;
+  %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> <i32 9, i32 1, i32 11, i32 3, i32 13, i32 5, i32 15, i32 7>
+  ret <8 x half> %tmp0
+}
+
 define <4 x bfloat> @trn1.v4bf16(<4 x bfloat> %v0, <4 x bfloat> %v1) {
 ; CHECK-LABEL: 'trn1.v4bf16'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x bfloat> %v0, <4 x bfloat> %v1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -228,6 +444,15 @@ define <4 x bfloat> @trn1.v4bf16(<4 x bfloat> %v0, <4 x bfloat> %v1) {
   ret <4 x bfloat> %tmp0
 }
 
+define <4 x bfloat> @trn1.v4bf16_flipped(<4 x bfloat> %v0, <4 x bfloat> %v1) {
+; CHECK-LABEL: 'trn1.v4bf16_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x bfloat> %v0, <4 x bfloat> %v1, <4 x i32> <i32 4, i32 0, i32 6, i32 2>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %tmp0
+;
+  %tmp0 = shufflevector <4 x bfloat> %v0, <4 x bfloat> %v1, <4 x i32> <i32 4, i32 0, i32 6, i32 2>
+  ret <4 x bfloat> %tmp0
+}
+
 define <4 x bfloat> @trn2.v4bf16(<4 x bfloat> %v0, <4 x bfloat> %v1) {
 ; CHECK-LABEL: 'trn2.v4bf16'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x bfloat> %v0, <4 x bfloat> %v1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -237,6 +462,15 @@ define <4 x bfloat> @trn2.v4bf16(<4 x bfloat> %v0, <4 x bfloat> %v1) {
   ret <4 x bfloat> %tmp0
 }
 
+define <4 x bfloat> @trn2.v4bf16_flipped(<4 x bfloat> %v0, <4 x bfloat> %v1) {
+; CHECK-LABEL: 'trn2.v4bf16_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x bfloat> %v0, <4 x bfloat> %v1, <4 x i32> <i32 5, i32 1, i32 7, i32 3>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %tmp0
+;
+  %tmp0 = shufflevector <4 x bfloat> %v0, <4 x bfloat> %v1, <4 x i32> <i32 5, i32 1, i32 7, i32 3>
+  ret <4 x bfloat> %tmp0
+}
+
 define <8 x bfloat> @trn1.v8bf16(<8 x bfloat> %v0, <8 x bfloat> %v1) {
 ; CHECK-LABEL: 'trn1.v8bf16'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -246,6 +480,15 @@ define <8 x bfloat> @trn1.v8bf16(<8 x bfloat> %v0, <8 x bfloat> %v1) {
   ret <8 x bfloat> %tmp0
 }
 
+define <8 x bfloat> @trn1.v8bf16_flipped(<8 x bfloat> %v0, <8 x bfloat> %v1) {
+; CHECK-LABEL: 'trn1.v8bf16_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> <i32 8, i32 0, i32 10, i32 2, i32 12, i32 4, i32 14, i32 6>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %tmp0
+;
+  %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> <i32 8, i32 0, i32 10, i32 2, i32 12, i32 4, i32 14, i32 6>
+  ret <8 x bfloat> %tmp0
+}
+
 define <8 x bfloat> @trn2.v8bf16(<8 x bfloat> %v0, <8 x bfloat> %v1) {
 ; CHECK-LABEL: 'trn2.v8bf16'
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -254,3 +497,12 @@ define <8 x bfloat> @trn2.v8bf16(<8 x bfloat> %v0, <8 x bfloat> %v1) {
   %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   ret <8 x bfloat> %tmp0
 }
+
+define <8 x bfloat> @trn2.v8bf16_flipped(<8 x bfloat> %v0, <8 x bfloat> %v1) {
+; CHECK-LABEL: 'trn2.v8bf16_flipped'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> <i32 9, i32 1, i32 11, i32 3, i32 13, i32 5, i32 15, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %tmp0
+;
+  %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> <i32 9, i32 1, i32 11, i32 3, i32 13, i32 5, i32 15, i32 7>
+  ret <8 x bfloat> %tmp0
+}

>From ece0baf6c84108e3c9665fff25f79281c26ecebf Mon Sep 17 00:00:00 2001
From: Philip Ginsbach-Chen <philip.ginsbach at cantab.net>
Date: Fri, 12 Dec 2025 14:45:55 +0000
Subject: [PATCH 4/5] alternative implementation directly in getShuffleCost

---
 .../Target/AArch64/AArch64TargetTransformInfo.cpp  | 14 +-------------
 .../Target/AArch64/AArch64TargetTransformInfo.h    |  5 -----
 2 files changed, 1 insertion(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0e667e6ecea86..6e96a25bec845 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5918,19 +5918,6 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
   return Cost + 2;
 }
 
-TTI::ShuffleKind AArch64TTIImpl::improveShuffleKindFromMask(
-    TTI::ShuffleKind Kind, ArrayRef<int> Mask, VectorType *SrcTy, int &Index,
-    VectorType *&SubTy) const {
-  TTI::ShuffleKind SuperResult = BasicTTIImplBase::improveShuffleKindFromMask(
-      Kind, Mask, SrcTy, Index, SubTy);
-  if (SuperResult == TTI::ShuffleKind::SK_PermuteTwoSrc && !Mask.empty()) {
-    unsigned DummyUnsigned;
-    if (isTRNMask(Mask, Mask.size(), DummyUnsigned, DummyUnsigned))
-      return TTI::ShuffleKind::SK_Transpose;
-  }
-  return SuperResult;
-}
-
 InstructionCost
 AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
                                VectorType *SrcTy, ArrayRef<int> Mask,
@@ -6128,6 +6115,7 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
       LT.second.getVectorNumElements() == Mask.size() &&
       (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
       (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
+       isTRNMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
        isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
        isREVMask(Mask, LT.second.getScalarSizeInBits(),
                  LT.second.getVectorNumElements(), 16) ||
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 084c1af4298a1..ecefe2a7f1380 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -494,11 +494,6 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
       bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const override;
 
-  TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
-                                              ArrayRef<int> Mask,
-                                              VectorType *SrcTy, int &Index,
-                                              VectorType *&SubTy) const;
-
   InstructionCost
   getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
                  ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,

>From 4b835dac8f00e62fe3012a4c4915ea4a9de1a251 Mon Sep 17 00:00:00 2001
From: Philip Ginsbach-Chen <philip.ginsbach at cantab.net>
Date: Fri, 12 Dec 2025 15:54:48 +0000
Subject: [PATCH 5/5] detect flipped trn miscategorised as SK_InsertSubvector

---
 .../lib/Target/AArch64/AArch64TargetTransformInfo.cpp |  2 +-
 .../Analysis/CostModel/AArch64/shuffle-transpose.ll   |  4 ++--
 .../AArch64/extractelements-to-shuffle.ll             | 11 +++++------
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 6e96a25bec845..766644000a4f2 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -6113,7 +6113,7 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
   unsigned Unused;
   if (LT.second.isFixedLengthVector() &&
       LT.second.getVectorNumElements() == Mask.size() &&
-      (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
+      (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc || Kind == TTI::SK_InsertSubvector) &&
       (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
        isTRNMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
        isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll
index a1e9ee345c5d9..402313d00de36 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll
@@ -158,7 +158,7 @@ define <2 x i32> @trn1.v2i32(<2 x i32> %v0, <2 x i32> %v1) {
 
 define <2 x i32> @trn1.v2i32_flipped(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: 'trn1.v2i32_flipped'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 2, i32 0>
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %tmp0
 ;
   %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 2, i32 0>
@@ -266,7 +266,7 @@ define <2 x float> @trn1.v2f32(<2 x float> %v0, <2 x float> %v1) {
 
 define <2 x float> @trn1.v2f32_flipped(<2 x float> %v0, <2 x float> %v1) {
 ; CHECK-LABEL: 'trn1.v2f32_flipped'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 2, i32 0>
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %tmp0
 ;
   %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 2, i32 0>
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
index 10a17f7e3f9a6..7f737cd169147 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
@@ -645,19 +645,18 @@ define i1 @tryMapToRange(ptr %values, ptr %result, <2 x i64> %hi, <2 x i64> %lo)
 ; CHECK-NEXT:    [[S1:%.*]] = sext <2 x i1> [[C1]] to <2 x i64>
 ; CHECK-NEXT:    [[BC1:%.*]] = bitcast <2 x i64> [[S1]] to <16 x i8>
 ; CHECK-NEXT:    [[A1:%.*]] = and <16 x i8> [[BC1]], <i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x i8> [[A1]], i64 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x i8> [[A1]], i64 8
 ; CHECK-NEXT:    [[C2:%.*]] = icmp slt <2 x i64> [[L]], [[LO:%.*]]
 ; CHECK-NEXT:    [[S2:%.*]] = sext <2 x i1> [[C2]] to <2 x i64>
 ; CHECK-NEXT:    [[BC2:%.*]] = bitcast <2 x i64> [[S2]] to <16 x i8>
 ; CHECK-NEXT:    [[A2:%.*]] = and <16 x i8> [[BC2]], <i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>
-; CHECK-NEXT:    [[E3:%.*]] = extractelement <16 x i8> [[A2]], i64 0
-; CHECK-NEXT:    [[E4:%.*]] = extractelement <16 x i8> [[A2]], i64 8
 ; CHECK-NEXT:    [[REASS_SUB:%.*]] = sub <2 x i64> [[L]], [[LO]]
 ; CHECK-NEXT:    [[ADD_I_I_I_I_I_I:%.*]] = add <2 x i64> [[REASS_SUB]], splat (i64 1)
 ; CHECK-NEXT:    store <2 x i64> [[ADD_I_I_I_I_I_I]], ptr [[RESULT:%.*]], align 8
-; CHECK-NEXT:    [[O3:%.*]] = or i8 [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[O2:%.*]] = or i8 [[E4]], [[E3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A1]], <16 x i8> [[A2]], <2 x i32> <i32 8, i32 24>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[A1]], <16 x i8> [[A2]], <2 x i32> <i32 0, i32 16>
+; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[O3:%.*]] = extractelement <2 x i8> [[TMP3]], i32 0
+; CHECK-NEXT:    [[O2:%.*]] = extractelement <2 x i8> [[TMP3]], i32 1
 ; CHECK-NEXT:    [[O4:%.*]] = or i8 [[O3]], [[O2]]
 ; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[O4]], 0
 ; CHECK-NEXT:    ret i1 [[C]]