[llvm] [RISCV] Tune flag for fast vrgather.vv (PR #124664)

Petr Penzin via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 21 00:35:56 PST 2025


https://github.com/ppenzin updated https://github.com/llvm/llvm-project/pull/124664

>From f048b525d82757f5acadb68385a5c52680606396 Mon Sep 17 00:00:00 2001
From: Petr Penzin <ppenzin at tenstorrent.com>
Date: Mon, 27 Jan 2025 18:47:59 -0600
Subject: [PATCH 1/7] [RISCV] Tune flag for fast vrgather.vv

Add tune knob for N*Log2(N) vrgather.vv cost.
---
 llvm/lib/Target/RISCV/RISCVFeatures.td      | 4 ++++
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 5 +++++
 llvm/lib/Target/RISCV/RISCVProcessors.td    | 1 +
 llvm/test/CodeGen/RISCV/features-info.ll    | 1 +
 4 files changed, 11 insertions(+)

diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 4119dd77804f1..966c56185b3fd 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1365,6 +1365,10 @@ def FeatureUnalignedVectorMem
                       "true", "Has reasonably performant unaligned vector "
                       "loads and stores">;
 
+def TuneFastVRGather
+   : SubtargetFeature<"fast-vrgather", "HasFastVRGather",
+                      "true", "Has vrgather.vv with LMUL*log2(LMUL) latency">;
+
 def TunePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
     "UsePostRAScheduler", "true", "Schedule again after register allocation">;
 
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 5e5bc0819a10c..e49c1e7ce9edb 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2848,6 +2848,11 @@ InstructionCost RISCVTargetLowering::getLMULCost(MVT VT) const {
 /// is generally quadratic in the number of vreg implied by LMUL.  Note that
 /// operand (index and possibly mask) are handled separately.
 InstructionCost RISCVTargetLowering::getVRGatherVVCost(MVT VT) const {
+  auto LMULCost = getLMULCost(VT);
+  if (true && Subtarget.hasFastVRGather() && LMULCost.isValid()) {
+    unsigned Log = Log2_64(*LMULCost.getValue());
+    return LMULCost * Log;
+  }
   return getLMULCost(VT) * getLMULCost(VT);
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 6dfed7ddeb9f6..4f6c9a0229d51 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -490,6 +490,7 @@ def TENSTORRENT_ASCALON_D8 : RISCVProcessorModel<"tt-ascalon-d8",
                                                   FeatureUnalignedScalarMem,
                                                   FeatureUnalignedVectorMem]),
                                                  [TuneNoDefaultUnroll,
+                                                  TuneFastVRGather,
                                                   TuneOptimizedZeroStrideLoad,
                                                   TunePostRAScheduler]>;
 
diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll
index 70fbda47a14a1..dab9bf92cef17 100644
--- a/llvm/test/CodeGen/RISCV/features-info.ll
+++ b/llvm/test/CodeGen/RISCV/features-info.ll
@@ -31,6 +31,7 @@
 ; CHECK:   experimental-zvbc32e             - 'Zvbc32e' (Vector Carryless Multiplication with 32-bits elements).
 ; CHECK:   experimental-zvkgs               - 'Zvkgs' (Vector-Scalar GCM instructions for Cryptography).
 ; CHECK:   f                                - 'F' (Single-Precision Floating-Point).
+; CHECK:   fast-vrgather                    - Has vrgather.vv with LMUL*log2(LMUL) latency
 ; CHECK:   forced-atomics                   - Assume that lock-free native-width atomics are available.
 ; CHECK:   h                                - 'H' (Hypervisor).
 ; CHECK:   i                                - 'I' (Base Integer Instruction Set).

>From 63adbee9ece6de7d6c826b6e445a51658bae14ed Mon Sep 17 00:00:00 2001
From: Petr Penzin <penzin.dev at gmail.com>
Date: Tue, 28 Jan 2025 09:20:48 -0800
Subject: [PATCH 2/7] Fix typo

---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e49c1e7ce9edb..5202a1868e7c5 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2849,7 +2849,7 @@ InstructionCost RISCVTargetLowering::getLMULCost(MVT VT) const {
 /// operand (index and possibly mask) are handled separately.
 InstructionCost RISCVTargetLowering::getVRGatherVVCost(MVT VT) const {
   auto LMULCost = getLMULCost(VT);
-  if (true && Subtarget.hasFastVRGather() && LMULCost.isValid()) {
+  if (Subtarget.hasFastVRGather() && LMULCost.isValid()) {
     unsigned Log = Log2_64(*LMULCost.getValue());
     return LMULCost * Log;
   }

>From 92944de4b90ae842ae0174185b498ca035afb235 Mon Sep 17 00:00:00 2001
From: Petr Penzin <penzin.dev at gmail.com>
Date: Tue, 28 Jan 2025 10:03:26 -0800
Subject: [PATCH 3/7] Add run line to RISCV shuffle-permute.ll test

---
 .../CostModel/RISCV/shuffle-permute.ll        | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll
index 105cc8a1c2ba3..c423d454c68be 100644
--- a/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll
@@ -3,6 +3,8 @@
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s
 ; RUN: opt < %s -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s --check-prefix=SIZE
 ; RUN: opt < %s -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s --check-prefix=SIZE
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh,+fast-vrgather | FileCheck %s --check-prefix=FAST-VRG
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin,+fast-vrgather | FileCheck %s --check-prefix=FAST-VRG
 ; Check that we don't crash querying costs when vectors are not enabled.
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv32
 
@@ -44,6 +46,24 @@ define void @general_permute_single_source() {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; FAST-VRG-LABEL: 'general_permute_single_source'
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 9, i32 6, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 5, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 11, i32 11, i32 11, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 4, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 12, i32 12, i32 12, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 1>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
   %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
@@ -133,6 +153,37 @@ define void @general_permute_two_source() {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8double = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16double = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; FAST-VRG-LABEL: 'general_permute_two_source'
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 3, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2half = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 3, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4half = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8half = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16half = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2float = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 3, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4float = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8float = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16float = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2double = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 3, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4double = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8double = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16double = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 0>
   %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>

>From f0da8f6effbe5b34906e37e4724f89e3d98068be Mon Sep 17 00:00:00 2001
From: Petr Penzin <penzin.dev at gmail.com>
Date: Tue, 28 Jan 2025 21:51:59 -0800
Subject: [PATCH 4/7] Remove code size from analysis test

---
 .../CostModel/RISCV/shuffle-permute.ll        | 94 +++++++++----------
 1 file changed, 47 insertions(+), 47 deletions(-)

diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll
index c423d454c68be..f6ddc9ff92b0e 100644
--- a/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll
@@ -3,8 +3,8 @@
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s
 ; RUN: opt < %s -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s --check-prefix=SIZE
 ; RUN: opt < %s -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s --check-prefix=SIZE
-; RUN: opt < %s -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh,+fast-vrgather | FileCheck %s --check-prefix=FAST-VRG
-; RUN: opt < %s -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin,+fast-vrgather | FileCheck %s --check-prefix=FAST-VRG
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh,+fast-vrgather | FileCheck %s --check-prefix=FAST-VRG
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin,+fast-vrgather | FileCheck %s --check-prefix=FAST-VRG
 ; Check that we don't crash querying costs when vectors are not enabled.
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv32
 
@@ -48,22 +48,22 @@ define void @general_permute_single_source() {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; FAST-VRG-LABEL: 'general_permute_single_source'
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 9, i32 6, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 5, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 11, i32 11, i32 11, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 4, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 12, i32 12, i32 12, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 1>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 9, i32 6, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 5, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 11, i32 11, i32 11, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 4, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 12, i32 12, i32 12, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 1>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
   %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
@@ -155,35 +155,35 @@ define void @general_permute_two_source() {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; FAST-VRG-LABEL: 'general_permute_two_source'
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 3, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2half = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 3, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4half = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8half = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16half = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2float = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 3, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4float = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8float = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16float = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2double = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 3, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4double = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8double = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16double = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 3, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v2half = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 3, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v4half = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8half = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v16half = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v2float = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 3, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v4float = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8float = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16float = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v2double = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 3, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v4double = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v8double = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %v16double = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 0>
   %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>

>From fb7c6be218c15964ebea5842362ca64ebad6bfca Mon Sep 17 00:00:00 2001
From: Petr Penzin <penzin.dev at gmail.com>
Date: Wed, 29 Jan 2025 08:16:21 -0800
Subject: [PATCH 5/7] Reuse calculated LMUL cost

---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 5202a1868e7c5..7fe1a12d6a687 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2853,7 +2853,7 @@ InstructionCost RISCVTargetLowering::getVRGatherVVCost(MVT VT) const {
     unsigned Log = Log2_64(*LMULCost.getValue());
     return LMULCost * Log;
   }
-  return getLMULCost(VT) * getLMULCost(VT);
+  return LMULCost * LMULCost;
 }
 
 /// Return the cost of a vrgather.vi (or vx) instruction for the type VT.

>From 4ddd029ca67639621d7c05b0f48985ff2a111659 Mon Sep 17 00:00:00 2001
From: Petr Penzin <ppenzin at tenstorrent.com>
Date: Wed, 5 Feb 2025 17:35:05 -0600
Subject: [PATCH 6/7] Guard against zero and negative log values

---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 7fe1a12d6a687..ca1b6306d233c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2851,7 +2851,8 @@ InstructionCost RISCVTargetLowering::getVRGatherVVCost(MVT VT) const {
   auto LMULCost = getLMULCost(VT);
   if (Subtarget.hasFastVRGather() && LMULCost.isValid()) {
     unsigned Log = Log2_64(*LMULCost.getValue());
-    return LMULCost * Log;
+    if (Log > 0)
+      return LMULCost * Log;
   }
   return LMULCost * LMULCost;
 }

>From 1a0a1f9c29d1351d467919a1299b80c75714a7db Mon Sep 17 00:00:00 2001
From: Petr Penzin <ppenzin at tenstorrent.com>
Date: Fri, 14 Feb 2025 17:17:00 -0600
Subject: [PATCH 7/7] Change vrgather cost model to use an enum

Rename option to explicitly say "log", change tune flag to set one of
the values.
---
 llvm/lib/Target/RISCV/RISCVFeatures.td        |  6 +-
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  6 +-
 llvm/lib/Target/RISCV/RISCVProcessors.td      |  2 +-
 llvm/lib/Target/RISCV/RISCVSubtarget.h        |  7 ++
 .../CostModel/RISCV/shuffle-permute.ll        | 98 +++++++++----------
 5 files changed, 64 insertions(+), 55 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 966c56185b3fd..cad78bf66d35a 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1365,9 +1365,9 @@ def FeatureUnalignedVectorMem
                       "true", "Has reasonably performant unaligned vector "
                       "loads and stores">;
 
-def TuneFastVRGather
-   : SubtargetFeature<"fast-vrgather", "HasFastVRGather",
-                      "true", "Has vrgather.vv with LMUL*log2(LMUL) latency">;
+def TuneNLogNVRGather
+   : SubtargetFeature<"log-vrgather", "RISCVVRGatherCostModel", "NLog2N",
+                      "Has vrgather.vv with LMUL*log2(LMUL) latency">;
 
 def TunePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
     "UsePostRAScheduler", "true", "Schedule again after register allocation">;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index ca1b6306d233c..6bed257a0c669 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2849,7 +2849,9 @@ InstructionCost RISCVTargetLowering::getLMULCost(MVT VT) const {
 /// operand (index and possibly mask) are handled separately.
 InstructionCost RISCVTargetLowering::getVRGatherVVCost(MVT VT) const {
   auto LMULCost = getLMULCost(VT);
-  if (Subtarget.hasFastVRGather() && LMULCost.isValid()) {
+  bool Log2CostModel =
+      Subtarget.getVRGatherCostModel() == llvm::RISCVSubtarget::NLog2N;
+  if (Log2CostModel && LMULCost.isValid()) {
     unsigned Log = Log2_64(*LMULCost.getValue());
     if (Log > 0)
       return LMULCost * Log;
@@ -4234,7 +4236,7 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
   // Cap the cost at a value linear to the number of elements in the vector.
   // The default lowering is to use the stack.  The vector store + scalar loads
   // is linear in VL.  However, at high lmuls vslide1down and vslidedown end up
-  // being (at least) linear in LMUL.  As a result, using the vslidedown
+  // being (at least) linear in LMUL.  As a resultdedown
   // lowering for every element ends up being VL*LMUL..
   // TODO: Should we be directly costing the stack alternative?  Doing so might
   // give us a more accurate upper bound.
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 4f6c9a0229d51..1dac5aef71ec0 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -490,7 +490,7 @@ def TENSTORRENT_ASCALON_D8 : RISCVProcessorModel<"tt-ascalon-d8",
                                                   FeatureUnalignedScalarMem,
                                                   FeatureUnalignedVectorMem]),
                                                  [TuneNoDefaultUnroll,
-                                                  TuneFastVRGather,
+                                                  TuneNLogNVRGather,
                                                   TuneOptimizedZeroStrideLoad,
                                                   TunePostRAScheduler]>;
 
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 87d508c394173..2919cb6f739c3 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -84,11 +84,16 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
     VentanaVeyron,
     MIPSP8700,
   };
+  enum RISCVVRGatherCostModelEnum : uint8_t {
+    Quadratic,
+    NLog2N,
+  };
   // clang-format on
 private:
   virtual void anchor();
 
   RISCVProcFamilyEnum RISCVProcFamily = Others;
+  RISCVVRGatherCostModelEnum RISCVVRGatherCostModel = Quadratic;
 
 #define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
   bool ATTRIBUTE = DEFAULT;
@@ -155,6 +160,8 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
   /// initializeProperties().
   RISCVProcFamilyEnum getProcFamily() const { return RISCVProcFamily; }
 
+  RISCVVRGatherCostModelEnum getVRGatherCostModel() const { return RISCVVRGatherCostModel; }
+
 #define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
   bool GETTER() const { return ATTRIBUTE; }
 #include "RISCVGenSubtargetInfo.inc"
diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll
index f6ddc9ff92b0e..c4f6f3f1d874b 100644
--- a/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll
@@ -3,8 +3,8 @@
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s
 ; RUN: opt < %s -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s --check-prefix=SIZE
 ; RUN: opt < %s -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s --check-prefix=SIZE
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh,+fast-vrgather | FileCheck %s --check-prefix=FAST-VRG
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin,+fast-vrgather | FileCheck %s --check-prefix=FAST-VRG
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh,+log-vrgather | FileCheck %s --check-prefix=LOG-VRG
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin,+log-vrgather | FileCheck %s --check-prefix=LOG-VRG
 ; Check that we don't crash querying costs when vectors are not enabled.
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv32
 
@@ -47,23 +47,23 @@ define void @general_permute_single_source() {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; FAST-VRG-LABEL: 'general_permute_single_source'
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 9, i32 6, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 5, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 11, i32 11, i32 11, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 4, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 12, i32 12, i32 12, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 1>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; LOG-VRG-LABEL: 'general_permute_single_source'
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 9, i32 6, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 5, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 11, i32 11, i32 11, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 4, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 12, i32 12, i32 12, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 1>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
   %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
@@ -154,36 +154,36 @@ define void @general_permute_two_source() {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16double = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
-; FAST-VRG-LABEL: 'general_permute_two_source'
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 3, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v2half = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 3, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v4half = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8half = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v16half = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v2float = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 3, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v4float = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8float = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16float = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v2double = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 3, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v4double = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v8double = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %v16double = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; FAST-VRG-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; LOG-VRG-LABEL: 'general_permute_two_source'
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 3, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2half = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 3, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4half = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v8half = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v16half = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2float = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 3, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v4float = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8float = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16float = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v2double = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 3, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v4double = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v8double = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %v16double = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; LOG-VRG-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 0>
   %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>



More information about the llvm-commits mailing list