[llvm] [WIP][RISCV] Tune flag for fast vrgather.vv (PR #124664)

Mon Jan 27 16:53:40 PST 2025

https://github.com/ppenzin created https://github.com/llvm/llvm-project/pull/124664

WIP, _for review_

Add a tune knob that models the cost of vrgather.vv as N*log2(N), where N is the LMUL cost, instead of the default quadratic cost.

>From f048b525d82757f5acadb68385a5c52680606396 Mon Sep 17 00:00:00 2001
From: Petr Penzin <ppenzin at tenstorrent.com>
Date: Mon, 27 Jan 2025 18:47:59 -0600
Subject: [PATCH] [RISCV] Tune flag for fast vrgather.vv

Add tune knob for N*Log2(N) vrgather.vv cost.
---
 llvm/lib/Target/RISCV/RISCVFeatures.td      | 4 ++++
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 5 +++++
 llvm/lib/Target/RISCV/RISCVProcessors.td    | 1 +
 llvm/test/CodeGen/RISCV/features-info.ll    | 1 +
 4 files changed, 11 insertions(+)

diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 4119dd77804f1a..966c56185b3fd9 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1365,6 +1365,10 @@ def FeatureUnalignedVectorMem
                       "true", "Has reasonably performant unaligned vector "
                       "loads and stores">;
 
+def TuneFastVRGather // Tune knob for the N*log2(N) vrgather.vv cost model.
+   : SubtargetFeature<"fast-vrgather", "HasFastVRGather",
+                      "true", "Has vrgather.vv with LMUL*log2(LMUL) latency">;
+
 def TunePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
     "UsePostRAScheduler", "true", "Schedule again after register allocation">;
 
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 5e5bc0819a10cc..e49c1e7ce9edbc 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2848,6 +2848,11 @@ InstructionCost RISCVTargetLowering::getLMULCost(MVT VT) const {
 /// is generally quadratic in the number of vreg implied by LMUL.  Note that
 /// operand (index and possibly mask) are handled separately.
 InstructionCost RISCVTargetLowering::getVRGatherVVCost(MVT VT) const {
+  auto LMULCost = getLMULCost(VT);
+  // Fast N*log2(N) model; log2 == 0 falls through so the cost is never zero.
+  if (Subtarget.hasFastVRGather() && LMULCost.isValid())
+    if (unsigned Log = Log2_64(*LMULCost.getValue()))
+      return LMULCost * Log;
   return getLMULCost(VT) * getLMULCost(VT);
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 6dfed7ddeb9f63..4f6c9a0229d51b 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -490,6 +490,7 @@ def TENSTORRENT_ASCALON_D8 : RISCVProcessorModel<"tt-ascalon-d8",
                                                   FeatureUnalignedScalarMem,
                                                   FeatureUnalignedVectorMem]),
                                                  [TuneNoDefaultUnroll,
+                                                  TuneFastVRGather,
                                                   TuneOptimizedZeroStrideLoad,
                                                   TunePostRAScheduler]>;
 
diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll
index 70fbda47a14a14..dab9bf92cef17d 100644
--- a/llvm/test/CodeGen/RISCV/features-info.ll
+++ b/llvm/test/CodeGen/RISCV/features-info.ll
@@ -31,6 +31,7 @@
 ; CHECK:   experimental-zvbc32e             - 'Zvbc32e' (Vector Carryless Multiplication with 32-bits elements).
 ; CHECK:   experimental-zvkgs               - 'Zvkgs' (Vector-Scalar GCM instructions for Cryptography).
 ; CHECK:   f                                - 'F' (Single-Precision Floating-Point).
+; CHECK:   fast-vrgather                    - Has vrgather.vv with LMUL*log2(LMUL) latency
 ; CHECK:   forced-atomics                   - Assume that lock-free native-width atomics are available.
 ; CHECK:   h                                - 'H' (Hypervisor).
 ; CHECK:   i                                - 'I' (Base Integer Instruction Set).