[llvm] [X86][VBMI2] Try to lower shuffle as VSHLDI instructions (PR #145324)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 23 06:12:55 PDT 2025
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/145324
From 6fb473f884ba57a8a53bcf655627060963454bfd Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 23 Jun 2025 14:03:59 +0100
Subject: [PATCH] [X86][VBMI2] Try to lower shuffle as VSHLDI instructions
Fixes #145276
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 77 +++++++++++++++++++++++++
1 file changed, 77 insertions(+)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2541182de1208..9726b26d18916 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12096,6 +12096,38 @@ static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
return DAG.getBitcast(VT, Res);
}
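+
+/// Try to lower a vector shuffle as a VBMI2 VSHLD funnel shift.
+///
+/// Within each widened lane this matches masks that take the top elements
+/// of V2's lane followed by the low elements of V1's lane, i.e. the result
+/// of shifting the concatenation (V1:V2) left by a whole number of scalar
+/// elements.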
+static SDValue lowerShuffleAsVSHLD(const SDLoc &DL, MVT VT, SDValue V1,
+                                   SDValue V2, ArrayRef<int> Mask,
+                                   const X86Subtarget &Subtarget,
+                                   SelectionDAG &DAG) {
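+  // VSHLD is a VBMI2 instruction; the 128-bit and 256-bit forms also
+  // require AVX512VL.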
+  if (!Subtarget.hasVBMI2())
+    return SDValue();
+  if (!Subtarget.hasVLX() && !VT.is512BitVector())
+    return SDValue();
+
+  unsigned ScalarSizeInBits = VT.getScalarSizeInBits();
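+  // Check each widened element size from 2x the scalar size up to 64 bits
+  // for a repeating lane pattern that a funnel shift can produce.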
+  for (int Scale = 2; Scale * ScalarSizeInBits <= 64; Scale *= 2) {
+    unsigned LaneSize = Scale * ScalarSizeInBits;
+    SmallVector<int, 8> RepeatedMask;
+    if (isRepeatedShuffleMask(LaneSize, VT, Mask, RepeatedMask)) {
+      for (int Shift = 1; Shift != Scale; ++Shift) {
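+        // Shifting (V1:V2) left by Shift elements makes the low Shift
+        // elements of each result lane the top Shift elements of V2's
+        // lane, followed by the low Scale - Shift elements of V1's lane.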
+        if (isSequentialOrUndefInRange(RepeatedMask, 0, Shift,
+                                       (2 * Scale) - Shift) &&
+            isSequentialOrUndefInRange(RepeatedMask, Shift, Scale - Shift, 0)) {
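+          // Matched: emit the funnel shift on the widened element type and
+          // bitcast back to the original type.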
+          MVT ShiftVT = MVT::getIntegerVT(LaneSize);
+          ShiftVT = MVT::getVectorVT(ShiftVT, VT.getSizeInBits() / LaneSize);
+          return DAG.getBitcast(
+              VT, DAG.getNode(X86ISD::VSHLD, DL, ShiftVT, V1, V2,
+                              DAG.getTargetConstant(Shift * ScalarSizeInBits,
+                                                    DL, MVT::i8)));
+        }
+      }
+    }
+  }
+
+  return SDValue();
+}
+
/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
@@ -13789,6 +13821,11 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Rotate;
}
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
// Assume that a single SHUFPS is faster than an alternative sequence of
// multiple instructions (even if the CPU has a domain penalty).
// If some CPU is harmed by the domain switch, we can fix it in a later pass.
@@ -14507,6 +14544,11 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
if (SDValue BitBlend =
lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
return BitBlend;
@@ -14702,6 +14744,11 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V =
lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
@@ -16861,6 +16908,11 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -16955,6 +17007,11 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -17078,6 +17135,11 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
// Try to use bit rotation instructions.
if (V2.isUndef())
if (SDValue Rotate =
@@ -17590,6 +17652,11 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
// Assume that a single SHUFPS is faster than using a permv shuffle.
// If some CPU is harmed by the domain switch, we can fix it in a later pass.
if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
@@ -17655,6 +17722,11 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
if (V2.isUndef()) {
// Try to use bit rotation instructions.
if (SDValue Rotate =
@@ -17726,6 +17798,11 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
// Try to use bit rotation instructions.
if (V2.isUndef())
if (SDValue Rotate =
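
For illustration, here is a minimal IR sketch (the function name is made up;
it is not taken from the patch's tests) of a shuffle the new lowering can
match. The mask <5,0,7,2> repeats per 64-bit lane, taking the top i32 of
each lane of %b followed by the low i32 of the same lane of %a, so with
+avx512vl,+avx512vbmi2 this should now be able to select a single
VPSHLDQ $32 rather than a longer shuffle sequence:

define <4 x i32> @shuffle_as_vshldq(<4 x i32> %a, <4 x i32> %b) {
  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 0, i32 7, i32 2>
  ret <4 x i32> %r
}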