[llvm] [X86][VBMI2] Try to lower shuffle as VSHLDI instructions (PR #145324)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 23 06:04:43 PDT 2025
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/145324
Fixes #145276
From 77d1e2dd3b0dbc4e9c298f182771f030291a82e8 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 23 Jun 2025 14:03:59 +0100
Subject: [PATCH] [X86][VBMI2] Try to lower shuffle as VSHLDI instructions
Fixes #145276
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 76 +++++++++++++++++++++++++
1 file changed, 76 insertions(+)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2541182de1208..2edae92747f0a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12096,6 +12096,37 @@ static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
return DAG.getBitcast(VT, Res);
}
+static SDValue lowerShuffleAsVSHLD(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ if (!Subtarget.hasVBMI2())
+ return SDValue();
+ if (!Subtarget.hasVLX() && !VT.is512BitVector())
+ return SDValue();
+
+ unsigned ScalarSizeInBits = VT.getScalarSizeInBits();
+ for (int Scale = 2; Scale * ScalarSizeInBits <= 64; Scale *= 2) {
+ unsigned LaneSize = Scale * ScalarSizeInBits;
+ SmallVector<int, 8> RepeatedMask;
+ if (isRepeatedShuffleMask(LaneSize, VT, Mask, RepeatedMask)) {
+ for (int Shift = 1; Shift != Scale; ++Shift) {
+ if (isSequentialOrUndefInRange(RepeatedMask, 0, Shift,
+ (2 * Scale) - Shift) &&
+ isSequentialOrUndefInRange(RepeatedMask, Shift, Scale - Shift, 0)) {
+ MVT ShiftVT = MVT::getIntegerVT(LaneSize);
+ ShiftVT = MVT::getVectorVT(ShiftVT, VT.getSizeInBits() / LaneSize);
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::VSHLD, DL, ShiftVT, V1, V2,
+ DAG.getTargetConstant(8 * Shift, DL, MVT::i8)));
+ }
+ }
+ }
+ }
+
+ return SDValue();
+}
+
/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
@@ -13789,6 +13820,11 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Rotate;
}
+ // Try to use funnel shift instructions.
+ if (SDValue Funnel =
+ lowerShuffleAsVSHLD(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
+ return Funnel;
+
// Assume that a single SHUFPS is faster than an alternative sequence of
// multiple instructions (even if the CPU has a domain penalty).
// If some CPU is harmed by the domain switch, we can fix it in a later pass.
@@ -14507,6 +14543,11 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+ // Try to use funnel shift instructions.
+ if (SDValue Funnel =
+ lowerShuffleAsVSHLD(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+ return Funnel;
+
if (SDValue BitBlend =
lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
return BitBlend;
@@ -14702,6 +14743,11 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+ // Try to use funnel shift instructions.
+ if (SDValue Funnel =
+ lowerShuffleAsVSHLD(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+ return Funnel;
+
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V =
lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
@@ -16861,6 +16907,11 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+ // Try to use funnel shift instructions.
+ if (SDValue Funnel =
+ lowerShuffleAsVSHLD(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+ return Funnel;
+
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -16954,6 +17005,11 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
+
+ // Try to use funnel shift instructions.
+ if (SDValue Funnel =
+ lowerShuffleAsVSHLD(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+ return Funnel;
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
@@ -17078,6 +17134,11 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+ // Try to use funnel shift instructions.
+ if (SDValue Funnel =
+ lowerShuffleAsVSHLD(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+ return Funnel;
+
// Try to use bit rotation instructions.
if (V2.isUndef())
if (SDValue Rotate =
@@ -17590,6 +17651,11 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+ // Try to use funnel shift instructions.
+ if (SDValue Funnel =
+ lowerShuffleAsVSHLD(DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
+ return Funnel;
+
// Assume that a single SHUFPS is faster than using a permv shuffle.
// If some CPU is harmed by the domain switch, we can fix it in a later pass.
if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
@@ -17655,6 +17721,11 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+ // Try to use funnel shift instructions.
+ if (SDValue Funnel =
+ lowerShuffleAsVSHLD(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
+ return Funnel;
+
if (V2.isUndef()) {
// Try to use bit rotation instructions.
if (SDValue Rotate =
@@ -17726,6 +17797,11 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+ // Try to use funnel shift instructions.
+ if (SDValue Funnel =
+ lowerShuffleAsVSHLD(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+ return Funnel;
+
// Try to use bit rotation instructions.
if (V2.isUndef())
if (SDValue Rotate =
More information about the llvm-commits
mailing list