[llvm] [X86][VBMI2] Try to lower shuffle as VSHLDI instructions (PR #145324)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 23 06:12:55 PDT 2025
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/145324
From 6fb473f884ba57a8a53bcf655627060963454bfd Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 23 Jun 2025 14:03:59 +0100
Subject: [PATCH] [X86][VBMI2] Try to lower shuffle as VSHLDI instructions
Fixes #145276
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 77 +++++++++++++++++++++++++
1 file changed, 77 insertions(+)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2541182de1208..9726b26d18916 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12096,6 +12096,38 @@ static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
return DAG.getBitcast(VT, Res);
}
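+
+/// Try to lower a vector shuffle as a VBMI2 VSHLD funnel shift.
+///
+/// Within each widened lane this matches masks that take the top elements
+/// of V2's lane followed by the low elements of V1's lane, i.e. the result
+/// of shifting the concatenation (V1:V2) left by a whole number of scalar
+/// elements.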
+static SDValue lowerShuffleAsVSHLD(const SDLoc &DL, MVT VT, SDValue V1,
+                                   SDValue V2, ArrayRef<int> Mask,
+                                   const X86Subtarget &Subtarget,
+                                   SelectionDAG &DAG) {
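+  // VSHLD is a VBMI2 instruction; the 128-bit and 256-bit forms also
+  // require AVX512VL.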
+  if (!Subtarget.hasVBMI2())
+    return SDValue();
+  if (!Subtarget.hasVLX() && !VT.is512BitVector())
+    return SDValue();
+
+  unsigned ScalarSizeInBits = VT.getScalarSizeInBits();
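+  // Check each widened element size from 2x the scalar size up to 64 bits
+  // for a repeating lane pattern that a funnel shift can produce.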
+  for (int Scale = 2; Scale * ScalarSizeInBits <= 64; Scale *= 2) {
+    unsigned LaneSize = Scale * ScalarSizeInBits;
+    SmallVector<int, 8> RepeatedMask;
+    if (isRepeatedShuffleMask(LaneSize, VT, Mask, RepeatedMask)) {
+      for (int Shift = 1; Shift != Scale; ++Shift) {
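+        // Shifting (V1:V2) left by Shift elements makes the low Shift
+        // elements of each result lane the top Shift elements of V2's
+        // lane, followed by the low Scale - Shift elements of V1's lane.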
+        if (isSequentialOrUndefInRange(RepeatedMask, 0, Shift,
+                                       (2 * Scale) - Shift) &&
+            isSequentialOrUndefInRange(RepeatedMask, Shift, Scale - Shift, 0)) {
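+          // Matched: emit the funnel shift on the widened element type and
+          // bitcast back to the original type.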
+          MVT ShiftVT = MVT::getIntegerVT(LaneSize);
+          ShiftVT = MVT::getVectorVT(ShiftVT, VT.getSizeInBits() / LaneSize);
+          return DAG.getBitcast(
+              VT, DAG.getNode(X86ISD::VSHLD, DL, ShiftVT, V1, V2,
+                              DAG.getTargetConstant(Shift * ScalarSizeInBits,
+                                                    DL, MVT::i8)));
+        }
+      }
+    }
+  }
+
+  return SDValue();
+}
+
/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
@@ -13789,6 +13821,11 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Rotate;
}
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
// Assume that a single SHUFPS is faster than an alternative sequence of
// multiple instructions (even if the CPU has a domain penalty).
// If some CPU is harmed by the domain switch, we can fix it in a later pass.
@@ -14507,6 +14544,11 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
if (SDValue BitBlend =
lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
return BitBlend;
@@ -14702,6 +14744,11 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V =
lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
@@ -16861,6 +16908,11 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -16955,6 +17007,11 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -17078,6 +17135,11 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
// Try to use bit rotation instructions.
if (V2.isUndef())
if (SDValue Rotate =
@@ -17590,6 +17652,11 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
// Assume that a single SHUFPS is faster than using a permv shuffle.
// If some CPU is harmed by the domain switch, we can fix it in a later pass.
if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
@@ -17655,6 +17722,11 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
if (V2.isUndef()) {
// Try to use bit rotation instructions.
if (SDValue Rotate =
@@ -17726,6 +17798,11 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+  // Try to use funnel shift instructions.
+  if (SDValue Funnel =
+          lowerShuffleAsVSHLD(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+    return Funnel;
+
// Try to use bit rotation instructions.
if (V2.isUndef())
if (SDValue Rotate =
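
For illustration, here is a minimal IR sketch (the function name is made up;
it is not taken from the patch's tests) of a shuffle the new lowering can
match. The mask <5,0,7,2> repeats per 64-bit lane, taking the top i32 of
each lane of %b followed by the low i32 of the same lane of %a, so with
+avx512vl,+avx512vbmi2 this should now be able to select a single
VPSHLDQ $32 rather than a longer shuffle sequence:

define <4 x i32> @shuffle_as_vshldq(<4 x i32> %a, <4 x i32> %b) {
  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 0, i32 7, i32 2>
  ret <4 x i32> %r
}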