[llvm] [X86] Declare 128/256-bit funnel shifts legal on VBMI2 + NOVLX targets (PR #184634)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 5 01:20:03 PST 2026
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/184634
>From 4a479214c780ca09c7867279f41c881971e2213b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 4 Mar 2026 15:31:56 +0000
Subject: [PATCH] [X86] Declare 128/256-bit funnel shifts legal on VBMI2 +
NOVLX targets
Similar to what we do for 128/256-bit AVX512F rotates already — and I've taken the opportunity to create similar macros to reduce duplication in the rotate patterns
Another step towards #184002
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 14 +-
llvm/lib/Target/X86/X86InstrAVX512.td | 164 ++++++++++--------------
2 files changed, 69 insertions(+), 109 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6384c4d58a480..7df9fdbd3074f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2157,8 +2157,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
MVT::v4i64}) {
- setOperationAction(ISD::FSHL, VT, Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::FSHR, VT, Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::FSHL, VT, Legal);
+ setOperationAction(ISD::FSHR, VT, Legal);
}
}
@@ -31736,16 +31736,6 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
APInt APIntShiftAmt;
bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
unsigned NumElts = VT.getVectorNumElements();
-
- // For non-VLX VBMI2 targets, widen 128/256-bit to 512-bit so
- // the rest of the lowering/isel can select the VBMI2 forms.
- // Only Custom types (v8i16, v4i32, v2i64, v16i16, v8i32, v4i64) can
- // reach LowerFunnelShift with VBMI2 but no VLX, so no type check needed.
- if (Subtarget.hasVBMI2() && !Subtarget.hasVLX() && EltSizeInBits > 8) {
- return getAVX512Node(IsFSHR ? ISD::FSHR : ISD::FSHL, DL, VT,
- {Op0, Op1, Amt}, DAG, Subtarget);
- }
-
assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index df0d614a0251f..3b456aee7fceb 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -6261,107 +6261,35 @@ defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", X86vshlv, [HasBWI
defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", X86vsrav, [HasBWI, NoVLX]>;
defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", X86vsrlv, [HasBWI, NoVLX]>;
-
-// Use 512bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
-let Predicates = [HasAVX512, NoVLX] in {
- def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
- (EXTRACT_SUBREG (v8i64
- (VPROLVQZrr
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))),
- sub_xmm)>;
- def : Pat<(v4i64 (rotl (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
- (EXTRACT_SUBREG (v8i64
- (VPROLVQZrr
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))),
- sub_ymm)>;
-
- def : Pat<(v4i32 (rotl (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))),
- (EXTRACT_SUBREG (v16i32
- (VPROLVDZrr
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))),
- sub_xmm)>;
- def : Pat<(v8i32 (rotl (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
- (EXTRACT_SUBREG (v16i32
- (VPROLVDZrr
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))),
- sub_ymm)>;
-
- def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 timm:$src2))),
- (EXTRACT_SUBREG (v8i64
- (VPROLQZri
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- timm:$src2)), sub_xmm)>;
- def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 timm:$src2))),
- (EXTRACT_SUBREG (v8i64
- (VPROLQZri
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- timm:$src2)), sub_ymm)>;
-
- def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 timm:$src2))),
- (EXTRACT_SUBREG (v16i32
- (VPROLDZri
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- timm:$src2)), sub_xmm)>;
- def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 timm:$src2))),
- (EXTRACT_SUBREG (v16i32
- (VPROLDZri
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- timm:$src2)), sub_ymm)>;
+// Use 512bit VPROL/VPROR version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
+multiclass avx512_rotate_novlx<X86VectorVTInfo Dst, X86VectorVTInfo Src,
+ Instruction InstrStrRI, SDNode OpNodeRI,
+ Instruction InstrStrRR, SDNode OpNodeRR> {
+ def : Pat<(Src.VT (OpNodeRI (Src.VT Src.RC:$src1), (i8 timm:$src2))),
+ (EXTRACT_SUBREG (Dst.VT
+ (InstrStrRI
+ (Dst.VT (INSERT_SUBREG (IMPLICIT_DEF), Src.RC:$src1, Src.SubRegIdx)),
+ timm:$src2)),
+ Src.SubRegIdx)>;
+
+ def : Pat<(Src.VT (OpNodeRR (Src.VT Src.RC:$src1), (Src.VT Src.RC:$src2))),
+ (EXTRACT_SUBREG (Dst.VT
+ (InstrStrRR
+ (Dst.VT (INSERT_SUBREG (IMPLICIT_DEF), Src.RC:$src1, Src.SubRegIdx)),
+ (Dst.VT (INSERT_SUBREG (IMPLICIT_DEF), Src.RC:$src2, Src.SubRegIdx)))),
+ Src.SubRegIdx)>;
}
-// Use 512bit VPROR/VPRORI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
let Predicates = [HasAVX512, NoVLX] in {
- def : Pat<(v2i64 (rotr (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
- (EXTRACT_SUBREG (v8i64
- (VPRORVQZrr
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))),
- sub_xmm)>;
- def : Pat<(v4i64 (rotr (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
- (EXTRACT_SUBREG (v8i64
- (VPRORVQZrr
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))),
- sub_ymm)>;
-
- def : Pat<(v4i32 (rotr (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))),
- (EXTRACT_SUBREG (v16i32
- (VPRORVDZrr
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))),
- sub_xmm)>;
- def : Pat<(v8i32 (rotr (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
- (EXTRACT_SUBREG (v16i32
- (VPRORVDZrr
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))),
- sub_ymm)>;
-
- def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 timm:$src2))),
- (EXTRACT_SUBREG (v8i64
- (VPRORQZri
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- timm:$src2)), sub_xmm)>;
- def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 timm:$src2))),
- (EXTRACT_SUBREG (v8i64
- (VPRORQZri
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- timm:$src2)), sub_ymm)>;
+ defm : avx512_rotate_novlx<v8i64_info, v4i64x_info, VPROLQZri, X86vrotli, VPROLVQZrr, rotl>;
+ defm : avx512_rotate_novlx<v8i64_info, v4i64x_info, VPRORQZri, X86vrotri, VPRORVQZrr, rotr>;
+ defm : avx512_rotate_novlx<v8i64_info, v2i64x_info, VPROLQZri, X86vrotli, VPROLVQZrr, rotl>;
+ defm : avx512_rotate_novlx<v8i64_info, v2i64x_info, VPRORQZri, X86vrotri, VPRORVQZrr, rotr>;
- def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 timm:$src2))),
- (EXTRACT_SUBREG (v16i32
- (VPRORDZri
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- timm:$src2)), sub_xmm)>;
- def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 timm:$src2))),
- (EXTRACT_SUBREG (v16i32
- (VPRORDZri
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- timm:$src2)), sub_ymm)>;
+ defm : avx512_rotate_novlx<v16i32_info, v8i32x_info, VPROLDZri, X86vrotli, VPROLVDZrr, rotl>;
+ defm : avx512_rotate_novlx<v16i32_info, v8i32x_info, VPRORDZri, X86vrotri, VPRORVDZrr, rotr>;
+ defm : avx512_rotate_novlx<v16i32_info, v4i32x_info, VPROLDZri, X86vrotli, VPROLVDZrr, rotl>;
+ defm : avx512_rotate_novlx<v16i32_info, v4i32x_info, VPRORDZri, X86vrotri, VPRORVDZrr, rotr>;
}
//===-------------------------------------------------------------------===//
@@ -12525,6 +12453,48 @@ defm VPEXPANDB : expand_by_elt_width <0x62, "vpexpandb", WriteVarShuffle256,
defm VPEXPANDW : expand_by_elt_width <0x62, "vpexpandw", WriteVarShuffle256,
avx512vl_i16_info, HasVBMI2>, EVEX, REX_W;
+// Use 512bit VPSHLD/VPSHRD version to implement 128/256 bit in case NoVLX.
+multiclass vbmi2_funnel_novlx<X86VectorVTInfo Dst, X86VectorVTInfo Src,
+ Instruction InstrStrRI, SDNode OpNodeRI,
+ Instruction InstrStrRR, SDNode OpNodeRR, bit SwapLR> {
+ def : Pat<(Src.VT (OpNodeRI (Src.VT Src.RC:$src1), (Src.VT Src.RC:$src2), (i8 timm:$src3))),
+ (EXTRACT_SUBREG
+ (InstrStrRI
+ (INSERT_SUBREG (Dst.VT (IMPLICIT_DEF)), Src.RC:$src1, Src.SubRegIdx),
+ (INSERT_SUBREG (Dst.VT (IMPLICIT_DEF)), Src.RC:$src2, Src.SubRegIdx),
+ timm:$src3),
+ Src.SubRegIdx)>;
+
+ def : Pat<(Src.VT (OpNodeRR (Src.VT Src.RC:$src1), (Src.VT Src.RC:$src2), (Src.VT Src.RC:$src3))),
+ (EXTRACT_SUBREG
+ (InstrStrRR
+ !if(SwapLR,
+ (INSERT_SUBREG (Dst.VT (IMPLICIT_DEF)), Src.RC:$src2, Src.SubRegIdx),
+ (INSERT_SUBREG (Dst.VT (IMPLICIT_DEF)), Src.RC:$src1, Src.SubRegIdx)),
+ !if(SwapLR,
+ (INSERT_SUBREG (Dst.VT (IMPLICIT_DEF)), Src.RC:$src1, Src.SubRegIdx),
+ (INSERT_SUBREG (Dst.VT (IMPLICIT_DEF)), Src.RC:$src2, Src.SubRegIdx)),
+ (INSERT_SUBREG (Dst.VT (IMPLICIT_DEF)), Src.RC:$src3, Src.SubRegIdx)),
+ Src.SubRegIdx)>;
+}
+
+let Predicates = [HasVBMI2, NoVLX] in {
+ defm : vbmi2_funnel_novlx<v8i64_info, v4i64x_info, VPSHLDQZrri, X86VShld, VPSHLDVQZr, fshl, 0>;
+ defm : vbmi2_funnel_novlx<v8i64_info, v4i64x_info, VPSHRDQZrri, X86VShrd, VPSHRDVQZr, fshr, 1>;
+ defm : vbmi2_funnel_novlx<v8i64_info, v2i64x_info, VPSHLDQZrri, X86VShld, VPSHLDVQZr, fshl, 0>;
+ defm : vbmi2_funnel_novlx<v8i64_info, v2i64x_info, VPSHRDQZrri, X86VShrd, VPSHRDVQZr, fshr, 1>;
+
+ defm : vbmi2_funnel_novlx<v16i32_info, v8i32x_info, VPSHLDDZrri, X86VShld, VPSHLDVDZr, fshl, 0>;
+ defm : vbmi2_funnel_novlx<v16i32_info, v8i32x_info, VPSHRDDZrri, X86VShrd, VPSHRDVDZr, fshr, 1>;
+ defm : vbmi2_funnel_novlx<v16i32_info, v4i32x_info, VPSHLDDZrri, X86VShld, VPSHLDVDZr, fshl, 0>;
+ defm : vbmi2_funnel_novlx<v16i32_info, v4i32x_info, VPSHRDDZrri, X86VShrd, VPSHRDVDZr, fshr, 1>;
+
+ defm : vbmi2_funnel_novlx<v32i16_info, v16i16x_info, VPSHLDWZrri, X86VShld, VPSHLDVWZr, fshl, 0>;
+ defm : vbmi2_funnel_novlx<v32i16_info, v16i16x_info, VPSHRDWZrri, X86VShrd, VPSHRDVWZr, fshr, 1>;
+ defm : vbmi2_funnel_novlx<v32i16_info, v8i16x_info, VPSHLDWZrri, X86VShld, VPSHLDVWZr, fshl, 0>;
+ defm : vbmi2_funnel_novlx<v32i16_info, v8i16x_info, VPSHRDWZrri, X86VShrd, VPSHRDVWZr, fshr, 1>;
+}
+
//===----------------------------------------------------------------------===//
// VNNI
//===----------------------------------------------------------------------===//
More information about the llvm-commits
mailing list