[llvm] 768182d - [X86] Declare 128/256-bit funnel shifts legal on VBMI2 + NOVLX targets (#184634)
Author: Simon Pilgrim
Date: 2026-03-05T09:53:23Z
New Revision: 768182dff6ec12a91beae27fb2f5c2f20f6d4993
URL: https://github.com/llvm/llvm-project/commit/768182dff6ec12a91beae27fb2f5c2f20f6d4993
DIFF: https://github.com/llvm/llvm-project/commit/768182dff6ec12a91beae27fb2f5c2f20f6d4993.diff
LOG: [X86] Declare 128/256-bit funnel shifts legal on VBMI2 + NOVLX targets (#184634)
Add tablegen patterns to widen 128/256-bit funnel shifts to 512-bit,
similar to what we already do for 128/256-bit AVX512F rotates (and a lot
of other instructions) - and I've taken the opportunity to create
similar multiclasses to reduce duplication in the rotate patterns.
Another step towards #184002
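
For context, the semantics being selected here: a funnel shift concatenates
two elements, shifts the double-width value, and keeps one half, which is
what VPSHLD/VPSHRD compute per element. A minimal scalar sketch of the
32-bit case (illustrative helper names, not LLVM code):

    #include <cstdint>
    #include <cstdio>

    // Scalar reference for 32-bit funnel shifts, matching the per-element
    // behaviour of VPSHLD/VPSHRD: shift the 64-bit concatenation hi:lo and
    // keep one 32-bit half. Shift amounts are taken modulo the width.
    static uint32_t fshl32(uint32_t hi, uint32_t lo, uint32_t amt) {
      amt &= 31;
      return amt ? (hi << amt) | (lo >> (32 - amt)) : hi;
    }
    static uint32_t fshr32(uint32_t hi, uint32_t lo, uint32_t amt) {
      amt &= 31;
      return amt ? (hi << (32 - amt)) | (lo >> amt) : lo;
    }

    int main() {
      printf("%08x\n", fshl32(0x12345678u, 0x9abcdef0u, 8)); // 3456789a
      printf("%08x\n", fshr32(0x12345678u, 0x9abcdef0u, 8)); // 789abcde
    }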
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86InstrAVX512.td
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2612befd71b15..6bb558f4ef6da 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2159,8 +2159,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
MVT::v4i64}) {
- setOperationAction(ISD::FSHL, VT, Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::FSHR, VT, Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::FSHL, VT, Legal);
+ setOperationAction(ISD::FSHR, VT, Legal);
}
}
@@ -31740,16 +31740,6 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
APInt APIntShiftAmt;
bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
unsigned NumElts = VT.getVectorNumElements();
-
- // For non-VLX VBMI2 targets, widen 128/256-bit to 512-bit so
- // the rest of the lowering/isel can select the VBMI2 forms.
- // Only Custom types (v8i16, v4i32, v2i64, v16i16, v8i32, v4i64) can
- // reach LowerFunnelShift with VBMI2 but no VLX, so no type check needed.
- if (Subtarget.hasVBMI2() && !Subtarget.hasVLX() && EltSizeInBits > 8) {
- return getAVX512Node(IsFSHR ? ISD::FSHR : ISD::FSHL, DL, VT,
- {Op0, Op1, Amt}, DAG, Subtarget);
- }
-
assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
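The deleted block above performed the widening in C++ via getAVX512Node;
with the types now declared Legal, the equivalent widen/operate/extract
shape is selected by the new tablegen patterns below. A hedged intrinsics
sketch of what that selection amounts to for a v4i32 fshl, assuming a
VBMI2-without-VLX target (the function name is illustrative):

    #include <immintrin.h>

    // Widen the 128-bit operands into zmm registers (the upper lanes don't
    // matter), use the 512-bit VPSHLDVD, then keep the low 128 bits --
    // mirroring the INSERT_SUBREG/EXTRACT_SUBREG in the patterns below.
    __m128i fshl_v4i32_novlx(__m128i a, __m128i b, __m128i amt) {
      __m512i wa = _mm512_castsi128_si512(a);
      __m512i wb = _mm512_castsi128_si512(b);
      __m512i wamt = _mm512_castsi128_si512(amt);
      return _mm512_castsi512_si128(_mm512_shldv_epi32(wa, wb, wamt));
    }

Compiling such a sketch would need something like -mavx512vbmi2 (a
toolchain assumption).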
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index df0d614a0251f..3b456aee7fceb 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -6261,107 +6261,35 @@ defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", X86vshlv, [HasBWI
defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", X86vsrav, [HasBWI, NoVLX]>;
defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", X86vsrlv, [HasBWI, NoVLX]>;
-
-// Use 512bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
-let Predicates = [HasAVX512, NoVLX] in {
- def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
- (EXTRACT_SUBREG (v8i64
- (VPROLVQZrr
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))),
- sub_xmm)>;
- def : Pat<(v4i64 (rotl (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
- (EXTRACT_SUBREG (v8i64
- (VPROLVQZrr
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))),
- sub_ymm)>;
-
- def : Pat<(v4i32 (rotl (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))),
- (EXTRACT_SUBREG (v16i32
- (VPROLVDZrr
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))),
- sub_xmm)>;
- def : Pat<(v8i32 (rotl (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
- (EXTRACT_SUBREG (v16i32
- (VPROLVDZrr
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))),
- sub_ymm)>;
-
- def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 timm:$src2))),
- (EXTRACT_SUBREG (v8i64
- (VPROLQZri
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- timm:$src2)), sub_xmm)>;
- def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 timm:$src2))),
- (EXTRACT_SUBREG (v8i64
- (VPROLQZri
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- timm:$src2)), sub_ymm)>;
-
- def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 timm:$src2))),
- (EXTRACT_SUBREG (v16i32
- (VPROLDZri
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- timm:$src2)), sub_xmm)>;
- def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 timm:$src2))),
- (EXTRACT_SUBREG (v16i32
- (VPROLDZri
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- timm:$src2)), sub_ymm)>;
+// Use the 512-bit VPROL/VPROR versions to implement v2i64/v4i64 + v4i32/v8i32 in the NoVLX case.
+multiclass avx512_rotate_novlx<X86VectorVTInfo Dst, X86VectorVTInfo Src,
+ Instruction InstrStrRI, SDNode OpNodeRI,
+ Instruction InstrStrRR, SDNode OpNodeRR> {
+ def : Pat<(Src.VT (OpNodeRI (Src.VT Src.RC:$src1), (i8 timm:$src2))),
+ (EXTRACT_SUBREG (Dst.VT
+ (InstrStrRI
+ (Dst.VT (INSERT_SUBREG (IMPLICIT_DEF), Src.RC:$src1, Src.SubRegIdx)),
+ timm:$src2)),
+ Src.SubRegIdx)>;
+
+ def : Pat<(Src.VT (OpNodeRR (Src.VT Src.RC:$src1), (Src.VT Src.RC:$src2))),
+ (EXTRACT_SUBREG (Dst.VT
+ (InstrStrRR
+ (Dst.VT (INSERT_SUBREG (IMPLICIT_DEF), Src.RC:$src1, Src.SubRegIdx)),
+ (Dst.VT (INSERT_SUBREG (IMPLICIT_DEF), Src.RC:$src2, Src.SubRegIdx)))),
+ Src.SubRegIdx)>;
}
-// Use 512bit VPROR/VPRORI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
let Predicates = [HasAVX512, NoVLX] in {
- def : Pat<(v2i64 (rotr (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
- (EXTRACT_SUBREG (v8i64
- (VPRORVQZrr
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))),
- sub_xmm)>;
- def : Pat<(v4i64 (rotr (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
- (EXTRACT_SUBREG (v8i64
- (VPRORVQZrr
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))),
- sub_ymm)>;
-
- def : Pat<(v4i32 (rotr (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))),
- (EXTRACT_SUBREG (v16i32
- (VPRORVDZrr
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))),
- sub_xmm)>;
- def : Pat<(v8i32 (rotr (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
- (EXTRACT_SUBREG (v16i32
- (VPRORVDZrr
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))),
- sub_ymm)>;
-
- def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 timm:$src2))),
- (EXTRACT_SUBREG (v8i64
- (VPRORQZri
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- timm:$src2)), sub_xmm)>;
- def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 timm:$src2))),
- (EXTRACT_SUBREG (v8i64
- (VPRORQZri
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- timm:$src2)), sub_ymm)>;
+ defm : avx512_rotate_novlx<v8i64_info, v4i64x_info, VPROLQZri, X86vrotli, VPROLVQZrr, rotl>;
+ defm : avx512_rotate_novlx<v8i64_info, v4i64x_info, VPRORQZri, X86vrotri, VPRORVQZrr, rotr>;
+ defm : avx512_rotate_novlx<v8i64_info, v2i64x_info, VPROLQZri, X86vrotli, VPROLVQZrr, rotl>;
+ defm : avx512_rotate_novlx<v8i64_info, v2i64x_info, VPRORQZri, X86vrotri, VPRORVQZrr, rotr>;
- def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 timm:$src2))),
- (EXTRACT_SUBREG (v16i32
- (VPRORDZri
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- timm:$src2)), sub_xmm)>;
- def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 timm:$src2))),
- (EXTRACT_SUBREG (v16i32
- (VPRORDZri
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- timm:$src2)), sub_ymm)>;
+ defm : avx512_rotate_novlx<v16i32_info, v8i32x_info, VPROLDZri, X86vrotli, VPROLVDZrr, rotl>;
+ defm : avx512_rotate_novlx<v16i32_info, v8i32x_info, VPRORDZri, X86vrotri, VPRORVDZrr, rotr>;
+ defm : avx512_rotate_novlx<v16i32_info, v4i32x_info, VPROLDZri, X86vrotli, VPROLVDZrr, rotl>;
+ defm : avx512_rotate_novlx<v16i32_info, v4i32x_info, VPRORDZri, X86vrotri, VPRORVDZrr, rotr>;
}
//===-------------------------------------------------------------------===//
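The rotate multiclass above instantiates the same shape; for a v4i32 rotl
on an AVX512F-without-VLX target the selected code is equivalent to this
hedged intrinsics sketch (illustrative name):

    #include <immintrin.h>

    // 512-bit VPROLVD on widened operands, keeping the low 128 bits.
    __m128i rotl_v4i32_novlx(__m128i v, __m128i amt) {
      __m512i wv = _mm512_castsi128_si512(v);
      __m512i wamt = _mm512_castsi128_si512(amt);
      return _mm512_castsi512_si128(_mm512_rolv_epi32(wv, wamt));
    }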
@@ -12525,6 +12453,48 @@ defm VPEXPANDB : expand_by_elt_width <0x62, "vpexpandb", WriteVarShuffle256,
defm VPEXPANDW : expand_by_elt_width <0x62, "vpexpandw", WriteVarShuffle256,
avx512vl_i16_info, HasVBMI2>, EVEX, REX_W;
+// Use the 512-bit VPSHLD/VPSHRD versions to implement the 128/256-bit forms in the NoVLX case.
+multiclass vbmi2_funnel_novlx<X86VectorVTInfo Dst, X86VectorVTInfo Src,
+ Instruction InstrStrRI, SDNode OpNodeRI,
+ Instruction InstrStrRR, SDNode OpNodeRR, bit SwapLR> {
+ def : Pat<(Src.VT (OpNodeRI (Src.VT Src.RC:$src1), (Src.VT Src.RC:$src2), (i8 timm:$src3))),
+ (EXTRACT_SUBREG
+ (InstrStrRI
+ (INSERT_SUBREG (Dst.VT (IMPLICIT_DEF)), Src.RC:$src1, Src.SubRegIdx),
+ (INSERT_SUBREG (Dst.VT (IMPLICIT_DEF)), Src.RC:$src2, Src.SubRegIdx),
+ timm:$src3),
+ Src.SubRegIdx)>;
+
+ def : Pat<(Src.VT (OpNodeRR (Src.VT Src.RC:$src1), (Src.VT Src.RC:$src2), (Src.VT Src.RC:$src3))),
+ (EXTRACT_SUBREG
+ (InstrStrRR
+ !if(SwapLR,
+ (INSERT_SUBREG (Dst.VT (IMPLICIT_DEF)), Src.RC:$src2, Src.SubRegIdx),
+ (INSERT_SUBREG (Dst.VT (IMPLICIT_DEF)), Src.RC:$src1, Src.SubRegIdx)),
+ !if(SwapLR,
+ (INSERT_SUBREG (Dst.VT (IMPLICIT_DEF)), Src.RC:$src1, Src.SubRegIdx),
+ (INSERT_SUBREG (Dst.VT (IMPLICIT_DEF)), Src.RC:$src2, Src.SubRegIdx)),
+ (INSERT_SUBREG (Dst.VT (IMPLICIT_DEF)), Src.RC:$src3, Src.SubRegIdx)),
+ Src.SubRegIdx)>;
+}
+
+let Predicates = [HasVBMI2, NoVLX] in {
+ defm : vbmi2_funnel_novlx<v8i64_info, v4i64x_info, VPSHLDQZrri, X86VShld, VPSHLDVQZr, fshl, 0>;
+ defm : vbmi2_funnel_novlx<v8i64_info, v4i64x_info, VPSHRDQZrri, X86VShrd, VPSHRDVQZr, fshr, 1>;
+ defm : vbmi2_funnel_novlx<v8i64_info, v2i64x_info, VPSHLDQZrri, X86VShld, VPSHLDVQZr, fshl, 0>;
+ defm : vbmi2_funnel_novlx<v8i64_info, v2i64x_info, VPSHRDQZrri, X86VShrd, VPSHRDVQZr, fshr, 1>;
+
+ defm : vbmi2_funnel_novlx<v16i32_info, v8i32x_info, VPSHLDDZrri, X86VShld, VPSHLDVDZr, fshl, 0>;
+ defm : vbmi2_funnel_novlx<v16i32_info, v8i32x_info, VPSHRDDZrri, X86VShrd, VPSHRDVDZr, fshr, 1>;
+ defm : vbmi2_funnel_novlx<v16i32_info, v4i32x_info, VPSHLDDZrri, X86VShld, VPSHLDVDZr, fshl, 0>;
+ defm : vbmi2_funnel_novlx<v16i32_info, v4i32x_info, VPSHRDDZrri, X86VShrd, VPSHRDVDZr, fshr, 1>;
+
+ defm : vbmi2_funnel_novlx<v32i16_info, v16i16x_info, VPSHLDWZrri, X86VShld, VPSHLDVWZr, fshl, 0>;
+ defm : vbmi2_funnel_novlx<v32i16_info, v16i16x_info, VPSHRDWZrri, X86VShrd, VPSHRDVWZr, fshr, 1>;
+ defm : vbmi2_funnel_novlx<v32i16_info, v8i16x_info, VPSHLDWZrri, X86VShld, VPSHLDVWZr, fshl, 0>;
+ defm : vbmi2_funnel_novlx<v32i16_info, v8i16x_info, VPSHRDWZrri, X86VShrd, VPSHRDVWZr, fshr, 1>;
+}
+
//===----------------------------------------------------------------------===//
// VNNI
//===----------------------------------------------------------------------===//
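A note on the SwapLR bit in vbmi2_funnel_novlx: VPSHRDV takes the operand
supplying the low half of the concatenation first, while ISD::FSHR lists
the high half first, so the fshr instantiations pass SwapLR=1 to swap the
first two operands. In hedged intrinsics terms, assuming AVX512VBMI2:

    #include <immintrin.h>

    // fshr(hi, lo, amt) shifts the concatenation hi:lo right and keeps the
    // low element bits; _mm512_shrdv_epi32 (VPSHRDVD) wants the low-half
    // operand first, hence the swapped argument order.
    __m512i fshr_v16i32(__m512i hi, __m512i lo, __m512i amt) {
      return _mm512_shrdv_epi32(lo, hi, amt);
    }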