[llvm-branch-commits] [llvm] release/18.x: [X86][EVEX512] Add `HasEVEX512` when `NoVLX` used for 512-bit patterns (#91106) (PR #91118)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sun May 5 04:52:43 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: None (llvmbot)
<details>
<summary>Changes</summary>
Backport 7963d9a2b3c20561278a85b19e156e013231342c
Requested by: @<!-- -->phoebewang
---
Full diff: https://github.com/llvm/llvm-project/pull/91118.diff
3 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+3-1)
- (modified) llvm/lib/Target/X86/X86InstrAVX512.td (+21-21)
- (added) llvm/test/CodeGen/X86/pr90844.ll (+19)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 71fc6b5047eaa9..c572b27fe401e1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29841,7 +29841,9 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
return R;
// AVX512 implicitly uses modulo rotation amounts.
- if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
+ if ((Subtarget.hasVLX() ||
+ (Subtarget.hasAVX512() && Subtarget.hasEVEX512())) &&
+ 32 <= EltSizeInBits) {
// Attempt to rotate by immediate.
if (IsCstSplat) {
unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index bb5e22c7142793..0564f2167d8eea 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -814,7 +814,7 @@ defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v32f16_info, v16f16x_info,
// A 128-bit extract from bits [255:128] of a 512-bit vector should use a
// smaller extract to enable EVEX->VEX.
-let Predicates = [NoVLX] in {
+let Predicates = [NoVLX, HasEVEX512] in {
def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 2))),
(v2i64 (VEXTRACTI128rr
(v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm)),
@@ -3068,7 +3068,7 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
addr:$src2, (X86cmpm_imm_commute timm:$cc)), Narrow.KRC)>;
}
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPD", v8i32x_info, v16i32_info>;
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUD", v8i32x_info, v16i32_info>;
@@ -3099,7 +3099,7 @@ let Predicates = [HasAVX512, NoVLX] in {
defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPD", v2f64x_info, v8f64_info>;
}
-let Predicates = [HasBWI, NoVLX] in {
+let Predicates = [HasBWI, NoVLX, HasEVEX512] in {
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPB", v32i8x_info, v64i8_info>;
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUB", v32i8x_info, v64i8_info>;
@@ -3493,7 +3493,7 @@ multiclass mask_move_lowering<string InstrStr, X86VectorVTInfo Narrow,
// Patterns for handling v8i1 selects of 256-bit vectors when VLX isn't
// available. Use a 512-bit operation and extract.
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
defm : mask_move_lowering<"VMOVAPSZ", v4f32x_info, v16f32_info>;
defm : mask_move_lowering<"VMOVDQA32Z", v4i32x_info, v16i32_info>;
defm : mask_move_lowering<"VMOVAPSZ", v8f32x_info, v16f32_info>;
@@ -3505,7 +3505,7 @@ let Predicates = [HasAVX512, NoVLX] in {
defm : mask_move_lowering<"VMOVDQA64Z", v4i64x_info, v8i64_info>;
}
-let Predicates = [HasBWI, NoVLX] in {
+let Predicates = [HasBWI, NoVLX, HasEVEX512] in {
defm : mask_move_lowering<"VMOVDQU8Z", v16i8x_info, v64i8_info>;
defm : mask_move_lowering<"VMOVDQU8Z", v32i8x_info, v64i8_info>;
@@ -4998,8 +4998,8 @@ defm VPMINUD : avx512_binop_rm_vl_d<0x3B, "vpminud", umin,
defm VPMINUQ : avx512_binop_rm_vl_q<0x3B, "vpminuq", umin,
SchedWriteVecALU, HasAVX512, 1>, T8;
-// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX.
-let Predicates = [HasDQI, NoVLX] in {
+// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX, HasEVEX512.
+let Predicates = [HasDQI, NoVLX, HasEVEX512] in {
def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
(EXTRACT_SUBREG
(VPMULLQZrr
@@ -5055,7 +5055,7 @@ multiclass avx512_min_max_lowering<string Instr, SDNode OpNode> {
sub_xmm)>;
}
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
defm : avx512_min_max_lowering<"VPMAXUQZ", umax>;
defm : avx512_min_max_lowering<"VPMINUQZ", umin>;
defm : avx512_min_max_lowering<"VPMAXSQZ", smax>;
@@ -6032,7 +6032,7 @@ defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl,
SchedWriteVecShift>;
// Use 512bit VPSRA/VPSRAI version to implement v2i64/v4i64 in case NoVLX.
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
def : Pat<(v4i64 (X86vsra (v4i64 VR256X:$src1), (v2i64 VR128X:$src2))),
(EXTRACT_SUBREG (v8i64
(VPSRAQZrr
@@ -6161,14 +6161,14 @@ defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", X86vsrlv, SchedWriteVarVecS
defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr, SchedWriteVarVecShift>;
defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl, SchedWriteVarVecShift>;
-defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", X86vsrav, [HasAVX512, NoVLX]>;
-defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", X86vshlv, [HasBWI, NoVLX]>;
-defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", X86vsrav, [HasBWI, NoVLX]>;
-defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", X86vsrlv, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", X86vsrav, [HasAVX512, NoVLX, HasEVEX512]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", X86vshlv, [HasBWI, NoVLX, HasEVEX512]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", X86vsrav, [HasBWI, NoVLX, HasEVEX512]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", X86vsrlv, [HasBWI, NoVLX, HasEVEX512]>;
// Use 512bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
(EXTRACT_SUBREG (v8i64
(VPROLVQZrr
@@ -6219,7 +6219,7 @@ let Predicates = [HasAVX512, NoVLX] in {
}
// Use 512bit VPROR/VPRORI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
def : Pat<(v2i64 (rotr (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
(EXTRACT_SUBREG (v8i64
(VPRORVQZrr
@@ -9816,7 +9816,7 @@ defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus,
truncstore_us_vi8, masked_truncstore_us_vi8,
X86vtruncus, X86vmtruncus>;
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
def: Pat<(v8i16 (trunc (v8i32 VR256X:$src))),
(v8i16 (EXTRACT_SUBREG
(v16i16 (VPMOVDWZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
@@ -9827,7 +9827,7 @@ def: Pat<(v4i32 (trunc (v4i64 VR256X:$src))),
VR256X:$src, sub_ymm)))), sub_xmm))>;
}
-let Predicates = [HasBWI, NoVLX] in {
+let Predicates = [HasBWI, NoVLX, HasEVEX512] in {
def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))),
(v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src, sub_ymm))), sub_xmm))>;
@@ -10370,7 +10370,7 @@ multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr,
defm Z128 : convert_vector_to_mask_common<opc, VTInfo.info128, OpcodeStr>,
EVEX_V128;
}
- let Predicates = [prd, NoVLX] in {
+ let Predicates = [prd, NoVLX, HasEVEX512] in {
defm Z256_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info256, NAME>;
defm Z128_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info128, NAME>;
}
@@ -11157,7 +11157,7 @@ defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs,
SchedWriteVecALU>;
// VPABS: Use 512bit version to implement 128/256 bit in case NoVLX.
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
def : Pat<(v4i64 (abs VR256X:$src)),
(EXTRACT_SUBREG
(VPABSQZrr
@@ -11173,7 +11173,7 @@ let Predicates = [HasAVX512, NoVLX] in {
// Use 512bit version to implement 128/256 bit.
multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode,
AVX512VLVectorVTInfo _, Predicate prd> {
- let Predicates = [prd, NoVLX] in {
+ let Predicates = [prd, NoVLX, HasEVEX512] in {
def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1))),
(EXTRACT_SUBREG
(!cast<Instruction>(InstrStr # "Zrr")
@@ -11792,7 +11792,7 @@ let Predicates = [HasAVX512] in {
(VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
}
-let Predicates = [HasAVX512, NoVLX] in {
+let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
def : Pat<(v16i8 (vnot VR128X:$src)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
diff --git a/llvm/test/CodeGen/X86/pr90844.ll b/llvm/test/CodeGen/X86/pr90844.ll
new file mode 100644
index 00000000000000..6feece7f66d877
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr90844.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f,-evex512 < %s | FileCheck %s
+
+define void @PR90844() {
+; CHECK-LABEL: PR90844:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, (%rax)
+; CHECK-NEXT: retq
+entry:
+ %0 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> poison, <2 x i32> poison, <2 x i32> <i32 8, i32 24>)
+ %1 = and <2 x i32> %0, <i32 16711935, i32 -134152448>
+ %2 = or disjoint <2 x i32> zeroinitializer, %1
+ %3 = zext <2 x i32> %2 to <2 x i64>
+ %4 = shl nuw <2 x i64> %3, <i64 32, i64 32>
+ %5 = or disjoint <2 x i64> %4, zeroinitializer
+ store <2 x i64> %5, ptr poison, align 16
+ ret void
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/91118
More information about the llvm-branch-commits
mailing list