[llvm] [X86][AVX] Prefer VPSRAV to VPSRA style shifts for known splats #39424 (PR #87913)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 8 03:50:15 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: None (SahilPatidar)
Changes:
Resolves #39424
---
Patch is 259.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/87913.diff
31 Files Affected:
- (modified) llvm/lib/Target/X86/X86.td (+6-1)
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+4-2)
- (modified) llvm/test/CodeGen/X86/avx2-vector-shifts.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/vector-fshl-128.ll (+67-35)
- (modified) llvm/test/CodeGen/X86/vector-fshl-256.ll (+49-40)
- (modified) llvm/test/CodeGen/X86/vector-fshl-512.ll (+36-30)
- (modified) llvm/test/CodeGen/X86/vector-fshl-rot-128.ll (+38-21)
- (modified) llvm/test/CodeGen/X86/vector-fshl-rot-256.ll (+22-18)
- (modified) llvm/test/CodeGen/X86/vector-fshl-rot-512.ll (+12-10)
- (modified) llvm/test/CodeGen/X86/vector-fshr-128.ll (+66-34)
- (modified) llvm/test/CodeGen/X86/vector-fshr-256.ll (+49-40)
- (modified) llvm/test/CodeGen/X86/vector-fshr-512.ll (+36-30)
- (modified) llvm/test/CodeGen/X86/vector-fshr-rot-128.ll (+34-17)
- (modified) llvm/test/CodeGen/X86/vector-fshr-rot-256.ll (+20-16)
- (modified) llvm/test/CodeGen/X86/vector-fshr-rot-512.ll (+12-10)
- (modified) llvm/test/CodeGen/X86/vector-rotate-128.ll (+32-18)
- (modified) llvm/test/CodeGen/X86/vector-rotate-256.ll (+21-17)
- (modified) llvm/test/CodeGen/X86/vector-rotate-512.ll (+12-10)
- (modified) llvm/test/CodeGen/X86/vector-shift-ashr-128.ll (+158-77)
- (modified) llvm/test/CodeGen/X86/vector-shift-ashr-256.ll (+143-79)
- (modified) llvm/test/CodeGen/X86/vector-shift-ashr-512.ll (+19-11)
- (modified) llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll (+112-58)
- (modified) llvm/test/CodeGen/X86/vector-shift-lshr-128.ll (+187-86)
- (modified) llvm/test/CodeGen/X86/vector-shift-lshr-256.ll (+119-63)
- (modified) llvm/test/CodeGen/X86/vector-shift-lshr-512.ll (+16-10)
- (modified) llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll (+112-58)
- (modified) llvm/test/CodeGen/X86/vector-shift-shl-128.ll (+187-86)
- (modified) llvm/test/CodeGen/X86/vector-shift-shl-256.ll (+119-63)
- (modified) llvm/test/CodeGen/X86/vector-shift-shl-512.ll (+16-10)
- (modified) llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll (+112-58)
- (modified) llvm/test/CodeGen/X86/vselect-avx.ll (+8-8)
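In short, the patch adds a `TuningPreferPerEltVectorShift` subtarget tuning flag (enabled in the SKX feature list) and guards the `LowerShiftByScalarVariable` fallback in `LowerShift`, so targets with fast per-element variable shifts keep the VPSLLV/VPSRLV/VPSRAV form for splatted shift amounts. A minimal LLVM IR reproducer in the spirit of the updated `avx2-vector-shifts.ll` test is shown below (the function name is illustrative, not taken from the patch):

```llvm
; Shift of a constant vector by a splatted variable amount. With AVX2 the
; updated lowering keeps this as vpsllvd (per-element variable shift) instead
; of moving the amount into an xmm register for vpslld, matching the CHECK
; lines in the modified avx2-vector-shifts.ll test.
define <8 x i32> @shl_by_splat(i32 %shift) {
  %head = insertelement <8 x i32> undef, i32 %shift, i32 0
  %amt = shufflevector <8 x i32> %head, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = shl <8 x i32> <i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199>, %amt
  ret <8 x i32> %res
}
```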
``````````diff
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 78bc043911f2fc..a9f3ae1f847552 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -599,6 +599,10 @@ def TuningPreferShiftShuffle : SubtargetFeature<"faster-shift-than-shuffle",
"PreferLowerShuffleAsShift", "true",
"Shifts are faster (or as fast) as shuffle">;
+def TuningPreferPerEltVectorShift : SubtargetFeature<"tuning-fast-per-element-vector-shift",
+ "PreferPerEltVectorShift", "true",
+ "Vector per element shifts are faster (1/cycle latency)">;
+
def TuningFastImmVectorShift : SubtargetFeature<"tuning-fast-imm-vector-shift",
"FastImmVectorShift", "true",
"Vector shifts are fast (2/cycle) as opposed to slow (1/cycle)">;
@@ -996,7 +1000,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
- TuningFastImmVectorShift];
+ TuningFastImmVectorShift,
+ TuningPreferPerEltVectorShift];
list<SubtargetFeature> SKXFeatures =
!listconcat(BDWFeatures, SKXAdditionalFeatures);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b9a87f9024c7de..2d0bd115d7b030 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29318,8 +29318,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
return V;
- if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
- return V;
+ if (!supportedVectorVarShift(VT, Subtarget, Opc) &&
+ !Subtarget.preferPerEltVectorShift())
+ if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
+ return V;
if (supportedVectorVarShift(VT, Subtarget, Opc))
return Op;
diff --git a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
index 983c69d1a1c2e8..6ad8106bba6d4f 100644
--- a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
+++ b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
@@ -61,14 +61,14 @@ define <8 x i32> @test_vpslld_var(i32 %shift) {
; X86: # %bb.0:
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpmovzxbd {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
-; X86-NEXT: vpslld %xmm0, %ymm1, %ymm0
+; X86-NEXT: vpsllvd %ymm0, %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_vpslld_var:
; X64: # %bb.0:
; X64-NEXT: vmovd %edi, %xmm0
; X64-NEXT: vpmovzxbd {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
-; X64-NEXT: vpslld %xmm0, %ymm1, %ymm0
+; X64-NEXT: vpsllvd %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
%amt = insertelement <8 x i32> undef, i32 %shift, i32 0
%tmp = shl <8 x i32> <i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199>, %amt
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index 1addedf3c3d960..577a86dff54e96 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -992,47 +992,62 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: splatvar_funnnel_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
-; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm4
-; AVX-NEXT: vpsrlq $1, %xmm1, %xmm1
-; AVX-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: splatvar_funnnel_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
+; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splatvar_funnnel_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
+; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v2i64:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm1
-; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v2i64:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vpsrlq $1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v2i64:
; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT: vpsrlq $1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
@@ -1048,12 +1063,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
;
; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlq $1, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
@@ -1063,16 +1079,28 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
; AVX512VLVBMI2-NEXT: vpshldvq %xmm2, %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: retq
;
-; XOP-LABEL: splatvar_funnnel_v2i64:
-; XOP: # %bb.0:
-; XOP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
-; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4
-; XOP-NEXT: vpsrlq $1, %xmm1, %xmm1
-; XOP-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0
-; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
+; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: splatvar_funnnel_v2i64:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastq %xmm2, %xmm2
+; XOPAVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
+; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpsrlq $1, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
+; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; XOPAVX2-NEXT: vpsllvq %xmm2, %xmm0, %xmm0
+; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
;
; X86-SSE2-LABEL: splatvar_funnnel_v2i64:
; X86-SSE2: # %bb.0:
@@ -1255,13 +1283,16 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX512BW-LABEL: splatvar_funnnel_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0]
-; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16:
@@ -1276,12 +1307,13 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %
;
; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0]
+; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpsllvw %xmm2, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index ebcb1cb15a600e..e11f26e10b0ea6 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -778,45 +778,49 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
;
; AVX2-LABEL: splatvar_funnnel_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
-; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatvar_funnnel_v4i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
-; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4
; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpsllq %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v4i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
-; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4
; AVX512VL-NEXT: vpsrlq $1, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsllq %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v4i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
-; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT: vpbroadcastq %xmm2, %ymm2
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm4
; AVX512BW-NEXT: vpsrlq $1, %ymm1, %ymm1
-; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllq %xmm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
@@ -831,12 +835,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
;
; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
-; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %ymm2
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm4
; AVX512VLBW-NEXT: vpsrlq $1, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsllq %xmm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
@@ -866,12 +871,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %
;
; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
-; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; XOPAVX2-NEXT: vpbroadcastq %xmm2, %ymm2
+; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
+; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4
; XOPAVX2-NEXT: vpsrlq $1, %ymm1, %ymm1
-; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0
+; XOPAVX2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
+; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; XOPAVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
%splat = shufflevector <4 x i64> %amt, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -1049,12 +1055,14 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
;
; AVX512BW-LABEL: splatvar_funnnel_v16i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
-; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vpbroadcastw %xmm2, %ymm2
; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
@@ -1069,12 +1077,13 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
;
; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
-; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %ymm2
+; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm4
; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
+; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsllvw %ymm2, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll
index e23855361e57a2..fe8af191d0ff40 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll
@@ -426,34 +426,37 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v8i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
-; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpandnq %zmm3, %zmm2, %zmm4
; AVX512F-NEXT: vpsrlq $1, %zmm1, %zmm1
-; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpsllq %xmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
+; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v8i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
-; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
+; AVX512VL-NEXT: vpandnq %zmm3, %zmm2, %zmm4
; AVX512VL-NEXT: vpsrlq $1, %zmm1, %zmm1
-; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
-; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpsllq %xmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
+; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [63,63]
-; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4
+; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm2
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
+; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm4
; AVX512BW-NEXT: vpsrlq $1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsllq %xmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
@@ -465,12 +468,13 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
;
; AVX512VLBW-LABEL: splatvar_funnnel_v8i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,6...
[truncated]
``````````
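To try the new tuning locally, something like the following llc invocation should work (a sketch, not from the PR: the attribute string is the `SubtargetFeature` name from the X86.td hunk above, and the file name refers to the reproducer sketched earlier):

```sh
# Assumed invocation: -mattr names follow the SubtargetFeature strings in X86.td.
llc -mtriple=x86_64-unknown-unknown \
    -mattr=+avx2,+tuning-fast-per-element-vector-shift \
    shl_by_splat.ll -o -
```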
https://github.com/llvm/llvm-project/pull/87913
More information about the llvm-commits mailing list