[llvm] [X86] Support X86ISD::VBROADCAST for v4X32/v2X64 types on AVX1 (PR #184188)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 2 09:59:48 PST 2026
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/184188
This is a proxy for a permute node, but it helps simplify some shuffles and reduce dependencies.
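For context, a minimal hand-written sketch (not part of the patch; the function name is illustrative) of the splat shuffles affected. On AVX1 these can now be kept as X86ISD::VBROADCAST from a VR128 register and selected as a single vpermilps/vpshufd, which gives shuffle combining more room to fold away value-specific lowerings like vmovsldup/vmovddup (see the test diffs below):

; Illustrative v4f32 lane-0 splat; with this patch AVX1 can represent it as
; a register broadcast and select VPERMILPSri (i.e. xmm = xmm[0,0,0,0]).
define <4 x float> @splat_v4f32(<4 x float> %v) {
  %s = shufflevector <4 x float> %v, <4 x float> poison, <4 x i32> zeroinitializer
  ret <4 x float> %s
}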
From 62ee80de1deb77a7916909c22335390a1b695ca0 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 8 Nov 2024 18:35:01 +0000
Subject: [PATCH] [X86] Support X86ISD::VBROADCAST for v4X32/v2X64 types on
AVX1
This is a proxy for a permute node, but it helps simplify some shuffles and reduce dependencies.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 4 +++-
llvm/lib/Target/X86/X86InstrSSE.td | 6 ++++++
llvm/test/CodeGen/X86/buildvec-insertvec.ll | 2 +-
llvm/test/CodeGen/X86/matrix-multiply.ll | 2 +-
llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll | 4 ++--
llvm/test/CodeGen/X86/vector-fshl-256.ll | 8 ++++++--
llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll | 8 ++++++--
7 files changed, 25 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2188f6466682b..832843b9e524b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -13511,7 +13511,9 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
? X86ISD::MOVDDUP
: X86ISD::VBROADCAST;
- bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
+ bool BroadcastFromReg =
+ (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2() ||
+ (NumEltBits >= 32 && VT.is128BitVector() && Subtarget.hasAVX());
// Check that the mask is a broadcast.
int BroadcastIdx = getSplatIndex(Mask);
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index f67bc0e74acc8..3c20eeb18f6a4 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -7800,6 +7800,8 @@ let Predicates = [HasAVX, NoVLX] in {
let Predicates = [HasAVX1Only] in {
def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
(VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
+ def : Pat<(v4f32 (X86VBroadcast v4f32:$src)),
+ (VPERMILPSri VR128:$src, 0)>;
def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
(VINSERTF128rri (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
(v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
@@ -7819,6 +7821,8 @@ let Predicates = [HasAVX1Only] in {
def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
(VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
+ def : Pat<(v4i32 (X86VBroadcast v4i32:$src)),
+ (VPSHUFDri VR128:$src, 0)>;
def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
(VINSERTF128rri (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
(v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm),
@@ -7830,6 +7834,8 @@ let Predicates = [HasAVX1Only] in {
def : Pat<(v2i64 (X86VBroadcast i64:$src)),
(VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
+ def : Pat<(v2i64 (X86VBroadcast v2i64:$src)),
+ (VPSHUFDri VR128:$src, 0x44)>;
def : Pat<(v4i64 (X86VBroadcast v2i64:$src)),
(VINSERTF128rri (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
(v2i64 (VPSHUFDri VR128:$src, 0x44)), sub_xmm),
diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
index 4b0e5441b4abf..860008432cc73 100644
--- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
@@ -738,9 +738,9 @@ define void @PR46461(i16 %x, ptr %y) {
; AVX1-LABEL: PR46461:
; AVX1: # %bb.0:
; AVX1-NEXT: movzwl %di, %eax
+; AVX1-NEXT: shrl %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, 32(%rsi)
; AVX1-NEXT: vmovaps %ymm0, (%rsi)
diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
index f38b769fe4987..bb496b5b6c839 100644
--- a/llvm/test/CodeGen/X86/matrix-multiply.ll
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -38,7 +38,7 @@ define <4 x float> @test_mul2x2_f32(<4 x float> %a0, <4 x float> %a1) nounwind {
; AVX1-LABEL: test_mul2x2_f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX1-NEXT: vmovsldup {{.*#+}} xmm3 = xmm1[0,0,2,2]
+; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,0,0,0]
; AVX1-NEXT: vmulps %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX1-NEXT: vmulps %xmm4, %xmm2, %xmm4
diff --git a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
index a88510591ceb8..3c87bcc7565e5 100644
--- a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -279,7 +279,7 @@ define <4 x float> @test11(<4 x float> %A, <4 x float> %B) {
; AVX1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX512-LABEL: test11:
@@ -310,7 +310,7 @@ define <4 x float> @test12(<4 x float> %A, <4 x float> %B) {
; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX512-LABEL: test12:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index b8dd9a7a10609..9bc4892a316e5 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -1263,8 +1263,12 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) {
; AVX1-LABEL: fancierRotate2:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovd %edx, %xmm1
-; AVX1-NEXT: vmovd %ecx, %xmm2
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2
; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [31,0]
diff --git a/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll b/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll
index b453f925b94e8..d65ae9198e740 100644
--- a/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll
@@ -152,8 +152,12 @@ define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture r
; AVX1-NEXT: .LBB0_3: # %vector.ph
; AVX1-NEXT: movl %r9d, %edx
; AVX1-NEXT: andl $-32, %edx
-; AVX1-NEXT: vmovd %eax, %xmm7
-; AVX1-NEXT: vmovd %r8d, %xmm8
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7
+; AVX1-NEXT: vmovd %r8d, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm8
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
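For completeness, equivalent hand-written sketches of the integer cases matched by the new HasAVX1Only patterns above (function names are illustrative, not taken from the patch):

; Illustrative v4i32 lane-0 splat: matched by the new
; (v4i32 (X86VBroadcast v4i32:$src)) -> (VPSHUFDri VR128:$src, 0) pattern,
; i.e. vpshufd xmm = xmm[0,0,0,0].
define <4 x i32> @splat_v4i32(<4 x i32> %v) {
  %s = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> zeroinitializer
  ret <4 x i32> %s
}

; Illustrative v2i64 lane-0 splat: matched by the new
; (v2i64 (X86VBroadcast v2i64:$src)) -> (VPSHUFDri VR128:$src, 0x44) pattern,
; where immediate 0x44 selects dword lanes [0,1,0,1].
define <2 x i64> @splat_v2i64(<2 x i64> %v) {
  %s = shufflevector <2 x i64> %v, <2 x i64> poison, <2 x i32> zeroinitializer
  ret <2 x i64> %s
}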