[llvm] [X86][AVX] Match v4f64 blend from shuffle of scalar values. (PR #135753)
Leon Clark via llvm-commits
llvm-commits at lists.llvm.org
Sun Apr 27 18:47:49 PDT 2025
https://github.com/PeddleSpam updated https://github.com/llvm/llvm-project/pull/135753
From 6e49d4dff706690376a824a49329c22bb9f6ca3a Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Tue, 15 Apr 2025 07:45:46 +0100
Subject: [PATCH 01/13] [X86][AVX] Match v4f64 blend from shuffle of scalar
values.
Convert a BUILD_VECTOR of scalar values to a shuffle of shuffles that will lower to an AVX blend.
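For illustration only (not part of this patch), here is a minimal sketch of the motivating pattern: a <4 x double> assembled from two scalar values in an X,Y,Y,X arrangement, which the lowering added later in this series should select as two vbroadcastsd plus a single vblendps on AVX targets. The function name and exact IR shape are hypothetical; the committed tests below build the splats with shufflevector instead.
define <4 x double> @xyyx_blend_sketch(ptr %p0, ptr %p1) {
  ; Two scalar loads placed as X,Y,Y,X become a BUILD_VECTOR node that the
  ; new code can rewrite into a shuffle of two splats (an AVX blend).
  %x = load double, ptr %p0, align 8
  %y = load double, ptr %p1, align 8
  %v0 = insertelement <4 x double> poison, double %x, i32 0
  %v1 = insertelement <4 x double> %v0, double %y, i32 1
  %v2 = insertelement <4 x double> %v1, double %y, i32 2
  %v3 = insertelement <4 x double> %v2, double %x, i32 3
  ret <4 x double> %v3
}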
---
llvm/test/CodeGen/X86/shuffle-blendw.ll | 422 ++++++++++++++++++++++++
1 file changed, 422 insertions(+)
diff --git a/llvm/test/CodeGen/X86/shuffle-blendw.ll b/llvm/test/CodeGen/X86/shuffle-blendw.ll
index 9f90657dc64d1..28af382ec3e07 100644
--- a/llvm/test/CodeGen/X86/shuffle-blendw.ll
+++ b/llvm/test/CodeGen/X86/shuffle-blendw.ll
@@ -263,3 +263,425 @@ define <8 x i16> @blendw_to_blendd_fail_16(<8 x i16> %x, <8 x i16> %y, <8 x i16>
%shuffle = shufflevector <8 x i16> %x1, <8 x i16> %y, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 14, i32 15>
ret <8 x i16> %shuffle
}
+
+define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
+; X86-SSE41-LABEL: blend_broadcasts_v4f64:
+; X86-SSE41: # %bb.0:
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE41-NEXT: movaps (%ecx), %xmm2
+; X86-SSE41-NEXT: movaps (%eax), %xmm1
+; X86-SSE41-NEXT: movaps %xmm2, %xmm0
+; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE41-NEXT: retl
+;
+; X64-SSE41-LABEL: blend_broadcasts_v4f64:
+; X64-SSE41: # %bb.0:
+; X64-SSE41-NEXT: movaps (%rdi), %xmm2
+; X64-SSE41-NEXT: movaps (%rsi), %xmm1
+; X64-SSE41-NEXT: movaps %xmm2, %xmm0
+; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-SSE41-NEXT: retq
+;
+; X86-AVX-LABEL: blend_broadcasts_v4f64:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; X86-AVX-NEXT: retl
+;
+; X64-AVX-LABEL: blend_broadcasts_v4f64:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm0
+; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm1
+; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; X64-AVX-NEXT: retq
+;
+; X86-AVX2-LABEL: blend_broadcasts_v4f64:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; X86-AVX2-NEXT: retl
+;
+; X64-AVX2-LABEL: blend_broadcasts_v4f64:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
+; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm1
+; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; X64-AVX2-NEXT: retq
+;
+; X86-AVX512-LABEL: blend_broadcasts_v4f64:
+; X86-AVX512: # %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; X86-AVX512-NEXT: retl
+;
+; X64-AVX512-LABEL: blend_broadcasts_v4f64:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
+; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm1
+; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; X64-AVX512-NEXT: retq
+ %ld0 = load <4 x double>, ptr %p0, align 32
+ %ld1 = load <4 x double>, ptr %p1, align 32
+ %bcst0 = shufflevector <4 x double> %ld0, <4 x double> undef, <4 x i32> zeroinitializer
+ %bcst1 = shufflevector <4 x double> %ld1, <4 x double> undef, <4 x i32> zeroinitializer
+ %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+ ret <4 x double> %blend
+}
+
+define <4 x double> @blend_broadcasts_v2f64(ptr %p0, ptr %p1) {
+; X86-SSE41-LABEL: blend_broadcasts_v2f64:
+; X86-SSE41: # %bb.0:
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE41-NEXT: movaps (%ecx), %xmm2
+; X86-SSE41-NEXT: movaps (%eax), %xmm1
+; X86-SSE41-NEXT: movaps %xmm2, %xmm0
+; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE41-NEXT: retl
+;
+; X64-SSE41-LABEL: blend_broadcasts_v2f64:
+; X64-SSE41: # %bb.0:
+; X64-SSE41-NEXT: movaps (%rdi), %xmm2
+; X64-SSE41-NEXT: movaps (%rsi), %xmm1
+; X64-SSE41-NEXT: movaps %xmm2, %xmm0
+; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-SSE41-NEXT: retq
+;
+; X86-AVX-LABEL: blend_broadcasts_v2f64:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; X86-AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; X86-AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; X86-AVX-NEXT: retl
+;
+; X64-AVX-LABEL: blend_broadcasts_v2f64:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; X64-AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; X64-AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; X64-AVX-NEXT: retq
+;
+; X86-AVX2-LABEL: blend_broadcasts_v2f64:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; X86-AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; X86-AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; X86-AVX2-NEXT: retl
+;
+; X64-AVX2-LABEL: blend_broadcasts_v2f64:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; X64-AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; X64-AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; X64-AVX2-NEXT: retq
+;
+; X86-AVX512-LABEL: blend_broadcasts_v2f64:
+; X86-AVX512: # %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,4,6,2]
+; X86-AVX512-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0
+; X86-AVX512-NEXT: retl
+;
+; X64-AVX512-LABEL: blend_broadcasts_v2f64:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,4,6,2]
+; X64-AVX512-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0
+; X64-AVX512-NEXT: retq
+ %ld0 = load <2 x double>, ptr %p0, align 32
+ %ld1 = load <2 x double>, ptr %p1, align 32
+ %blend = shufflevector <2 x double> %ld0, <2 x double> %ld1, <4 x i32> <i32 0, i32 2, i32 2, i32 0>
+ ret <4 x double> %blend
+}
+
+define <4 x double> @blend_broadcasts_v1f64(ptr %p0, ptr %p1) {
+; X86-SSE41-LABEL: blend_broadcasts_v1f64:
+; X86-SSE41: # %bb.0:
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; X86-SSE41-NEXT: movaps %xmm2, %xmm0
+; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE41-NEXT: retl
+;
+; X64-SSE41-LABEL: blend_broadcasts_v1f64:
+; X64-SSE41: # %bb.0:
+; X64-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; X64-SSE41-NEXT: movaps %xmm2, %xmm0
+; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-SSE41-NEXT: retq
+;
+; X86-AVX-LABEL: blend_broadcasts_v1f64:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX-NEXT: retl
+;
+; X64-AVX-LABEL: blend_broadcasts_v1f64:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX-NEXT: retq
+;
+; X86-AVX2-LABEL: blend_broadcasts_v1f64:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX2-NEXT: retl
+;
+; X64-AVX2-LABEL: blend_broadcasts_v1f64:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX2-NEXT: retq
+;
+; X86-AVX512-LABEL: blend_broadcasts_v1f64:
+; X86-AVX512: # %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX512-NEXT: retl
+;
+; X64-AVX512-LABEL: blend_broadcasts_v1f64:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX512-NEXT: retq
+ %ld0 = load <1 x double>, ptr %p0, align 32
+ %ld1 = load <1 x double>, ptr %p1, align 32
+ %blend = shufflevector <1 x double> %ld0, <1 x double> %ld1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+ ret <4 x double> %blend
+}
+
+define <4 x double> @blend_broadcasts_v1f64_4x(ptr %p0, ptr %p1) {
+; X86-SSE41-LABEL: blend_broadcasts_v1f64_4x:
+; X86-SSE41: # %bb.0:
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; X86-SSE41-NEXT: movaps %xmm2, %xmm0
+; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE41-NEXT: retl
+;
+; X64-SSE41-LABEL: blend_broadcasts_v1f64_4x:
+; X64-SSE41: # %bb.0:
+; X64-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; X64-SSE41-NEXT: movaps %xmm2, %xmm0
+; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-SSE41-NEXT: retq
+;
+; X86-AVX-LABEL: blend_broadcasts_v1f64_4x:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX-NEXT: retl
+;
+; X64-AVX-LABEL: blend_broadcasts_v1f64_4x:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX-NEXT: retq
+;
+; X86-AVX2-LABEL: blend_broadcasts_v1f64_4x:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX2-NEXT: retl
+;
+; X64-AVX2-LABEL: blend_broadcasts_v1f64_4x:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX2-NEXT: retq
+;
+; X86-AVX512-LABEL: blend_broadcasts_v1f64_4x:
+; X86-AVX512: # %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX512-NEXT: retl
+;
+; X64-AVX512-LABEL: blend_broadcasts_v1f64_4x:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX512-NEXT: retq
+ %ld0 = load <1 x double>, ptr %p0, align 32
+ %ld1 = load <1 x double>, ptr %p1, align 32
+ %bcst0 = shufflevector <1 x double> %ld0, <1 x double> undef, <4 x i32> zeroinitializer
+ %bcst1 = shufflevector <1 x double> %ld1, <1 x double> undef, <4 x i32> zeroinitializer
+ %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+ ret <4 x double> %blend
+}
+
+define <4 x double> @blend_broadcasts_v1f64_2x(ptr %p0, ptr %p1) {
+; X86-SSE41-LABEL: blend_broadcasts_v1f64_2x:
+; X86-SSE41: # %bb.0:
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; X86-SSE41-NEXT: movaps %xmm2, %xmm0
+; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-SSE41-NEXT: retl
+;
+; X64-SSE41-LABEL: blend_broadcasts_v1f64_2x:
+; X64-SSE41: # %bb.0:
+; X64-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; X64-SSE41-NEXT: movaps %xmm2, %xmm0
+; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-SSE41-NEXT: retq
+;
+; X86-AVX-LABEL: blend_broadcasts_v1f64_2x:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX-NEXT: retl
+;
+; X64-AVX-LABEL: blend_broadcasts_v1f64_2x:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX-NEXT: retq
+;
+; X86-AVX2-LABEL: blend_broadcasts_v1f64_2x:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX2-NEXT: retl
+;
+; X64-AVX2-LABEL: blend_broadcasts_v1f64_2x:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX2-NEXT: retq
+;
+; X86-AVX512-LABEL: blend_broadcasts_v1f64_2x:
+; X86-AVX512: # %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX512-NEXT: retl
+;
+; X64-AVX512-LABEL: blend_broadcasts_v1f64_2x:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX512-NEXT: retq
+ %ld0 = load <1 x double>, ptr %p0, align 32
+ %ld1 = load <1 x double>, ptr %p1, align 32
+ %bcst0 = shufflevector <1 x double> %ld0, <1 x double> undef, <2 x i32> zeroinitializer
+ %bcst1 = shufflevector <1 x double> %ld1, <1 x double> undef, <2 x i32> zeroinitializer
+ %blend = shufflevector <2 x double> %bcst0, <2 x double> %bcst1, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
+ ret <4 x double> %blend
+}
From b71d063b1c9d34dc6cdc4710a75a5ebb01288b8c Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Tue, 15 Apr 2025 07:59:42 +0100
Subject: [PATCH 02/13] Add lowering code and update tests.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 33 ++++++
llvm/test/CodeGen/X86/shuffle-blendw.ll | 144 +++++++++---------------
2 files changed, 87 insertions(+), 90 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0fc50dc1a87b6..682083beb064a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9093,6 +9093,39 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
unsigned NumElems = Op.getNumOperands();
+ // Match BUILD_VECTOR of scalars that we can lower to X86ISD::BLENDI via
+ // shuffles.
+ //
+ // v4f64 = BUILD_VECTOR X,Y,Y,X
+ // >>>
+ // t1: v4f64 = BUILD_VECTOR X,u,u,u
+ // t3: v4f64 = vector_shuffle<0,u,u,0> t1, u
+ // t2: v4f64 = BUILD_VECTOR Y,u,u,u
+ // t4: v4f64 = vector_shuffle<u,0,0,u> t2, u
+ // v4f64 = vector_shuffle<0,5,6,3> t3, t4
+ //
+ if (Subtarget.hasAVX() && VT == MVT::v4f64 && Op->getNumOperands() == 4u) {
+ auto Op0 = Op->getOperand(0u);
+ auto Op1 = Op->getOperand(1u);
+ auto Op2 = Op->getOperand(2u);
+ auto Op3 = Op->getOperand(3u);
+
+ // Match X,Y,Y,X inputs.
+ if (Op0 == Op3 && Op1 == Op2 && Op0 != Op1) {
+ auto PsnVal = DAG.getUNDEF(MVT::f64);
+
+ auto NewOp0 = DAG.getBuildVector(VT, dl, {Op0, PsnVal, PsnVal, PsnVal});
+ NewOp0 = DAG.getVectorShuffle(VT, dl, NewOp0, DAG.getUNDEF(VT),
+ {0, -1, -1, 0});
+
+ auto NewOp1 = DAG.getBuildVector(VT, dl, {Op1, PsnVal, PsnVal, PsnVal});
+ NewOp1 = DAG.getVectorShuffle(VT, dl, NewOp1, DAG.getUNDEF(VT),
+ {-1, 0, 0, -1});
+
+ return DAG.getVectorShuffle(VT, dl, NewOp0, NewOp1, {0, 5, 6, 3});
+ }
+ }
+
// Generate vectors for predicate vectors.
if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
diff --git a/llvm/test/CodeGen/X86/shuffle-blendw.ll b/llvm/test/CodeGen/X86/shuffle-blendw.ll
index 28af382ec3e07..a1af29550f64f 100644
--- a/llvm/test/CodeGen/X86/shuffle-blendw.ll
+++ b/llvm/test/CodeGen/X86/shuffle-blendw.ll
@@ -449,60 +449,48 @@ define <4 x double> @blend_broadcasts_v1f64(ptr %p0, ptr %p1) {
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X86-AVX-NEXT: retl
;
; X64-AVX-LABEL: blend_broadcasts_v1f64:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X64-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm0
+; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm1
+; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X64-AVX-NEXT: retq
;
; X86-AVX2-LABEL: blend_broadcasts_v1f64:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X86-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: blend_broadcasts_v1f64:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm0
+; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
+; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X64-AVX2-NEXT: retq
;
; X86-AVX512-LABEL: blend_broadcasts_v1f64:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X86-AVX512-NEXT: retl
;
; X64-AVX512-LABEL: blend_broadcasts_v1f64:
; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm0
+; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm1
+; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X64-AVX512-NEXT: retq
%ld0 = load <1 x double>, ptr %p0, align 32
%ld1 = load <1 x double>, ptr %p1, align 32
@@ -535,60 +523,48 @@ define <4 x double> @blend_broadcasts_v1f64_4x(ptr %p0, ptr %p1) {
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X86-AVX-NEXT: retl
;
; X64-AVX-LABEL: blend_broadcasts_v1f64_4x:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X64-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm0
+; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm1
+; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X64-AVX-NEXT: retq
;
; X86-AVX2-LABEL: blend_broadcasts_v1f64_4x:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X86-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: blend_broadcasts_v1f64_4x:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm0
+; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
+; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X64-AVX2-NEXT: retq
;
; X86-AVX512-LABEL: blend_broadcasts_v1f64_4x:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X86-AVX512-NEXT: retl
;
; X64-AVX512-LABEL: blend_broadcasts_v1f64_4x:
; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm0
+; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm1
+; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X64-AVX512-NEXT: retq
%ld0 = load <1 x double>, ptr %p0, align 32
%ld1 = load <1 x double>, ptr %p1, align 32
@@ -623,60 +599,48 @@ define <4 x double> @blend_broadcasts_v1f64_2x(ptr %p0, ptr %p1) {
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X86-AVX-NEXT: retl
;
; X64-AVX-LABEL: blend_broadcasts_v1f64_2x:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X64-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm0
+; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm1
+; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X64-AVX-NEXT: retq
;
; X86-AVX2-LABEL: blend_broadcasts_v1f64_2x:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X86-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: blend_broadcasts_v1f64_2x:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm0
+; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
+; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X64-AVX2-NEXT: retq
;
; X86-AVX512-LABEL: blend_broadcasts_v1f64_2x:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0
+; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1
+; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X86-AVX512-NEXT: retl
;
; X64-AVX512-LABEL: blend_broadcasts_v1f64_2x:
; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm0
+; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm1
+; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X64-AVX512-NEXT: retq
%ld0 = load <1 x double>, ptr %p0, align 32
%ld1 = load <1 x double>, ptr %p1, align 32
From 2040abe38513468e650e18c2d65b20062c53a023 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Tue, 15 Apr 2025 08:25:03 +0100
Subject: [PATCH 03/13] Replace undef with poison in tests.
---
llvm/test/CodeGen/X86/shuffle-blendw.ll | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/test/CodeGen/X86/shuffle-blendw.ll b/llvm/test/CodeGen/X86/shuffle-blendw.ll
index a1af29550f64f..20239362d2480 100644
--- a/llvm/test/CodeGen/X86/shuffle-blendw.ll
+++ b/llvm/test/CodeGen/X86/shuffle-blendw.ll
@@ -334,8 +334,8 @@ define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
; X64-AVX512-NEXT: retq
%ld0 = load <4 x double>, ptr %p0, align 32
%ld1 = load <4 x double>, ptr %p1, align 32
- %bcst0 = shufflevector <4 x double> %ld0, <4 x double> undef, <4 x i32> zeroinitializer
- %bcst1 = shufflevector <4 x double> %ld1, <4 x double> undef, <4 x i32> zeroinitializer
+ %bcst0 = shufflevector <4 x double> %ld0, <4 x double> poison, <4 x i32> zeroinitializer
+ %bcst1 = shufflevector <4 x double> %ld1, <4 x double> poison, <4 x i32> zeroinitializer
%blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
ret <4 x double> %blend
}
@@ -568,8 +568,8 @@ define <4 x double> @blend_broadcasts_v1f64_4x(ptr %p0, ptr %p1) {
; X64-AVX512-NEXT: retq
%ld0 = load <1 x double>, ptr %p0, align 32
%ld1 = load <1 x double>, ptr %p1, align 32
- %bcst0 = shufflevector <1 x double> %ld0, <1 x double> undef, <4 x i32> zeroinitializer
- %bcst1 = shufflevector <1 x double> %ld1, <1 x double> undef, <4 x i32> zeroinitializer
+ %bcst0 = shufflevector <1 x double> %ld0, <1 x double> poison, <4 x i32> zeroinitializer
+ %bcst1 = shufflevector <1 x double> %ld1, <1 x double> poison, <4 x i32> zeroinitializer
%blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
ret <4 x double> %blend
}
@@ -644,8 +644,8 @@ define <4 x double> @blend_broadcasts_v1f64_2x(ptr %p0, ptr %p1) {
; X64-AVX512-NEXT: retq
%ld0 = load <1 x double>, ptr %p0, align 32
%ld1 = load <1 x double>, ptr %p1, align 32
- %bcst0 = shufflevector <1 x double> %ld0, <1 x double> undef, <2 x i32> zeroinitializer
- %bcst1 = shufflevector <1 x double> %ld1, <1 x double> undef, <2 x i32> zeroinitializer
+ %bcst0 = shufflevector <1 x double> %ld0, <1 x double> poison, <2 x i32> zeroinitializer
+ %bcst1 = shufflevector <1 x double> %ld1, <1 x double> poison, <2 x i32> zeroinitializer
%blend = shufflevector <2 x double> %bcst0, <2 x double> %bcst1, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
ret <4 x double> %blend
}
From 926084298a0d0b0061026a580569b249e43b0339 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Wed, 16 Apr 2025 06:43:27 +0100
Subject: [PATCH 04/13] Address review comments: factor lowering into lowerBuildVectorAsBlend and move tests.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 62 ++-
llvm/test/CodeGen/X86/shuffle-blendw.ll | 386 ------------------
.../test/CodeGen/X86/vector-shuffle-256-v4.ll | 81 ++++
3 files changed, 110 insertions(+), 419 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 682083beb064a..f864f273e5437 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8743,6 +8743,33 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
return LowerShift(Res, Subtarget, DAG);
}
+/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
+/// representing a blend.
+static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
+ X86Subtarget const &Subtarget,
+ SelectionDAG &DAG) {
+ if (!Subtarget.hasAVX())
+ return {};
+
+ auto VT = BVOp->getSimpleValueType(0u);
+
+ if (VT == MVT::v4f64 && BVOp->getNumOperands() == 4u) {
+ SDValue Op0 = BVOp->getOperand(0u);
+ SDValue Op1 = BVOp->getOperand(1u);
+ SDValue Op2 = BVOp->getOperand(2u);
+ SDValue Op3 = BVOp->getOperand(3u);
+
+ // Match X,Y,Y,X inputs.
+ if (Op0 == Op3 && Op1 == Op2 && Op0 != Op1) {
+ auto NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
+ auto NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
+ return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, {0, 5, 6, 3});
+ }
+ }
+
+ return {};
+}
+
/// Create a vector constant without a load. SSE/AVX provide the bare minimum
/// functionality to do this, so it's all zeros, all ones, or some derivation
/// that is cheap to calculate.
@@ -9093,39 +9120,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
unsigned NumElems = Op.getNumOperands();
- // Match BUILD_VECTOR of scalars that we can lower to X86ISD::BLENDI via
- // shuffles.
- //
- // v4f64 = BUILD_VECTOR X,Y,Y,X
- // >>>
- // t1: v4f64 = BUILD_VECTOR X,u,u,u
- // t3: v4f64 = vector_shuffle<0,u,u,0> t1, u
- // t2: v4f64 = BUILD_VECTOR Y,u,u,u
- // t4: v4f64 = vector_shuffle<u,0,0,u> t2, u
- // v4f64 = vector_shuffle<0,5,6,3> t3, t4
- //
- if (Subtarget.hasAVX() && VT == MVT::v4f64 && Op->getNumOperands() == 4u) {
- auto Op0 = Op->getOperand(0u);
- auto Op1 = Op->getOperand(1u);
- auto Op2 = Op->getOperand(2u);
- auto Op3 = Op->getOperand(3u);
-
- // Match X,Y,Y,X inputs.
- if (Op0 == Op3 && Op1 == Op2 && Op0 != Op1) {
- auto PsnVal = DAG.getUNDEF(MVT::f64);
-
- auto NewOp0 = DAG.getBuildVector(VT, dl, {Op0, PsnVal, PsnVal, PsnVal});
- NewOp0 = DAG.getVectorShuffle(VT, dl, NewOp0, DAG.getUNDEF(VT),
- {0, -1, -1, 0});
-
- auto NewOp1 = DAG.getBuildVector(VT, dl, {Op1, PsnVal, PsnVal, PsnVal});
- NewOp1 = DAG.getVectorShuffle(VT, dl, NewOp1, DAG.getUNDEF(VT),
- {-1, 0, 0, -1});
-
- return DAG.getVectorShuffle(VT, dl, NewOp0, NewOp1, {0, 5, 6, 3});
- }
- }
-
// Generate vectors for predicate vectors.
if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
@@ -9238,6 +9232,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return Broadcast;
if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
return BitOp;
+ if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
+ return Blend;
unsigned NumZero = ZeroMask.popcount();
unsigned NumNonZero = NonZeroMask.popcount();
diff --git a/llvm/test/CodeGen/X86/shuffle-blendw.ll b/llvm/test/CodeGen/X86/shuffle-blendw.ll
index 20239362d2480..9f90657dc64d1 100644
--- a/llvm/test/CodeGen/X86/shuffle-blendw.ll
+++ b/llvm/test/CodeGen/X86/shuffle-blendw.ll
@@ -263,389 +263,3 @@ define <8 x i16> @blendw_to_blendd_fail_16(<8 x i16> %x, <8 x i16> %y, <8 x i16>
%shuffle = shufflevector <8 x i16> %x1, <8 x i16> %y, <8 x i32> <i32 8, i32 1, i32 10, i32 11, i32 4, i32 5, i32 14, i32 15>
ret <8 x i16> %shuffle
}
-
-define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
-; X86-SSE41-LABEL: blend_broadcasts_v4f64:
-; X86-SSE41: # %bb.0:
-; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT: movaps (%ecx), %xmm2
-; X86-SSE41-NEXT: movaps (%eax), %xmm1
-; X86-SSE41-NEXT: movaps %xmm2, %xmm0
-; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X86-SSE41-NEXT: retl
-;
-; X64-SSE41-LABEL: blend_broadcasts_v4f64:
-; X64-SSE41: # %bb.0:
-; X64-SSE41-NEXT: movaps (%rdi), %xmm2
-; X64-SSE41-NEXT: movaps (%rsi), %xmm1
-; X64-SSE41-NEXT: movaps %xmm2, %xmm0
-; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X64-SSE41-NEXT: retq
-;
-; X86-AVX-LABEL: blend_broadcasts_v4f64:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0
-; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1
-; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
-; X86-AVX-NEXT: retl
-;
-; X64-AVX-LABEL: blend_broadcasts_v4f64:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm0
-; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm1
-; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
-; X64-AVX-NEXT: retq
-;
-; X86-AVX2-LABEL: blend_broadcasts_v4f64:
-; X86-AVX2: # %bb.0:
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0
-; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1
-; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
-; X86-AVX2-NEXT: retl
-;
-; X64-AVX2-LABEL: blend_broadcasts_v4f64:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
-; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm1
-; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
-; X64-AVX2-NEXT: retq
-;
-; X86-AVX512-LABEL: blend_broadcasts_v4f64:
-; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0
-; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1
-; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
-; X86-AVX512-NEXT: retl
-;
-; X64-AVX512-LABEL: blend_broadcasts_v4f64:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
-; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm1
-; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
-; X64-AVX512-NEXT: retq
- %ld0 = load <4 x double>, ptr %p0, align 32
- %ld1 = load <4 x double>, ptr %p1, align 32
- %bcst0 = shufflevector <4 x double> %ld0, <4 x double> poison, <4 x i32> zeroinitializer
- %bcst1 = shufflevector <4 x double> %ld1, <4 x double> poison, <4 x i32> zeroinitializer
- %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
- ret <4 x double> %blend
-}
-
-define <4 x double> @blend_broadcasts_v2f64(ptr %p0, ptr %p1) {
-; X86-SSE41-LABEL: blend_broadcasts_v2f64:
-; X86-SSE41: # %bb.0:
-; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT: movaps (%ecx), %xmm2
-; X86-SSE41-NEXT: movaps (%eax), %xmm1
-; X86-SSE41-NEXT: movaps %xmm2, %xmm0
-; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X86-SSE41-NEXT: retl
-;
-; X64-SSE41-LABEL: blend_broadcasts_v2f64:
-; X64-SSE41: # %bb.0:
-; X64-SSE41-NEXT: movaps (%rdi), %xmm2
-; X64-SSE41-NEXT: movaps (%rsi), %xmm1
-; X64-SSE41-NEXT: movaps %xmm2, %xmm0
-; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X64-SSE41-NEXT: retq
-;
-; X86-AVX-LABEL: blend_broadcasts_v2f64:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; X86-AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; X86-AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; X86-AVX-NEXT: retl
-;
-; X64-AVX-LABEL: blend_broadcasts_v2f64:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; X64-AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; X64-AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; X64-AVX-NEXT: retq
-;
-; X86-AVX2-LABEL: blend_broadcasts_v2f64:
-; X86-AVX2: # %bb.0:
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X86-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; X86-AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; X86-AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; X86-AVX2-NEXT: retl
-;
-; X64-AVX2-LABEL: blend_broadcasts_v2f64:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X64-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; X64-AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; X64-AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; X64-AVX2-NEXT: retq
-;
-; X86-AVX512-LABEL: blend_broadcasts_v2f64:
-; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
-; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,4,6,2]
-; X86-AVX512-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0
-; X86-AVX512-NEXT: retl
-;
-; X64-AVX512-LABEL: blend_broadcasts_v2f64:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
-; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,4,6,2]
-; X64-AVX512-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0
-; X64-AVX512-NEXT: retq
- %ld0 = load <2 x double>, ptr %p0, align 32
- %ld1 = load <2 x double>, ptr %p1, align 32
- %blend = shufflevector <2 x double> %ld0, <2 x double> %ld1, <4 x i32> <i32 0, i32 2, i32 2, i32 0>
- ret <4 x double> %blend
-}
-
-define <4 x double> @blend_broadcasts_v1f64(ptr %p0, ptr %p1) {
-; X86-SSE41-LABEL: blend_broadcasts_v1f64:
-; X86-SSE41: # %bb.0:
-; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; X86-SSE41-NEXT: movaps %xmm2, %xmm0
-; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X86-SSE41-NEXT: retl
-;
-; X64-SSE41-LABEL: blend_broadcasts_v1f64:
-; X64-SSE41: # %bb.0:
-; X64-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; X64-SSE41-NEXT: movaps %xmm2, %xmm0
-; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X64-SSE41-NEXT: retq
-;
-; X86-AVX-LABEL: blend_broadcasts_v1f64:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0
-; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1
-; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; X86-AVX-NEXT: retl
-;
-; X64-AVX-LABEL: blend_broadcasts_v1f64:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm0
-; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm1
-; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; X64-AVX-NEXT: retq
-;
-; X86-AVX2-LABEL: blend_broadcasts_v1f64:
-; X86-AVX2: # %bb.0:
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0
-; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1
-; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; X86-AVX2-NEXT: retl
-;
-; X64-AVX2-LABEL: blend_broadcasts_v1f64:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm0
-; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
-; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; X64-AVX2-NEXT: retq
-;
-; X86-AVX512-LABEL: blend_broadcasts_v1f64:
-; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0
-; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1
-; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; X86-AVX512-NEXT: retl
-;
-; X64-AVX512-LABEL: blend_broadcasts_v1f64:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm0
-; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm1
-; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; X64-AVX512-NEXT: retq
- %ld0 = load <1 x double>, ptr %p0, align 32
- %ld1 = load <1 x double>, ptr %p1, align 32
- %blend = shufflevector <1 x double> %ld0, <1 x double> %ld1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
- ret <4 x double> %blend
-}
-
-define <4 x double> @blend_broadcasts_v1f64_4x(ptr %p0, ptr %p1) {
-; X86-SSE41-LABEL: blend_broadcasts_v1f64_4x:
-; X86-SSE41: # %bb.0:
-; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; X86-SSE41-NEXT: movaps %xmm2, %xmm0
-; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X86-SSE41-NEXT: retl
-;
-; X64-SSE41-LABEL: blend_broadcasts_v1f64_4x:
-; X64-SSE41: # %bb.0:
-; X64-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; X64-SSE41-NEXT: movaps %xmm2, %xmm0
-; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X64-SSE41-NEXT: retq
-;
-; X86-AVX-LABEL: blend_broadcasts_v1f64_4x:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0
-; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1
-; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; X86-AVX-NEXT: retl
-;
-; X64-AVX-LABEL: blend_broadcasts_v1f64_4x:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm0
-; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm1
-; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; X64-AVX-NEXT: retq
-;
-; X86-AVX2-LABEL: blend_broadcasts_v1f64_4x:
-; X86-AVX2: # %bb.0:
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0
-; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1
-; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; X86-AVX2-NEXT: retl
-;
-; X64-AVX2-LABEL: blend_broadcasts_v1f64_4x:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm0
-; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
-; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; X64-AVX2-NEXT: retq
-;
-; X86-AVX512-LABEL: blend_broadcasts_v1f64_4x:
-; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0
-; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1
-; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; X86-AVX512-NEXT: retl
-;
-; X64-AVX512-LABEL: blend_broadcasts_v1f64_4x:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm0
-; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm1
-; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; X64-AVX512-NEXT: retq
- %ld0 = load <1 x double>, ptr %p0, align 32
- %ld1 = load <1 x double>, ptr %p1, align 32
- %bcst0 = shufflevector <1 x double> %ld0, <1 x double> poison, <4 x i32> zeroinitializer
- %bcst1 = shufflevector <1 x double> %ld1, <1 x double> poison, <4 x i32> zeroinitializer
- %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
- ret <4 x double> %blend
-}
-
-define <4 x double> @blend_broadcasts_v1f64_2x(ptr %p0, ptr %p1) {
-; X86-SSE41-LABEL: blend_broadcasts_v1f64_2x:
-; X86-SSE41: # %bb.0:
-; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; X86-SSE41-NEXT: movaps %xmm2, %xmm0
-; X86-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X86-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X86-SSE41-NEXT: retl
-;
-; X64-SSE41-LABEL: blend_broadcasts_v1f64_2x:
-; X64-SSE41: # %bb.0:
-; X64-SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; X64-SSE41-NEXT: movaps %xmm2, %xmm0
-; X64-SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X64-SSE41-NEXT: retq
-;
-; X86-AVX-LABEL: blend_broadcasts_v1f64_2x:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0
-; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1
-; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; X86-AVX-NEXT: retl
-;
-; X64-AVX-LABEL: blend_broadcasts_v1f64_2x:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm0
-; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm1
-; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; X64-AVX-NEXT: retq
-;
-; X86-AVX2-LABEL: blend_broadcasts_v1f64_2x:
-; X86-AVX2: # %bb.0:
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0
-; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1
-; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; X86-AVX2-NEXT: retl
-;
-; X64-AVX2-LABEL: blend_broadcasts_v1f64_2x:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm0
-; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
-; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; X64-AVX2-NEXT: retq
-;
-; X86-AVX512-LABEL: blend_broadcasts_v1f64_2x:
-; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0
-; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1
-; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; X86-AVX512-NEXT: retl
-;
-; X64-AVX512-LABEL: blend_broadcasts_v1f64_2x:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm0
-; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm1
-; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; X64-AVX512-NEXT: retq
- %ld0 = load <1 x double>, ptr %p0, align 32
- %ld1 = load <1 x double>, ptr %p1, align 32
- %bcst0 = shufflevector <1 x double> %ld0, <1 x double> poison, <2 x i32> zeroinitializer
- %bcst1 = shufflevector <1 x double> %ld1, <1 x double> poison, <2 x i32> zeroinitializer
- %blend = shufflevector <2 x double> %bcst0, <2 x double> %bcst1, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
- ret <4 x double> %blend
-}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index fb8618be17f06..f74b6867786b1 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -2367,6 +2367,87 @@ define <4 x double> @unpckh_v4f64(<4 x double> %x, <4 x double> %y) {
ret <4 x double> %unpckh
}
+define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
+; ALL-LABEL: blend_broadcasts_v4f64:
+; ALL: # %bb.0:
+; ALL-NEXT: vbroadcastsd (%rdi), %ymm0
+; ALL-NEXT: vbroadcastsd (%rsi), %ymm1
+; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; ALL-NEXT: retq
+ %ld0 = load <4 x double>, ptr %p0, align 32
+ %ld1 = load <4 x double>, ptr %p1, align 32
+ %bcst0 = shufflevector <4 x double> %ld0, <4 x double> undef, <4 x i32> zeroinitializer
+ %bcst1 = shufflevector <4 x double> %ld1, <4 x double> undef, <4 x i32> zeroinitializer
+ %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+ ret <4 x double> %blend
+}
+
+define <4 x double> @blend_broadcasts_v2f64(ptr %p0, ptr %p1) {
+; AVX1OR2-LABEL: blend_broadcasts_v2f64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX1OR2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX1OR2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: blend_broadcasts_v2f64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,4,6,2]
+; AVX512VL-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0
+; AVX512VL-NEXT: retq
+ %ld0 = load <2 x double>, ptr %p0, align 32
+ %ld1 = load <2 x double>, ptr %p1, align 32
+ %blend = shufflevector <2 x double> %ld0, <2 x double> %ld1, <4 x i32> <i32 0, i32 2, i32 2, i32 0>
+ ret <4 x double> %blend
+}
+
+define <4 x double> @blend_broadcasts_v1f64(ptr %p0, ptr %p1) {
+; ALL-LABEL: blend_broadcasts_v1f64:
+; ALL: # %bb.0:
+; ALL-NEXT: vbroadcastsd (%rsi), %ymm0
+; ALL-NEXT: vbroadcastsd (%rdi), %ymm1
+; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; ALL-NEXT: retq
+ %ld0 = load <1 x double>, ptr %p0, align 32
+ %ld1 = load <1 x double>, ptr %p1, align 32
+ %blend = shufflevector <1 x double> %ld0, <1 x double> %ld1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+ ret <4 x double> %blend
+}
+
+define <4 x double> @blend_broadcasts_v1f64_4x(ptr %p0, ptr %p1) {
+; ALL-LABEL: blend_broadcasts_v1f64_4x:
+; ALL: # %bb.0:
+; ALL-NEXT: vbroadcastsd (%rsi), %ymm0
+; ALL-NEXT: vbroadcastsd (%rdi), %ymm1
+; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; ALL-NEXT: retq
+ %ld0 = load <1 x double>, ptr %p0, align 32
+ %ld1 = load <1 x double>, ptr %p1, align 32
+ %bcst0 = shufflevector <1 x double> %ld0, <1 x double> undef, <4 x i32> zeroinitializer
+ %bcst1 = shufflevector <1 x double> %ld1, <1 x double> undef, <4 x i32> zeroinitializer
+ %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+ ret <4 x double> %blend
+}
+
+define <4 x double> @blend_broadcasts_v1f64_2x(ptr %p0, ptr %p1) {
+; ALL-LABEL: blend_broadcasts_v1f64_2x:
+; ALL: # %bb.0:
+; ALL-NEXT: vbroadcastsd (%rsi), %ymm0
+; ALL-NEXT: vbroadcastsd (%rdi), %ymm1
+; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; ALL-NEXT: retq
+ %ld0 = load <1 x double>, ptr %p0, align 32
+ %ld1 = load <1 x double>, ptr %p1, align 32
+ %bcst0 = shufflevector <1 x double> %ld0, <1 x double> undef, <2 x i32> zeroinitializer
+ %bcst1 = shufflevector <1 x double> %ld1, <1 x double> undef, <2 x i32> zeroinitializer
+ %blend = shufflevector <2 x double> %bcst0, <2 x double> %bcst1, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
+ ret <4 x double> %blend
+}
+
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
>From 0edbdd0aa93e75869344bcac5d3a64bcc851723c Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Wed, 16 Apr 2025 06:50:20 +0100
Subject: [PATCH 05/13] Remove undef.
---
llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index f74b6867786b1..f57287a5ebc62 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -2376,8 +2376,8 @@ define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
; ALL-NEXT: retq
%ld0 = load <4 x double>, ptr %p0, align 32
%ld1 = load <4 x double>, ptr %p1, align 32
- %bcst0 = shufflevector <4 x double> %ld0, <4 x double> undef, <4 x i32> zeroinitializer
- %bcst1 = shufflevector <4 x double> %ld1, <4 x double> undef, <4 x i32> zeroinitializer
+ %bcst0 = shufflevector <4 x double> %ld0, <4 x double> poison, <4 x i32> zeroinitializer
+ %bcst1 = shufflevector <4 x double> %ld1, <4 x double> poison, <4 x i32> zeroinitializer
%blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
ret <4 x double> %blend
}
@@ -2427,8 +2427,8 @@ define <4 x double> @blend_broadcasts_v1f64_4x(ptr %p0, ptr %p1) {
; ALL-NEXT: retq
%ld0 = load <1 x double>, ptr %p0, align 32
%ld1 = load <1 x double>, ptr %p1, align 32
- %bcst0 = shufflevector <1 x double> %ld0, <1 x double> undef, <4 x i32> zeroinitializer
- %bcst1 = shufflevector <1 x double> %ld1, <1 x double> undef, <4 x i32> zeroinitializer
+ %bcst0 = shufflevector <1 x double> %ld0, <1 x double> poison, <4 x i32> zeroinitializer
+ %bcst1 = shufflevector <1 x double> %ld1, <1 x double> poison, <4 x i32> zeroinitializer
%blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
ret <4 x double> %blend
}
@@ -2442,8 +2442,8 @@ define <4 x double> @blend_broadcasts_v1f64_2x(ptr %p0, ptr %p1) {
; ALL-NEXT: retq
%ld0 = load <1 x double>, ptr %p0, align 32
%ld1 = load <1 x double>, ptr %p1, align 32
- %bcst0 = shufflevector <1 x double> %ld0, <1 x double> undef, <2 x i32> zeroinitializer
- %bcst1 = shufflevector <1 x double> %ld1, <1 x double> undef, <2 x i32> zeroinitializer
+ %bcst0 = shufflevector <1 x double> %ld0, <1 x double> poison, <2 x i32> zeroinitializer
+ %bcst1 = shufflevector <1 x double> %ld1, <1 x double> poison, <2 x i32> zeroinitializer
%blend = shufflevector <2 x double> %bcst0, <2 x double> %bcst1, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
ret <4 x double> %blend
}
>From 753ac5e1be2345f0186b704dc974987ff220ad52 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Sat, 26 Apr 2025 06:59:25 +0100
Subject: [PATCH 06/13] Address comments.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 46 ++++++++++++++++---------
1 file changed, 29 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f864f273e5437..9447bedb6c04e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -21,6 +21,7 @@
#include "X86TargetMachine.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
@@ -37,6 +38,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SDPatternMatch.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallingConv.h"
@@ -8748,23 +8750,33 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
X86Subtarget const &Subtarget,
SelectionDAG &DAG) {
- if (!Subtarget.hasAVX())
- return {};
-
- auto VT = BVOp->getSimpleValueType(0u);
-
- if (VT == MVT::v4f64 && BVOp->getNumOperands() == 4u) {
- SDValue Op0 = BVOp->getOperand(0u);
- SDValue Op1 = BVOp->getOperand(1u);
- SDValue Op2 = BVOp->getOperand(2u);
- SDValue Op3 = BVOp->getOperand(3u);
-
- // Match X,Y,Y,X inputs.
- if (Op0 == Op3 && Op1 == Op2 && Op0 != Op1) {
- auto NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
- auto NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
- return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, {0, 5, 6, 3});
- }
+ MVT VT = BVOp->getSimpleValueType(0u);
+ auto const NumElems = VT.getVectorNumElements();
+
+ if (Subtarget.hasAVX() && VT == MVT::v4f64) {
+ // Collect unique operands.
+ auto UniqueOps = SmallSet<SDValue, 16u>();
+ for (auto &Op : BVOp->ops()) {
+ if (isIntOrFPConstant(Op) || Op.get()->isUndef())
+ return {};
+ UniqueOps.insert(Op);
+ }
+ // Candidate BUILD_VECTOR must have 2 unique operands.
+ if (UniqueOps.size() != 2u)
+ return {};
+ // Create shuffle mask.
+ auto Op0 = BVOp->getOperand(0u);
+ auto Mask = std::vector<int>();
+ Mask.reserve(NumElems);
+ for (auto I = 0u; I < NumElems; ++I) {
+ auto &Op = BVOp->getOperand(I);
+ Mask.push_back(Op == Op0 ? I : I + NumElems);
+ }
+ // Create shuffle of splats.
+ UniqueOps.erase(Op0);
+ auto NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
+ auto NewOp1 = DAG.getSplatBuildVector(VT, DL, *UniqueOps.begin());
+ return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
}
return {};
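
As a note for reviewers, here is a standalone sketch (plain C++, not the LLVM sources) of what the mask loop above computes for a hypothetical BUILD_VECTOR with operands <X, Y, Y, X>: lanes that match the first unique operand keep their own index, every other lane is redirected into the second splat by adding NumElems, which gives the <0, 5, 6, 3> mask seen in the v4f64 tests.

  #include <cstdio>
  #include <vector>

  int main() {
    const char Ops[4] = {'X', 'Y', 'Y', 'X'}; // operands of the BUILD_VECTOR
    const unsigned NumElems = 4;
    std::vector<int> Mask;
    for (unsigned I = 0; I < NumElems; ++I)
      Mask.push_back(Ops[I] == Ops[0] ? I : I + NumElems);
    for (int M : Mask)
      printf("%d ", M); // prints: 0 5 6 3
    printf("\n");
    return 0;
  }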
>From 1016286928ef8c6e3e669980f0fc6c249041ff01 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Sat, 26 Apr 2025 07:30:40 +0100
Subject: [PATCH 07/13] Update tests.
---
llvm/test/CodeGen/X86/build-vector-256.ll | 44 ++++++++++---------
llvm/test/CodeGen/X86/build-vector-512.ll | 36 ---------------
.../test/CodeGen/X86/vector-shuffle-256-v4.ll | 38 ----------------
3 files changed, 24 insertions(+), 94 deletions(-)
diff --git a/llvm/test/CodeGen/X86/build-vector-256.ll b/llvm/test/CodeGen/X86/build-vector-256.ll
index 6c1cbfb4014b6..ed00cfe4c32f1 100644
--- a/llvm/test/CodeGen/X86/build-vector-256.ll
+++ b/llvm/test/CodeGen/X86/build-vector-256.ll
@@ -417,18 +417,26 @@ define <32 x i8> @test_buildvector_v32i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
define <4 x double> @test_buildvector_4f64_2_var(double %a0, double %a1) {
; AVX-32-LABEL: test_buildvector_4f64_2_var:
; AVX-32: # %bb.0:
-; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
-; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0
+; AVX-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm1
+; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX-32-NEXT: retl
;
-; AVX-64-LABEL: test_buildvector_4f64_2_var:
-; AVX-64: # %bb.0:
-; AVX-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT: retq
+; AVX1-64-LABEL: test_buildvector_4f64_2_var:
+; AVX1-64: # %bb.0:
+; AVX1-64-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; AVX1-64-NEXT: retq
+;
+; AVX2-64-LABEL: test_buildvector_4f64_2_var:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: vbroadcastsd %xmm1, %ymm1
+; AVX2-64-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; AVX2-64-NEXT: retq
%v0 = insertelement <4 x double> poison, double %a0, i32 0
%v1 = insertelement <4 x double> %v0, double %a1, i32 1
%v2 = insertelement <4 x double> %v1, double %a1, i32 2
@@ -441,20 +449,16 @@ define <4 x double> @test_buildvector_4f64_2_load(ptr %p0, ptr %p1) {
; AVX-32: # %bb.0:
; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-32-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-32-NEXT: vbroadcastsd (%ecx), %ymm0
+; AVX-32-NEXT: vbroadcastsd (%eax), %ymm1
+; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_buildvector_4f64_2_load:
; AVX-64: # %bb.0:
-; AVX-64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-64-NEXT: vbroadcastsd (%rsi), %ymm0
+; AVX-64-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX-64-NEXT: retq
%a0 = load double, ptr %p0
%a1 = load double, ptr %p1
diff --git a/llvm/test/CodeGen/X86/build-vector-512.ll b/llvm/test/CodeGen/X86/build-vector-512.ll
index 5d38f087aa1b3..33493f43fd134 100644
--- a/llvm/test/CodeGen/X86/build-vector-512.ll
+++ b/llvm/test/CodeGen/X86/build-vector-512.ll
@@ -480,23 +480,6 @@ define <64 x i8> @test_buildvector_v64i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
; build vectors of repeated elements
define <8 x double> @test_buildvector_8f64_2_var(double %a0, double %a1) {
-; AVX-32-LABEL: test_buildvector_8f64_2_var:
-; AVX-32: # %bb.0:
-; AVX-32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm1
-; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; AVX-32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX-32-NEXT: retl
-;
-; AVX-64-LABEL: test_buildvector_8f64_2_var:
-; AVX-64: # %bb.0:
-; AVX-64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX-64-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0]
-; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; AVX-64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-64-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX-64-NEXT: retq
%v0 = insertelement <8 x double> poison, double %a0, i32 0
%v1 = insertelement <8 x double> %v0, double %a1, i32 1
%v2 = insertelement <8 x double> %v1, double %a0, i32 2
@@ -509,25 +492,6 @@ define <8 x double> @test_buildvector_8f64_2_var(double %a0, double %a1) {
}
define <8 x double> @test_buildvector_8f64_2_load(ptr %p0, ptr %p1) {
-; AVX-32-LABEL: test_buildvector_8f64_2_load:
-; AVX-32: # %bb.0:
-; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; AVX-32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1]
-; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; AVX-32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX-32-NEXT: retl
-;
-; AVX-64-LABEL: test_buildvector_8f64_2_load:
-; AVX-64: # %bb.0:
-; AVX-64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX-64-NEXT: vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1]
-; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; AVX-64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-64-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX-64-NEXT: retq
%a0 = load double, ptr %p0
%a1 = load double, ptr %p1
%v0 = insertelement <8 x double> poison, double %a0, i32 0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index f57287a5ebc62..a746f3528a050 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -2367,44 +2367,6 @@ define <4 x double> @unpckh_v4f64(<4 x double> %x, <4 x double> %y) {
ret <4 x double> %unpckh
}
-define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
-; ALL-LABEL: blend_broadcasts_v4f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vbroadcastsd (%rdi), %ymm0
-; ALL-NEXT: vbroadcastsd (%rsi), %ymm1
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
-; ALL-NEXT: retq
- %ld0 = load <4 x double>, ptr %p0, align 32
- %ld1 = load <4 x double>, ptr %p1, align 32
- %bcst0 = shufflevector <4 x double> %ld0, <4 x double> poison, <4 x i32> zeroinitializer
- %bcst1 = shufflevector <4 x double> %ld1, <4 x double> poison, <4 x i32> zeroinitializer
- %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
- ret <4 x double> %blend
-}
-
-define <4 x double> @blend_broadcasts_v2f64(ptr %p0, ptr %p1) {
-; AVX1OR2-LABEL: blend_broadcasts_v2f64:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX1OR2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; AVX1OR2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX1OR2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: blend_broadcasts_v2f64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,4,6,2]
-; AVX512VL-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0
-; AVX512VL-NEXT: retq
- %ld0 = load <2 x double>, ptr %p0, align 32
- %ld1 = load <2 x double>, ptr %p1, align 32
- %blend = shufflevector <2 x double> %ld0, <2 x double> %ld1, <4 x i32> <i32 0, i32 2, i32 2, i32 0>
- ret <4 x double> %blend
-}
-
define <4 x double> @blend_broadcasts_v1f64(ptr %p0, ptr %p1) {
; ALL-LABEL: blend_broadcasts_v1f64:
; ALL: # %bb.0:
>From 1dadc11819ae6c0f00ec445e1e811b6453d82b3b Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Sat, 26 Apr 2025 17:19:21 +0100
Subject: [PATCH 08/13] Address comments.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 25 ++++++++++++-------------
1 file changed, 12 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9447bedb6c04e..5a9485b21b310 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8753,29 +8753,28 @@ static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
MVT VT = BVOp->getSimpleValueType(0u);
auto const NumElems = VT.getVectorNumElements();
- if (Subtarget.hasAVX() && VT == MVT::v4f64) {
+ if (VT == MVT::v4f64) {
// Collect unique operands.
auto UniqueOps = SmallSet<SDValue, 16u>();
- for (auto &Op : BVOp->ops()) {
- if (isIntOrFPConstant(Op) || Op.get()->isUndef())
- return {};
+ for (SDValue Op : BVOp->ops()) {
+ if (isIntOrFPConstant(Op) || Op.isUndef())
+ return SDValue();
UniqueOps.insert(Op);
}
// Candidate BUILD_VECTOR must have 2 unique operands.
if (UniqueOps.size() != 2u)
- return {};
+ return SDValue();
// Create shuffle mask.
- auto Op0 = BVOp->getOperand(0u);
- auto Mask = std::vector<int>();
- Mask.reserve(NumElems);
+ SDValue Op0 = BVOp->getOperand(0u);
+ SmallVector<int, 16u> Mask(NumElems);
for (auto I = 0u; I < NumElems; ++I) {
- auto &Op = BVOp->getOperand(I);
- Mask.push_back(Op == Op0 ? I : I + NumElems);
+ SDValue Op = BVOp->getOperand(I);
+ Mask[I] = Op == Op0 ? I : I + NumElems;
}
// Create shuffle of splats.
- UniqueOps.erase(Op0);
- auto NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
- auto NewOp1 = DAG.getSplatBuildVector(VT, DL, *UniqueOps.begin());
+
+ SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, *UniqueOps.begin());
+ SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, *(++UniqueOps.begin()));
return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
}
>From 95136f6194cf3755c92b33d6588b57d717de2783 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Sat, 26 Apr 2025 17:25:06 +0100
Subject: [PATCH 09/13] Address comments.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5a9485b21b310..e502a54bacc30 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8765,16 +8765,16 @@ static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
if (UniqueOps.size() != 2u)
return SDValue();
// Create shuffle mask.
- SDValue Op0 = BVOp->getOperand(0u);
+ SDValue Op0 = *(UniqueOps.begin());
+ SDValue Op1 = *(++UniqueOps.begin());
SmallVector<int, 16u> Mask(NumElems);
for (auto I = 0u; I < NumElems; ++I) {
SDValue Op = BVOp->getOperand(I);
Mask[I] = Op == Op0 ? I : I + NumElems;
}
// Create shuffle of splats.
-
- SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, *UniqueOps.begin());
- SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, *(++UniqueOps.begin()));
+ SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
+ SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
}
>From feefba6c9c24e027391e789f9b6f6fc6b66782a6 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Sat, 26 Apr 2025 19:15:14 +0100
Subject: [PATCH 10/13] Update tests.
---
llvm/test/CodeGen/X86/build-vector-512.ll | 74 +++++++++++++++++++++--
1 file changed, 70 insertions(+), 4 deletions(-)
diff --git a/llvm/test/CodeGen/X86/build-vector-512.ll b/llvm/test/CodeGen/X86/build-vector-512.ll
index 33493f43fd134..789196c5e4848 100644
--- a/llvm/test/CodeGen/X86/build-vector-512.ll
+++ b/llvm/test/CodeGen/X86/build-vector-512.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX-32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX-64
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX-32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX-64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX-32,AVX512F-32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX-64,AVX512F-64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX-32,AVX512BW-32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX-64,AVX512BW-64
define <8 x double> @test_buildvector_v8f64(double %a0, double %a1, double %a2, double %a3, double %a4, double %a5, double %a6, double %a7) {
; AVX-32-LABEL: test_buildvector_v8f64:
@@ -480,6 +480,37 @@ define <64 x i8> @test_buildvector_v64i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
; build vectors of repeated elements
define <8 x double> @test_buildvector_8f64_2_var(double %a0, double %a1) {
+; AVX512F-32-LABEL: test_buildvector_8f64_2_var:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0
+; AVX512F-32-NEXT: movb $-126, %al
+; AVX512F-32-NEXT: kmovw %eax, %k1
+; AVX512F-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0 {%k1}
+; AVX512F-32-NEXT: retl
+;
+; AVX512F-64-LABEL: test_buildvector_8f64_2_var:
+; AVX512F-64: # %bb.0:
+; AVX512F-64-NEXT: vbroadcastsd %xmm0, %zmm0
+; AVX512F-64-NEXT: movb $-126, %al
+; AVX512F-64-NEXT: kmovw %eax, %k1
+; AVX512F-64-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
+; AVX512F-64-NEXT: retq
+;
+; AVX512BW-32-LABEL: test_buildvector_8f64_2_var:
+; AVX512BW-32: # %bb.0:
+; AVX512BW-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0
+; AVX512BW-32-NEXT: movb $-126, %al
+; AVX512BW-32-NEXT: kmovd %eax, %k1
+; AVX512BW-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0 {%k1}
+; AVX512BW-32-NEXT: retl
+;
+; AVX512BW-64-LABEL: test_buildvector_8f64_2_var:
+; AVX512BW-64: # %bb.0:
+; AVX512BW-64-NEXT: vbroadcastsd %xmm0, %zmm0
+; AVX512BW-64-NEXT: movb $-126, %al
+; AVX512BW-64-NEXT: kmovd %eax, %k1
+; AVX512BW-64-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
+; AVX512BW-64-NEXT: retq
%v0 = insertelement <8 x double> poison, double %a0, i32 0
%v1 = insertelement <8 x double> %v0, double %a1, i32 1
%v2 = insertelement <8 x double> %v1, double %a0, i32 2
@@ -492,6 +523,41 @@ define <8 x double> @test_buildvector_8f64_2_var(double %a0, double %a1) {
}
define <8 x double> @test_buildvector_8f64_2_load(ptr %p0, ptr %p1) {
+; AVX512F-32-LABEL: test_buildvector_8f64_2_load:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT: vbroadcastsd (%ecx), %zmm0
+; AVX512F-32-NEXT: movb $-126, %cl
+; AVX512F-32-NEXT: kmovw %ecx, %k1
+; AVX512F-32-NEXT: vbroadcastsd (%eax), %zmm0 {%k1}
+; AVX512F-32-NEXT: retl
+;
+; AVX512F-64-LABEL: test_buildvector_8f64_2_load:
+; AVX512F-64: # %bb.0:
+; AVX512F-64-NEXT: vbroadcastsd (%rdi), %zmm0
+; AVX512F-64-NEXT: movb $-126, %al
+; AVX512F-64-NEXT: kmovw %eax, %k1
+; AVX512F-64-NEXT: vbroadcastsd (%rsi), %zmm0 {%k1}
+; AVX512F-64-NEXT: retq
+;
+; AVX512BW-32-LABEL: test_buildvector_8f64_2_load:
+; AVX512BW-32: # %bb.0:
+; AVX512BW-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512BW-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512BW-32-NEXT: vbroadcastsd (%ecx), %zmm0
+; AVX512BW-32-NEXT: movb $-126, %cl
+; AVX512BW-32-NEXT: kmovd %ecx, %k1
+; AVX512BW-32-NEXT: vbroadcastsd (%eax), %zmm0 {%k1}
+; AVX512BW-32-NEXT: retl
+;
+; AVX512BW-64-LABEL: test_buildvector_8f64_2_load:
+; AVX512BW-64: # %bb.0:
+; AVX512BW-64-NEXT: vbroadcastsd (%rdi), %zmm0
+; AVX512BW-64-NEXT: movb $-126, %al
+; AVX512BW-64-NEXT: kmovd %eax, %k1
+; AVX512BW-64-NEXT: vbroadcastsd (%rsi), %zmm0 {%k1}
+; AVX512BW-64-NEXT: retq
%a0 = load double, ptr %p0
%a1 = load double, ptr %p1
%v0 = insertelement <8 x double> poison, double %a0, i32 0
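
The AVX-512 checks above blend the two broadcasts through a write-mask rather than vblendps. The -126 immediate fed to kmovw/kmovd is just the lane pattern of %a1 encoded as a signed byte; a small standalone sketch (assuming %a1 occupies lanes 1 and 7 of the v8f64 build vector, which is what the mask implies):

  #include <cstdio>

  int main() {
    const int UsesA1[8] = {0, 1, 0, 0, 0, 0, 0, 1}; // lanes that hold %a1
    unsigned Mask = 0;
    for (int I = 0; I < 8; ++I)
      if (UsesA1[I])
        Mask |= 1u << I;
    printf("0x%02x %d\n", Mask, (int)(signed char)Mask); // 0x82 -126
    return 0;
  }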
>From 521f52209afde736a42e5fb68497fc0954233c7d Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Sat, 26 Apr 2025 19:53:55 +0100
Subject: [PATCH 11/13] Address comments.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 37 ++++----
llvm/test/CodeGen/X86/build-vector-256.ll | 76 +++++++++++------
.../test/CodeGen/X86/vector-shuffle-256-v4.ll | 84 +++++++++++++++----
3 files changed, 140 insertions(+), 57 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e502a54bacc30..c51f7ce0678e3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8745,28 +8745,37 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
return LowerShift(Res, Subtarget, DAG);
}
+static bool isShuffleFoldableLoad(SDValue);
+
/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
/// representing a blend.
static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
X86Subtarget const &Subtarget,
SelectionDAG &DAG) {
MVT VT = BVOp->getSimpleValueType(0u);
- auto const NumElems = VT.getVectorNumElements();
- if (VT == MVT::v4f64) {
- // Collect unique operands.
- auto UniqueOps = SmallSet<SDValue, 16u>();
- for (SDValue Op : BVOp->ops()) {
- if (isIntOrFPConstant(Op) || Op.isUndef())
- return SDValue();
- UniqueOps.insert(Op);
- }
- // Candidate BUILD_VECTOR must have 2 unique operands.
- if (UniqueOps.size() != 2u)
+ if (VT != MVT::v4f64)
+ return SDValue();
+
+ // Collect unique operands.
+ auto UniqueOps = SmallSet<SDValue, 16u>();
+ for (SDValue Op : BVOp->ops()) {
+ if (isIntOrFPConstant(Op) || Op.isUndef())
return SDValue();
+ UniqueOps.insert(Op);
+ }
+
+ // Candidate BUILD_VECTOR must have 2 unique operands.
+ if (UniqueOps.size() != 2u)
+ return SDValue();
+
+ SDValue Op0 = *(UniqueOps.begin());
+ SDValue Op1 = *(++UniqueOps.begin());
+
+ if (isShuffleFoldableLoad(Op0) || isShuffleFoldableLoad(Op1) ||
+ Subtarget.hasAVX2()) {
// Create shuffle mask.
- SDValue Op0 = *(UniqueOps.begin());
- SDValue Op1 = *(++UniqueOps.begin());
+ auto const NumElems = VT.getVectorNumElements();
SmallVector<int, 16u> Mask(NumElems);
for (auto I = 0u; I < NumElems; ++I) {
SDValue Op = BVOp->getOperand(I);
@@ -8778,7 +8787,7 @@ static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
}
- return {};
+ return SDValue();
}
/// Create a vector constant without a load. SSE/AVX provide the bare minimum
diff --git a/llvm/test/CodeGen/X86/build-vector-256.ll b/llvm/test/CodeGen/X86/build-vector-256.ll
index ed00cfe4c32f1..3edb712e53c8d 100644
--- a/llvm/test/CodeGen/X86/build-vector-256.ll
+++ b/llvm/test/CodeGen/X86/build-vector-256.ll
@@ -415,22 +415,28 @@ define <32 x i8> @test_buildvector_v32i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
; build vectors of repeated elements
define <4 x double> @test_buildvector_4f64_2_var(double %a0, double %a1) {
-; AVX-32-LABEL: test_buildvector_4f64_2_var:
-; AVX-32: # %bb.0:
-; AVX-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0
-; AVX-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm1
-; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; AVX-32-NEXT: retl
+; AVX1-32-LABEL: test_buildvector_4f64_2_var:
+; AVX1-32: # %bb.0:
+; AVX1-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
+; AVX1-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX1-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-32-NEXT: retl
;
; AVX1-64-LABEL: test_buildvector_4f64_2_var:
; AVX1-64: # %bb.0:
-; AVX1-64-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
-; AVX1-64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX1-64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-64-NEXT: retq
;
+; AVX2-32-LABEL: test_buildvector_4f64_2_var:
+; AVX2-32: # %bb.0:
+; AVX2-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0
+; AVX2-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm1
+; AVX2-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-32-NEXT: retl
+;
; AVX2-64-LABEL: test_buildvector_4f64_2_var:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vbroadcastsd %xmm1, %ymm1
@@ -445,21 +451,41 @@ define <4 x double> @test_buildvector_4f64_2_var(double %a0, double %a1) {
}
define <4 x double> @test_buildvector_4f64_2_load(ptr %p0, ptr %p1) {
-; AVX-32-LABEL: test_buildvector_4f64_2_load:
-; AVX-32: # %bb.0:
-; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; AVX-32-NEXT: vbroadcastsd (%ecx), %ymm0
-; AVX-32-NEXT: vbroadcastsd (%eax), %ymm1
-; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; AVX-32-NEXT: retl
+; AVX1-32-LABEL: test_buildvector_4f64_2_load:
+; AVX1-32: # %bb.0:
+; AVX1-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX1-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX1-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-32-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX1-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-32-NEXT: retl
;
-; AVX-64-LABEL: test_buildvector_4f64_2_load:
-; AVX-64: # %bb.0:
-; AVX-64-NEXT: vbroadcastsd (%rsi), %ymm0
-; AVX-64-NEXT: vbroadcastsd (%rdi), %ymm1
-; AVX-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; AVX-64-NEXT: retq
+; AVX1-64-LABEL: test_buildvector_4f64_2_load:
+; AVX1-64: # %bb.0:
+; AVX1-64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-64-NEXT: retq
+;
+; AVX2-32-LABEL: test_buildvector_4f64_2_load:
+; AVX2-32: # %bb.0:
+; AVX2-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX2-32-NEXT: vbroadcastsd (%ecx), %ymm0
+; AVX2-32-NEXT: vbroadcastsd (%eax), %ymm1
+; AVX2-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-32-NEXT: retl
+;
+; AVX2-64-LABEL: test_buildvector_4f64_2_load:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: vbroadcastsd (%rsi), %ymm0
+; AVX2-64-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX2-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-64-NEXT: retq
%a0 = load double, ptr %p0
%a1 = load double, ptr %p1
%v0 = insertelement <4 x double> poison, double %a0, i32 0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index a746f3528a050..4cdc65e5c1b97 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -2368,12 +2368,28 @@ define <4 x double> @unpckh_v4f64(<4 x double> %x, <4 x double> %y) {
}
define <4 x double> @blend_broadcasts_v1f64(ptr %p0, ptr %p1) {
-; ALL-LABEL: blend_broadcasts_v1f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vbroadcastsd (%rsi), %ymm0
-; ALL-NEXT: vbroadcastsd (%rdi), %ymm1
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; ALL-NEXT: retq
+; AVX1-LABEL: blend_broadcasts_v1f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: blend_broadcasts_v1f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastsd (%rsi), %ymm0
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: blend_broadcasts_v1f64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcastsd (%rsi), %ymm0
+; AVX512VL-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX512VL-NEXT: retq
%ld0 = load <1 x double>, ptr %p0, align 32
%ld1 = load <1 x double>, ptr %p1, align 32
%blend = shufflevector <1 x double> %ld0, <1 x double> %ld1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
@@ -2381,12 +2397,28 @@ define <4 x double> @blend_broadcasts_v1f64(ptr %p0, ptr %p1) {
}
define <4 x double> @blend_broadcasts_v1f64_4x(ptr %p0, ptr %p1) {
-; ALL-LABEL: blend_broadcasts_v1f64_4x:
-; ALL: # %bb.0:
-; ALL-NEXT: vbroadcastsd (%rsi), %ymm0
-; ALL-NEXT: vbroadcastsd (%rdi), %ymm1
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; ALL-NEXT: retq
+; AVX1-LABEL: blend_broadcasts_v1f64_4x:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: blend_broadcasts_v1f64_4x:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastsd (%rsi), %ymm0
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: blend_broadcasts_v1f64_4x:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcastsd (%rsi), %ymm0
+; AVX512VL-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX512VL-NEXT: retq
%ld0 = load <1 x double>, ptr %p0, align 32
%ld1 = load <1 x double>, ptr %p1, align 32
%bcst0 = shufflevector <1 x double> %ld0, <1 x double> poison, <4 x i32> zeroinitializer
@@ -2396,12 +2428,28 @@ define <4 x double> @blend_broadcasts_v1f64_4x(ptr %p0, ptr %p1) {
}
define <4 x double> @blend_broadcasts_v1f64_2x(ptr %p0, ptr %p1) {
-; ALL-LABEL: blend_broadcasts_v1f64_2x:
-; ALL: # %bb.0:
-; ALL-NEXT: vbroadcastsd (%rsi), %ymm0
-; ALL-NEXT: vbroadcastsd (%rdi), %ymm1
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
-; ALL-NEXT: retq
+; AVX1-LABEL: blend_broadcasts_v1f64_2x:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: blend_broadcasts_v1f64_2x:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastsd (%rsi), %ymm0
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: blend_broadcasts_v1f64_2x:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcastsd (%rsi), %ymm0
+; AVX512VL-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX512VL-NEXT: retq
%ld0 = load <1 x double>, ptr %p0, align 32
%ld1 = load <1 x double>, ptr %p1, align 32
%bcst0 = shufflevector <1 x double> %ld0, <1 x double> poison, <2 x i32> zeroinitializer
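
The gate added in this patch only forms the splat+blend when each splat is cheap: either the scalar can be folded from a load (vbroadcastsd with a memory operand exists even on AVX1) or AVX2 supplies the register-operand vbroadcastsd; otherwise the pre-existing movlhps/vinsertf128 lowering is shorter, as the AVX1 test diffs show. A rough sketch of that decision (illustrative names, not the LLVM code):

  #include <cstdio>

  struct Scalar { bool IsFoldableLoad; };

  // Mirrors the new gate: proceed if either splat source can be folded from a
  // load, or if AVX2 provides a register-operand vbroadcastsd.
  static bool shouldLowerAsBlend(Scalar Op0, Scalar Op1, bool HasAVX2) {
    return Op0.IsFoldableLoad || Op1.IsFoldableLoad || HasAVX2;
  }

  int main() {
    printf("%d\n", shouldLowerAsBlend({false}, {false}, false)); // 0: keep old lowering
    printf("%d\n", shouldLowerAsBlend({true}, {false}, false));  // 1: splat+blend
    printf("%d\n", shouldLowerAsBlend({false}, {false}, true));  // 1: splat+blend
    return 0;
  }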
>From 379f6c638b3fc5e07b8df73c35a1e718461be2d0 Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Sat, 26 Apr 2025 20:08:07 +0100
Subject: [PATCH 12/13] Formatting.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c51f7ce0678e3..9732707b8b0e0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8768,11 +8768,11 @@ static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
// Candidate BUILD_VECTOR must have 2 unique operands.
if (UniqueOps.size() != 2u)
return SDValue();
-
+
SDValue Op0 = *(UniqueOps.begin());
SDValue Op1 = *(++UniqueOps.begin());
- if (isShuffleFoldableLoad(Op0) || isShuffleFoldableLoad(Op1) ||
+ if (isShuffleFoldableLoad(Op0) || isShuffleFoldableLoad(Op1) ||
Subtarget.hasAVX2()) {
// Create shuffle mask.
auto const NumElems = VT.getVectorNumElements();
>From 975f4d8b1f2350b75aedfd40490ca71e5707a9fa Mon Sep 17 00:00:00 2001
From: Leon Clark <leoclark at amd.com>
Date: Mon, 28 Apr 2025 02:47:22 +0100
Subject: [PATCH 13/13] Address comments.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9732707b8b0e0..69fa7a807d4fc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8769,8 +8769,9 @@ static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
if (UniqueOps.size() != 2u)
return SDValue();
- SDValue Op0 = *(UniqueOps.begin());
- SDValue Op1 = *(++UniqueOps.begin());
+ SDValue Op0 = BVOp->getOperand(0u);
+ UniqueOps.erase(Op0);
+ SDValue Op1 = *UniqueOps.begin();
if (isShuffleFoldableLoad(Op0) || isShuffleFoldableLoad(Op1) ||
Subtarget.hasAVX2()) {
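
Extending the earlier mask sketch, the operand choice above also decides which splat provides the "identity" lanes. One reading of this change is that anchoring Op0 on the BUILD_VECTOR's first operand, rather than on whatever the set iterator yields first, keeps the mask independent of set ordering; both variants below describe the same blend, just with the splats swapped (again an illustration, not the LLVM code):

  #include <cstdio>

  // Mask[i] = i when operand i matches the anchor, otherwise i + NumElems.
  static void buildMask(const char Ops[4], char Anchor, int Mask[4]) {
    for (int I = 0; I < 4; ++I)
      Mask[I] = Ops[I] == Anchor ? I : I + 4;
  }

  int main() {
    const char Ops[4] = {'X', 'Y', 'Y', 'X'};
    int MaskX[4], MaskY[4];
    buildMask(Ops, 'X', MaskX); // anchored on operand 0:       0 5 6 3
    buildMask(Ops, 'Y', MaskY); // anchored on the other value: 4 1 2 7
    for (int I = 0; I < 4; ++I)
      printf("%d ", MaskX[I]);
    printf("\n");
    for (int I = 0; I < 4; ++I)
      printf("%d ", MaskY[I]);
    printf("\n");
    return 0;
  }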