[llvm] 0922789 - [X86][AVX] Match v4f64 blend from shuffle of scalar values. (#135753)
via llvm-commits
llvm-commits at lists.llvm.org
Tue May 6 06:27:33 PDT 2025
Author: Leon Clark
Date: 2025-05-06T14:27:27+01:00
New Revision: 0922789e4742e0961f8db171fea1590a8580937e
URL: https://github.com/llvm/llvm-project/commit/0922789e4742e0961f8db171fea1590a8580937e
DIFF: https://github.com/llvm/llvm-project/commit/0922789e4742e0961f8db171fea1590a8580937e.diff
LOG: [X86][AVX] Match v4f64 blend from shuffle of scalar values. (#135753)
Convert a BUILD_VECTOR of scalar values to a shuffle of shuffles that
will lower to an AVX blend.
This addresses a regression in #128938.
---------
Co-authored-by: Leon Clark <leoclark at amd.com>
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/build-vector-256.ll
llvm/test/CodeGen/X86/build-vector-512.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index cd1bbb8fbb7b7..f04603867a587 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -21,6 +21,7 @@
#include "X86TargetMachine.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
@@ -37,6 +38,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SDPatternMatch.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallingConv.h"
@@ -8783,6 +8785,52 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
return LowerShift(Res, Subtarget, DAG);
}
+static bool isShuffleFoldableLoad(SDValue);
+
+/// Attempt to lower a v4f64 BUILD_VECTOR that uses exactly two distinct scalar operands (none constant or undef)
+/// to a blend, expressed as a shuffle of two splat BUILD_VECTORs. Returns the shuffle, or an empty SDValue if not matched.
+static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
+ X86Subtarget const &Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = BVOp->getSimpleValueType(0u);
+
+ if (VT != MVT::v4f64) // Only the v4f64 result type is handled here.
+ return SDValue();
+
+ // Collect the distinct operands; bail out as soon as a constant or undef operand is seen.
+ auto UniqueOps = SmallSet<SDValue, 16u>();
+ for (SDValue Op : BVOp->ops()) {
+ if (isIntOrFPConstant(Op) || Op.isUndef())
+ return SDValue();
+ UniqueOps.insert(Op);
+ }
+
+ // Only a BUILD_VECTOR with exactly 2 distinct operands is a candidate.
+ if (UniqueOps.size() != 2u)
+ return SDValue();
+
+ SDValue Op0 = BVOp->getOperand(0u);
+ UniqueOps.erase(Op0); // Dropping element 0's value leaves only the other distinct operand in the set.
+ SDValue Op1 = *UniqueOps.begin();
+
+ if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
+ isShuffleFoldableLoad(Op1)) {
+ // Build the blend mask: lane I selects index I (lane I of the Op0 splat) or I + NumElems (lane I of the Op1 splat).
+ auto const NumElems = VT.getVectorNumElements();
+ SmallVector<int, 16u> Mask(NumElems);
+ for (auto I = 0u; I < NumElems; ++I) {
+ SDValue Op = BVOp->getOperand(I);
+ Mask[I] = Op == Op0 ? I : I + NumElems;
+ }
+ // Splat each operand to a full vector, then shuffle the two splats with the mask built above.
+ SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
+ SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
+ return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
+ }
+
+ return SDValue(); // No AVX2 and neither isShuffleFoldableLoad check passed: leave the node to other lowerings.
+}
+
/// Create a vector constant without a load. SSE/AVX provide the bare minimum
/// functionality to do this, so it's all zeros, all ones, or some derivation
/// that is cheap to calculate.
@@ -9245,6 +9293,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return Broadcast;
if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
return BitOp;
+ if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
+ return Blend;
unsigned NumZero = ZeroMask.popcount();
unsigned NumNonZero = NonZeroMask.popcount();
diff --git a/llvm/test/CodeGen/X86/build-vector-256.ll b/llvm/test/CodeGen/X86/build-vector-256.ll
index 6c1cbfb4014b6..3edb712e53c8d 100644
--- a/llvm/test/CodeGen/X86/build-vector-256.ll
+++ b/llvm/test/CodeGen/X86/build-vector-256.ll
@@ -415,20 +415,34 @@ define <32 x i8> @test_buildvector_v32i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
; build vectors of repeated elements
define <4 x double> @test_buildvector_4f64_2_var(double %a0, double %a1) {
-; AVX-32-LABEL: test_buildvector_4f64_2_var:
-; AVX-32: # %bb.0:
-; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
-; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-32-NEXT: retl
+; AVX1-32-LABEL: test_buildvector_4f64_2_var:
+; AVX1-32: # %bb.0:
+; AVX1-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
+; AVX1-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX1-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-32-NEXT: retl
;
-; AVX-64-LABEL: test_buildvector_4f64_2_var:
-; AVX-64: # %bb.0:
-; AVX-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT: retq
+; AVX1-64-LABEL: test_buildvector_4f64_2_var:
+; AVX1-64: # %bb.0:
+; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-64-NEXT: retq
+;
+; AVX2-32-LABEL: test_buildvector_4f64_2_var:
+; AVX2-32: # %bb.0:
+; AVX2-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0
+; AVX2-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm1
+; AVX2-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-32-NEXT: retl
+;
+; AVX2-64-LABEL: test_buildvector_4f64_2_var:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: vbroadcastsd %xmm1, %ymm1
+; AVX2-64-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; AVX2-64-NEXT: retq
%v0 = insertelement <4 x double> poison, double %a0, i32 0
%v1 = insertelement <4 x double> %v0, double %a1, i32 1
%v2 = insertelement <4 x double> %v1, double %a1, i32 2
@@ -437,25 +451,41 @@ define <4 x double> @test_buildvector_4f64_2_var(double %a0, double %a1) {
}
define <4 x double> @test_buildvector_4f64_2_load(ptr %p0, ptr %p1) {
-; AVX-32-LABEL: test_buildvector_4f64_2_load:
-; AVX-32: # %bb.0:
-; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-32-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-32-NEXT: retl
+; AVX1-32-LABEL: test_buildvector_4f64_2_load:
+; AVX1-32: # %bb.0:
+; AVX1-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX1-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX1-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-32-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX1-32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-32-NEXT: retl
;
-; AVX-64-LABEL: test_buildvector_4f64_2_load:
-; AVX-64: # %bb.0:
-; AVX-64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
-; AVX-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT: retq
+; AVX1-64-LABEL: test_buildvector_4f64_2_load:
+; AVX1-64: # %bb.0:
+; AVX1-64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX1-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-64-NEXT: retq
+;
+; AVX2-32-LABEL: test_buildvector_4f64_2_load:
+; AVX2-32: # %bb.0:
+; AVX2-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX2-32-NEXT: vbroadcastsd (%ecx), %ymm0
+; AVX2-32-NEXT: vbroadcastsd (%eax), %ymm1
+; AVX2-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-32-NEXT: retl
+;
+; AVX2-64-LABEL: test_buildvector_4f64_2_load:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: vbroadcastsd (%rsi), %ymm0
+; AVX2-64-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX2-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-64-NEXT: retq
%a0 = load double, ptr %p0
%a1 = load double, ptr %p1
%v0 = insertelement <4 x double> poison, double %a0, i32 0
diff --git a/llvm/test/CodeGen/X86/build-vector-512.ll b/llvm/test/CodeGen/X86/build-vector-512.ll
index 5d38f087aa1b3..789196c5e4848 100644
--- a/llvm/test/CodeGen/X86/build-vector-512.ll
+++ b/llvm/test/CodeGen/X86/build-vector-512.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX-32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX-64
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX-32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX-64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX-32,AVX512F-32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX-64,AVX512F-64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX-32,AVX512BW-32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX-64,AVX512BW-64
define <8 x double> @test_buildvector_v8f64(double %a0, double %a1, double %a2, double %a3, double %a4, double %a5, double %a6, double %a7) {
; AVX-32-LABEL: test_buildvector_v8f64:
@@ -480,23 +480,37 @@ define <64 x i8> @test_buildvector_v64i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
; build vectors of repeated elements
define <8 x double> @test_buildvector_8f64_2_var(double %a0, double %a1) {
-; AVX-32-LABEL: test_buildvector_8f64_2_var:
-; AVX-32: # %bb.0:
-; AVX-32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm1
-; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; AVX-32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX-32-NEXT: retl
+; AVX512F-32-LABEL: test_buildvector_8f64_2_var:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0
+; AVX512F-32-NEXT: movb $-126, %al
+; AVX512F-32-NEXT: kmovw %eax, %k1
+; AVX512F-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0 {%k1}
+; AVX512F-32-NEXT: retl
;
-; AVX-64-LABEL: test_buildvector_8f64_2_var:
-; AVX-64: # %bb.0:
-; AVX-64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX-64-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0]
-; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; AVX-64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-64-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX-64-NEXT: retq
+; AVX512F-64-LABEL: test_buildvector_8f64_2_var:
+; AVX512F-64: # %bb.0:
+; AVX512F-64-NEXT: vbroadcastsd %xmm0, %zmm0
+; AVX512F-64-NEXT: movb $-126, %al
+; AVX512F-64-NEXT: kmovw %eax, %k1
+; AVX512F-64-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
+; AVX512F-64-NEXT: retq
+;
+; AVX512BW-32-LABEL: test_buildvector_8f64_2_var:
+; AVX512BW-32: # %bb.0:
+; AVX512BW-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0
+; AVX512BW-32-NEXT: movb $-126, %al
+; AVX512BW-32-NEXT: kmovd %eax, %k1
+; AVX512BW-32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0 {%k1}
+; AVX512BW-32-NEXT: retl
+;
+; AVX512BW-64-LABEL: test_buildvector_8f64_2_var:
+; AVX512BW-64: # %bb.0:
+; AVX512BW-64-NEXT: vbroadcastsd %xmm0, %zmm0
+; AVX512BW-64-NEXT: movb $-126, %al
+; AVX512BW-64-NEXT: kmovd %eax, %k1
+; AVX512BW-64-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
+; AVX512BW-64-NEXT: retq
%v0 = insertelement <8 x double> poison, double %a0, i32 0
%v1 = insertelement <8 x double> %v0, double %a1, i32 1
%v2 = insertelement <8 x double> %v1, double %a0, i32 2
@@ -509,25 +523,41 @@ define <8 x double> @test_buildvector_8f64_2_var(double %a0, double %a1) {
}
define <8 x double> @test_buildvector_8f64_2_load(ptr %p0, ptr %p1) {
-; AVX-32-LABEL: test_buildvector_8f64_2_load:
-; AVX-32: # %bb.0:
-; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; AVX-32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1]
-; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; AVX-32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX-32-NEXT: retl
+; AVX512F-32-LABEL: test_buildvector_8f64_2_load:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT: vbroadcastsd (%ecx), %zmm0
+; AVX512F-32-NEXT: movb $-126, %cl
+; AVX512F-32-NEXT: kmovw %ecx, %k1
+; AVX512F-32-NEXT: vbroadcastsd (%eax), %zmm0 {%k1}
+; AVX512F-32-NEXT: retl
;
-; AVX-64-LABEL: test_buildvector_8f64_2_load:
-; AVX-64: # %bb.0:
-; AVX-64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX-64-NEXT: vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1]
-; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; AVX-64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-64-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX-64-NEXT: retq
+; AVX512F-64-LABEL: test_buildvector_8f64_2_load:
+; AVX512F-64: # %bb.0:
+; AVX512F-64-NEXT: vbroadcastsd (%rdi), %zmm0
+; AVX512F-64-NEXT: movb $-126, %al
+; AVX512F-64-NEXT: kmovw %eax, %k1
+; AVX512F-64-NEXT: vbroadcastsd (%rsi), %zmm0 {%k1}
+; AVX512F-64-NEXT: retq
+;
+; AVX512BW-32-LABEL: test_buildvector_8f64_2_load:
+; AVX512BW-32: # %bb.0:
+; AVX512BW-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512BW-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512BW-32-NEXT: vbroadcastsd (%ecx), %zmm0
+; AVX512BW-32-NEXT: movb $-126, %cl
+; AVX512BW-32-NEXT: kmovd %ecx, %k1
+; AVX512BW-32-NEXT: vbroadcastsd (%eax), %zmm0 {%k1}
+; AVX512BW-32-NEXT: retl
+;
+; AVX512BW-64-LABEL: test_buildvector_8f64_2_load:
+; AVX512BW-64: # %bb.0:
+; AVX512BW-64-NEXT: vbroadcastsd (%rdi), %zmm0
+; AVX512BW-64-NEXT: movb $-126, %al
+; AVX512BW-64-NEXT: kmovd %eax, %k1
+; AVX512BW-64-NEXT: vbroadcastsd (%rsi), %zmm0 {%k1}
+; AVX512BW-64-NEXT: retq
%a0 = load double, ptr %p0
%a1 = load double, ptr %p1
%v0 = insertelement <8 x double> poison, double %a0, i32 0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index fb8618be17f06..4cdc65e5c1b97 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -2367,6 +2367,97 @@ define <4 x double> @unpckh_v4f64(<4 x double> %x, <4 x double> %y) {
ret <4 x double> %unpckh
}
+define <4 x double> @blend_broadcasts_v1f64(ptr %p0, ptr %p1) {
+; AVX1-LABEL: blend_broadcasts_v1f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: blend_broadcasts_v1f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastsd (%rsi), %ymm0
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: blend_broadcasts_v1f64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcastsd (%rsi), %ymm0
+; AVX512VL-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX512VL-NEXT: retq
+ %ld0 = load <1 x double>, ptr %p0, align 32 ; first scalar, loaded as a one-element vector
+ %ld1 = load <1 x double>, ptr %p1, align 32 ; second scalar, loaded as a one-element vector
+ %blend = shufflevector <1 x double> %ld0, <1 x double> %ld1, <4 x i32> <i32 0, i32 1, i32 1, i32 0> ; lanes 0 and 3 take %ld0, lanes 1 and 2 take %ld1
+ ret <4 x double> %blend
+}
+
+define <4 x double> @blend_broadcasts_v1f64_4x(ptr %p0, ptr %p1) {
+; AVX1-LABEL: blend_broadcasts_v1f64_4x:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: blend_broadcasts_v1f64_4x:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastsd (%rsi), %ymm0
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: blend_broadcasts_v1f64_4x:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcastsd (%rsi), %ymm0
+; AVX512VL-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX512VL-NEXT: retq
+ %ld0 = load <1 x double>, ptr %p0, align 32 ; first scalar, loaded as a one-element vector
+ %ld1 = load <1 x double>, ptr %p1, align 32 ; second scalar, loaded as a one-element vector
+ %bcst0 = shufflevector <1 x double> %ld0, <1 x double> poison, <4 x i32> zeroinitializer ; splat %ld0 across 4 lanes
+ %bcst1 = shufflevector <1 x double> %ld1, <1 x double> poison, <4 x i32> zeroinitializer ; splat %ld1 across 4 lanes
+ %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> <i32 0, i32 5, i32 6, i32 3> ; lanes 0 and 3 from %bcst0, lanes 1 and 2 from %bcst1
+ ret <4 x double> %blend
+}
+
+define <4 x double> @blend_broadcasts_v1f64_2x(ptr %p0, ptr %p1) {
+; AVX1-LABEL: blend_broadcasts_v1f64_2x:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: blend_broadcasts_v1f64_2x:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastsd (%rsi), %ymm0
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: blend_broadcasts_v1f64_2x:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcastsd (%rsi), %ymm0
+; AVX512VL-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX512VL-NEXT: retq
+ %ld0 = load <1 x double>, ptr %p0, align 32 ; first scalar, loaded as a one-element vector
+ %ld1 = load <1 x double>, ptr %p1, align 32 ; second scalar, loaded as a one-element vector
+ %bcst0 = shufflevector <1 x double> %ld0, <1 x double> poison, <2 x i32> zeroinitializer ; splat %ld0 across 2 lanes
+ %bcst1 = shufflevector <1 x double> %ld1, <1 x double> poison, <2 x i32> zeroinitializer ; splat %ld1 across 2 lanes
+ %blend = shufflevector <2 x double> %bcst0, <2 x double> %bcst1, <4 x i32> <i32 0, i32 2, i32 3, i32 1> ; widens to 4 lanes: %bcst0, %bcst1, %bcst1, %bcst0
+ ret <4 x double> %blend
+}
+
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
More information about the llvm-commits mailing list