[llvm] e4b260e - [Codegen][X86] `LowerBUILD_VECTOR()`: improve lowering w/ multiple FREEZE-UNDEF ops
Roman Lebedev via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 4 10:16:31 PST 2023
Author: Roman Lebedev
Date: 2023-01-04T21:16:11+03:00
New Revision: e4b260efb2c8dca3035597d30ffd066ffb77daea
URL: https://github.com/llvm/llvm-project/commit/e4b260efb2c8dca3035597d30ffd066ffb77daea
DIFF: https://github.com/llvm/llvm-project/commit/e4b260efb2c8dca3035597d30ffd066ffb77daea.diff
LOG: [Codegen][X86] `LowerBUILD_VECTOR()`: improve lowering w/ multiple FREEZE-UNDEF ops
While we have great handling for UNDEF operands,
FREEZE-UNDEF operands are effectively normal operands.
We are better off "interleaving" such a BUILD_VECTOR into a blend
between a splat of FREEZE-UNDEF and the "thawed" source BUILD_VECTOR,
both of which are more natural for us to handle.
Refs. https://github.com/llvm/llvm-project/commit/f738ab9075f838dd4365adf3a92ca1acced114d7#r95017306
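For illustration, here is a minimal standalone C++ sketch (hypothetical; it uses no LLVM APIs and is not the SelectionDAG code itself) of how the shuffle blend mask is constructed: real operands are taken from the "thawed" source vector (lane indices 0..N-1), FREEZE-UNDEF lanes are taken from the FREEZE-UNDEF splat (lane indices N..2N-1), and plain UNDEF lanes keep a -1 (undef) mask entry.

// Standalone sketch of the blend-mask construction (hypothetical helper names).
#include <cstdio>
#include <vector>

enum class Lane { Real, Undef, FrozenUndef };

// Mirrors the loop added to LowerBUILD_VECTOR(): for N lanes, the shuffle
// operands are (thawed source BUILD_VECTOR, FREEZE-UNDEF splat).
std::vector<int> buildBlendMask(const std::vector<Lane> &Lanes) {
  unsigned NumElems = static_cast<unsigned>(Lanes.size());
  std::vector<int> BlendMask(NumElems, -1);
  for (unsigned i = 0; i != NumElems; ++i) {
    if (Lanes[i] == Lane::Undef)
      continue;                               // stays -1: lane remains undef
    BlendMask[i] = (Lanes[i] == Lane::Real)
                       ? static_cast<int>(i)             // from thawed source
                       : static_cast<int>(i + NumElems); // from FREEZE-UNDEF splat
  }
  return BlendMask;
}

int main() {
  // <4 x i32> build_vector x, freeze(undef), undef, freeze(undef)
  std::vector<Lane> Lanes = {Lane::Real, Lane::FrozenUndef, Lane::Undef,
                             Lane::FrozenUndef};
  for (int M : buildBlendMask(Lanes))
    std::printf("%d ", M); // prints: 0 5 -1 7
  std::printf("\n");
}

Both shuffle inputs are then ordinary BUILD_VECTORs, so the existing lowering paths (splats, blends) apply, as the test diffs below show.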
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/freeze-vector.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c0aeed8e45fb1..1babfa2da2539 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -11137,6 +11137,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
+ MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
unsigned NumElems = Op.getNumOperands();
// Generate vectors for predicate vectors.
@@ -11151,6 +11152,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
unsigned EVTBits = EltVT.getSizeInBits();
APInt UndefMask = APInt::getZero(NumElems);
+ APInt FrozenUndefMask = APInt::getZero(NumElems);
APInt ZeroMask = APInt::getZero(NumElems);
APInt NonZeroMask = APInt::getZero(NumElems);
bool IsAllConstants = true;
@@ -11162,6 +11164,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
UndefMask.setBit(i);
continue;
}
+ if (Elt.getOpcode() == ISD::FREEZE && Elt.getOperand(0).isUndef()) {
+ FrozenUndefMask.setBit(i);
+ continue;
+ }
Values.insert(Elt);
if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
IsAllConstants = false;
@@ -11175,11 +11181,37 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
}
// All undef vector. Return an UNDEF. All zero vectors were handled above.
- if (NonZeroMask == 0) {
+ unsigned NumFrozenUndefElts = FrozenUndefMask.countPopulation();
+ if (NonZeroMask == 0 && NumFrozenUndefElts != NumElems) {
assert(UndefMask.isAllOnes() && "Fully undef mask expected");
return DAG.getUNDEF(VT);
}
+ // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
+ // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
+ // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
+ // and blend the FREEZE-UNDEF operands back in.
+ // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
+ if (NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
+ SmallVector<int, 16> BlendMask(NumElems, -1);
+ SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
+ for (unsigned i = 0; i < NumElems; ++i) {
+ if (UndefMask[i]) {
+ BlendMask[i] = -1;
+ continue;
+ }
+ BlendMask[i] = i;
+ if (!FrozenUndefMask[i])
+ Elts[i] = Op.getOperand(i);
+ else
+ BlendMask[i] += NumElems;
+ }
+ SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
+ SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
+ SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
+ return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
+ }
+
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
// If the upper elts of a ymm/zmm are undef/zero then we might be better off
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index 1dec0a454cb0e..1ed5ab2352da8 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -337,17 +337,17 @@ define void @freeze_two_frozen_buildvectors(ptr %origin0, ptr %origin1, ptr %dst
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl (%edx), %edx
; X86-NEXT: andl $15, %edx
-; X86-NEXT: vmovd %eax, %xmm0
-; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm1
-; X86-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; X86-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
-; X86-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7]
-; X86-NEXT: vpand %xmm2, %xmm1, %xmm1
+; X86-NEXT: vmovd %edx, %xmm0
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; X86-NEXT: vmovd %eax, %xmm2
+; X86-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X86-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7]
+; X86-NEXT: vpand %xmm3, %xmm1, %xmm1
; X86-NEXT: vmovdqa %xmm1, (%ecx)
-; X86-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; X86-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
-; X86-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; X86-NEXT: vpand %xmm2, %xmm0, %xmm0
+; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
+; X86-NEXT: vpand %xmm3, %xmm0, %xmm0
; X86-NEXT: vmovdqa %xmm0, (%eax)
; X86-NEXT: retl
;
@@ -356,16 +356,15 @@ define void @freeze_two_frozen_buildvectors(ptr %origin0, ptr %origin1, ptr %dst
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: andl $15, %eax
; X64-NEXT: vmovd %eax, %xmm0
-; X64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm1
-; X64-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; X64-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
-; X64-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7]
-; X64-NEXT: vpand %xmm2, %xmm1, %xmm1
-; X64-NEXT: vmovdqa %xmm1, (%rdx)
-; X64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; X64-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
-; X64-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; X64-NEXT: vpand %xmm2, %xmm0, %xmm0
+; X64-NEXT: vpbroadcastd %xmm0, %xmm0
+; X64-NEXT: vmovd %eax, %xmm1
+; X64-NEXT: vpbroadcastd %xmm1, %xmm1
+; X64-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2,3]
+; X64-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7]
+; X64-NEXT: vpand %xmm3, %xmm2, %xmm2
+; X64-NEXT: vmovdqa %xmm2, (%rdx)
+; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
+; X64-NEXT: vpand %xmm3, %xmm0, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rcx)
; X64-NEXT: retq
%i0.src = load i32, ptr %origin0
@@ -392,15 +391,15 @@ define void @freeze_two_buildvectors_only_one_frozen(ptr %origin0, ptr %origin1,
; X86-NEXT: movl (%edx), %edx
; X86-NEXT: andl $15, %edx
; X86-NEXT: vmovd %eax, %xmm0
-; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
-; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
-; X86-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; X86-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7]
-; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X86-NEXT: vmovd %edx, %xmm1
+; X86-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
+; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
+; X86-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7]
+; X86-NEXT: vpand %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovdqa %xmm0, (%ecx)
-; X86-NEXT: vmovd %edx, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; X86-NEXT: vpand %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovdqa %xmm0, (%eax)
; X86-NEXT: retl
;
@@ -409,15 +408,14 @@ define void @freeze_two_buildvectors_only_one_frozen(ptr %origin0, ptr %origin1,
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: andl $15, %eax
; X64-NEXT: vmovd %eax, %xmm0
-; X64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; X64-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
-; X64-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7]
-; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
-; X64-NEXT: vmovdqa %xmm0, (%rdx)
-; X64-NEXT: vmovd %eax, %xmm0
; X64-NEXT: vpbroadcastd %xmm0, %xmm0
-; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-NEXT: vmovd %eax, %xmm1
+; X64-NEXT: vpbroadcastd %xmm1, %xmm1
+; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; X64-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7]
+; X64-NEXT: vpand %xmm2, %xmm0, %xmm0
+; X64-NEXT: vmovdqa %xmm0, (%rdx)
+; X64-NEXT: vpand %xmm2, %xmm1, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rcx)
; X64-NEXT: retq
%i0.src = load i32, ptr %origin0
@@ -443,30 +441,24 @@ define void @freeze_two_buildvectors_one_undef_elt(ptr %origin0, ptr %origin1, p
; X86-NEXT: movl (%edx), %edx
; X86-NEXT: andl $15, %edx
; X86-NEXT: vmovd %eax, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; X86-NEXT: vmovd %edx, %xmm1
; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X86-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; X86-NEXT: vmovddup {{.*#+}} xmm2 = [7,7]
; X86-NEXT: # xmm2 = mem[0,0]
; X86-NEXT: vpand %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovdqa %xmm0, (%ecx)
-; X86-NEXT: vpand %xmm2, %xmm1, %xmm0
+; X86-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; X86-NEXT: vpand %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovdqa %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: freeze_two_buildvectors_one_undef_elt:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: andl $15, %eax
-; X64-NEXT: vmovq %rax, %xmm0
-; X64-NEXT: vmovq %rax, %xmm1
-; X64-NEXT: vpbroadcastq %xmm1, %xmm1
-; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X64-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7]
-; X64-NEXT: vpand %xmm2, %xmm0, %xmm0
+; X64-NEXT: vmovd %eax, %xmm0
+; X64-NEXT: vpbroadcastd %xmm0, %xmm0
+; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rdx)
-; X64-NEXT: vpand %xmm2, %xmm1, %xmm0
; X64-NEXT: vmovdqa %xmm0, (%rcx)
; X64-NEXT: retq
%i0.src = load i64, ptr %origin0