[llvm] 6c0a2c2 - [x86] enhance mayFoldLoad to check alignment
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 27 04:54:30 PDT 2021
Author: Sanjay Patel
Date: 2021-10-27T07:54:25-04:00
New Revision: 6c0a2c2804c04064b723f7663928621d95cac28f
URL: https://github.com/llvm/llvm-project/commit/6c0a2c2804c04064b723f7663928621d95cac28f
DIFF: https://github.com/llvm/llvm-project/commit/6c0a2c2804c04064b723f7663928621d95cac28f.diff
LOG: [x86] enhance mayFoldLoad to check alignment
As noted in D112464, a pre-AVX target may not be able to fold an
under-aligned vector load into another op, so we shouldn't report
that as a load folding candidate. I only found one caller where
this would make a difference -- combineCommutableSHUFP() -- so
that's where I added a test to show the (minor) regression.
Differential Revision: https://reviews.llvm.org/D112545
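
For illustration only (not part of the commit): a minimal standalone C++ sketch
of the new gating condition added to mayFoldLoad(). The struct and function
names below are hypothetical stand-ins for Subtarget.hasAVX(),
Subtarget.hasSSEUnalignedMem(), and the load's size/alignment queries in the
actual patch.

    // Hypothetical stand-in for the relevant X86 subtarget feature queries.
    struct FoldSubtarget {
      bool HasAVX;              // stands in for Subtarget.hasAVX()
      bool FastSSEUnalignedMem; // stands in for Subtarget.hasSSEUnalignedMem()
    };

    // A 128-bit vector load aligned to fewer than 16 bytes can only be folded
    // into another instruction when the target has AVX (VEX-encoded memory
    // operands tolerate misalignment) or reports fast unaligned SSE memory
    // access; otherwise legacy SSE memory operands require 16-byte alignment.
    static bool mayFoldVectorLoadSketch(const FoldSubtarget &ST,
                                        unsigned LoadSizeInBits,
                                        unsigned AlignInBytes) {
      if (LoadSizeInBits == 128 && AlignInBytes < 16)
        return ST.HasAVX || ST.FastSSEUnalignedMem;
      return true;
    }
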
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/oddshuffles.ll
llvm/test/CodeGen/X86/vec_insert-5.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ff83813f29c01..9f8aaca38a7ad 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5039,13 +5039,30 @@ X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
// Other Lowering Hooks
//===----------------------------------------------------------------------===//
-static bool MayFoldLoad(SDValue Op, bool AssumeSingleUse = false) {
- return (AssumeSingleUse || Op.hasOneUse()) && ISD::isNormalLoad(Op.getNode());
+static bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
+ bool AssumeSingleUse = false) {
+ if (!AssumeSingleUse && !Op.hasOneUse())
+ return false;
+ if (!ISD::isNormalLoad(Op.getNode()))
+ return false;
+
+ // If this is an unaligned vector, make sure the target supports folding it.
+ auto *Ld = cast<LoadSDNode>(Op.getNode());
+ if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
+ Ld->getValueSizeInBits(0) == 128 && Ld->getAlignment() < 16)
+ return false;
+
+ // TODO: If this is a non-temporal load and the target has an instruction
+ // for it, it should not be folded. See "useNonTemporalLoad()".
+
+ return true;
}
-static bool MayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
+static bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
+ const X86Subtarget &Subtarget,
bool AssumeSingleUse = false) {
- if (!MayFoldLoad(Op, AssumeSingleUse))
+ assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
+ if (!mayFoldLoad(Op, Subtarget, AssumeSingleUse))
return false;
// We can not replace a wide volatile load with a broadcast-from-memory,
@@ -8996,8 +9013,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
} else {
if (!Subtarget.hasAVX2() &&
- !MayFoldLoadIntoBroadcastFromMem(
+ !mayFoldLoadIntoBroadcastFromMem(
RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
+ Subtarget,
/*AssumeSingleUse=*/true))
return SDValue();
Broadcast =
@@ -12727,8 +12745,8 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
&DAG](SDValue &Input,
MutableArrayRef<int> InputMask) {
unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
- if (!Subtarget.hasAVX2() &&
- (!Subtarget.hasAVX() || EltSizeInBits < 32 || !MayFoldLoad(Input)))
+ if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
+ !mayFoldLoad(Input, Subtarget)))
return;
if (isNoopShuffleMask(InputMask))
return;
@@ -16413,7 +16431,7 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
- MayFoldLoad(peekThroughOneUseBitcasts(V1))) {
+ mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
if (!Ld->isNonTemporal()) {
MVT MemVT = VT.getHalfNumVectorElementsVT();
@@ -19413,7 +19431,8 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// FIXME: relax the profitability check iff all N1 uses are insertions.
if (!VT.is128BitVector() && IdxVal >= NumEltsIn128 &&
((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
- (Subtarget.hasAVX() && (EltSizeInBits >= 32) && MayFoldLoad(N1)))) {
+ (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
+ mayFoldLoad(N1, Subtarget)))) {
SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
SmallVector<int, 8> BlendMask;
for (unsigned i = 0; i != NumElts; ++i)
@@ -19486,7 +19505,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// combine either bitwise AND or insert of float 0.0 to set these bits.
bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
- if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
+ if (IdxVal == 0 && (!MinSize || !mayFoldLoad(N1, Subtarget))) {
// If this is an insertion of 32-bits into the low 32-bits of
// a vector, we prefer to generate a blend with immediate rather
// than an insertps. Blends are simpler operations in hardware and so
@@ -24626,8 +24645,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// being inserted between two CMOV's. (in i16 case too TBN)
// https://bugs.llvm.org/show_bug.cgi?id=40974
if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
- (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
- !MayFoldLoad(Op2))) {
+ (Op.getValueType() == MVT::i16 && !mayFoldLoad(Op1, Subtarget) &&
+ !mayFoldLoad(Op2, Subtarget))) {
Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
SDValue Ops[] = { Op2, Op1, CC, Cond };
@@ -36974,7 +36993,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (isUndefOrEqual(Mask, 0)) {
if (V1.getValueType() == MaskVT &&
V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
- MayFoldLoad(V1.getOperand(0))) {
+ mayFoldLoad(V1.getOperand(0), Subtarget)) {
if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
return SDValue(); // Nothing to do!
Res = V1.getOperand(0);
@@ -38415,8 +38434,10 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
SDValue N0 = V.getOperand(0);
SDValue N1 = V.getOperand(1);
unsigned Imm = V.getConstantOperandVal(2);
- if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
- MayFoldLoad(peekThroughOneUseBitcasts(N1)))
+ const X86Subtarget &Subtarget =
+ static_cast<const X86Subtarget &>(DAG.getSubtarget());
+ if (!mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
+ mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
return SDValue();
Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
@@ -51652,8 +51673,9 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
// concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
- (Subtarget.hasAVX2() || MayFoldLoadIntoBroadcastFromMem(
- Op0.getOperand(0), VT.getScalarType())))
+ (Subtarget.hasAVX2() ||
+ mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0), VT.getScalarType(),
+ Subtarget)))
return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
Op0.getOperand(0),
@@ -51662,7 +51684,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
// concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
(Subtarget.hasAVX2() ||
- (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
+ (EltSizeInBits >= 32 && mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
Op0.getOperand(0).getValueType() == VT.getScalarType())
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
@@ -52994,7 +53016,7 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
case ISD::SRL: {
SDValue N0 = Op.getOperand(0);
// Look out for (store (shl (load), x)).
- if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
+ if (mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
return false;
break;
}
@@ -53009,11 +53031,11 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
// Avoid disabling potential load folding opportunities.
- if (MayFoldLoad(N1) &&
+ if (mayFoldLoad(N1, Subtarget) &&
(!Commute || !isa<ConstantSDNode>(N0) ||
(Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
return false;
- if (MayFoldLoad(N0) &&
+ if (mayFoldLoad(N0, Subtarget) &&
((Commute && !isa<ConstantSDNode>(N1)) ||
(Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
return false;
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index b7f7321a8d3d4..e52cf736aa1f0 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1398,40 +1398,40 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2,
define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
; SSE2-LABEL: interleave_24i32_out:
; SSE2: # %bb.0:
+; SSE2-NEXT: movdqu 64(%rdi), %xmm9
; SSE2-NEXT: movups 80(%rdi), %xmm8
-; SSE2-NEXT: movups 64(%rdi), %xmm3
-; SSE2-NEXT: movdqu (%rdi), %xmm1
-; SSE2-NEXT: movups 16(%rdi), %xmm5
-; SSE2-NEXT: movups 32(%rdi), %xmm10
-; SSE2-NEXT: movdqu 48(%rdi), %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm11
-; SSE2-NEXT: movaps %xmm10, %xmm7
-; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,1],xmm5[3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm5[0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,1,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,3],xmm10[1,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm5[0,2]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: movaps %xmm8, %xmm4
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,1],xmm3[3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm3[1,1,1,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm8[1,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[2,0]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[0,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm8[0,3]
-; SSE2-NEXT: movups %xmm5, 16(%rsi)
-; SSE2-NEXT: movups %xmm11, (%rsi)
-; SSE2-NEXT: movups %xmm2, 16(%rdx)
-; SSE2-NEXT: movups %xmm1, (%rdx)
-; SSE2-NEXT: movups %xmm6, 16(%rcx)
-; SSE2-NEXT: movups %xmm0, (%rcx)
+; SSE2-NEXT: movdqu (%rdi), %xmm0
+; SSE2-NEXT: movdqu 16(%rdi), %xmm10
+; SSE2-NEXT: movups 32(%rdi), %xmm5
+; SSE2-NEXT: movdqu 48(%rdi), %xmm3
+; SSE2-NEXT: movaps %xmm5, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,1,1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm5[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm10[2,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm5[2,0]
+; SSE2-NEXT: movaps %xmm8, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm9[2,3]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm8[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm9[3,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm9[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,1],xmm10[3,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[2,0]
+; SSE2-NEXT: movups %xmm2, 16(%rsi)
+; SSE2-NEXT: movups %xmm4, (%rsi)
+; SSE2-NEXT: movups %xmm3, 16(%rdx)
+; SSE2-NEXT: movups %xmm0, (%rdx)
+; SSE2-NEXT: movups %xmm1, 16(%rcx)
+; SSE2-NEXT: movups %xmm7, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i32_out:
diff --git a/llvm/test/CodeGen/X86/vec_insert-5.ll b/llvm/test/CodeGen/X86/vec_insert-5.ll
index 3ad75a667eaa4..e943318b1c924 100644
--- a/llvm/test/CodeGen/X86/vec_insert-5.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-5.ll
@@ -97,20 +97,18 @@ define <4 x float> @t4_under_aligned(<4 x float>* %P) nounwind {
; X32-LABEL: t4_under_aligned:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movups (%eax), %xmm1
-; X32-NEXT: xorps %xmm2, %xmm2
-; X32-NEXT: xorps %xmm0, %xmm0
-; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[3,0]
-; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; X32-NEXT: movups (%eax), %xmm0
+; X32-NEXT: xorps %xmm1, %xmm1
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; X32-NEXT: retl
;
; ALIGN-LABEL: t4_under_aligned:
; ALIGN: # %bb.0:
-; ALIGN-NEXT: movups (%rdi), %xmm1
-; ALIGN-NEXT: xorps %xmm2, %xmm2
-; ALIGN-NEXT: xorps %xmm0, %xmm0
-; ALIGN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[3,0]
-; ALIGN-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; ALIGN-NEXT: movups (%rdi), %xmm0
+; ALIGN-NEXT: xorps %xmm1, %xmm1
+; ALIGN-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
+; ALIGN-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; ALIGN-NEXT: retq
;
; UNALIGN-LABEL: t4_under_aligned: