[PATCH] D61546: Stop the DAG combiner from combining vector stores greater than preferred vector width...
Eric Christopher via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Fri May 3 20:16:17 PDT 2019
echristo created this revision.
echristo added a reviewer: craig.topper.
Herald added subscribers: llvm-commits, hiraditya, mcrosier.
Herald added a project: LLVM.
Originally we said that -mprefer-vector-width was only going to limit the vectorizer and some of the code generation, but here's another spot: if we honor the option, we shouldn't canonicalize a memcpy/memmove and then lower it to the widest vector type anyway.
Original testcase:
void Copy256(const char* src, char* dst) {
  char tmp[32];
  for (int i = 0; i < 32; ++i) tmp[i] = src[i];
  for (int i = 0; i < 32; ++i) dst[i] = tmp[i];
}
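Clang recognizes the two loops as a copy idiom and collapses them into a single memmove intrinsic long before we get to ISel, so the IR we actually lower is essentially just the call, roughly (attributes elided; see the full test below):

define void @Copy256(i8* %src, i8* %dst) {
entry:
  call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 32, i1 false)
  ret void
}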
which is pretty boring, but shows the problem:
vmovups ymm0, ymmword ptr [rdi]
vmovups ymmword ptr [rsi], ymm0
vzeroupper
ret
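With a 128-bit preference honored, I'd expect the same 32-byte memmove to lower to a pair of 16-byte moves instead, something like (exact register assignment may differ):

vmovups xmm0, xmmword ptr [rdi]
vmovups xmm1, xmmword ptr [rdi + 16]
vmovups xmmword ptr [rsi + 16], xmm1
vmovups xmmword ptr [rsi], xmm0
ret

i.e. no ymm register is touched and the vzeroupper disappears.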
While the option says that this doesn't necessarily mean no vector code at all, I think this is a fairly reasonable place to stop some of the optimization.
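(For anyone reproducing this: the preference reaches the backend as the "prefer-vector-width"="128" function attribute, visible in attributes #0 of the test below, which clang emits for e.g.:

clang -O2 -march=skylake -mprefer-vector-width=128 -S copy256.cc

where copy256.cc, name mine, just contains the testcase above.)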
Thoughts?
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D61546
Files:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vector-width-store-merge.ll
Index: llvm/test/CodeGen/X86/vector-width-store-merge.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/vector-width-store-merge.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
+
+; This tests whether or not we generate vectors larger than the preferred vector
+; width when lowering memmove.
+
+; Function Attrs: nounwind uwtable
+define dso_local void @Copy256(i8* nocapture readonly %src, i8* nocapture %dst) local_unnamed_addr #0 {
+entry:
+; CHECK-LABEL: Copy256
+; CHECK-NOT: vmovups %ymm0
+; CHECK: vmovups %xmm1
+  call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 32, i1 false)
+ ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1 immarg) #1
+
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-cpu"="skylake" "target-features"="+adx,+aes,+avx,+avx2,+bmi,+bmi2,+clflushopt,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2058,18 +2058,17 @@
 /// source is constant so it does not need to be loaded.
 /// It returns EVT::Other if the type should be determined using generic
 /// target-independent logic.
-EVT
-X86TargetLowering::getOptimalMemOpType(
+EVT X86TargetLowering::getOptimalMemOpType(
     uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
     bool ZeroMemset, bool MemcpyStrSrc,
     const AttributeList &FuncAttributes) const {
   if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
-    if (Size >= 16 &&
-        (!Subtarget.isUnalignedMem16Slow() ||
-         ((DstAlign == 0 || DstAlign >= 16) &&
-          (SrcAlign == 0 || SrcAlign >= 16)))) {
+    if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
+                       ((DstAlign == 0 || DstAlign >= 16) &&
+                        (SrcAlign == 0 || SrcAlign >= 16)))) {
       // FIXME: Check if unaligned 32-byte accesses are slow.
-      if (Size >= 32 && Subtarget.hasAVX()) {
+      if (Size >= 32 && Subtarget.hasAVX() &&
+          (Subtarget.getPreferVectorWidth() >= 256)) {
         // Although this isn't a well-supported type for AVX1, we'll let
         // legalization and shuffle lowering produce the optimal codegen. If we
         // choose an optimal type with a vector element larger than a byte,
@@ -2077,11 +2076,12 @@
         // multiply) before we splat as a vector.
         return MVT::v32i8;
       }
-      if (Subtarget.hasSSE2())
+      if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
         return MVT::v16i8;
       // TODO: Can SSE1 handle a byte vector?
       // If we have SSE1 registers we should be able to use them.
-      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()))
+      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
+          (Subtarget.getPreferVectorWidth() >= 128))
         return MVT::v4f32;
     } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
                !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
@@ -4964,6 +4964,8 @@
     unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
     return (MemVT.getSizeInBits() <= MaxIntSize);
   }
+  if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
+    return false;
   return true;
 }
 