[PATCH] D61546: Stop the DAG combiner from combining vector stores greater than preferred vector width...

Eric Christopher via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Fri May 3 20:16:17 PDT 2019


echristo created this revision.
echristo added a reviewer: craig.topper.
Herald added subscribers: llvm-commits, hiraditya, mcrosier.
Herald added a project: LLVM.

Originally we said that -mprefer-vector-width was only going to stop the vectorizer and some of code generation, but here's another spot to handle if we want to make sure we don't canonicalize a memcpy/memmove and then lower it using the widest vector type.
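
For reference, the gate this patch adds can be modeled in isolation. This is only a minimal sketch of the decision, not the real hook: the actual X86TargetLowering::getOptimalMemOpType also takes alignment and memset parameters (see the diff below), and pickMemOpType, HasAVX, HasSSE2, and PreferVectorWidth here are illustrative stand-ins for the Subtarget queries:

  #include <cstdint>
  #include <cstdio>

  // Toy model of the type choice in getOptimalMemOpType: pick the widest
  // vector store the memcpy/memmove expansion may use, but never wider
  // than the preferred vector width.
  enum class MemOpTy { Scalar, V16i8, V32i8 };

  static MemOpTy pickMemOpType(uint64_t Size, bool HasAVX, bool HasSSE2,
                               unsigned PreferVectorWidth) {
    if (Size >= 32 && HasAVX && PreferVectorWidth >= 256)
      return MemOpTy::V32i8;
    if (Size >= 16 && HasSSE2 && PreferVectorWidth >= 128)
      return MemOpTy::V16i8;
    return MemOpTy::Scalar;
  }

  int main() {
    // A 32-byte copy on an AVX target with prefer-vector-width=128 now
    // caps at V16i8 (prints 1) instead of V32i8 (2).
    printf("%d\n", static_cast<int>(pickMemOpType(32, true, true, 128)));
    return 0;
  }

The upshot is that a 32-byte copy under prefer-vector-width=128 resolves to v16i8 rather than v32i8, so the expansion emits two 16-byte operations instead of one 32-byte one.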

Original testcase:

  void Copy256(const char* src, char* dst) {
    char tmp[32];
    for (int i = 0; i < 32; ++i) tmp[i] = src[i];
    for (int i = 0; i < 32; ++i) dst[i] = tmp[i];
  }

which is pretty boring, but shows the problem:

  vmovups ymm0, ymmword ptr [rdi]
  vmovups ymmword ptr [rsi], ymm0
  vzeroupper
  ret

While the option doesn't necessarily mean no vector code of that width will ever be emitted, I think this is a fairly reasonable place to stop this optimization: with prefer-vector-width=128 the copy should go through xmm registers instead (which is what the new test checks for), with no ymm traffic and no vzeroupper.

Thoughts?


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D61546

Files:
  llvm/lib/Target/X86/X86ISelLowering.cpp
  llvm/test/CodeGen/X86/vector-width-store-merge.ll


Index: llvm/test/CodeGen/X86/vector-width-store-merge.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/vector-width-store-merge.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
+
+; This tests whether we generate vectors larger than the preferred vector
+; width when lowering memmove.
+
+; Function Attrs: nounwind uwtable
+define dso_local void @Copy256(i8* nocapture readonly %src, i8* nocapture %dst) local_unnamed_addr #0 {
+entry:
+; CHECK: Copy256
+; CHECK-NOT: vmovups ymm0
+; CHECK: vmovups %xmm1
+  call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 32, i1 false)
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1 immarg) #1
+
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-cpu"="skylake" "target-features"="+adx,+aes,+avx,+avx2,+bmi,+bmi2,+clflushopt,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2058,18 +2058,17 @@
 /// source is constant so it does not need to be loaded.
 /// It returns EVT::Other if the type should be determined using generic
 /// target-independent logic.
-EVT
-X86TargetLowering::getOptimalMemOpType(
+EVT X86TargetLowering::getOptimalMemOpType(
     uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
     bool ZeroMemset, bool MemcpyStrSrc,
     const AttributeList &FuncAttributes) const {
   if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
-    if (Size >= 16 &&
-        (!Subtarget.isUnalignedMem16Slow() ||
-         ((DstAlign == 0 || DstAlign >= 16) &&
-          (SrcAlign == 0 || SrcAlign >= 16)))) {
+    if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
+                       ((DstAlign == 0 || DstAlign >= 16) &&
+                        (SrcAlign == 0 || SrcAlign >= 16)))) {
       // FIXME: Check if unaligned 32-byte accesses are slow.
-      if (Size >= 32 && Subtarget.hasAVX()) {
+      if (Size >= 32 && Subtarget.hasAVX() &&
+          (Subtarget.getPreferVectorWidth() >= 256)) {
         // Although this isn't a well-supported type for AVX1, we'll let
         // legalization and shuffle lowering produce the optimal codegen. If we
         // choose an optimal type with a vector element larger than a byte,
@@ -2077,11 +2076,12 @@
         // multiply) before we splat as a vector.
         return MVT::v32i8;
       }
-      if (Subtarget.hasSSE2())
+      if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
         return MVT::v16i8;
       // TODO: Can SSE1 handle a byte vector?
       // If we have SSE1 registers we should be able to use them.
-      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()))
+      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
+          (Subtarget.getPreferVectorWidth() >= 128))
         return MVT::v4f32;
     } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
                !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
@@ -4964,6 +4964,8 @@
     unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
     return (MemVT.getSizeInBits() <= MaxIntSize);
   }
+  if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
+    return false;
   return true;
 }
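
The second hunk (the one next to the MaxIntSize check, which from the context lines appears to be canMergeStoresTo) applies the same cap to the DAG combiner's store merging, which is what the patch title refers to. A minimal sketch of that gate, again with illustrative stand-ins rather than the real LLVM signature:

  #include <cstdio>

  // Toy model of the store-merging gate: refuse to merge stores into a
  // memory type wider (in bits) than the preferred vector width.
  static bool canMergeStoresTo(unsigned MemVTBits, unsigned PreferVectorWidth) {
    if (MemVTBits > PreferVectorWidth)
      return false;
    return true;
  }

  int main() {
    printf("%d\n", canMergeStoresTo(256, 128)); // 0: no 256-bit merged store
    printf("%d\n", canMergeStoresTo(128, 128)); // 1: 128-bit is still fine
    return 0;
  }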
 

