[llvm] r314988 - [X86] Fix chains update when lowering BUILD_VECTOR to a vector load

Thu Oct 5 09:28:22 PDT 2017

Author: apilipenko
Date: Thu Oct  5 09:28:21 2017
New Revision: 314988

URL: http://llvm.org/viewvc/llvm-project?rev=314988&view=rev
Log:
[X86] Fix chains update when lowering BUILD_VECTOR to a vector load

The code that lowers a BUILD_VECTOR of consecutive loads into a single vector
load doesn't update the chains properly. As a result, the vector load can be
reordered with a store to the same location.

The current code in EltsFromConsecutiveLoads only updates the chain following
the first load. The fix is to update the chains following all of the loads
comprising the vector, by calling makeEquivalentMemoryOrdering for each of
them rather than for the base load alone.

This is a fix for PR10114.

Reviewed By: niravd

Differential Revision: https://reviews.llvm.org/D38547

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=314988&r1=314987&r2=314988&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Oct  5 09:28:21 2017
@@ -6540,14 +6540,20 @@ static SDValue EltsFromConsecutiveLoads(
     }
   }
 
-  auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
+  SmallVector<LoadSDNode *, 8> Loads;
+  for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
+    if (LoadMask[i])
+      Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));
+
+  auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
     auto MMOFlags = LDBase->getMemOperand()->getFlags();
     assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
            "Cannot merge volatile loads.");
     SDValue NewLd =
         DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                     LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
-    DAG.makeEquivalentMemoryOrdering(LDBase, NewLd);
+    for (auto *LD : Loads)
+      DAG.makeEquivalentMemoryOrdering(LD, NewLd);
     return NewLd;
   };
 
@@ -6612,7 +6618,8 @@ static SDValue EltsFromConsecutiveLoads(
                                   LDBase->getAlignment(),
                                   false/*isVolatile*/, true/*ReadMem*/,
                                   false/*WriteMem*/);
-      DAG.makeEquivalentMemoryOrdering(LDBase, ResNode);
+      for (auto *LD : Loads)
+        DAG.makeEquivalentMemoryOrdering(LD, ResNode);
       return DAG.getBitcast(VT, ResNode);
     }
   }

Modified: llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll?rev=314988&r1=314987&r2=314988&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll Thu Oct  5 09:28:21 2017
@@ -409,6 +409,124 @@ define <4 x i32> @merge_4i32_i32_23u5(i3
   ret <4 x i32> %res3
 }
 
+define <4 x i32> @merge_4i32_i32_23u5_inc2(i32* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4i32_i32_23u5_inc2:
+; SSE:       # BB#0:
+; SSE-NEXT:    movups 8(%rdi), %xmm0
+; SSE-NEXT:    incl 8(%rdi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: merge_4i32_i32_23u5_inc2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovups 8(%rdi), %xmm0
+; AVX-NEXT:    incl 8(%rdi)
+; AVX-NEXT:    retq
+;
+; X32-SSE1-LABEL: merge_4i32_i32_23u5_inc2:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    pushl %edi
+; X32-SSE1-NEXT:  .Lcfi6:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X32-SSE1-NEXT:    pushl %esi
+; X32-SSE1-NEXT:  .Lcfi7:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 12
+; X32-SSE1-NEXT:  .Lcfi8:
+; X32-SSE1-NEXT:    .cfi_offset %esi, -12
+; X32-SSE1-NEXT:  .Lcfi9:
+; X32-SSE1-NEXT:    .cfi_offset %edi, -8
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movl 8(%ecx), %edx
+; X32-SSE1-NEXT:    movl 12(%ecx), %esi
+; X32-SSE1-NEXT:    leal 1(%edx), %edi
+; X32-SSE1-NEXT:    movl %edi, 8(%ecx)
+; X32-SSE1-NEXT:    movl 20(%ecx), %ecx
+; X32-SSE1-NEXT:    movl %esi, 4(%eax)
+; X32-SSE1-NEXT:    movl %edx, (%eax)
+; X32-SSE1-NEXT:    movl %ecx, 12(%eax)
+; X32-SSE1-NEXT:    popl %esi
+; X32-SSE1-NEXT:    popl %edi
+; X32-SSE1-NEXT:    retl $4
+;
+; X32-SSE41-LABEL: merge_4i32_i32_23u5_inc2:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movups 8(%eax), %xmm0
+; X32-SSE41-NEXT:    incl 8(%eax)
+; X32-SSE41-NEXT:    retl
+  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
+  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
+  %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
+  %val0 = load i32, i32* %ptr0
+  %inc = add i32 %val0, 1
+  store i32 %inc, i32* %ptr0
+  %val1 = load i32, i32* %ptr1
+  %val3 = load i32, i32* %ptr3
+  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
+  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
+  %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
+  ret <4 x i32> %res3
+}
+
+define <4 x i32> @merge_4i32_i32_23u5_inc3(i32* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4i32_i32_23u5_inc3:
+; SSE:       # BB#0:
+; SSE-NEXT:    movups 8(%rdi), %xmm0
+; SSE-NEXT:    incl 12(%rdi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: merge_4i32_i32_23u5_inc3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovups 8(%rdi), %xmm0
+; AVX-NEXT:    incl 12(%rdi)
+; AVX-NEXT:    retq
+;
+; X32-SSE1-LABEL: merge_4i32_i32_23u5_inc3:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    pushl %edi
+; X32-SSE1-NEXT:  .Lcfi10:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X32-SSE1-NEXT:    pushl %esi
+; X32-SSE1-NEXT:  .Lcfi11:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 12
+; X32-SSE1-NEXT:  .Lcfi12:
+; X32-SSE1-NEXT:    .cfi_offset %esi, -12
+; X32-SSE1-NEXT:  .Lcfi13:
+; X32-SSE1-NEXT:    .cfi_offset %edi, -8
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movl 8(%ecx), %edx
+; X32-SSE1-NEXT:    movl 12(%ecx), %esi
+; X32-SSE1-NEXT:    leal 1(%esi), %edi
+; X32-SSE1-NEXT:    movl %edi, 12(%ecx)
+; X32-SSE1-NEXT:    movl 20(%ecx), %ecx
+; X32-SSE1-NEXT:    movl %esi, 4(%eax)
+; X32-SSE1-NEXT:    movl %edx, (%eax)
+; X32-SSE1-NEXT:    movl %ecx, 12(%eax)
+; X32-SSE1-NEXT:    popl %esi
+; X32-SSE1-NEXT:    popl %edi
+; X32-SSE1-NEXT:    retl $4
+;
+; X32-SSE41-LABEL: merge_4i32_i32_23u5_inc3:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movups 8(%eax), %xmm0
+; X32-SSE41-NEXT:    incl 12(%eax)
+; X32-SSE41-NEXT:    retl
+  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
+  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
+  %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
+  %val0 = load i32, i32* %ptr0
+  %val1 = load i32, i32* %ptr1
+  %inc = add i32 %val1, 1
+  store i32 %inc, i32* %ptr1
+  %val3 = load i32, i32* %ptr3
+  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
+  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
+  %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
+  ret <4 x i32> %res3
+}
+
 define <4 x i32> @merge_4i32_i32_3zuu(i32* %ptr) nounwind uwtable noinline ssp {
 ; SSE-LABEL: merge_4i32_i32_3zuu:
 ; SSE:       # BB#0:
@@ -513,6 +631,118 @@ define <4 x i32> @merge_4i32_i32_45zz(i3
   ret <4 x i32> %res1
 }
 
+define <4 x i32> @merge_4i32_i32_45zz_inc4(i32* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4i32_i32_45zz_inc4:
+; SSE:       # BB#0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    incl 16(%rdi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: merge_4i32_i32_45zz_inc4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    incl 16(%rdi)
+; AVX-NEXT:    retq
+;
+; X32-SSE1-LABEL: merge_4i32_i32_45zz_inc4:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    pushl %edi
+; X32-SSE1-NEXT:  .Lcfi14:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X32-SSE1-NEXT:    pushl %esi
+; X32-SSE1-NEXT:  .Lcfi15:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 12
+; X32-SSE1-NEXT:  .Lcfi16:
+; X32-SSE1-NEXT:    .cfi_offset %esi, -12
+; X32-SSE1-NEXT:  .Lcfi17:
+; X32-SSE1-NEXT:    .cfi_offset %edi, -8
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movl 16(%ecx), %edx
+; X32-SSE1-NEXT:    movl 20(%ecx), %esi
+; X32-SSE1-NEXT:    leal 1(%edx), %edi
+; X32-SSE1-NEXT:    movl %edi, 16(%ecx)
+; X32-SSE1-NEXT:    movl %esi, 4(%eax)
+; X32-SSE1-NEXT:    movl %edx, (%eax)
+; X32-SSE1-NEXT:    movl $0, 12(%eax)
+; X32-SSE1-NEXT:    movl $0, 8(%eax)
+; X32-SSE1-NEXT:    popl %esi
+; X32-SSE1-NEXT:    popl %edi
+; X32-SSE1-NEXT:    retl $4
+;
+; X32-SSE41-LABEL: merge_4i32_i32_45zz_inc4:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE41-NEXT:    incl 16(%eax)
+; X32-SSE41-NEXT:    retl
+  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
+  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
+  %val0 = load i32, i32* %ptr0
+  %inc = add i32 %val0, 1
+  store i32 %inc, i32* %ptr0
+  %val1 = load i32, i32* %ptr1
+  %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
+  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
+  ret <4 x i32> %res1
+}
+
+define <4 x i32> @merge_4i32_i32_45zz_inc5(i32* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4i32_i32_45zz_inc5:
+; SSE:       # BB#0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    incl 20(%rdi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: merge_4i32_i32_45zz_inc5:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    incl 20(%rdi)
+; AVX-NEXT:    retq
+;
+; X32-SSE1-LABEL: merge_4i32_i32_45zz_inc5:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    pushl %edi
+; X32-SSE1-NEXT:  .Lcfi18:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X32-SSE1-NEXT:    pushl %esi
+; X32-SSE1-NEXT:  .Lcfi19:
+; X32-SSE1-NEXT:    .cfi_def_cfa_offset 12
+; X32-SSE1-NEXT:  .Lcfi20:
+; X32-SSE1-NEXT:    .cfi_offset %esi, -12
+; X32-SSE1-NEXT:  .Lcfi21:
+; X32-SSE1-NEXT:    .cfi_offset %edi, -8
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movl 16(%ecx), %edx
+; X32-SSE1-NEXT:    movl 20(%ecx), %esi
+; X32-SSE1-NEXT:    leal 1(%esi), %edi
+; X32-SSE1-NEXT:    movl %edi, 20(%ecx)
+; X32-SSE1-NEXT:    movl %esi, 4(%eax)
+; X32-SSE1-NEXT:    movl %edx, (%eax)
+; X32-SSE1-NEXT:    movl $0, 12(%eax)
+; X32-SSE1-NEXT:    movl $0, 8(%eax)
+; X32-SSE1-NEXT:    popl %esi
+; X32-SSE1-NEXT:    popl %edi
+; X32-SSE1-NEXT:    retl $4
+;
+; X32-SSE41-LABEL: merge_4i32_i32_45zz_inc5:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE41-NEXT:    incl 20(%eax)
+; X32-SSE41-NEXT:    retl
+  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
+  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
+  %val0 = load i32, i32* %ptr0
+  %val1 = load i32, i32* %ptr1
+  %inc = add i32 %val1, 1
+  store i32 %inc, i32* %ptr1
+  %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
+  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
+  ret <4 x i32> %res1
+}
+
 define <8 x i16> @merge_8i16_i16_23u567u9(i16* %ptr) nounwind uwtable noinline ssp {
 ; SSE-LABEL: merge_8i16_i16_23u567u9:
 ; SSE:       # BB#0:
@@ -527,14 +757,14 @@ define <8 x i16> @merge_8i16_i16_23u567u
 ; X32-SSE1-LABEL: merge_8i16_i16_23u567u9:
 ; X32-SSE1:       # BB#0:
 ; X32-SSE1-NEXT:    pushl %edi
-; X32-SSE1-NEXT:  .Lcfi6:
+; X32-SSE1-NEXT:  .Lcfi22:
 ; X32-SSE1-NEXT:    .cfi_def_cfa_offset 8
 ; X32-SSE1-NEXT:    pushl %esi
-; X32-SSE1-NEXT:  .Lcfi7:
+; X32-SSE1-NEXT:  .Lcfi23:
 ; X32-SSE1-NEXT:    .cfi_def_cfa_offset 12
-; X32-SSE1-NEXT:  .Lcfi8:
+; X32-SSE1-NEXT:  .Lcfi24:
 ; X32-SSE1-NEXT:    .cfi_offset %esi, -12
-; X32-SSE1-NEXT:  .Lcfi9:
+; X32-SSE1-NEXT:  .Lcfi25:
 ; X32-SSE1-NEXT:    .cfi_offset %edi, -8
 ; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -667,24 +897,24 @@ define <16 x i8> @merge_16i8_i8_01u34567
 ; X32-SSE1-LABEL: merge_16i8_i8_01u3456789ABCDuF:
 ; X32-SSE1:       # BB#0:
 ; X32-SSE1-NEXT:    pushl %ebp
-; X32-SSE1-NEXT:  .Lcfi10:
+; X32-SSE1-NEXT:  .Lcfi26:
 ; X32-SSE1-NEXT:    .cfi_def_cfa_offset 8
 ; X32-SSE1-NEXT:    pushl %ebx
-; X32-SSE1-NEXT:  .Lcfi11:
+; X32-SSE1-NEXT:  .Lcfi27:
 ; X32-SSE1-NEXT:    .cfi_def_cfa_offset 12
 ; X32-SSE1-NEXT:    pushl %edi
-; X32-SSE1-NEXT:  .Lcfi12:
+; X32-SSE1-NEXT:  .Lcfi28:
 ; X32-SSE1-NEXT:    .cfi_def_cfa_offset 16
 ; X32-SSE1-NEXT:    pushl %esi
-; X32-SSE1-NEXT:  .Lcfi13:
+; X32-SSE1-NEXT:  .Lcfi29:
 ; X32-SSE1-NEXT:    .cfi_def_cfa_offset 20
-; X32-SSE1-NEXT:  .Lcfi14:
+; X32-SSE1-NEXT:  .Lcfi30:
 ; X32-SSE1-NEXT:    .cfi_offset %esi, -20
-; X32-SSE1-NEXT:  .Lcfi15:
+; X32-SSE1-NEXT:  .Lcfi31:
 ; X32-SSE1-NEXT:    .cfi_offset %edi, -16
-; X32-SSE1-NEXT:  .Lcfi16:
+; X32-SSE1-NEXT:  .Lcfi32:
 ; X32-SSE1-NEXT:    .cfi_offset %ebx, -12
-; X32-SSE1-NEXT:  .Lcfi17:
+; X32-SSE1-NEXT:  .Lcfi33:
 ; X32-SSE1-NEXT:    .cfi_offset %ebp, -8
 ; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -917,14 +1147,14 @@ define <2 x i64> @merge_2i64_i64_12_vola
 ; X32-SSE1-LABEL: merge_2i64_i64_12_volatile:
 ; X32-SSE1:       # BB#0:
 ; X32-SSE1-NEXT:    pushl %edi
-; X32-SSE1-NEXT:  .Lcfi18:
+; X32-SSE1-NEXT:  .Lcfi34:
 ; X32-SSE1-NEXT:    .cfi_def_cfa_offset 8
 ; X32-SSE1-NEXT:    pushl %esi
-; X32-SSE1-NEXT:  .Lcfi19:
+; X32-SSE1-NEXT:  .Lcfi35:
 ; X32-SSE1-NEXT:    .cfi_def_cfa_offset 12
-; X32-SSE1-NEXT:  .Lcfi20:
+; X32-SSE1-NEXT:  .Lcfi36:
 ; X32-SSE1-NEXT:    .cfi_offset %esi, -12
-; X32-SSE1-NEXT:  .Lcfi21:
+; X32-SSE1-NEXT:  .Lcfi37:
 ; X32-SSE1-NEXT:    .cfi_offset %edi, -8
 ; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx