[llvm] 61a2d6b - [DAG] foldShuffleOfConcatUndefs - ensure shuffles of upper (undef) subvector elements is undef (PR50609)

Tue Jun 8 07:52:03 PDT 2021

Author: Simon Pilgrim
Date: 2021-06-08T15:49:41+01:00
New Revision: 61a2d6bfe48cf3da4b95d1383bf866690287f8e8

URL: https://github.com/llvm/llvm-project/commit/61a2d6bfe48cf3da4b95d1383bf866690287f8e8
DIFF: https://github.com/llvm/llvm-project/commit/61a2d6bfe48cf3da4b95d1383bf866690287f8e8.diff

LOG: [DAG] foldShuffleOfConcatUndefs - ensure shuffles of upper (undef) subvector elements is undef (PR50609)

shuffle(concat(x,undef),concat(y,undef)) -> concat(shuffle(x,y),shuffle(x,y))

If the original shuffle references any of the upper (undef) subvector elements, ensure the split shuffle masks use undef instead of an out-of-bounds value.

Fixes PR50609

Added: 
    llvm/test/CodeGen/X86/pr50609.ll

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 52de05117b12..22bd89990b1d 100644

--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20427,6 +20427,9 @@ static SDValue foldShuffleOfConcatUndefs(ShuffleVectorSDNode *Shuf,
   for (unsigned i = 0; i != NumElts; ++i) {
     if (Mask[i] == -1)
       continue;
+    // If we reference the upper (undef) subvector then the element is undef.
+    if ((Mask[i] % NumElts) >= HalfNumElts)
+      continue;
     int M = Mask[i] < (int)NumElts ? Mask[i] : Mask[i] - (int)HalfNumElts;
     if (i < HalfNumElts)
       Mask0[i] = M;

diff  --git a/llvm/test/CodeGen/X86/pr50609.ll b/llvm/test/CodeGen/X86/pr50609.ll
new file mode 100644
index 000000000000..44e004825e7c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr50609.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s
+
+define void @PR50609(float* noalias nocapture %RET, float* noalias %aFOO, <16 x i32> %__mask) nounwind {
+; CHECK-LABEL: PR50609:
+; CHECK:       # %bb.0: # %allocas
+; CHECK-NEXT:    leal 40(%rsi), %eax
+; CHECK-NEXT:    vmovq %rsi, %xmm2
+; CHECK-NEXT:    vmovd %eax, %xmm3
+; CHECK-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
+; CHECK-NEXT:    vpsrad $31, %xmm2, %xmm3
+; CHECK-NEXT:    vpsrld $30, %xmm3, %xmm3
+; CHECK-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; CHECK-NEXT:    vpsrad $2, %xmm2, %xmm2
+; CHECK-NEXT:    vcvtdq2ps %ymm2, %ymm2
+; CHECK-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; CHECK-NEXT:    vmaskmovps %ymm2, %ymm0, (%rdi)
+; CHECK-NEXT:    vmaskmovps %ymm2, %ymm1, 32(%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+allocas:
+  %aFOO_load_ptr2int = ptrtoint float* %aFOO to i64
+  %aFOO_load_ptr2int_broadcast = insertelement <16 x i64> undef, i64 %aFOO_load_ptr2int, i32 0
+  %aFOO_load4_offset = getelementptr float, float* %aFOO, i64 10
+  %c_load_ptr2int = ptrtoint float* %aFOO_load4_offset to i64
+  %c_load_ptr2int_broadcast = insertelement <16 x i64> undef, i64 %c_load_ptr2int, i32 0
+  %0 = sub <16 x i64> %c_load_ptr2int_broadcast, %aFOO_load_ptr2int_broadcast
+  %1 = trunc <16 x i64> %0 to <16 x i32>
+  %2 = sdiv <16 x i32> %1, <i32 4, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %3 = sitofp <16 x i32> %2 to <16 x float>
+  %ptr.i.i = bitcast float* %RET to i8*
+  %val0.i.i = shufflevector <16 x float> %3, <16 x float> undef, <8 x i32> zeroinitializer
+  %mask0.i.i = shufflevector <16 x i32> %__mask, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %mask1.i.i = shufflevector <16 x i32> %__mask, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  call void @llvm.x86.avx.maskstore.ps.256(i8* %ptr.i.i, <8 x i32> %mask0.i.i, <8 x float> %val0.i.i) #1
+  %ptr1.i.i16 = getelementptr float, float* %RET, i64 8
+  %ptr1.i.i = bitcast float* %ptr1.i.i16 to i8*
+  call void @llvm.x86.avx.maskstore.ps.256(i8* %ptr1.i.i, <8 x i32> %mask1.i.i, <8 x float> %val0.i.i) #1
+  ret void
+}
+declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>)