[llvm] 35b35a3 - [X86] Prevent shuffle combining from creating an identical X86ISD::SHUF128.

Fri Sep 4 14:13:32 PDT 2020

Author: Craig Topper
Date: 2020-09-04T14:12:49-07:00
New Revision: 35b35a373d013df8e80c0c9840c085aa6a79c4dc

URL: https://github.com/llvm/llvm-project/commit/35b35a373d013df8e80c0c9840c085aa6a79c4dc
DIFF: https://github.com/llvm/llvm-project/commit/35b35a373d013df8e80c0c9840c085aa6a79c4dc.diff

LOG: [X86] Prevent shuffle combining from creating an identical X86ISD::SHUF128.

This can cause an infinite loop if SimplifyDemandedVectorElts asks
for the node to be replaced with an identical copy of itself.

A similar protection exists in other places in shuffle combining.

Fixes ISPC issue https://github.com/ispc/ispc/issues/1864

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 517e6c093180..1212585b4baf 100644

--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34909,6 +34909,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
         (Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2));
 
     if (!isAnyZero(Mask) && !PreferPERMQ) {
+      if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
+        return SDValue(); // Nothing to do!
       if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG))
         return DAG.getBitcast(RootVT, V);
     }

diff  --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
index e9f4aa99f148..4fce1a38a754 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -764,3 +764,47 @@ define <16 x float> @mask_shuffle_v4f32_v16f32_00_01_02_03_00_01_02_03_00_01_02_
   %res = shufflevector <4 x float> %a, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   ret <16 x float> %res
 }
+
+%struct.foo = type { [4 x double], [3 x [4 x double]], [4 x double] }
+
+; This test previously hung in shuffle combining. https://github.com/ispc/ispc/issues/1864
+define void @ispc_1864(<16 x float>* %arg) {
+; ALL-LABEL: ispc_1864:
+; ALL:       # %bb.0: # %bb
+; ALL-NEXT:    pushq %rbp
+; ALL-NEXT:    .cfi_def_cfa_offset 16
+; ALL-NEXT:    .cfi_offset %rbp, -16
+; ALL-NEXT:    movq %rsp, %rbp
+; ALL-NEXT:    .cfi_def_cfa_register %rbp
+; ALL-NEXT:    andq $-64, %rsp
+; ALL-NEXT:    subq $4864, %rsp # imm = 0x1300
+; ALL-NEXT:    vbroadcastss {{.*#+}} ymm0 = [-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0]
+; ALL-NEXT:    vmulps 32(%rdi), %ymm0, %ymm0
+; ALL-NEXT:    vcvtps2pd %ymm0, %zmm0
+; ALL-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,4,5,0,1,0,1]
+; ALL-NEXT:    vmovapd %ymm0, {{[0-9]+}}(%rsp)
+; ALL-NEXT:    movq %rbp, %rsp
+; ALL-NEXT:    popq %rbp
+; ALL-NEXT:    .cfi_def_cfa %rsp, 8
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+bb:
+  %tmp = alloca [30 x %struct.foo], align 64
+  %tmp1 = load <16 x float>, <16 x float>* %arg, align 4
+  %tmp2 = fmul <16 x float> %tmp1, <float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00>
+  %tmp3 = fpext <16 x float> %tmp2 to <16 x double>
+  %tmp4 = getelementptr inbounds [30 x %struct.foo], [30 x %struct.foo]* %tmp, i64 0, i64 3, i32 2, i64 0
+  %tmp5 = extractelement <16 x double> %tmp3, i32 10
+  store double %tmp5, double* %tmp4, align 32
+  %tmp6 = getelementptr inbounds [30 x %struct.foo], [30 x %struct.foo]* %tmp, i64 0, i64 3, i32 2, i64 1
+  %tmp7 = extractelement <16 x double> %tmp3, i32 11
+  store double %tmp7, double* %tmp6, align 8
+  %tmp8 = getelementptr inbounds [30 x %struct.foo], [30 x %struct.foo]* %tmp, i64 0, i64 3, i32 2, i64 2
+  %tmp9 = extractelement <16 x double> %tmp3, i32 12
+  store double %tmp9, double* %tmp8, align 16
+  %tmp10 = getelementptr inbounds [30 x %struct.foo], [30 x %struct.foo]* %tmp, i64 0, i64 3, i32 2, i64 3
+  %tmp11 = extractelement <16 x double> %tmp3, i32 13
+  store double %tmp11, double* %tmp10, align 8
+  ret void
+}
+