[llvm] [X86] Avoid returning the same shuffle operation for broadcast (PR #70592)

Sun Oct 29 05:21:10 PDT 2023

llvmbot wrote:




@llvm/pr-subscribers-backend-x86

Author: Phoebe Wang (phoebewang)

<details>
<summary>Changes</summary>

This fixes a crash that has occurred since aab8b2eb080d, which generates a new pattern:
```
      t35: v8i32 = xor t11, t14
    t36: v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t35, undef:v8i32
```

The pattern exposed a bug introduced in f885c08034, which breaks element widening but doesn't handle the broadcast case.

This patch only fixes the crash. I observed a performance regression caused by the above patches in the test, which may need further investigation.

---
Full diff: https://github.com/llvm/llvm-project/pull/70592.diff


2 Files Affected:

- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+6) 
- (added) llvm/test/CodeGen/X86/shuffle-combine-crash-5.ll (+30) 


``````````diff

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6411f27da0776d4..18f6a695e4502e9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -15293,6 +15293,12 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
       for (int i = 0; i != NumElts; i += NumBroadcastElts)
         for (int j = 0; j != NumBroadcastElts; ++j)
           BroadcastMask[i + j] = j;
+
+      // Avoid returning the same shuffle operation. For example,
+      // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
+      if (BroadcastMask == Mask)
+        return SDValue();
+
       return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
                                   BroadcastMask);
     }
diff --git a/llvm/test/CodeGen/X86/shuffle-combine-crash-5.ll b/llvm/test/CodeGen/X86/shuffle-combine-crash-5.ll
new file mode 100644
index 000000000000000..f012c05a095731e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/shuffle-combine-crash-5.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,avx512vl | FileCheck %s
+
+define i1 @test(ptr %q) {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpbroadcastq %xmm0, %ymm0
+; CHECK-NEXT:    vptest %ymm0, %ymm0
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %0 = load i64, ptr %q, align 8
+  %add = add nsw i64 %0, 0
+  %add2 = add nsw i64 %add, 0
+  %add5 = add nsw i64 %add2, 0
+  %vecinit1.i.i68 = insertelement <2 x i64> poison, i64 %add5, i64 0
+  %add8 = add nsw i64 %add5, 0
+  %vecinit.i.i55 = insertelement <4 x i64> undef, i64 %add8, i64 0
+  %1 = bitcast <2 x i64> %vecinit1.i.i68 to <4 x i32>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+  %3 = bitcast <4 x i64> %vecinit.i.i55 to <8 x i32>
+  %4 = shufflevector <8 x i32> %3, <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+  %5 = icmp ne <8 x i32> %2, %4
+  %6 = bitcast <8 x i1> %5 to i8
+  %7 = icmp eq i8 %6, 0
+  ret i1 %7
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/70592