[llvm] [X86][Combine] Ensure single use chain in extract-load combine (PR #136520)

Evgenii Kudriashov via llvm-commits llvm-commits@lists.llvm.org
Sun Apr 20 17:55:33 PDT 2025


https://github.com/e-kud created https://github.com/llvm/llvm-project/pull/136520

The problem is that `SrcBC = peekThroughBitcasts(Src)` does not ensure a single-use chain: an intermediate cast may have multiple users, so instead of replacing the original load we introduce a new one alongside it.
This is made worse by the fact that we also replace the chain token of the original load, so its correct memory ordering is no longer guaranteed.
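
For context, the two DAG helpers differ in whether they are willing to walk through a bitcast that has other users. Roughly (a simplified sketch, not the verbatim code; see llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp for the exact definitions):

    // Walks through every bitcast, even one that has additional users
    // which keep the original load alive after the combine.
    SDValue peekThroughBitcasts(SDValue V) {
      while (V.getOpcode() == ISD::BITCAST)
        V = V.getOperand(0);
      return V;
    }

    // Walks through a bitcast only while it has a single use, so every
    // cast on the walked path dies once the extract is rewritten.
    SDValue peekThroughOneUseBitcasts(SDValue V) {
      while (V.getOpcode() == ISD::BITCAST && V.hasOneUse())
        V = V.getOperand(0);
      return V;
    }

The existing `SrcBC.hasOneUse()` guard only constrains the innermost node, the VBROADCAST_LOAD itself; its sole user may be a cast that in turn has several users. In that case rewriting the extract leaves the broadcast load alive for the remaining users while its chain token has already been handed to the new scalar load. That is exactly the reordering hazard the test below exhibits: in the buggy X64-AVX output, the vmovddup reload of the extracted value is scheduled after the vmovlps store that clobbers those bytes.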

From b605aab2700d1f657f09e898e77a13251e238596 Mon Sep 17 00:00:00 2001
From: Evgenii Kudriashov <evgenii.kudriashov@intel.com>
Date: Sun, 20 Apr 2025 17:32:38 -0700
Subject: [PATCH 1/2] Test precommit

---
 llvm/test/CodeGen/X86/extractelement-load.ll | 67 ++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
index 022b25a241533..dfbf9f3b19fc2 100644
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -528,3 +528,70 @@ define i32 @main() nounwind {
   %r = add i32 %e1, %e2
   ret i32 %r
 }
+
+; A test for an incorrect combine of a single-value extraction from VBROADCAST_LOAD.
+; The wrong combine makes the second call (%t8) use the result stored by the
+; preceding instructions instead of %t4.
+declare <2 x float> @ccosf(<2 x float>)
+define dso_local <2 x float> @multiuse_of_single_value_from_vbroadcast_load(ptr %p, ptr %arr) nounwind {
+; X86-SSE2-LABEL: multiuse_of_single_value_from_vbroadcast_load:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    pushl %esi
+; X86-SSE2-NEXT:    subl $16, %esp
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE2-NEXT:    movups 24(%esi), %xmm0
+; X86-SSE2-NEXT:    movups %xmm0, (%esp) # 16-byte Spill
+; X86-SSE2-NEXT:    movhps %xmm0, (%eax)
+; X86-SSE2-NEXT:    movaps 32(%esi), %xmm0
+; X86-SSE2-NEXT:    calll ccosf@PLT
+; X86-SSE2-NEXT:    movlps %xmm0, 32(%esi)
+; X86-SSE2-NEXT:    movups (%esp), %xmm0 # 16-byte Reload
+; X86-SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X86-SSE2-NEXT:    calll ccosf@PLT
+; X86-SSE2-NEXT:    addl $16, %esp
+; X86-SSE2-NEXT:    popl %esi
+; X86-SSE2-NEXT:    retl
+;
+; X64-SSSE3-LABEL: multiuse_of_single_value_from_vbroadcast_load:
+; X64-SSSE3:       # %bb.0:
+; X64-SSSE3-NEXT:    pushq %rbx
+; X64-SSSE3-NEXT:    subq $16, %rsp
+; X64-SSSE3-NEXT:    movq %rsi, %rbx
+; X64-SSSE3-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
+; X64-SSSE3-NEXT:    movapd %xmm0, (%rsp) # 16-byte Spill
+; X64-SSSE3-NEXT:    movlpd %xmm0, (%rdi)
+; X64-SSSE3-NEXT:    movaps 32(%rsi), %xmm0
+; X64-SSSE3-NEXT:    callq ccosf@PLT
+; X64-SSSE3-NEXT:    movlps %xmm0, 32(%rbx)
+; X64-SSSE3-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; X64-SSSE3-NEXT:    callq ccosf@PLT
+; X64-SSSE3-NEXT:    addq $16, %rsp
+; X64-SSSE3-NEXT:    popq %rbx
+; X64-SSSE3-NEXT:    retq
+;
+; X64-AVX-LABEL: multiuse_of_single_value_from_vbroadcast_load:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    pushq %rbx
+; X64-AVX-NEXT:    movq %rsi, %rbx
+; X64-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-AVX-NEXT:    vmovsd %xmm0, (%rdi)
+; X64-AVX-NEXT:    vmovaps 32(%rsi), %xmm0
+; X64-AVX-NEXT:    callq ccosf@PLT
+; X64-AVX-NEXT:    vmovlps %xmm0, 32(%rbx)
+; X64-AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X64-AVX-NEXT:    callq ccosf@PLT
+; X64-AVX-NEXT:    popq %rbx
+; X64-AVX-NEXT:    retq
+  %p1 = getelementptr [5 x <2 x float>], ptr %arr, i64 0, i64 3
+  %p2 = getelementptr inbounds [5 x <2 x float>], ptr %arr, i64 0, i64 4, i32 0
+  %t3 = load <4 x float>, ptr %p1, align 8
+  %t4 = shufflevector <4 x float> %t3, <4 x float> poison, <2 x i32> <i32 2, i32 3>
+  store <2 x float> %t4, ptr %p, align 16
+  %t5 = load <4 x float>, ptr %p2, align 32
+  %t6 = shufflevector <4 x float> %t5, <4 x float> poison, <2 x i32> <i32 0, i32 1>
+  %t7 = call <2 x float> @ccosf(<2 x float> %t6)
+  store <2 x float> %t7, ptr %p2, align 32
+  %t8 = call <2 x float> @ccosf(<2 x float> %t4)
+  ret <2 x float> %t8
+}

From 4d0a3ae7818b25f8f4f362b960912c5f0a2d3345 Mon Sep 17 00:00:00 2001
From: Evgenii Kudriashov <evgenii.kudriashov@intel.com>
Date: Sun, 20 Apr 2025 17:37:33 -0700
Subject: [PATCH 2/2] [X86][Combine] Ensure single use chain in extract-load
 combine

The problem is that `SrcBC = peekThroughBitcasts(Src)` does not ensure
a single-use chain: an intermediate cast may have multiple users, so
instead of replacing the original load we introduce a new one alongside
it. This is made worse by the fact that we also replace the chain token
of the original load, so its correct memory ordering is no longer
guaranteed.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp      | 3 ++-
 llvm/test/CodeGen/X86/extractelement-load.ll | 9 ++++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 024b653f78cb5..1faf55025db47 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -46258,7 +46258,8 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
 
   // If we're extracting a single element from a broadcast load and there are
   // no other users, just create a single load.
-  if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
+  if (peekThroughOneUseBitcasts(Src).getOpcode() == X86ISD::VBROADCAST_LOAD &&
+      SrcBC.hasOneUse()) {
     auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
     unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
     if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
index dfbf9f3b19fc2..ce68eebd5b752 100644
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -573,14 +573,17 @@ define dso_local <2 x float> @multiuse_of_single_value_from_vbroadcast_load(ptr
 ; X64-AVX-LABEL: multiuse_of_single_value_from_vbroadcast_load:
 ; X64-AVX:       # %bb.0:
 ; X64-AVX-NEXT:    pushq %rbx
+; X64-AVX-NEXT:    subq $16, %rsp
 ; X64-AVX-NEXT:    movq %rsi, %rbx
-; X64-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X64-AVX-NEXT:    vmovsd %xmm0, (%rdi)
+; X64-AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X64-AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; X64-AVX-NEXT:    vmovlps %xmm0, (%rdi)
 ; X64-AVX-NEXT:    vmovaps 32(%rsi), %xmm0
 ; X64-AVX-NEXT:    callq ccosf@PLT
 ; X64-AVX-NEXT:    vmovlps %xmm0, 32(%rbx)
-; X64-AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X64-AVX-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
 ; X64-AVX-NEXT:    callq ccosf@PLT
+; X64-AVX-NEXT:    addq $16, %rsp
 ; X64-AVX-NEXT:    popq %rbx
 ; X64-AVX-NEXT:    retq
   %p1 = getelementptr [5 x <2 x float>], ptr %arr, i64 0, i64 3


