[llvm] [DAGCombiner] Improve chain handling in fold (fshl ld1, ld0, c) -> (ld0[ofs]) combine. (PR #124871)

Wed Jan 29 20:27:29 PST 2025

https://github.com/topperc updated https://github.com/llvm/llvm-project/pull/124871

>From 1a4087f1af171c226ac52c72b3a02002cd67c015 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Wed, 29 Jan 2025 20:20:31 -0800
Subject: [PATCH 1/2] [X86] Add test case for #124871.

---
 llvm/test/CodeGen/X86/fshl.ll | 54 +++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll
index 065b396e82ec31..b56f8002b41301 100644
--- a/llvm/test/CodeGen/X86/fshl.ll
+++ b/llvm/test/CodeGen/X86/fshl.ll
@@ -657,6 +657,60 @@ define i64 @combine_fshl_load_i64(ptr %p) nounwind {
   ret i64 %res
 }
 
+define i16 @combine_fshl_load_i16_chain_use(ptr %p, ptr %q, i16 %r) nounwind {
+; X86-FAST-LABEL: combine_fshl_load_i16_chain_use:
+; X86-FAST:       # %bb.0:
+; X86-FAST-NEXT:    pushl %esi
+; X86-FAST-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    movzwl (%eax), %esi
+; X86-FAST-NEXT:    movzwl 2(%eax), %eax
+; X86-FAST-NEXT:    shldw $8, %si, %ax
+; X86-FAST-NEXT:    movw %cx, (%edx)
+; X86-FAST-NEXT:    popl %esi
+; X86-FAST-NEXT:    retl
+;
+; X86-SLOW-LABEL: combine_fshl_load_i16_chain_use:
+; X86-SLOW:       # %bb.0:
+; X86-SLOW-NEXT:    pushl %esi
+; X86-SLOW-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SLOW-NEXT:    movzwl 2(%esi), %eax
+; X86-SLOW-NEXT:    movzbl 1(%esi), %esi
+; X86-SLOW-NEXT:    shll $8, %eax
+; X86-SLOW-NEXT:    orl %esi, %eax
+; X86-SLOW-NEXT:    movw %cx, (%edx)
+; X86-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-SLOW-NEXT:    popl %esi
+; X86-SLOW-NEXT:    retl
+;
+; X64-FAST-LABEL: combine_fshl_load_i16_chain_use:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    movzwl (%rdi), %ecx
+; X64-FAST-NEXT:    movzwl 2(%rdi), %eax
+; X64-FAST-NEXT:    shldw $8, %cx, %ax
+; X64-FAST-NEXT:    movw %dx, (%rsi)
+; X64-FAST-NEXT:    retq
+;
+; X64-SLOW-LABEL: combine_fshl_load_i16_chain_use:
+; X64-SLOW:       # %bb.0:
+; X64-SLOW-NEXT:    movzwl 2(%rdi), %eax
+; X64-SLOW-NEXT:    movzbl 1(%rdi), %ecx
+; X64-SLOW-NEXT:    shll $8, %eax
+; X64-SLOW-NEXT:    orl %ecx, %eax
+; X64-SLOW-NEXT:    movw %dx, (%rsi)
+; X64-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-SLOW-NEXT:    retq
+  %p1 = getelementptr i16, ptr %p, i32 1
+  %ld0 = load i16, ptr %p
+  %ld1 = load i16, ptr %p1
+  %res = call i16 @llvm.fshl.i16(i16 %ld1, i16 %ld0, i16 8)
+  store i16 %r, ptr %q
+  ret i16 %res
+}
+
 !llvm.module.flags = !{!0}
 !0 = !{i32 1, !"ProfileSummary", !1}
 !1 = !{!2, !3, !4, !5, !6, !7, !8, !9}

>From 4ff8dc503dbb8498c34192eb22d554e54bd39d97 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Tue, 28 Jan 2025 17:37:11 -0800
Subject: [PATCH 2/2] [DAGCombiner] Improve chain handling in fold (fshl ld1,
 ld0, c) -> (ld0[ofs]) combine.

Happened to notice some odd things related to chains in this code.

The code calls hasOneUse on LoadSDNode* which will check users
of the data and the chain. I think this was trying to check that
the data had one use so one of the loads would definitely be
removed by the transform. Load chains don't always have users so
our testing may not have noticed that the chains being used would
block the transform.

The code makes all users of ld1's chain use the new load's chain, but
we don't know that ld1 becomes dead. This can cause incorrect dependencies if
ld1's chain is used and it isn't deleted. I think the better thing to do
is use makeEquivalentMemoryOrdering to make all users of ld0 and ld1
depend on the new load and the original loads. If the olds loads become
dead, their chain will be cleaned up later.

I'm having trouble getting a test for any ordering issue.
areNonVolatileConsecutiveLoads requires the two loads to have the same
input chain. Given that I don't know how to use one of the load chain
results without also using the other. If they are both used we don't
do the transform because hasOneUse will return false for both.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  9 ++-
 llvm/test/CodeGen/X86/fshl.ll                 | 57 +++++--------------
 2 files changed, 17 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a0c703d2df8a2f..925da2f0ff2732 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11005,8 +11005,8 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
       auto *RHS = dyn_cast<LoadSDNode>(N1);
       if (LHS && RHS && LHS->isSimple() && RHS->isSimple() &&
           LHS->getAddressSpace() == RHS->getAddressSpace() &&
-          (LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS) &&
-          ISD::isNON_EXTLoad(LHS)) {
+          (LHS->hasNUsesOfValue(1, 0) || RHS->hasNUsesOfValue(1, 0)) &&
+          ISD::isNON_EXTLoad(RHS) && ISD::isNON_EXTLoad(LHS)) {
         if (DAG.areNonVolatileConsecutiveLoads(LHS, RHS, BitWidth / 8, 1)) {
           SDLoc DL(RHS);
           uint64_t PtrOff =
@@ -11024,9 +11024,8 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
                 VT, DL, RHS->getChain(), NewPtr,
                 RHS->getPointerInfo().getWithOffset(PtrOff), NewAlign,
                 RHS->getMemOperand()->getFlags(), RHS->getAAInfo());
-            // Replace the old load's chain with the new load's chain.
-            WorklistRemover DeadNodes(*this);
-            DAG.ReplaceAllUsesOfValueWith(N1.getValue(1), Load.getValue(1));
+            DAG.makeEquivalentMemoryOrdering(LHS, Load.getValue(1));
+            DAG.makeEquivalentMemoryOrdering(RHS, Load.getValue(1));
             return Load;
           }
         }
diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll
index b56f8002b41301..e8c8ccfa8d37f9 100644
--- a/llvm/test/CodeGen/X86/fshl.ll
+++ b/llvm/test/CodeGen/X86/fshl.ll
@@ -658,51 +658,20 @@ define i64 @combine_fshl_load_i64(ptr %p) nounwind {
 }
 
 define i16 @combine_fshl_load_i16_chain_use(ptr %p, ptr %q, i16 %r) nounwind {
-; X86-FAST-LABEL: combine_fshl_load_i16_chain_use:
-; X86-FAST:       # %bb.0:
-; X86-FAST-NEXT:    pushl %esi
-; X86-FAST-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT:    movzwl (%eax), %esi
-; X86-FAST-NEXT:    movzwl 2(%eax), %eax
-; X86-FAST-NEXT:    shldw $8, %si, %ax
-; X86-FAST-NEXT:    movw %cx, (%edx)
-; X86-FAST-NEXT:    popl %esi
-; X86-FAST-NEXT:    retl
-;
-; X86-SLOW-LABEL: combine_fshl_load_i16_chain_use:
-; X86-SLOW:       # %bb.0:
-; X86-SLOW-NEXT:    pushl %esi
-; X86-SLOW-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SLOW-NEXT:    movzwl 2(%esi), %eax
-; X86-SLOW-NEXT:    movzbl 1(%esi), %esi
-; X86-SLOW-NEXT:    shll $8, %eax
-; X86-SLOW-NEXT:    orl %esi, %eax
-; X86-SLOW-NEXT:    movw %cx, (%edx)
-; X86-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-SLOW-NEXT:    popl %esi
-; X86-SLOW-NEXT:    retl
-;
-; X64-FAST-LABEL: combine_fshl_load_i16_chain_use:
-; X64-FAST:       # %bb.0:
-; X64-FAST-NEXT:    movzwl (%rdi), %ecx
-; X64-FAST-NEXT:    movzwl 2(%rdi), %eax
-; X64-FAST-NEXT:    shldw $8, %cx, %ax
-; X64-FAST-NEXT:    movw %dx, (%rsi)
-; X64-FAST-NEXT:    retq
+; X86-LABEL: combine_fshl_load_i16_chain_use:
+; X86:       # %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl 1(%eax), %eax
+; X86-NEXT:    movw %cx, (%edx)
+; X86-NEXT:    retl
 ;
-; X64-SLOW-LABEL: combine_fshl_load_i16_chain_use:
-; X64-SLOW:       # %bb.0:
-; X64-SLOW-NEXT:    movzwl 2(%rdi), %eax
-; X64-SLOW-NEXT:    movzbl 1(%rdi), %ecx
-; X64-SLOW-NEXT:    shll $8, %eax
-; X64-SLOW-NEXT:    orl %ecx, %eax
-; X64-SLOW-NEXT:    movw %dx, (%rsi)
-; X64-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-SLOW-NEXT:    retq
+; X64-LABEL: combine_fshl_load_i16_chain_use:
+; X64:       # %bb.0:
+; X64-NEXT:    movzwl 1(%rdi), %eax
+; X64-NEXT:    movw %dx, (%rsi)
+; X64-NEXT:    retq
   %p1 = getelementptr i16, ptr %p, i32 1
   %ld0 = load i16, ptr %p
   %ld1 = load i16, ptr %p1