[llvm] [X86] narrowBitOpRMW - allow additional uses of the BTC/R/S result (PR #166376)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 5 00:51:34 PST 2025


https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/166376

From 461686633c8ef53da99b1fb4da700e5fbb5e91cc Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 4 Nov 2025 14:17:55 +0000
Subject: [PATCH 1/2] [X86] narrowBitOpRMW - allow additional uses of the
 BTC/R/S result

If there are additional uses of the bit-twiddled value as well as the RMW store, we can replace them with a (re)loaded copy of the full-width integer value taken after the store.

Some memory-op chain handling is needed here: the additional (re)load is chained after the new store, and any dependencies of the original store are then chained after the (re)load.
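
For illustration, a minimal IR sketch of the kind of pattern this now covers (the function name and exact shape are hypothetical, loosely modelled on the complement_cmpz_i128 test updated below): the i128 xor + store can still be narrowed to a 32-bit RMW xor, and the extra use of the updated value is instead fed by a reload of the whole i128 after the narrow store.

  define i1 @complement_and_test(ptr %word, i32 %position) {
    %rem = and i32 %position, 127
    %ofs = zext nneg i32 %rem to i128
    %bit = shl nuw i128 1, %ofs
    %old = load i128, ptr %word
    %new = xor i128 %old, %bit          ; BTC-style bit complement
    store i128 %new, ptr %word          ; RMW store
    %cmp = icmp ne i128 %new, 0         ; additional use of the updated value
    ret i1 %cmp
  }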
---
 llvm/lib/Target/X86/X86ISelLowering.cpp      |  19 +-
 llvm/test/CodeGen/X86/bittest-big-integer.ll | 401 ++++---------------
 2 files changed, 89 insertions(+), 331 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6edf0185df813..4da1bb0c81db1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53369,8 +53369,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
   //
   // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
   SDValue SrcVal, InsertBit, ShAmt;
-  if (!StoredVal.hasOneUse() ||
-      !(sd_match(StoredVal, m_And(m_Value(SrcVal),
+  if (!(sd_match(StoredVal, m_And(m_Value(SrcVal),
                                   m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
         sd_match(StoredVal,
                  m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
@@ -53441,8 +53440,20 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
     Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
   }
 
-  return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
-                      Align(), St->getMemOperand()->getFlags());
+  SDValue NewStore =
+      DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
+                   Align(), St->getMemOperand()->getFlags());
+
+  // If there are other uses of StoredVal, replace with a new load of the
+  // whole (updated) value and ensure that any chained dependencies on the
+  // original store are updated to come AFTER the new load.
+  if (!StoredVal.hasOneUse()) {
+    SDValue NewLoad =
+        DAG.getLoad(VT, DL, NewStore, Ld->getBasePtr(), Ld->getMemOperand());
+    DAG.ReplaceAllUsesWith(StoredVal, NewLoad);
+    DAG.ReplaceAllUsesWith(SDValue(St, 0), NewLoad.getValue(1));
+  }
+  return NewStore;
 }
 
 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index bcb14fd25b975..32d225273a6e1 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -906,115 +906,46 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
 define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind {
 ; X86-LABEL: complement_cmpz_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $64, %esp
-; X86-NEXT:    movzbl 12(%ebp), %ecx
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    andb $12, %al
-; X86-NEXT:    negb %al
-; X86-NEXT:    movsbl %al, %esi
-; X86-NEXT:    movl 36(%esp,%esi), %eax
-; X86-NEXT:    movl 40(%esp,%esi), %edi
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    shldl %cl, %eax, %edx
-; X86-NEXT:    movl 32(%esp,%esi), %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 44(%esp,%esi), %esi
-; X86-NEXT:    shldl %cl, %edi, %esi
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl %cl, %ebx, %eax
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    xorl 12(%ecx), %esi
-; X86-NEXT:    xorl 8(%ecx), %edx
-; X86-NEXT:    xorl 4(%ecx), %eax
-; X86-NEXT:    xorl (%ecx), %edi
-; X86-NEXT:    movl %edx, 8(%ecx)
-; X86-NEXT:    movl %esi, 12(%ecx)
-; X86-NEXT:    movl %edi, (%ecx)
-; X86-NEXT:    movl %eax, 4(%ecx)
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    orl %edx, %edi
-; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $1, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    andl $96, %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    xorl %edx, (%eax,%ecx)
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 4(%eax), %edx
+; X86-NEXT:    orl 12(%eax), %edx
+; X86-NEXT:    orl 8(%eax), %ecx
+; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    setne %al
-; X86-NEXT:    leal -12(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; SSE-LABEL: complement_cmpz_i128:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movl %esi, %ecx
 ; SSE-NEXT:    movl $1, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    shldq %cl, %rax, %rdx
-; SSE-NEXT:    shlq %cl, %rax
-; SSE-NEXT:    xorl %esi, %esi
-; SSE-NEXT:    testb $64, %cl
-; SSE-NEXT:    cmovneq %rax, %rdx
-; SSE-NEXT:    cmovneq %rsi, %rax
-; SSE-NEXT:    xorq 8(%rdi), %rdx
-; SSE-NEXT:    xorq (%rdi), %rax
-; SSE-NEXT:    movq %rax, (%rdi)
-; SSE-NEXT:    movq %rdx, 8(%rdi)
-; SSE-NEXT:    orq %rdx, %rax
+; SSE-NEXT:    shll %cl, %eax
+; SSE-NEXT:    andl $96, %ecx
+; SSE-NEXT:    shrl $3, %ecx
+; SSE-NEXT:    xorl %eax, (%rdi,%rcx)
+; SSE-NEXT:    movq (%rdi), %rax
+; SSE-NEXT:    orq 8(%rdi), %rax
 ; SSE-NEXT:    setne %al
 ; SSE-NEXT:    retq
 ;
-; AVX2-LABEL: complement_cmpz_i128:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    movl %esi, %ecx
-; AVX2-NEXT:    movl $1, %eax
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    shldq %cl, %rax, %rdx
-; AVX2-NEXT:    xorl %esi, %esi
-; AVX2-NEXT:    shlxq %rcx, %rax, %rax
-; AVX2-NEXT:    testb $64, %cl
-; AVX2-NEXT:    cmovneq %rax, %rdx
-; AVX2-NEXT:    cmovneq %rsi, %rax
-; AVX2-NEXT:    xorq 8(%rdi), %rdx
-; AVX2-NEXT:    xorq (%rdi), %rax
-; AVX2-NEXT:    movq %rax, (%rdi)
-; AVX2-NEXT:    movq %rdx, 8(%rdi)
-; AVX2-NEXT:    orq %rdx, %rax
-; AVX2-NEXT:    setne %al
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: complement_cmpz_i128:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movl %esi, %ecx
-; AVX512-NEXT:    xorl %eax, %eax
-; AVX512-NEXT:    movl $1, %edx
-; AVX512-NEXT:    xorl %esi, %esi
-; AVX512-NEXT:    shldq %cl, %rdx, %rsi
-; AVX512-NEXT:    shlxq %rcx, %rdx, %rdx
-; AVX512-NEXT:    testb $64, %cl
-; AVX512-NEXT:    cmovneq %rdx, %rsi
-; AVX512-NEXT:    cmovneq %rax, %rdx
-; AVX512-NEXT:    xorq 8(%rdi), %rsi
-; AVX512-NEXT:    xorq (%rdi), %rdx
-; AVX512-NEXT:    movq %rdx, (%rdi)
-; AVX512-NEXT:    movq %rsi, 8(%rdi)
-; AVX512-NEXT:    orq %rsi, %rdx
-; AVX512-NEXT:    setne %al
-; AVX512-NEXT:    retq
+; AVX-LABEL: complement_cmpz_i128:
+; AVX:       # %bb.0:
+; AVX-NEXT:    # kill: def $esi killed $esi def $rsi
+; AVX-NEXT:    movl $1, %eax
+; AVX-NEXT:    shlxl %esi, %eax, %eax
+; AVX-NEXT:    andl $96, %esi
+; AVX-NEXT:    shrl $3, %esi
+; AVX-NEXT:    xorl %eax, (%rdi,%rsi)
+; AVX-NEXT:    movq (%rdi), %rax
+; AVX-NEXT:    orq 8(%rdi), %rax
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    retq
   %rem = and i32 %position, 127
   %ofs = zext nneg i32 %rem to i128
   %bit = shl nuw i128 1, %ofs
@@ -1088,247 +1019,63 @@ define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind
 ; X86-LABEL: chain_reset_i256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $112, %esp
-; X86-NEXT:    movzbl 20(%ebp), %ecx
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    andb $28, %al
-; X86-NEXT:    negb %al
-; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 72(%esp,%eax), %edx
-; X86-NEXT:    movl 76(%esp,%eax), %edi
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    shldl %cl, %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 68(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl %cl, %esi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 80(%esp,%eax), %edx
-; X86-NEXT:    movl 84(%esp,%eax), %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl %cl, %edx, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl %cl, %edi, %edx
-; X86-NEXT:    movl 64(%esp,%eax), %edi
-; X86-NEXT:    movl 88(%esp,%eax), %esi
-; X86-NEXT:    movl 92(%esp,%eax), %eax
-; X86-NEXT:    shldl %cl, %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl %cl, %ebx, %esi
-; X86-NEXT:    shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    notl %ebx
-; X86-NEXT:    notl %eax
-; X86-NEXT:    notl %edx
-; X86-NEXT:    notl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    notl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    notl %edi
-; X86-NEXT:    notl %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    notl %esi
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    andl 12(%ecx), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    andl 8(%ecx), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    andl 20(%ecx), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    andl 16(%ecx), %edx
-; X86-NEXT:    andl 28(%ecx), %eax
-; X86-NEXT:    andl 24(%ecx), %ebx
-; X86-NEXT:    andl 4(%ecx), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    andl (%ecx), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, 24(%ecx)
-; X86-NEXT:    movl %eax, 28(%ecx)
-; X86-NEXT:    movl %edx, 16(%ecx)
-; X86-NEXT:    movl %edi, 20(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, 8(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, 12(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, (%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 4(%ecx)
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    orl %eax, %esi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movl (%eax), %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    orl %ebx, %edx
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $-2, %edi
+; X86-NEXT:    roll %cl, %edi
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    andl $28, %ecx
+; X86-NEXT:    andl %edi, (%esi,%ecx)
+; X86-NEXT:    movl 8(%esi), %ebx
+; X86-NEXT:    movl (%esi), %edi
+; X86-NEXT:    movl 4(%esi), %ecx
+; X86-NEXT:    movl 12(%esi), %ebp
+; X86-NEXT:    orl 28(%esi), %ebp
+; X86-NEXT:    orl 20(%esi), %ecx
+; X86-NEXT:    orl %ebp, %ecx
+; X86-NEXT:    orl 24(%esi), %ebx
+; X86-NEXT:    movl 16(%esi), %ebp
+; X86-NEXT:    orl %edi, %ebp
+; X86-NEXT:    orl %ebx, %ebp
+; X86-NEXT:    movl (%edx), %esi
+; X86-NEXT:    movl %edi, (%edx)
 ; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    orl %ecx, %ebp
 ; X86-NEXT:    jne .LBB23_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:  .LBB23_2:
-; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
-; SSE-LABEL: chain_reset_i256:
-; SSE:       # %bb.0:
-; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movl %ecx, %eax
-; SSE-NEXT:    shrb $3, %al
-; SSE-NEXT:    andb $24, %al
-; SSE-NEXT:    negb %al
-; SSE-NEXT:    movsbq %al, %r10
-; SSE-NEXT:    movq -24(%rsp,%r10), %r8
-; SSE-NEXT:    movq -16(%rsp,%r10), %rax
-; SSE-NEXT:    shldq %cl, %r8, %rax
-; SSE-NEXT:    movq -32(%rsp,%r10), %r9
-; SSE-NEXT:    shldq %cl, %r9, %r8
-; SSE-NEXT:    movq -40(%rsp,%r10), %r10
-; SSE-NEXT:    shldq %cl, %r10, %r9
-; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT:    shlq %cl, %r10
-; SSE-NEXT:    notq %r8
-; SSE-NEXT:    notq %rax
-; SSE-NEXT:    notq %r10
-; SSE-NEXT:    notq %r9
-; SSE-NEXT:    andq 24(%rdi), %rax
-; SSE-NEXT:    andq 16(%rdi), %r8
-; SSE-NEXT:    andq 8(%rdi), %r9
-; SSE-NEXT:    andq (%rdi), %r10
-; SSE-NEXT:    movq %r8, 16(%rdi)
-; SSE-NEXT:    movq %rax, 24(%rdi)
-; SSE-NEXT:    movq %r10, (%rdi)
-; SSE-NEXT:    movq %r9, 8(%rdi)
-; SSE-NEXT:    orq %rax, %r9
-; SSE-NEXT:    orq %r10, %r8
-; SSE-NEXT:    movl (%rsi), %eax
-; SSE-NEXT:    movl %r10d, (%rsi)
-; SSE-NEXT:    movl (%rdx), %ecx
-; SSE-NEXT:    addl %ecx, %eax
-; SSE-NEXT:    orq %r9, %r8
-; SSE-NEXT:    cmovnel %ecx, %eax
-; SSE-NEXT:    retq
-;
-; AVX2-LABEL: chain_reset_i256:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    # kill: def $ecx killed $ecx def $rcx
-; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movl %ecx, %eax
-; AVX2-NEXT:    shrb $3, %al
-; AVX2-NEXT:    andb $24, %al
-; AVX2-NEXT:    negb %al
-; AVX2-NEXT:    movsbq %al, %rax
-; AVX2-NEXT:    movq -32(%rsp,%rax), %r8
-; AVX2-NEXT:    movq -24(%rsp,%rax), %r9
-; AVX2-NEXT:    movq %r9, %r10
-; AVX2-NEXT:    shldq %cl, %r8, %r10
-; AVX2-NEXT:    movq -40(%rsp,%rax), %r11
-; AVX2-NEXT:    movq -16(%rsp,%rax), %rax
-; AVX2-NEXT:    shldq %cl, %r9, %rax
-; AVX2-NEXT:    shldq %cl, %r11, %r8
-; AVX2-NEXT:    andnq 24(%rdi), %rax, %rax
-; AVX2-NEXT:    andnq 16(%rdi), %r10, %r9
-; AVX2-NEXT:    andnq 8(%rdi), %r8, %r8
-; AVX2-NEXT:    shlxq %rcx, %r11, %rcx
-; AVX2-NEXT:    andnq (%rdi), %rcx, %rcx
-; AVX2-NEXT:    movq %r9, 16(%rdi)
-; AVX2-NEXT:    movq %rax, 24(%rdi)
-; AVX2-NEXT:    movq %rcx, (%rdi)
-; AVX2-NEXT:    movq %r8, 8(%rdi)
-; AVX2-NEXT:    orq %rax, %r8
-; AVX2-NEXT:    orq %rcx, %r9
-; AVX2-NEXT:    movl (%rsi), %eax
-; AVX2-NEXT:    movl %ecx, (%rsi)
-; AVX2-NEXT:    movl (%rdx), %ecx
-; AVX2-NEXT:    addl %ecx, %eax
-; AVX2-NEXT:    orq %r8, %r9
-; AVX2-NEXT:    cmovnel %ecx, %eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: chain_reset_i256:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    # kill: def $ecx killed $ecx def $rcx
-; AVX512-NEXT:    movl %ecx, %eax
-; AVX512-NEXT:    shrb $3, %al
-; AVX512-NEXT:    andb $24, %al
-; AVX512-NEXT:    negb %al
-; AVX512-NEXT:    movsbq %al, %rax
-; AVX512-NEXT:    movq -40(%rsp,%rax), %r8
-; AVX512-NEXT:    movq -32(%rsp,%rax), %r9
-; AVX512-NEXT:    movq -24(%rsp,%rax), %r10
-; AVX512-NEXT:    movq %r10, %r11
-; AVX512-NEXT:    shldq %cl, %r9, %r11
-; AVX512-NEXT:    movq -16(%rsp,%rax), %rax
-; AVX512-NEXT:    shldq %cl, %r10, %rax
-; AVX512-NEXT:    shlxq %rcx, %r8, %r10
-; AVX512-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512-NEXT:    shldq %cl, %r8, %r9
-; AVX512-NEXT:    andnq 24(%rdi), %rax, %rax
-; AVX512-NEXT:    andnq 16(%rdi), %r11, %rcx
-; AVX512-NEXT:    andnq 8(%rdi), %r9, %r8
-; AVX512-NEXT:    andnq (%rdi), %r10, %r9
-; AVX512-NEXT:    movq %rcx, 16(%rdi)
-; AVX512-NEXT:    movq %rax, 24(%rdi)
-; AVX512-NEXT:    movq %r9, (%rdi)
-; AVX512-NEXT:    movq %r8, 8(%rdi)
-; AVX512-NEXT:    orq %rax, %r8
-; AVX512-NEXT:    orq %r9, %rcx
-; AVX512-NEXT:    movl (%rsi), %eax
-; AVX512-NEXT:    movl %r9d, (%rsi)
-; AVX512-NEXT:    movl (%rdx), %edx
-; AVX512-NEXT:    addl %edx, %eax
-; AVX512-NEXT:    orq %r8, %rcx
-; AVX512-NEXT:    cmovnel %edx, %eax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; X64-LABEL: chain_reset_i256:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT:    movl $-2, %eax
+; X64-NEXT:    roll %cl, %eax
+; X64-NEXT:    shrl $3, %ecx
+; X64-NEXT:    andl $28, %ecx
+; X64-NEXT:    andl %eax, (%rdi,%rcx)
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    movq 8(%rdi), %r8
+; X64-NEXT:    orq 24(%rdi), %r8
+; X64-NEXT:    movq 16(%rdi), %rdi
+; X64-NEXT:    orq %rcx, %rdi
+; X64-NEXT:    movl (%rsi), %eax
+; X64-NEXT:    movl %ecx, (%rsi)
+; X64-NEXT:    movl (%rdx), %ecx
+; X64-NEXT:    addl %ecx, %eax
+; X64-NEXT:    orq %r8, %rdi
+; X64-NEXT:    cmovnel %ecx, %eax
+; X64-NEXT:    retq
   %rem = and i32 %position, 255
   %ofs = zext nneg i32 %rem to i256
   %bit = shl nuw i256 1, %ofs

From 9e7207eccc43fba6806b1a2b42cf0f60e7337ce8 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 5 Nov 2025 08:51:13 +0000
Subject: [PATCH 2/2] Drop extra store-load-store chain splicing

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d2eeb76057418..4d44227b3ecd4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53446,13 +53446,11 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
                    Align(), St->getMemOperand()->getFlags());
 
   // If there are other uses of StoredVal, replace with a new load of the
-  // whole (updated) value and ensure that any chained dependencies on the
-  // original store are updated to come AFTER the new load.
+  // whole (updated) value.
   if (!StoredVal.hasOneUse()) {
     SDValue NewLoad =
         DAG.getLoad(VT, DL, NewStore, Ld->getBasePtr(), Ld->getMemOperand());
     DAG.ReplaceAllUsesWith(StoredVal, NewLoad);
-    DAG.ReplaceAllUsesWith(SDValue(St, 0), NewLoad.getValue(1));
   }
   return NewStore;
 }


