[llvm] c4ac31d - [X86] narrowBitOpRMW - use reachesChainWithoutSideEffects instead of direct chain matching (#165870)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 4 02:48:54 PST 2025
Author: Simon Pilgrim
Date: 2025-11-04T10:48:49Z
New Revision: c4ac31dc52e016a413ac9e2fa98252c6ff643e3c
URL: https://github.com/llvm/llvm-project/commit/c4ac31dc52e016a413ac9e2fa98252c6ff643e3c
DIFF: https://github.com/llvm/llvm-project/commit/c4ac31dc52e016a413ac9e2fa98252c6ff643e3c.diff
LOG: [X86] narrowBitOpRMW - use reachesChainWithoutSideEffects instead of direct chain matching (#165870)
This will allow us to match RMW load/store chains through TokenFactor nodes when there are additional loads on the chain before the store.
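For context: SDValue::reachesChainWithoutSideEffects(Dest, Depth) answers whether the store's incoming chain can be traced back to the load's output chain while passing only through nodes that cannot write memory, i.e. TokenFactor merges and other simple loads. Below is a minimal standalone sketch of that walk over a toy node graph; Node, Kind, and reachesWithoutSideEffects are illustrative names, not LLVM's data structures, and the real entry point is the SDValue member function used in the patch.

// Toy model of the chain walk behind SDValue::reachesChainWithoutSideEffects.
// The store's chain reaches the load's output chain "without side effects"
// if every step is the destination itself, a TokenFactor merge, or another
// simple load (loads do not write memory, so they can be looked through).
// Any other node may clobber the stored location, so the RMW narrowing bails.
#include <vector>

enum class Kind { LoadChain, SimpleLoad, TokenFactor, Other };

struct Node {
  Kind kind;
  std::vector<const Node *> chainOps; // incoming chain operands
};

bool reachesWithoutSideEffects(const Node *From, const Node *Dest,
                               unsigned Depth = 2) {
  if (From == Dest)
    return true;
  if (Depth == 0) // bounded search, like the Depth parameter in LLVM
    return false;
  if (From->kind == Kind::TokenFactor) {
    // A TokenFactor merges chains that happen in parallel: a direct operand
    // hit is enough, otherwise every merged path must reach Dest cleanly.
    for (const Node *Op : From->chainOps)
      if (Op == Dest)
        return true;
    for (const Node *Op : From->chainOps)
      if (!reachesWithoutSideEffects(Op, Dest, Depth - 1))
        return false;
    return true; // mirrors an all-of check over the operands
  }
  if (From->kind == Kind::SimpleLoad && !From->chainOps.empty())
    // A simple (non-volatile, non-atomic) load has no side effects;
    // keep walking its incoming chain.
    return reachesWithoutSideEffects(From->chainOps[0], Dest, Depth - 1);
  return false; // unknown node: it could write memory between load and store
}

int main() {
  // Shape of the motivating case: the store's chain is a TokenFactor of the
  // RMW load's chain and an unrelated extra load, as in reset_multiload_i128.
  // Direct chain matching (the old code) rejected this; the walk accepts it.
  Node rmwLoadChain{Kind::LoadChain, {}};
  Node extraLoad{Kind::SimpleLoad, {&rmwLoadChain}};
  Node tokenFactor{Kind::TokenFactor, {&extraLoad, &rmwLoadChain}};
  return reachesWithoutSideEffects(&tokenFactor, &rmwLoadChain) ? 0 : 1;
}

In the patch this corresponds to the new !St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1)) guard, which replaces the old requirement that the store's chain be the matching load node itself.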
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/bittest-big-integer.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d4a4d4339f7e1..6edf0185df813 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53355,21 +53355,11 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
using namespace SDPatternMatch;
-
- // Only handle normal stores and its chain was a matching normal load.
- auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
- if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld ||
- !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
- Ld->getBasePtr() != St->getBasePtr() ||
- Ld->getOffset() != St->getOffset())
- return SDValue();
-
- SDValue LoadVal(Ld, 0);
SDValue StoredVal = St->getValue();
EVT VT = StoredVal.getValueType();
- // Only narrow larger than legal scalar integers.
- if (!VT.isScalarInteger() ||
+ // Only narrow normal stores of larger than legal scalar integers.
+ if (!ISD::isNormalStore(St) || !St->isSimple() || !VT.isScalarInteger() ||
VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32))
return SDValue();
@@ -53378,18 +53368,26 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
// BTC: X ^ (1 << ShAmt)
//
// BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
- SDValue InsertBit, ShAmt;
+ SDValue SrcVal, InsertBit, ShAmt;
if (!StoredVal.hasOneUse() ||
- !(sd_match(StoredVal, m_And(m_Specific(LoadVal),
+ !(sd_match(StoredVal, m_And(m_Value(SrcVal),
m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
sd_match(StoredVal,
- m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+ m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
sd_match(StoredVal,
- m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
- sd_match(StoredVal,
- m_Or(m_And(m_Specific(LoadVal),
- m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
- m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
+ m_Xor(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+ sd_match(
+ StoredVal,
+ m_Or(m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
+ m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
+ return SDValue();
+
+ // SrcVal must be a matching normal load further up the chain.
+ auto *Ld = dyn_cast<LoadSDNode>(SrcVal);
+ if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
+ Ld->getBasePtr() != St->getBasePtr() ||
+ Ld->getOffset() != St->getOffset() ||
+ !St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1)))
return SDValue();
// Ensure the shift amount is in bounds.
@@ -53423,7 +53421,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
SDNodeFlags::NoUnsignedWrap);
// Reconstruct the BTC/BTR/BTS pattern for the i32 block and store.
- SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt);
+ SDValue X = DAG.getNode(ISD::SRL, DL, VT, SrcVal, AlignAmt);
X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32,
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 87a54a0b9148d..c197a83835506 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -1029,144 +1029,46 @@ define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind {
define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; X86-LABEL: reset_multiload_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 40(%esp,%eax), %edx
-; X86-NEXT: movl 44(%esp,%eax), %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl 32(%esp,%eax), %edi
-; X86-NEXT: movl 36(%esp,%eax), %ebx
-; X86-NEXT: shldl %cl, %ebx, %edx
-; X86-NEXT: shldl %cl, %edi, %ebx
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl (%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: andl %ebx, 4(%eax)
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: notl %edi
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: andl $96, %ebx
-; X86-NEXT: shrl $3, %ebx
-; X86-NEXT: movl (%eax,%ebx), %ebx
-; X86-NEXT: andl %edi, (%eax)
-; X86-NEXT: notl %esi
-; X86-NEXT: andl %esi, 12(%eax)
-; X86-NEXT: notl %edx
-; X86-NEXT: andl %edx, 8(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: btl %ecx, %ebx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $96, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: btrl %edx, %ebx
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: movl %ebx, (%ecx,%esi)
; X86-NEXT: jae .LBB22_2
; X86-NEXT: # %bb.1:
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: .LBB22_2:
-; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: reset_multiload_i128:
-; SSE: # %bb.0:
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %esi
-; SSE-NEXT: xorl %r8d, %r8d
-; SSE-NEXT: shldq %cl, %rsi, %r8
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: shlq %cl, %rsi
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rsi, %r8
-; SSE-NEXT: cmovneq %rax, %rsi
-; SSE-NEXT: notq %r8
-; SSE-NEXT: notq %rsi
-; SSE-NEXT: movl %ecx, %r9d
-; SSE-NEXT: andl $96, %r9d
-; SSE-NEXT: shrl $3, %r9d
-; SSE-NEXT: movl (%rdi,%r9), %r9d
-; SSE-NEXT: btl %ecx, %r9d
-; SSE-NEXT: jb .LBB22_2
-; SSE-NEXT: # %bb.1:
-; SSE-NEXT: movl (%rdx), %eax
-; SSE-NEXT: .LBB22_2:
-; SSE-NEXT: andq %rsi, (%rdi)
-; SSE-NEXT: andq %r8, 8(%rdi)
-; SSE-NEXT: # kill: def $eax killed $eax killed $rax
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: reset_multiload_i128:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: movl $1, %r8d
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: shldq %cl, %r8, %rsi
-; AVX2-NEXT: shlxq %rcx, %r8, %r8
-; AVX2-NEXT: testb $64, %cl
-; AVX2-NEXT: cmovneq %r8, %rsi
-; AVX2-NEXT: cmovneq %rax, %r8
-; AVX2-NEXT: notq %rsi
-; AVX2-NEXT: notq %r8
-; AVX2-NEXT: movl %ecx, %r9d
-; AVX2-NEXT: andl $96, %r9d
-; AVX2-NEXT: shrl $3, %r9d
-; AVX2-NEXT: movl (%rdi,%r9), %r9d
-; AVX2-NEXT: btl %ecx, %r9d
-; AVX2-NEXT: jb .LBB22_2
-; AVX2-NEXT: # %bb.1:
-; AVX2-NEXT: movl (%rdx), %eax
-; AVX2-NEXT: .LBB22_2:
-; AVX2-NEXT: andq %r8, (%rdi)
-; AVX2-NEXT: andq %rsi, 8(%rdi)
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: reset_multiload_i128:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: movl $1, %r8d
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: shldq %cl, %r8, %rsi
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: shlxq %rcx, %r8, %r8
-; AVX512-NEXT: testb $64, %cl
-; AVX512-NEXT: cmovneq %r8, %rsi
-; AVX512-NEXT: cmovneq %rax, %r8
-; AVX512-NEXT: notq %rsi
-; AVX512-NEXT: notq %r8
-; AVX512-NEXT: movl %ecx, %r9d
-; AVX512-NEXT: andl $96, %r9d
-; AVX512-NEXT: shrl $3, %r9d
-; AVX512-NEXT: movl (%rdi,%r9), %r9d
-; AVX512-NEXT: btl %ecx, %r9d
-; AVX512-NEXT: jb .LBB22_2
-; AVX512-NEXT: # %bb.1:
-; AVX512-NEXT: movl (%rdx), %eax
-; AVX512-NEXT: .LBB22_2:
-; AVX512-NEXT: andq %r8, (%rdi)
-; AVX512-NEXT: andq %rsi, 8(%rdi)
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: retq
+; X64-LABEL: reset_multiload_i128:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: andl $96, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %r9d
+; X64-NEXT: movl %r9d, %r8d
+; X64-NEXT: btrl %esi, %r8d
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: btl %esi, %r9d
+; X64-NEXT: jb .LBB22_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: .LBB22_2:
+; X64-NEXT: movl %r8d, (%rdi,%rcx)
+; X64-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs