[llvm] [X86] narrowBitOpRMW - use reachesChainWithoutSideEffects instead of direct chain matching (PR #165870)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 31 08:23:12 PDT 2025
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/165870
>From 6e9448d1da7e15c65ebbfd81fb8b14d3a2b77186 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 31 Oct 2025 14:44:14 +0000
Subject: [PATCH] [X86] narrowBitOpRMW - use reachesChainWithoutSideEffects
instead of direct chain matching
This will allow us to match RMW load/store chains through TokenFactor nodes if there are additional loads in the chain before the store
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 40 +++--
llvm/test/CodeGen/X86/bittest-big-integer.ll | 163 ++++---------------
2 files changed, 48 insertions(+), 155 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 007074c3ffc82..62d68f45df177 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53351,21 +53351,11 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
using namespace SDPatternMatch;
-
- // Only handle normal stores and its chain was a matching normal load.
- auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
- if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld ||
- !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
- Ld->getBasePtr() != St->getBasePtr() ||
- Ld->getOffset() != St->getOffset())
- return SDValue();
-
- SDValue LoadVal(Ld, 0);
SDValue StoredVal = St->getValue();
EVT VT = StoredVal.getValueType();
- // Only narrow larger than legal scalar integers.
- if (!VT.isScalarInteger() ||
+ // Only narrow normal stores of larger than legal scalar integers.
+ if (!ISD::isNormalStore(St) || !St->isSimple() || !VT.isScalarInteger() ||
VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32))
return SDValue();
@@ -53374,18 +53364,26 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
// BTC: X ^ (1 << ShAmt)
//
// BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
- SDValue InsertBit, ShAmt;
+ SDValue SrcVal, InsertBit, ShAmt;
if (!StoredVal.hasOneUse() ||
- !(sd_match(StoredVal, m_And(m_Specific(LoadVal),
+ !(sd_match(StoredVal, m_And(m_Value(SrcVal),
m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
sd_match(StoredVal,
- m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+ m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
sd_match(StoredVal,
- m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
- sd_match(StoredVal,
- m_Or(m_And(m_Specific(LoadVal),
- m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
- m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
+ m_Xor(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+ sd_match(
+ StoredVal,
+ m_Or(m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
+ m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
+ return SDValue();
+
+ // SrcVal must be a matching normal load further up the chain.
+ auto *Ld = dyn_cast<LoadSDNode>(SrcVal);
+ if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
+ Ld->getBasePtr() != St->getBasePtr() ||
+ Ld->getOffset() != St->getOffset() ||
+ !St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1)))
return SDValue();
// Ensure the shift amount is in bounds.
@@ -53419,7 +53417,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
SDNodeFlags::NoUnsignedWrap);
// Reconstruct the BTC/BTR/BTS pattern for the i32 block and store.
- SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt);
+ SDValue X = DAG.getNode(ISD::SRL, DL, VT, SrcVal, AlignAmt);
X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32,
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 5776c6c82bcc3..c197a83835506 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -2,8 +2,8 @@
; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512
; bt/btc/btr/bts patterns + 'init' to set single bit value in large integers
@@ -1029,151 +1029,46 @@ define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind {
define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; X86-LABEL: reset_multiload_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %edi
-; X86-NEXT: movl 36(%esp,%edi), %edx
-; X86-NEXT: movl 40(%esp,%edi), %ebx
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl 32(%esp,%edi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%esp,%edi), %edi
-; X86-NEXT: shldl %cl, %ebx, %edi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl (%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%ebp), %eax
-; X86-NEXT: andl $96, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl (%ecx,%eax), %eax
-; X86-NEXT: andl %ebx, (%ecx)
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %edx
-; X86-NEXT: notl %edx
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: andl %edx, 4(%ebx)
-; X86-NEXT: notl %esi
-; X86-NEXT: andl %esi, 8(%ebx)
-; X86-NEXT: notl %edi
-; X86-NEXT: andl %edi, 12(%ebx)
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $96, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: btrl %edx, %ebx
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: movl %ebx, (%ecx,%esi)
; X86-NEXT: jae .LBB22_2
; X86-NEXT: # %bb.1:
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: .LBB22_2:
-; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: reset_multiload_i128:
-; SSE: # %bb.0:
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %esi
-; SSE-NEXT: xorl %r8d, %r8d
-; SSE-NEXT: shldq %cl, %rsi, %r8
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: shlq %cl, %rsi
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rsi, %r8
-; SSE-NEXT: cmovneq %rax, %rsi
-; SSE-NEXT: notq %r8
-; SSE-NEXT: notq %rsi
-; SSE-NEXT: movl %ecx, %r9d
-; SSE-NEXT: andl $96, %r9d
-; SSE-NEXT: shrl $3, %r9d
-; SSE-NEXT: movl (%rdi,%r9), %r9d
-; SSE-NEXT: btl %ecx, %r9d
-; SSE-NEXT: jb .LBB22_2
-; SSE-NEXT: # %bb.1:
-; SSE-NEXT: movl (%rdx), %eax
-; SSE-NEXT: .LBB22_2:
-; SSE-NEXT: andq %r8, 8(%rdi)
-; SSE-NEXT: andq %rsi, (%rdi)
-; SSE-NEXT: # kill: def $eax killed $eax killed $rax
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: reset_multiload_i128:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: movl $1, %r8d
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: shldq %cl, %r8, %rsi
-; AVX2-NEXT: shlxq %rcx, %r8, %r8
-; AVX2-NEXT: testb $64, %cl
-; AVX2-NEXT: cmovneq %r8, %rsi
-; AVX2-NEXT: cmovneq %rax, %r8
-; AVX2-NEXT: notq %rsi
-; AVX2-NEXT: notq %r8
-; AVX2-NEXT: movl %ecx, %r9d
-; AVX2-NEXT: andl $96, %r9d
-; AVX2-NEXT: shrl $3, %r9d
-; AVX2-NEXT: movl (%rdi,%r9), %r9d
-; AVX2-NEXT: btl %ecx, %r9d
-; AVX2-NEXT: jb .LBB22_2
-; AVX2-NEXT: # %bb.1:
-; AVX2-NEXT: movl (%rdx), %eax
-; AVX2-NEXT: .LBB22_2:
-; AVX2-NEXT: andq %rsi, 8(%rdi)
-; AVX2-NEXT: andq %r8, (%rdi)
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: reset_multiload_i128:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: movl $1, %r8d
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: shldq %cl, %r8, %rsi
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: shlxq %rcx, %r8, %r8
-; AVX512-NEXT: testb $64, %cl
-; AVX512-NEXT: cmovneq %r8, %rsi
-; AVX512-NEXT: cmovneq %rax, %r8
-; AVX512-NEXT: notq %rsi
-; AVX512-NEXT: notq %r8
-; AVX512-NEXT: movl %ecx, %r9d
-; AVX512-NEXT: andl $96, %r9d
-; AVX512-NEXT: shrl $3, %r9d
-; AVX512-NEXT: movl (%rdi,%r9), %r9d
-; AVX512-NEXT: btl %ecx, %r9d
-; AVX512-NEXT: jb .LBB22_2
-; AVX512-NEXT: # %bb.1:
-; AVX512-NEXT: movl (%rdx), %eax
-; AVX512-NEXT: .LBB22_2:
-; AVX512-NEXT: andq %rsi, 8(%rdi)
-; AVX512-NEXT: andq %r8, (%rdi)
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: retq
+; X64-LABEL: reset_multiload_i128:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: andl $96, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %r9d
+; X64-NEXT: movl %r9d, %r8d
+; X64-NEXT: btrl %esi, %r8d
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: btl %esi, %r9d
+; X64-NEXT: jb .LBB22_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: .LBB22_2:
+; X64-NEXT: movl %r8d, (%rdi,%rcx)
+; X64-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
More information about the llvm-commits
mailing list