[llvm] [DAG] visitBITCAST - fold (bitcast (freeze (load x))) -> (freeze (load (bitcast*)x)) (PR #164618)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 5 07:03:40 PST 2025
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/164618
>From b3f2dfa4c917b0fb60079145dde8cbd9ab1093cd Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 22 Oct 2025 13:44:42 +0100
Subject: [PATCH 1/2] [DAG] visitBITCAST - fold (bitcast (freeze (load x))) ->
(freeze (load (bitcast*)x))
Tweak the existing (bitcast (load x)) -> (load (bitcast*)x) fold to handle freeze as well.
Inspired by #163070 - attempt to pass the bitcast through a one-use frozen load.
This tries to avoid in-place replacement of frozen nodes, which has caused infinite loops in the past.
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 69 ++++---
llvm/lib/Target/X86/X86ISelLowering.cpp | 6 +
.../test/CodeGen/X86/avx10_2_512bf16-arith.ll | 2 +-
llvm/test/CodeGen/X86/avx10_2bf16-arith.ll | 4 +-
.../CodeGen/X86/widen-load-of-small-alloca.ll | 185 ++++++++----------
5 files changed, 128 insertions(+), 138 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d2ea6525e1116..08df4c6466a4b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16693,38 +16693,51 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
}
// fold (conv (load x)) -> (load (conv*)x)
+ // fold (conv (freeze (load x))) -> (freeze (load (conv*)x))
// If the resultant load doesn't need a higher alignment than the original!
- if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
- // Do not remove the cast if the types differ in endian layout.
- TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) ==
- TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()) &&
- // If the load is volatile, we only want to change the load type if the
- // resulting load is legal. Otherwise we might increase the number of
- // memory accesses. We don't care if the original type was legal or not
- // as we assume software couldn't rely on the number of accesses of an
- // illegal type.
- ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) ||
- TLI.isOperationLegal(ISD::LOAD, VT))) {
- LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ auto CastLoad = [this, &VT](SDValue N0, const SDLoc &DL) {
+ auto *LN0 = dyn_cast<LoadSDNode>(N0);
+ if (!LN0 || !ISD::isNormalLoad(LN0) || !N0.hasOneUse())
+ return SDValue();
- if (TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG,
- *LN0->getMemOperand())) {
- // If the range metadata type does not match the new memory
- // operation type, remove the range metadata.
- if (const MDNode *MD = LN0->getRanges()) {
- ConstantInt *Lower = mdconst::extract<ConstantInt>(MD->getOperand(0));
- if (Lower->getBitWidth() != VT.getScalarSizeInBits() ||
- !VT.isInteger()) {
- LN0->getMemOperand()->clearRanges();
- }
+ // Do not remove the cast if the types differ in endian layout.
+ if (TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) !=
+ TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()))
+ return SDValue();
+
+ // If the load is volatile, we only want to change the load type if the
+ // resulting load is legal. Otherwise we might increase the number of
+ // memory accesses. We don't care if the original type was legal or not
+ // as we assume software couldn't rely on the number of accesses of an
+ // illegal type.
+ if (((LegalOperations || !LN0->isSimple()) &&
+ !TLI.isOperationLegal(ISD::LOAD, VT)))
+ return SDValue();
+
+ if (!TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG,
+ *LN0->getMemOperand()))
+ return SDValue();
+
+ // If the range metadata type does not match the new memory
+ // operation type, remove the range metadata.
+ if (const MDNode *MD = LN0->getRanges()) {
+ ConstantInt *Lower = mdconst::extract<ConstantInt>(MD->getOperand(0));
+ if (Lower->getBitWidth() != VT.getScalarSizeInBits() || !VT.isInteger()) {
+ LN0->getMemOperand()->clearRanges();
}
- SDValue Load =
- DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
- LN0->getMemOperand());
- DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
- return Load;
}
- }
+ SDValue Load = DAG.getLoad(VT, DL, LN0->getChain(), LN0->getBasePtr(),
+ LN0->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
+ return Load;
+ };
+
+ if (SDValue NewLd = CastLoad(N0, SDLoc(N)))
+ return NewLd;
+
+ if (N0.getOpcode() == ISD::FREEZE && N0.hasOneUse())
+ if (SDValue NewLd = CastLoad(N0.getOperand(0), SDLoc(N)))
+ return DAG.getFreeze(NewLd);
if (SDValue V = foldBitcastedFPLogic(N, DAG, TLI))
return V;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d49f25a950e3a..05ee4176c67b6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3453,6 +3453,12 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
return true;
+ // If we have a large vector type (even if illegal), don't bitcast to large
+ // (illegal) scalar types. Better to load fewer vectors and extract.
+ if (LoadVT.isVector() && !BitcastVT.isVector() && LoadVT.isInteger() &&
+ BitcastVT.isInteger() && (LoadVT.getSizeInBits() % 128) == 0)
+ return false;
+
return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
}
diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
index 79849a7153c91..d9b4635042256 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
@@ -94,8 +94,8 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_512(<32 x bfloat> %src,
;
; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_512:
; X86: # %bb.0:
-; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vsubbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2]
; X86-NEXT: vsubbf16 (%eax), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x08]
; X86-NEXT: vsubbf16 %zmm1, %zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1]
diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
index 0f2c75b15d5b4..01b7618753a23 100644
--- a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
@@ -147,8 +147,8 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_256(<16 x bfloat> %src,
;
; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_256:
; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vsubbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2]
; X86-NEXT: vsubbf16 (%eax), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x08]
; X86-NEXT: vsubbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1]
@@ -201,8 +201,8 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_128(<8 x bfloat> %src, <8
;
; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_128:
; X86: # %bb.0:
-; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vsubbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2]
; X86-NEXT: vsubbf16 (%eax), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x08]
; X86-NEXT: vsubbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1]
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index 8d36eef952a2b..1438f790cea48 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -168,8 +168,8 @@ define void @load_2byte_chunk_of_4byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movb %al, (%rdx)
@@ -188,17 +188,15 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
@@ -215,13 +213,11 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
@@ -236,14 +232,11 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
@@ -260,23 +253,19 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca:
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %bl, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %dl, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
@@ -292,8 +281,8 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movw %ax, (%rdx)
@@ -312,17 +301,15 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
@@ -339,18 +326,16 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movw %dx, (%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: retl
;
@@ -360,14 +345,11 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
@@ -386,18 +368,16 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %dx, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
@@ -413,8 +393,8 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
@@ -433,17 +413,15 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
@@ -460,18 +438,16 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: retl
;
@@ -481,14 +457,11 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
@@ -507,18 +480,16 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
@@ -536,8 +507,8 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
@@ -557,8 +528,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -571,8 +542,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
@@ -591,8 +562,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -698,8 +669,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
@@ -719,8 +690,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -733,8 +704,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
@@ -753,8 +724,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -859,8 +830,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
@@ -880,8 +851,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -894,8 +865,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
@@ -914,8 +885,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -1020,8 +991,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
@@ -1041,8 +1012,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -1055,8 +1026,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
@@ -1075,8 +1046,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
>From df55a151e072bb0edf34870809faf7e14992a09a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 5 Nov 2025 15:03:23 +0000
Subject: [PATCH 2/2] Remove extra brackets
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b4ef7afa7e060..9edb43abb145c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16751,8 +16751,8 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
// memory accesses. We don't care if the original type was legal or not
// as we assume software couldn't rely on the number of accesses of an
// illegal type.
- if (((LegalOperations || !LN0->isSimple()) &&
- !TLI.isOperationLegal(ISD::LOAD, VT)))
+ if ((LegalOperations || !LN0->isSimple()) &&
+ !TLI.isOperationLegal(ISD::LOAD, VT))
return SDValue();
if (!TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG,
More information about the llvm-commits
mailing list