[llvm] [X86] Attempt to fold trunc(srl(load(p), amt) -> load(p+amt/8) (PR #165266)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 27 08:35:53 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
<details>
<summary>Changes</summary>
As reported on #<!-- -->164853 - we only attempt to reduce shifted loads for constant shift amounts, but we could do more with non-constant values if value tracking can confirm basic alignments.
This patch determines if a truncated shifted load of scalar integer, shifts by a whole/aligned number of the truncated integer width and replaces the non-constant shift amount with a pointer offset instead.
I had hoped to make this a generic DAG fold, but reduceLoadWidth isn't ready to be converted to a KnownBits value tracking mechanism, and other targets don't have complex address math like X86.
Fixes #<!-- -->164853
---
Patch is 86.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165266.diff
5 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+34)
- (modified) llvm/test/CodeGen/X86/bfloat-calling-conv.ll (+2-4)
- (modified) llvm/test/CodeGen/X86/trunc-srl-load.ll (+108-1544)
- (modified) llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll (+16-34)
- (modified) llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll (+17-36)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 410f20edc6281..a3df234133623 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -54634,6 +54634,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
SDLoc DL(N);
// Attempt to pre-truncate inputs to arithmetic ops instead.
@@ -54652,6 +54653,39 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
return V;
+ // Fold trunc(srl(load(p),amt) -> load(p+amt/8)
+ // If we're shifting down whole byte+pow2 aligned bit chunks from a larger
+ // load for truncation, see if we can convert the shift into a pointer
+ // offset instead. Limit this to normal (non-ext) scalar integer loads.
+ if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL &&
+ Src.hasOneUse() && Src.getOperand(0).hasOneUse() &&
+ ISD::isNormalLoad(Src.getOperand(0).getNode())) {
+ auto *Ld = cast<LoadSDNode>(Src.getOperand(0));
+ if (Ld->isSimple() && VT.isByteSized() &&
+ isPowerOf2_64(VT.getSizeInBits())) {
+ SDValue ShAmt = Src.getOperand(1);
+ KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
+ // Check the shift amount is aligned to the truncated size.
+ // Check the truncation doesn't use any shifted in (zero) top bits.
+ if (KnownAmt.countMinTrailingZeros() >= Log2_64(VT.getSizeInBits()) &&
+ KnownAmt.getMaxValue().ule(SrcVT.getSizeInBits() -
+ VT.getSizeInBits())) {
+ EVT PtrVT = Ld->getBasePtr().getValueType();
+ SDValue PtrBitOfs = DAG.getZExtOrTrunc(ShAmt, DL, PtrVT);
+ SDValue PtrByteOfs =
+ DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs,
+ DAG.getShiftAmountConstant(3, PtrVT, DL));
+ SDValue NewPtr = DAG.getMemBasePlusOffset(
+ Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap);
+ SDValue NewLoad =
+ DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1),
+ NewLoad.getValue(1));
+ return NewLoad;
+ }
+ }
+ }
+
// The bitcast source is a direct mmx result.
// Detect bitcasts between i32 to x86mmx
if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
diff --git a/llvm/test/CodeGen/X86/bfloat-calling-conv.ll b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll
index ea4d32bae9ccb..d08749174f85c 100644
--- a/llvm/test/CodeGen/X86/bfloat-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll
@@ -660,8 +660,7 @@ define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 {
; SSE2-LABEL: call_ret_v3bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rax
-; SSE2-NEXT: movl 4(%rdi), %eax
-; SSE2-NEXT: pinsrw $0, %eax, %xmm1
+; SSE2-NEXT: pinsrw $0, 4(%rdi), %xmm1
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: callq returns_v3bf16 at PLT
@@ -725,8 +724,7 @@ define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 {
; AVXNECONVERT-LABEL: call_ret_v3bf16:
; AVXNECONVERT: # %bb.0:
; AVXNECONVERT-NEXT: pushq %rax
-; AVXNECONVERT-NEXT: movl 4(%rdi), %eax
-; AVXNECONVERT-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVXNECONVERT-NEXT: vpinsrw $0, 4(%rdi), %xmm0, %xmm0
; AVXNECONVERT-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVXNECONVERT-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVXNECONVERT-NEXT: callq returns_v3bf16 at PLT
diff --git a/llvm/test/CodeGen/X86/trunc-srl-load.ll b/llvm/test/CodeGen/X86/trunc-srl-load.ll
index 4dae1433b2196..d9c21d3a3f570 100644
--- a/llvm/test/CodeGen/X86/trunc-srl-load.ll
+++ b/llvm/test/CodeGen/X86/trunc-srl-load.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64
; Tests showing for the analysis of non-constant shift amounts to improve load address math
@@ -12,42 +12,20 @@
define i16 @extractSub64_16(ptr %word, i32 %idx) nounwind {
; X86-LABEL: extractSub64_16:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: movl 4(%eax), %esi
-; X86-NEXT: movb %ch, %cl
-; X86-NEXT: andb $16, %cl
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: shrl %cl, %eax
-; X86-NEXT: shrdl %cl, %esi, %edx
-; X86-NEXT: testb $32, %ch
-; X86-NEXT: jne .LBB0_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: .LBB0_2:
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-NEXT: popl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $48, %ecx
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: movzwl (%eax,%ecx), %eax
; X86-NEXT: retl
;
-; SSE-LABEL: extractSub64_16:
-; SSE: # %bb.0:
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: andb $48, %cl
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shrq %cl, %rax
-; SSE-NEXT: # kill: def $ax killed $ax killed $rax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: extractSub64_16:
-; AVX: # %bb.0:
-; AVX-NEXT: # kill: def $esi killed $esi def $rsi
-; AVX-NEXT: andb $48, %sil
-; AVX-NEXT: shrxq %rsi, (%rdi), %rax
-; AVX-NEXT: # kill: def $ax killed $ax killed $rax
-; AVX-NEXT: retq
+; X64-LABEL: extractSub64_16:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andl $48, %esi
+; X64-NEXT: shrl $3, %esi
+; X64-NEXT: movzwl (%rdi,%rsi), %eax
+; X64-NEXT: retq
%idx_bounds = and i32 %idx, 63
%idx_align = and i32 %idx_bounds, -16
%sh = zext nneg i32 %idx_align to i64
@@ -60,67 +38,20 @@ define i16 @extractSub64_16(ptr %word, i32 %idx) nounwind {
define i16 @extractSub128_16(ptr %word, i32 %idx) nounwind {
; X86-LABEL: extractSub128_16:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movzbl 12(%ebp), %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: movl 4(%ecx), %esi
-; X86-NEXT: movl 8(%ecx), %edi
-; X86-NEXT: movl 12(%ecx), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, (%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: andb $16, %cl
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: movzbl %al, %edx
-; X86-NEXT: movl (%esp,%edx), %eax
-; X86-NEXT: movl 4(%esp,%edx), %edx
-; X86-NEXT: shrdl %cl, %edx, %eax
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
-; X86-NEXT: leal -8(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $112, %ecx
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: movzwl (%eax,%ecx), %eax
; X86-NEXT: retl
;
-; SSE-LABEL: extractSub128_16:
-; SSE: # %bb.0:
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: movq 8(%rdi), %rdx
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andb $48, %cl
-; SSE-NEXT: movq %rdx, %rdi
-; SSE-NEXT: shrq %cl, %rdi
-; SSE-NEXT: shrdq %cl, %rdx, %rax
-; SSE-NEXT: testb $64, %sil
-; SSE-NEXT: cmovneq %rdi, %rax
-; SSE-NEXT: # kill: def $ax killed $ax killed $rax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: extractSub128_16:
-; AVX: # %bb.0:
-; AVX-NEXT: movq (%rdi), %rdx
-; AVX-NEXT: movq 8(%rdi), %rax
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: andb $48, %cl
-; AVX-NEXT: shrdq %cl, %rax, %rdx
-; AVX-NEXT: shrxq %rcx, %rax, %rax
-; AVX-NEXT: testb $64, %sil
-; AVX-NEXT: cmoveq %rdx, %rax
-; AVX-NEXT: # kill: def $ax killed $ax killed $rax
-; AVX-NEXT: retq
+; X64-LABEL: extractSub128_16:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andl $112, %esi
+; X64-NEXT: shrl $3, %esi
+; X64-NEXT: movzwl (%rdi,%rsi), %eax
+; X64-NEXT: retq
%idx_bounds = and i32 %idx, 127
%idx_align = and i32 %idx_bounds, -16
%sh = zext nneg i32 %idx_align to i128
@@ -133,62 +64,20 @@ define i16 @extractSub128_16(ptr %word, i32 %idx) nounwind {
define i32 @extractSub128_32(ptr %word, i32 %idx) nounwind {
; X86-LABEL: extractSub128_32:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movzbl 12(%ebp), %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: movl 4(%ecx), %esi
-; X86-NEXT: movl 8(%ecx), %edi
-; X86-NEXT: movl 12(%ecx), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, (%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andb $96, %al
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl (%esp,%eax), %eax
-; X86-NEXT: leal -8(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $96, %ecx
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: movl (%eax,%ecx), %eax
; X86-NEXT: retl
;
-; SSE-LABEL: extractSub128_32:
-; SSE: # %bb.0:
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: movq 8(%rdi), %rdx
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andb $32, %cl
-; SSE-NEXT: movq %rdx, %rdi
-; SSE-NEXT: shrq %cl, %rdi
-; SSE-NEXT: shrdq %cl, %rdx, %rax
-; SSE-NEXT: testb $64, %sil
-; SSE-NEXT: cmovneq %rdi, %rax
-; SSE-NEXT: # kill: def $eax killed $eax killed $rax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: extractSub128_32:
-; AVX: # %bb.0:
-; AVX-NEXT: movq (%rdi), %rdx
-; AVX-NEXT: movq 8(%rdi), %rax
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: andb $32, %cl
-; AVX-NEXT: shrdq %cl, %rax, %rdx
-; AVX-NEXT: shrxq %rcx, %rax, %rax
-; AVX-NEXT: testb $64, %sil
-; AVX-NEXT: cmoveq %rdx, %rax
-; AVX-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX-NEXT: retq
+; X64-LABEL: extractSub128_32:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andl $96, %esi
+; X64-NEXT: shrl $3, %esi
+; X64-NEXT: movl (%rdi,%rsi), %eax
+; X64-NEXT: retq
%idx_bounds = and i32 %idx, 127
%idx_align = and i32 %idx_bounds, -32
%sh = zext nneg i32 %idx_align to i128
@@ -201,46 +90,20 @@ define i32 @extractSub128_32(ptr %word, i32 %idx) nounwind {
define i64 @extractSub128_64(ptr %word, i32 %idx) nounwind {
; X86-LABEL: extractSub128_64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $32, %esp
-; X86-NEXT: movzbl 12(%ebp), %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: movl 4(%ecx), %esi
-; X86-NEXT: movl 8(%ecx), %edi
-; X86-NEXT: movl 12(%ecx), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, (%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: andb $64, %al
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: movzbl %al, %ecx
-; X86-NEXT: movl (%esp,%ecx), %eax
-; X86-NEXT: movl 4(%esp,%ecx), %edx
-; X86-NEXT: leal -8(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: andl $64, %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: movl (%ecx,%edx), %eax
+; X86-NEXT: movl 4(%ecx,%edx), %edx
; X86-NEXT: retl
;
; X64-LABEL: extractSub128_64:
; X64: # %bb.0:
-; X64-NEXT: testb $64, %sil
-; X64-NEXT: je .LBB3_1
-; X64-NEXT: # %bb.2:
-; X64-NEXT: movq 8(%rdi), %rax
-; X64-NEXT: retq
-; X64-NEXT: .LBB3_1:
-; X64-NEXT: movq (%rdi), %rax
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andl $64, %esi
+; X64-NEXT: shrl $3, %esi
+; X64-NEXT: movq (%rdi,%rsi), %rax
; X64-NEXT: retq
%idx_bounds = and i32 %idx, 127
%idx_align = and i32 %idx_bounds, -64
@@ -254,185 +117,20 @@ define i64 @extractSub128_64(ptr %word, i32 %idx) nounwind {
define i8 @extractSub512_8(ptr %word, i32 %idx) nounwind {
; X86-LABEL: extractSub512_8:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $192, %esp
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl (%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%eax), %ebx
-; X86-NEXT: movl 44(%eax), %edi
-; X86-NEXT: movl 48(%eax), %esi
-; X86-NEXT: movl 52(%eax), %edx
-; X86-NEXT: movl 56(%eax), %ecx
-; X86-NEXT: movl 60(%eax), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 12(%ebp), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: andl $24, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: shrl $3, %edx
-; X86-NEXT: andl $60, %edx
-; X86-NEXT: movl 48(%esp,%edx), %eax
-; X86-NEXT: movl 52(%esp,%edx), %edx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shrdl %cl, %edx, %eax
-; X86-NEXT: # kill: def $al killed $al killed $eax
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: andl $63, %ecx
+; X86-NEXT: movzbl (%eax,%ecx), %eax
; X86-NEXT: retl
;
-; SSE-LABEL: extractSub512_8:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rax
-; SSE-NEXT: # kill: def $esi killed $esi def $rsi
-; SSE-NEXT: movups (%rdi), %xmm0
-; SSE-NEXT: movups 16(%rdi), %xmm1
-; SSE-NEXT: movups 32(%rdi), %xmm2
-; SSE-NEXT: movups 48(%rdi), %xmm3
-; SSE-NEXT: xorps %xmm4, %xmm4
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $56, %ecx
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: movq -128(%rsp,%rsi), %rdx
-; SSE-NEXT: shrq %cl, %rdx
-; SSE-NEXT: movl -120(%rsp,%rsi), %eax
-; SSE-NEXT: addl %eax, %eax
-; SSE-NEXT: notl %ecx
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shlq %cl, %rax
-; SSE-NEXT: orl %edx, %eax
-; SSE-NEXT: # kill: def $al killed $al killed $rax
-; SSE-NEXT: popq %rcx
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: extractSub512_8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rax
-; AVX2-NEXT: # kill: def $esi killed $esi def $rsi
-; AVX2-NEXT: vmovups (%rdi), %ymm0
-; AVX2-NEXT: vmovups 32(%rdi), %ymm1
-; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: andl $56, %ecx
-; AVX2-NEXT: shrl $3...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/165266
More information about the llvm-commits
mailing list