[llvm] [X86] Attempt to fold trunc(srl(load(p), amt) -> load(p+amt/8) (PR #165266)

Mon Oct 27 08:35:53 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

<details>
<summary>Changes</summary>

As reported on #164853 - we only attempt to reduce shifted loads for constant shift amounts, but we could do more with non-constant values if value tracking can confirm basic alignments.

This patch determines if a truncated shifted load of scalar integer, shifts by a whole/aligned number of the truncated integer width and replaces the non-constant shift amount with a pointer offset instead.

I had hoped to make this a generic DAG fold, but reduceLoadWidth isn't ready to be converted to a KnownBits value tracking mechanism, and other targets don't have complex address math like X86.

Fixes #164853

---

Patch is 86.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/165266.diff


5 Files Affected:

- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+34) 
- (modified) llvm/test/CodeGen/X86/bfloat-calling-conv.ll (+2-4) 
- (modified) llvm/test/CodeGen/X86/trunc-srl-load.ll (+108-1544) 
- (modified) llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll (+16-34) 
- (modified) llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll (+17-36) 


``````````diff

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 410f20edc6281..a3df234133623 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -54634,6 +54634,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
   SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src.getValueType();
   SDLoc DL(N);
 
   // Attempt to pre-truncate inputs to arithmetic ops instead.
@@ -54652,6 +54653,39 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
   if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
     return V;
 
+  // Fold trunc(srl(load(p),amt) -> load(p+amt/8)
+  // If we're shifting down whole byte+pow2 aligned bit chunks from a larger
+  // load for truncation, see if we can convert the shift into a pointer
+  // offset instead. Limit this to normal (non-ext) scalar integer loads.
+  if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL &&
+      Src.hasOneUse() && Src.getOperand(0).hasOneUse() &&
+      ISD::isNormalLoad(Src.getOperand(0).getNode())) {
+    auto *Ld = cast<LoadSDNode>(Src.getOperand(0));
+    if (Ld->isSimple() && VT.isByteSized() &&
+        isPowerOf2_64(VT.getSizeInBits())) {
+      SDValue ShAmt = Src.getOperand(1);
+      KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
+      // Check the shift amount is aligned to the truncated size.
+      // Check the truncation doesn't use any shifted in (zero) top bits.
+      if (KnownAmt.countMinTrailingZeros() >= Log2_64(VT.getSizeInBits()) &&
+          KnownAmt.getMaxValue().ule(SrcVT.getSizeInBits() -
+                                     VT.getSizeInBits())) {
+        EVT PtrVT = Ld->getBasePtr().getValueType();
+        SDValue PtrBitOfs = DAG.getZExtOrTrunc(ShAmt, DL, PtrVT);
+        SDValue PtrByteOfs =
+            DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs,
+                        DAG.getShiftAmountConstant(3, PtrVT, DL));
+        SDValue NewPtr = DAG.getMemBasePlusOffset(
+            Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap);
+        SDValue NewLoad =
+            DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getMemOperand());
+        DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1),
+                                      NewLoad.getValue(1));
+        return NewLoad;
+      }
+    }
+  }
+
   // The bitcast source is a direct mmx result.
   // Detect bitcasts between i32 to x86mmx
   if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
diff --git a/llvm/test/CodeGen/X86/bfloat-calling-conv.ll b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll
index ea4d32bae9ccb..d08749174f85c 100644
--- a/llvm/test/CodeGen/X86/bfloat-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll
@@ -660,8 +660,7 @@ define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 {
 ; SSE2-LABEL: call_ret_v3bf16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pushq %rax
-; SSE2-NEXT:    movl 4(%rdi), %eax
-; SSE2-NEXT:    pinsrw $0, %eax, %xmm1
+; SSE2-NEXT:    pinsrw $0, 4(%rdi), %xmm1
 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    callq returns_v3bf16 at PLT
@@ -725,8 +724,7 @@ define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 {
 ; AVXNECONVERT-LABEL: call_ret_v3bf16:
 ; AVXNECONVERT:       # %bb.0:
 ; AVXNECONVERT-NEXT:    pushq %rax
-; AVXNECONVERT-NEXT:    movl 4(%rdi), %eax
-; AVXNECONVERT-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
+; AVXNECONVERT-NEXT:    vpinsrw $0, 4(%rdi), %xmm0, %xmm0
 ; AVXNECONVERT-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVXNECONVERT-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
 ; AVXNECONVERT-NEXT:    callq returns_v3bf16 at PLT
diff --git a/llvm/test/CodeGen/X86/trunc-srl-load.ll b/llvm/test/CodeGen/X86/trunc-srl-load.ll
index 4dae1433b2196..d9c21d3a3f570 100644
--- a/llvm/test/CodeGen/X86/trunc-srl-load.ll
+++ b/llvm/test/CodeGen/X86/trunc-srl-load.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown                   | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64    | FileCheck %s --check-prefixes=X64,SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64    | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64
 
 ; Tests showing for the analysis of non-constant shift amounts to improve load address math
 
@@ -12,42 +12,20 @@
 define i16 @extractSub64_16(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub64_16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl 4(%eax), %esi
-; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    andb $16, %cl
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    shrl %cl, %eax
-; X86-NEXT:    shrdl %cl, %esi, %edx
-; X86-NEXT:    testb $32, %ch
-; X86-NEXT:    jne .LBB0_2
-; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:  .LBB0_2:
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $48, %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    movzwl (%eax,%ecx), %eax
 ; X86-NEXT:    retl
 ;
-; SSE-LABEL: extractSub64_16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movl %esi, %ecx
-; SSE-NEXT:    movq (%rdi), %rax
-; SSE-NEXT:    andb $48, %cl
-; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT:    shrq %cl, %rax
-; SSE-NEXT:    # kill: def $ax killed $ax killed $rax
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: extractSub64_16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    # kill: def $esi killed $esi def $rsi
-; AVX-NEXT:    andb $48, %sil
-; AVX-NEXT:    shrxq %rsi, (%rdi), %rax
-; AVX-NEXT:    # kill: def $ax killed $ax killed $rax
-; AVX-NEXT:    retq
+; X64-LABEL: extractSub64_16:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andl $48, %esi
+; X64-NEXT:    shrl $3, %esi
+; X64-NEXT:    movzwl (%rdi,%rsi), %eax
+; X64-NEXT:    retq
   %idx_bounds = and i32 %idx, 63
   %idx_align = and i32 %idx_bounds, -16
   %sh = zext nneg i32 %idx_align to i64
@@ -60,67 +38,20 @@ define i16 @extractSub64_16(ptr %word, i32 %idx) nounwind {
 define i16 @extractSub128_16(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub128_16:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movzbl 12(%ebp), %eax
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    movl 8(%ecx), %edi
-; X86-NEXT:    movl 12(%ecx), %ecx
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, (%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    andb $16, %cl
-; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    andb $12, %al
-; X86-NEXT:    movzbl %al, %edx
-; X86-NEXT:    movl (%esp,%edx), %eax
-; X86-NEXT:    movl 4(%esp,%edx), %edx
-; X86-NEXT:    shrdl %cl, %edx, %eax
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
-; X86-NEXT:    leal -8(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $112, %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    movzwl (%eax,%ecx), %eax
 ; X86-NEXT:    retl
 ;
-; SSE-LABEL: extractSub128_16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movq (%rdi), %rax
-; SSE-NEXT:    movq 8(%rdi), %rdx
-; SSE-NEXT:    movl %esi, %ecx
-; SSE-NEXT:    andb $48, %cl
-; SSE-NEXT:    movq %rdx, %rdi
-; SSE-NEXT:    shrq %cl, %rdi
-; SSE-NEXT:    shrdq %cl, %rdx, %rax
-; SSE-NEXT:    testb $64, %sil
-; SSE-NEXT:    cmovneq %rdi, %rax
-; SSE-NEXT:    # kill: def $ax killed $ax killed $rax
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: extractSub128_16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movq (%rdi), %rdx
-; AVX-NEXT:    movq 8(%rdi), %rax
-; AVX-NEXT:    movl %esi, %ecx
-; AVX-NEXT:    andb $48, %cl
-; AVX-NEXT:    shrdq %cl, %rax, %rdx
-; AVX-NEXT:    shrxq %rcx, %rax, %rax
-; AVX-NEXT:    testb $64, %sil
-; AVX-NEXT:    cmoveq %rdx, %rax
-; AVX-NEXT:    # kill: def $ax killed $ax killed $rax
-; AVX-NEXT:    retq
+; X64-LABEL: extractSub128_16:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andl $112, %esi
+; X64-NEXT:    shrl $3, %esi
+; X64-NEXT:    movzwl (%rdi,%rsi), %eax
+; X64-NEXT:    retq
   %idx_bounds = and i32 %idx, 127
   %idx_align = and i32 %idx_bounds, -16
   %sh = zext nneg i32 %idx_align to i128
@@ -133,62 +64,20 @@ define i16 @extractSub128_16(ptr %word, i32 %idx) nounwind {
 define i32 @extractSub128_32(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub128_32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movzbl 12(%ebp), %eax
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    movl 8(%ecx), %edi
-; X86-NEXT:    movl 12(%ecx), %ecx
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, (%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    andb $96, %al
-; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl (%esp,%eax), %eax
-; X86-NEXT:    leal -8(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $96, %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    movl (%eax,%ecx), %eax
 ; X86-NEXT:    retl
 ;
-; SSE-LABEL: extractSub128_32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movq (%rdi), %rax
-; SSE-NEXT:    movq 8(%rdi), %rdx
-; SSE-NEXT:    movl %esi, %ecx
-; SSE-NEXT:    andb $32, %cl
-; SSE-NEXT:    movq %rdx, %rdi
-; SSE-NEXT:    shrq %cl, %rdi
-; SSE-NEXT:    shrdq %cl, %rdx, %rax
-; SSE-NEXT:    testb $64, %sil
-; SSE-NEXT:    cmovneq %rdi, %rax
-; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: extractSub128_32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movq (%rdi), %rdx
-; AVX-NEXT:    movq 8(%rdi), %rax
-; AVX-NEXT:    movl %esi, %ecx
-; AVX-NEXT:    andb $32, %cl
-; AVX-NEXT:    shrdq %cl, %rax, %rdx
-; AVX-NEXT:    shrxq %rcx, %rax, %rax
-; AVX-NEXT:    testb $64, %sil
-; AVX-NEXT:    cmoveq %rdx, %rax
-; AVX-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX-NEXT:    retq
+; X64-LABEL: extractSub128_32:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andl $96, %esi
+; X64-NEXT:    shrl $3, %esi
+; X64-NEXT:    movl (%rdi,%rsi), %eax
+; X64-NEXT:    retq
   %idx_bounds = and i32 %idx, 127
   %idx_align = and i32 %idx_bounds, -32
   %sh = zext nneg i32 %idx_align to i128
@@ -201,46 +90,20 @@ define i32 @extractSub128_32(ptr %word, i32 %idx) nounwind {
 define i64 @extractSub128_64(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub128_64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movzbl 12(%ebp), %eax
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    movl 8(%ecx), %edi
-; X86-NEXT:    movl 12(%ecx), %ecx
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, (%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    andb $64, %al
-; X86-NEXT:    shrb $3, %al
-; X86-NEXT:    movzbl %al, %ecx
-; X86-NEXT:    movl (%esp,%ecx), %eax
-; X86-NEXT:    movl 4(%esp,%ecx), %edx
-; X86-NEXT:    leal -8(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl $64, %edx
+; X86-NEXT:    shrl $3, %edx
+; X86-NEXT:    movl (%ecx,%edx), %eax
+; X86-NEXT:    movl 4(%ecx,%edx), %edx
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: extractSub128_64:
 ; X64:       # %bb.0:
-; X64-NEXT:    testb $64, %sil
-; X64-NEXT:    je .LBB3_1
-; X64-NEXT:  # %bb.2:
-; X64-NEXT:    movq 8(%rdi), %rax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB3_1:
-; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    andl $64, %esi
+; X64-NEXT:    shrl $3, %esi
+; X64-NEXT:    movq (%rdi,%rsi), %rax
 ; X64-NEXT:    retq
   %idx_bounds = and i32 %idx, 127
   %idx_align = and i32 %idx_bounds, -64
@@ -254,185 +117,20 @@ define i64 @extractSub128_64(ptr %word, i32 %idx) nounwind {
 define i8 @extractSub512_8(ptr %word, i32 %idx) nounwind {
 ; X86-LABEL: extractSub512_8:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $192, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl (%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 4(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 8(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 12(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 16(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 20(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 24(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 28(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 32(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 36(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 40(%eax), %ebx
-; X86-NEXT:    movl 44(%eax), %edi
-; X86-NEXT:    movl 48(%eax), %esi
-; X86-NEXT:    movl 52(%eax), %edx
-; X86-NEXT:    movl 56(%eax), %ecx
-; X86-NEXT:    movl 60(%eax), %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 12(%ebp), %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    andl $24, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    shrl $3, %edx
-; X86-NEXT:    andl $60, %edx
-; X86-NEXT:    movl 48(%esp,%edx), %eax
-; X86-NEXT:    movl 52(%esp,%edx), %edx
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrdl %cl, %edx, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    leal -12(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
-; X86-NEXT:    popl %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shrl $3, %ecx
+; X86-NEXT:    andl $63, %ecx
+; X86-NEXT:    movzbl (%eax,%ecx), %eax
 ; X86-NEXT:    retl
 ;
-; SSE-LABEL: extractSub512_8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rax
-; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
-; SSE-NEXT:    movups (%rdi), %xmm0
-; SSE-NEXT:    movups 16(%rdi), %xmm1
-; SSE-NEXT:    movups 32(%rdi), %xmm2
-; SSE-NEXT:    movups 48(%rdi), %xmm3
-; SSE-NEXT:    xorps %xmm4, %xmm4
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm4, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movl %esi, %ecx
-; SSE-NEXT:    andl $56, %ecx
-; SSE-NEXT:    shrl $3, %esi
-; SSE-NEXT:    andl $56, %esi
-; SSE-NEXT:    movq -128(%rsp,%rsi), %rdx
-; SSE-NEXT:    shrq %cl, %rdx
-; SSE-NEXT:    movl -120(%rsp,%rsi), %eax
-; SSE-NEXT:    addl %eax, %eax
-; SSE-NEXT:    notl %ecx
-; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT:    shlq %cl, %rax
-; SSE-NEXT:    orl %edx, %eax
-; SSE-NEXT:    # kill: def $al killed $al killed $rax
-; SSE-NEXT:    popq %rcx
-; SSE-NEXT:    retq
-;
-; AVX2-LABEL: extractSub512_8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rax
-; AVX2-NEXT:    # kill: def $esi killed $esi def $rsi
-; AVX2-NEXT:    vmovups (%rdi), %ymm0
-; AVX2-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movl %esi, %ecx
-; AVX2-NEXT:    andl $56, %ecx
-; AVX2-NEXT:    shrl $3...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/165266