[llvm] [X86] EltsFromConsecutiveLoads - attempt to match consecutive truncated loads (PR #172051)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 12 09:33:41 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
Changes:
SelectionDAG::areNonVolatileConsecutiveLoads will only match loads whose MemoryVT is the same size as the stride byte size, so it fails in cases where large loads have been split into smaller elements (typically by shift+truncate) and we're trying to stitch them back together.
As a fallback, this patch also accepts cases where the candidate element's byte offset is a whole multiple of the full MemoryVT size away from the base load.
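To make the fallback concrete, here is a standalone sketch of the arithmetic (a minimal model, not the actual SelectionDAG code; the variable names mirror the patch, and the concrete numbers are modelled on the v16i8 broadcast tests below, where each i64 load has been split into eight i8 elements via shift+trunc):

```cpp
#include <cstdio>

int main() {
  const unsigned BaseSizeInBits = 8;     // i8 vector elements
  const unsigned BaseMemSizeInBits = 64; // MemoryVT of each split i64 load
  const int Stride = 8;                  // candidate is 8 elements past the base

  // The fallback fires when the stride covers whole memory loads and each
  // memory load covers a whole number of vector elements.
  if ((Stride * BaseSizeInBits) % BaseMemSizeInBits == 0 &&
      BaseMemSizeInBits % BaseSizeInBits == 0) {
    const int Scale = BaseMemSizeInBits / BaseSizeInBits; // 8 elements per load
    // Stands in for areNonVolatileConsecutiveLoads(Ld, Base, 8, 1): the
    // candidate load must sit exactly one full i64 past the base load.
    std::printf("check %u-byte loads at distance %d\n",
                BaseMemSizeInBits / 8, Stride / Scale);
  }
  return 0;
}
```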
---
Full diff: https://github.com/llvm/llvm-project/pull/172051.diff
4 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+14-2)
- (modified) llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll (+13-191)
- (modified) llvm/test/CodeGen/X86/load-partial.ll (+8-40)
- (modified) llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll (+2-92)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6af809be8dfe4..d383553d2e8b5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7448,8 +7448,20 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
}
- return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
- EltIdx - FirstLoadedElt);
+ int Stride = EltIdx - FirstLoadedElt;
+ if (DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes, Stride))
+ return true;
+ // Try again using the memory load size (we might have broken a large load
+ // into smaller elements), ensure the stride is the full memory load size
+ // apart and a whole number of elements fit in each memory load.
+ unsigned BaseMemSizeInBits = Base->getMemoryVT().getSizeInBits();
+ if (((Stride * BaseSizeInBits) % BaseMemSizeInBits) == 0 &&
+ (BaseMemSizeInBits % BaseSizeInBits) == 0) {
+ unsigned Scale = BaseMemSizeInBits / BaseSizeInBits;
+ return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseMemSizeInBits / 8,
+ Stride / Scale);
+ }
+ return false;
};
// Consecutive loads can contain UNDEFS but not ZERO elements.
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 49eb82e8434cf..4a542949c7859 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -5002,203 +5002,25 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %
;
; AVX1-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
; AVX1: # %bb.0:
-; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: movq 16(%rdi), %rcx
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: movq %rcx, %r8
-; AVX1-NEXT: movq %rcx, %r9
-; AVX1-NEXT: movq %rcx, %r10
-; AVX1-NEXT: movl %ecx, %r11d
-; AVX1-NEXT: movl %ecx, %ebx
-; AVX1-NEXT: vmovd %ecx, %xmm0
-; AVX1-NEXT: shrl $8, %ecx
-; AVX1-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: shrl $16, %ebx
-; AVX1-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
-; AVX1-NEXT: shrl $24, %r11d
-; AVX1-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; AVX1-NEXT: shrq $32, %r10
-; AVX1-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0
-; AVX1-NEXT: shrq $40, %r9
-; AVX1-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0
-; AVX1-NEXT: shrq $48, %r8
-; AVX1-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0
-; AVX1-NEXT: movq 24(%rdi), %rcx
-; AVX1-NEXT: shrq $56, %rax
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %ecx, %eax
-; AVX1-NEXT: shrl $8, %eax
-; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %ecx, %eax
-; AVX1-NEXT: shrl $16, %eax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %ecx, %eax
-; AVX1-NEXT: shrl $24, %eax
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: shrq $32, %rax
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: shrq $40, %rax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: shrq $48, %rax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movq (%rdi), %rax
-; AVX1-NEXT: shrq $56, %rcx
-; AVX1-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $8, %ecx
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $16, %ecx
-; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $24, %ecx
-; AVX1-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq $32, %rcx
-; AVX1-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq $40, %rcx
-; AVX1-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq $48, %rcx
-; AVX1-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: movq 8(%rdi), %rcx
-; AVX1-NEXT: shrq $56, %rax
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %ecx, %eax
-; AVX1-NEXT: shrl $8, %eax
-; AVX1-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %ecx, %eax
-; AVX1-NEXT: shrl $16, %eax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %ecx, %eax
-; AVX1-NEXT: shrl $24, %eax
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: shrq $32, %rax
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: shrq $40, %rax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: shrq $48, %rax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX1-NEXT: shrq $56, %rcx
-; AVX1-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb 48(%rsi), %xmm0, %xmm2
-; AVX1-NEXT: vpaddb 32(%rsi), %xmm1, %xmm3
-; AVX1-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX1-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2
+; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
; AVX1-NEXT: vmovdqa %xmm3, 32(%rdx)
; AVX1-NEXT: vmovdqa %xmm2, 48(%rdx)
-; AVX1-NEXT: popq %rbx
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq 16(%rdi), %rcx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: movq %rcx, %r8
-; AVX2-NEXT: movq %rcx, %r9
-; AVX2-NEXT: movq %rcx, %r10
-; AVX2-NEXT: movl %ecx, %r11d
-; AVX2-NEXT: movl %ecx, %ebx
-; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: shrl $8, %ecx
-; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: shrl $16, %ebx
-; AVX2-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
-; AVX2-NEXT: shrl $24, %r11d
-; AVX2-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; AVX2-NEXT: shrq $32, %r10
-; AVX2-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0
-; AVX2-NEXT: shrq $40, %r9
-; AVX2-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0
-; AVX2-NEXT: shrq $48, %r8
-; AVX2-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0
-; AVX2-NEXT: movq 24(%rdi), %rcx
-; AVX2-NEXT: shrq $56, %rax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: shrl $8, %eax
-; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: shrl $16, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: shrl $24, %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: shrq $32, %rax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: shrq $40, %rax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: shrq $48, %rax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movq (%rdi), %rax
-; AVX2-NEXT: shrq $56, %rcx
-; AVX2-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $8, %ecx
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $16, %ecx
-; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $24, %ecx
-; AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $32, %rcx
-; AVX2-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $40, %rcx
-; AVX2-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $48, %rcx
-; AVX2-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: movq 8(%rdi), %rcx
-; AVX2-NEXT: shrq $56, %rax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: shrl $8, %eax
-; AVX2-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: shrl $16, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: shrl $24, %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: shrq $32, %rax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: shrq $40, %rax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: shrq $48, %rax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX2-NEXT: shrq $56, %rcx
-; AVX2-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
-; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
-; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1
+; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/load-partial.ll b/llvm/test/CodeGen/X86/load-partial.ll
index dba63582ff08b..a2f4f7ac3d534 100644
--- a/llvm/test/CodeGen/X86/load-partial.ll
+++ b/llvm/test/CodeGen/X86/load-partial.ll
@@ -208,30 +208,14 @@ define <4 x float> @load_float4_float3_trunc_0122(ptr nocapture readonly derefer
}
define <4 x float> @load_float4_float3_trunc_0123(ptr nocapture readonly dereferenceable(16)) nofree nosync {
-; SSE2-LABEL: load_float4_float3_trunc_0123:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movaps (%rdi), %xmm0
-; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_float4_float3_trunc_0123:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movaps (%rdi), %xmm0
-; SSSE3-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_float4_float3_trunc_0123:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movaps (%rdi), %xmm0
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; SSE41-NEXT: retq
+; SSE-LABEL: load_float4_float3_trunc_0123:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps (%rdi), %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: load_float4_float3_trunc_0123:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %xmm0
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; AVX-NEXT: retq
%2 = load i64, ptr %0, align 16
%3 = getelementptr inbounds <4 x float>, ptr %0, i64 0, i64 2
@@ -254,30 +238,14 @@ define <4 x float> @load_float4_float3_trunc_0123(ptr nocapture readonly derefer
}
define <4 x float> @load_float4_float3_trunc_0123_unaligned(ptr nocapture readonly dereferenceable(16)) nofree nosync {
-; SSE2-LABEL: load_float4_float3_trunc_0123_unaligned:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movups (%rdi), %xmm0
-; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_float4_float3_trunc_0123_unaligned:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movups (%rdi), %xmm0
-; SSSE3-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_float4_float3_trunc_0123_unaligned:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movups (%rdi), %xmm0
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; SSE41-NEXT: retq
+; SSE-LABEL: load_float4_float3_trunc_0123_unaligned:
+; SSE: # %bb.0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: load_float4_float3_trunc_0123_unaligned:
; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %xmm0
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; AVX-NEXT: retq
%2 = load i64, ptr %0, align 1
%3 = getelementptr inbounds <4 x float>, ptr %0, i64 0, i64 2
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index d5a724139ffd3..0eeb2ee8c75eb 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -5724,51 +5724,7 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %
;
; AVX1-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
; AVX1: # %bb.0:
-; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: movq (%rdi), %rax
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: movq %rax, %r8
-; AVX1-NEXT: movq %rax, %r9
-; AVX1-NEXT: movq %rax, %r10
-; AVX1-NEXT: movl %eax, %r11d
-; AVX1-NEXT: movl %eax, %ebx
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: shrl $8, %eax
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX1-NEXT: shrl $16, %ebx
-; AVX1-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
-; AVX1-NEXT: shrl $24, %r11d
-; AVX1-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; AVX1-NEXT: shrq $32, %r10
-; AVX1-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0
-; AVX1-NEXT: shrq $40, %r9
-; AVX1-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0
-; AVX1-NEXT: shrq $48, %r8
-; AVX1-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0
-; AVX1-NEXT: movq 8(%rdi), %rax
-; AVX1-NEXT: shrq $56, %rcx
-; AVX1-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $8, %ecx
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $16, %ecx
-; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $24, %ecx
-; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq $32, %rcx
-; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq $40, %rcx
-; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq $48, %rcx
-; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: shrq $56, %rax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vpaddb 32(%rsi), %xmm0, %xmm1
; AVX1-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX1-NEXT: vmovaps 16(%rsi), %xmm2
@@ -5777,61 +5733,15 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %
; AVX1-NEXT: vmovaps %xmm3, 48(%rdx)
; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
; AVX1-NEXT: vmovdqa %xmm1, 32(%rdx)
-; AVX1-NEXT: popq %rbx
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq (%rdi), %rax
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: movq %rax, %r9
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movl %eax, %r11d
-; AVX2-NEXT: movl %eax, %ebx
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: shrl $8, %eax
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX2-NEXT: shrl $16, %ebx
-; AVX2-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0
-; AVX2-NEXT: shrl $24, %r11d
-; AVX2-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; AVX2-NEXT: shrq $32, %r10
-; AVX2-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0
-; AVX2-NEXT: shrq $40, %r9
-; AVX2-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0
-; AVX2-NEXT: shrq $48, %r8
-; AVX2-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0
-; AVX2-NEXT: movq 8(%rdi), %rax
-; AVX2-NEXT: shrq $56, %rcx
-; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $8, %ecx
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $16, %ecx
-; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $24, %ecx
-; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $32, %rcx
-; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $40, %rcx
-; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $48, %rcx
-; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: shrq $56, %rax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
-; AVX2-NEXT: popq %rbx
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
``````````
https://github.com/llvm/llvm-project/pull/172051
More information about the llvm-commits mailing list