[llvm] 8359dbc - [X86] combineEXTRACT_SUBVECTOR - fold extract_subvector(subv_broadcast_load(ptr),0) -> load(ptr) (#126523)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 12 01:07:09 PST 2025
Author: Simon Pilgrim
Date: 2025-02-12T09:07:06Z
New Revision: 8359dbc8c08ca4206534d605dd299713dc323b67
URL: https://github.com/llvm/llvm-project/commit/8359dbc8c08ca4206534d605dd299713dc323b67
DIFF: https://github.com/llvm/llvm-project/commit/8359dbc8c08ca4206534d605dd299713dc323b67.diff
LOG: [X86] combineEXTRACT_SUBVECTOR - fold extract_subvector(subv_broadcast_load(ptr),0) -> load(ptr) (#126523)
This is typically handled by SimplifyDemandedVectorElts, but that fails
when there are multiple uses of the subv_broadcast_load node. However, if
there's just one use of the load result (and the rest are uses of the
memory chain), we can still replace it with a regular load and update the
chain accordingly.
Noticed on #126517
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8c28985c8e8e7..91249f0bb009f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58485,10 +58485,26 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
- // If we're extracting a broadcasted subvector, just use the lowest subvector.
- if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
- cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
- return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
+ // Check if we're extracting a whole broadcasted subvector.
+ if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
+ EVT MemVT = MemIntr->getMemoryVT();
+ if (MemVT == VT) {
+ // Just use the lowest subvector.
+ if (IdxVal != 0)
+ return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
+ // If this is the only use, we can replace with a regular load (this may
+ // have been missed by SimplifyDemandedVectorElts due to extra uses of the
+ // memory chain).
+ if (InVec.hasOneUse()) {
+ SDValue Ld =
+ DAG.getLoad(MemVT, DL, MemIntr->getChain(), MemIntr->getBasePtr(),
+ MemIntr->getMemOperand());
+ DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), Ld.getValue(1));
+ return Ld;
+ }
+ }
+ }
// Attempt to extract from the source of a shuffle vector.
if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 3d72319f59ca9..e47a9ac3a0c0b 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -3634,19 +3634,18 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
;
; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3]
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vmovdqa (%rdi), %xmm2
; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm3, 48(%rdx)
; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
@@ -3820,19 +3819,18 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
;
; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7]
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vmovdqa (%rdi), %xmm2
; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm3, 48(%rdx)
; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index acedcf4263906..a3f134922ba3c 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -4044,18 +4044,17 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
;
; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3]
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7]
+; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
@@ -4263,17 +4262,16 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
;
; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7]
; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
More information about the llvm-commits
mailing list