[llvm] [X86] combineEXTRACT_SUBVECTOR - fold extract_subvector(subv_broadcast_load(ptr),0) -> load(ptr) (PR #126523)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 10 06:38:31 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
<details>
<summary>Changes</summary>
This is typically handled by SimplifyDemandedVectorElts, but this will fail when there are multiple uses of the subv_broadcast_load node, but if there's just one use of the load result (and the rest are uses of the memory chain), we can still replace with a load and update the chain accordingly.
Noticed on #<!-- -->126517
---
Full diff: https://github.com/llvm/llvm-project/pull/126523.diff
3 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+20-4)
- (modified) llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll (+8-10)
- (modified) llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll (+10-12)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 995b4de12ce12c..601bf7bb6b817d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58477,10 +58477,26 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
- // If we're extracting a broadcasted subvector, just use the lowest subvector.
- if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
- cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
- return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
+ // Check if we're extracting a whole broadcasted subvector.
+ if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
+ EVT MemVT = MemIntr->getMemoryVT();
+ if (MemVT == VT) {
+ // Just use the lowest subvector.
+ if (IdxVal != 0)
+ return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
+ // If this is the only use, we can replace with a regular load (this may
+ // have been missed by SimplifyDemandedVectorElts due to extra uses of the
+ // memory chain).
+ if (InVec.hasOneUse()) {
+ SDValue Ld =
+ DAG.getLoad(MemVT, DL, MemIntr->getChain(), MemIntr->getBasePtr(),
+ MemIntr->getMemOperand());
+ DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), Ld.getValue(1));
+ return Ld;
+ }
+ }
+ }
// Attempt to extract from the source of a shuffle vector.
if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 3d72319f59ca9e..e47a9ac3a0c0bf 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -3634,19 +3634,18 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
;
; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3]
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vmovdqa (%rdi), %xmm2
; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm3, 48(%rdx)
; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
@@ -3820,19 +3819,18 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
;
; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7]
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vmovdqa (%rdi), %xmm2
; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm3, 48(%rdx)
; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index acedcf42639066..a3f134922ba3cc 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -4044,18 +4044,17 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
;
; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3]
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7]
+; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
@@ -4263,17 +4262,16 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
;
; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7]
; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
``````````
</details>
https://github.com/llvm/llvm-project/pull/126523
More information about the llvm-commits
mailing list