[llvm] [X86] combineTargetShuffle - attempt to fold VPERM2X128(ONEUSE(LOAD),UNDEF) -> VBROADCAST128 (PR #142366)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 2 04:28:27 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
<details>
<summary>Changes</summary>
This matches what we already do in lowerV2X128Shuffle; there the one-use test often fails because, at that point, the other uses of the load may not yet have been split.
---
Full diff: https://github.com/llvm/llvm-project/pull/142366.diff
3 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+21-2)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/x86-interleaved-access.ll (+4-4)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2beb697548553..71c699a318eb1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42618,9 +42618,11 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
return SDValue();
}
case X86ISD::VPERM2X128: {
- // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
+ unsigned Imm = N.getConstantOperandVal(2);
+
+ // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
if (LHS.getOpcode() == ISD::BITCAST &&
(RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
EVT SrcVT = LHS.getOperand(0).getValueType();
@@ -42653,7 +42655,6 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
}
return SDValue();
};
- unsigned Imm = N.getConstantOperandVal(2);
if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
MVT SubVT = VT.getHalfNumVectorElementsVT();
@@ -42662,6 +42663,24 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
}
}
+
+ // Attempt to match VBROADCAST*128 subvector broadcast load.
+ if (RHS.isUndef()) {
+ SmallVector<int, 4> Mask;
+ DecodeVPERM2X128Mask(4, Imm, Mask);
+ if (isUndefOrInRange(Mask, 0, 4)) {
+ bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, LHS);
+ bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, LHS);
+ if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() &&
+ X86::mayFoldLoad(LHS, Subtarget)) {
+ MVT MemVT = VT.getHalfNumVectorElementsVT();
+ unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
+ return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL, VT, MemVT,
+ cast<LoadSDNode>(LHS), Ofs, DAG);
+ }
+ }
+ }
+
return SDValue();
}
case X86ISD::PSHUFD:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
index e11352abcaec1..9ff685694ce02 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -9613,7 +9613,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm0[2,3]
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index d2c64a462a3e7..44b5162812c0e 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1632,12 +1632,12 @@ ret void
define void @splat2_v4f64_load_store(ptr %s, ptr %d) nounwind {
; AVX1-LABEL: splat2_v4f64_load_store:
; AVX1: # %bb.0:
-; AVX1-NEXT: vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3]
+; AVX1-NEXT: vbroadcastf128 (%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
-; AVX1-NEXT: vbroadcastf128 (%rdi), %ymm1 # ymm1 = mem[0,1,0,1]
+; AVX1-NEXT: vbroadcastf128 16(%rdi), %ymm1 # ymm1 = mem[0,1,0,1]
; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
-; AVX1-NEXT: vmovupd %ymm0, 32(%rsi)
-; AVX1-NEXT: vmovupd %ymm1, (%rsi)
+; AVX1-NEXT: vmovupd %ymm1, 32(%rsi)
+; AVX1-NEXT: vmovupd %ymm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
``````````
</details>
https://github.com/llvm/llvm-project/pull/142366
More information about the llvm-commits
mailing list