[llvm] 71d0fd3 - [X86][AVX] lowerV2X128Shuffle - attempt to recognise broadcastf128 subvector load
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 23 05:20:08 PDT 2021
Author: Simon Pilgrim
Date: 2021-07-23T13:10:38+01:00
New Revision: 71d0fd356467e263e28b7d55d83c0871da536961
URL: https://github.com/llvm/llvm-project/commit/71d0fd356467e263e28b7d55d83c0871da536961
DIFF: https://github.com/llvm/llvm-project/commit/71d0fd356467e263e28b7d55d83c0871da536961.diff
LOG: [X86][AVX] lowerV2X128Shuffle - attempt to recognise broadcastf128 subvector load
As noticed on PR50053, we were failing to recognise when a shuffle of a load was really a subvector broadcast load.
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avx-vperm2x128.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0cdc84d4ee47..35e91dd94009 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -16054,9 +16054,33 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
- if (Subtarget.hasAVX2() && V2.isUndef())
- return SDValue();
+ if (V2.isUndef()) {
+ // Attempt to match VBROADCAST*128 subvector broadcast load.
+ bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
+ bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
+ if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
+ MayFoldLoad(peekThroughOneUseBitcasts(V1))) {
+ auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
+ if (!Ld->isNonTemporal()) {
+ MVT MemVT = VT.getHalfNumVectorElementsVT();
+ unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ptr = DAG.getMemBasePlusOffset(Ld->getBasePtr(),
+ TypeSize::Fixed(Ofs), DL);
+ SDValue Ops[] = {Ld->getChain(), Ptr};
+ SDValue BcastLd = DAG.getMemIntrinsicNode(
+ X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops, MemVT,
+ DAG.getMachineFunction().getMachineMemOperand(
+ Ld->getMemOperand(), Ofs, MemVT.getStoreSize()));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
+ return BcastLd;
+ }
+ }
+
+ // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
+ if (Subtarget.hasAVX2())
+ return SDValue();
+ }
bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
diff --git a/llvm/test/CodeGen/X86/avx-vperm2x128.ll b/llvm/test/CodeGen/X86/avx-vperm2x128.ll
index 91264ee2917e..86b15386797d 100644
--- a/llvm/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/llvm/test/CodeGen/X86/avx-vperm2x128.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -disable-peephole | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -disable-peephole | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -disable-peephole | FileCheck %s --check-prefixes=ALL,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -disable-peephole | FileCheck %s --check-prefixes=ALL,AVX2
define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_45670123:
@@ -60,15 +60,10 @@ entry:
}
define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
-; AVX1-LABEL: shuffle_v8f32_01230123_mem:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vperm2f128 $34, (%rdi), %ymm0, %ymm0 # ymm0 = mem[0,1,0,1]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v8f32_01230123_mem:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpermpd $68, (%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
-; AVX2-NEXT: retq
+; ALL-LABEL: shuffle_v8f32_01230123_mem:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: vbroadcastf128 (%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
+; ALL-NEXT: retq
entry:
%a = load <8 x float>, <8 x float>* %pa
%b = load <8 x float>, <8 x float>* %pb
@@ -92,15 +87,10 @@ entry:
}
define <8 x float> @shuffle_v8f32_45674567_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
-; AVX1-LABEL: shuffle_v8f32_45674567_mem:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v8f32_45674567_mem:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpermpd $238, (%rdi), %ymm0 # ymm0 = mem[2,3,2,3]
-; AVX2-NEXT: retq
+; ALL-LABEL: shuffle_v8f32_45674567_mem:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: vbroadcastf128 16(%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
+; ALL-NEXT: retq
entry:
%a = load <8 x float>, <8 x float>* %pa
%b = load <8 x float>, <8 x float>* %pb
More information about the llvm-commits
mailing list