[llvm] 71d0fd3 - [X86][AVX] lowerV2X128Shuffle - attempt to recognise broadcastf128 subvector load

Fri Jul 23 05:20:08 PDT 2021

Author: Simon Pilgrim
Date: 2021-07-23T13:10:38+01:00
New Revision: 71d0fd356467e263e28b7d55d83c0871da536961

URL: https://github.com/llvm/llvm-project/commit/71d0fd356467e263e28b7d55d83c0871da536961
DIFF: https://github.com/llvm/llvm-project/commit/71d0fd356467e263e28b7d55d83c0871da536961.diff

LOG: [X86][AVX] lowerV2X128Shuffle - attempt to recognise broadcastf128 subvector load

As noticed on PR50053 we were failing to recognise when a shuffle of a load was really a subvector broadcast load

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/avx-vperm2x128.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0cdc84d4ee47..35e91dd94009 100644

--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -16054,9 +16054,33 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                   const APInt &Zeroable,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
-  // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
-  if (Subtarget.hasAVX2() && V2.isUndef())
-    return SDValue();
+  if (V2.isUndef()) {
+    // Attempt to match VBROADCAST*128 subvector broadcast load.
+    bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
+    bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
+    if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
+        MayFoldLoad(peekThroughOneUseBitcasts(V1))) {
+      auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
+      if (!Ld->isNonTemporal()) {
+        MVT MemVT = VT.getHalfNumVectorElementsVT();
+        unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
+        SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+        SDValue Ptr = DAG.getMemBasePlusOffset(Ld->getBasePtr(),
+                                               TypeSize::Fixed(Ofs), DL);
+        SDValue Ops[] = {Ld->getChain(), Ptr};
+        SDValue BcastLd = DAG.getMemIntrinsicNode(
+            X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops, MemVT,
+            DAG.getMachineFunction().getMachineMemOperand(
+                Ld->getMemOperand(), Ofs, MemVT.getStoreSize()));
+        DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
+        return BcastLd;
+      }
+    }
+
+    // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
+    if (Subtarget.hasAVX2())
+      return SDValue();
+  }
 
   bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
 

diff  --git a/llvm/test/CodeGen/X86/avx-vperm2x128.ll b/llvm/test/CodeGen/X86/avx-vperm2x128.ll
index 91264ee2917e..86b15386797d 100644
--- a/llvm/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/llvm/test/CodeGen/X86/avx-vperm2x128.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -disable-peephole | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -disable-peephole | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -disable-peephole | FileCheck %s --check-prefixes=ALL,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -disable-peephole | FileCheck %s --check-prefixes=ALL,AVX2
 
 define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
 ; AVX1-LABEL: shuffle_v8f32_45670123:
@@ -60,15 +60,10 @@ entry:
 }
 
 define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
-; AVX1-LABEL: shuffle_v8f32_01230123_mem:
-; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vperm2f128 $34, (%rdi), %ymm0, %ymm0 # ymm0 = mem[0,1,0,1]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v8f32_01230123_mem:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vpermpd $68, (%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
-; AVX2-NEXT:    retq
+; ALL-LABEL: shuffle_v8f32_01230123_mem:
+; ALL:       # %bb.0: # %entry
+; ALL-NEXT:    vbroadcastf128 (%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
+; ALL-NEXT:    retq
 entry:
   %a = load <8 x float>, <8 x float>* %pa
   %b = load <8 x float>, <8 x float>* %pb
@@ -92,15 +87,10 @@ entry:
 }
 
 define <8 x float> @shuffle_v8f32_45674567_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
-; AVX1-LABEL: shuffle_v8f32_45674567_mem:
-; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v8f32_45674567_mem:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vpermpd $238, (%rdi), %ymm0 # ymm0 = mem[2,3,2,3]
-; AVX2-NEXT:    retq
+; ALL-LABEL: shuffle_v8f32_45674567_mem:
+; ALL:       # %bb.0: # %entry
+; ALL-NEXT:    vbroadcastf128 16(%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
+; ALL-NEXT:    retq
 entry:
   %a = load <8 x float>, <8 x float>* %pa
   %b = load <8 x float>, <8 x float>* %pb