[llvm] 34d88cf - [DAG] Allow folding AND of anyext masked_load with >1 user to zext version

Fri Nov 18 02:38:23 PST 2022

Author: Benjamin Maxwell
Date: 2022-11-18T10:38:09Z
New Revision: 34d88cf6cfe9f878e6330f157f178c2b104c3949

URL: https://github.com/llvm/llvm-project/commit/34d88cf6cfe9f878e6330f157f178c2b104c3949
DIFF: https://github.com/llvm/llvm-project/commit/34d88cf6cfe9f878e6330f157f178c2b104c3949.diff

LOG: [DAG] Allow folding AND of anyext masked_load with >1 user to zext version

This now allows folding an AND of a anyext masked_load to a
zext_masked_load even if the masked load has multiple users.  Doing is
eliminates some redundant ANDs/MOVs for certain AArch64 SVE code.

I'm not sure if there's any cases where doing this could negatively the
other users of the masked_load.  Looking at other optimizations of
masked loads, most don't apply if the load is used more than once, so it
doesn't look like this would interfere.

Reviewed By: c-rhodes

Differential Revision: https://reviews.llvm.org/D137844

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/AArch64/sve-load-compare-store.ll
    llvm/test/CodeGen/Thumb2/mve-masked-load.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9043042eb16f8..0457e1d081f10 100644

--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6241,8 +6241,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
     // fold (and (masked_load) (splat_vec (x, ...))) to zext_masked_load
     auto *MLoad = dyn_cast<MaskedLoadSDNode>(N0);
     ConstantSDNode *Splat = isConstOrConstSplat(N1, true, true);
-    if (MLoad && MLoad->getExtensionType() == ISD::EXTLOAD && N0.hasOneUse() &&
-        Splat && N1.hasOneUse()) {
+    if (MLoad && MLoad->getExtensionType() == ISD::EXTLOAD && Splat &&
+        N1.hasOneUse()) {
       EVT LoadVT = MLoad->getMemoryVT();
       EVT ExtVT = VT;
       if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT, LoadVT)) {
@@ -6252,11 +6252,16 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
         uint64_t ElementSize =
             LoadVT.getVectorElementType().getScalarSizeInBits();
         if (Splat->getAPIntValue().isMask(ElementSize)) {
-          return DAG.getMaskedLoad(
+          auto NewLoad = DAG.getMaskedLoad(
               ExtVT, SDLoc(N), MLoad->getChain(), MLoad->getBasePtr(),
               MLoad->getOffset(), MLoad->getMask(), MLoad->getPassThru(),
               LoadVT, MLoad->getMemOperand(), MLoad->getAddressingMode(),
               ISD::ZEXTLOAD, MLoad->isExpandingLoad());
+          bool LoadHasOtherUsers = !N0.hasOneUse();
+          CombineTo(N, NewLoad);
+          if (LoadHasOtherUsers)
+            CombineTo(MLoad, NewLoad.getValue(0), NewLoad.getValue(1));
+          return SDValue(N, 0);
         }
       }
     }

diff  --git a/llvm/test/CodeGen/AArch64/sve-load-compare-store.ll b/llvm/test/CodeGen/AArch64/sve-load-compare-store.ll
index 27436ede52eb3..3a6e9818e07fa 100644
--- a/llvm/test/CodeGen/AArch64/sve-load-compare-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-load-compare-store.ll
@@ -6,9 +6,7 @@ define void @sve_load_compare_store(ptr noalias nocapture noundef readonly %a, p
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-NEXT:    cmphs p0.s, p0/z, z1.s, #0
+; CHECK-NEXT:    cmphs p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    st1b { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
index afcea7901ccf7..f250731ea6a7c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
@@ -2079,7 +2079,7 @@ define arm_aapcs_vfpcc <4 x i32> @multi_user_zext(<4 x i16> *%dest, <4 x i32> %a
 ; CHECK-LE-NEXT:    vmov r0, r1, d8
 ; CHECK-LE-NEXT:    vmov r2, r3, d9
 ; CHECK-LE-NEXT:    bl foo
-; CHECK-LE-NEXT:    vmovlb.u16 q0, q4
+; CHECK-LE-NEXT:    vmov q0, q4
 ; CHECK-LE-NEXT:    vpop {d8, d9}
 ; CHECK-LE-NEXT:    pop {r7, pc}
 ;
@@ -2091,13 +2091,12 @@ define arm_aapcs_vfpcc <4 x i32> @multi_user_zext(<4 x i16> *%dest, <4 x i32> %a
 ; CHECK-BE-NEXT:    vpush {d8, d9}
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
 ; CHECK-BE-NEXT:    vpt.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vldrht.u32 q4, [r0]
-; CHECK-BE-NEXT:    vrev64.32 q0, q4
-; CHECK-BE-NEXT:    vmov r1, r0, d0
-; CHECK-BE-NEXT:    vmov r3, r2, d1
+; CHECK-BE-NEXT:    vldrht.u32 q0, [r0]
+; CHECK-BE-NEXT:    vrev64.32 q4, q0
+; CHECK-BE-NEXT:    vmov r1, r0, d8
+; CHECK-BE-NEXT:    vmov r3, r2, d9
 ; CHECK-BE-NEXT:    bl foo
-; CHECK-BE-NEXT:    vmovlb.u16 q1, q4
-; CHECK-BE-NEXT:    vrev64.32 q0, q1
+; CHECK-BE-NEXT:    vmov q0, q4
 ; CHECK-BE-NEXT:    vpop {d8, d9}
 ; CHECK-BE-NEXT:    pop {r7, pc}
 entry: