[llvm] 5c96847 - [DAG][sve] Lowering for VLS masked truncating stores

Fri Dec 17 07:05:08 PST 2021

Author: David Truby
Date: 2021-12-17T15:04:45Z
New Revision: 5c9684704d15503107ce79ccaf362402ad2b0b2a

URL: https://github.com/llvm/llvm-project/commit/5c9684704d15503107ce79ccaf362402ad2b0b2a
DIFF: https://github.com/llvm/llvm-project/commit/5c9684704d15503107ce79ccaf362402ad2b0b2a.diff

LOG: [DAG][sve] Lowering for VLS masked truncating stores

This extends the custom lowering for truncating stores on
fixed length vectors in SVE to support masked truncating stores.
It also adds a DAG combine for truncates followed by masked
stores.

Reviewed By: peterwaller-arm, paulwalker-arm

Differential Revision: https://reviews.llvm.org/D108115

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index d8da711b56f9f..83ce3d017cc5d 100644

--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -852,6 +852,20 @@ class TargetLoweringBase {
     return getBooleanContents(Type.isVector(), Type.isFloatingPoint());
   }
 
+  /// Promote the given target boolean to a target boolean of the given type.
+  /// A target boolean is an integer value, not necessarily of type i1, the bits
+  /// of which conform to getBooleanContents.
+  ///
+  /// ValVT is the type of values that produced the boolean.
+  SDValue promoteTargetBoolean(SelectionDAG &DAG, SDValue Bool,
+                               EVT ValVT) const {
+    SDLoc dl(Bool);
+    EVT BoolVT =
+        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ValVT);
+    ISD::NodeType ExtendCode = getExtendForContent(getBooleanContents(ValVT));
+    return DAG.getNode(ExtendCode, dl, BoolVT, Bool);
+  }
+
   /// Return target scheduling preference.
   Sched::Preference getSchedulingPreference() const {
     return SchedPreferenceInfo;

diff  --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4637ce56ff258..bc6dd70b99bef 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10044,6 +10044,8 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
   MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
   SDValue Mask = MST->getMask();
   SDValue Chain = MST->getChain();
+  SDValue Value = MST->getValue();
+  SDValue Ptr = MST->getBasePtr();
   SDLoc DL(N);
 
   // Zap masked stores with a zero mask.
@@ -10063,6 +10065,42 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
   if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
     return SDValue(N, 0);
 
+  if (MST->isTruncatingStore() && MST->isUnindexed() &&
+      Value.getValueType().isInteger() &&
+      (!isa<ConstantSDNode>(Value) ||
+       !cast<ConstantSDNode>(Value)->isOpaque())) {
+    APInt TruncDemandedBits =
+        APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
+                             MST->getMemoryVT().getScalarSizeInBits());
+
+    // See if we can simplify the operation with
+    // SimplifyDemandedBits, which only works if the value has a single use.
+    if (SimplifyDemandedBits(Value, TruncDemandedBits)) {
+      // Re-visit the store if anything changed and the store hasn't been merged
+      // with another node (N is deleted) SimplifyDemandedBits will add Value's
+      // node back to the worklist if necessary, but we also need to re-visit
+      // the Store node itself.
+      if (N->getOpcode() != ISD::DELETED_NODE)
+        AddToWorklist(N);
+      return SDValue(N, 0);
+    }
+  }
+
+  // If this is a TRUNC followed by a masked store, fold this into a masked
+  // truncating store.  We can do this even if this is already a masked
+  // truncstore.
+  if ((Value.getOpcode() == ISD::TRUNCATE) && Value.getNode()->hasOneUse() &&
+      MST->isUnindexed() &&
+      TLI.canCombineTruncStore(Value.getOperand(0).getValueType(),
+                               MST->getMemoryVT(), LegalOperations)) {
+    auto Mask = TLI.promoteTargetBoolean(DAG, MST->getMask(),
+                                         Value.getOperand(0).getValueType());
+    return DAG.getMaskedStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
+                              MST->getOffset(), Mask, MST->getMemoryVT(),
+                              MST->getMemOperand(), MST->getAddressingMode(),
+                              /*IsTruncating=*/true);
+  }
+
   return SDValue();
 }
 

diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 24874770276f2..c123ea3cd2d5a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -1007,11 +1007,7 @@ SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) {
 ///
 /// ValVT is the type of values that produced the boolean.
 SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, EVT ValVT) {
-  SDLoc dl(Bool);
-  EVT BoolVT = getSetCCResultType(ValVT);
-  ISD::NodeType ExtendCode =
-      TargetLowering::getExtendForContent(TLI.getBooleanContents(ValVT));
-  return DAG.getNode(ExtendCode, dl, BoolVT, Bool);
+  return TLI.promoteTargetBoolean(DAG, Bool, ValVT);
 }
 
 /// Return the lower LoVT bits of Op in Lo and the upper HiVT bits in Hi.

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 702fb595de22d..7fb06d95dc1cd 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18897,10 +18897,7 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
 
 SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
     SDValue Op, SelectionDAG &DAG) const {
-  auto Store = cast<MaskedStoreSDNode>(Op);
-
-  if (Store->isTruncatingStore())
-    return SDValue();
+  auto *Store = cast<MaskedStoreSDNode>(Op);
 
   SDLoc DL(Op);
   EVT VT = Store->getValue().getValueType();

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
index 19bc39025dce1..73fc0c0a8f7e6 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
@@ -155,21 +155,13 @@ define void @masked_store_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0 {
 define void @masked_store_trunc_v8i64i8(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i8>* %dest) #0 {
 ; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i8:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    cmpeq p0.d, p0/z, z0.d, z1.d
-; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_512-NEXT:    mov z1.d, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT:    ptrue p0.b, vl8
-; VBITS_GE_512-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_512-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
-; VBITS_GE_512-NEXT:    uzp1 z1.b, z1.b, z1.b
-; VBITS_GE_512-NEXT:    cmpne p0.b, p0/z, z1.b, #0
-; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x2]
-; VBITS_GE_512-NEXT:    ret
+; VBITS_GE_512-NEXT: ptrue p[[P0:[0-9]+]].d, vl8
+; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d
+; VBITS_GE_512-NEXT: st1b { [[Z0]].d }, p[[P1]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
+
   %a = load <8 x i64>, <8 x i64>* %ap
   %b = load <8 x i64>, <8 x i64>* %bp
   %mask = icmp eq <8 x i64> %a, %b
@@ -179,21 +171,13 @@ define void @masked_store_trunc_v8i64i8(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i8>
 }
 
 define void @masked_store_trunc_v8i64i16(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i16>* %dest) #0 {
-; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i16:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    cmpeq p0.d, p0/z, z0.d, z1.d
-; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_512-NEXT:    mov z1.d, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl8
-; VBITS_GE_512-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_512-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_512-NEXT:    cmpne p0.h, p0/z, z1.h, #0
-; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x2]
-; VBITS_GE_512-NEXT:    ret
+; CHECK-LABEL: masked_store_trunc_v8i64i16:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
+; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d
+; VBITS_GE_512-NEXT: st1h { [[Z0]].d }, p[[P1]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
   %a = load <8 x i64>, <8 x i64>* %ap
   %b = load <8 x i64>, <8 x i64>* %bp
   %mask = icmp eq <8 x i64> %a, %b
@@ -203,19 +187,13 @@ define void @masked_store_trunc_v8i64i16(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i1
 }
 
 define void @masked_store_trunc_v8i64i32(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i32>* %dest) #0 {
-; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i32:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    cmpeq p0.d, p0/z, z0.d, z1.d
-; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_512-NEXT:    mov z1.d, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_512-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_512-NEXT:    cmpne p0.s, p0/z, z1.s, #0
-; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
-; VBITS_GE_512-NEXT:    ret
+; CHECK-LABEL: masked_store_trunc_v8i64i32:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8
+; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d
+; VBITS_GE_512-NEXT: st1w { [[Z0]].d }, p[[P1]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
   %a = load <8 x i64>, <8 x i64>* %ap
   %b = load <8 x i64>, <8 x i64>* %bp
   %mask = icmp eq <8 x i64> %a, %b
@@ -225,21 +203,13 @@ define void @masked_store_trunc_v8i64i32(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i3
 }
 
 define void @masked_store_trunc_v16i32i8(<16 x i32>* %ap, <16 x i32>* %bp, <16 x i8>* %dest) #0 {
-; VBITS_GE_512-LABEL: masked_store_trunc_v16i32i8:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    cmpeq p0.s, p0/z, z0.s, z1.s
-; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_512-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT:    ptrue p0.b, vl16
-; VBITS_GE_512-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
-; VBITS_GE_512-NEXT:    uzp1 z1.b, z1.b, z1.b
-; VBITS_GE_512-NEXT:    cmpne p0.b, p0/z, z1.b, #0
-; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x2]
-; VBITS_GE_512-NEXT:    ret
+; CHECK-LABEL: masked_store_trunc_v16i32i8:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16
+; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].s, p[[P0]]/z, [[Z0]].s, [[Z1]].s
+; VBITS_GE_512-NEXT: st1b { [[Z0]].s }, p[[P1]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
   %a = load <16 x i32>, <16 x i32>* %ap
   %b = load <16 x i32>, <16 x i32>* %bp
   %mask = icmp eq <16 x i32> %a, %b
@@ -249,19 +219,13 @@ define void @masked_store_trunc_v16i32i8(<16 x i32>* %ap, <16 x i32>* %bp, <16 x
 }
 
 define void @masked_store_trunc_v16i32i16(<16 x i32>* %ap, <16 x i32>* %bp, <16 x i16>* %dest) #0 {
-; VBITS_GE_512-LABEL: masked_store_trunc_v16i32i16:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    cmpeq p0.s, p0/z, z0.s, z1.s
-; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_512-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_512-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_512-NEXT:    cmpne p0.h, p0/z, z1.h, #0
-; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x2]
-; VBITS_GE_512-NEXT:    ret
+; CHECK-LABEL: masked_store_trunc_v16i32i16:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16
+; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].s, p[[P0]]/z, [[Z0]].s, [[Z1]].s
+; VBITS_GE_512-NEXT: st1h { [[Z0]].s }, p[[P1]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
   %a = load <16 x i32>, <16 x i32>* %ap
   %b = load <16 x i32>, <16 x i32>* %bp
   %mask = icmp eq <16 x i32> %a, %b
@@ -271,19 +235,13 @@ define void @masked_store_trunc_v16i32i16(<16 x i32>* %ap, <16 x i32>* %bp, <16
 }
 
 define void @masked_store_trunc_v32i16i8(<32 x i16>* %ap, <32 x i16>* %bp, <32 x i8>* %dest) #0 {
-; VBITS_GE_512-LABEL: masked_store_trunc_v32i16i8:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    cmpeq p0.h, p0/z, z0.h, z1.h
-; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
-; VBITS_GE_512-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_512-NEXT:    uzp1 z1.b, z1.b, z1.b
-; VBITS_GE_512-NEXT:    cmpne p0.b, p0/z, z1.b, #0
-; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x2]
-; VBITS_GE_512-NEXT:    ret
+; CHECK-LABEL: masked_store_trunc_v32i16i8:
+; VBITS_GE_512: ptrue p[[P0:[0-9]+]].h, vl32
+; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
+; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].h, p[[P0]]/z, [[Z0]].h, [[Z1]].h
+; VBITS_GE_512-NEXT: st1b { [[Z0]].h }, p[[P1]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ret
   %a = load <32 x i16>, <32 x i16>* %ap
   %b = load <32 x i16>, <32 x i16>* %bp
   %mask = icmp eq <32 x i16> %a, %b