[llvm] 2c3f665 - [SVE] Extend support for folding select + masked gathers

Matt Devereau via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 5 09:28:13 PDT 2022


Author: Matt Devereau
Date: 2022-04-05T16:27:11Z
New Revision: 2c3f66519c5e6af2a43e8c7087679f90d4582623

URL: https://github.com/llvm/llvm-project/commit/2c3f66519c5e6af2a43e8c7087679f90d4582623
DIFF: https://github.com/llvm/llvm-project/commit/2c3f66519c5e6af2a43e8c7087679f90d4582623.diff

LOG: [SVE] Extend support for folding select + masked gathers

Extend the select + masked load fold added in D106376 to also handle masked gathers.

Differential Revision: https://reviews.llvm.org/D122896
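
For example (a minimal sketch of the fold, mirroring the tests added below),
a select that zeros the inactive lanes of a masked gather can now be folded
into the gather's passthrough operand:

  ; Before: the select re-zeros lanes the mask already disables.
  %gather = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptr, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
  %masked = select <vscale x 2 x i1> %mask, <vscale x 2 x float> %gather, <vscale x 2 x float> zeroinitializer

  ; After: the zeroing is expressed through the passthrough instead.
  %gather = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptr, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> zeroinitializer)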

Added: 
    llvm/test/Transforms/InstCombine/select-masked_gather.ll

Modified: 
    llvm/include/llvm/IR/PatternMatch.h
    llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index f9f4f16038619..5009feebc6357 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2117,6 +2117,14 @@ m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2,
   return m_Intrinsic<Intrinsic::masked_load>(Op0, Op1, Op2, Op3);
 }
 
+/// Matches MaskedGather Intrinsic.
+template <typename Opnd0, typename Opnd1, typename Opnd2, typename Opnd3>
+inline typename m_Intrinsic_Ty<Opnd0, Opnd1, Opnd2, Opnd3>::Ty
+m_MaskedGather(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2,
+               const Opnd3 &Op3) {
+  return m_Intrinsic<Intrinsic::masked_gather>(Op0, Op1, Op2, Op3);
+}
+
 template <Intrinsic::ID IntrID, typename T0>
 inline typename m_Intrinsic_Ty<T0>::Ty m_Intrinsic(const T0 &Op0) {
   return m_CombineAnd(m_Intrinsic<IntrID>(), m_Argument<0>(Op0));

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 06c2de65e834a..1cf6e43991c5c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -3031,18 +3031,22 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
   // select(mask, mload(,,mask,0), 0) -> mload(,,mask,0)
   // Load inst is intentionally not checked for hasOneUse()
   if (match(FalseVal, m_Zero()) &&
-      match(TrueVal, m_MaskedLoad(m_Value(), m_Value(), m_Specific(CondVal),
-                                  m_CombineOr(m_Undef(), m_Zero())))) {
-    auto *MaskedLoad = cast<IntrinsicInst>(TrueVal);
-    if (isa<UndefValue>(MaskedLoad->getArgOperand(3)))
-      MaskedLoad->setArgOperand(3, FalseVal /* Zero */);
-    return replaceInstUsesWith(SI, MaskedLoad);
+      (match(TrueVal, m_MaskedLoad(m_Value(), m_Value(), m_Specific(CondVal),
+                                   m_CombineOr(m_Undef(), m_Zero()))) ||
+       match(TrueVal, m_MaskedGather(m_Value(), m_Value(), m_Specific(CondVal),
+                                     m_CombineOr(m_Undef(), m_Zero()))))) {
+    auto *MaskedInst = cast<IntrinsicInst>(TrueVal);
+    if (isa<UndefValue>(MaskedInst->getArgOperand(3)))
+      MaskedInst->setArgOperand(3, FalseVal /* Zero */);
+    return replaceInstUsesWith(SI, MaskedInst);
   }
 
   Value *Mask;
   if (match(TrueVal, m_Zero()) &&
-      match(FalseVal, m_MaskedLoad(m_Value(), m_Value(), m_Value(Mask),
-                                   m_CombineOr(m_Undef(), m_Zero()))) &&
+      (match(FalseVal, m_MaskedLoad(m_Value(), m_Value(), m_Value(Mask),
+                                    m_CombineOr(m_Undef(), m_Zero()))) ||
+       match(FalseVal, m_MaskedGather(m_Value(), m_Value(), m_Value(Mask),
+                                      m_CombineOr(m_Undef(), m_Zero())))) &&
       (CondVal->getType() == Mask->getType())) {
     // We can remove the select by ensuring the load zeros all lanes the
     // select would have.  We determine this by proving there is no overlap
@@ -3053,10 +3057,10 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
       CanMergeSelectIntoLoad = match(V, m_Zero());
 
     if (CanMergeSelectIntoLoad) {
-      auto *MaskedLoad = cast<IntrinsicInst>(FalseVal);
-      if (isa<UndefValue>(MaskedLoad->getArgOperand(3)))
-        MaskedLoad->setArgOperand(3, TrueVal /* Zero */);
-      return replaceInstUsesWith(SI, MaskedLoad);
+      auto *MaskedInst = cast<IntrinsicInst>(FalseVal);
+      if (isa<UndefValue>(MaskedInst->getArgOperand(3)))
+        MaskedInst->setArgOperand(3, TrueVal /* Zero */);
+      return replaceInstUsesWith(SI, MaskedInst);
     }
   }
 
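To illustrate the second case above (a sketch using a fixed-width <4 x i32>
gather for brevity; the scalable-vector tests added below exercise the same
pattern and its variants): when the select condition is the negation of the
gather's mask, the lanes the select zeros are exactly the gather's inactive
lanes, so a zero passthrough already produces them and the select is
redundant:

  %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
  %gather = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptr, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %masked = select <4 x i1> %inv_mask, <4 x i32> zeroinitializer, <4 x i32> %gather

  ; folds to:
  %gather = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptr, i32 4, <4 x i1> %mask, <4 x i32> zeroinitializer)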

diff --git a/llvm/test/Transforms/InstCombine/select-masked_gather.ll b/llvm/test/Transforms/InstCombine/select-masked_gather.ll
new file mode 100644
index 0000000000000..22d7e71613994
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/select-masked_gather.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+; Fold zeroing of inactive lanes into the gather's passthrough parameter.
+define <vscale x 2 x float> @masked_gather_and_zero_inactive_1(<vscale x 2 x float*> %ptr, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: @masked_gather_and_zero_inactive_1(
+; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> zeroinitializer)
+; CHECK-NEXT:    ret <vscale x 2 x float> [[GATHER]]
+;
+  %gather = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptr, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+  %masked = select <vscale x 2 x i1> %mask, <vscale x 2 x float> %gather, <vscale x 2 x float> zeroinitializer
+  ret <vscale x 2 x float> %masked
+}
+
+; As above but reuse the gather's existing passthrough.
+define <vscale x 2 x i32> @masked_gather_and_zero_inactive_2(<vscale x 2 x i32*> %ptr, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: @masked_gather_and_zero_inactive_2(
+; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0i32(<vscale x 2 x i32*> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    ret <vscale x 2 x i32> [[GATHER]]
+;
+  %gather = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptr, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> zeroinitializer)
+  %masked = select <vscale x 2 x i1> %mask, <vscale x 2 x i32> %gather, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x i32> %masked
+}
+
+; No transform when the gather's passthrough cannot be reused or altered.
+define <vscale x 2 x i32> @masked_gather_and_zero_inactive_3(<vscale x 2 x i32*> %ptr, <vscale x 2 x i1> %mask, <vscale x 2 x i32> %passthrough) {
+; CHECK-LABEL: @masked_gather_and_zero_inactive_3(
+; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0i32(<vscale x 2 x i32*> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x i32> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT:    [[MASKED:%.*]] = select <vscale x 2 x i1> [[MASK]], <vscale x 2 x i32> [[GATHER]], <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    ret <vscale x 2 x i32> [[MASKED]]
+;
+  %gather = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptr, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> %passthrough)
+  %masked = select <vscale x 2 x i1> %mask, <vscale x 2 x i32> %gather, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x i32> %masked
+}
+
+; Remove redundant select when its mask doesn't overlap with the gather mask.
+define <vscale x 2 x i32> @masked_gather_and_zero_inactive_4(<vscale x 2 x i32*> %ptr, <vscale x 2 x i1> %inv_mask) {
+; CHECK-LABEL: @masked_gather_and_zero_inactive_4(
+; CHECK-NEXT:    [[MASK:%.*]] = xor <vscale x 2 x i1> [[INV_MASK:%.*]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0i32(<vscale x 2 x i32*> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[MASK]], <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    ret <vscale x 2 x i32> [[GATHER]]
+;
+  %splat  = shufflevector <vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer
+  %mask = xor <vscale x 2 x i1> %inv_mask, %splat
+  %gather = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptr, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %masked = select <vscale x 2 x i1> %inv_mask, <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> %gather
+  ret <vscale x 2 x i32> %masked
+}
+
+; As above but reuse the gather's existing passthrough.
+define <vscale x 2 x i32> @masked_gather_and_zero_inactive_5(<vscale x 2 x i32*> %ptr, <vscale x 2 x i1> %inv_mask) {
+; CHECK-LABEL: @masked_gather_and_zero_inactive_5(
+; CHECK-NEXT:    [[MASK:%.*]] = xor <vscale x 2 x i1> [[INV_MASK:%.*]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0i32(<vscale x 2 x i32*> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[MASK]], <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    ret <vscale x 2 x i32> [[GATHER]]
+;
+  %splat  = shufflevector <vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer
+  %mask = xor <vscale x 2 x i1> %inv_mask, %splat
+  %gather = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptr, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> zeroinitializer)
+  %masked = select <vscale x 2 x i1> %inv_mask, <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> %gather
+  ret <vscale x 2 x i32> %masked
+}
+
+; No transform when the gather's passthrough cannot be reused or altered.
+define <vscale x 2 x i32> @masked_gather_and_zero_inactive_6(<vscale x 2 x i32*> %ptr, <vscale x 2 x i1> %inv_mask, <vscale x 2 x i32> %passthrough) {
+; CHECK-LABEL: @masked_gather_and_zero_inactive_6(
+; CHECK-NEXT:    [[MASK:%.*]] = xor <vscale x 2 x i1> [[INV_MASK:%.*]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0i32(<vscale x 2 x i32*> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[MASK]], <vscale x 2 x i32> [[PASSTHROUGH:%.*]])
+; CHECK-NEXT:    [[MASKED:%.*]] = select <vscale x 2 x i1> [[INV_MASK]], <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> [[GATHER]]
+; CHECK-NEXT:    ret <vscale x 2 x i32> [[MASKED]]
+;
+  %splat  = shufflevector <vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer
+  %mask = xor <vscale x 2 x i1> %inv_mask, %splat
+  %gather = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptr, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> %passthrough)
+  %masked = select <vscale x 2 x i1> %inv_mask, <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> %gather
+  ret <vscale x 2 x i32> %masked
+}
+
+; No transform when select and gather masks have no relation.
+define <vscale x 2 x i32> @masked_gather_and_zero_inactive_7(<vscale x 2 x i32*> %ptr, <vscale x 2 x i1> %mask1, <vscale x 2 x i1> %mask2) {
+; CHECK-LABEL: @masked_gather_and_zero_inactive_7(
+; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0i32(<vscale x 2 x i32*> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[MASK1:%.*]], <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[MASKED:%.*]] = select <vscale x 2 x i1> [[MASK2:%.*]], <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> [[GATHER]]
+; CHECK-NEXT:    ret <vscale x 2 x i32> [[MASKED]]
+;
+  %gather = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptr, i32 4, <vscale x 2 x i1> %mask1, <vscale x 2 x i32> zeroinitializer)
+  %masked = select <vscale x 2 x i1> %mask2, <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> %gather
+  ret <vscale x 2 x i32> %masked
+}
+
+; A more complex case where we can prove the select mask is a subset of the
+; gather's inactive lanes and thus the gather's passthrough takes effect.
+define <vscale x 2 x float> @masked_gather_and_zero_inactive_8(<vscale x 2 x float*> %ptr, <vscale x 2 x i1> %inv_mask, <vscale x 2 x i1> %cond) {
+; CHECK-LABEL: @masked_gather_and_zero_inactive_8(
+; CHECK-NEXT:    [[MASK:%.*]] = xor <vscale x 2 x i1> [[INV_MASK:%.*]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[PG:%.*]] = and <vscale x 2 x i1> [[MASK]], [[COND:%.*]]
+; CHECK-NEXT:    [[GATHER:%.*]] = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[PG]], <vscale x 2 x float> zeroinitializer)
+; CHECK-NEXT:    ret <vscale x 2 x float> [[GATHER]]
+;
+  %splat  = shufflevector <vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer
+  %mask = xor <vscale x 2 x i1> %inv_mask, %splat
+  %pg = and <vscale x 2 x i1> %mask, %cond
+  %gather = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptr, i32 4, <vscale x 2 x i1> %pg, <vscale x 2 x float> undef)
+  %masked = select <vscale x 2 x i1> %inv_mask, <vscale x 2 x float> zeroinitializer, <vscale x 2 x float> %gather
+  ret <vscale x 2 x float> %masked
+}
+
+define <vscale x 2 x float> @masked_gather_and_scalar_select_cond(<vscale x 2 x float*> %ptr, <vscale x 2 x i1> %mask, i1 %cond) {
+; CHECK-LABEL: @masked_gather_and_scalar_select_cond(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> [[PTR:%.*]], i32 32, <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> undef)
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[COND:%.*]], <vscale x 2 x float> zeroinitializer, <vscale x 2 x float> [[TMP0]]
+; CHECK-NEXT:    ret <vscale x 2 x float> [[TMP1]]
+;
+entry:
+  %0 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptr, i32 32, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+  %1 = select i1 %cond, <vscale x 2 x float> zeroinitializer, <vscale x 2 x float> %0
+  ret <vscale x 2 x float> %1
+}
+
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
