[llvm] f390781 - [VP] Implementing expansion pass for VP load and store.

Simon Moll via llvm-commits llvm-commits at lists.llvm.org
Sun Jul 17 23:48:14 PDT 2022


Author: Lorenzo Albano
Date: 2022-07-18T08:47:54+02:00
New Revision: f390781cec5c7f608ea678294b482b6494fc70a4

URL: https://github.com/llvm/llvm-project/commit/f390781cec5c7f608ea678294b482b6494fc70a4
DIFF: https://github.com/llvm/llvm-project/commit/f390781cec5c7f608ea678294b482b6494fc70a4.diff

LOG: [VP] Implementing expansion pass for VP load and store.

Added a function to the ExpandVectorPredication pass to handle VP loads
and stores.
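
For illustration, a minimal before/after sketch of the expansion, distilled
from the test cases added below (value names such as %evlmask are
illustrative only):

  ; before: predicated VP load with a mask and an explicit vector length (EVL)
  %load = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %ptr, <2 x i1> %m, i32 %evl)

  ; after: the EVL is folded into the mask, then the operation is lowered
  ; to a regular masked load
  %evl.ins   = insertelement <2 x i32> poison, i32 %evl, i32 0
  %evl.splat = shufflevector <2 x i32> %evl.ins, <2 x i32> poison, <2 x i32> zeroinitializer
  %evlmask   = icmp ult <2 x i32> <i32 0, i32 1>, %evl.splat
  %mask      = and <2 x i1> %evlmask, %m
  %load      = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %ptr, i32 1, <2 x i1> %mask, <2 x i64> undef)

When the mask is all-ones and the EVL covers the whole vector (see
@vpload_v2i64_allones_mask_vlmax in the test file), the VP load collapses
to a plain load:

  %load = load <2 x i64>, <2 x i64>* %ptr, align 16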

Reviewed By: simoll

Differential Revision: https://reviews.llvm.org/D109584

Added: 
    llvm/test/CodeGen/Generic/expand-vp-load-store.ll

Modified: 
    llvm/lib/CodeGen/ExpandVectorPredication.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
index 59932a542bbc9..ae913f543ce6e 100644
--- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp
+++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
@@ -15,6 +15,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
@@ -82,8 +83,11 @@ STATISTIC(NumLoweredVPOps, "Number of folded vector predication operations");
 
 /// \returns Whether the vector mask \p MaskVal has all lane bits set.
 static bool isAllTrueMask(Value *MaskVal) {
-  auto *ConstVec = dyn_cast<ConstantVector>(MaskVal);
-  return ConstVec && ConstVec->isAllOnesValue();
+  if (Value *SplattedVal = getSplatValue(MaskVal))
+    if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
+      return ConstValue->isAllOnesValue();
+
+  return false;
 }
 
 /// \returns A non-excepting divisor constant for this type.
@@ -171,6 +175,10 @@ struct CachingVPExpander {
   Value *expandPredicationInReduction(IRBuilder<> &Builder,
                                       VPReductionIntrinsic &PI);
 
+  /// \brief Lower this VP memory operation to a non-VP intrinsic.
+  Value *expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
+                                            VPIntrinsic &VPI);
+
   /// \brief Query TTI and expand the vector predication in \p P accordingly.
   Value *expandPredication(VPIntrinsic &PI);
 
@@ -389,6 +397,53 @@ CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder,
   return Reduction;
 }
 
+Value *
+CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
+                                                      VPIntrinsic &VPI) {
+  assert(VPI.canIgnoreVectorLengthParam());
+
+  Value *MaskParam = VPI.getMaskParam();
+  Value *PtrParam = VPI.getMemoryPointerParam();
+  Value *DataParam = VPI.getMemoryDataParam();
+  bool IsUnmasked = isAllTrueMask(MaskParam);
+
+  MaybeAlign AlignOpt = VPI.getPointerAlignment();
+
+  Value *NewMemoryInst = nullptr;
+  switch (VPI.getIntrinsicID()) {
+  default:
+    llvm_unreachable("Not a VP memory intrinsic");
+  case Intrinsic::vp_store:
+    if (IsUnmasked) {
+      StoreInst *NewStore =
+          Builder.CreateStore(DataParam, PtrParam, /*IsVolatile*/ false);
+      if (AlignOpt.hasValue())
+        NewStore->setAlignment(AlignOpt.getValue());
+      NewMemoryInst = NewStore;
+    } else
+      NewMemoryInst = Builder.CreateMaskedStore(
+          DataParam, PtrParam, AlignOpt.valueOrOne(), MaskParam);
+
+    break;
+  case Intrinsic::vp_load:
+    if (IsUnmasked) {
+      LoadInst *NewLoad =
+          Builder.CreateLoad(VPI.getType(), PtrParam, /*IsVolatile*/ false);
+      if (AlignOpt.hasValue())
+        NewLoad->setAlignment(AlignOpt.getValue());
+      NewMemoryInst = NewLoad;
+    } else
+      NewMemoryInst = Builder.CreateMaskedLoad(
+          VPI.getType(), PtrParam, AlignOpt.valueOrOne(), MaskParam);
+
+    break;
+  }
+
+  assert(NewMemoryInst);
+  replaceOperation(*NewMemoryInst, VPI);
+  return NewMemoryInst;
+}
+
 void CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) {
   LLVM_DEBUG(dbgs() << "Discard EVL parameter in " << VPI << "\n");
 
@@ -465,6 +520,14 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
   if (auto *VPRI = dyn_cast<VPReductionIntrinsic>(&VPI))
     return expandPredicationInReduction(Builder, *VPRI);
 
+  switch (VPI.getIntrinsicID()) {
+  default:
+    break;
+  case Intrinsic::vp_load:
+  case Intrinsic::vp_store:
+    return expandPredicationInMemoryIntrinsic(Builder, VPI);
+  }
+
   return &VPI;
 }
 

diff --git a/llvm/test/CodeGen/Generic/expand-vp-load-store.ll b/llvm/test/CodeGen/Generic/expand-vp-load-store.ll
new file mode 100644
index 0000000000000..391eb82b80c9a
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/expand-vp-load-store.ll
@@ -0,0 +1,205 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt --expandvp -S < %s | FileCheck %s
+; RUN: opt --expandvp --expandvp-override-evl-transform=Legal --expandvp-override-mask-transform=Convert -S < %s | FileCheck %s
+
+; Fixed vectors
+define <2 x i64> @vpload_v2i64(<2 x i64>* %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_v2i64(
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[M:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]], <2 x i64> undef)
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %load = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %ptr, <2 x i1> %m, i32 %evl)
+  ret <2 x i64> %load
+}
+
+define <2 x i64> @vpload_v2i64_vlmax(<2 x i64>* %ptr, <2 x i1> %m) {
+; CHECK-LABEL: @vpload_v2i64_vlmax(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[M:%.*]], <2 x i64> undef)
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %load = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %ptr, <2 x i1> %m, i32 2)
+  ret <2 x i64> %load
+}
+
+define <2 x i64> @vpload_v2i64_allones_mask(<2 x i64>* %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_v2i64_allones_mask(
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], <i1 true, i1 true>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]], <2 x i64> undef)
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %load = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %ptr, <2 x i1> <i1 1, i1 1>, i32 %evl)
+  ret <2 x i64> %load
+}
+
+define <2 x i64> @vpload_v2i64_allones_mask_vlmax(<2 x i64>* %ptr) {
+; CHECK-LABEL: @vpload_v2i64_allones_mask_vlmax(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[PTR:%.*]], align 16
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %load = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %ptr, <2 x i1> <i1 1, i1 1>, i32 2)
+  ret <2 x i64> %load
+}
+
+define void @vpstore_v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: @vpstore_v2i64(
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[M:%.*]]
+; CHECK-NEXT:    call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[VAL:%.*]], <2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.vp.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> %m, i32 %evl)
+  ret void
+}
+
+define void @vpstore_v2i64_vlmax(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> %m) {
+; CHECK-LABEL: @vpstore_v2i64_vlmax(
+; CHECK-NEXT:    call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[VAL:%.*]], <2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[M:%.*]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.vp.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> %m, i32 2)
+  ret void
+}
+
+define void @vpstore_v2i64_allones_mask(<2 x i64> %val, <2 x i64>* %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: @vpstore_v2i64_allones_mask(
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], <i1 true, i1 true>
+; CHECK-NEXT:    call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[VAL:%.*]], <2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.vp.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> <i1 1, i1 1>, i32 %evl)
+  ret void
+}
+
+define void @vpstore_v2i64_allones_mask_vlmax(<2 x i64> %val, <2 x i64>* %ptr) {
+; CHECK-LABEL: @vpstore_v2i64_allones_mask_vlmax(
+; CHECK-NEXT:    store <2 x i64> [[VAL:%.*]], <2 x i64>* [[PTR:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.vp.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> <i1 1, i1 1>, i32 2)
+  ret void
+}
+
+; Scalable vectors
+define <vscale x 1 x i64> @vpload_nxv1i64(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_nxv1i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], [[M:%.*]]
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]], <vscale x 1 x i64> undef)
+; CHECK-NEXT:    ret <vscale x 1 x i64> [[TMP3]]
+;
+  %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i64> %load
+}
+
+define <vscale x 1 x i64> @vpload_nxv1i64_vscale(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m) {
+; CHECK-LABEL: @vpload_nxv1i64_vscale(
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[M:%.*]], <vscale x 1 x i64> undef)
+; CHECK-NEXT:    ret <vscale x 1 x i64> [[TMP1]]
+;
+  %vscale = call i32 @llvm.vscale.i32()
+  %vlmax = mul nuw i32 %vscale, 1
+  %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 %vlmax)
+  ret <vscale x 1 x i64> %load
+}
+
+define <vscale x 1 x i64> @vpload_nxv1i64_allones_mask(<vscale x 1 x i64>* %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: @vpload_nxv1i64_allones_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]], <vscale x 1 x i64> undef)
+; CHECK-NEXT:    ret <vscale x 1 x i64> [[TMP3]]
+;
+  %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 %evl)
+  ret <vscale x 1 x i64> %load
+}
+
+define <vscale x 1 x i64> @vpload_nxv1i64_allones_mask_vscale(<vscale x 1 x i64>* %ptr) {
+; CHECK-LABEL: @vpload_nxv1i64_allones_mask_vscale(
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 1 x i64>, <vscale x 1 x i64>* [[PTR:%.*]], align 8
+; CHECK-NEXT:    ret <vscale x 1 x i64> [[TMP1]]
+;
+  %vscale = call i32 @llvm.vscale.i32()
+  %vlmax = mul nuw i32 %vscale, 1
+  %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 %vlmax)
+  ret <vscale x 1 x i64> %load
+}
+
+define void @vpstore_nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: @vpstore_nxv1i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], [[M:%.*]]
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    call void @llvm.masked.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> [[VAL:%.*]], <vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.vp.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 %evl)
+  ret void
+}
+
+define void @vpstore_nxv1i64_vscale(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: @vpstore_nxv1i64_vscale(
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    call void @llvm.masked.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> [[VAL:%.*]], <vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[M:%.*]])
+; CHECK-NEXT:    ret void
+;
+  %vscale = call i32 @llvm.vscale.i32()
+  %vlmax = mul nuw i32 %vscale, 1
+  call void @llvm.vp.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 %vlmax)
+  ret void
+}
+
+define void @vpstore_nxv1i64_allones_mask(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: @vpstore_nxv1i64_allones_mask(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    call void @llvm.masked.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> [[VAL:%.*]], <vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.vp.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 %evl)
+  ret void
+}
+
+define void @vpstore_nxv1i64_allones_mask_vscale(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr) {
+; CHECK-LABEL: @vpstore_nxv1i64_allones_mask_vscale(
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
+; CHECK-NEXT:    store <vscale x 1 x i64> [[VAL:%.*]], <vscale x 1 x i64>* [[PTR:%.*]], align 8
+; CHECK-NEXT:    ret void
+;
+  %vscale = call i32 @llvm.vscale.i32()
+  %vlmax = mul nuw i32 %vscale, 1
+  call void @llvm.vp.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 %vlmax)
+  ret void
+}
+
+declare i32 @llvm.vscale.i32()
+
+declare <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>*, <2 x i1>, i32)
+declare void @llvm.vp.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, <2 x i1>, i32)
+
+declare <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>*, <vscale x 1 x i1>, i32)
+declare void @llvm.vp.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>*, <vscale x 1 x i1>, i32)
