[llvm] 49c2206 - [VP] Preserve address space of pointer for strided load/store intrinsics.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 22 09:53:05 PDT 2022


Author: Craig Topper
Date: 2022-03-22T09:52:54-07:00
New Revision: 49c2206b3bdce4a37a4602527b2d3da673514333

URL: https://github.com/llvm/llvm-project/commit/49c2206b3bdce4a37a4602527b2d3da673514333
DIFF: https://github.com/llvm/llvm-project/commit/49c2206b3bdce4a37a4602527b2d3da673514333.diff

LOG: [VP] Preserve address space of pointer for strided load/store intrinsics.

This adds LLVMAnyPointerToElt as a replacement for LLVMPointerToElt.
This lets us preserve the address space as part of the type overload
for the intrinsic, while still requiring the vector element type to
match the pointer's element type.

Reviewed By: nikic

Differential Revision: https://reviews.llvm.org/D122042
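
For example (taken from the test updates below), the pointer type is now
part of the mangled intrinsic name, so a non-zero address space can be
expressed:

; before: only the result vector and stride types were overloaded
declare <256 x float> @llvm.experimental.vp.strided.load.v256f32.i64(float*, i64, <256 x i1>, i32)
; after: the pointer type (and with it the address space) is encoded
declare <256 x float> @llvm.experimental.vp.strided.load.v256f32.p0f32.i64(float*, i64, <256 x i1>, i32)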

Added: 
    

Modified: 
    llvm/include/llvm/IR/Intrinsics.h
    llvm/include/llvm/IR/Intrinsics.td
    llvm/lib/IR/Function.cpp
    llvm/lib/IR/IntrinsicInst.cpp
    llvm/test/CodeGen/VE/Vector/vp_strided_load.ll
    llvm/test/CodeGen/VE/Vector/vp_strided_store.ll
    llvm/unittests/IR/VPIntrinsicTest.cpp
    llvm/utils/TableGen/IntrinsicEmitter.cpp

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index 2ff48380ac282..2b4e37a6037b6 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -142,6 +142,7 @@ namespace Intrinsic {
       VecOfBitcastsToInt,
       AMX,
       PPCQuad,
+      AnyPtrToElt,
     } Kind;
 
     union {
@@ -180,14 +181,15 @@ namespace Intrinsic {
       return (ArgKind)(Argument_Info & 7);
     }
 
-    // VecOfAnyPtrsToElt uses both an overloaded argument (for address space)
-    // and a reference argument (for matching vector width and element types)
+    // VecOfAnyPtrsToElt and AnyPtrToElt use both an overloaded argument (for
+    // address space) and a reference argument (for matching vector width and
+    // element types)
     unsigned getOverloadArgNumber() const {
-      assert(Kind == VecOfAnyPtrsToElt);
+      assert(Kind == VecOfAnyPtrsToElt || Kind == AnyPtrToElt);
       return Argument_Info >> 16;
     }
     unsigned getRefArgNumber() const {
-      assert(Kind == VecOfAnyPtrsToElt);
+      assert(Kind == VecOfAnyPtrsToElt || Kind == AnyPtrToElt);
       return Argument_Info & 0xFFFF;
     }
 

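A minimal standalone sketch of the Argument_Info packing the accessors
above assume (overloaded argument number in the high 16 bits, reference
argument number in the low 16 bits); the variable names here are
illustrative, not LLVM's:

#include <cassert>

int main() {
  unsigned OverloadArgNo = 1; // e.g. the pointer operand's overload slot
  unsigned RefArgNo = 0;      // e.g. the vector type it must match
  unsigned Argument_Info = (OverloadArgNo << 16) | (RefArgNo & 0xFFFF);
  assert(Argument_Info >> 16 == OverloadArgNo);   // getOverloadArgNumber()
  assert((Argument_Info & 0xFFFF) == RefArgNo);   // getRefArgNumber()
  return 0;
}
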
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 244d73cec88a1..27d66dd6f56c1 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -212,6 +212,7 @@ class LLVMScalarOrSameVectorWidth<int idx, LLVMType elty>
 
 class LLVMPointerTo<int num> : LLVMMatchType<num>;
 class LLVMPointerToElt<int num> : LLVMMatchType<num>;
+class LLVMAnyPointerToElt<int num> : LLVMMatchType<num>;
 class LLVMVectorOfAnyPointersToElt<int num> : LLVMMatchType<num>;
 class LLVMVectorElementType<int num> : LLVMMatchType<num>;
 
@@ -1412,14 +1413,14 @@ def int_vp_scatter: DefaultAttrsIntrinsic<[],
 // Experimental strided memory accesses
 def int_experimental_vp_strided_store : DefaultAttrsIntrinsic<[],
                              [ llvm_anyvector_ty,
-                               LLVMPointerToElt<0>,
+                               LLVMAnyPointerToElt<0>,
                                llvm_anyint_ty, // Stride in bytes
                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                                llvm_i32_ty],
                              [ NoCapture<ArgIndex<1>>, IntrNoSync, IntrWriteMem, IntrArgMemOnly, IntrWillReturn ]>;
 
 def int_experimental_vp_strided_load  : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                             [ LLVMPointerToElt<0>,
+                             [ LLVMAnyPointerToElt<0>,
                                llvm_anyint_ty, // Stride in bytes
                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                                llvm_i32_ty],

diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index fcc0ef0459d02..3298591be40a5 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -983,7 +983,8 @@ enum IIT_Info {
   IIT_PPCF128 = 52,
   IIT_V3 = 53,
   IIT_EXTERNREF = 54,
-  IIT_FUNCREF = 55
+  IIT_FUNCREF = 55,
+  IIT_ANYPTR_TO_ELT = 56,
 };
 
 static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
@@ -1157,6 +1158,13 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
     OutputTable.push_back(IITDescriptor::get(IITDescriptor::PtrToElt, ArgInfo));
     return;
   }
+  case IIT_ANYPTR_TO_ELT: {
+    unsigned short ArgNo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    unsigned short RefNo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    OutputTable.push_back(
+        IITDescriptor::get(IITDescriptor::AnyPtrToElt, ArgNo, RefNo));
+    return;
+  }
   case IIT_VEC_OF_ANYPTRS_TO_ELT: {
     unsigned short ArgNo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
     unsigned short RefNo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
@@ -1348,6 +1356,9 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
   case IITDescriptor::VecOfAnyPtrsToElt:
     // Return the overloaded type (which determines the pointer's address space)
     return Tys[D.getOverloadArgNumber()];
+  case IITDescriptor::AnyPtrToElt:
+    // Return the overloaded type (which determines the pointer's address space)
+    return Tys[D.getOverloadArgNumber()];
   }
   llvm_unreachable("unhandled");
 }
@@ -1592,6 +1603,30 @@ static bool matchIntrinsicType(
       return !ThisArgType->isOpaqueOrPointeeTypeMatches(
           ReferenceType->getElementType());
     }
+    case IITDescriptor::AnyPtrToElt: {
+      unsigned RefArgNumber = D.getRefArgNumber();
+      if (RefArgNumber >= ArgTys.size()) {
+        if (IsDeferredCheck)
+          return true;
+        // If forward referencing, already add the pointer type and
+        // defer the checks for later.
+        ArgTys.push_back(Ty);
+        return DeferCheck(Ty);
+      }
+
+      if (!IsDeferredCheck) {
+        assert(D.getOverloadArgNumber() == ArgTys.size() &&
+               "Table consistency error");
+        ArgTys.push_back(Ty);
+      }
+
+      auto *ReferenceType = dyn_cast<VectorType>(ArgTys[RefArgNumber]);
+      auto *ThisArgType = dyn_cast<PointerType>(Ty);
+      if (!ThisArgType || !ReferenceType)
+        return true;
+      return !ThisArgType->isOpaqueOrPointeeTypeMatches(
+          ReferenceType->getElementType());
+    }
     case IITDescriptor::VecOfAnyPtrsToElt: {
       unsigned RefArgNumber = D.getRefArgNumber();
       if (RefArgNumber >= ArgTys.size()) {

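The new AnyPtrToElt case above accepts a pointer in any address space
but, for typed pointers, still requires the pointee to match the result
vector's element type. For illustration (this declaration also appears
in the unit test below), the following is accepted:

declare <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p1i32.i32(i32 addrspace(1)*, i32, <8 x i1>, i32)

while, say, a float* pointer with an <8 x i32> result would fail the
isOpaqueOrPointeeTypeMatches check.
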
diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp
index ad729b09db6ca..fd71cbba5f5e1 100644
--- a/llvm/lib/IR/IntrinsicInst.cpp
+++ b/llvm/lib/IR/IntrinsicInst.cpp
@@ -511,8 +511,8 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID,
         M, VPID, {ReturnType, Params[0]->getType()});
     break;
   case Intrinsic::experimental_vp_strided_load:
-    VPFunc =
-        Intrinsic::getDeclaration(M, VPID, {ReturnType, Params[1]->getType()});
+    VPFunc = Intrinsic::getDeclaration(
+        M, VPID, {ReturnType, Params[0]->getType(), Params[1]->getType()});
     break;
   case Intrinsic::vp_gather:
     VPFunc = Intrinsic::getDeclaration(
@@ -524,7 +524,8 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID,
     break;
   case Intrinsic::experimental_vp_strided_store:
     VPFunc = Intrinsic::getDeclaration(
-        M, VPID, {Params[0]->getType(), Params[2]->getType()});
+        M, VPID,
+        {Params[0]->getType(), Params[1]->getType(), Params[2]->getType()});
     break;
   case Intrinsic::vp_scatter:
     VPFunc = Intrinsic::getDeclaration(

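A sketch of the caller-visible effect of the change above, using a
hypothetical addrspace(1) pointer operand: with the pointer type passed
as an extra overload, getDeclarationForParams now yields a declaration
whose mangled name records the address space, e.g.

declare <256 x float> @llvm.experimental.vp.strided.load.v256f32.p1f32.i64(float addrspace(1)*, i64, <256 x i1>, i32)
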
diff --git a/llvm/test/CodeGen/VE/Vector/vp_strided_load.ll b/llvm/test/CodeGen/VE/Vector/vp_strided_load.ll
index 8d28cd733ade7..48b35cee34906 100644
--- a/llvm/test/CodeGen/VE/Vector/vp_strided_load.ll
+++ b/llvm/test/CodeGen/VE/Vector/vp_strided_load.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
 
-declare <256 x float> @llvm.experimental.vp.strided.load.v256f32.i64(float* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+declare <256 x float> @llvm.experimental.vp.strided.load.v256f32.p0f32.i64(float* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
 
 define fastcc <256 x float> @vp_strided_load_v256f32_rrm(float* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl) {
 ; CHECK-LABEL: vp_strided_load_v256f32_rrm:
@@ -13,7 +13,7 @@ define fastcc <256 x float> @vp_strided_load_v256f32_rrm(float* %ptr, i64 %strid
 ; CHECK-NEXT:    vaddu.l %v0, %s0, %v0, %vm1
 ; CHECK-NEXT:    vgtu %v0, %v0, 0, 0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
-  %r = call <256 x float> @llvm.experimental.vp.strided.load.v256f32.i64(float* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+  %r = call <256 x float> @llvm.experimental.vp.strided.load.v256f32.p0f32.i64(float* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
   ret <256 x float> %r
 }
 
@@ -26,7 +26,7 @@ define fastcc <256 x float> @vp_strided_load_v256f32_rr(float* %ptr, i64 %stride
 ; CHECK-NEXT:    b.l.t (, %s10)
   %one = insertelement <256 x i1> undef, i1 1, i32 0
   %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
-  %r = call <256 x float> @llvm.experimental.vp.strided.load.v256f32.i64(float* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
+  %r = call <256 x float> @llvm.experimental.vp.strided.load.v256f32.p0f32.i64(float* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
   ret <256 x float> %r
 }
 
@@ -39,11 +39,11 @@ define fastcc <256 x float> @vp_strided_load_v256f32_ri(float* %ptr, i32 %evl) {
 ; CHECK-NEXT:    b.l.t (, %s10)
   %one = insertelement <256 x i1> undef, i1 1, i32 0
   %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
-  %r = call <256 x float> @llvm.experimental.vp.strided.load.v256f32.i64(float* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
+  %r = call <256 x float> @llvm.experimental.vp.strided.load.v256f32.p0f32.i64(float* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
   ret <256 x float> %r
 }
 
-declare <256 x i32> @llvm.experimental.vp.strided.load.v256i32.i64(i32* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+declare <256 x i32> @llvm.experimental.vp.strided.load.v256i32.p0i32.i64(i32* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
 
 define fastcc <256 x i32> @vp_strided_load_v256i32_rrm(i32* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl) {
 ; CHECK-LABEL: vp_strided_load_v256i32_rrm:
@@ -55,7 +55,7 @@ define fastcc <256 x i32> @vp_strided_load_v256i32_rrm(i32* %ptr, i64 %stride, <
 ; CHECK-NEXT:    vaddu.l %v0, %s0, %v0, %vm1
 ; CHECK-NEXT:    vgtl.zx %v0, %v0, 0, 0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
-  %r = call <256 x i32> @llvm.experimental.vp.strided.load.v256i32.i64(i32* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+  %r = call <256 x i32> @llvm.experimental.vp.strided.load.v256i32.p0i32.i64(i32* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
   ret <256 x i32> %r
 }
 
@@ -68,7 +68,7 @@ define fastcc <256 x i32> @vp_strided_load_v256i32_rr(i32* %ptr, i64 %stride, i3
 ; CHECK-NEXT:    b.l.t (, %s10)
   %one = insertelement <256 x i1> undef, i1 1, i32 0
   %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
-  %r = call <256 x i32> @llvm.experimental.vp.strided.load.v256i32.i64(i32* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
+  %r = call <256 x i32> @llvm.experimental.vp.strided.load.v256i32.p0i32.i64(i32* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
   ret <256 x i32> %r
 }
 
@@ -81,11 +81,11 @@ define fastcc <256 x i32> @vp_strided_load_v256i32_ri(i32* %ptr, i32 %evl) {
 ; CHECK-NEXT:    b.l.t (, %s10)
   %one = insertelement <256 x i1> undef, i1 1, i32 0
   %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
-  %r = call <256 x i32> @llvm.experimental.vp.strided.load.v256i32.i64(i32* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
+  %r = call <256 x i32> @llvm.experimental.vp.strided.load.v256i32.p0i32.i64(i32* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
   ret <256 x i32> %r
 }
 
-declare <256 x double> @llvm.experimental.vp.strided.load.v256f64.i64(double* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+declare <256 x double> @llvm.experimental.vp.strided.load.v256f64.p0f64.i64(double* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
 
 define fastcc <256 x double> @vp_strided_load_v256f64_rrm(double* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl) {
 ; CHECK-LABEL: vp_strided_load_v256f64_rrm:
@@ -97,7 +97,7 @@ define fastcc <256 x double> @vp_strided_load_v256f64_rrm(double* %ptr, i64 %str
 ; CHECK-NEXT:    vaddu.l %v0, %s0, %v0, %vm1
 ; CHECK-NEXT:    vgt %v0, %v0, 0, 0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
-  %r = call <256 x double> @llvm.experimental.vp.strided.load.v256f64.i64(double* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+  %r = call <256 x double> @llvm.experimental.vp.strided.load.v256f64.p0f64.i64(double* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
   ret <256 x double> %r
 }
 
@@ -110,7 +110,7 @@ define fastcc <256 x double> @vp_strided_load_v256f64_rr(double* %ptr, i64 %stri
 ; CHECK-NEXT:    b.l.t (, %s10)
   %one = insertelement <256 x i1> undef, i1 1, i32 0
   %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
-  %r = call <256 x double> @llvm.experimental.vp.strided.load.v256f64.i64(double* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
+  %r = call <256 x double> @llvm.experimental.vp.strided.load.v256f64.p0f64.i64(double* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
   ret <256 x double> %r
 }
 
@@ -123,11 +123,11 @@ define fastcc <256 x double> @vp_strided_load_v256f64_ri(double* %ptr, i32 %evl)
 ; CHECK-NEXT:    b.l.t (, %s10)
   %one = insertelement <256 x i1> undef, i1 1, i32 0
   %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
-  %r = call <256 x double> @llvm.experimental.vp.strided.load.v256f64.i64(double* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
+  %r = call <256 x double> @llvm.experimental.vp.strided.load.v256f64.p0f64.i64(double* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
   ret <256 x double> %r
 }
 
-declare <256 x i64> @llvm.experimental.vp.strided.load.v256i64.i64(i64* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+declare <256 x i64> @llvm.experimental.vp.strided.load.v256i64.p0i64.i64(i64* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
 
 define fastcc <256 x i64> @vp_strided_load_v256i64_rrm(i64* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl) {
 ; CHECK-LABEL: vp_strided_load_v256i64_rrm:
@@ -139,7 +139,7 @@ define fastcc <256 x i64> @vp_strided_load_v256i64_rrm(i64* %ptr, i64 %stride, <
 ; CHECK-NEXT:    vaddu.l %v0, %s0, %v0, %vm1
 ; CHECK-NEXT:    vgt %v0, %v0, 0, 0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
-  %r = call <256 x i64> @llvm.experimental.vp.strided.load.v256i64.i64(i64* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+  %r = call <256 x i64> @llvm.experimental.vp.strided.load.v256i64.p0i64.i64(i64* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
   ret <256 x i64> %r
 }
 
@@ -152,7 +152,7 @@ define fastcc <256 x i64> @vp_strided_load_v256i64_rr(i64* %ptr, i64 %stride, i3
 ; CHECK-NEXT:    b.l.t (, %s10)
   %one = insertelement <256 x i1> undef, i1 1, i32 0
   %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
-  %r = call <256 x i64> @llvm.experimental.vp.strided.load.v256i64.i64(i64* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
+  %r = call <256 x i64> @llvm.experimental.vp.strided.load.v256i64.p0i64.i64(i64* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
   ret <256 x i64> %r
 }
 
@@ -165,6 +165,6 @@ define fastcc <256 x i64> @vp_strided_load_v256i64_ri(i64* %ptr, i32 %evl) {
 ; CHECK-NEXT:    b.l.t (, %s10)
   %one = insertelement <256 x i1> undef, i1 1, i32 0
   %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
-  %r = call <256 x i64> @llvm.experimental.vp.strided.load.v256i64.i64(i64* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
+  %r = call <256 x i64> @llvm.experimental.vp.strided.load.v256i64.p0i64.i64(i64* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
   ret <256 x i64> %r
 }

diff --git a/llvm/test/CodeGen/VE/Vector/vp_strided_store.ll b/llvm/test/CodeGen/VE/Vector/vp_strided_store.ll
index bd125ee7d2731..80ea6850e6732 100644
--- a/llvm/test/CodeGen/VE/Vector/vp_strided_store.ll
+++ b/llvm/test/CodeGen/VE/Vector/vp_strided_store.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
 
-declare void @llvm.experimental.vp.strided.store.v256f32.i64(<256 x float> %val, float* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+declare void @llvm.experimental.vp.strided.store.v256f32.p0f32.i64(<256 x float> %val, float* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
 
 define fastcc void @vp_strided_store_v256f32_rrm(<256 x float> %val, float* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl) {
 ; CHECK-LABEL: vp_strided_store_v256f32_rrm:
@@ -10,7 +10,7 @@ define fastcc void @vp_strided_store_v256f32_rrm(<256 x float> %val, float* %ptr
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vstu %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
-  call void @llvm.experimental.vp.strided.store.v256f32.i64(<256 x float> %val, float* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+  call void @llvm.experimental.vp.strided.store.v256f32.p0f32.i64(<256 x float> %val, float* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
   ret void
 }
 
@@ -23,7 +23,7 @@ define fastcc void @vp_strided_store_v256f32_rr(<256 x float> %val, float* %ptr,
 ; CHECK-NEXT:    b.l.t (, %s10)
   %one = insertelement <256 x i1> undef, i1 1, i32 0
   %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
-  call void @llvm.experimental.vp.strided.store.v256f32.i64(<256 x float> %val, float* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
+  call void @llvm.experimental.vp.strided.store.v256f32.p0f32.i64(<256 x float> %val, float* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
   ret void
 }
 
@@ -36,11 +36,11 @@ define fastcc void @vp_strided_store_v256f32_ri(<256 x float> %val, float* %ptr,
 ; CHECK-NEXT:    b.l.t (, %s10)
   %one = insertelement <256 x i1> undef, i1 1, i32 0
   %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
-  call void @llvm.experimental.vp.strided.store.v256f32.i64(<256 x float> %val, float* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
+  call void @llvm.experimental.vp.strided.store.v256f32.p0f32.i64(<256 x float> %val, float* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
   ret void
 }
 
-declare void @llvm.experimental.vp.strided.store.v256i32.i64(<256 x i32> %val, i32* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+declare void @llvm.experimental.vp.strided.store.v256i32.p0i32.i64(<256 x i32> %val, i32* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
 
 define fastcc void @vp_strided_store_v256i32_rrm(<256 x i32> %val, i32* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl) {
 ; CHECK-LABEL: vp_strided_store_v256i32_rrm:
@@ -49,7 +49,7 @@ define fastcc void @vp_strided_store_v256i32_rrm(<256 x i32> %val, i32* %ptr, i6
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vstl %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
-  call void @llvm.experimental.vp.strided.store.v256i32.i64(<256 x i32> %val, i32* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+  call void @llvm.experimental.vp.strided.store.v256i32.p0i32.i64(<256 x i32> %val, i32* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
   ret void
 }
 
@@ -62,7 +62,7 @@ define fastcc void @vp_strided_store_v256i32_rr(<256 x i32> %val, i32* %ptr, i64
 ; CHECK-NEXT:    b.l.t (, %s10)
   %one = insertelement <256 x i1> undef, i1 1, i32 0
   %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
-  call void @llvm.experimental.vp.strided.store.v256i32.i64(<256 x i32> %val, i32* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
+  call void @llvm.experimental.vp.strided.store.v256i32.p0i32.i64(<256 x i32> %val, i32* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
   ret void
 }
 
@@ -75,11 +75,11 @@ define fastcc void @vp_strided_store_v256i32_ri(<256 x i32> %val, i32* %ptr, i32
 ; CHECK-NEXT:    b.l.t (, %s10)
   %one = insertelement <256 x i1> undef, i1 1, i32 0
   %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
-  call void @llvm.experimental.vp.strided.store.v256i32.i64(<256 x i32> %val, i32* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
+  call void @llvm.experimental.vp.strided.store.v256i32.p0i32.i64(<256 x i32> %val, i32* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
   ret void
 }
 
-declare void @llvm.experimental.vp.strided.store.v256f64.i64(<256 x double> %val, double* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+declare void @llvm.experimental.vp.strided.store.v256f64.p0f64.i64(<256 x double> %val, double* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
 
 define fastcc void @vp_strided_store_v256f64_rrm(<256 x double> %val, double* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl) {
 ; CHECK-LABEL: vp_strided_store_v256f64_rrm:
@@ -88,7 +88,7 @@ define fastcc void @vp_strided_store_v256f64_rrm(<256 x double> %val, double* %p
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vst %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
-  call void @llvm.experimental.vp.strided.store.v256f64.i64(<256 x double> %val, double* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+  call void @llvm.experimental.vp.strided.store.v256f64.p0f64.i64(<256 x double> %val, double* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
   ret void
 }
 
@@ -101,7 +101,7 @@ define fastcc void @vp_strided_store_v256f64_rr(<256 x double> %val, double* %pt
 ; CHECK-NEXT:    b.l.t (, %s10)
   %one = insertelement <256 x i1> undef, i1 1, i32 0
   %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
-  call void @llvm.experimental.vp.strided.store.v256f64.i64(<256 x double> %val, double* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
+  call void @llvm.experimental.vp.strided.store.v256f64.p0f64.i64(<256 x double> %val, double* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
   ret void
 }
 
@@ -114,11 +114,11 @@ define fastcc void @vp_strided_store_v256f64_ri(<256 x double> %val, double* %pt
 ; CHECK-NEXT:    b.l.t (, %s10)
   %one = insertelement <256 x i1> undef, i1 1, i32 0
   %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
-  call void @llvm.experimental.vp.strided.store.v256f64.i64(<256 x double> %val, double* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
+  call void @llvm.experimental.vp.strided.store.v256f64.p0f64.i64(<256 x double> %val, double* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
   ret void
 }
 
-declare void @llvm.experimental.vp.strided.store.v256i64.i64(<256 x i64> %val, i64* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+declare void @llvm.experimental.vp.strided.store.v256i64.p0i64.i64(<256 x i64> %val, i64* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
 
 define fastcc void @vp_strided_store_v256i64_rrm(<256 x i64> %val, i64* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl) {
 ; CHECK-LABEL: vp_strided_store_v256i64_rrm:
@@ -127,7 +127,7 @@ define fastcc void @vp_strided_store_v256i64_rrm(<256 x i64> %val, i64* %ptr, i6
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    vst %v0, %s1, %s0, %vm1
 ; CHECK-NEXT:    b.l.t (, %s10)
-  call void @llvm.experimental.vp.strided.store.v256i64.i64(<256 x i64> %val, i64* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
+  call void @llvm.experimental.vp.strided.store.v256i64.p0i64.i64(<256 x i64> %val, i64* %ptr, i64 %stride, <256 x i1> %mask, i32 %evl)
   ret void
 }
 
@@ -140,7 +140,7 @@ define fastcc void @vp_strided_store_v256i64_rr(<256 x i64> %val, i64* %ptr, i64
 ; CHECK-NEXT:    b.l.t (, %s10)
   %one = insertelement <256 x i1> undef, i1 1, i32 0
   %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
-  call void @llvm.experimental.vp.strided.store.v256i64.i64(<256 x i64> %val, i64* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
+  call void @llvm.experimental.vp.strided.store.v256i64.p0i64.i64(<256 x i64> %val, i64* %ptr, i64 %stride, <256 x i1> %allones, i32 %evl)
   ret void
 }
 
@@ -153,6 +153,6 @@ define fastcc void @vp_strided_store_v256i64_ri(<256 x i64> %val, i64* %ptr, i32
 ; CHECK-NEXT:    b.l.t (, %s10)
   %one = insertelement <256 x i1> undef, i1 1, i32 0
   %allones = shufflevector <256 x i1> %one, <256 x i1> undef, <256 x i32> zeroinitializer
-  call void @llvm.experimental.vp.strided.store.v256i64.i64(<256 x i64> %val, i64* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
+  call void @llvm.experimental.vp.strided.store.v256i64.p0i64.i64(<256 x i64> %val, i64* %ptr, i64 24, <256 x i1> %allones, i32 %evl)
   ret void
 }

diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp
index be629cf760d06..68ab6bf391881 100644
--- a/llvm/unittests/IR/VPIntrinsicTest.cpp
+++ b/llvm/unittests/IR/VPIntrinsicTest.cpp
@@ -61,6 +61,9 @@ class VPIntrinsicTest : public testing::Test {
     Str << "declare void "
            "@llvm.experimental.vp.strided.store.v8i32.i32(<8 x i32>, "
            "i32*, i32, <8 x i1>, i32) ";
+    Str << "declare void "
+           "@llvm.experimental.vp.strided.store.v8i32.p1i32.i32(<8 x i32>, "
+           "i32 addrspace(1)*, i32, <8 x i1>, i32) ";
     Str << " declare void @llvm.vp.scatter.v8i32.v8p0i32(<8 x i32>, <8 x "
            "i32*>, <8 x i1>, i32) ";
     Str << " declare <8 x i32> @llvm.vp.load.v8i32.p0v8i32(<8 x i32>*, <8 x "
@@ -68,6 +71,9 @@ class VPIntrinsicTest : public testing::Test {
     Str << "declare <8 x i32> "
            "@llvm.experimental.vp.strided.load.v8i32.i32(i32*, i32, <8 "
            "x i1>, i32) ";
+    Str << "declare <8 x i32> "
+           "@llvm.experimental.vp.strided.load.v8i32.p1i32.i32(i32 "
+           "addrspace(1)*, i32, <8 x i1>, i32) ";
     Str << " declare <8 x i32> @llvm.vp.gather.v8i32.v8p0i32(<8 x i32*>, <8 x "
            "i1>, i32) ";
 

diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp
index a5aa4069e60ff..2cfae803d59c2 100644
--- a/llvm/utils/TableGen/IntrinsicEmitter.cpp
+++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp
@@ -252,7 +252,8 @@ enum IIT_Info {
   IIT_PPCF128 = 52,
   IIT_V3 = 53,
   IIT_EXTERNREF = 54,
-  IIT_FUNCREF = 55
+  IIT_FUNCREF = 55,
+  IIT_ANYPTR_TO_ELT = 56,
 };
 
 static void EncodeFixedValueType(MVT::SimpleValueType VT,
@@ -327,6 +328,13 @@ static void EncodeFixedType(Record *R, std::vector<unsigned char> &ArgCodes,
       // Encode LLVMMatchType<Number> ArgNo
       Sig.push_back(Number);
       return;
+    } else if (R->isSubClassOf("LLVMAnyPointerToElt")) {
+      Sig.push_back(IIT_ANYPTR_TO_ELT);
+      // Encode overloaded ArgNo
+      Sig.push_back(NextArgCode++);
+      // Encode LLVMMatchType<Number> ArgNo
+      Sig.push_back(Number);
+      return;
     } else if (R->isSubClassOf("LLVMPointerToElt"))
       Sig.push_back(IIT_PTR_TO_ELT);
     else if (R->isSubClassOf("LLVMVectorElementType"))
@@ -415,6 +423,9 @@ static void UpdateArgCodes(Record *R, std::vector<unsigned char> &ArgCodes,
     if (R->isSubClassOf("LLVMVectorOfAnyPointersToElt")) {
       ArgCodes.push_back(3 /*vAny*/);
       ++NumInserted;
+    } else if (R->isSubClassOf("LLVMAnyPointerToElt")) {
+      ArgCodes.push_back(4 /*iPTRAny*/);
+      ++NumInserted;
     }
     return;
   }

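As a hedged standalone sketch (not LLVM code): for LLVMAnyPointerToElt<0>
in the strided load, where the pointer is overload slot 1 and matches
type number 0, the encoding above emits three bytes that DecodeIITType
in Function.cpp reads back into an IITDescriptor::AnyPtrToElt:

#include <cstdio>
#include <vector>

int main() {
  enum { IIT_ANYPTR_TO_ELT = 56 };
  std::vector<unsigned char> Sig;
  Sig.push_back(IIT_ANYPTR_TO_ELT); // kind tag
  Sig.push_back(1);                 // overloaded argument number (NextArgCode)
  Sig.push_back(0);                 // LLVMMatchType<0> reference number
  for (unsigned char B : Sig)
    std::printf("%d ", B);          // prints: 56 1 0
  return 0;
}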

More information about the llvm-commits mailing list