[llvm] [SPIR-V] Explicitly emit vector element count for OpenCL vloadn calls (PR #81148)

Sat Feb 10 02:02:41 PST 2024

https://github.com/michalpaszkowski updated https://github.com/llvm/llvm-project/pull/81148

>From 728895e1c7f5301327d43100b32b6739a8eadcfd Mon Sep 17 00:00:00 2001
From: Michal Paszkowski <michal at paszkowski.org>
Date: Thu, 8 Feb 2024 07:23:03 -0800
Subject: [PATCH 1/3] [SPIR-V] Explicitly emit vector element count for OpenCL
 vloadn calls

---
 llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp       |  2 +
 llvm/lib/Target/SPIRV/SPIRVBuiltins.td        | 16 +++-
 .../SPIRV/opencl/basic/vstore_private.ll      | 95 -------------------
 llvm/test/CodeGen/SPIRV/opencl/vload2.ll      | 40 ++++++++
 4 files changed, 53 insertions(+), 100 deletions(-)
 delete mode 100644 llvm/test/CodeGen/SPIRV/opencl/basic/vstore_private.ll
 create mode 100644 llvm/test/CodeGen/SPIRV/opencl/vload2.ll

diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index e4593e7db90e8b..572a9afe14b265 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -114,6 +114,7 @@ struct VectorLoadStoreBuiltin {
   StringRef Name;
   InstructionSet::InstructionSet Set;
   uint32_t Number;
+  uint32_t ElementCount;
   bool IsRounded;
   FPRoundingMode::FPRoundingMode RoundingMode;
 };
@@ -1851,6 +1852,7 @@ static bool generateVectorLoadStoreInst(const SPIRV::IncomingCall *Call,
           .addImm(Builtin->Number);
   for (auto Argument : Call->Arguments)
     MIB.addUse(Argument);
+  MIB.addImm(Builtin->ElementCount);
 
   // Rounding mode should be passed as a last argument in the MI for builtins
   // like "vstorea_halfn_r".
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index 8acd4691787e4c..63ca0a909b69c3 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -1046,18 +1046,24 @@ class VectorLoadStoreBuiltin<string name, InstructionSet set, int number> {
   string Name = name;
   InstructionSet Set = set;
   bits<32> Number = number;
+  bits<32> ElementCount = !cond(!not(!eq(!find(name, "2"), -1)) : 2,
+                                !not(!eq(!find(name, "3"), -1)) : 3,
+                                !not(!eq(!find(name, "4"), -1)) : 4,
+                                !not(!eq(!find(name, "8"), -1)) : 8,
+                                !not(!eq(!find(name, "16"), -1)) : 16,
+                                true : 1);
   bit IsRounded = !not(!eq(!find(name, "_rt"), -1));
   FPRoundingMode RoundingMode = !cond(!not(!eq(!find(name, "_rte"), -1)) : RTE,
-                                  !not(!eq(!find(name, "_rtz"), -1)) : RTZ,
-                                  !not(!eq(!find(name, "_rtp"), -1)) : RTP,
-                                  !not(!eq(!find(name, "_rtn"), -1)) : RTN,
-                                  true : RTE);
+                                      !not(!eq(!find(name, "_rtz"), -1)) : RTZ,
+                                      !not(!eq(!find(name, "_rtp"), -1)) : RTP,
+                                      !not(!eq(!find(name, "_rtn"), -1)) : RTN,
+                                      true : RTE);
 }
 
 // Table gathering all the vector data load/store builtins.
 def VectorLoadStoreBuiltins : GenericTable {
   let FilterClass = "VectorLoadStoreBuiltin";
-  let Fields = ["Name", "Set", "Number", "IsRounded", "RoundingMode"];
+  let Fields = ["Name", "Set", "Number", "ElementCount", "IsRounded", "RoundingMode"];
   string TypeOf_Set = "InstructionSet";
   string TypeOf_RoundingMode = "FPRoundingMode";
 }
diff --git a/llvm/test/CodeGen/SPIRV/opencl/basic/vstore_private.ll b/llvm/test/CodeGen/SPIRV/opencl/basic/vstore_private.ll
deleted file mode 100644
index 40f1d59e4365e1..00000000000000
--- a/llvm/test/CodeGen/SPIRV/opencl/basic/vstore_private.ll
+++ /dev/null
@@ -1,95 +0,0 @@
-; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
-
-; TODO(#60133): Requires updates following opaque pointer migration.
-; XFAIL: *
-
-; CHECK: %[[#i16_ty:]] = OpTypeInt 16 0
-; CHECK: %[[#v4xi16_ty:]] = OpTypeVector %[[#i16_ty]] 4
-; CHECK: %[[#pv4xi16_ty:]] = OpTypePointer Function %[[#v4xi16_ty]]
-; CHECK: %[[#i16_const0:]] = OpConstant %[[#i16_ty]] 0
-; CHECK: %[[#i16_undef:]] = OpUndef %[[#i16_ty]]
-; CHECK: %[[#comp_const:]] = OpConstantComposite %[[#v4xi16_ty]] %[[#i16_const0]] %[[#i16_const0]] %[[#i16_const0]] %[[#i16_undef]]
-
-; CHECK: %[[#r:]] = OpInBoundsPtrAccessChain
-; CHECK: %[[#r2:]] = OpBitcast %[[#pv4xi16_ty]] %[[#r]]
-; CHECK: OpStore %[[#r2]] %[[#comp_const]] Aligned 8
-
-define spir_kernel void @test_fn(i16 addrspace(1)* %srcValues, i32 addrspace(1)* %offsets, <3 x i16> addrspace(1)* %destBuffer, i32 %alignmentOffset) {
-entry:
-  %sPrivateStorage = alloca [42 x <3 x i16>], align 8
-  %0 = bitcast [42 x <3 x i16>]* %sPrivateStorage to i8*
-  %1 = bitcast i8* %0 to i8*
-  call void @llvm.lifetime.start.p0i8(i64 336, i8* %1)
-  %2 = call spir_func <3 x i64> @BuiltInGlobalInvocationId()
-  %call = extractelement <3 x i64> %2, i32 0
-  %conv = trunc i64 %call to i32
-  %idxprom = sext i32 %conv to i64
-  %arrayidx = getelementptr inbounds [42 x <3 x i16>], [42 x <3 x i16>]* %sPrivateStorage, i64 0, i64 %idxprom
-  %storetmp = bitcast <3 x i16>* %arrayidx to <4 x i16>*
-  store <4 x i16> <i16 0, i16 0, i16 0, i16 undef>, <4 x i16>* %storetmp, align 8
-  %conv1 = sext i32 %conv to i64
-  %call2 = call spir_func <3 x i16> @OpenCL_vload3_i64_p1i16_i32(i64 %conv1, i16 addrspace(1)* %srcValues, i32 3)
-  %idxprom3 = sext i32 %conv to i64
-  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %offsets, i64 %idxprom3
-  %3 = load i32, i32 addrspace(1)* %arrayidx4, align 4
-  %conv5 = zext i32 %3 to i64
-  %arraydecay = getelementptr inbounds [42 x <3 x i16>], [42 x <3 x i16>]* %sPrivateStorage, i64 0, i64 0
-  %4 = bitcast <3 x i16>* %arraydecay to i16*
-  %idx.ext = zext i32 %alignmentOffset to i64
-  %add.ptr = getelementptr inbounds i16, i16* %4, i64 %idx.ext
-  call spir_func void @OpenCL_vstore3_v3i16_i64_p0i16(<3 x i16> %call2, i64 %conv5, i16* %add.ptr)
-  %arraydecay6 = getelementptr inbounds [42 x <3 x i16>], [42 x <3 x i16>]* %sPrivateStorage, i64 0, i64 0
-  %5 = bitcast <3 x i16>* %arraydecay6 to i16*
-  %idxprom7 = sext i32 %conv to i64
-  %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %offsets, i64 %idxprom7
-  %6 = load i32, i32 addrspace(1)* %arrayidx8, align 4
-  %mul = mul i32 3, %6
-  %idx.ext9 = zext i32 %mul to i64
-  %add.ptr10 = getelementptr inbounds i16, i16* %5, i64 %idx.ext9
-  %idx.ext11 = zext i32 %alignmentOffset to i64
-  %add.ptr12 = getelementptr inbounds i16, i16* %add.ptr10, i64 %idx.ext11
-  %7 = bitcast <3 x i16> addrspace(1)* %destBuffer to i16 addrspace(1)*
-  %idxprom13 = sext i32 %conv to i64
-  %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %offsets, i64 %idxprom13
-  %8 = load i32, i32 addrspace(1)* %arrayidx14, align 4
-  %mul15 = mul i32 3, %8
-  %idx.ext16 = zext i32 %mul15 to i64
-  %add.ptr17 = getelementptr inbounds i16, i16 addrspace(1)* %7, i64 %idx.ext16
-  %idx.ext18 = zext i32 %alignmentOffset to i64
-  %add.ptr19 = getelementptr inbounds i16, i16 addrspace(1)* %add.ptr17, i64 %idx.ext18
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc, %entry
-  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
-  %cmp = icmp ult i32 %i.0, 3
-  br i1 %cmp, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  %idxprom21 = zext i32 %i.0 to i64
-  %arrayidx22 = getelementptr inbounds i16, i16* %add.ptr12, i64 %idxprom21
-  %9 = load i16, i16* %arrayidx22, align 2
-  %idxprom23 = zext i32 %i.0 to i64
-  %arrayidx24 = getelementptr inbounds i16, i16 addrspace(1)* %add.ptr19, i64 %idxprom23
-  store i16 %9, i16 addrspace(1)* %arrayidx24, align 2
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body
-  %inc = add i32 %i.0, 1
-  br label %for.cond
-
-for.end:                                          ; preds = %for.cond
-  %10 = bitcast [42 x <3 x i16>]* %sPrivateStorage to i8*
-  %11 = bitcast i8* %10 to i8*
-  call void @llvm.lifetime.end.p0i8(i64 336, i8* %11)
-  ret void
-}
-
-declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
-
-declare spir_func <3 x i16> @OpenCL_vload3_i64_p1i16_i32(i64, i16 addrspace(1)*, i32)
-
-declare spir_func void @OpenCL_vstore3_v3i16_i64_p0i16(<3 x i16>, i64, i16*)
-
-declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)
-
-declare spir_func <3 x i64> @BuiltInGlobalInvocationId()
diff --git a/llvm/test/CodeGen/SPIRV/opencl/vload2.ll b/llvm/test/CodeGen/SPIRV/opencl/vload2.ll
new file mode 100644
index 00000000000000..f7d380b96a3ef0
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/opencl/vload2.ll
@@ -0,0 +1,40 @@
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; This test only itends to check the vloadn builtin lowering. 
+; The calls to the OpenCL builtins are not valid and will not pass SPIR-V validation.
+
+; CHECK-DAG: %[[#IMPORT:]] = OpExtInstImport "OpenCL.std"
+
+; CHECK-DAG: %[[#INT8:]] = OpTypeInt 8 0
+; CHECK-DAG: %[[#INT16:]] = OpTypeInt 16 0
+; CHECK-DAG: %[[#INT32:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#INT64:]] = OpTypeInt 64 0
+; CHECK-DAG: %[[#FLOAT:]] = OpTypeFloat 32
+; CHECK-DAG: %[[#VINT8:]] = OpTypeVector %[[#INT8]] 2
+; CHECK-DAG: %[[#VINT16:]] = OpTypeVector %[[#INT16]] 2
+; CHECK-DAG: %[[#VINT32:]] = OpTypeVector %[[#INT32]] 2
+; CHECK-DAG: %[[#VINT64:]] = OpTypeVector %[[#INT64]] 2
+; CHECK-DAG: %[[#VFLOAT:]] = OpTypeVector %[[#FLOAT]] 2
+; CHECK-DAG: %[[#PTRINT8:]] = OpTypePointer CrossWorkgroup %[[#INT8]]
+
+; CHECK: %[[#OFFSET:]] = OpFunctionParameter %[[#INT64]]
+; CHECK: %[[#ADDRESS:]] = OpFunctionParameter %[[#PTRINT8]]
+
+define spir_kernel void @test_fn(i64 %offset, ptr addrspace(1) %address) {
+; CHECK: %[[#]] = OpExtInst %[[#VINT8]] %[[#IMPORT]] vloadn %[[#OFFSET]] %[[#ADDRESS]] 2
+  %call1 = call spir_func <2 x i8> @_Z6vload2mPU3AS1Kc(i64 %offset, ptr addrspace(1) %address)
+; CHECK: %[[#]] = OpExtInst %[[#VINT16]] %[[#IMPORT]] vloadn %[[#OFFSET]] %[[#ADDRESS]] 2
+  %call2 = call spir_func <2 x i16> @_Z6vload2mPU3AS1Ks(i64 %offset, ptr addrspace(1) %address)
+; CHECK: %[[#]] = OpExtInst %[[#VINT32]] %[[#IMPORT]] vloadn %[[#OFFSET]] %[[#ADDRESS]] 2
+  %call3 = call spir_func <2 x i32> @_Z6vload2mPU3AS1Ki(i64 %offset, ptr addrspace(1) %address)
+; CHECK: %[[#]] = OpExtInst %[[#VINT64]] %[[#IMPORT]] vloadn %[[#OFFSET]] %[[#ADDRESS]] 2
+  %call4 = call spir_func <2 x i64> @_Z6vload2mPU3AS1Kl(i64 %offset, ptr addrspace(1) %address)
+; CHECK: %[[#]] = OpExtInst %[[#VFLOAT]] %[[#IMPORT]] vloadn %[[#OFFSET]] %[[#ADDRESS]] 2
+  %call5 = call spir_func <2 x float> @_Z6vload2mPU3AS1Kf(i64 %offset, ptr addrspace(1) %address)
+  ret void
+}
+
+declare spir_func <2 x i8> @_Z6vload2mPU3AS1Kc(i64, ptr addrspace(1))
+declare spir_func <2 x i16> @_Z6vload2mPU3AS1Ks(i64, ptr addrspace(1))
+declare spir_func <2 x i32> @_Z6vload2mPU3AS1Ki(i64, ptr addrspace(1))
+declare spir_func <2 x i64> @_Z6vload2mPU3AS1Kl(i64, ptr addrspace(1))
+declare spir_func <2 x float> @_Z6vload2mPU3AS1Kf(i64, ptr addrspace(1))

>From c450369e1979180838ed97dd671912e6bfdb6e9d Mon Sep 17 00:00:00 2001
From: Michal Paszkowski <michal at paszkowski.org>
Date: Thu, 8 Feb 2024 11:28:14 -0800
Subject: [PATCH 2/3] [SPIR-V] Generalize/simplify generating bitcasts for ptr
 kernel args

---
 llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 27 ++++++++--------
 .../pointers/getelementptr-kernel-arg-char.ll | 31 +++++++++++++++++++
 2 files changed, 44 insertions(+), 14 deletions(-)
 create mode 100644 llvm/test/CodeGen/SPIRV/pointers/getelementptr-kernel-arg-char.ll

diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index 26a5d7a30f19dd..e32cd50be56e38 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -290,25 +290,14 @@ void SPIRVEmitIntrinsics::insertPtrCastInstr(Instruction *I) {
   Value *Pointer;
   Type *ExpectedElementType;
   unsigned OperandToReplace;
-  bool AllowCastingToChar = false;
 
   StoreInst *SI = dyn_cast<StoreInst>(I);
   if (SI && F->getCallingConv() == CallingConv::SPIR_KERNEL &&
       SI->getValueOperand()->getType()->isPointerTy() &&
       isa<Argument>(SI->getValueOperand())) {
-    Argument *Arg = cast<Argument>(SI->getValueOperand());
-    MDString *ArgType = getOCLKernelArgType(*Arg->getParent(), Arg->getArgNo());
-    if (!ArgType || ArgType->getString().starts_with("uchar*"))
-      return;
-
-    // Handle special case when StoreInst's value operand is a kernel argument
-    // of a pointer type. Since these arguments could have either a basic
-    // element type (e.g. float*) or OpenCL builtin type (sampler_t), bitcast
-    // the StoreInst's value operand to default pointer element type (i8).
-    Pointer = Arg;
+    Pointer = SI->getValueOperand();
     ExpectedElementType = IntegerType::getInt8Ty(F->getContext());
     OperandToReplace = 0;
-    AllowCastingToChar = true;
   } else if (SI) {
     Pointer = SI->getPointerOperand();
     ExpectedElementType = SI->getValueOperand()->getType();
@@ -390,10 +379,20 @@ void SPIRVEmitIntrinsics::insertPtrCastInstr(Instruction *I) {
   }
 
   // Do not emit spv_ptrcast if it would cast to the default pointer element
-  // type (i8) of the same address space.
-  if (ExpectedElementType->isIntegerTy(8) && !AllowCastingToChar)
+  // type (i8) of the same address space. In case of OpenCL kernels, make sure
+  // i8 is the pointer element type defined for the given kernel argument.
+  if (ExpectedElementType->isIntegerTy(8) &&
+      F->getCallingConv() != CallingConv::SPIR_KERNEL)
     return;
 
+  Argument *Arg = dyn_cast<Argument>(Pointer);
+  if (ExpectedElementType->isIntegerTy(8) &&
+      F->getCallingConv() == CallingConv::SPIR_KERNEL && Arg) {
+    MDString *ArgType = getOCLKernelArgType(*Arg->getParent(), Arg->getArgNo());
+    if (ArgType && ArgType->getString().starts_with("uchar*"))
+      return;
+  }
+
   // If this would be the first spv_ptrcast, the pointer's defining instruction
   // requires spv_assign_ptr_type and does not already have one, do not emit
   // spv_ptrcast and emit spv_assign_ptr_type instead.
diff --git a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-kernel-arg-char.ll b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-kernel-arg-char.ll
new file mode 100644
index 00000000000000..cca71d409d258d
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-kernel-arg-char.ll
@@ -0,0 +1,31 @@
+
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: %[[#INT8:]] = OpTypeInt 8 0
+; CHECK-DAG: %[[#INT64:]] = OpTypeInt 64 0
+; CHECK-DAG: %[[#VINT8:]] = OpTypeVector %[[#INT8]] 2
+; CHECK-DAG: %[[#PTRINT8:]] = OpTypePointer Workgroup %[[#INT8]]
+; CHECK-DAG: %[[#PTRVINT8:]] = OpTypePointer Workgroup %[[#VINT8]]
+; CHECK-DAG: %[[#CONST:]] = OpConstant %[[#INT64]] 1
+
+; CHECK: %[[#PARAM1:]] = OpFunctionParameter %[[#PTRVINT8]]
+define spir_kernel void @test1(ptr addrspace(3) %address) !kernel_arg_type !1 {
+; CHECK: %[[#BITCAST1:]] = OpBitcast %[[#PTRINT8]] %[[#PARAM1]]
+; CHECK: %[[#]] = OpInBoundsPtrAccessChain %[[#PTRINT8]] %[[#BITCAST1]] %[[#CONST]]
+  %cast = bitcast ptr addrspace(3) %address to ptr addrspace(3)
+  %gep = getelementptr inbounds i8, ptr addrspace(3) %cast, i64 1
+  ret void
+}
+
+; CHECK: %[[#PARAM2:]] = OpFunctionParameter %[[#PTRVINT8]]
+define spir_kernel void @test2(ptr addrspace(3) %address) !kernel_arg_type !1 {
+; CHECK: %[[#BITCAST2:]] = OpBitcast %[[#PTRINT8]] %[[#PARAM2]]
+; CHECK: %[[#]] = OpInBoundsPtrAccessChain %[[#PTRINT8]] %[[#BITCAST2]] %[[#CONST]]
+  %gep = getelementptr inbounds i8, ptr addrspace(3) %address, i64 1
+  ret void
+}
+
+declare spir_func <2 x i8> @_Z6vload2mPU3AS3Kc(i64, ptr addrspace(3))
+
+!1 = !{!"char2*"}

>From 013542f14eefa4b15e55185065712851fbbec551 Mon Sep 17 00:00:00 2001
From: Michal Paszkowski <michal at paszkowski.org>
Date: Sat, 10 Feb 2024 02:01:52 -0800
Subject: [PATCH 3/3] [SPIR-V] Fix comment in vload2.ll test

---
 llvm/test/CodeGen/SPIRV/opencl/vload2.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/SPIRV/opencl/vload2.ll b/llvm/test/CodeGen/SPIRV/opencl/vload2.ll
index f7d380b96a3ef0..b219aebc29befe 100644
--- a/llvm/test/CodeGen/SPIRV/opencl/vload2.ll
+++ b/llvm/test/CodeGen/SPIRV/opencl/vload2.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
-; This test only itends to check the vloadn builtin lowering. 
+; This test only intends to check the vloadn builtin name resolution.
 ; The calls to the OpenCL builtins are not valid and will not pass SPIR-V validation.
 
 ; CHECK-DAG: %[[#IMPORT:]] = OpExtInstImport "OpenCL.std"