[Mlir-commits] [mlir] [MLIR][GPU-LLVM] Convert `gpu.func` to `llvm.func` (PR #101664)

Mon Aug 5 05:53:23 PDT 2024

https://github.com/victor-eds updated https://github.com/llvm/llvm-project/pull/101664

>From 08332a6f1fddf9c5d161f3b79934df2cae5de11a Mon Sep 17 00:00:00 2001
From: Victor Perez <victor.perez at codeplay.com>
Date: Mon, 29 Jul 2024 13:28:47 +0100
Subject: [PATCH 1/3] [MLIR][GPU-LLVM] Convert `gpu.func` to `llvm.func`

Add support in `-convert-gpu-to-llvm-spv` to convert `gpu.func` to
`llvm.func` operations.

- `spir_kernel`/`spir_func` calling conventions used for
  kernels/functions.
- `workgroup` attributions encoded as additional `llvm.ptr<3>`
  arguments.
- No attribute used to annotate kernels.
- `reqd_work_group_size` attribute used to encode
  `gpu.known_block_size`.

**Note**: A notable missing feature that will be addressed in a
follow-up PR is a `-use-bare-ptr-memref-call-conv` option to replace
MemRef arguments with bare pointers to the MemRef element types
instead of the current MemRef descriptor approach.

Signed-off-by: Victor Perez <victor.perez at codeplay.com>
---
 .../SPIRVCommon/AttrToLLVMConverter.h         |  18 ++
 mlir/lib/Conversion/CMakeLists.txt            |   1 +
 .../Conversion/GPUCommon/GPUOpsLowering.cpp   | 144 ++++++---
 .../lib/Conversion/GPUCommon/GPUOpsLowering.h |  51 +++-
 .../Conversion/GPUToLLVMSPV/CMakeLists.txt    |   2 +
 .../Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp  |  25 +-
 .../GPUToNVVM/LowerGpuOpsToNVVMOps.cpp        |  16 +-
 .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp      |   9 +-
 .../SPIRVCommon/AttrToLLVMConverter.cpp       |  61 ++++
 .../lib/Conversion/SPIRVCommon/CMakeLists.txt |   6 +
 .../lib/Conversion/SPIRVToLLVM/CMakeLists.txt |   1 +
 .../Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp    |  47 +--
 .../GPUToLLVMSPV/gpu-to-llvm-spv.mlir         | 285 ++++++++++++++++++
 13 files changed, 556 insertions(+), 110 deletions(-)
 create mode 100644 mlir/include/mlir/Conversion/SPIRVCommon/AttrToLLVMConverter.h
 create mode 100644 mlir/lib/Conversion/SPIRVCommon/AttrToLLVMConverter.cpp
 create mode 100644 mlir/lib/Conversion/SPIRVCommon/CMakeLists.txt

diff --git a/mlir/include/mlir/Conversion/SPIRVCommon/AttrToLLVMConverter.h b/mlir/include/mlir/Conversion/SPIRVCommon/AttrToLLVMConverter.h
new file mode 100644
index 0000000000000..a99dd0fe6f133
--- /dev/null
+++ b/mlir/include/mlir/Conversion/SPIRVCommon/AttrToLLVMConverter.h
@@ -0,0 +1,18 @@
+//===- AttrToLLVMConverter.h - SPIR-V attributes conversion to LLVM - C++ -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef MLIR_CONVERSION_SPIRVCOMMON_ATTRTOLLVMCONVERTER_H_
+#define MLIR_CONVERSION_SPIRVCOMMON_ATTRTOLLVMCONVERTER_H_
+
+#include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h"
+
+namespace mlir {
+unsigned storageClassToAddressSpace(spirv::ClientAPI clientAPI,
+                                    spirv::StorageClass storageClass);
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_SPIRVCOMMON_ATTRTOLLVMCONVERTER_H_
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
index 80c8b84d9ae89..813f700c5556e 100644
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -53,6 +53,7 @@ add_subdirectory(SCFToGPU)
 add_subdirectory(SCFToOpenMP)
 add_subdirectory(SCFToSPIRV)
 add_subdirectory(ShapeToStandard)
+add_subdirectory(SPIRVCommon)
 add_subdirectory(SPIRVToLLVM)
 add_subdirectory(TensorToLinalg)
 add_subdirectory(TensorToSPIRV)
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index 6053e34f30a41..0007294b3ff27 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -25,29 +25,58 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
   Location loc = gpuFuncOp.getLoc();
 
   SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
-  workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
-  for (const auto [idx, attribution] :
-       llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
-    auto type = dyn_cast<MemRefType>(attribution.getType());
-    assert(type && type.hasStaticShape() && "unexpected type in attribution");
-
-    uint64_t numElements = type.getNumElements();
-
-    auto elementType =
-        cast<Type>(typeConverter->convertType(type.getElementType()));
-    auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
-    std::string name =
-        std::string(llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), idx));
-    uint64_t alignment = 0;
-    if (auto alignAttr =
-            dyn_cast_or_null<IntegerAttr>(gpuFuncOp.getWorkgroupAttributionAttr(
-                idx, LLVM::LLVMDialect::getAlignAttrName())))
-      alignment = alignAttr.getInt();
-    auto globalOp = rewriter.create<LLVM::GlobalOp>(
-        gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
-        LLVM::Linkage::Internal, name, /*value=*/Attribute(), alignment,
-        workgroupAddrSpace);
-    workgroupBuffers.push_back(globalOp);
+  if (encodeWorkgroupAttributionsAsArguments) {
+    ArrayRef<BlockArgument> workgroupAttributions =
+        gpuFuncOp.getWorkgroupAttributions();
+    std::size_t numAttributions = workgroupAttributions.size();
+
+    // Insert all arguments at the end.
+    unsigned index = gpuFuncOp.getNumArguments();
+    SmallVector<unsigned> argIndices(numAttributions, index);
+
+    // New arguments will simply be `llvm.ptr` with the correct address space
+    Type workgroupPtrType =
+        rewriter.getType<LLVM::LLVMPointerType>(workgroupAddrSpace);
+    SmallVector<Type> argTypes(numAttributions, workgroupPtrType);
+
+    // No argument attributes will be added
+    DictionaryAttr emptyDict = rewriter.getDictionaryAttr({});
+    SmallVector<DictionaryAttr> argAttrs(numAttributions, emptyDict);
+
+    // Locations match the function location
+    SmallVector<Location> argLocs(numAttributions, gpuFuncOp.getLoc());
+
+    // Perform signature modification
+    rewriter.modifyOpInPlace(
+        gpuFuncOp, [gpuFuncOp, &argIndices, &argTypes, &argAttrs, &argLocs]() {
+          static_cast<FunctionOpInterface>(gpuFuncOp).insertArguments(
+              argIndices, argTypes, argAttrs, argLocs);
+        });
+  } else {
+    workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
+    for (const auto [idx, attribution] :
+         llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
+      auto type = dyn_cast<MemRefType>(attribution.getType());
+      assert(type && type.hasStaticShape() && "unexpected type in attribution");
+
+      uint64_t numElements = type.getNumElements();
+
+      auto elementType =
+          cast<Type>(typeConverter->convertType(type.getElementType()));
+      auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
+      std::string name =
+          std::string(llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), idx));
+      uint64_t alignment = 0;
+      if (auto alignAttr = dyn_cast_or_null<IntegerAttr>(
+              gpuFuncOp.getWorkgroupAttributionAttr(
+                  idx, LLVM::LLVMDialect::getAlignAttrName())))
+        alignment = alignAttr.getInt();
+      auto globalOp = rewriter.create<LLVM::GlobalOp>(
+          gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
+          LLVM::Linkage::Internal, name, /*value=*/Attribute(), alignment,
+          workgroupAddrSpace);
+      workgroupBuffers.push_back(globalOp);
+    }
   }
 
   // Remap proper input types.
@@ -101,16 +130,20 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
   // attribute. The former is necessary for further translation while the
   // latter is expected by gpu.launch_func.
   if (gpuFuncOp.isKernel()) {
-    attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
+    if (kernelAttributeName)
+      attributes.emplace_back(*kernelAttributeName, rewriter.getUnitAttr());
     // Set the dialect-specific block size attribute if there is one.
     if (kernelBlockSizeAttributeName.has_value() && knownBlockSize) {
       attributes.emplace_back(kernelBlockSizeAttributeName.value(),
                               knownBlockSize);
     }
   }
+  LLVM::CConv callingConvention = gpuFuncOp.isKernel()
+                                      ? kernelCallingConvention
+                                      : nonKernelCallingConvention;
   auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
       gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
-      LLVM::Linkage::External, /*dsoLocal=*/false, /*cconv=*/LLVM::CConv::C,
+      LLVM::Linkage::External, /*dsoLocal=*/false, callingConvention,
       /*comdat=*/nullptr, attributes);
 
   {
@@ -125,24 +158,49 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
     rewriter.setInsertionPointToStart(&gpuFuncOp.front());
     unsigned numProperArguments = gpuFuncOp.getNumArguments();
 
-    for (const auto [idx, global] : llvm::enumerate(workgroupBuffers)) {
-      auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext(),
-                                                global.getAddrSpace());
-      Value address = rewriter.create<LLVM::AddressOfOp>(
-          loc, ptrType, global.getSymNameAttr());
-      Value memory =
-          rewriter.create<LLVM::GEPOp>(loc, ptrType, global.getType(), address,
-                                       ArrayRef<LLVM::GEPArg>{0, 0});
-
-      // Build a memref descriptor pointing to the buffer to plug with the
-      // existing memref infrastructure. This may use more registers than
-      // otherwise necessary given that memref sizes are fixed, but we can try
-      // and canonicalize that away later.
-      Value attribution = gpuFuncOp.getWorkgroupAttributions()[idx];
-      auto type = cast<MemRefType>(attribution.getType());
-      auto descr = MemRefDescriptor::fromStaticShape(
-          rewriter, loc, *getTypeConverter(), type, memory);
-      signatureConversion.remapInput(numProperArguments + idx, descr);
+    if (encodeWorkgroupAttributionsAsArguments) {
+      unsigned numAttributions = gpuFuncOp.getNumWorkgroupAttributions();
+      assert(numProperArguments >= numAttributions &&
+             "Expecting attributions to be encoded as arguments already");
+
+      // Arguments encoding workgroup attributions will be in positions
+      // [numProperArguments-numAttributions, numProperArguments)
+      ArrayRef<BlockArgument> attributionArguments =
+          gpuFuncOp.getArguments().slice(numProperArguments - numAttributions,
+                                         numAttributions);
+      for (auto [idx, vals] : llvm::enumerate(llvm::zip_equal(
+               gpuFuncOp.getWorkgroupAttributions(), attributionArguments))) {
+        auto [attribution, arg] = vals;
+        auto type = cast<MemRefType>(attribution.getType());
+
+        // Arguments are of llvm.ptr type and attributions are of memref type:
+        // we need to wrap them in memref descriptors.
+        Value descr = MemRefDescriptor::fromStaticShape(
+            rewriter, loc, *getTypeConverter(), type, arg);
+
+        // And remap the arguments
+        signatureConversion.remapInput(numProperArguments + idx, descr);
+      }
+    } else {
+      for (const auto [idx, global] : llvm::enumerate(workgroupBuffers)) {
+        auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext(),
+                                                  global.getAddrSpace());
+        Value address = rewriter.create<LLVM::AddressOfOp>(
+            loc, ptrType, global.getSymNameAttr());
+        Value memory =
+            rewriter.create<LLVM::GEPOp>(loc, ptrType, global.getType(),
+                                         address, ArrayRef<LLVM::GEPArg>{0, 0});
+
+        // Build a memref descriptor pointing to the buffer to plug with the
+        // existing memref infrastructure. This may use more registers than
+        // otherwise necessary given that memref sizes are fixed, but we can try
+        // and canonicalize that away later.
+        Value attribution = gpuFuncOp.getWorkgroupAttributions()[idx];
+        auto type = cast<MemRefType>(attribution.getType());
+        auto descr = MemRefDescriptor::fromStaticShape(
+            rewriter, loc, *getTypeConverter(), type, memory);
+        signatureConversion.remapInput(numProperArguments + idx, descr);
+      }
     }
 
     // Rewrite private memory attributions to alloca'ed buffers.
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
index 92e69badc27dd..781bea6b09406 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -35,16 +35,39 @@ struct GPUDynamicSharedMemoryOpLowering
   unsigned alignmentBit;
 };
 
+struct GPUFuncOpLoweringOptions {
+  /// The address space to use for `alloca`s in private memory.
+  unsigned allocaAddrSpace;
+  /// The address space to use when declaring workgroup memory.
+  unsigned workgroupAddrSpace;
+
+  /// The attribute name to use instead of `gpu.kernel`.
+  std::optional<StringAttr> kernelAttributeName = std::nullopt;
+  /// The attribute name to use to set the block size
+  std::optional<StringAttr> kernelBlockSizeAttributeName = std::nullopt;
+
+  /// The calling convention to use for kernel functions
+  LLVM::CConv kernelCallingConvention = LLVM::CConv::C;
+  /// The calling convention to use for non-kernel functions
+  LLVM::CConv nonKernelCallingConvention = LLVM::CConv::C;
+
+  /// Whether to encode workgroup attributions as additional arguments instead
+  /// of a global variable.
+  bool encodeWorkgroupAttributionsAsArguments = false;
+};
+
 struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
-  GPUFuncOpLowering(
-      const LLVMTypeConverter &converter, unsigned allocaAddrSpace,
-      unsigned workgroupAddrSpace, StringAttr kernelAttributeName,
-      std::optional<StringAttr> kernelBlockSizeAttributeName = std::nullopt)
+  GPUFuncOpLowering(const LLVMTypeConverter &converter,
+                    const GPUFuncOpLoweringOptions &options)
       : ConvertOpToLLVMPattern<gpu::GPUFuncOp>(converter),
-        allocaAddrSpace(allocaAddrSpace),
-        workgroupAddrSpace(workgroupAddrSpace),
-        kernelAttributeName(kernelAttributeName),
-        kernelBlockSizeAttributeName(kernelBlockSizeAttributeName) {}
+        allocaAddrSpace(options.allocaAddrSpace),
+        workgroupAddrSpace(options.workgroupAddrSpace),
+        kernelAttributeName(options.kernelAttributeName),
+        kernelBlockSizeAttributeName(options.kernelBlockSizeAttributeName),
+        kernelCallingConvention(options.kernelCallingConvention),
+        nonKernelCallingConvention(options.nonKernelCallingConvention),
+        encodeWorkgroupAttributionsAsArguments(
+            options.encodeWorkgroupAttributionsAsArguments) {}
 
   LogicalResult
   matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
@@ -57,10 +80,18 @@ struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
   unsigned workgroupAddrSpace;
 
   /// The attribute name to use instead of `gpu.kernel`.
-  StringAttr kernelAttributeName;
-
+  std::optional<StringAttr> kernelAttributeName;
   /// The attribute name to to set block size
   std::optional<StringAttr> kernelBlockSizeAttributeName;
+
+  /// The calling convention to use for kernel functions
+  LLVM::CConv kernelCallingConvention;
+  /// The calling convention to use for non-kernel functions
+  LLVM::CConv nonKernelCallingConvention;
+
+  /// Whether to encode workgroup attributions as additional arguments instead
+  /// of a global variable.
+  bool encodeWorkgroupAttributionsAsArguments;
 };
 
 /// The lowering of gpu.printf to a call to HIP hostcalls
diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/CMakeLists.txt b/mlir/lib/Conversion/GPUToLLVMSPV/CMakeLists.txt
index da5650b2b68dd..d47c5e679d86e 100644
--- a/mlir/lib/Conversion/GPUToLLVMSPV/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUToLLVMSPV/CMakeLists.txt
@@ -6,7 +6,9 @@ add_mlir_conversion_library(MLIRGPUToLLVMSPV
 
   LINK_LIBS PUBLIC
   MLIRGPUDialect
+  MLIRGPUToGPURuntimeTransforms
   MLIRLLVMCommonConversion
   MLIRLLVMDialect
+  MLIRSPIRVAttrToLLVMConversion
   MLIRSPIRVDialect
 )
diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
index 27d63b5f8948d..74dd5f19c20f5 100644
--- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
+++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
@@ -8,15 +8,18 @@
 
 #include "mlir/Conversion/GPUToLLVMSPV/GPUToLLVMSPVPass.h"
 
+#include "../GPUCommon/GPUOpsLowering.h"
 #include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
 #include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
 #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Conversion/SPIRVCommon/AttrToLLVMConverter.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"
 #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h"
 #include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Matchers.h"
@@ -321,8 +324,8 @@ struct GPUToLLVMSPVConversionPass final
     LLVMConversionTarget target(*context);
 
     target.addIllegalOp<gpu::BarrierOp, gpu::BlockDimOp, gpu::BlockIdOp,
-                        gpu::GlobalIdOp, gpu::GridDimOp, gpu::ShuffleOp,
-                        gpu::ThreadIdOp>();
+                        gpu::GPUFuncOp, gpu::GlobalIdOp, gpu::GridDimOp,
+                        gpu::ReturnOp, gpu::ShuffleOp, gpu::ThreadIdOp>();
 
     populateGpuToLLVMSPVConversionPatterns(converter, patterns);
 
@@ -340,11 +343,27 @@ struct GPUToLLVMSPVConversionPass final
 namespace mlir {
 void populateGpuToLLVMSPVConversionPatterns(LLVMTypeConverter &typeConverter,
                                             RewritePatternSet &patterns) {
-  patterns.add<GPUBarrierConversion, GPUShuffleConversion,
+  patterns.add<GPUBarrierConversion, GPUReturnOpLowering, GPUShuffleConversion,
                LaunchConfigOpConversion<gpu::BlockIdOp>,
                LaunchConfigOpConversion<gpu::GridDimOp>,
                LaunchConfigOpConversion<gpu::BlockDimOp>,
                LaunchConfigOpConversion<gpu::ThreadIdOp>,
                LaunchConfigOpConversion<gpu::GlobalIdOp>>(typeConverter);
+  constexpr spirv::ClientAPI clientAPI = spirv::ClientAPI::OpenCL;
+  MLIRContext *context = &typeConverter.getContext();
+  unsigned privateAddressSpace =
+      storageClassToAddressSpace(clientAPI, spirv::StorageClass::Function);
+  unsigned localAddressSpace =
+      storageClassToAddressSpace(clientAPI, spirv::StorageClass::Workgroup);
+  OperationName llvmFuncOpName(LLVM::LLVMFuncOp::getOperationName(), context);
+  StringAttr kernelBlockSizeAttributeName =
+      LLVM::LLVMFuncOp::getReqdWorkGroupSizeAttrName(llvmFuncOpName);
+  patterns.add<GPUFuncOpLowering>(
+      typeConverter,
+      GPUFuncOpLoweringOptions{
+          privateAddressSpace, localAddressSpace,
+          /*kernelAttributeName=*/std::nullopt, kernelBlockSizeAttributeName,
+          LLVM::CConv::SPIR_KERNEL, LLVM::CConv::SPIR_FUNC,
+          /*encodeWorkgroupAttributionsAsArguments=*/true});
 }
 } // namespace mlir
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index faa97caacb885..060a1e1e82f75 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -365,13 +365,15 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
   // attributions since NVVM models it as `alloca`s in the default
   // memory space and does not support `alloca`s with addrspace(5).
   patterns.add<GPUFuncOpLowering>(
-      converter, /*allocaAddrSpace=*/0,
-      /*workgroupAddrSpace=*/
-      static_cast<unsigned>(NVVM::NVVMMemorySpace::kSharedMemorySpace),
-      StringAttr::get(&converter.getContext(),
-                      NVVM::NVVMDialect::getKernelFuncAttrName()),
-      StringAttr::get(&converter.getContext(),
-                      NVVM::NVVMDialect::getMaxntidAttrName()));
+      converter,
+      GPUFuncOpLoweringOptions{
+          /*allocaAddrSpace=*/0,
+          /*workgroupAddrSpace=*/
+          static_cast<unsigned>(NVVM::NVVMMemorySpace::kSharedMemorySpace),
+          StringAttr::get(&converter.getContext(),
+                          NVVM::NVVMDialect::getKernelFuncAttrName()),
+          StringAttr::get(&converter.getContext(),
+                          NVVM::NVVMDialect::getMaxntidAttrName())});
 
   populateOpPatterns<arith::RemFOp>(converter, patterns, "__nv_fmodf",
                                     "__nv_fmod");
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 100181cdc69fe..564bab1ad92b9 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -372,10 +372,11 @@ void mlir::populateGpuToROCDLConversionPatterns(
   patterns.add<GPUReturnOpLowering>(converter);
   patterns.add<GPUFuncOpLowering>(
       converter,
-      /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
-      /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
-      rocdlDialect->getKernelAttrHelper().getName(),
-      rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName());
+      GPUFuncOpLoweringOptions{
+          /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
+          /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
+          rocdlDialect->getKernelAttrHelper().getName(),
+          rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});
   if (Runtime::HIP == runtime) {
     patterns.add<GPUPrintfOpToHIPLowering>(converter);
   } else if (Runtime::OpenCL == runtime) {
diff --git a/mlir/lib/Conversion/SPIRVCommon/AttrToLLVMConverter.cpp b/mlir/lib/Conversion/SPIRVCommon/AttrToLLVMConverter.cpp
new file mode 100644
index 0000000000000..924bd1643f83b
--- /dev/null
+++ b/mlir/lib/Conversion/SPIRVCommon/AttrToLLVMConverter.cpp
@@ -0,0 +1,61 @@
+//===- AttrToLLVMConverter.cpp - SPIR-V attributes conversion to LLVM -C++ ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <mlir/Conversion/SPIRVCommon/AttrToLLVMConverter.h>
+
+namespace {
+using namespace mlir;
+
+//===----------------------------------------------------------------------===//
+// Constants
+//===----------------------------------------------------------------------===//
+
+constexpr unsigned defaultAddressSpace = 0;
+
+//===----------------------------------------------------------------------===//
+// Utility functions
+//===----------------------------------------------------------------------===//
+
+static unsigned
+storageClassToOCLAddressSpace(spirv::StorageClass storageClass) {
+  // Based on
+  // https://registry.khronos.org/SPIR-V/specs/unified1/OpenCL.ExtendedInstructionSet.100.html#_binary_form
+  // and clang/lib/Basic/Targets/SPIR.h.
+  switch (storageClass) {
+  case spirv::StorageClass::Function:
+    return 0;
+  case spirv::StorageClass::Input:
+  case spirv::StorageClass::CrossWorkgroup:
+    return 1;
+  case spirv::StorageClass::UniformConstant:
+    return 2;
+  case spirv::StorageClass::Workgroup:
+    return 3;
+  case spirv::StorageClass::Generic:
+    return 4;
+  case spirv::StorageClass::DeviceOnlyINTEL:
+    return 5;
+  case spirv::StorageClass::HostOnlyINTEL:
+    return 6;
+  default:
+    return defaultAddressSpace;
+  }
+}
+} // namespace
+
+namespace mlir {
+unsigned storageClassToAddressSpace(spirv::ClientAPI clientAPI,
+                                    spirv::StorageClass storageClass) {
+  switch (clientAPI) {
+  case spirv::ClientAPI::OpenCL:
+    return storageClassToOCLAddressSpace(storageClass);
+  default:
+    return defaultAddressSpace;
+  }
+}
+} // namespace mlir
diff --git a/mlir/lib/Conversion/SPIRVCommon/CMakeLists.txt b/mlir/lib/Conversion/SPIRVCommon/CMakeLists.txt
new file mode 100644
index 0000000000000..cd5a4c225efbf
--- /dev/null
+++ b/mlir/lib/Conversion/SPIRVCommon/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_mlir_conversion_library(MLIRSPIRVAttrToLLVMConversion
+  AttrToLLVMConverter.cpp
+
+  DEPENDS
+  MLIRSPIRVEnumsIncGen
+)
diff --git a/mlir/lib/Conversion/SPIRVToLLVM/CMakeLists.txt b/mlir/lib/Conversion/SPIRVToLLVM/CMakeLists.txt
index 549785b154c1b..e563315d95c9c 100644
--- a/mlir/lib/Conversion/SPIRVToLLVM/CMakeLists.txt
+++ b/mlir/lib/Conversion/SPIRVToLLVM/CMakeLists.txt
@@ -18,6 +18,7 @@ add_mlir_conversion_library(MLIRSPIRVToLLVM
   MLIRLLVMCommonConversion
   MLIRLLVMDialect
   MLIRMemRefToLLVM
+  MLIRSPIRVAttrToLLVMConversion
   MLIRSPIRVDialect
   MLIRSPIRVUtils
   MLIRTransforms
diff --git a/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp b/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp
index da09384bfbe89..ca78631632419 100644
--- a/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp
+++ b/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp
@@ -13,6 +13,7 @@
 #include "mlir/Conversion/SPIRVToLLVM/SPIRVToLLVM.h"
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
 #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Conversion/SPIRVCommon/AttrToLLVMConverter.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
 #include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h"
@@ -28,12 +29,6 @@
 
 using namespace mlir;
 
-//===----------------------------------------------------------------------===//
-// Constants
-//===----------------------------------------------------------------------===//
-
-constexpr unsigned defaultAddressSpace = 0;
-
 //===----------------------------------------------------------------------===//
 // Utility functions
 //===----------------------------------------------------------------------===//
@@ -273,47 +268,13 @@ static std::optional<Type> convertArrayType(spirv::ArrayType type,
   return LLVM::LLVMArrayType::get(llvmElementType, numElements);
 }
 
-static unsigned mapToOpenCLAddressSpace(spirv::StorageClass storageClass) {
-  // Based on
-  // https://registry.khronos.org/SPIR-V/specs/unified1/OpenCL.ExtendedInstructionSet.100.html#_binary_form
-  // and clang/lib/Basic/Targets/SPIR.h.
-  switch (storageClass) {
-#define STORAGE_SPACE_MAP(storage, space)                                      \
-  case spirv::StorageClass::storage:                                           \
-    return space;
-    STORAGE_SPACE_MAP(Function, 0)
-    STORAGE_SPACE_MAP(CrossWorkgroup, 1)
-    STORAGE_SPACE_MAP(Input, 1)
-    STORAGE_SPACE_MAP(UniformConstant, 2)
-    STORAGE_SPACE_MAP(Workgroup, 3)
-    STORAGE_SPACE_MAP(Generic, 4)
-    STORAGE_SPACE_MAP(DeviceOnlyINTEL, 5)
-    STORAGE_SPACE_MAP(HostOnlyINTEL, 6)
-#undef STORAGE_SPACE_MAP
-  default:
-    return defaultAddressSpace;
-  }
-}
-
-static unsigned mapToAddressSpace(spirv::ClientAPI clientAPI,
-                                  spirv::StorageClass storageClass) {
-  switch (clientAPI) {
-#define CLIENT_MAP(client, storage)                                            \
-  case spirv::ClientAPI::client:                                               \
-    return mapTo##client##AddressSpace(storage);
-    CLIENT_MAP(OpenCL, storageClass)
-#undef CLIENT_MAP
-  default:
-    return defaultAddressSpace;
-  }
-}
-
 /// Converts SPIR-V pointer type to LLVM pointer. Pointer's storage class is not
 /// modelled at the moment.
 static Type convertPointerType(spirv::PointerType type,
                                LLVMTypeConverter &converter,
                                spirv::ClientAPI clientAPI) {
-  unsigned addressSpace = mapToAddressSpace(clientAPI, type.getStorageClass());
+  unsigned addressSpace =
+      storageClassToAddressSpace(clientAPI, type.getStorageClass());
   return LLVM::LLVMPointerType::get(type.getContext(), addressSpace);
 }
 
@@ -822,7 +783,7 @@ class GlobalVariablePattern
                        : LLVM::Linkage::External;
     auto newGlobalOp = rewriter.replaceOpWithNewOp<LLVM::GlobalOp>(
         op, dstType, isConstant, linkage, op.getSymName(), Attribute(),
-        /*alignment=*/0, mapToAddressSpace(clientAPI, storageClass));
+        /*alignment=*/0, storageClassToAddressSpace(clientAPI, storageClass));
 
     // Attach location attribute if applicable
     if (op.getLocationAttr())
diff --git a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
index bd7e5d139b001..ce3cc9a6137d3 100644
--- a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
+++ b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
@@ -377,3 +377,288 @@ gpu.module @shuffles_mismatch {
     return
   }
 }
+
+// -----
+
+gpu.module @kernels {
+// CHECK:           llvm.func spir_funccc @no_kernel() {
+  gpu.func @no_kernel() {
+    gpu.return
+  }
+
+// CHECK:           llvm.func spir_kernelcc @kernel_no_arg() attributes {gpu.kernel} {
+  gpu.func @kernel_no_arg() kernel {
+    gpu.return
+  }
+
+// CHECK:           llvm.func spir_kernelcc @kernel_with_args(%[[VAL_0:.*]]: f32, %[[VAL_1:.*]]: i64) attributes {gpu.kernel} {
+  gpu.func @kernel_with_args(%arg0: f32, %arg1: i64) kernel {
+    gpu.return
+  }
+
+// CHECK-64:           llvm.func spir_kernelcc @kernel_with_conv_args(%[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: !llvm.ptr, %[[VAL_4:.*]]: !llvm.ptr, %[[VAL_5:.*]]: i64) attributes {gpu.kernel} {
+// CHECK-32:           llvm.func spir_kernelcc @kernel_with_conv_args(%[[VAL_2:.*]]: i32, %[[VAL_3:.*]]: !llvm.ptr, %[[VAL_4:.*]]: !llvm.ptr, %[[VAL_5:.*]]: i32) attributes {gpu.kernel} {
+  gpu.func @kernel_with_conv_args(%arg0: index, %arg1: memref<index>) kernel {
+    gpu.return
+  }
+
+// CHECK-64:           llvm.func spir_kernelcc @kernel_with_sized_memref(%[[VAL_6:.*]]: !llvm.ptr, %[[VAL_7:.*]]: !llvm.ptr, %[[VAL_8:.*]]: i64, %[[VAL_9:.*]]: i64, %[[VAL_10:.*]]: i64) attributes {gpu.kernel} {
+// CHECK-32:           llvm.func spir_kernelcc @kernel_with_sized_memref(%[[VAL_6:.*]]: !llvm.ptr, %[[VAL_7:.*]]: !llvm.ptr, %[[VAL_8:.*]]: i32, %[[VAL_9:.*]]: i32, %[[VAL_10:.*]]: i32) attributes {gpu.kernel} {
+  gpu.func @kernel_with_sized_memref(%arg0: memref<1xindex>) kernel {
+    gpu.return
+  }
+
+// CHECK-64:           llvm.func spir_kernelcc @kernel_with_ND_memref(%[[VAL_11:.*]]: !llvm.ptr, %[[VAL_12:.*]]: !llvm.ptr, %[[VAL_13:.*]]: i64, %[[VAL_14:.*]]: i64, %[[VAL_15:.*]]: i64, %[[VAL_16:.*]]: i64, %[[VAL_17:.*]]: i64, %[[VAL_18:.*]]: i64, %[[VAL_19:.*]]: i64) attributes {gpu.kernel} {
+// CHECK-32:           llvm.func spir_kernelcc @kernel_with_ND_memref(%[[VAL_11:.*]]: !llvm.ptr, %[[VAL_12:.*]]: !llvm.ptr, %[[VAL_13:.*]]: i32, %[[VAL_14:.*]]: i32, %[[VAL_15:.*]]: i32, %[[VAL_16:.*]]: i32, %[[VAL_17:.*]]: i32, %[[VAL_18:.*]]: i32, %[[VAL_19:.*]]: i32) attributes {gpu.kernel} {
+  gpu.func @kernel_with_ND_memref(%arg0: memref<128x128x128xindex>) kernel {
+    gpu.return
+  }
+}
+
+// -----
+
+gpu.module @kernels {
+// CHECK-LABEL:           llvm.func spir_kernelcc @kernel_with_private_attribs(
+// CHECK-SAME:                %[[VAL_0:.*]]: f32, %[[VAL_1:.*]]: i16) attributes {gpu.kernel} {
+// CHECK:                   %[[VAL_2:.*]] = llvm.mlir.constant(32 : i64) : i64
+// CHECK:                   %[[VAL_3:.*]] = llvm.alloca %[[VAL_2]] x f32 : (i64) -> !llvm.ptr
+
+// CHECK-64:             %[[VAL_4:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_5:.*]] = llvm.insertvalue %[[VAL_3]], %[[VAL_4]][0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_6:.*]] = llvm.insertvalue %[[VAL_3]], %[[VAL_5]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_7:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK-64:             %[[VAL_8:.*]] = llvm.insertvalue %[[VAL_7]], %[[VAL_6]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_9:.*]] = llvm.mlir.constant(32 : index) : i64
+// CHECK-64:             %[[VAL_10:.*]] = llvm.insertvalue %[[VAL_9]], %[[VAL_8]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_11:.*]] = llvm.mlir.constant(1 : index) : i64
+// CHECK-64:             %[[VAL_12:.*]] = llvm.insertvalue %[[VAL_11]], %[[VAL_10]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_13:.*]] = builtin.unrealized_conversion_cast %[[VAL_12]] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<32xf32>
+
+// CHECK-32:             %[[VAL_4:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_5:.*]] = llvm.insertvalue %[[VAL_3]], %[[VAL_4]][0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_6:.*]] = llvm.insertvalue %[[VAL_3]], %[[VAL_5]][1] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_7:.*]] = llvm.mlir.constant(0 : index) : i32
+// CHECK-32:             %[[VAL_8:.*]] = llvm.insertvalue %[[VAL_7]], %[[VAL_6]][2] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_9:.*]] = llvm.mlir.constant(32 : index) : i32
+// CHECK-32:             %[[VAL_10:.*]] = llvm.insertvalue %[[VAL_9]], %[[VAL_8]][3, 0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_11:.*]] = llvm.mlir.constant(1 : index) : i32
+// CHECK-32:             %[[VAL_12:.*]] = llvm.insertvalue %[[VAL_11]], %[[VAL_10]][4, 0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_13:.*]] = builtin.unrealized_conversion_cast %[[VAL_12]] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)> to memref<32xf32>
+
+// CHECK:                %[[VAL_14:.*]] = llvm.mlir.constant(16 : i64) : i64
+// CHECK:                %[[VAL_15:.*]] = llvm.alloca %[[VAL_14]] x i16 : (i64) -> !llvm.ptr
+
+// CHECK-64:             %[[VAL_16:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_17:.*]] = llvm.insertvalue %[[VAL_15]], %[[VAL_16]][0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_18:.*]] = llvm.insertvalue %[[VAL_15]], %[[VAL_17]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_19:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK-64:             %[[VAL_20:.*]] = llvm.insertvalue %[[VAL_19]], %[[VAL_18]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_21:.*]] = llvm.mlir.constant(16 : index) : i64
+// CHECK-64:             %[[VAL_22:.*]] = llvm.insertvalue %[[VAL_21]], %[[VAL_20]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_23:.*]] = llvm.mlir.constant(1 : index) : i64
+// CHECK-64:             %[[VAL_24:.*]] = llvm.insertvalue %[[VAL_23]], %[[VAL_22]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_25:.*]] = builtin.unrealized_conversion_cast %[[VAL_24]] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<16xi16>
+
+// CHECK-32:             %[[VAL_16:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_17:.*]] = llvm.insertvalue %[[VAL_15]], %[[VAL_16]][0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_18:.*]] = llvm.insertvalue %[[VAL_15]], %[[VAL_17]][1] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_19:.*]] = llvm.mlir.constant(0 : index) : i32
+// CHECK-32:             %[[VAL_20:.*]] = llvm.insertvalue %[[VAL_19]], %[[VAL_18]][2] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_21:.*]] = llvm.mlir.constant(16 : index) : i32
+// CHECK-32:             %[[VAL_22:.*]] = llvm.insertvalue %[[VAL_21]], %[[VAL_20]][3, 0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_23:.*]] = llvm.mlir.constant(1 : index) : i32
+// CHECK-32:             %[[VAL_24:.*]] = llvm.insertvalue %[[VAL_23]], %[[VAL_22]][4, 0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_25:.*]] = builtin.unrealized_conversion_cast %[[VAL_24]] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)> to memref<16xi16>
+
+// CHECK:                %[[VAL_26:.*]] = arith.constant 0 : index
+// CHECK:                memref.store %[[VAL_0]], %[[VAL_13]]{{\[}}%[[VAL_26]]] : memref<32xf32>
+// CHECK:                memref.store %[[VAL_1]], %[[VAL_25]]{{\[}}%[[VAL_26]]] : memref<16xi16>
+  gpu.func @kernel_with_private_attribs(%arg0: f32, %arg1: i16)
+      private(%arg2: memref<32xf32>, %arg3: memref<16xi16>)
+      kernel {
+    %c0 = arith.constant 0 : index
+    memref.store %arg0, %arg2[%c0] : memref<32xf32>
+    memref.store %arg1, %arg3[%c0] : memref<16xi16>
+    gpu.return
+  }
+
+// CHECK-LABEL:        llvm.func spir_kernelcc @kernel_with_workgoup_attribs(
+// CHECK-SAME:             %[[VAL_27:.*]]: f32, %[[VAL_28:.*]]: i16, %[[VAL_29:.*]]: !llvm.ptr<3>, %[[VAL_30:.*]]: !llvm.ptr<3>) attributes {gpu.kernel} {
+
+// CHECK-64:             %[[VAL_31:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_32:.*]] = llvm.insertvalue %[[VAL_29]], %[[VAL_31]][0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_33:.*]] = llvm.insertvalue %[[VAL_29]], %[[VAL_32]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_34:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK-64:             %[[VAL_35:.*]] = llvm.insertvalue %[[VAL_34]], %[[VAL_33]][2] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_36:.*]] = llvm.mlir.constant(32 : index) : i64
+// CHECK-64:             %[[VAL_37:.*]] = llvm.insertvalue %[[VAL_36]], %[[VAL_35]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_38:.*]] = llvm.mlir.constant(1 : index) : i64
+// CHECK-64:             %[[VAL_39:.*]] = llvm.insertvalue %[[VAL_38]], %[[VAL_37]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_40:.*]] = builtin.unrealized_conversion_cast %[[VAL_39]] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> to memref<32xf32, 3>
+// CHECK-64:             %[[VAL_41:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_42:.*]] = llvm.insertvalue %[[VAL_30]], %[[VAL_41]][0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_43:.*]] = llvm.insertvalue %[[VAL_30]], %[[VAL_42]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_44:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK-64:             %[[VAL_45:.*]] = llvm.insertvalue %[[VAL_44]], %[[VAL_43]][2] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_46:.*]] = llvm.mlir.constant(16 : index) : i64
+// CHECK-64:             %[[VAL_47:.*]] = llvm.insertvalue %[[VAL_46]], %[[VAL_45]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_48:.*]] = llvm.mlir.constant(1 : index) : i64
+// CHECK-64:             %[[VAL_49:.*]] = llvm.insertvalue %[[VAL_48]], %[[VAL_47]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_50:.*]] = builtin.unrealized_conversion_cast %[[VAL_49]] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> to memref<16xi16, 3>
+
+// CHECK-32:             %[[VAL_31:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_32:.*]] = llvm.insertvalue %[[VAL_29]], %[[VAL_31]][0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_33:.*]] = llvm.insertvalue %[[VAL_29]], %[[VAL_32]][1] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_34:.*]] = llvm.mlir.constant(0 : index) : i32
+// CHECK-32:             %[[VAL_35:.*]] = llvm.insertvalue %[[VAL_34]], %[[VAL_33]][2] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_36:.*]] = llvm.mlir.constant(32 : index) : i32
+// CHECK-32:             %[[VAL_37:.*]] = llvm.insertvalue %[[VAL_36]], %[[VAL_35]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_38:.*]] = llvm.mlir.constant(1 : index) : i32
+// CHECK-32:             %[[VAL_39:.*]] = llvm.insertvalue %[[VAL_38]], %[[VAL_37]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_40:.*]] = builtin.unrealized_conversion_cast %[[VAL_39]] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)> to memref<32xf32, 3>
+// CHECK-32:             %[[VAL_41:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_42:.*]] = llvm.insertvalue %[[VAL_30]], %[[VAL_41]][0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_43:.*]] = llvm.insertvalue %[[VAL_30]], %[[VAL_42]][1] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_44:.*]] = llvm.mlir.constant(0 : index) : i32
+// CHECK-32:             %[[VAL_45:.*]] = llvm.insertvalue %[[VAL_44]], %[[VAL_43]][2] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_46:.*]] = llvm.mlir.constant(16 : index) : i32
+// CHECK-32:             %[[VAL_47:.*]] = llvm.insertvalue %[[VAL_46]], %[[VAL_45]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_48:.*]] = llvm.mlir.constant(1 : index) : i32
+// CHECK-32:             %[[VAL_49:.*]] = llvm.insertvalue %[[VAL_48]], %[[VAL_47]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_50:.*]] = builtin.unrealized_conversion_cast %[[VAL_49]] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)> to memref<16xi16, 3>
+
+// CHECK:                %[[VAL_51:.*]] = arith.constant 0 : index
+// CHECK:                memref.store %[[VAL_27]], %[[VAL_40]]{{\[}}%[[VAL_51]]] : memref<32xf32, 3>
+// CHECK:                memref.store %[[VAL_28]], %[[VAL_50]]{{\[}}%[[VAL_51]]] : memref<16xi16, 3>
+  gpu.func @kernel_with_workgoup_attribs(%arg0: f32, %arg1: i16)
+      workgroup(%arg2: memref<32xf32, 3>, %arg3: memref<16xi16, 3>)
+      kernel {
+    %c0 = arith.constant 0 : index
+    memref.store %arg0, %arg2[%c0] : memref<32xf32, 3>
+    memref.store %arg1, %arg3[%c0] : memref<16xi16, 3>
+    gpu.return
+  }
+
+// CHECK-LABEL:        llvm.func spir_kernelcc @kernel_with_both_attribs(
+// CHECK-64-SAME:          %[[VAL_52:.*]]: f32, %[[VAL_53:.*]]: i16, %[[VAL_54:.*]]: i32, %[[VAL_55:.*]]: i64, %[[VAL_56:.*]]: !llvm.ptr<3>, %[[VAL_57:.*]]: !llvm.ptr<3>) attributes {gpu.kernel} {
+// CHECK-32-SAME:          %[[VAL_52:.*]]: f32, %[[VAL_53:.*]]: i16, %[[VAL_54:.*]]: i32, %[[VAL_55:.*]]: i32, %[[VAL_56:.*]]: !llvm.ptr<3>, %[[VAL_57:.*]]: !llvm.ptr<3>) attributes {gpu.kernel} {
+
+// CHECK-64:             %[[VAL_58:.*]] = builtin.unrealized_conversion_cast %[[VAL_55]] : i64 to index
+// CHECK-64:             %[[VAL_59:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_60:.*]] = llvm.insertvalue %[[VAL_56]], %[[VAL_59]][0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_61:.*]] = llvm.insertvalue %[[VAL_56]], %[[VAL_60]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_62:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK-64:             %[[VAL_63:.*]] = llvm.insertvalue %[[VAL_62]], %[[VAL_61]][2] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_64:.*]] = llvm.mlir.constant(32 : index) : i64
+// CHECK-64:             %[[VAL_65:.*]] = llvm.insertvalue %[[VAL_64]], %[[VAL_63]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_66:.*]] = llvm.mlir.constant(1 : index) : i64
+// CHECK-64:             %[[VAL_67:.*]] = llvm.insertvalue %[[VAL_66]], %[[VAL_65]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_68:.*]] = builtin.unrealized_conversion_cast %[[VAL_67]] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> to memref<32xf32, 3>
+// CHECK-64:             %[[VAL_69:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_70:.*]] = llvm.insertvalue %[[VAL_57]], %[[VAL_69]][0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_71:.*]] = llvm.insertvalue %[[VAL_57]], %[[VAL_70]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_72:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK-64:             %[[VAL_73:.*]] = llvm.insertvalue %[[VAL_72]], %[[VAL_71]][2] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_74:.*]] = llvm.mlir.constant(16 : index) : i64
+// CHECK-64:             %[[VAL_75:.*]] = llvm.insertvalue %[[VAL_74]], %[[VAL_73]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_76:.*]] = llvm.mlir.constant(1 : index) : i64
+// CHECK-64:             %[[VAL_77:.*]] = llvm.insertvalue %[[VAL_76]], %[[VAL_75]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_78:.*]] = builtin.unrealized_conversion_cast %[[VAL_77]] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> to memref<16xi16, 3>
+
+// CHECK-32:             %[[VAL_58:.*]] = builtin.unrealized_conversion_cast %[[VAL_55]] : i32 to index
+// CHECK-32:             %[[VAL_59:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_60:.*]] = llvm.insertvalue %[[VAL_56]], %[[VAL_59]][0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_61:.*]] = llvm.insertvalue %[[VAL_56]], %[[VAL_60]][1] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_62:.*]] = llvm.mlir.constant(0 : index) : i32
+// CHECK-32:             %[[VAL_63:.*]] = llvm.insertvalue %[[VAL_62]], %[[VAL_61]][2] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_64:.*]] = llvm.mlir.constant(32 : index) : i32
+// CHECK-32:             %[[VAL_65:.*]] = llvm.insertvalue %[[VAL_64]], %[[VAL_63]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_66:.*]] = llvm.mlir.constant(1 : index) : i32
+// CHECK-32:             %[[VAL_67:.*]] = llvm.insertvalue %[[VAL_66]], %[[VAL_65]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_68:.*]] = builtin.unrealized_conversion_cast %[[VAL_67]] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)> to memref<32xf32, 3>
+// CHECK-32:             %[[VAL_69:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_70:.*]] = llvm.insertvalue %[[VAL_57]], %[[VAL_69]][0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_71:.*]] = llvm.insertvalue %[[VAL_57]], %[[VAL_70]][1] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_72:.*]] = llvm.mlir.constant(0 : index) : i32
+// CHECK-32:             %[[VAL_73:.*]] = llvm.insertvalue %[[VAL_72]], %[[VAL_71]][2] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_74:.*]] = llvm.mlir.constant(16 : index) : i32
+// CHECK-32:             %[[VAL_75:.*]] = llvm.insertvalue %[[VAL_74]], %[[VAL_73]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_76:.*]] = llvm.mlir.constant(1 : index) : i32
+// CHECK-32:             %[[VAL_77:.*]] = llvm.insertvalue %[[VAL_76]], %[[VAL_75]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_78:.*]] = builtin.unrealized_conversion_cast %[[VAL_77]] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)> to memref<16xi16, 3>
+
+// CHECK:                %[[VAL_79:.*]] = llvm.mlir.constant(32 : i64) : i64
+// CHECK:                %[[VAL_80:.*]] = llvm.alloca %[[VAL_79]] x i32 : (i64) -> !llvm.ptr
+
+// CHECK-64:             %[[VAL_81:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_82:.*]] = llvm.insertvalue %[[VAL_80]], %[[VAL_81]][0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_83:.*]] = llvm.insertvalue %[[VAL_80]], %[[VAL_82]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_84:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK-64:             %[[VAL_85:.*]] = llvm.insertvalue %[[VAL_84]], %[[VAL_83]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_86:.*]] = llvm.mlir.constant(32 : index) : i64
+// CHECK-64:             %[[VAL_87:.*]] = llvm.insertvalue %[[VAL_86]], %[[VAL_85]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_88:.*]] = llvm.mlir.constant(1 : index) : i64
+// CHECK-64:             %[[VAL_89:.*]] = llvm.insertvalue %[[VAL_88]], %[[VAL_87]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_90:.*]] = builtin.unrealized_conversion_cast %[[VAL_89]] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<32xi32>
+
+// CHECK-32:             %[[VAL_81:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_82:.*]] = llvm.insertvalue %[[VAL_80]], %[[VAL_81]][0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_83:.*]] = llvm.insertvalue %[[VAL_80]], %[[VAL_82]][1] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_84:.*]] = llvm.mlir.constant(0 : index) : i32
+// CHECK-32:             %[[VAL_85:.*]] = llvm.insertvalue %[[VAL_84]], %[[VAL_83]][2] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_86:.*]] = llvm.mlir.constant(32 : index) : i32
+// CHECK-32:             %[[VAL_87:.*]] = llvm.insertvalue %[[VAL_86]], %[[VAL_85]][3, 0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_88:.*]] = llvm.mlir.constant(1 : index) : i32
+// CHECK-32:             %[[VAL_89:.*]] = llvm.insertvalue %[[VAL_88]], %[[VAL_87]][4, 0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_90:.*]] = builtin.unrealized_conversion_cast %[[VAL_89]] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)> to memref<32xi32>
+
+// CHECK:                %[[VAL_91:.*]] = llvm.mlir.constant(32 : i64) : i64
+
+// CHECK-64:             %[[VAL_92:.*]] = llvm.alloca %[[VAL_91]] x i64 : (i64) -> !llvm.ptr
+// CHECK-32:             %[[VAL_92:.*]] = llvm.alloca %[[VAL_91]] x i32 : (i64) -> !llvm.ptr
+
+// CHECK-64:             %[[VAL_93:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_94:.*]] = llvm.insertvalue %[[VAL_92]], %[[VAL_93]][0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_95:.*]] = llvm.insertvalue %[[VAL_92]], %[[VAL_94]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_96:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK-64:             %[[VAL_97:.*]] = llvm.insertvalue %[[VAL_96]], %[[VAL_95]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_98:.*]] = llvm.mlir.constant(32 : index) : i64
+// CHECK-64:             %[[VAL_99:.*]] = llvm.insertvalue %[[VAL_98]], %[[VAL_97]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_100:.*]] = llvm.mlir.constant(1 : index) : i64
+// CHECK-64:             %[[VAL_101:.*]] = llvm.insertvalue %[[VAL_100]], %[[VAL_99]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// CHECK-64:             %[[VAL_102:.*]] = builtin.unrealized_conversion_cast %[[VAL_101]] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<32xindex>
+
+// CHECK-32:             %[[VAL_93:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_94:.*]] = llvm.insertvalue %[[VAL_92]], %[[VAL_93]][0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_95:.*]] = llvm.insertvalue %[[VAL_92]], %[[VAL_94]][1] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_96:.*]] = llvm.mlir.constant(0 : index) : i32
+// CHECK-32:             %[[VAL_97:.*]] = llvm.insertvalue %[[VAL_96]], %[[VAL_95]][2] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_98:.*]] = llvm.mlir.constant(32 : index) : i32
+// CHECK-32:             %[[VAL_99:.*]] = llvm.insertvalue %[[VAL_98]], %[[VAL_97]][3, 0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_100:.*]] = llvm.mlir.constant(1 : index) : i32
+// CHECK-32:             %[[VAL_101:.*]] = llvm.insertvalue %[[VAL_100]], %[[VAL_99]][4, 0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
+// CHECK-32:             %[[VAL_102:.*]] = builtin.unrealized_conversion_cast %[[VAL_101]] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)> to memref<32xindex>
+
+// CHECK:                %[[VAL_103:.*]] = arith.constant 0 : index
+// CHECK:                memref.store %[[VAL_52]], %[[VAL_68]]{{\[}}%[[VAL_103]]] : memref<32xf32, 3>
+// CHECK:                memref.store %[[VAL_53]], %[[VAL_78]]{{\[}}%[[VAL_103]]] : memref<16xi16, 3>
+// CHECK:                memref.store %[[VAL_54]], %[[VAL_90]]{{\[}}%[[VAL_103]]] : memref<32xi32>
+// CHECK:                memref.store %[[VAL_58]], %[[VAL_102]]{{\[}}%[[VAL_103]]] : memref<32xindex>
+  gpu.func @kernel_with_both_attribs(%arg0: f32, %arg1: i16, %arg2: i32, %arg3: index)
+      workgroup(%arg4: memref<32xf32, 3>, %arg5: memref<16xi16, 3>)
+      private(%arg6: memref<32xi32>, %arg7: memref<32xindex>)
+      kernel {
+    %c0 = arith.constant 0 : index
+    memref.store %arg0, %arg4[%c0] : memref<32xf32, 3>
+    memref.store %arg1, %arg5[%c0] : memref<16xi16, 3>
+    memref.store %arg2, %arg6[%c0] : memref<32xi32>
+    memref.store %arg3, %arg7[%c0] : memref<32xindex>
+    gpu.return
+  }
+
+// CHECK-LABEL:     llvm.func spir_kernelcc @kernel_known_block_size
+// CHECK-SAME:          reqd_work_group_size = array<i32: 128, 128, 256>
+  gpu.func @kernel_known_block_size() kernel attributes {known_block_size = array<i32: 128, 128, 256>} {
+    gpu.return
+  }
+}

>From 098af95fc1b99de2dd1dd7025619eaf535840c58 Mon Sep 17 00:00:00 2001
From: Victor Perez <victor.perez at codeplay.com>
Date: Mon, 5 Aug 2024 11:16:58 +0100
Subject: [PATCH 2/3] Apply suggestions and implement
 `llvm.mlir.workgroup_attrib_size`

---
 .../mlir/Dialect/LLVMIR/LLVMDialect.td        |   4 +
 .../Conversion/GPUCommon/GPUOpsLowering.cpp   |  49 +++-
 .../lib/Conversion/GPUCommon/GPUOpsLowering.h |  11 +-
 .../SPIRVCommon/AttrToLLVMConverter.cpp       |   3 +-
 .../GPUToLLVMSPV/gpu-to-llvm-spv.mlir         | 232 +++---------------
 5 files changed, 97 insertions(+), 202 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
index c4c011f30b3bc..8e933afbb02f1 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
@@ -58,6 +58,10 @@ def LLVM_Dialect : Dialect {
     /// effect when lowering to the LLVMDialect.
     static StringRef getReadnoneAttrName() { return "llvm.readnone"; }
 
+    /// Name of the helper attribute to keep GPU workgroup attribution size
+    /// information when converting from GPU to LLVM.
+    static StringRef getWorkgroupAttribSizeAttrName() { return "llvm.mlir.workgroup_attrib_size"; }
+
     /// Verifies if the given string is a well-formed data layout descriptor.
     /// Uses `reportError` to report errors.
     static LogicalResult verifyDataLayoutString(
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index 0007294b3ff27..b0d217650ba5f 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -8,6 +8,7 @@
 
 #include "GPUOpsLowering.h"
 
+#include "mlir/Analysis/DataLayoutAnalysis.h"
 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Attributes.h"
@@ -19,6 +20,22 @@
 
 using namespace mlir;
 
+/// Sentinel for a failed size query; negative, so it cannot alias a real size.
+constexpr int64_t sizeQueryFailure = -1;
+
+/// Returns the byte size of memref `attribution` (assumed statically shaped),
+/// or `sizeQueryFailure` if its element type cannot be converted to LLVM.
+static int64_t getAttributionSize(BlockArgument attribution,
+                                  const LLVMTypeConverter &converter,
+                                  const DataLayout &layout) {
+  auto attributionType = cast<MemRefType>(attribution.getType());
+  Type elementType = converter.convertType(attributionType.getElementType());
+  if (!elementType)
+    return sizeQueryFailure;
+  int64_t elementTypeSize = layout.getTypeSize(elementType);
+  return attributionType.getNumElements() * elementTypeSize;
+}
+
 LogicalResult
 GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
                                    ConversionPatternRewriter &rewriter) const {
@@ -28,7 +45,7 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
   if (encodeWorkgroupAttributionsAsArguments) {
     ArrayRef<BlockArgument> workgroupAttributions =
         gpuFuncOp.getWorkgroupAttributions();
-    std::size_t numAttributions = workgroupAttributions.size();
+    size_t numAttributions = workgroupAttributions.size();
 
     // Insert all arguments at the end.
     unsigned index = gpuFuncOp.getNumArguments();
@@ -39,9 +56,30 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
         rewriter.getType<LLVM::LLVMPointerType>(workgroupAddrSpace);
     SmallVector<Type> argTypes(numAttributions, workgroupPtrType);
 
-    // No argument attributes will be added
-    DictionaryAttr emptyDict = rewriter.getDictionaryAttr({});
-    SmallVector<DictionaryAttr> argAttrs(numAttributions, emptyDict);
+    // Attributes: noalias, llvm.mlir.workgroup_attrib_size(<size>)
+    std::array attrs{
+        rewriter.getNamedAttr(LLVM::LLVMDialect::getNoAliasAttrName(),
+                              rewriter.getUnitAttr()),
+        rewriter.getNamedAttr(
+            LLVM::LLVMDialect::getWorkgroupAttribSizeAttrName(),
+            rewriter.getUnitAttr()),
+    };
+    SmallVector<DictionaryAttr> argAttrs;
+    assert(defaultLayout && "Expecting defaultLayout to be initialized");
+    const DataLayout *layout = &*defaultLayout;
+    if (const DataLayoutAnalysis *analysis =
+            getTypeConverter()->getDataLayoutAnalysis()) {
+      layout = &analysis->getAbove(gpuFuncOp);
+    }
+    for (BlockArgument attribution : workgroupAttributions) {
+      int64_t dataSize =
+          getAttributionSize(attribution, *getTypeConverter(), *layout);
+      // Check for special failure value
+      if (dataSize == sizeQueryFailure)
+        return failure();
+      attrs.back().setValue(rewriter.getI64IntegerAttr(dataSize));
+      argAttrs.push_back(rewriter.getDictionaryAttr(attrs));
+    }
 
     // Location match function location
     SmallVector<Location> argLocs(numAttributions, gpuFuncOp.getLoc());
@@ -54,7 +92,7 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
         });
   } else {
     workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
-    for (const auto [idx, attribution] :
+    for (auto [idx, attribution] :
          llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
       auto type = dyn_cast<MemRefType>(attribution.getType());
       assert(type && type.hasStaticShape() && "unexpected type in attribution");
@@ -297,6 +335,7 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
       copyPointerAttribute(LLVM::LLVMDialect::getDereferenceableAttrName());
       copyPointerAttribute(
           LLVM::LLVMDialect::getDereferenceableOrNullAttrName());
+      copyPointerAttribute(LLVM::LLVMDialect::getWorkgroupAttribSizeAttrName());
     }
   }
   rewriter.eraseOp(gpuFuncOp);
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
index 781bea6b09406..0c8213c205269 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -67,7 +67,10 @@ struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
         kernelCallingConvention(options.kernelCallingConvention),
         nonKernelCallingConvention(options.nonKernelCallingConvention),
         encodeWorkgroupAttributionsAsArguments(
-            options.encodeWorkgroupAttributionsAsArguments) {}
+            options.encodeWorkgroupAttributionsAsArguments),
+        defaultLayout(options.encodeWorkgroupAttributionsAsArguments
+                          ? std::optional<DataLayout>(DataLayout())
+                          : std::optional<DataLayout>()) {}
 
   LogicalResult
   matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
@@ -92,6 +95,12 @@ struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
   /// Whether to encode workgroup attributions as additional arguments instead
   /// of a global variable.
   bool encodeWorkgroupAttributionsAsArguments;
+
+  /// Default layout to use in absence of the corresponding analysis.
+  /// This will only be initialized if
+  /// encodeWorkgroupAttributionsAsArguments=true, as it will remain unused
+  /// otherwise.
+  std::optional<DataLayout> defaultLayout;
 };
 
 /// The lowering of gpu.printf to a call to HIP hostcalls
diff --git a/mlir/lib/Conversion/SPIRVCommon/AttrToLLVMConverter.cpp b/mlir/lib/Conversion/SPIRVCommon/AttrToLLVMConverter.cpp
index 924bd1643f83b..7f83a474c3f93 100644
--- a/mlir/lib/Conversion/SPIRVCommon/AttrToLLVMConverter.cpp
+++ b/mlir/lib/Conversion/SPIRVCommon/AttrToLLVMConverter.cpp
@@ -8,8 +8,8 @@
 
 #include <mlir/Conversion/SPIRVCommon/AttrToLLVMConverter.h>
 
+namespace mlir {
 namespace {
-using namespace mlir;
 
 //===----------------------------------------------------------------------===//
 // Constants
@@ -48,7 +48,6 @@ storageClassToOCLAddressSpace(spirv::StorageClass storageClass) {
 }
 } // namespace
 
-namespace mlir {
 unsigned storageClassToAddressSpace(spirv::ClientAPI clientAPI,
                                     spirv::StorageClass storageClass) {
   switch (clientAPI) {
diff --git a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
index ce3cc9a6137d3..f7dfa40b2da71 100644
--- a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
+++ b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
@@ -420,239 +420,83 @@ gpu.module @kernels {
 gpu.module @kernels {
 // CHECK-LABEL:           llvm.func spir_kernelcc @kernel_with_private_attribs(
 // CHECK-SAME:                %[[VAL_0:.*]]: f32, %[[VAL_1:.*]]: i16) attributes {gpu.kernel} {
+
+// Private attribution is converted to an llvm.alloca
+
 // CHECK:                   %[[VAL_2:.*]] = llvm.mlir.constant(32 : i64) : i64
 // CHECK:                   %[[VAL_3:.*]] = llvm.alloca %[[VAL_2]] x f32 : (i64) -> !llvm.ptr
 
-// CHECK-64:             %[[VAL_4:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_5:.*]] = llvm.insertvalue %[[VAL_3]], %[[VAL_4]][0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_6:.*]] = llvm.insertvalue %[[VAL_3]], %[[VAL_5]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_7:.*]] = llvm.mlir.constant(0 : index) : i64
-// CHECK-64:             %[[VAL_8:.*]] = llvm.insertvalue %[[VAL_7]], %[[VAL_6]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_9:.*]] = llvm.mlir.constant(32 : index) : i64
-// CHECK-64:             %[[VAL_10:.*]] = llvm.insertvalue %[[VAL_9]], %[[VAL_8]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_11:.*]] = llvm.mlir.constant(1 : index) : i64
-// CHECK-64:             %[[VAL_12:.*]] = llvm.insertvalue %[[VAL_11]], %[[VAL_10]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_13:.*]] = builtin.unrealized_conversion_cast %[[VAL_12]] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<32xf32>
+// MemRef descriptor built from allocated pointer
 
+// CHECK-64:             %[[VAL_4:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
 // CHECK-32:             %[[VAL_4:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_5:.*]] = llvm.insertvalue %[[VAL_3]], %[[VAL_4]][0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_6:.*]] = llvm.insertvalue %[[VAL_3]], %[[VAL_5]][1] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_7:.*]] = llvm.mlir.constant(0 : index) : i32
-// CHECK-32:             %[[VAL_8:.*]] = llvm.insertvalue %[[VAL_7]], %[[VAL_6]][2] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_9:.*]] = llvm.mlir.constant(32 : index) : i32
-// CHECK-32:             %[[VAL_10:.*]] = llvm.insertvalue %[[VAL_9]], %[[VAL_8]][3, 0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_11:.*]] = llvm.mlir.constant(1 : index) : i32
-// CHECK-32:             %[[VAL_12:.*]] = llvm.insertvalue %[[VAL_11]], %[[VAL_10]][4, 0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_13:.*]] = builtin.unrealized_conversion_cast %[[VAL_12]] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)> to memref<32xf32>
+
+// CHECK:                %[[VAL_5:.*]] = llvm.insertvalue %[[VAL_3]], %[[VAL_4]][0]
+// CHECK:                llvm.insertvalue %[[VAL_3]], %[[VAL_5]][1]
+
+// Same code as above
 
 // CHECK:                %[[VAL_14:.*]] = llvm.mlir.constant(16 : i64) : i64
 // CHECK:                %[[VAL_15:.*]] = llvm.alloca %[[VAL_14]] x i16 : (i64) -> !llvm.ptr
 
 // CHECK-64:             %[[VAL_16:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_17:.*]] = llvm.insertvalue %[[VAL_15]], %[[VAL_16]][0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_18:.*]] = llvm.insertvalue %[[VAL_15]], %[[VAL_17]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_19:.*]] = llvm.mlir.constant(0 : index) : i64
-// CHECK-64:             %[[VAL_20:.*]] = llvm.insertvalue %[[VAL_19]], %[[VAL_18]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_21:.*]] = llvm.mlir.constant(16 : index) : i64
-// CHECK-64:             %[[VAL_22:.*]] = llvm.insertvalue %[[VAL_21]], %[[VAL_20]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_23:.*]] = llvm.mlir.constant(1 : index) : i64
-// CHECK-64:             %[[VAL_24:.*]] = llvm.insertvalue %[[VAL_23]], %[[VAL_22]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_25:.*]] = builtin.unrealized_conversion_cast %[[VAL_24]] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<16xi16>
-
 // CHECK-32:             %[[VAL_16:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_17:.*]] = llvm.insertvalue %[[VAL_15]], %[[VAL_16]][0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_18:.*]] = llvm.insertvalue %[[VAL_15]], %[[VAL_17]][1] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_19:.*]] = llvm.mlir.constant(0 : index) : i32
-// CHECK-32:             %[[VAL_20:.*]] = llvm.insertvalue %[[VAL_19]], %[[VAL_18]][2] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_21:.*]] = llvm.mlir.constant(16 : index) : i32
-// CHECK-32:             %[[VAL_22:.*]] = llvm.insertvalue %[[VAL_21]], %[[VAL_20]][3, 0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_23:.*]] = llvm.mlir.constant(1 : index) : i32
-// CHECK-32:             %[[VAL_24:.*]] = llvm.insertvalue %[[VAL_23]], %[[VAL_22]][4, 0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_25:.*]] = builtin.unrealized_conversion_cast %[[VAL_24]] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)> to memref<16xi16>
-
-// CHECK:                %[[VAL_26:.*]] = arith.constant 0 : index
-// CHECK:                memref.store %[[VAL_0]], %[[VAL_13]]{{\[}}%[[VAL_26]]] : memref<32xf32>
-// CHECK:                memref.store %[[VAL_1]], %[[VAL_25]]{{\[}}%[[VAL_26]]] : memref<16xi16>
+
+// CHECK:                %[[VAL_17:.*]] = llvm.insertvalue %[[VAL_15]], %[[VAL_16]][0]
+// CHECK:                llvm.insertvalue %[[VAL_15]], %[[VAL_17]][1]
   gpu.func @kernel_with_private_attribs(%arg0: f32, %arg1: i16)
       private(%arg2: memref<32xf32>, %arg3: memref<16xi16>)
       kernel {
-    %c0 = arith.constant 0 : index
-    memref.store %arg0, %arg2[%c0] : memref<32xf32>
-    memref.store %arg1, %arg3[%c0] : memref<16xi16>
     gpu.return
   }
 
+// Workgroup attributions are converted to an llvm.ptr<3> argument
+
 // CHECK-LABEL:        llvm.func spir_kernelcc @kernel_with_workgoup_attribs(
-// CHECK-SAME:             %[[VAL_27:.*]]: f32, %[[VAL_28:.*]]: i16, %[[VAL_29:.*]]: !llvm.ptr<3>, %[[VAL_30:.*]]: !llvm.ptr<3>) attributes {gpu.kernel} {
+// CHECK-SAME:             %[[VAL_27:.*]]: f32, %[[VAL_28:.*]]: i16,
+// CHECK-SAME:             %[[VAL_29:.*]]: !llvm.ptr<3> {llvm.mlir.workgroup_attrib_size = 128 : i64, llvm.noalias},
+// CHECK-SAME:             %[[VAL_30:.*]]: !llvm.ptr<3> {llvm.mlir.workgroup_attrib_size = 32 : i64, llvm.noalias}) attributes {gpu.kernel} {
 
-// CHECK-64:             %[[VAL_31:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_32:.*]] = llvm.insertvalue %[[VAL_29]], %[[VAL_31]][0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_33:.*]] = llvm.insertvalue %[[VAL_29]], %[[VAL_32]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_34:.*]] = llvm.mlir.constant(0 : index) : i64
-// CHECK-64:             %[[VAL_35:.*]] = llvm.insertvalue %[[VAL_34]], %[[VAL_33]][2] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_36:.*]] = llvm.mlir.constant(32 : index) : i64
-// CHECK-64:             %[[VAL_37:.*]] = llvm.insertvalue %[[VAL_36]], %[[VAL_35]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_38:.*]] = llvm.mlir.constant(1 : index) : i64
-// CHECK-64:             %[[VAL_39:.*]] = llvm.insertvalue %[[VAL_38]], %[[VAL_37]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_40:.*]] = builtin.unrealized_conversion_cast %[[VAL_39]] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> to memref<32xf32, 3>
-// CHECK-64:             %[[VAL_41:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_42:.*]] = llvm.insertvalue %[[VAL_30]], %[[VAL_41]][0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_43:.*]] = llvm.insertvalue %[[VAL_30]], %[[VAL_42]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_44:.*]] = llvm.mlir.constant(0 : index) : i64
-// CHECK-64:             %[[VAL_45:.*]] = llvm.insertvalue %[[VAL_44]], %[[VAL_43]][2] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_46:.*]] = llvm.mlir.constant(16 : index) : i64
-// CHECK-64:             %[[VAL_47:.*]] = llvm.insertvalue %[[VAL_46]], %[[VAL_45]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_48:.*]] = llvm.mlir.constant(1 : index) : i64
-// CHECK-64:             %[[VAL_49:.*]] = llvm.insertvalue %[[VAL_48]], %[[VAL_47]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_50:.*]] = builtin.unrealized_conversion_cast %[[VAL_49]] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> to memref<16xi16, 3>
+// MemRef descriptor built from new argument
 
+// CHECK-64:             %[[VAL_31:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
 // CHECK-32:             %[[VAL_31:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_32:.*]] = llvm.insertvalue %[[VAL_29]], %[[VAL_31]][0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_33:.*]] = llvm.insertvalue %[[VAL_29]], %[[VAL_32]][1] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_34:.*]] = llvm.mlir.constant(0 : index) : i32
-// CHECK-32:             %[[VAL_35:.*]] = llvm.insertvalue %[[VAL_34]], %[[VAL_33]][2] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_36:.*]] = llvm.mlir.constant(32 : index) : i32
-// CHECK-32:             %[[VAL_37:.*]] = llvm.insertvalue %[[VAL_36]], %[[VAL_35]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_38:.*]] = llvm.mlir.constant(1 : index) : i32
-// CHECK-32:             %[[VAL_39:.*]] = llvm.insertvalue %[[VAL_38]], %[[VAL_37]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_40:.*]] = builtin.unrealized_conversion_cast %[[VAL_39]] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)> to memref<32xf32, 3>
+
+// CHECK:                %[[VAL_32:.*]] = llvm.insertvalue %[[VAL_29]], %[[VAL_31]][0]
+// CHECK:                llvm.insertvalue %[[VAL_29]], %[[VAL_32]][1]
+
+// Same as above
+
+// CHECK-64:             %[[VAL_41:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
 // CHECK-32:             %[[VAL_41:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_42:.*]] = llvm.insertvalue %[[VAL_30]], %[[VAL_41]][0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_43:.*]] = llvm.insertvalue %[[VAL_30]], %[[VAL_42]][1] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_44:.*]] = llvm.mlir.constant(0 : index) : i32
-// CHECK-32:             %[[VAL_45:.*]] = llvm.insertvalue %[[VAL_44]], %[[VAL_43]][2] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_46:.*]] = llvm.mlir.constant(16 : index) : i32
-// CHECK-32:             %[[VAL_47:.*]] = llvm.insertvalue %[[VAL_46]], %[[VAL_45]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_48:.*]] = llvm.mlir.constant(1 : index) : i32
-// CHECK-32:             %[[VAL_49:.*]] = llvm.insertvalue %[[VAL_48]], %[[VAL_47]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_50:.*]] = builtin.unrealized_conversion_cast %[[VAL_49]] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)> to memref<16xi16, 3>
-
-// CHECK:                %[[VAL_51:.*]] = arith.constant 0 : index
-// CHECK:                memref.store %[[VAL_27]], %[[VAL_40]]{{\[}}%[[VAL_51]]] : memref<32xf32, 3>
-// CHECK:                memref.store %[[VAL_28]], %[[VAL_50]]{{\[}}%[[VAL_51]]] : memref<16xi16, 3>
+
+// CHECK:                %[[VAL_42:.*]] = llvm.insertvalue %[[VAL_30]], %[[VAL_41]][0]
+// CHECK:                llvm.insertvalue %[[VAL_30]], %[[VAL_42]][1]
   gpu.func @kernel_with_workgoup_attribs(%arg0: f32, %arg1: i16)
       workgroup(%arg2: memref<32xf32, 3>, %arg3: memref<16xi16, 3>)
       kernel {
-    %c0 = arith.constant 0 : index
-    memref.store %arg0, %arg2[%c0] : memref<32xf32, 3>
-    memref.store %arg1, %arg3[%c0] : memref<16xi16, 3>
     gpu.return
   }
 
+// Check with both private and workgroup attributions. Simply check that the
+// additional arguments and an llvm.alloca are present.
+
 // CHECK-LABEL:        llvm.func spir_kernelcc @kernel_with_both_attribs(
-// CHECK-64-SAME:          %[[VAL_52:.*]]: f32, %[[VAL_53:.*]]: i16, %[[VAL_54:.*]]: i32, %[[VAL_55:.*]]: i64, %[[VAL_56:.*]]: !llvm.ptr<3>, %[[VAL_57:.*]]: !llvm.ptr<3>) attributes {gpu.kernel} {
-// CHECK-32-SAME:          %[[VAL_52:.*]]: f32, %[[VAL_53:.*]]: i16, %[[VAL_54:.*]]: i32, %[[VAL_55:.*]]: i32, %[[VAL_56:.*]]: !llvm.ptr<3>, %[[VAL_57:.*]]: !llvm.ptr<3>) attributes {gpu.kernel} {
-
-// CHECK-64:             %[[VAL_58:.*]] = builtin.unrealized_conversion_cast %[[VAL_55]] : i64 to index
-// CHECK-64:             %[[VAL_59:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_60:.*]] = llvm.insertvalue %[[VAL_56]], %[[VAL_59]][0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_61:.*]] = llvm.insertvalue %[[VAL_56]], %[[VAL_60]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_62:.*]] = llvm.mlir.constant(0 : index) : i64
-// CHECK-64:             %[[VAL_63:.*]] = llvm.insertvalue %[[VAL_62]], %[[VAL_61]][2] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_64:.*]] = llvm.mlir.constant(32 : index) : i64
-// CHECK-64:             %[[VAL_65:.*]] = llvm.insertvalue %[[VAL_64]], %[[VAL_63]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_66:.*]] = llvm.mlir.constant(1 : index) : i64
-// CHECK-64:             %[[VAL_67:.*]] = llvm.insertvalue %[[VAL_66]], %[[VAL_65]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_68:.*]] = builtin.unrealized_conversion_cast %[[VAL_67]] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> to memref<32xf32, 3>
-// CHECK-64:             %[[VAL_69:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_70:.*]] = llvm.insertvalue %[[VAL_57]], %[[VAL_69]][0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_71:.*]] = llvm.insertvalue %[[VAL_57]], %[[VAL_70]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_72:.*]] = llvm.mlir.constant(0 : index) : i64
-// CHECK-64:             %[[VAL_73:.*]] = llvm.insertvalue %[[VAL_72]], %[[VAL_71]][2] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_74:.*]] = llvm.mlir.constant(16 : index) : i64
-// CHECK-64:             %[[VAL_75:.*]] = llvm.insertvalue %[[VAL_74]], %[[VAL_73]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_76:.*]] = llvm.mlir.constant(1 : index) : i64
-// CHECK-64:             %[[VAL_77:.*]] = llvm.insertvalue %[[VAL_76]], %[[VAL_75]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_78:.*]] = builtin.unrealized_conversion_cast %[[VAL_77]] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> to memref<16xi16, 3>
-
-// CHECK-32:             %[[VAL_58:.*]] = builtin.unrealized_conversion_cast %[[VAL_55]] : i32 to index
-// CHECK-32:             %[[VAL_59:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_60:.*]] = llvm.insertvalue %[[VAL_56]], %[[VAL_59]][0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_61:.*]] = llvm.insertvalue %[[VAL_56]], %[[VAL_60]][1] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_62:.*]] = llvm.mlir.constant(0 : index) : i32
-// CHECK-32:             %[[VAL_63:.*]] = llvm.insertvalue %[[VAL_62]], %[[VAL_61]][2] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_64:.*]] = llvm.mlir.constant(32 : index) : i32
-// CHECK-32:             %[[VAL_65:.*]] = llvm.insertvalue %[[VAL_64]], %[[VAL_63]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_66:.*]] = llvm.mlir.constant(1 : index) : i32
-// CHECK-32:             %[[VAL_67:.*]] = llvm.insertvalue %[[VAL_66]], %[[VAL_65]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_68:.*]] = builtin.unrealized_conversion_cast %[[VAL_67]] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)> to memref<32xf32, 3>
-// CHECK-32:             %[[VAL_69:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_70:.*]] = llvm.insertvalue %[[VAL_57]], %[[VAL_69]][0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_71:.*]] = llvm.insertvalue %[[VAL_57]], %[[VAL_70]][1] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_72:.*]] = llvm.mlir.constant(0 : index) : i32
-// CHECK-32:             %[[VAL_73:.*]] = llvm.insertvalue %[[VAL_72]], %[[VAL_71]][2] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_74:.*]] = llvm.mlir.constant(16 : index) : i32
-// CHECK-32:             %[[VAL_75:.*]] = llvm.insertvalue %[[VAL_74]], %[[VAL_73]][3, 0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_76:.*]] = llvm.mlir.constant(1 : index) : i32
-// CHECK-32:             %[[VAL_77:.*]] = llvm.insertvalue %[[VAL_76]], %[[VAL_75]][4, 0] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_78:.*]] = builtin.unrealized_conversion_cast %[[VAL_77]] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)> to memref<16xi16, 3>
+// CHECK-SAME:             %[[VAL_52:.*]]: f32,
+// CHECK-SAME:             %[[VAL_53:.*]]: i16, %[[VAL_54:.*]]: i32,
+// CHECK-64-SAME:          %[[VAL_55:.*]]: i64, %[[VAL_56:.*]]: !llvm.ptr<3> {llvm.mlir.workgroup_attrib_size = 32 : i64, llvm.noalias}, %[[VAL_57:.*]]: !llvm.ptr<3> {llvm.mlir.workgroup_attrib_size = 128 : i64, llvm.noalias}) attributes {gpu.kernel} {
+// CHECK-32-SAME:          %[[VAL_55:.*]]: i32, %[[VAL_56:.*]]: !llvm.ptr<3> {llvm.mlir.workgroup_attrib_size = 32 : i64, llvm.noalias}, %[[VAL_57:.*]]: !llvm.ptr<3> {llvm.mlir.workgroup_attrib_size = 64 : i64, llvm.noalias}) attributes {gpu.kernel} {
 
 // CHECK:                %[[VAL_79:.*]] = llvm.mlir.constant(32 : i64) : i64
 // CHECK:                %[[VAL_80:.*]] = llvm.alloca %[[VAL_79]] x i32 : (i64) -> !llvm.ptr
 
-// CHECK-64:             %[[VAL_81:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_82:.*]] = llvm.insertvalue %[[VAL_80]], %[[VAL_81]][0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_83:.*]] = llvm.insertvalue %[[VAL_80]], %[[VAL_82]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_84:.*]] = llvm.mlir.constant(0 : index) : i64
-// CHECK-64:             %[[VAL_85:.*]] = llvm.insertvalue %[[VAL_84]], %[[VAL_83]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_86:.*]] = llvm.mlir.constant(32 : index) : i64
-// CHECK-64:             %[[VAL_87:.*]] = llvm.insertvalue %[[VAL_86]], %[[VAL_85]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_88:.*]] = llvm.mlir.constant(1 : index) : i64
-// CHECK-64:             %[[VAL_89:.*]] = llvm.insertvalue %[[VAL_88]], %[[VAL_87]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_90:.*]] = builtin.unrealized_conversion_cast %[[VAL_89]] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<32xi32>
-
-// CHECK-32:             %[[VAL_81:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_82:.*]] = llvm.insertvalue %[[VAL_80]], %[[VAL_81]][0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_83:.*]] = llvm.insertvalue %[[VAL_80]], %[[VAL_82]][1] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_84:.*]] = llvm.mlir.constant(0 : index) : i32
-// CHECK-32:             %[[VAL_85:.*]] = llvm.insertvalue %[[VAL_84]], %[[VAL_83]][2] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_86:.*]] = llvm.mlir.constant(32 : index) : i32
-// CHECK-32:             %[[VAL_87:.*]] = llvm.insertvalue %[[VAL_86]], %[[VAL_85]][3, 0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_88:.*]] = llvm.mlir.constant(1 : index) : i32
-// CHECK-32:             %[[VAL_89:.*]] = llvm.insertvalue %[[VAL_88]], %[[VAL_87]][4, 0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_90:.*]] = builtin.unrealized_conversion_cast %[[VAL_89]] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)> to memref<32xi32>
-
 // CHECK:                %[[VAL_91:.*]] = llvm.mlir.constant(32 : i64) : i64
-
 // CHECK-64:             %[[VAL_92:.*]] = llvm.alloca %[[VAL_91]] x i64 : (i64) -> !llvm.ptr
 // CHECK-32:             %[[VAL_92:.*]] = llvm.alloca %[[VAL_91]] x i32 : (i64) -> !llvm.ptr
-
-// CHECK-64:             %[[VAL_93:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_94:.*]] = llvm.insertvalue %[[VAL_92]], %[[VAL_93]][0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_95:.*]] = llvm.insertvalue %[[VAL_92]], %[[VAL_94]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_96:.*]] = llvm.mlir.constant(0 : index) : i64
-// CHECK-64:             %[[VAL_97:.*]] = llvm.insertvalue %[[VAL_96]], %[[VAL_95]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_98:.*]] = llvm.mlir.constant(32 : index) : i64
-// CHECK-64:             %[[VAL_99:.*]] = llvm.insertvalue %[[VAL_98]], %[[VAL_97]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_100:.*]] = llvm.mlir.constant(1 : index) : i64
-// CHECK-64:             %[[VAL_101:.*]] = llvm.insertvalue %[[VAL_100]], %[[VAL_99]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
-// CHECK-64:             %[[VAL_102:.*]] = builtin.unrealized_conversion_cast %[[VAL_101]] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<32xindex>
-
-// CHECK-32:             %[[VAL_93:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_94:.*]] = llvm.insertvalue %[[VAL_92]], %[[VAL_93]][0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_95:.*]] = llvm.insertvalue %[[VAL_92]], %[[VAL_94]][1] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_96:.*]] = llvm.mlir.constant(0 : index) : i32
-// CHECK-32:             %[[VAL_97:.*]] = llvm.insertvalue %[[VAL_96]], %[[VAL_95]][2] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_98:.*]] = llvm.mlir.constant(32 : index) : i32
-// CHECK-32:             %[[VAL_99:.*]] = llvm.insertvalue %[[VAL_98]], %[[VAL_97]][3, 0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_100:.*]] = llvm.mlir.constant(1 : index) : i32
-// CHECK-32:             %[[VAL_101:.*]] = llvm.insertvalue %[[VAL_100]], %[[VAL_99]][4, 0] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)>
-// CHECK-32:             %[[VAL_102:.*]] = builtin.unrealized_conversion_cast %[[VAL_101]] : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)> to memref<32xindex>
-
-// CHECK:                %[[VAL_103:.*]] = arith.constant 0 : index
-// CHECK:                memref.store %[[VAL_52]], %[[VAL_68]]{{\[}}%[[VAL_103]]] : memref<32xf32, 3>
-// CHECK:                memref.store %[[VAL_53]], %[[VAL_78]]{{\[}}%[[VAL_103]]] : memref<16xi16, 3>
-// CHECK:                memref.store %[[VAL_54]], %[[VAL_90]]{{\[}}%[[VAL_103]]] : memref<32xi32>
-// CHECK:                memref.store %[[VAL_58]], %[[VAL_102]]{{\[}}%[[VAL_103]]] : memref<32xindex>
   gpu.func @kernel_with_both_attribs(%arg0: f32, %arg1: i16, %arg2: i32, %arg3: index)
-      workgroup(%arg4: memref<32xf32, 3>, %arg5: memref<16xi16, 3>)
+      workgroup(%arg4: memref<8xf32, 3>, %arg5: memref<16xindex, 3>)
       private(%arg6: memref<32xi32>, %arg7: memref<32xindex>)
       kernel {
-    %c0 = arith.constant 0 : index
-    memref.store %arg0, %arg4[%c0] : memref<32xf32, 3>
-    memref.store %arg1, %arg5[%c0] : memref<16xi16, 3>
-    memref.store %arg2, %arg6[%c0] : memref<32xi32>
-    memref.store %arg3, %arg7[%c0] : memref<32xindex>
     gpu.return
   }
 

>From 36d5bf05ea197d01a5a02616868563241ae8aac5 Mon Sep 17 00:00:00 2001
From: Victor Perez <victor.perez at codeplay.com>
Date: Mon, 5 Aug 2024 13:52:46 +0100
Subject: [PATCH 3/3] Use tuple to encode workgroup attribution in LLVM

---
 .../mlir/Dialect/LLVMIR/LLVMAttrDefs.td       | 19 ++++++++
 .../mlir/Dialect/LLVMIR/LLVMDialect.td        |  6 +--
 .../Conversion/GPUCommon/GPUOpsLowering.cpp   | 44 +++++--------------
 .../lib/Conversion/GPUCommon/GPUOpsLowering.h | 11 +----
 .../GPUToLLVMSPV/gpu-to-llvm-spv.mlir         | 10 ++---
 mlir/test/Dialect/LLVMIR/func.mlir            |  7 +++
 6 files changed, 46 insertions(+), 51 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
index 529c458ce1254..892c6e8832b78 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
@@ -1104,4 +1104,23 @@ def TailCallKindAttr : LLVM_Attr<"TailCallKind", "tailcallkind"> {
   let assemblyFormat = "`<` $tailCallKind `>`";
 }
 
+//===----------------------------------------------------------------------===//
+// WorkgroupAttribAttr
+//===----------------------------------------------------------------------===//
+
+def WorkgroupAttribAttr
+    : LLVM_Attr<"WorkgroupAttrib", "mlir.workgroup_attrib"> {
+  let summary = "GPU workgroup attribution information";
+  let description = [{
+    GPU workgroup attributions are `gpu.func` arguments encoding memory
+    allocations in the workgroup address space. These might be encoded as
+    `llvm.ptr` arguments in our dialect, missing type and size information.
+    This attribute can be used to keep this information when converting from
+    GPU to LLVM dialect.
+  }];
+  let parameters = (ins "IntegerAttr":$num_elements,
+                        "TypeAttr":$element_type);
+  let assemblyFormat = "`<` $num_elements `,` $element_type`>`";
+}
+
 #endif // LLVMIR_ATTRDEFS
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
index 8e933afbb02f1..1bf525e2aeb79 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
@@ -58,9 +58,9 @@ def LLVM_Dialect : Dialect {
     /// effect when lowering to the LLVMDialect.
     static StringRef getReadnoneAttrName() { return "llvm.readnone"; }
 
-    /// Name of the helper attribute to keep GPU workgroup attribution size
-    /// information when converting from GPU to LLVM.
-    static StringRef getWorkgroupAttribSizeAttrName() { return "llvm.mlir.workgroup_attrib_size"; }
+    /// Name of the helper attribute to keep GPU workgroup attribution size and
+    /// type information when converting from GPU to LLVM.
+    static StringRef getWorkgroupAttribAttrName() { return "llvm.mlir.workgroup_attrib"; }
 
     /// Verifies if the given string is a well-formed data layout descriptor.
     /// Uses `reportError` to report errors.
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index b0d217650ba5f..68a940a137436 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -8,7 +8,6 @@
 
 #include "GPUOpsLowering.h"
 
-#include "mlir/Analysis/DataLayoutAnalysis.h"
 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Attributes.h"
@@ -20,22 +19,6 @@
 
 using namespace mlir;
 
-namespace {
-constexpr int64_t sizeQueryFailure = 0;
-
-static int64_t getAttributionSize(BlockArgument attribution,
-                                  const LLVMTypeConverter &converter,
-                                  const DataLayout &layout) {
-  auto attributionType = cast<MemRefType>(attribution.getType());
-  int64_t numElements = attributionType.getNumElements();
-  Type elementType = converter.convertType(attributionType.getElementType());
-  if (!elementType)
-    return sizeQueryFailure;
-  int64_t elementTypeSize = layout.getTypeSize(elementType);
-  return numElements * elementTypeSize;
-}
-} // namespace
-
 LogicalResult
 GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
                                    ConversionPatternRewriter &rewriter) const {
@@ -60,24 +43,21 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
     std::array attrs{
         rewriter.getNamedAttr(LLVM::LLVMDialect::getNoAliasAttrName(),
                               rewriter.getUnitAttr()),
-        rewriter.getNamedAttr(
-            LLVM::LLVMDialect::getWorkgroupAttribSizeAttrName(),
-            rewriter.getUnitAttr()),
+        rewriter.getNamedAttr(LLVM::LLVMDialect::getWorkgroupAttribAttrName(),
+                              rewriter.getUnitAttr()),
     };
     SmallVector<DictionaryAttr> argAttrs;
-    assert(defaultLayout && "Expecting defaultLayout to be intialized");
-    const DataLayout *layout = &*defaultLayout;
-    if (const DataLayoutAnalysis *analysis =
-            getTypeConverter()->getDataLayoutAnalysis()) {
-      layout = &analysis->getAbove(gpuFuncOp);
-    }
     for (BlockArgument attribution : workgroupAttributions) {
-      int64_t dataSize =
-          getAttributionSize(attribution, *getTypeConverter(), *layout);
-      // Check for special failure value
-      if (dataSize == sizeQueryFailure)
+      auto attributionType = cast<MemRefType>(attribution.getType());
+      IntegerAttr numElements =
+          rewriter.getI64IntegerAttr(attributionType.getNumElements());
+      Type llvmElementType =
+          getTypeConverter()->convertType(attributionType.getElementType());
+      if (!llvmElementType)
         return failure();
-      attrs.back().setValue(rewriter.getI64IntegerAttr(dataSize));
+      TypeAttr type = TypeAttr::get(llvmElementType);
+      attrs.back().setValue(
+          rewriter.getAttr<LLVM::WorkgroupAttribAttr>(numElements, type));
       argAttrs.push_back(rewriter.getDictionaryAttr(attrs));
     }
 
@@ -335,7 +315,7 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
       copyPointerAttribute(LLVM::LLVMDialect::getDereferenceableAttrName());
       copyPointerAttribute(
           LLVM::LLVMDialect::getDereferenceableOrNullAttrName());
-      copyPointerAttribute(LLVM::LLVMDialect::getWorkgroupAttribSizeAttrName());
+      copyPointerAttribute(LLVM::LLVMDialect::getWorkgroupAttribAttrName());
     }
   }
   rewriter.eraseOp(gpuFuncOp);
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
index 0c8213c205269..781bea6b09406 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -67,10 +67,7 @@ struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
         kernelCallingConvention(options.kernelCallingConvention),
         nonKernelCallingConvention(options.nonKernelCallingConvention),
         encodeWorkgroupAttributionsAsArguments(
-            options.encodeWorkgroupAttributionsAsArguments),
-        defaultLayout(options.encodeWorkgroupAttributionsAsArguments
-                          ? std::optional<DataLayout>(DataLayout())
-                          : std::optional<DataLayout>()) {}
+            options.encodeWorkgroupAttributionsAsArguments) {}
 
   LogicalResult
   matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
@@ -95,12 +92,6 @@ struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
   /// Whether to encode workgroup attributions as additional arguments instead
   /// of a global variable.
   bool encodeWorkgroupAttributionsAsArguments;
-
-  /// Default layout to use in absence of the corresponding analysis.
-  /// This will only be initialized if
-  /// encodeWorkgroupAttributionsAsArguments=true, as it will remain unused
-  /// otherwise.
-  std::optional<DataLayout> defaultLayout;
 };
 
 /// The lowering of gpu.printf to a call to HIP hostcalls
diff --git a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
index f7dfa40b2da71..72b783380bf9b 100644
--- a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
+++ b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir
@@ -454,8 +454,8 @@ gpu.module @kernels {
 
 // CHECK-LABEL:        llvm.func spir_kernelcc @kernel_with_workgoup_attribs(
 // CHECK-SAME:             %[[VAL_27:.*]]: f32, %[[VAL_28:.*]]: i16,
-// CHECK-SAME:             %[[VAL_29:.*]]: !llvm.ptr<3> {llvm.mlir.workgroup_attrib_size = 128 : i64, llvm.noalias},
-// CHECK-SAME:             %[[VAL_30:.*]]: !llvm.ptr<3> {llvm.mlir.workgroup_attrib_size = 32 : i64, llvm.noalias}) attributes {gpu.kernel} {
+// CHECK-SAME:             %[[VAL_29:.*]]: !llvm.ptr<3> {llvm.mlir.workgroup_attrib = #llvm.mlir.workgroup_attrib<32 : i64, f32>, llvm.noalias},
+// CHECK-SAME:             %[[VAL_30:.*]]: !llvm.ptr<3> {llvm.mlir.workgroup_attrib = #llvm.mlir.workgroup_attrib<16 : i64, i16>, llvm.noalias}) attributes {gpu.kernel} {
 
 // MemRef descriptor built from new argument
 
@@ -482,10 +482,8 @@ gpu.module @kernels {
 // arguments and a llvm.alloca are present.
 
 // CHECK-LABEL:        llvm.func spir_kernelcc @kernel_with_both_attribs(
-// CHECK-SAME:             %[[VAL_52:.*]]: f32,
-// CHECK-SAME:             %[[VAL_53:.*]]: i16, %[[VAL_54:.*]]: i32,
-// CHECK-64-SAME:          %[[VAL_55:.*]]: i64, %[[VAL_56:.*]]: !llvm.ptr<3> {llvm.mlir.workgroup_attrib_size = 32 : i64, llvm.noalias}, %[[VAL_57:.*]]: !llvm.ptr<3> {llvm.mlir.workgroup_attrib_size = 128 : i64, llvm.noalias}) attributes {gpu.kernel} {
-// CHECK-32-SAME           %[[VAL_55:.*]]: i32, %[[VAL_56:.*]]: !llvm.ptr<3> {llvm.mlir.workgroup_attrib_size = 32 : i64, llvm.noalias}, %[[VAL_57:.*]]: !llvm.ptr<3> {llvm.mlir.workgroup_attrib_size = 64 : i64, llvm.noalias}) attributes {gpu.kernel} {
+// CHECK-64-SAME:          %[[VAL_52:.*]]: f32, %[[VAL_53:.*]]: i16, %[[VAL_54:.*]]: i32, %[[VAL_55:.*]]: i64, %[[VAL_56:.*]]: !llvm.ptr<3> {llvm.mlir.workgroup_attrib = #llvm.mlir.workgroup_attrib<8 : i64, f32>, llvm.noalias}, %[[VAL_57:.*]]: !llvm.ptr<3> {llvm.mlir.workgroup_attrib = #llvm.mlir.workgroup_attrib<16 : i64, i64>, llvm.noalias}) attributes {gpu.kernel} {
+// CHECK-32-SAME:          %[[VAL_52:.*]]: f32, %[[VAL_53:.*]]: i16, %[[VAL_54:.*]]: i32, %[[VAL_55:.*]]: i32, %[[VAL_56:.*]]: !llvm.ptr<3> {llvm.mlir.workgroup_attrib = #llvm.mlir.workgroup_attrib<8 : i64, f32>, llvm.noalias}, %[[VAL_57:.*]]: !llvm.ptr<3> {llvm.mlir.workgroup_attrib = #llvm.mlir.workgroup_attrib<16 : i64, i32>, llvm.noalias}) attributes {gpu.kernel} {
 
 // CHECK:                %[[VAL_79:.*]] = llvm.mlir.constant(32 : i64) : i64
 // CHECK:                %[[VAL_80:.*]] = llvm.alloca %[[VAL_79]] x i32 : (i64) -> !llvm.ptr
diff --git a/mlir/test/Dialect/LLVMIR/func.mlir b/mlir/test/Dialect/LLVMIR/func.mlir
index 40b4e49f08a3e..c648ec5880659 100644
--- a/mlir/test/Dialect/LLVMIR/func.mlir
+++ b/mlir/test/Dialect/LLVMIR/func.mlir
@@ -472,3 +472,10 @@ llvm.func @reqd_work_group_size_hint() attributes {reqd_work_group_size = array<
 // CHECK: @intel_reqd_sub_group_size_hint()
 // CHECK-SAME: intel_reqd_sub_group_size = 32 : i32
 llvm.func @intel_reqd_sub_group_size_hint() attributes {llvm.intel_reqd_sub_group_size = 32 : i32}
+
+// -----
+
+// CHECK: @workgroup_attrib
+// CHECK-SAME: llvm.mlir.workgroup_attrib = #llvm.mlir.workgroup_attrib<512 : i64, i32>
+// CHECK-SAME: llvm.mlir.workgroup_attrib = #llvm.mlir.workgroup_attrib<128 : i64, !llvm.struct<(i32, i64, f32)>
+llvm.func @workgroup_attrib(%arg0: !llvm.ptr {llvm.mlir.workgroup_attrib = #llvm.mlir.workgroup_attrib<512 : i64, i32>}, %arg1: !llvm.ptr {llvm.mlir.workgroup_attrib = #llvm.mlir.workgroup_attrib<128 : i64, !llvm.struct<(i32, i64, f32)>>})