[Mlir-commits] [mlir] [MLIR][XeGPU] Add XeGPU scattered ops (PR #86594)

Tue Mar 26 10:40:28 PDT 2024

================
@@ -317,11 +324,327 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", []> {
                        OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
 
-  let extraClassDeclaration = extraBaseClassDeclaration;
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    VectorType getValueType() {
+      return llvm::dyn_cast<VectorType>(getValue().getType());
+    }
+
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+  }];
 
-  let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict 
+  let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict
                         `:` type($value) `,` qualified(type($TensorDesc))}];
   let hasVerifier = 1;
 }
 
+def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
+                [AllTypesMatch<["TensorDesc", "result"]>]> {
+  let summary = "It updates the offsets for the TensorDesc.";
+  let description = [{The op updates the offset of the given TensorDesc.
+    The offsets are relative offset to the current position in the number
+    of elements. It will result in a same type TensorDesc as the input.
+
+  example:
+  ```
+    %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32>
+  ```
+  }];
+
+  let arguments = (ins
+    XeGPU_TensorDesc: $TensorDesc,
+    Variadic<Index>: $offsets,
+    DenseI64ArrayAttr: $const_offsets);
+
+  let results = (outs XeGPU_TensorDesc: $result);
+
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+
+    SmallVector<OpFoldResult> getMixedOffsets() {
+      Builder b(getContext());
+      return getMixedValues(getConstOffsets(), getOffsets(), b);
+    }
+
+    size_t getNumOffsets() {
+      return getMixedOffsets().size();
+    }
+
+    OpFoldResult getOffset(unsigned idx) {
+      assert(idx < getNumOffsets() && "Invalid out of bound access.");
+      return getMixedOffsets()[idx];
+    }
+  }];
+
+  let assemblyFormat = [{
+    $TensorDesc `,`
+    custom<DynamicIndexList>($offsets, $const_offsets)
+    attr-dict `:` qualified(type($result))
+  }];
+
+  let hasVerifier = 1;
+}
+
+def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
+  let summary = "create scattered tensor descriptors (TensorDesc).";
+  let description = [{
+    "create_tdesc" is similar to "create_nd_tdesc" in terms that it creates
+    a Tensor Descriptor (TensorDescType) for a memory region. While "create_nd_tdesc"
+    is for creating continious subviews, "create_tdesc" is for creating non-continious
+    (scattered) subviews, allowing each work-item in a subgroup specifying their own offset.
+    It accepts the following parameters:
+
+    * source: a 1D memref or pointer (uint64_t) represents the flattened memory object.
+    * offsets: a array containing offsets of each access point. Its size
+      is fixed to the hardware supportted subgroup size, e.g., 16 on PVC,
+      implying each element in the array corresponds to a work-item (SIMT lane)
+      in the subgroup.
+    * chunk_size: [optional attribute] indicates number of continious
+      elements accessed for each offset, default is 1.
+
+    Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
+    %a = memref.alloc() : memref<1024xf32>
+    %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32>
+
+    Example 2. It assumes subgroup size is 4, and each workitem access 8 elements.
+               It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71]
+    %0 = memref.alloc() : memref<1024xf32>
+    %1 = xegpu.create_tdesc %0[0, 16, 32, 64] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
----------------
joker-eph wrote:

Can you make it explicit in the doc?

https://github.com/llvm/llvm-project/pull/86594