[Mlir-commits] [mlir] [mlir][tosa] Convert group tosa::Conv2DOp to linalg conv (PR #108192)

Fri Sep 13 02:12:48 PDT 2024

https://github.com/stefankoncarevic updated https://github.com/llvm/llvm-project/pull/108192

>From d077d8bcd42e80a7953b3f4e04347d9c2c57fd5e Mon Sep 17 00:00:00 2001
From: Stefan Koncarevic <skoncare at amd.com>
Date: Wed, 11 Sep 2024 11:33:35 +0000
Subject: [PATCH] [mlir][tosa] Convert group tosa::Conv2DOp to linalg conv

This patch adds two new named ops, linalg::Conv2DNhwgcGfhwcOp and
linalg::Conv2DNhwgcGfhwcQOp, and uses them to convert grouped
tosa::Conv2DOp (group > 1) to linalg.
- Added linalg::Conv2DNhwgcGfhwcOp and linalg::Conv2DNhwgcGfhwcQOp.
- Added an optional `group` attribute and a matching builder to
  tosa::Conv2DOp.
- Updated the conversion process to use these new ops for grouped
  tosa.conv2d operations.
---
 .../Linalg/IR/LinalgNamedStructuredOps.yaml   | 237 ++++++++++++++++++
 .../mlir/Dialect/Tosa/IR/TosaOpBase.td        |  12 +
 mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td  |   3 +-
 .../TosaToLinalg/TosaToLinalgNamed.cpp        | 145 +++++++----
 mlir/lib/Dialect/Tosa/IR/TosaOps.cpp          |  13 +
 .../Transforms/TosaDecomposeTransposeConv.cpp |   5 +-
 .../linalg/opdsl/ops/core_named_ops.py        |  61 +++++
 mlir/test/Dialect/Linalg/named-ops.mlir       |  16 ++
 mlir/test/lib/Dialect/Tosa/TosaTestPasses.cpp |   2 +-
 9 files changed, 448 insertions(+), 46 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
index 8cb698096ef5b7..011c4858d6521b 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
@@ -3410,6 +3410,243 @@ structured_op: !LinalgStructuredOpConfig
                 - !ScalarExpression
                   scalar_arg: K
 --- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: conv_2d_nhwgc_gfhwc
+  cpp_class_name: Conv2DNhwgcGfhwcOp
+  doc: |-
+    Performs 2-D grouped convolution.
+
+    Layout:
+      * Input: NHWGC.
+      * Kernel: GFHWC.
+
+    Numeric casting is performed on the operands to the inner multiply, promoting
+    them to the same data type as the accumulator/output.
+  implements:
+  - LinalgConvolutionOpInterface
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] ->
+      (s0, s1 * s2 + s3 * s4, s5 * s6 + s7 * s8, s9, s10)>
+  - !LinalgOperandDefConfig
+    name: K
+    kind: input_tensor
+    type_var: T2
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] ->
+      (s9, s11, s3, s7, s10)>
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: U
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] ->
+      (s0, s1, s5, s9, s11)>
+  - !LinalgOperandDefConfig
+    name: strides
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11]
+      -> (s2, s6)>
+    default_indices:
+    - 1
+    - 1
+  - !LinalgOperandDefConfig
+    name: dilations
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11]
+      -> (s4, s8)>
+    default_indices:
+    - 1
+    - 1
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> (d0, d1 * s2 + d5 * s4, d2 * s6 + d6 * s8, d3, d7)>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> (d3, d4, d5, d6, d7)>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> (d0, d1, d2, d3, d4)>
+  iterator_types:
+  - parallel
+  - parallel
+  - parallel
+  - parallel
+  - parallel
+  - reduction
+  - reduction
+  - reduction
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: add
+        operands:
+        - !ScalarExpression
+          scalar_arg: O
+        - !ScalarExpression
+          scalar_fn:
+            kind: binary
+            fn_name: mul
+            operands:
+            - !ScalarExpression
+              scalar_fn:
+                kind: type
+                fn_name: cast_signed
+                type_var: U
+                operands:
+                - !ScalarExpression
+                  scalar_arg: I
+            - !ScalarExpression
+              scalar_fn:
+                kind: type
+                fn_name: cast_signed
+                type_var: U
+                operands:
+                - !ScalarExpression
+                  scalar_arg: K
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: conv_2d_nhwgc_gfhwc_q
+  cpp_class_name: Conv2DNhwgcGfhwcQOp
+  doc: |-
+    Performs 2-D grouped convolution with zero point offsets.
+
+    Layout:
+      * Input: NHWGC.
+      * Kernel: GFHWC.
+
+    Numeric casting is performed on the operands to the inner multiply, promoting
+    them to the same data type as the accumulator/output. This includes the zero
+    point offsets common to quantized operations.
+  implements:
+  - LinalgConvolutionOpInterface
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] ->
+      (s0, s1 * s2 + s3 * s4, s5 * s6 + s7 * s8, s9, s10)>
+  - !LinalgOperandDefConfig
+    name: K
+    kind: input_tensor
+    type_var: T2
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] ->
+      (s9, s11, s3, s7, s10)>
+  - !LinalgOperandDefConfig
+    name: IZp
+    kind: scalar
+    type_var: I32
+  - !LinalgOperandDefConfig
+    name: KZp
+    kind: scalar
+    type_var: I32
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: U
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] ->
+      (s0, s1, s5, s9, s11)>
+  - !LinalgOperandDefConfig
+    name: strides
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11]
+      -> (s2, s6)>
+    default_indices:
+    - 1
+    - 1
+  - !LinalgOperandDefConfig
+    name: dilations
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11]
+      -> (s4, s8)>
+    default_indices:
+    - 1
+    - 1
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> (d0, d1 * s2 + d5 * s4, d2 * s6 + d6 * s8, d3, d7)>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> (d3, d4, d5, d6, d7)>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> ()>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> ()>
+    - affine_map<(d0, d1, d2, d3, d4, d5, d6, d7)[s0, s1, s2, s3, s4, s5, s6, s7,
+      s8, s9, s10, s11] -> (d0, d1, d2, d3, d4)>
+  iterator_types:
+  - parallel
+  - parallel
+  - parallel
+  - parallel
+  - parallel
+  - reduction
+  - reduction
+  - reduction
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: add
+        operands:
+        - !ScalarExpression
+          scalar_arg: O
+        - !ScalarExpression
+          scalar_fn:
+            kind: binary
+            fn_name: mul
+            operands:
+            - !ScalarExpression
+              scalar_fn:
+                kind: binary
+                fn_name: sub
+                operands:
+                - !ScalarExpression
+                  scalar_fn:
+                    kind: type
+                    fn_name: cast_signed
+                    type_var: U
+                    operands:
+                    - !ScalarExpression
+                      scalar_arg: I
+                - !ScalarExpression
+                  scalar_fn:
+                    kind: type
+                    fn_name: cast_signed
+                    type_var: U
+                    operands:
+                    - !ScalarExpression
+                      scalar_arg: IZp
+            - !ScalarExpression
+              scalar_fn:
+                kind: binary
+                fn_name: sub
+                operands:
+                - !ScalarExpression
+                  scalar_fn:
+                    kind: type
+                    fn_name: cast_signed
+                    type_var: U
+                    operands:
+                    - !ScalarExpression
+                      scalar_arg: K
+                - !ScalarExpression
+                  scalar_fn:
+                    kind: type
+                    fn_name: cast_signed
+                    type_var: U
+                    operands:
+                    - !ScalarExpression
+                      scalar_arg: KZp
+--- !LinalgOpConfig
 metadata: !LinalgOpMetadata
   name: conv_2d_ngchw_gfchw_q
   cpp_class_name: Conv2DNgchwGfchwQOp
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
index 1412c7a2615d20..d4697f0afbf466 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
@@ -133,6 +133,18 @@ def Tosa_ConvOpQuantInfoBuilder : OpBuilder<
                              pad, stride, dilation);
   }]>;
 
+// Handles grouped convolution
+def Tosa_ConvOpGroupQuantBuilder : OpBuilder<
+  (ins "::mlir::Type":$outputType, "::mlir::Value":$input,
+       "::mlir::Value":$weight, "::mlir::Value":$bias,
+       "::mlir::DenseI64ArrayAttr":$pad, "::mlir::DenseI64ArrayAttr":$stride,
+       "::mlir::DenseI64ArrayAttr":$dilation, "::mlir::IntegerAttr":$group),
+  [{
+    buildConvOpWithQuantInfo($_builder, $_state, outputType,
+                             input, weight, bias,
+                             pad, stride, dilation, group);
+  }]>;
+
 // Handles tosa.transpose_conv2d which has an outpad and output shape attribute.
 def Tosa_TransConvOpQuantInfoBuilder : OpBuilder<
   (ins "::mlir::Type":$outputType, "::mlir::Value":$input,
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
index 539b7cd0b74267..b8f04212e82a70 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
@@ -106,6 +106,7 @@ def Tosa_Conv2DOp : Tosa_InferShapedTypeOp<"conv2d"> {
     Tosa_IntArrayAttr4:$pad,
     Tosa_IntArrayAttr2:$stride,
     Tosa_IntArrayAttr2:$dilation,
+    OptionalAttr<I64Attr>:$group,
     OptionalAttr<Tosa_ConvOpQuantizationAttr>:$quantization_info,
     DefaultValuedOptionalAttr<BoolAttr, "false">:$local_bound
   );
@@ -114,7 +115,7 @@ def Tosa_Conv2DOp : Tosa_InferShapedTypeOp<"conv2d"> {
     Tosa_Tensor4D:$output
   );
 
-  let builders = [Tosa_ConvOpQuantInfoBuilder];
+  let builders = [Tosa_ConvOpQuantInfoBuilder, Tosa_ConvOpGroupQuantBuilder];
   let hasVerifier = 1;
 }
 
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
index fe53b499674324..991c98f0ed33ea 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
@@ -236,6 +236,7 @@ class ConvConverter : public OpConversionPattern<TosaConvOp> {
   LogicalResult
   matchAndRewrite(TosaConvOp op, typename TosaConvOp::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const final {
+    bool isConv2DOp = isa<tosa::Conv2DOp>(op);
     Location loc = op->getLoc();
     Value input = op->getOperand(0);
     Value weight = op->getOperand(1);
@@ -253,6 +254,24 @@ class ConvConverter : public OpConversionPattern<TosaConvOp> {
     DenseI64ArrayAttr strideTosaAttr = op.getStrideAttr();
     DenseI64ArrayAttr dilationTosaAttr = op.getDilationAttr();
     bool isQuantized = op.getQuantizationInfo().has_value();
+    int64_t group = 1;
+
+    if (auto convop = dyn_cast<tosa::Conv2DOp>(&op)) {
+      if (convop->getGroup().has_value())
+        group = convop->getGroup().value();
+    }
+
+    if (group > 1 && isConv2DOp &&
+        !std::is_same<LinalgConvOp, linalg::Conv2DNhwgcGfhwcOp>::value &&
+        !std::is_same<LinalgConvOp, linalg::Conv2DNhwcHwcfOp>::value)
+      return rewriter.notifyMatchFailure(
+          op, "tosa.conv ops should map to grouped convolution ops");
+
+    if (group == 1 && isConv2DOp &&
+        !std::is_same<LinalgConvOp, linalg::Conv2DNhwcFhwcOp>::value &&
+        !std::is_same<LinalgConvOp, linalg::Conv2DNhwcHwcfOp>::value)
+      return rewriter.notifyMatchFailure(
+          op, "tosa.conv ops should map to non-grouped convolution ops");
 
     if (!weightTy.hasStaticShape() || !biasTy.hasStaticShape())
       return rewriter.notifyMatchFailure(
@@ -274,8 +293,6 @@ class ConvConverter : public OpConversionPattern<TosaConvOp> {
         strideTosaAttr.asArrayRef(), dilationTosaAttr.asArrayRef(),
         inputSizeDims, kernelSizeDims, rewriter);
 
-    auto weightShape = weightTy.getShape();
-
     // Apply padding as necessary.
     TypedAttr zeroAttr = rewriter.getZeroAttr(inputETy);
     if (isQuantized) {
@@ -302,15 +319,64 @@ class ConvConverter : public OpConversionPattern<TosaConvOp> {
     pad.resize(pad.size() + 2, 0);
     input = applyPad(loc, input, pad, zeroAttr, rewriter);
 
-    if (4 == inputTy.getRank()) {
-      // For 2D convolutions, we need to check if the target convolution op
-      // wants a HWCF kernel layout.
-      bool wantHwcf =
-          isQuantized ? std::is_same_v<LinalgConvQOp, linalg::Conv2DNhwcHwcfQOp>
-                      : std::is_same_v<LinalgConvOp, linalg::Conv2DNhwcHwcfOp>;
-      if (wantHwcf) {
-        // Transpose the kernel to match dimension ordering of the linalg
-        // convolution operation.
+    auto weightShape = weightTy.getShape();
+    SmallVector<int64_t> weightPerm;
+
+    auto resultShape = resultTy.getShape();
+    auto newResultTy = resultTy;
+
+    if (isConv2DOp && group > 1) {
+      // Map 4D-tensors to 5D tensors
+      auto inputShape = cast<ShapedType>(input.getType()).getShape();
+      SmallVector<int64_t, 5> newInputShape = {inputShape[0], inputShape[1],
+                                               inputShape[2], group,
+                                               inputShape[3] / group};
+
+      SmallVector<int64_t, 5> newWeightShape = {group, weightShape[0] / group,
+                                                weightShape[1], weightShape[2],
+                                                weightShape[3]};
+      input = rewriter.create<tosa::ReshapeOp>(
+          loc, RankedTensorType::get(newInputShape, inputETy), input,
+          rewriter.getDenseI64ArrayAttr(newInputShape));
+      weight = rewriter.create<tosa::ReshapeOp>(
+          loc, RankedTensorType::get(newWeightShape, weightTy.getElementType()),
+          weight, rewriter.getDenseI64ArrayAttr(newWeightShape));
+    } else {
+
+      if (4 == inputTy.getRank()) {
+        // For 2D convolutions, we need to check if the target convolution op
+        // wants a HWCF kernel layout.
+        bool wantHwcf =
+            isQuantized
+                ? std::is_same_v<LinalgConvQOp, linalg::Conv2DNhwcHwcfQOp>
+                : std::is_same_v<LinalgConvOp, linalg::Conv2DNhwcHwcfOp>;
+        if (wantHwcf) {
+          // Transpose the kernel to match dimension ordering of the linalg
+          // convolution operation.
+          // TODO(suderman): See if this can be efficiently folded - check
+          // whether the input is used anywhere else, if not fold the constant.
+          SmallVector<int32_t> weightPerm;
+          for (int i = 1; i < resultTy.getRank(); i++)
+            weightPerm.push_back(i);
+          weightPerm.push_back(0);
+
+          SmallVector<int64_t> newWeightShape;
+          for (auto dim : weightPerm)
+            newWeightShape.push_back(weightShape[dim]);
+          auto weightPermAttr = rewriter.getI32TensorAttr(weightPerm);
+          Value weightPermValue =
+              rewriter.create<arith::ConstantOp>(loc, weightPermAttr);
+          Type newWeightTy =
+              RankedTensorType::get(newWeightShape, weightTy.getElementType());
+          weight = rewriter.create<tosa::TransposeOp>(loc, newWeightTy, weight,
+                                                      weightPermValue);
+        }
+      }
+
+      // For Conv3D transpose the kernel to match dimension ordering of the
+      // linalg convolution operation. Conv2D has a 1-1 mapping in linalg so
+      // better to map directly and then transpose later if desired.
+      if (5 == inputTy.getRank()) {
         // TODO(suderman): See if this can be efficiently folded - check whether
         // the input is used anywhere else, if not fold the constant.
         SmallVector<int32_t> weightPerm;
@@ -331,27 +397,11 @@ class ConvConverter : public OpConversionPattern<TosaConvOp> {
       }
     }
 
-    // For Conv3D transpose the kernel to match dimension ordering of the linalg
-    // convolution operation. Conv2D has a 1-1 mapping in linalg so better to
-    // map directly and then transpose later if desired.
-    if (5 == inputTy.getRank()) {
-      // TODO(suderman): See if this can be efficiently folded - check whether
-      // the input is used anywhere else, if not fold the constant.
-      SmallVector<int32_t> weightPerm;
-      for (int i = 1; i < resultTy.getRank(); i++)
-        weightPerm.push_back(i);
-      weightPerm.push_back(0);
-
-      SmallVector<int64_t> newWeightShape;
-      for (auto dim : weightPerm)
-        newWeightShape.push_back(weightShape[dim]);
-      auto weightPermAttr = rewriter.getI32TensorAttr(weightPerm);
-      Value weightPermValue =
-          rewriter.create<arith::ConstantOp>(loc, weightPermAttr);
-      Type newWeightTy =
-          RankedTensorType::get(newWeightShape, weightTy.getElementType());
-      weight = rewriter.create<tosa::TransposeOp>(loc, newWeightTy, weight,
-                                                  weightPermValue);
+    if (isConv2DOp && group > 1) {
+      SmallVector<int64_t, 5> newResultShape{resultShape[0], resultShape[1],
+                                             resultShape[2], group,
+                                             resultShape[3] / group};
+      newResultTy = RankedTensorType::get(newResultShape, resultETy);
     }
 
     // Extract the attributes for convolution.
@@ -368,6 +418,13 @@ class ConvConverter : public OpConversionPattern<TosaConvOp> {
     Value broadcastBias =
         linalgBroadcastAndMaybeExtSI(rewriter, loc, bias, biasEmptyTensor);
 
+    if (isConv2DOp && group > 1) {
+      broadcastBias = rewriter.create<tosa::ReshapeOp>(
+          loc, RankedTensorType::get(newResultTy.getShape(), resultETy),
+          broadcastBias, rewriter.getDenseI64ArrayAttr(newResultTy.getShape()));
+    }
+
+    Value conv;
     if (isQuantized) {
       auto quantizationInfo = *op.getQuantizationInfo();
       auto iZp = rewriter.getI32IntegerAttr(quantizationInfo.getInputZp());
@@ -376,22 +433,25 @@ class ConvConverter : public OpConversionPattern<TosaConvOp> {
       auto iZpVal = rewriter.create<arith::ConstantOp>(loc, iZp);
       auto kZpVal = rewriter.create<arith::ConstantOp>(loc, kZp);
 
-      Value conv =
+      conv =
           rewriter
               .create<LinalgConvQOp>(
-                  loc, resultTy, ValueRange{input, weight, iZpVal, kZpVal},
+                  loc, newResultTy, ValueRange{input, weight, iZpVal, kZpVal},
                   ValueRange{broadcastBias}, strideAttr, dilationAttr)
               ->getResult(0);
-
-      rewriter.replaceOp(op, conv);
-      return success();
+    } else {
+      conv = rewriter
+                 .create<LinalgConvOp>(
+                     loc, newResultTy, ValueRange{input, weight},
+                     ValueRange{broadcastBias}, strideAttr, dilationAttr)
+                 ->getResult(0);
     }
 
-    Value conv = rewriter
-                     .create<LinalgConvOp>(
-                         loc, resultTy, ValueRange{input, weight},
-                         ValueRange{broadcastBias}, strideAttr, dilationAttr)
-                     ->getResult(0);
+    if (isConv2DOp && group > 1) {
+      conv = rewriter.create<tosa::ReshapeOp>(
+          loc, RankedTensorType::get(resultShape, resultETy), conv,
+          rewriter.getDenseI64ArrayAttr(resultShape));
+    }
 
     rewriter.replaceOp(op, conv);
     return success();
@@ -1077,6 +1137,7 @@ void mlir::tosa::populateTosaToLinalgNamedConversionPatterns(
   }
   patterns->add<
       // clang-format off
+      ConvConverter<tosa::Conv2DOp, linalg::Conv2DNhwgcGfhwcOp, linalg::Conv2DNhwgcGfhwcQOp>,
       ConvConverter<tosa::Conv3DOp, linalg::Conv3DNdhwcDhwcfOp, linalg::Conv3DNdhwcDhwcfQOp>,
       DepthwiseConvConverter,
       MatMulConverter,
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
index 0d0241fea5152c..0ebc83cb8a062e 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
@@ -383,6 +383,19 @@ static void buildConvOpWithQuantInfo(OpBuilder &builder, OperationState &result,
   }
 }
 
+/// Handles grouped convolution
+static void buildConvOpWithQuantInfo(OpBuilder &builder, OperationState &result,
+                                     Type outputType, Value input, Value weight,
+                                     Value bias, DenseI64ArrayAttr pad,
+                                     DenseI64ArrayAttr stride,
+                                     DenseI64ArrayAttr dilation,
+                                     mlir::IntegerAttr group) {
+  buildConvOpWithQuantInfo(builder, result, outputType, input, weight, bias,
+                           pad, stride, dilation);
+  if (group)
+    result.addAttribute("group", group);
+}
+
 /// Handles tosa.transpose_conv2d which has outpad and output shape
 /// attributes.
 static void buildTransConvOpWithQuantInfo(
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp
index 0779cdb9667a1a..479d4da4026008 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp
@@ -75,7 +75,8 @@ class TransposeConvNonStridedConverter
           loc, resultTy, input, reverse2, bias,
           rewriter.getDenseI64ArrayAttr(convPad),
           rewriter.getDenseI64ArrayAttr(stride),
-          rewriter.getDenseI64ArrayAttr({1, 1}), *op.getQuantizationInfo());
+          rewriter.getDenseI64ArrayAttr({1, 1}), nullptr,
+          *op.getQuantizationInfo());
     } else {
       conv2d = rewriter.create<tosa::Conv2DOp>(
           loc, resultTy, input, reverse2, bias,
@@ -238,7 +239,7 @@ class TransposeConvStridedConverter
                    /*pad=*/rewriter.getDenseI64ArrayAttr({0, 0, 0, 0}),
                    /*stride=*/rewriter.getDenseI64ArrayAttr({1, 1}),
                    /*dilation=*/rewriter.getDenseI64ArrayAttr({1, 1}),
-                   *op.getQuantizationInfo())
+                   /*group=*/nullptr, *op.getQuantizationInfo())
                    .getResult();
     } else {
       conv2d = CreateOpAndInferShape<tosa::Conv2DOp>(
diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
index e4a6ec7487bb2f..8fb0fd6bbe832f 100644
--- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
+++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
@@ -952,6 +952,67 @@ def conv_2d_ngchw_gfchw(
     ) * TypeFn.cast_signed(U, K[D.g, D.fg, D.c, D.kh, D.kw])
 
 
+@linalg_structured_op
+def conv_2d_nhwgc_gfhwc(
+    I=TensorDef(
+        T1, S.N, S.OH * S.SH + S.KH * S.DH, S.OW * S.SW + S.KW * S.DW, S.G, S.C
+    ),
+    K=TensorDef(T2, S.G, S.FG, S.KH, S.KW, S.C),
+    O=TensorDef(U, S.N, S.OH, S.OW, S.G, S.FG, output=True),
+    strides=IndexAttrDef(S.SH, S.SW, default=[1, 1]),
+    dilations=IndexAttrDef(S.DH, S.DW, default=[1, 1]),
+):
+    """Performs 2-D grouped convolution.
+
+    Layout:
+      * Input: NHWGC.
+      * Kernel: GFHWC.
+
+    Numeric casting is performed on the operands to the inner multiply, promoting
+    them to the same data type as the accumulator/output.
+    """
+    implements(ConvolutionOpInterface)
+    domain(D.n, D.oh, D.ow, D.g, D.fg, D.kh, D.kw, D.c)
+    O[D.n, D.oh, D.ow, D.g, D.fg] += TypeFn.cast_signed(
+        U, I[D.n, D.oh * S.SH + D.kh * S.DH, D.ow * S.SW + D.kw * S.DW, D.g, D.c]
+    ) * TypeFn.cast_signed(U, K[D.g, D.fg, D.kh, D.kw, D.c])
+
+
+@linalg_structured_op
+def conv_2d_nhwgc_gfhwc_q(
+    I=TensorDef(
+        T1, S.N, S.OH * S.SH + S.KH * S.DH, S.OW * S.SW + S.KW * S.DW, S.G, S.C
+    ),
+    K=TensorDef(T2, S.G, S.FG, S.KH, S.KW, S.C),
+    IZp=ScalarDef(I32),
+    KZp=ScalarDef(I32),
+    O=TensorDef(U, S.N, S.OH, S.OW, S.G, S.FG, output=True),
+    strides=IndexAttrDef(S.SH, S.SW, default=[1, 1]),
+    dilations=IndexAttrDef(S.DH, S.DW, default=[1, 1]),
+):
+    """Performs 2-D grouped convolution with zero point offsets.
+
+    Layout:
+      * Input: NHWGC.
+      * Kernel: GFHWC.
+
+    Numeric casting is performed on the operands to the inner multiply, promoting
+    them to the same data type as the accumulator/output. This includes the zero
+    point offsets common to quantized operations.
+    """
+    implements(ConvolutionOpInterface)
+    domain(D.n, D.oh, D.ow, D.g, D.fg, D.kh, D.kw, D.c)
+    O[D.n, D.oh, D.ow, D.g, D.fg] += (
+        TypeFn.cast_signed(
+            U, I[D.n, D.oh * S.SH + D.kh * S.DH, D.ow * S.SW + D.kw * S.DW, D.g, D.c]
+        )
+        - TypeFn.cast_signed(U, IZp)
+    ) * (
+        TypeFn.cast_signed(U, K[D.g, D.fg, D.kh, D.kw, D.c])
+        - TypeFn.cast_signed(U, KZp)
+    )
+
+
 @linalg_structured_op
 def conv_2d_ngchw_gfchw_q(
     I=TensorDef(
diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir
index 02ecbed232c8b5..b9f0e2bc6e6539 100644
--- a/mlir/test/Dialect/Linalg/named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/named-ops.mlir
@@ -409,6 +409,22 @@ func.func @conv_2d_ngchw_fgchw(%input: memref<?x?x?x?x?xf32>, %filter: memref<?x
 
 // -----
 
+// CHECK-LABEL: func @conv_2d_nhwgc_gfhwc
+func.func @conv_2d_nhwgc_gfhwc(%input: memref<?x?x?x?x?xf32>, %filter: memref<?x?x?x?x?xf32>, %output: memref<?x?x?x?x?xf32>) {
+  // CHECK:      linalg.conv_2d_nhwgc_gfhwc
+  // CHECK-SAME:   dilations = dense<1> : tensor<2xi64>
+  // CHECK-SAME:   strides = dense<1> : tensor<2xi64>
+  // CHECK-SAME:   ins(%{{.+}}, %{{.+}} : memref<?x?x?x?x?xf32>, memref<?x?x?x?x?xf32>)
+  // CHECK-SAME:   outs(%{{.+}} : memref<?x?x?x?x?xf32>)
+  linalg.conv_2d_nhwgc_gfhwc {dilations = dense<1> : tensor<2xi64>,
+                                         strides = dense<1> : tensor<2xi64>}
+     ins (%input, %filter: memref<?x?x?x?x?xf32>, memref<?x?x?x?x?xf32>)
+    outs (%output: memref<?x?x?x?x?xf32>)
+  return
+}
+
+// -----
+
 // CHECK-LABEL: func @conv_2d_ngchw_fgchw_dimensions
 func.func @conv_2d_ngchw_fgchw_dimensions(%input: tensor<1x5x3x32x32xf32>, %filter: tensor<2x5x3x3x3xf32>, %init: tensor<1x5x2x30x30xf32>) -> tensor<1x5x2x30x30xf32> {
   // CHECK:      linalg.conv_2d_ngchw_fgchw
diff --git a/mlir/test/lib/Dialect/Tosa/TosaTestPasses.cpp b/mlir/test/lib/Dialect/Tosa/TosaTestPasses.cpp
index e5a3e2b6fccaa3..37fd5c8b25a5e5 100644
--- a/mlir/test/lib/Dialect/Tosa/TosaTestPasses.cpp
+++ b/mlir/test/lib/Dialect/Tosa/TosaTestPasses.cpp
@@ -149,7 +149,7 @@ ConvertTosaConv2DOp::matchAndRewrite(Operation *op,
       op->getLoc(), newTosaConv2DOpType, tosaConv2DOp.getInput(),
       tosaConv2DOp.getWeight(), tosaConv2DOp.getBias(),
       tosaConv2DOp.getPadAttr(), tosaConv2DOp.getStrideAttr(),
-      tosaConv2DOp.getDilationAttr());
+      tosaConv2DOp.getDilationAttr(), tosaConv2DOp.getGroupAttr());
 
   // Create rescale to quantized type
   double inputScale = inputQType.getScale();