[Mlir-commits] [mlir] 8b56014 - [mlir][linalg] Creating named 1D pooling ops

Murali Vijayaraghavan llvmlistbot at llvm.org
Fri Dec 16 15:11:37 PST 2022


Author: Murali Vijayaraghavan
Date: 2022-12-16T23:10:10Z
New Revision: 8b56014e9cf73409bcf63bd7f29a6b55543b5403

URL: https://github.com/llvm/llvm-project/commit/8b56014e9cf73409bcf63bd7f29a6b55543b5403
DIFF: https://github.com/llvm/llvm-project/commit/8b56014e9cf73409bcf63bd7f29a6b55543b5403.diff

LOG: [mlir][linalg] Creating named 1D pooling ops

This is mostly going to be used for linalg transformations - to make pooling ops similar to convolution ops.

Differential Revision: https://reviews.llvm.org/D140186

Added: 
    

Modified: 
    mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
    mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
    mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir
    mlir/test/Dialect/Linalg/named-ops.mlir

Removed: 
    


################################################################################
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
index 57c6100c3ae39..9d771fc1b738a 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
@@ -661,7 +661,7 @@ metadata: !LinalgOpMetadata
     The partial multiplication results are reduced into a 2D output.
 
     Numeric casting is performed on the operands to the inner multiply, promoting
-    them to the same data type as the accumulator/output."
+    them to the same data type as the accumulator/output.
   implements:
   - LinalgContractionOpInterface
 structured_op: !LinalgStructuredOpConfig
@@ -2279,38 +2279,39 @@ structured_op: !LinalgStructuredOpConfig
     name: I
     kind: input_tensor
     type_var: T1
-    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9] -> (s0, s9, s1 *
-      s2 + s3 * s4, s5 * s6 + s7 * s8)>
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9] -> (s0, s1, s2
+      * s3 + s4 * s5, s6 * s7 + s8 * s9)>
   - !LinalgOperandDefConfig
     name: K
     kind: input_tensor
     type_var: T2
-    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9] -> (s9, s3, s7)>
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9] -> (s1, s4, s8)>
   - !LinalgOperandDefConfig
     name: O
     kind: output_tensor
     type_var: U
-    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9] -> (s0, s9, s1, s5)>
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9] -> (s0, s1, s2,
+      s6)>
   - !LinalgOperandDefConfig
     name: strides
     kind: index_attr
-    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9] -> (s2,
-      s6)>
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9] -> (s3,
+      s7)>
     default_indices:
     - 1
     - 1
   - !LinalgOperandDefConfig
     name: dilations
     kind: index_attr
-    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9] -> (s4,
-      s8)>
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9] -> (s5,
+      s9)>
     default_indices:
     - 1
     - 1
   indexing_maps: !LinalgIndexingMapsConfig
     static_indexing_maps:
     - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9]
-      -> (d0, d3, d1 * s2 + d4 * s4, d2 * s6 + d5 * s8)>
+      -> (d0, d3, d1 * s3 + d4 * s5, d2 * s7 + d5 * s9)>
     - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9]
       -> (d3, d4, d5)>
     - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9]
@@ -3470,6 +3471,497 @@ structured_op: !LinalgStructuredOpConfig
             - !ScalarExpression
               scalar_arg: I
 --- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: pooling_nwc_sum
+  cpp_class_name: PoolingNwcSumOp
+  doc: |-
+    Performs sum pooling.
+
+    Layout:
+      * Input: NWC.
+      * Kernel: W.
+
+    Numeric casting is performed on the input operand, promoting it to the same
+    data type as the accumulator/output.
+  implements:
+  - LinalgConvolutionOpInterface
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0, s1 * s2 + s3 * s4, s5)>
+  - !LinalgOperandDefConfig
+    name: K
+    kind: input_tensor
+    type_var: T2
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s3)>
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: U
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0, s1, s5)>
+  - !LinalgOperandDefConfig
+    name: strides
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s2)>
+    default_indices:
+    - 1
+  - !LinalgOperandDefConfig
+    name: dilations
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s4)>
+    default_indices:
+    - 1
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4, s5] -> (d0, d1 * s2 + d3 * s4,
+      d2)>
+    - affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4, s5] -> (d3)>
+    - affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4, s5] -> (d0, d1, d2)>
+  iterator_types:
+  - parallel
+  - parallel
+  - parallel
+  - reduction
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: add
+        operands:
+        - !ScalarExpression
+          scalar_arg: O
+        - !ScalarExpression
+          scalar_fn:
+            kind: type
+            fn_name: cast_signed
+            type_var: U
+            operands:
+            - !ScalarExpression
+              scalar_arg: I
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: pooling_ncw_sum
+  cpp_class_name: PoolingNcwSumOp
+  doc: |-
+    Performs sum pooling.
+
+    Layout:
+      * Input: NCW.
+      * Kernel: W.
+
+    Numeric casting is performed on the input operand, promoting it to the same
+    data type as the accumulator/output.
+  implements:
+  - LinalgConvolutionOpInterface
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0, s1, s2 * s3 + s4 * s5)>
+  - !LinalgOperandDefConfig
+    name: K
+    kind: input_tensor
+    type_var: T2
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s4)>
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: U
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0, s1, s2)>
+  - !LinalgOperandDefConfig
+    name: strides
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s3)>
+    default_indices:
+    - 1
+  - !LinalgOperandDefConfig
+    name: dilations
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s5)>
+    default_indices:
+    - 1
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4, s5] -> (d0, d1, d2 * s3 + d3
+      * s5)>
+    - affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4, s5] -> (d3)>
+    - affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4, s5] -> (d0, d1, d2)>
+  iterator_types:
+  - parallel
+  - parallel
+  - parallel
+  - reduction
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: add
+        operands:
+        - !ScalarExpression
+          scalar_arg: O
+        - !ScalarExpression
+          scalar_fn:
+            kind: type
+            fn_name: cast_signed
+            type_var: U
+            operands:
+            - !ScalarExpression
+              scalar_arg: I
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: pooling_nwc_max
+  cpp_class_name: PoolingNwcMaxOp
+  doc: |-
+    Performs max pooling.
+
+    Numeric casting is performed on the input operand, promoting it to the same
+    data type as the accumulator/output.
+  implements:
+  - LinalgConvolutionOpInterface
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0, s1 * s2 + s3 * s4, s5)>
+  - !LinalgOperandDefConfig
+    name: K
+    kind: input_tensor
+    type_var: T2
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s3)>
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: U
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0, s1, s5)>
+  - !LinalgOperandDefConfig
+    name: strides
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s2)>
+    default_indices:
+    - 1
+  - !LinalgOperandDefConfig
+    name: dilations
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s4)>
+    default_indices:
+    - 1
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4, s5] -> (d0, d1 * s2 + d3 * s4,
+      d2)>
+    - affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4, s5] -> (d3)>
+    - affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4, s5] -> (d0, d1, d2)>
+  iterator_types:
+  - parallel
+  - parallel
+  - parallel
+  - reduction
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: max_signed
+        operands:
+        - !ScalarExpression
+          scalar_arg: O
+        - !ScalarExpression
+          scalar_fn:
+            kind: type
+            fn_name: cast_signed
+            type_var: U
+            operands:
+            - !ScalarExpression
+              scalar_arg: I
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: pooling_nwc_max_unsigned
+  cpp_class_name: PoolingNwcMaxUnsignedOp
+  doc: |-
+    Performs unsigned max pooling.
+
+    Numeric casting is performed on the input operand, promoting it to the same
+    data type as the accumulator/output.
+  implements:
+  - LinalgConvolutionOpInterface
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0, s1 * s2 + s3 * s4, s5)>
+  - !LinalgOperandDefConfig
+    name: K
+    kind: input_tensor
+    type_var: T2
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s3)>
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: U
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0, s1, s5)>
+  - !LinalgOperandDefConfig
+    name: strides
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s2)>
+    default_indices:
+    - 1
+  - !LinalgOperandDefConfig
+    name: dilations
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s4)>
+    default_indices:
+    - 1
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4, s5] -> (d0, d1 * s2 + d3 * s4,
+      d2)>
+    - affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4, s5] -> (d3)>
+    - affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4, s5] -> (d0, d1, d2)>
+  iterator_types:
+  - parallel
+  - parallel
+  - parallel
+  - reduction
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: max_unsigned
+        operands:
+        - !ScalarExpression
+          scalar_arg: O
+        - !ScalarExpression
+          scalar_fn:
+            kind: type
+            fn_name: cast_unsigned
+            type_var: U
+            operands:
+            - !ScalarExpression
+              scalar_arg: I
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: pooling_ncw_max
+  cpp_class_name: PoolingNcwMaxOp
+  doc: |-
+    Performs max pooling.
+
+    Numeric casting is performed on the input operand, promoting it to the same
+    data type as the accumulator/output.
+  implements:
+  - LinalgConvolutionOpInterface
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0, s1, s2 * s3 + s4 * s5)>
+  - !LinalgOperandDefConfig
+    name: K
+    kind: input_tensor
+    type_var: T2
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s4)>
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: U
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0, s1, s2)>
+  - !LinalgOperandDefConfig
+    name: strides
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s3)>
+    default_indices:
+    - 1
+  - !LinalgOperandDefConfig
+    name: dilations
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s5)>
+    default_indices:
+    - 1
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4, s5] -> (d0, d1, d2 * s3 + d3
+      * s5)>
+    - affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4, s5] -> (d3)>
+    - affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4, s5] -> (d0, d1, d2)>
+  iterator_types:
+  - parallel
+  - parallel
+  - parallel
+  - reduction
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: max_signed
+        operands:
+        - !ScalarExpression
+          scalar_arg: O
+        - !ScalarExpression
+          scalar_fn:
+            kind: type
+            fn_name: cast_signed
+            type_var: U
+            operands:
+            - !ScalarExpression
+              scalar_arg: I
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: pooling_nwc_min
+  cpp_class_name: PoolingNwcMinOp
+  doc: |-
+    Performs min pooling.
+
+    Numeric casting is performed on the input operand, promoting it to the same
+    data type as the accumulator/output.
+  implements:
+  - LinalgConvolutionOpInterface
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0, s1 * s2 + s3 * s4, s5)>
+  - !LinalgOperandDefConfig
+    name: K
+    kind: input_tensor
+    type_var: T2
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s3)>
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: U
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0, s1, s5)>
+  - !LinalgOperandDefConfig
+    name: strides
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s2)>
+    default_indices:
+    - 1
+  - !LinalgOperandDefConfig
+    name: dilations
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s4)>
+    default_indices:
+    - 1
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4, s5] -> (d0, d1 * s2 + d3 * s4,
+      d2)>
+    - affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4, s5] -> (d3)>
+    - affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4, s5] -> (d0, d1, d2)>
+  iterator_types:
+  - parallel
+  - parallel
+  - parallel
+  - reduction
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: min_signed
+        operands:
+        - !ScalarExpression
+          scalar_arg: O
+        - !ScalarExpression
+          scalar_fn:
+            kind: type
+            fn_name: cast_signed
+            type_var: U
+            operands:
+            - !ScalarExpression
+              scalar_arg: I
+--- !LinalgOpConfig
+metadata: !LinalgOpMetadata
+  name: pooling_nwc_min_unsigned
+  cpp_class_name: PoolingNwcMinUnsignedOp
+  doc: |-
+    Performs unsigned min pooling.
+
+    Numeric casting is performed on the input operand, promoting it to the same
+    data type as the accumulator/output.
+  implements:
+  - LinalgConvolutionOpInterface
+structured_op: !LinalgStructuredOpConfig
+  args:
+  - !LinalgOperandDefConfig
+    name: I
+    kind: input_tensor
+    type_var: T1
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0, s1 * s2 + s3 * s4, s5)>
+  - !LinalgOperandDefConfig
+    name: K
+    kind: input_tensor
+    type_var: T2
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s3)>
+  - !LinalgOperandDefConfig
+    name: O
+    kind: output_tensor
+    type_var: U
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0, s1, s5)>
+  - !LinalgOperandDefConfig
+    name: strides
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s2)>
+    default_indices:
+    - 1
+  - !LinalgOperandDefConfig
+    name: dilations
+    kind: index_attr
+    index_attr_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s4)>
+    default_indices:
+    - 1
+  indexing_maps: !LinalgIndexingMapsConfig
+    static_indexing_maps:
+    - affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4, s5] -> (d0, d1 * s2 + d3 * s4,
+      d2)>
+    - affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4, s5] -> (d3)>
+    - affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4, s5] -> (d0, d1, d2)>
+  iterator_types:
+  - parallel
+  - parallel
+  - parallel
+  - reduction
+  assignments:
+  - !ScalarAssign
+    arg: O
+    value: !ScalarExpression
+      scalar_fn:
+        kind: binary
+        fn_name: min_unsigned
+        operands:
+        - !ScalarExpression
+          scalar_arg: O
+        - !ScalarExpression
+          scalar_fn:
+            kind: type
+            fn_name: cast_unsigned
+            type_var: U
+            operands:
+            - !ScalarExpression
+              scalar_arg: I
+--- !LinalgOpConfig
 metadata: !LinalgOpMetadata
   name: pooling_ndhwc_sum
   cpp_class_name: PoolingNdhwcSumOp

diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
index 1aa112dcf9186..8bab1607b4ed4 100644
--- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
+++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
@@ -694,7 +694,6 @@ def depthwise_conv_3d_ndhwc_dhwcm(I=TensorDef(T1,
            D.ow * S.SW + D.kw * S.DW, D.ic]) * TypeFn.cast_signed(
                U, K[D.kd, D.kh, D.kw, D.ic, D.cm])
 
-
 @linalg_structured_op
 def pooling_nhwc_sum(I=TensorDef(T1, S.N, S.OH * S.SH + S.KH * S.DH,
                                  S.OW * S.SW + S.KW * S.DW, S.C),
@@ -838,6 +837,146 @@ def pooling_nhwc_min_unsigned(I=TensorDef(T1, S.N, S.OH * S.SH + S.KH * S.DH,
     D.c] = ReduceFn.min_unsigned[D.kh, D.kw](TypeFn.cast_unsigned(
         U, I[D.n, D.oh * S.SH + D.kh * S.DH, D.ow * S.SW + D.kw * S.DW, D.c]))
 
+@linalg_structured_op
+def pooling_nwc_sum(I=TensorDef(T1, S.N,
+                                S.OW * S.SW + S.KW * S.DW, S.C),
+                    K=TensorDef(T2, S.KW, index_dims=[D.kw]),
+                    O=TensorDef(U, S.N, S.OW, S.C, output=True),
+                    strides=IndexAttrDef(S.SW, default=[1]),
+                    dilations=IndexAttrDef(S.DW, default=[1])):
+  """Performs sum pooling.
+
+  Layout:
+    * Input: NWC.
+    * Kernel: W.
+
+  Numeric casting is performed on the input operand, promoting it to the same
+  data type as the accumulator/output.
+  """
+  implements(ConvolutionOpInterface)
+  domain(D.n, D.ow, D.c, D.kw)
+  O[D.n, D.ow, D.c] += TypeFn.cast_signed(
+      U, I[D.n, D.ow * S.SW + D.kw * S.DW, D.c])
+
+
+@linalg_structured_op
+def pooling_ncw_sum(I=TensorDef(T1, S.N, S.C,
+                                S.OW * S.SW + S.KW * S.DW),
+                    K=TensorDef(T2, S.KW, index_dims=[D.kw]),
+                    O=TensorDef(U, S.N, S.C, S.OW, output=True),
+                    strides=IndexAttrDef(S.SW, default=[1]),
+                    dilations=IndexAttrDef(S.DW, default=[1])):
+  """Performs sum pooling.
+
+  Layout:
+    * Input: NCW.
+    * Kernel: W.
+
+  Numeric casting is performed on the input operand, promoting it to the same
+  data type as the accumulator/output.
+  """
+  implements(ConvolutionOpInterface)
+  domain(D.n, D.c, D.ow, D.kw)
+  O[D.n, D.c, D.ow] += TypeFn.cast_signed(
+      U, I[D.n, D.c, D.ow * S.SW + D.kw * S.DW])
+
+
+@linalg_structured_op
+def pooling_nwc_max(I=TensorDef(T1, S.N,
+                                S.OW * S.SW + S.KW * S.DW, S.C),
+                    K=TensorDef(T2, S.KW, index_dims=[D.kw]),
+                    O=TensorDef(U, S.N, S.OW, S.C, output=True),
+                    strides=IndexAttrDef(S.SW, default=[1]),
+                    dilations=IndexAttrDef(S.DW, default=[1])):
+  """Performs max pooling.
+
+  Numeric casting is performed on the input operand, promoting it to the same
+  data type as the accumulator/output.
+  """
+  implements(ConvolutionOpInterface)
+  domain(D.n, D.ow, D.c, D.kw)
+  O[D.n, D.ow, D.c] = ReduceFn.max_signed[[D.kw]](TypeFn.cast_signed(
+      U, I[D.n, D.ow * S.SW + D.kw * S.DW, D.c]))
+
+
+@linalg_structured_op
+def pooling_nwc_max_unsigned(I=TensorDef(T1, S.N,
+                                         S.OW * S.SW + S.KW * S.DW, S.C),
+                             K=TensorDef(T2,
+                                         S.KW,
+                                         index_dims=[D.kw]),
+                             O=TensorDef(U, S.N, S.OW, S.C, output=True),
+                             strides=IndexAttrDef(S.SW, default=[1]),
+                             dilations=IndexAttrDef(S.DW, default=[1])):
+  """Performs unsigned max pooling.
+
+  Numeric casting is performed on the input operand, promoting it to the same
+  data type as the accumulator/output.
+  """
+  implements(ConvolutionOpInterface)
+  domain(D.n, D.ow, D.c, D.kw)
+  O[D.n, D.ow,
+    D.c] = ReduceFn.max_unsigned[[D.kw]](TypeFn.cast_unsigned(
+        U, I[D.n, D.ow * S.SW + D.kw * S.DW, D.c]))
+
+
+@linalg_structured_op
+def pooling_ncw_max(I=TensorDef(T1, S.N, S.C,
+                                S.OW * S.SW + S.KW * S.DW),
+                    K=TensorDef(T2, S.KW, index_dims=[D.kw]),
+                    O=TensorDef(U, S.N, S.C, S.OW, output=True),
+                    strides=IndexAttrDef(S.SW, default=[1]),
+                    dilations=IndexAttrDef(S.DW, default=[1])):
+  """Performs max pooling.
+
+  Numeric casting is performed on the input operand, promoting it to the same
+  data type as the accumulator/output.
+  """
+  implements(ConvolutionOpInterface)
+  domain(D.n, D.c, D.ow, D.kw)
+  O[D.n, D.c, D.ow] = ReduceFn.max_signed[[D.kw]](TypeFn.cast_signed(
+      U, I[D.n, D.c, D.ow * S.SW + D.kw * S.DW,]))
+
+
+@linalg_structured_op
+def pooling_nwc_min(I=TensorDef(T1, S.N,
+                                S.OW * S.SW + S.KW * S.DW, S.C),
+                    K=TensorDef(T2, S.KW, index_dims=[D.kw]),
+                    O=TensorDef(U, S.N, S.OW, S.C, output=True),
+                    strides=IndexAttrDef(S.SW, default=[1]),
+                    dilations=IndexAttrDef(S.DW, default=[1])):
+  """Performs min pooling.
+
+  Numeric casting is performed on the input operand, promoting it to the same
+  data type as the accumulator/output.
+  """
+  implements(ConvolutionOpInterface)
+  domain(D.n, D.ow, D.c, D.kw)
+  O[D.n, D.ow, D.c] = ReduceFn.min_signed[[D.kw]](TypeFn.cast_signed(
+      U, I[D.n, D.ow * S.SW + D.kw * S.DW, D.c]))
+
+
+@linalg_structured_op
+def pooling_nwc_min_unsigned(I=TensorDef(T1, S.N,
+                                         S.OW * S.SW + S.KW * S.DW, S.C),
+                             K=TensorDef(T2,
+                                         S.KW,
+                                         index_dims=[D.kw]),
+                             O=TensorDef(U, S.N, S.OW, S.C, output=True),
+                             strides=IndexAttrDef(S.SW, default=[1]),
+                             dilations=IndexAttrDef(S.DW, default=[1])):
+  """Performs unsigned min pooling.
+
+  Numeric casting is performed on the input operand, promoting it to the same
+  data type as the accumulator/output.
+  """
+  implements(ConvolutionOpInterface)
+  domain(D.n, D.ow, D.c, D.kw)
+  O[D.n, D.ow,
+    D.c] = ReduceFn.min_unsigned[[D.kw]](TypeFn.cast_unsigned(
+        U, I[D.n, D.ow * S.SW + D.kw * S.DW, D.c]))
+
+
 
 @linalg_structured_op
 def pooling_ndhwc_sum(I=TensorDef(T1, S.N, S.OD * S.SD + S.KD * S.DD,

diff --git a/mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir
index fbba581e3df8e..225cd583c4256 100644
--- a/mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir
+++ b/mlir/test/Dialect/Linalg/generalize-named-polymorphic-ops.mlir
@@ -131,6 +131,20 @@ func.func @generalize_pooling_nhwc_max_f32(%input : tensor<1x4x16x1xf32>, %shape
 
 // -----
 
+func.func @generalize_pooling_nwc_max_f32(%input : tensor<1x16x1xf32>, %shape: tensor<2xf32>, %output: tensor<1x4x1xf32>) -> tensor<1x4x1xf32> {
+  %0 = linalg.pooling_nwc_max {dilations = dense<[2]> : tensor<1xi64>, strides = dense<[4]> : tensor<1xi64>}
+    ins(%input, %shape : tensor<1x16x1xf32>, tensor<2xf32>) outs(%output : tensor<1x4x1xf32>) -> tensor<1x4x1xf32>
+  return %0: tensor<1x4x1xf32>
+}
+
+// CHECK-LABEL: @generalize_pooling_nwc_max_f32
+// CHECK:      ^{{.*}}(%[[IN_ARG:.+]]: f32, %[[SHAPE_ARG:.+]]: f32, %[[OUT_ARG:.+]]: f32)
+// CHECK-NEXT:   %[[MAX:.+]] = arith.maxf %[[OUT_ARG]], %[[IN_ARG]] : f32
+// CHECK-NEXT:   linalg.yield %[[MAX]] : f32
+// CHECK-NEXT: -> tensor<1x4x1xf32>
+
+// -----
+
 func.func @generalize_pooling_nhwc_max_i32(%input : tensor<1x4x16x1xi32>, %shape: tensor<2x2xi32>, %output: tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32> {
   %0 = linalg.pooling_nhwc_max {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[2, 4]> : tensor<2xi64>}
     ins(%input, %shape : tensor<1x4x16x1xi32>, tensor<2x2xi32>) outs(%output : tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32>
@@ -143,6 +157,18 @@ func.func @generalize_pooling_nhwc_max_i32(%input : tensor<1x4x16x1xi32>, %shape
 
 // -----
 
+func.func @generalize_pooling_nwc_max_i32(%input : tensor<1x16x1xi32>, %shape: tensor<2xi32>, %output: tensor<1x4x1xi32>) -> tensor<1x4x1xi32> {
+  %0 = linalg.pooling_nwc_max {dilations = dense<[2]> : tensor<1xi64>, strides = dense<[4]> : tensor<1xi64>}
+    ins(%input, %shape : tensor<1x16x1xi32>, tensor<2xi32>) outs(%output : tensor<1x4x1xi32>) -> tensor<1x4x1xi32>
+  return %0: tensor<1x4x1xi32>
+}
+
+// CHECK-LABEL: @generalize_pooling_nwc_max_i32
+// Verify signed integer maximum.
+// CHECK:        = arith.maxsi
+
+// -----
+
 func.func @generalize_pooling_nhwc_max_unsigned_i32(%input : tensor<1x4x16x1xi32>, %shape: tensor<2x2xi32>, %output: tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32> {
   %0 = linalg.pooling_nhwc_max_unsigned {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[2, 4]> : tensor<2xi64>}
     ins(%input, %shape : tensor<1x4x16x1xi32>, tensor<2x2xi32>) outs(%output : tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32>
@@ -155,6 +181,18 @@ func.func @generalize_pooling_nhwc_max_unsigned_i32(%input : tensor<1x4x16x1xi32
 
 // -----
 
+func.func @generalize_pooling_nwc_max_unsigned_i32(%input : tensor<1x16x1xi32>, %shape: tensor<2xi32>, %output: tensor<1x4x1xi32>) -> tensor<1x4x1xi32> {
+  %0 = linalg.pooling_nwc_max_unsigned {dilations = dense<[2]> : tensor<1xi64>, strides = dense<[4]> : tensor<1xi64>}
+    ins(%input, %shape : tensor<1x16x1xi32>, tensor<2xi32>) outs(%output : tensor<1x4x1xi32>) -> tensor<1x4x1xi32>
+  return %0: tensor<1x4x1xi32>
+}
+
+// CHECK-LABEL: @generalize_pooling_nwc_max_unsigned_i32
+// Verify unsigned integer maximum.
+// CHECK:        = arith.maxui
+
+// -----
+
 func.func @generalize_pooling_nhwc_min_f32(%input : tensor<1x4x16x1xf32>, %shape: tensor<2x2xf32>, %output: tensor<1x2x4x1xf32>) -> tensor<1x2x4x1xf32> {
   %0 = linalg.pooling_nhwc_min {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[2, 4]> : tensor<2xi64>}
     ins(%input, %shape : tensor<1x4x16x1xf32>, tensor<2x2xf32>) outs(%output : tensor<1x2x4x1xf32>) -> tensor<1x2x4x1xf32>
@@ -169,6 +207,20 @@ func.func @generalize_pooling_nhwc_min_f32(%input : tensor<1x4x16x1xf32>, %shape
 
 // -----
 
+func.func @generalize_pooling_nwc_min_f32(%input : tensor<1x16x1xf32>, %shape: tensor<2xf32>, %output: tensor<1x4x1xf32>) -> tensor<1x4x1xf32> {
+  %0 = linalg.pooling_nwc_min {dilations = dense<[2]> : tensor<1xi64>, strides = dense<[4]> : tensor<1xi64>}
+    ins(%input, %shape : tensor<1x16x1xf32>, tensor<2xf32>) outs(%output : tensor<1x4x1xf32>) -> tensor<1x4x1xf32>
+  return %0: tensor<1x4x1xf32>
+}
+
+// CHECK-LABEL: @generalize_pooling_nwc_min_f32
+// CHECK:      ^{{.*}}(%[[IN_ARG:.+]]: f32, %[[SHAPE_ARG:.+]]: f32, %[[OUT_ARG:.+]]: f32)
+// CHECK-NEXT:   %[[MIN:.+]] = arith.minf %[[OUT_ARG]], %[[IN_ARG]] : f32
+// CHECK-NEXT:   linalg.yield %[[MIN]] : f32
+// CHECK-NEXT: -> tensor<1x4x1xf32>
+
+// -----
+
 func.func @generalize_pooling_nhwc_min_i32(%input : tensor<1x4x16x1xi32>, %shape: tensor<2x2xi32>, %output: tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32> {
   %0 = linalg.pooling_nhwc_min {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[2, 4]> : tensor<2xi64>}
     ins(%input, %shape : tensor<1x4x16x1xi32>, tensor<2x2xi32>) outs(%output : tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32>
@@ -181,6 +233,18 @@ func.func @generalize_pooling_nhwc_min_i32(%input : tensor<1x4x16x1xi32>, %shape
 
 // -----
 
+func.func @generalize_pooling_nwc_min_i32(%input : tensor<1x16x1xi32>, %shape: tensor<2xi32>, %output: tensor<1x4x1xi32>) -> tensor<1x4x1xi32> {
+  %0 = linalg.pooling_nwc_min {dilations = dense<[2]> : tensor<1xi64>, strides = dense<[4]> : tensor<1xi64>}
+    ins(%input, %shape : tensor<1x16x1xi32>, tensor<2xi32>) outs(%output : tensor<1x4x1xi32>) -> tensor<1x4x1xi32>
+  return %0: tensor<1x4x1xi32>
+}
+
+// CHECK-LABEL: @generalize_pooling_nwc_min_i32
+// Verify signed integer minimum.
+// CHECK:        = arith.minsi
+
+// -----
+
 func.func @generalize_pooling_nhwc_min_unsigned_i32(%input : tensor<1x4x16x1xi32>, %shape: tensor<2x2xi32>, %output: tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32> {
   %0 = linalg.pooling_nhwc_min_unsigned {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[2, 4]> : tensor<2xi64>}
     ins(%input, %shape : tensor<1x4x16x1xi32>, tensor<2x2xi32>) outs(%output : tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32>
@@ -193,6 +257,18 @@ func.func @generalize_pooling_nhwc_min_unsigned_i32(%input : tensor<1x4x16x1xi32
 
 // -----
 
+func.func @generalize_pooling_nwc_min_unsigned_i32(%input : tensor<1x16x1xi32>, %shape: tensor<2xi32>, %output: tensor<1x4x1xi32>) -> tensor<1x4x1xi32> {
+  %0 = linalg.pooling_nwc_min_unsigned {dilations = dense<[2]> : tensor<1xi64>, strides = dense<[4]> : tensor<1xi64>}
+    ins(%input, %shape : tensor<1x16x1xi32>, tensor<2xi32>) outs(%output : tensor<1x4x1xi32>) -> tensor<1x4x1xi32>
+  return %0: tensor<1x4x1xi32>
+}
+
+// CHECK-LABEL: @generalize_pooling_nwc_min_unsigned_i32
+// Verify unsigned integer minimum.
+// CHECK:        = arith.minui
+
+// -----
+
 func.func @generalize_pooling_nhwc_sum_f32(%input : tensor<1x4x16x1xf32>, %shape: tensor<2x2xf32>, %output: tensor<1x2x4x1xf32>) -> tensor<1x2x4x1xf32> {
   %0 = linalg.pooling_nhwc_sum {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[2, 4]> : tensor<2xi64>}
     ins(%input, %shape : tensor<1x4x16x1xf32>, tensor<2x2xf32>) outs(%output : tensor<1x2x4x1xf32>) -> tensor<1x2x4x1xf32>
@@ -207,6 +283,20 @@ func.func @generalize_pooling_nhwc_sum_f32(%input : tensor<1x4x16x1xf32>, %shape
 
 // -----
 
+func.func @generalize_pooling_nwc_sum_f32(%input : tensor<1x16x1xf32>, %shape: tensor<2xf32>, %output: tensor<1x4x1xf32>) -> tensor<1x4x1xf32> {
+  %0 = linalg.pooling_nwc_sum {dilations = dense<[2]> : tensor<1xi64>, strides = dense<[4]> : tensor<1xi64>}
+    ins(%input, %shape : tensor<1x16x1xf32>, tensor<2xf32>) outs(%output : tensor<1x4x1xf32>) -> tensor<1x4x1xf32>
+  return %0: tensor<1x4x1xf32>
+}
+
+// CHECK-LABEL: @generalize_pooling_nwc_sum_f32
+// CHECK:      ^{{.*}}(%[[IN_ARG:.+]]: f32, %[[SHAPE_ARG:.+]]: f32, %[[OUT_ARG:.+]]: f32)
+// CHECK-NEXT:   %[[ADD:.+]] = arith.addf %[[OUT_ARG]], %[[IN_ARG]] : f32
+// CHECK-NEXT:   linalg.yield %[[ADD]] : f32
+// CHECK-NEXT: -> tensor<1x4x1xf32>
+
+// -----
+
 func.func @generalize_pooling_nhwc_sum_i32(%input : tensor<1x4x16x1xi32>, %shape: tensor<2x2xi32>, %output: tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32> {
   %0 = linalg.pooling_nhwc_sum {dilations = dense<[1, 2]> : tensor<2xi64>, strides = dense<[2, 4]> : tensor<2xi64>}
     ins(%input, %shape : tensor<1x4x16x1xi32>, tensor<2x2xi32>) outs(%output : tensor<1x2x4x1xi32>) -> tensor<1x2x4x1xi32>
@@ -221,6 +311,20 @@ func.func @generalize_pooling_nhwc_sum_i32(%input : tensor<1x4x16x1xi32>, %shape
 
 // -----
 
+func.func @generalize_pooling_nwc_sum_i32(%input : tensor<1x16x1xi32>, %shape: tensor<2xi32>, %output: tensor<1x4x1xi32>) -> tensor<1x4x1xi32> {
+  %0 = linalg.pooling_nwc_sum {dilations = dense<[2]> : tensor<1xi64>, strides = dense<[4]> : tensor<1xi64>}
+    ins(%input, %shape : tensor<1x16x1xi32>, tensor<2xi32>) outs(%output : tensor<1x4x1xi32>) -> tensor<1x4x1xi32>
+  return %0: tensor<1x4x1xi32>
+}
+
+// CHECK-LABEL: @generalize_pooling_nwc_sum_i32
+// CHECK:      ^{{.*}}(%[[IN_ARG:.+]]: i32, %[[SHAPE_ARG:.+]]: i32, %[[OUT_ARG:.+]]: i32)
+// CHECK-NEXT:   %[[ADD:.+]] = arith.addi %[[OUT_ARG]], %[[IN_ARG]] : i32
+// CHECK-NEXT:   linalg.yield %[[ADD]] : i32
+// CHECK-NEXT: -> tensor<1x4x1xi32>
+
+// -----
+
 func.func @generalize_fill_0d(%value: f64, %O: tensor<f32>) -> tensor<f32> {
   %0 = linalg.fill ins(%value: f64) outs(%O : tensor<f32>) -> tensor<f32>
   return %0: tensor<f32>

diff  --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir
index 69bdf892c6874..381fdc39354a2 100644
--- a/mlir/test/Dialect/Linalg/named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/named-ops.mlir
@@ -422,6 +422,25 @@ func.func @pooling_nhwc_sum_tensor(%input: tensor<1x4x4x1xf32>) -> tensor<1x2x2x
 
 // -----
 
+// CHECK-LABEL: func @pooling_nwc_sum_tensor
+// CHECK:         %{{.+}} = linalg.pooling_nwc_sum
+// CHECK-SAME:      dilations = dense<1> : tensor<1xi64>
+// CHECK-SAME:      strides = dense<1> : tensor<1xi64>
+// CHECK-SAME:      ins(%{{.+}}, %{{.+}} : tensor<1x4x1xf32>, tensor<3xf32>)
+// CHECK-SAME:      outs(%{{.+}} : tensor<1x2x1xf32>) -> tensor<1x2x1xf32>
+func.func @pooling_nwc_sum_tensor(%input: tensor<1x4x1xf32>) -> tensor<1x2x1xf32> {
+  %fake = tensor.empty() : tensor<3xf32>
+  %init = tensor.empty() : tensor<1x2x1xf32>
+  %cst = arith.constant 0.000000e+00 : f32
+  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x2x1xf32>) -> tensor<1x2x1xf32>
+  %res = linalg.pooling_nwc_sum {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %fake: tensor<1x4x1xf32>, tensor<3xf32>)
+    outs(%fill: tensor<1x2x1xf32>) -> tensor<1x2x1xf32>
+  return %res : tensor<1x2x1xf32>
+}
+
+// -----
+
 // CHECK-LABEL: func @pooling_nhwc_sum
 // CHECK:         linalg.pooling_nhwc_sum
 // CHECK-SAME:      dilations = dense<1> : tensor<2xi64>
@@ -437,6 +456,21 @@ func.func @pooling_nhwc_sum(%input: memref<1x4x4x1xf32>, %fake: memref<3x3xf32>,
 
 // -----
 
+// CHECK-LABEL: func @pooling_nwc_sum
+// CHECK:         linalg.pooling_nwc_sum
+// CHECK-SAME:      dilations = dense<1> : tensor<1xi64>
+// CHECK-SAME:      strides = dense<1> : tensor<1xi64>
+// CHECK-SAME:      ins(%{{.+}}, %{{.+}} : memref<1x4x1xf32>, memref<3xf32>)
+// CHECK-SAME:      outs(%{{.+}} : memref<1x2x1xf32>)
+func.func @pooling_nwc_sum(%input: memref<1x4x1xf32>, %fake: memref<3xf32>, %output: memref<1x2x1xf32>) {
+  linalg.pooling_nwc_sum {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %fake: memref<1x4x1xf32>, memref<3xf32>)
+    outs(%output: memref<1x2x1xf32>)
+  return
+}
+
+// -----
+
 // CHECK-LABEL: func @pooling_nchw_sum_tensor
 // CHECK:         %{{.+}} = linalg.pooling_nchw_sum
 // CHECK-SAME:      dilations = dense<1> : tensor<2xi64>
@@ -456,6 +490,25 @@ func.func @pooling_nchw_sum_tensor(%input: tensor<1x1x4x4xf32>) -> tensor<1x1x2x
 
 // -----
 
+// CHECK-LABEL: func @pooling_ncw_sum_tensor
+// CHECK:         %{{.+}} = linalg.pooling_ncw_sum
+// CHECK-SAME:      dilations = dense<1> : tensor<1xi64>
+// CHECK-SAME:      strides = dense<1> : tensor<1xi64>
+// CHECK-SAME:      ins(%{{.+}}, %{{.+}} : tensor<1x1x4xf32>, tensor<3xf32>)
+// CHECK-SAME:      outs(%{{.+}} : tensor<1x1x2xf32>) -> tensor<1x1x2xf32>
+func.func @pooling_ncw_sum_tensor(%input: tensor<1x1x4xf32>) -> tensor<1x1x2xf32> {
+  %fake = tensor.empty() : tensor<3xf32>
+  %init = tensor.empty() : tensor<1x1x2xf32>
+  %cst = arith.constant 0.000000e+00 : f32
+  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x1x2xf32>) -> tensor<1x1x2xf32>
+  %res = linalg.pooling_ncw_sum {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %fake: tensor<1x1x4xf32>, tensor<3xf32>)
+    outs(%fill: tensor<1x1x2xf32>) -> tensor<1x1x2xf32>
+  return %res : tensor<1x1x2xf32>
+}
+
+// -----
+
 // CHECK-LABEL: func @pooling_nchw_sum
 // CHECK:         linalg.pooling_nchw_sum
 // CHECK-SAME:      dilations = dense<1> : tensor<2xi64>
@@ -471,6 +524,21 @@ func.func @pooling_nchw_sum(%input: memref<1x1x4x4xf32>, %fake: memref<3x3xf32>,
 
 // -----
 
+// CHECK-LABEL: func @pooling_ncw_sum
+// CHECK:         linalg.pooling_ncw_sum
+// CHECK-SAME:      dilations = dense<1> : tensor<1xi64>
+// CHECK-SAME:      strides = dense<1> : tensor<1xi64>
+// CHECK-SAME:      ins(%{{.+}}, %{{.+}} : memref<1x1x4xf32>, memref<3xf32>)
+// CHECK-SAME:      outs(%{{.+}} : memref<1x1x2xf32>)
+func.func @pooling_ncw_sum(%input: memref<1x1x4xf32>, %fake: memref<3xf32>, %output: memref<1x1x2xf32>) {
+  linalg.pooling_ncw_sum {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %fake: memref<1x1x4xf32>, memref<3xf32>)
+    outs(%output: memref<1x1x2xf32>)
+  return
+}
+
+// -----
+
 // CHECK-LABEL: func @pooling_nhwc_max_tensor
 // CHECK:         %{{.+}} = linalg.pooling_nhwc_max
 // CHECK-SAME:      dilations = dense<1> : tensor<2xi64>
@@ -488,6 +556,24 @@ func.func @pooling_nhwc_max_tensor(%input: tensor<1x4x4x1xf32>) -> tensor<1x2x2x
   return %res : tensor<1x2x2x1xf32>
 }
 
+// -----
+// CHECK-LABEL: func @pooling_nwc_max_tensor
+// CHECK:         %{{.+}} = linalg.pooling_nwc_max
+// CHECK-SAME:      dilations = dense<1> : tensor<1xi64>
+// CHECK-SAME:      strides = dense<1> : tensor<1xi64>
+// CHECK-SAME:      ins(%{{.+}}, %{{.+}} : tensor<1x4x1xf32>, tensor<3xf32>)
+// CHECK-SAME:      outs(%{{.+}} : tensor<1x2x1xf32>) -> tensor<1x2x1xf32>
+func.func @pooling_nwc_max_tensor(%input: tensor<1x4x1xf32>) -> tensor<1x2x1xf32> {
+  %fake = tensor.empty() : tensor<3xf32>
+  %init = tensor.empty() : tensor<1x2x1xf32>
+  %cst = arith.constant 0.000000e+00 : f32
+  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x2x1xf32>) -> tensor<1x2x1xf32>
+  %res = linalg.pooling_nwc_max {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %fake: tensor<1x4x1xf32>, tensor<3xf32>)
+    outs(%fill: tensor<1x2x1xf32>) -> tensor<1x2x1xf32>
+  return %res : tensor<1x2x1xf32>
+}
+
 // -----
 // CHECK-LABEL: func @pooling_nchw_max_tensor
 // CHECK:         %{{.+}} = linalg.pooling_nchw_max
@@ -507,6 +593,25 @@ func.func @pooling_nchw_max_tensor(%input: tensor<1x1x4x4xf32>) -> tensor<1x1x2x
   return %res : tensor<1x1x2x2xf32>
 }
 
+// -----
+// CHECK-LABEL: func @pooling_ncw_max_tensor
+// CHECK:         %{{.+}} = linalg.pooling_ncw_max
+// CHECK-SAME:      dilations = dense<1> : tensor<1xi64>
+// CHECK-SAME:      strides = dense<1> : tensor<1xi64>
+// CHECK-SAME:      ins(%{{.+}}, %{{.+}} : tensor<1x1x4xf32>, tensor<3xf32>)
+// CHECK-SAME:      outs(%{{.+}} : tensor<1x1x2xf32>) -> tensor<1x1x2xf32>
+
+func.func @pooling_ncw_max_tensor(%input: tensor<1x1x4xf32>) -> tensor<1x1x2xf32> {
+  %fake = tensor.empty() : tensor<3xf32>
+  %init = tensor.empty() : tensor<1x1x2xf32>
+  %cst = arith.constant 0.000000e+00 : f32
+  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x1x2xf32>) -> tensor<1x1x2xf32>
+  %res = linalg.pooling_ncw_max {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %fake: tensor<1x1x4xf32>, tensor<3xf32>)
+    outs(%fill: tensor<1x1x2xf32>) -> tensor<1x1x2xf32>
+  return %res : tensor<1x1x2xf32>
+}
+
 // -----
 
 // CHECK-LABEL: func @pooling_nhwc_max
@@ -524,6 +629,21 @@ func.func @pooling_nhwc_max(%input: memref<1x4x4x1xf32>, %fake: memref<3x3xf32>,
 
 // -----
 
+// CHECK-LABEL: func @pooling_nwc_max
+// CHECK:         linalg.pooling_nwc_max
+// CHECK-SAME:      dilations = dense<1> : tensor<1xi64>
+// CHECK-SAME:      strides = dense<1> : tensor<1xi64>
+// CHECK-SAME:      ins(%{{.+}}, %{{.+}} : memref<1x4x1xf32>, memref<3xf32>)
+// CHECK-SAME:      outs(%{{.+}} : memref<1x2x1xf32>)
+func.func @pooling_nwc_max(%input: memref<1x4x1xf32>, %fake: memref<3xf32>, %output: memref<1x2x1xf32>) {
+  linalg.pooling_nwc_max {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %fake: memref<1x4x1xf32>, memref<3xf32>)
+    outs(%output: memref<1x2x1xf32>)
+  return
+}
+
+// -----
+
 // CHECK-LABEL: func @pooling_nhwc_i8_max_tensor
 // CHECK:         %{{.+}} = linalg.pooling_nhwc_max
 // CHECK-SAME:      dilations = dense<1> : tensor<2xi64>
@@ -543,6 +663,25 @@ func.func @pooling_nhwc_i8_max_tensor(%input: tensor<1x4x4x1xi8>) -> tensor<1x2x
 
 // -----
 
+// CHECK-LABEL: func @pooling_nwc_i8_max_tensor
+// CHECK:         %{{.+}} = linalg.pooling_nwc_max
+// CHECK-SAME:      dilations = dense<1> : tensor<1xi64>
+// CHECK-SAME:      strides = dense<1> : tensor<1xi64>
+// CHECK-SAME:      ins(%{{.+}}, %{{.+}} : tensor<1x4x1xi8>, tensor<3xi8>)
+// CHECK-SAME:      outs(%{{.+}} : tensor<1x2x1xi8>) -> tensor<1x2x1xi8>
+func.func @pooling_nwc_i8_max_tensor(%input: tensor<1x4x1xi8>) -> tensor<1x2x1xi8> {
+  %fake = tensor.empty() : tensor<3xi8>
+  %init = tensor.empty() : tensor<1x2x1xi8>
+  %cst = arith.constant 0 : i8
+  %fill = linalg.fill ins(%cst : i8) outs(%init : tensor<1x2x1xi8>) -> tensor<1x2x1xi8>
+  %res = linalg.pooling_nwc_max {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %fake: tensor<1x4x1xi8>, tensor<3xi8>)
+    outs(%fill: tensor<1x2x1xi8>) -> tensor<1x2x1xi8>
+  return %res : tensor<1x2x1xi8>
+}
+
+// -----
+
 // CHECK-LABEL: func @pooling_nhwc_i8_max
 // CHECK:         linalg.pooling_nhwc_max
 // CHECK-SAME:      dilations = dense<1> : tensor<2xi64>
@@ -558,6 +697,21 @@ func.func @pooling_nhwc_i8_max(%input: memref<1x4x4x1xi8>, %fake: memref<3x3xi8>
 
 // -----
 
+// CHECK-LABEL: func @pooling_nwc_i8_max
+// CHECK:         linalg.pooling_nwc_max
+// CHECK-SAME:      dilations = dense<1> : tensor<1xi64>
+// CHECK-SAME:      strides = dense<1> : tensor<1xi64>
+// CHECK-SAME:      ins(%{{.+}}, %{{.+}} : memref<1x4x1xi8>, memref<3xi8>)
+// CHECK-SAME:      outs(%{{.+}} : memref<1x2x1xi8>)
+func.func @pooling_nwc_i8_max(%input: memref<1x4x1xi8>, %fake: memref<3xi8>, %output: memref<1x2x1xi8>) {
+  linalg.pooling_nwc_max {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %fake: memref<1x4x1xi8>, memref<3xi8>)
+    outs(%output: memref<1x2x1xi8>)
+  return
+}
+
+// -----
+
 // CHECK-LABEL: func @pooling_nhwc_i16_max_tensor
 // CHECK:         %{{.+}} = linalg.pooling_nhwc_max
 // CHECK-SAME:      dilations = dense<1> : tensor<2xi64>
@@ -577,6 +731,25 @@ func.func @pooling_nhwc_i16_max_tensor(%input: tensor<1x4x4x1xi16>) -> tensor<1x
 
 // -----
 
+// CHECK-LABEL: func @pooling_nwc_i16_max_tensor
+// CHECK:         %{{.+}} = linalg.pooling_nwc_max
+// CHECK-SAME:      dilations = dense<1> : tensor<1xi64>
+// CHECK-SAME:      strides = dense<1> : tensor<1xi64>
+// CHECK-SAME:      ins(%{{.+}}, %{{.+}} : tensor<1x4x1xi16>, tensor<3xi16>)
+// CHECK-SAME:      outs(%{{.+}} : tensor<1x2x1xi16>) -> tensor<1x2x1xi16>
+func.func @pooling_nwc_i16_max_tensor(%input: tensor<1x4x1xi16>) -> tensor<1x2x1xi16> {
+  %fake = tensor.empty() : tensor<3xi16>
+  %init = tensor.empty() : tensor<1x2x1xi16>
+  %cst = arith.constant 0 : i16
+  %fill = linalg.fill ins(%cst : i16) outs(%init : tensor<1x2x1xi16>) -> tensor<1x2x1xi16>
+  %res = linalg.pooling_nwc_max {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %fake: tensor<1x4x1xi16>, tensor<3xi16>)
+    outs(%fill: tensor<1x2x1xi16>) -> tensor<1x2x1xi16>
+  return %res : tensor<1x2x1xi16>
+}
+
+// -----
+
 // CHECK-LABEL: func @pooling_nhwc_i16_max
 // CHECK:         linalg.pooling_nhwc_max
 // CHECK-SAME:      dilations = dense<1> : tensor<2xi64>
@@ -592,6 +765,21 @@ func.func @pooling_nhwc_i16_max(%input: memref<1x4x4x1xi16>, %fake: memref<3x3xi
 
 // -----
 
+// CHECK-LABEL: func @pooling_nwc_i16_max
+// CHECK:         linalg.pooling_nwc_max
+// CHECK-SAME:      dilations = dense<1> : tensor<1xi64>
+// CHECK-SAME:      strides = dense<1> : tensor<1xi64>
+// CHECK-SAME:      ins(%{{.+}}, %{{.+}} : memref<1x4x1xi16>, memref<3xi16>)
+// CHECK-SAME:      outs(%{{.+}} : memref<1x2x1xi16>)
+func.func @pooling_nwc_i16_max(%input: memref<1x4x1xi16>, %fake: memref<3xi16>, %output: memref<1x2x1xi16>) {
+  linalg.pooling_nwc_max {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %fake: memref<1x4x1xi16>, memref<3xi16>)
+    outs(%output: memref<1x2x1xi16>)
+  return
+}
+
+// -----
+
 // CHECK-LABEL: func @pooling_nhwc_i32_max_tensor
 // CHECK:         %{{.+}} = linalg.pooling_nhwc_max
 // CHECK-SAME:      dilations = dense<1> : tensor<2xi64>
@@ -611,6 +799,25 @@ func.func @pooling_nhwc_i32_max_tensor(%input: tensor<1x4x4x1xi32>) -> tensor<1x
 
 // -----
 
+// CHECK-LABEL: func @pooling_nwc_i32_max_tensor
+// CHECK:         %{{.+}} = linalg.pooling_nwc_max
+// CHECK-SAME:      dilations = dense<1> : tensor<1xi64>
+// CHECK-SAME:      strides = dense<1> : tensor<1xi64>
+// CHECK-SAME:      ins(%{{.+}}, %{{.+}} : tensor<1x4x1xi32>, tensor<3xi32>)
+// CHECK-SAME:      outs(%{{.+}} : tensor<1x2x1xi32>) -> tensor<1x2x1xi32>
+func.func @pooling_nwc_i32_max_tensor(%input: tensor<1x4x1xi32>) -> tensor<1x2x1xi32> {
+  %fake = tensor.empty() : tensor<3xi32>
+  %init = tensor.empty() : tensor<1x2x1xi32>
+  %cst = arith.constant 0 : i32
+  %fill = linalg.fill ins(%cst : i32) outs(%init : tensor<1x2x1xi32>) -> tensor<1x2x1xi32>
+  %res = linalg.pooling_nwc_max {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %fake: tensor<1x4x1xi32>, tensor<3xi32>)
+    outs(%fill: tensor<1x2x1xi32>) -> tensor<1x2x1xi32>
+  return %res : tensor<1x2x1xi32>
+}
+
+// -----
+
 // CHECK-LABEL: func @pooling_nhwc_i32_max
 // CHECK:         linalg.pooling_nhwc_max
 // CHECK-SAME:      dilations = dense<1> : tensor<2xi64>
@@ -624,6 +831,21 @@ func.func @pooling_nhwc_i32_max(%input: memref<1x4x4x1xi32>, %fake: memref<3x3xi
   return
 }
 
+// -----
+
+// CHECK-LABEL: func @pooling_nwc_i32_max
+// CHECK:         linalg.pooling_nwc_max
+// CHECK-SAME:      dilations = dense<1> : tensor<1xi64>
+// CHECK-SAME:      strides = dense<1> : tensor<1xi64>
+// CHECK-SAME:      ins(%{{.+}}, %{{.+}} : memref<1x4x1xi32>, memref<3xi32>)
+// CHECK-SAME:      outs(%{{.+}} : memref<1x2x1xi32>)
+func.func @pooling_nwc_i32_max(%input: memref<1x4x1xi32>, %fake: memref<3xi32>, %output: memref<1x2x1xi32>) {
+  linalg.pooling_nwc_max {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %fake: memref<1x4x1xi32>, memref<3xi32>)
+    outs(%output: memref<1x2x1xi32>)
+  return
+}
+
 
 // -----
 
@@ -646,6 +868,25 @@ func.func @pooling_nhwc_min_tensor(%input: tensor<1x4x4x1xf32>) -> tensor<1x2x2x
 
 // -----
 
+// CHECK-LABEL: func @pooling_nwc_min_tensor
+// CHECK:         %{{.+}} = linalg.pooling_nwc_min
+// CHECK-SAME:      dilations = dense<1> : tensor<1xi64>
+// CHECK-SAME:      strides = dense<1> : tensor<1xi64>
+// CHECK-SAME:      ins(%{{.+}}, %{{.+}} : tensor<1x4x1xf32>, tensor<3xf32>)
+// CHECK-SAME:      outs(%{{.+}} : tensor<1x2x1xf32>) -> tensor<1x2x1xf32>
+func.func @pooling_nwc_min_tensor(%input: tensor<1x4x1xf32>) -> tensor<1x2x1xf32> {
+  %fake = tensor.empty() : tensor<3xf32>
+  %init = tensor.empty() : tensor<1x2x1xf32>
+  %cst = arith.constant 0.000000e+00 : f32
+  %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<1x2x1xf32>) -> tensor<1x2x1xf32>
+  %res = linalg.pooling_nwc_min {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %fake: tensor<1x4x1xf32>, tensor<3xf32>)
+    outs(%fill: tensor<1x2x1xf32>) -> tensor<1x2x1xf32>
+  return %res : tensor<1x2x1xf32>
+}
+
+// -----
+
 // CHECK-LABEL: func @pooling_nhwc_min
 // CHECK:         linalg.pooling_nhwc_min
 // CHECK-SAME:      dilations = dense<1> : tensor<2xi64>
@@ -661,6 +902,21 @@ func.func @pooling_nhwc_min(%input: memref<1x4x4x1xf32>, %fake: memref<3x3xf32>,
 
 // -----
 
+// CHECK-LABEL: func @pooling_nwc_min
+// CHECK:         linalg.pooling_nwc_min
+// CHECK-SAME:      dilations = dense<1> : tensor<1xi64>
+// CHECK-SAME:      strides = dense<1> : tensor<1xi64>
+// CHECK-SAME:      ins(%{{.+}}, %{{.+}} : memref<1x4x1xf32>, memref<3xf32>)
+// CHECK-SAME:      outs(%{{.+}} : memref<1x2x1xf32>)
+func.func @pooling_nwc_min(%input: memref<1x4x1xf32>, %fake: memref<3xf32>, %output: memref<1x2x1xf32>) {
+  linalg.pooling_nwc_min {dilations = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>}
+    ins(%input, %fake: memref<1x4x1xf32>, memref<3xf32>)
+    outs(%output: memref<1x2x1xf32>)
+  return
+}
+
+// -----
+
 // CHECK-LABEL: func @pooling_ndhwc_sum_tensor
 // CHECK:         %{{.+}} = linalg.pooling_ndhwc_sum
 // CHECK-SAME:      dilations = dense<1> : tensor<3xi64>


        


More information about the Mlir-commits mailing list