[llvm-branch-commits] [flang] [WIP][flang] Introduce HLFIR lowerings to omp.workshare_loop_nest (PR #104748)

Sun Sep 22 23:51:56 PDT 2024

https://github.com/ivanradanov updated https://github.com/llvm/llvm-project/pull/104748

>From 975a0d74c5ae81c69844b8bd089832ed53278477 Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov <ivanov.i.aa at m.titech.ac.jp>
Date: Mon, 23 Sep 2024 15:07:48 +0900
Subject: [PATCH 1/4] Emit a proper error message for CFG in workshare

---
 flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 13 +++++-
 .../OpenMP/lower-workshare-todo-cfg-dom.mlir  | 23 ++++++++++
 .../OpenMP/lower-workshare-todo-cfg.mlir      | 20 +++++++++
 .../Transforms/OpenMP/lower-workshare5.mlir   | 42 -------------------
 4 files changed, 55 insertions(+), 43 deletions(-)
 create mode 100644 flang/test/Transforms/OpenMP/lower-workshare-todo-cfg-dom.mlir
 create mode 100644 flang/test/Transforms/OpenMP/lower-workshare-todo-cfg.mlir
 delete mode 100644 flang/test/Transforms/OpenMP/lower-workshare5.mlir

diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
index 6e5538b54ba5e0..cf1867311cc236 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -16,6 +16,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "flang/Optimizer/Builder/Todo.h"
 #include <flang/Optimizer/Builder/FIRBuilder.h>
 #include <flang/Optimizer/Dialect/FIROps.h>
 #include <flang/Optimizer/Dialect/FIRType.h>
@@ -416,8 +417,18 @@ LogicalResult lowerWorkshare(mlir::omp::WorkshareOp wsOp, DominanceInfo &di) {
 
   parallelizeRegion(wsOp.getRegion(), newOp.getRegion(), rootMapping, loc, di);
 
+  // FIXME: Currently, we only support workshare constructs with structured
+  // control flow. The transformation itself supports CFG; however, once we
+  // transform the MLIR region in the omp.workshare, we need to inline that
+  // region in the parent block. We have no guarantees at this point of the
+  // pipeline that the parent op supports CFG (e.g. fir.if), thus this is not
+  // generally possible. The alternative is to put the lowered region in an
+  // operation akin to scf.execute_region, which will get lowered at the same
+  // time as fir ops get lowered to CFG. However, SCF is not registered in
+  // flang so we cannot use it. Remove this requirement once we have
+  // scf.execute_region or an alternative operation available.
   if (wsOp.getRegion().getBlocks().size() != 1)
-    return failure();
+    TODO(wsOp->getLoc(), "omp workshare with unstructured control flow");
 
   // Inline the contents of the placeholder workshare op into its parent block.
   Block *theBlock = &newOp.getRegion().front();
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg-dom.mlir b/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg-dom.mlir
new file mode 100644
index 00000000000000..1c47d448f597d9
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg-dom.mlir
@@ -0,0 +1,23 @@
+// RUN: %not_todo_cmd fir-opt --lower-workshare --allow-unregistered-dialect %s 2>&1 | FileCheck %s
+
+// CHECK: not yet implemented: omp workshare with unstructured control flow
+
+// Multi-block workshare region where %r is used in ^bb2 but defined in ^bb3; lowering this CFG is not supported yet, so only the TODO diagnostic is checked.
+func.func @wsfunc() {
+  %a = fir.alloca i32
+  omp.parallel {
+    omp.workshare {
+    ^bb1:
+      %c1 = arith.constant 1 : i32
+      cf.br ^bb3(%c1: i32)
+    ^bb2:
+      "test.test2"(%r) : (i32) -> ()
+      omp.terminator
+    ^bb3(%arg1: i32):
+      %r = "test.test2"(%arg1) : (i32) -> i32
+      cf.br ^bb2
+    }
+    omp.terminator
+  }
+  return
+}
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg.mlir b/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg.mlir
new file mode 100644
index 00000000000000..bf6c196a05b4a3
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg.mlir
@@ -0,0 +1,20 @@
+// RUN: %not_todo_cmd fir-opt --lower-workshare --allow-unregistered-dialect %s 2>&1 | FileCheck %s
+
+// CHECK: not yet implemented: omp workshare with unstructured control flow
+
+// A simple two-block CFG inside omp.workshare; lowering it is not supported yet, so only the TODO diagnostic is checked.
+func.func @wsfunc() {
+  %a = fir.alloca i32
+  omp.parallel {
+    omp.workshare {
+    ^bb1:
+      %c1 = arith.constant 1 : i32
+      cf.br ^bb3(%c1: i32)
+    ^bb3(%arg1: i32):
+      "test.test2"(%arg1) : (i32) -> ()
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
diff --git a/flang/test/Transforms/OpenMP/lower-workshare5.mlir b/flang/test/Transforms/OpenMP/lower-workshare5.mlir
deleted file mode 100644
index 177f8aa8f86c7c..00000000000000
--- a/flang/test/Transforms/OpenMP/lower-workshare5.mlir
+++ /dev/null
@@ -1,42 +0,0 @@
-// XFAIL: *
-// RUN: fir-opt --split-input-file --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
-
-// TODO we can lower these but we have no guarantee that the parent of
-// omp.workshare supports multi-block regions, thus we fail for now.
-
-func.func @wsfunc() {
-  %a = fir.alloca i32
-  omp.parallel {
-    omp.workshare {
-    ^bb1:
-      %c1 = arith.constant 1 : i32
-      cf.br ^bb3(%c1: i32)
-    ^bb3(%arg1: i32):
-      "test.test2"(%arg1) : (i32) -> ()
-      omp.terminator
-    }
-    omp.terminator
-  }
-  return
-}
-
-// -----
-
-func.func @wsfunc() {
-  %a = fir.alloca i32
-  omp.parallel {
-    omp.workshare {
-    ^bb1:
-      %c1 = arith.constant 1 : i32
-      cf.br ^bb3(%c1: i32)
-    ^bb2:
-      "test.test2"(%r) : (i32) -> ()
-      omp.terminator
-    ^bb3(%arg1: i32):
-      %r = "test.test2"(%arg1) : (i32) -> i32
-      cf.br ^bb2
-    }
-    omp.terminator
-  }
-  return
-}

>From 79ac7998609480d18be4ea3bc61b6c1c77089f70 Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov <ivanov.i.aa at m.titech.ac.jp>
Date: Mon, 23 Sep 2024 15:44:23 +0900
Subject: [PATCH 2/4] Cleanup tests

---
 .../OpenMP/lower-workshare-alloca.mlir        |  55 +++++
 ...are6.mlir => lower-workshare-binding.mlir} |   0
 ...are4.mlir => lower-workshare-cleanup.mlir} |   0
 ....mlir => lower-workshare-copyprivate.mlir} |   0
 ...hare2.mlir => lower-workshare-nowait.mlir} |   0
 .../Transforms/OpenMP/lower-workshare.mlir    | 189 ------------------
 6 files changed, 55 insertions(+), 189 deletions(-)
 create mode 100644 flang/test/Transforms/OpenMP/lower-workshare-alloca.mlir
 rename flang/test/Transforms/OpenMP/{lower-workshare6.mlir => lower-workshare-binding.mlir} (100%)
 rename flang/test/Transforms/OpenMP/{lower-workshare4.mlir => lower-workshare-cleanup.mlir} (100%)
 rename flang/test/Transforms/OpenMP/{lower-workshare3.mlir => lower-workshare-copyprivate.mlir} (100%)
 rename flang/test/Transforms/OpenMP/{lower-workshare2.mlir => lower-workshare-nowait.mlir} (100%)
 delete mode 100644 flang/test/Transforms/OpenMP/lower-workshare.mlir

diff --git a/flang/test/Transforms/OpenMP/lower-workshare-alloca.mlir b/flang/test/Transforms/OpenMP/lower-workshare-alloca.mlir
new file mode 100644
index 00000000000000..d1bef3a359e487
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-alloca.mlir
@@ -0,0 +1,55 @@
+// RUN: fir-opt --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+// Checks that fir.alloca is hoisted out and copyprivate'd
+func.func @wsfunc() {
+  omp.workshare {
+    %c1 = arith.constant 1 : index
+    %c42 = arith.constant 42 : index
+    %c1_i32 = arith.constant 1 : i32
+    %alloc = fir.alloca i32
+    fir.store %c1_i32 to %alloc : !fir.ref<i32>
+    omp.workshare.loop_wrapper {
+      omp.loop_nest (%arg1) : index = (%c1) to (%c42) inclusive step (%c1) {
+        "test.test1"(%alloc) : (!fir.ref<i32>) -> ()
+        omp.yield
+      }
+      omp.terminator
+    }
+    "test.test2"(%alloc) : (!fir.ref<i32>) -> ()
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL:   func.func private @_workshare_copy_i32(
+// CHECK-SAME:                                           %[[VAL_0:.*]]: !fir.ref<i32>,
+// CHECK-SAME:                                           %[[VAL_1:.*]]: !fir.ref<i32>) {
+// CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32>
+// CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<i32>
+// CHECK:           return
+// CHECK:         }
+
+// CHECK-LABEL:   func.func @wsfunc() {
+// CHECK:           %[[VAL_0:.*]] = fir.alloca i32
+// CHECK:           omp.single copyprivate(%[[VAL_0]] -> @_workshare_copy_i32 : !fir.ref<i32>) {
+// CHECK:             %[[VAL_1:.*]] = arith.constant 1 : i32
+// CHECK:             fir.store %[[VAL_1]] to %[[VAL_0]] : !fir.ref<i32>
+// CHECK:             omp.terminator
+// CHECK:           }
+// CHECK:           %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_3:.*]] = arith.constant 42 : index
+// CHECK:           omp.wsloop {
+// CHECK:             omp.loop_nest (%[[VAL_4:.*]]) : index = (%[[VAL_2]]) to (%[[VAL_3]]) inclusive step (%[[VAL_2]]) {
+// CHECK:               "test.test1"(%[[VAL_0]]) : (!fir.ref<i32>) -> ()
+// CHECK:               omp.yield
+// CHECK:             }
+// CHECK:             omp.terminator
+// CHECK:           }
+// CHECK:           omp.single nowait {
+// CHECK:             "test.test2"(%[[VAL_0]]) : (!fir.ref<i32>) -> ()
+// CHECK:             omp.terminator
+// CHECK:           }
+// CHECK:           omp.barrier
+// CHECK:           return
+// CHECK:         }
+
diff --git a/flang/test/Transforms/OpenMP/lower-workshare6.mlir b/flang/test/Transforms/OpenMP/lower-workshare-binding.mlir
similarity index 100%
rename from flang/test/Transforms/OpenMP/lower-workshare6.mlir
rename to flang/test/Transforms/OpenMP/lower-workshare-binding.mlir
diff --git a/flang/test/Transforms/OpenMP/lower-workshare4.mlir b/flang/test/Transforms/OpenMP/lower-workshare-cleanup.mlir
similarity index 100%
rename from flang/test/Transforms/OpenMP/lower-workshare4.mlir
rename to flang/test/Transforms/OpenMP/lower-workshare-cleanup.mlir
diff --git a/flang/test/Transforms/OpenMP/lower-workshare3.mlir b/flang/test/Transforms/OpenMP/lower-workshare-copyprivate.mlir
similarity index 100%
rename from flang/test/Transforms/OpenMP/lower-workshare3.mlir
rename to flang/test/Transforms/OpenMP/lower-workshare-copyprivate.mlir
diff --git a/flang/test/Transforms/OpenMP/lower-workshare2.mlir b/flang/test/Transforms/OpenMP/lower-workshare-nowait.mlir
similarity index 100%
rename from flang/test/Transforms/OpenMP/lower-workshare2.mlir
rename to flang/test/Transforms/OpenMP/lower-workshare-nowait.mlir
diff --git a/flang/test/Transforms/OpenMP/lower-workshare.mlir b/flang/test/Transforms/OpenMP/lower-workshare.mlir
deleted file mode 100644
index a609ee5d3d6c2a..00000000000000
--- a/flang/test/Transforms/OpenMP/lower-workshare.mlir
+++ /dev/null
@@ -1,189 +0,0 @@
-// RUN: fir-opt --split-input-file --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
-
-// checks:
-// nowait on final omp.single
-func.func @wsfunc(%arg0: !fir.ref<!fir.array<42xi32>>) {
-  omp.parallel {
-    omp.workshare {
-      %c42 = arith.constant 42 : index
-      %c1_i32 = arith.constant 1 : i32
-      %0 = fir.shape %c42 : (index) -> !fir.shape<1>
-      %1:2 = hlfir.declare %arg0(%0) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
-      %2 = fir.allocmem !fir.array<42xi32> {bindc_name = ".tmp.array", uniq_name = ""}
-      %3:2 = hlfir.declare %2(%0) {uniq_name = ".tmp.array"} : (!fir.heap<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.heap<!fir.array<42xi32>>, !fir.heap<!fir.array<42xi32>>)
-      %true = arith.constant true
-      %c1 = arith.constant 1 : index
-      omp.workshare.loop_wrapper {
-        omp.loop_nest (%arg1) : index = (%c1) to (%c42) inclusive step (%c1) {
-          %7 = hlfir.designate %1#0 (%arg1)  : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
-          %8 = fir.load %7 : !fir.ref<i32>
-          %9 = arith.subi %8, %c1_i32 : i32
-          %10 = hlfir.designate %3#0 (%arg1)  : (!fir.heap<!fir.array<42xi32>>, index) -> !fir.ref<i32>
-          hlfir.assign %9 to %10 temporary_lhs : i32, !fir.ref<i32>
-          omp.yield
-        }
-        omp.terminator
-      }
-      %4 = fir.undefined tuple<!fir.heap<!fir.array<42xi32>>, i1>
-      %5 = fir.insert_value %4, %true, [1 : index] : (tuple<!fir.heap<!fir.array<42xi32>>, i1>, i1) -> tuple<!fir.heap<!fir.array<42xi32>>, i1>
-      %6 = fir.insert_value %5, %3#0, [0 : index] : (tuple<!fir.heap<!fir.array<42xi32>>, i1>, !fir.heap<!fir.array<42xi32>>) -> tuple<!fir.heap<!fir.array<42xi32>>, i1>
-      hlfir.assign %3#0 to %1#0 : !fir.heap<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>
-      fir.freemem %3#0 : !fir.heap<!fir.array<42xi32>>
-      omp.terminator
-    }
-    omp.terminator
-  }
-  return
-}
-
-// -----
-
-// checks:
-// fir.alloca hoisted out and copyprivate'd
-func.func @wsfunc(%arg0: !fir.ref<!fir.array<42xi32>>) {
-  omp.workshare {
-    %c1_i32 = arith.constant 1 : i32
-    %alloc = fir.alloca i32
-    fir.store %c1_i32 to %alloc : !fir.ref<i32>
-    %c42 = arith.constant 42 : index
-    %0 = fir.shape %c42 : (index) -> !fir.shape<1>
-    %1:2 = hlfir.declare %arg0(%0) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
-    %2 = fir.allocmem !fir.array<42xi32> {bindc_name = ".tmp.array", uniq_name = ""}
-    %3:2 = hlfir.declare %2(%0) {uniq_name = ".tmp.array"} : (!fir.heap<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.heap<!fir.array<42xi32>>, !fir.heap<!fir.array<42xi32>>)
-    %true = arith.constant true
-    %c1 = arith.constant 1 : index
-    omp.workshare.loop_wrapper {
-      omp.loop_nest (%arg1) : index = (%c1) to (%c42) inclusive step (%c1) {
-        %7 = hlfir.designate %1#0 (%arg1)  : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
-        %8 = fir.load %7 : !fir.ref<i32>
-        %ld = fir.load %alloc : !fir.ref<i32>
-        %n8 = arith.subi %8, %ld : i32
-        %9 = arith.subi %n8, %c1_i32 : i32
-        %10 = hlfir.designate %3#0 (%arg1)  : (!fir.heap<!fir.array<42xi32>>, index) -> !fir.ref<i32>
-        hlfir.assign %9 to %10 temporary_lhs : i32, !fir.ref<i32>
-        omp.yield
-      }
-      omp.terminator
-    }
-    %4 = fir.undefined tuple<!fir.heap<!fir.array<42xi32>>, i1>
-    %5 = fir.insert_value %4, %true, [1 : index] : (tuple<!fir.heap<!fir.array<42xi32>>, i1>, i1) -> tuple<!fir.heap<!fir.array<42xi32>>, i1>
-    %6 = fir.insert_value %5, %3#0, [0 : index] : (tuple<!fir.heap<!fir.array<42xi32>>, i1>, !fir.heap<!fir.array<42xi32>>) -> tuple<!fir.heap<!fir.array<42xi32>>, i1>
-    "test.test1"(%alloc) : (!fir.ref<i32>) -> ()
-    hlfir.assign %3#0 to %1#0 : !fir.heap<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>
-    fir.freemem %3#0 : !fir.heap<!fir.array<42xi32>>
-    omp.terminator
-  }
-  return
-}
-
-// CHECK-LABEL:   func.func private @_workshare_copy_heap_42xi32(
-// CHECK-SAME:                                                   %[[VAL_0:.*]]: !fir.ref<!fir.heap<!fir.array<42xi32>>>,
-// CHECK-SAME:                                                   %[[VAL_1:.*]]: !fir.ref<!fir.heap<!fir.array<42xi32>>>) {
-// CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.heap<!fir.array<42xi32>>>
-// CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<!fir.heap<!fir.array<42xi32>>>
-// CHECK:           return
-// CHECK:         }
-
-// CHECK-LABEL:   func.func @wsfunc(
-// CHECK-SAME:                      %[[VAL_0:.*]]: !fir.ref<!fir.array<42xi32>>) {
-// CHECK:           omp.parallel {
-// CHECK:             %[[VAL_1:.*]] = fir.alloca !fir.heap<!fir.array<42xi32>>
-// CHECK:             omp.single copyprivate(%[[VAL_1]] -> @_workshare_copy_heap_42xi32 : !fir.ref<!fir.heap<!fir.array<42xi32>>>) {
-// CHECK:               %[[VAL_2:.*]] = arith.constant 42 : index
-// CHECK:               %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
-// CHECK:               %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
-// CHECK:               %[[VAL_5:.*]] = fir.allocmem !fir.array<42xi32> {bindc_name = ".tmp.array", uniq_name = ""}
-// CHECK:               fir.store %[[VAL_5]] to %[[VAL_1]] : !fir.ref<!fir.heap<!fir.array<42xi32>>>
-// CHECK:               %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_3]]) {uniq_name = ".tmp.array"} : (!fir.heap<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.heap<!fir.array<42xi32>>, !fir.heap<!fir.array<42xi32>>)
-// CHECK:               omp.terminator
-// CHECK:             }
-// CHECK:             %[[VAL_7:.*]] = arith.constant 42 : index
-// CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
-// CHECK:             %[[VAL_9:.*]] = fir.shape %[[VAL_7]] : (index) -> !fir.shape<1>
-// CHECK:             %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_9]]) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
-// CHECK:             %[[VAL_11:.*]] = fir.load %[[VAL_1]] : !fir.ref<!fir.heap<!fir.array<42xi32>>>
-// CHECK:             %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]](%[[VAL_9]]) {uniq_name = ".tmp.array"} : (!fir.heap<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.heap<!fir.array<42xi32>>, !fir.heap<!fir.array<42xi32>>)
-// CHECK:             %[[VAL_13:.*]] = arith.constant 1 : index
-// CHECK:             omp.wsloop {
-// CHECK:               omp.loop_nest (%[[VAL_14:.*]]) : index = (%[[VAL_13]]) to (%[[VAL_7]]) inclusive step (%[[VAL_13]]) {
-// CHECK:                 %[[VAL_15:.*]] = hlfir.designate %[[VAL_10]]#0 (%[[VAL_14]])  : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
-// CHECK:                 %[[VAL_16:.*]] = fir.load %[[VAL_15]] : !fir.ref<i32>
-// CHECK:                 %[[VAL_17:.*]] = arith.subi %[[VAL_16]], %[[VAL_8]] : i32
-// CHECK:                 %[[VAL_18:.*]] = hlfir.designate %[[VAL_12]]#0 (%[[VAL_14]])  : (!fir.heap<!fir.array<42xi32>>, index) -> !fir.ref<i32>
-// CHECK:                 hlfir.assign %[[VAL_17]] to %[[VAL_18]] temporary_lhs : i32, !fir.ref<i32>
-// CHECK:                 omp.yield
-// CHECK:               }
-// CHECK:               omp.terminator
-// CHECK:             }
-// CHECK:             omp.single nowait {
-// CHECK:               hlfir.assign %[[VAL_12]]#0 to %[[VAL_10]]#0 : !fir.heap<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>
-// CHECK:               fir.freemem %[[VAL_12]]#0 : !fir.heap<!fir.array<42xi32>>
-// CHECK:               omp.terminator
-// CHECK:             }
-// CHECK:             omp.barrier
-// CHECK:             omp.terminator
-// CHECK:           }
-// CHECK:           return
-// CHECK:         }
-
-// CHECK-LABEL:   func.func private @_workshare_copy_heap_42xi32(
-// CHECK-SAME:                                                   %[[VAL_0:.*]]: !fir.ref<!fir.heap<!fir.array<42xi32>>>,
-// CHECK-SAME:                                                   %[[VAL_1:.*]]: !fir.ref<!fir.heap<!fir.array<42xi32>>>) {
-// CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.heap<!fir.array<42xi32>>>
-// CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<!fir.heap<!fir.array<42xi32>>>
-// CHECK:           return
-// CHECK:         }
-
-// CHECK-LABEL:   func.func private @_workshare_copy_i32(
-// CHECK-SAME:                                           %[[VAL_0:.*]]: !fir.ref<i32>,
-// CHECK-SAME:                                           %[[VAL_1:.*]]: !fir.ref<i32>) {
-// CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32>
-// CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<i32>
-// CHECK:           return
-// CHECK:         }
-
-// CHECK-LABEL:   func.func @wsfunc(
-// CHECK-SAME:                      %[[VAL_0:.*]]: !fir.ref<!fir.array<42xi32>>) {
-// CHECK:           %[[VAL_1:.*]] = fir.alloca i32
-// CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.heap<!fir.array<42xi32>>
-// CHECK:           omp.single copyprivate(%[[VAL_1]] -> @_workshare_copy_i32 : !fir.ref<i32>, %[[VAL_2]] -> @_workshare_copy_heap_42xi32 : !fir.ref<!fir.heap<!fir.array<42xi32>>>) {
-// CHECK:             %[[VAL_3:.*]] = arith.constant 1 : i32
-// CHECK:             fir.store %[[VAL_3]] to %[[VAL_1]] : !fir.ref<i32>
-// CHECK:             %[[VAL_4:.*]] = arith.constant 42 : index
-// CHECK:             %[[VAL_5:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1>
-// CHECK:             %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_5]]) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
-// CHECK:             %[[VAL_7:.*]] = fir.allocmem !fir.array<42xi32> {bindc_name = ".tmp.array", uniq_name = ""}
-// CHECK:             fir.store %[[VAL_7]] to %[[VAL_2]] : !fir.ref<!fir.heap<!fir.array<42xi32>>>
-// CHECK:             %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_5]]) {uniq_name = ".tmp.array"} : (!fir.heap<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.heap<!fir.array<42xi32>>, !fir.heap<!fir.array<42xi32>>)
-// CHECK:             omp.terminator
-// CHECK:           }
-// CHECK:           %[[VAL_9:.*]] = arith.constant 1 : i32
-// CHECK:           %[[VAL_10:.*]] = arith.constant 42 : index
-// CHECK:           %[[VAL_11:.*]] = fir.shape %[[VAL_10]] : (index) -> !fir.shape<1>
-// CHECK:           %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_11]]) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
-// CHECK:           %[[VAL_13:.*]] = fir.load %[[VAL_2]] : !fir.ref<!fir.heap<!fir.array<42xi32>>>
-// CHECK:           %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_13]](%[[VAL_11]]) {uniq_name = ".tmp.array"} : (!fir.heap<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.heap<!fir.array<42xi32>>, !fir.heap<!fir.array<42xi32>>)
-// CHECK:           %[[VAL_15:.*]] = arith.constant 1 : index
-// CHECK:           omp.wsloop {
-// CHECK:             omp.loop_nest (%[[VAL_16:.*]]) : index = (%[[VAL_15]]) to (%[[VAL_10]]) inclusive step (%[[VAL_15]]) {
-// CHECK:               %[[VAL_17:.*]] = hlfir.designate %[[VAL_12]]#0 (%[[VAL_16]])  : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
-// CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_17]] : !fir.ref<i32>
-// CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
-// CHECK:               %[[VAL_20:.*]] = arith.subi %[[VAL_18]], %[[VAL_19]] : i32
-// CHECK:               %[[VAL_21:.*]] = arith.subi %[[VAL_20]], %[[VAL_9]] : i32
-// CHECK:               %[[VAL_22:.*]] = hlfir.designate %[[VAL_14]]#0 (%[[VAL_16]])  : (!fir.heap<!fir.array<42xi32>>, index) -> !fir.ref<i32>
-// CHECK:               hlfir.assign %[[VAL_21]] to %[[VAL_22]] temporary_lhs : i32, !fir.ref<i32>
-// CHECK:               omp.yield
-// CHECK:             }
-// CHECK:             omp.terminator
-// CHECK:           }
-// CHECK:           omp.single nowait {
-// CHECK:             "test.test1"(%[[VAL_1]]) : (!fir.ref<i32>) -> ()
-// CHECK:             hlfir.assign %[[VAL_14]]#0 to %[[VAL_12]]#0 : !fir.heap<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>
-// CHECK:             fir.freemem %[[VAL_14]]#0 : !fir.heap<!fir.array<42xi32>>
-// CHECK:             omp.terminator
-// CHECK:           }
-// CHECK:           omp.barrier
-// CHECK:           return
-// CHECK:         }
-

>From 39bf15c02d0e010a6c0095c1b332ad1532ddc0ab Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov <ivanov.i.aa at m.titech.ac.jp>
Date: Sun, 4 Aug 2024 17:33:52 +0900
Subject: [PATCH 3/4] Add workshare loop wrapper lowerings

Bufferize test

Add test for should use workshare lowering
---
 .../HLFIR/Transforms/BufferizeHLFIR.cpp       |   4 +-
 .../Transforms/OptimizedBufferization.cpp     |  10 +-
 flang/test/HLFIR/bufferize-workshare.fir      |  58 ++++++++
 .../OpenMP/should-use-workshare-lowering.mlir | 140 ++++++++++++++++++
 4 files changed, 208 insertions(+), 4 deletions(-)
 create mode 100644 flang/test/HLFIR/bufferize-workshare.fir
 create mode 100644 flang/test/Transforms/OpenMP/should-use-workshare-lowering.mlir

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
index 07794828fce267..1848dbe2c7a2c2 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
@@ -26,6 +26,7 @@
 #include "flang/Optimizer/HLFIR/HLFIRDialect.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/HLFIR/Passes.h"
+#include "flang/Optimizer/OpenMP/Passes.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/IR/Dominance.h"
 #include "mlir/IR/PatternMatch.h"
@@ -792,7 +793,8 @@ struct ElementalOpConversion
     // Generate a loop nest looping around the fir.elemental shape and clone
     // fir.elemental region inside the inner loop.
     hlfir::LoopNest loopNest =
-        hlfir::genLoopNest(loc, builder, extents, !elemental.isOrdered());
+        hlfir::genLoopNest(loc, builder, extents, !elemental.isOrdered(),
+                           flangomp::shouldUseWorkshareLowering(elemental));
     auto insPt = builder.saveInsertionPoint();
     builder.setInsertionPointToStart(loopNest.body);
     auto yield = hlfir::inlineElementalOp(loc, builder, elemental,
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
index 3a0a98dc594463..f014724861e333 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
@@ -20,6 +20,7 @@
 #include "flang/Optimizer/HLFIR/HLFIRDialect.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/HLFIR/Passes.h"
+#include "flang/Optimizer/OpenMP/Passes.h"
 #include "flang/Optimizer/Transforms/Utils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/Dominance.h"
@@ -482,7 +483,8 @@ llvm::LogicalResult ElementalAssignBufferization::matchAndRewrite(
   // Generate a loop nest looping around the hlfir.elemental shape and clone
   // hlfir.elemental region inside the inner loop
   hlfir::LoopNest loopNest =
-      hlfir::genLoopNest(loc, builder, extents, !elemental.isOrdered());
+      hlfir::genLoopNest(loc, builder, extents, !elemental.isOrdered(),
+                         flangomp::shouldUseWorkshareLowering(elemental));
   builder.setInsertionPointToStart(loopNest.body);
   auto yield = hlfir::inlineElementalOp(loc, builder, elemental,
                                         loopNest.oneBasedIndices);
@@ -553,7 +555,8 @@ llvm::LogicalResult BroadcastAssignBufferization::matchAndRewrite(
   llvm::SmallVector<mlir::Value> extents =
       hlfir::getIndexExtents(loc, builder, shape);
   hlfir::LoopNest loopNest =
-      hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true);
+      hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true,
+                         flangomp::shouldUseWorkshareLowering(assign));
   builder.setInsertionPointToStart(loopNest.body);
   auto arrayElement =
       hlfir::getElementAt(loc, builder, lhs, loopNest.oneBasedIndices);
@@ -648,7 +651,8 @@ llvm::LogicalResult VariableAssignBufferization::matchAndRewrite(
   llvm::SmallVector<mlir::Value> extents =
       hlfir::getIndexExtents(loc, builder, shape);
   hlfir::LoopNest loopNest =
-      hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true);
+      hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true,
+                         flangomp::shouldUseWorkshareLowering(assign));
   builder.setInsertionPointToStart(loopNest.body);
   auto rhsArrayElement =
       hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices);
diff --git a/flang/test/HLFIR/bufferize-workshare.fir b/flang/test/HLFIR/bufferize-workshare.fir
new file mode 100644
index 00000000000000..9b7341ae43398a
--- /dev/null
+++ b/flang/test/HLFIR/bufferize-workshare.fir
@@ -0,0 +1,58 @@
+// RUN: fir-opt --bufferize-hlfir %s | FileCheck %s
+
+// CHECK-LABEL:   func.func @simple(
+// CHECK-SAME:                      %[[VAL_0:.*]]: !fir.ref<!fir.array<42xi32>>) {
+// CHECK:           omp.parallel {
+// CHECK:             omp.workshare {
+// CHECK:               %[[VAL_1:.*]] = arith.constant 42 : index
+// CHECK:               %[[VAL_2:.*]] = arith.constant 1 : i32
+// CHECK:               %[[VAL_3:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
+// CHECK:               %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+// CHECK:               %[[VAL_5:.*]] = fir.allocmem !fir.array<42xi32> {bindc_name = ".tmp.array", uniq_name = ""}
+// CHECK:               %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]](%[[VAL_3]]) {uniq_name = ".tmp.array"} : (!fir.heap<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.heap<!fir.array<42xi32>>, !fir.heap<!fir.array<42xi32>>)
+// CHECK:               %[[VAL_7:.*]] = arith.constant true
+// CHECK:               %[[VAL_8:.*]] = arith.constant 1 : index
+// CHECK:               omp.workshare.loop_wrapper {
+// CHECK:                 omp.loop_nest (%[[VAL_9:.*]]) : index = (%[[VAL_8]]) to (%[[VAL_1]]) inclusive step (%[[VAL_8]]) {
+// CHECK:                   %[[VAL_10:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_9]])  : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+// CHECK:                   %[[VAL_11:.*]] = fir.load %[[VAL_10]] : !fir.ref<i32>
+// CHECK:                   %[[VAL_12:.*]] = arith.subi %[[VAL_11]], %[[VAL_2]] : i32
+// CHECK:                   %[[VAL_13:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_9]])  : (!fir.heap<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+// CHECK:                   hlfir.assign %[[VAL_12]] to %[[VAL_13]] temporary_lhs : i32, !fir.ref<i32>
+// CHECK:                   omp.yield
+// CHECK:                 }
+// CHECK:                 omp.terminator
+// CHECK:               }
+// CHECK:               %[[VAL_14:.*]] = fir.undefined tuple<!fir.heap<!fir.array<42xi32>>, i1>
+// CHECK:               %[[VAL_15:.*]] = fir.insert_value %[[VAL_14]], %[[VAL_7]], [1 : index] : (tuple<!fir.heap<!fir.array<42xi32>>, i1>, i1) -> tuple<!fir.heap<!fir.array<42xi32>>, i1>
+// CHECK:               %[[VAL_16:.*]] = fir.insert_value %[[VAL_15]], %[[VAL_6]]#0, [0 : index] : (tuple<!fir.heap<!fir.array<42xi32>>, i1>, !fir.heap<!fir.array<42xi32>>) -> tuple<!fir.heap<!fir.array<42xi32>>, i1>
+// CHECK:               hlfir.assign %[[VAL_6]]#0 to %[[VAL_4]]#0 : !fir.heap<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>
+// CHECK:               fir.freemem %[[VAL_6]]#0 : !fir.heap<!fir.array<42xi32>>
+// CHECK:               omp.terminator
+// CHECK:             }
+// CHECK:             omp.terminator
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+func.func @simple(%arg: !fir.ref<!fir.array<42xi32>>) {
+  omp.parallel {
+    omp.workshare {
+      %c42 = arith.constant 42 : index
+      %c1_i32 = arith.constant 1 : i32
+      %shape = fir.shape %c42 : (index) -> !fir.shape<1>
+      %array:2 = hlfir.declare %arg(%shape) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+      %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
+      ^bb0(%i: index):
+        %ref = hlfir.designate %array#0 (%i) : (!fir.ref<!fir.array<42xi32>>, index) -> !fir.ref<i32>
+        %val = fir.load %ref : !fir.ref<i32>
+        %sub = arith.subi %val, %c1_i32 : i32
+        hlfir.yield_element %sub : i32
+      }
+      hlfir.assign %elemental to %array#0 : !hlfir.expr<42xi32>, !fir.ref<!fir.array<42xi32>>
+      hlfir.destroy %elemental : !hlfir.expr<42xi32>
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
diff --git a/flang/test/Transforms/OpenMP/should-use-workshare-lowering.mlir b/flang/test/Transforms/OpenMP/should-use-workshare-lowering.mlir
new file mode 100644
index 00000000000000..229fe592a02b9b
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/should-use-workshare-lowering.mlir
@@ -0,0 +1,140 @@
+// RUN: fir-opt --bufferize-hlfir %s | FileCheck %s
+
+// Checks that we correctly identify when to use the lowering to
+// omp.workshare.loop_wrapper
+
+// CHECK-LABEL: @should_parallelize_0
+// CHECK: omp.workshare.loop_wrapper
+func.func @should_parallelize_0(%arg: !fir.ref<!fir.array<42xi32>>, %idx : index) {
+  omp.workshare { // elemental sits directly in the workshare region; the test expects omp.workshare.loop_wrapper in the output
+    %c42 = arith.constant 42 : index
+    %c1_i32 = arith.constant 1 : i32
+    %shape = fir.shape %c42 : (index) -> !fir.shape<1>
+    %array:2 = hlfir.declare %arg(%shape) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+    %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> { // every element is the constant 1
+    ^bb0(%i: index):
+      hlfir.yield_element %c1_i32 : i32
+    }
+    hlfir.assign %elemental to %array#0 : !hlfir.expr<42xi32>, !fir.ref<!fir.array<42xi32>>
+    hlfir.destroy %elemental : !hlfir.expr<42xi32>
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: @should_parallelize_1
+// CHECK: omp.workshare.loop_wrapper
+func.func @should_parallelize_1(%arg: !fir.ref<!fir.array<42xi32>>, %idx : index) {
+  omp.parallel { // workshare nested in an enclosing parallel; the test still expects omp.workshare.loop_wrapper
+    omp.workshare {
+      %c42 = arith.constant 42 : index
+      %c1_i32 = arith.constant 1 : i32
+      %shape = fir.shape %c42 : (index) -> !fir.shape<1>
+      %array:2 = hlfir.declare %arg(%shape) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+      %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> { // every element is the constant 1
+      ^bb0(%i: index):
+        hlfir.yield_element %c1_i32 : i32
+      }
+      hlfir.assign %elemental to %array#0 : !hlfir.expr<42xi32>, !fir.ref<!fir.array<42xi32>>
+      hlfir.destroy %elemental : !hlfir.expr<42xi32>
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+
+// CHECK-LABEL: @should_not_parallelize_0
+// CHECK-NOT: omp.workshare.loop_wrapper
+func.func @should_not_parallelize_0(%arg: !fir.ref<!fir.array<42xi32>>, %idx : index) {
+  omp.workshare {
+    omp.single { // elemental is nested in omp.single inside the workshare; the test expects no omp.workshare.loop_wrapper
+      %c42 = arith.constant 42 : index
+      %c1_i32 = arith.constant 1 : i32
+      %shape = fir.shape %c42 : (index) -> !fir.shape<1>
+      %array:2 = hlfir.declare %arg(%shape) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+      %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> { // every element is the constant 1
+      ^bb0(%i: index):
+        hlfir.yield_element %c1_i32 : i32
+      }
+      hlfir.assign %elemental to %array#0 : !hlfir.expr<42xi32>, !fir.ref<!fir.array<42xi32>>
+      hlfir.destroy %elemental : !hlfir.expr<42xi32>
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: @should_not_parallelize_1
+// CHECK-NOT: omp.workshare.loop_wrapper
+func.func @should_not_parallelize_1(%arg: !fir.ref<!fir.array<42xi32>>, %idx : index) {
+  omp.workshare {
+    omp.critical { // elemental is nested in omp.critical inside the workshare; the test expects no omp.workshare.loop_wrapper
+      %c42 = arith.constant 42 : index
+      %c1_i32 = arith.constant 1 : i32
+      %shape = fir.shape %c42 : (index) -> !fir.shape<1>
+      %array:2 = hlfir.declare %arg(%shape) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+      %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> { // every element is the constant 1
+      ^bb0(%i: index):
+        hlfir.yield_element %c1_i32 : i32
+      }
+      hlfir.assign %elemental to %array#0 : !hlfir.expr<42xi32>, !fir.ref<!fir.array<42xi32>>
+      hlfir.destroy %elemental : !hlfir.expr<42xi32>
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: @should_not_parallelize_2
+// CHECK-NOT: omp.workshare.loop_wrapper
+func.func @should_not_parallelize_2(%arg: !fir.ref<!fir.array<42xi32>>, %idx : index) {
+  omp.workshare {
+    omp.parallel { // elemental is in a parallel region nested inside the workshare; the test expects no omp.workshare.loop_wrapper
+      %c42 = arith.constant 42 : index
+      %c1_i32 = arith.constant 1 : i32
+      %shape = fir.shape %c42 : (index) -> !fir.shape<1>
+      %array:2 = hlfir.declare %arg(%shape) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+      %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> { // every element is the constant 1
+      ^bb0(%i: index):
+        hlfir.yield_element %c1_i32 : i32
+      }
+      hlfir.assign %elemental to %array#0 : !hlfir.expr<42xi32>, !fir.ref<!fir.array<42xi32>>
+      hlfir.destroy %elemental : !hlfir.expr<42xi32>
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: @should_not_parallelize_3
+// CHECK-NOT: omp.workshare.loop_wrapper
+func.func @should_not_parallelize_3(%arg: !fir.ref<!fir.array<42xi32>>, %idx : index) {
+  omp.workshare { // nesting is workshare > parallel > workshare > parallel; the test expects no omp.workshare.loop_wrapper
+    omp.parallel {
+      omp.workshare {
+        omp.parallel { // the elemental's immediate parent is this parallel region, not a workshare
+          %c42 = arith.constant 42 : index
+          %c1_i32 = arith.constant 1 : i32
+          %shape = fir.shape %c42 : (index) -> !fir.shape<1>
+          %array:2 = hlfir.declare %arg(%shape) {uniq_name = "array"} : (!fir.ref<!fir.array<42xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<42xi32>>, !fir.ref<!fir.array<42xi32>>)
+          %elemental = hlfir.elemental %shape unordered : (!fir.shape<1>) -> !hlfir.expr<42xi32> {
+          ^bb0(%i: index):
+            hlfir.yield_element %c1_i32 : i32
+          }
+          hlfir.assign %elemental to %array#0 : !hlfir.expr<42xi32>, !fir.ref<!fir.array<42xi32>>
+          hlfir.destroy %elemental : !hlfir.expr<42xi32>
+          omp.terminator
+        }
+        omp.terminator
+      }
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}

>From 57a46ba7b1cec887d683ae97a1248ee710528c67 Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov <ivanov.i.aa at m.titech.ac.jp>
Date: Mon, 23 Sep 2024 12:56:11 +0900
Subject: [PATCH 4/4] Add integration test for workshare

---
 flang/test/Integration/OpenMP/workshare.f90 | 57 +++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 flang/test/Integration/OpenMP/workshare.f90

diff --git a/flang/test/Integration/OpenMP/workshare.f90 b/flang/test/Integration/OpenMP/workshare.f90
new file mode 100644
index 00000000000000..0c4524f8552906
--- /dev/null
+++ b/flang/test/Integration/OpenMP/workshare.f90
@@ -0,0 +1,57 @@
+!===----------------------------------------------------------------------===!
+! This directory can be used to add Integration tests involving multiple
+! stages of the compiler (for eg. from Fortran to LLVM IR). It should not
+! contain executable tests. We should only add tests here sparingly and only
+! if there is no other way to test. Repeat this message in each test that is
+! added to this directory and sub-directories.
+!===----------------------------------------------------------------------===!
+
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -O3 %s -o - | FileCheck %s --check-prefix HLFIR
+!RUN: %flang_fc1 -emit-fir -fopenmp -O3 %s -o - | FileCheck %s --check-prefix FIR
+
+subroutine sb1(a, x, y, z)
+  implicit none
+  ! One integer scalar and three assumed-shape integer arrays.
+  integer :: a
+  integer, dimension(:) :: x, y, z
+  !$omp parallel workshare
+  z = a * x + y
+  !$omp end parallel workshare
+end subroutine sb1
+
+! HLFIR:  func.func @_QPsb1
+! HLFIR:    omp.parallel {
+! HLFIR:      omp.workshare {
+! HLFIR:        hlfir.elemental {{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
+! HLFIR:        hlfir.elemental {{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
+! HLFIR:        hlfir.assign
+! HLFIR:        hlfir.destroy
+! HLFIR:        hlfir.destroy
+! HLFIR-NOT:    omp.barrier
+! HLFIR:        omp.terminator
+! HLFIR:      }
+! HLFIR-NOT:  omp.barrier
+! HLFIR:      omp.terminator
+! HLFIR:    }
+! HLFIR:    return
+! HLFIR:  }
+! HLFIR:}
+
+! Lowered-output checks below match SSA value names by regex, never by printer-assigned number.
+! FIR:  func.func private @_workshare_copy_heap_Uxi32(%{{[a-z0-9]+}}: !fir.ref<!fir.heap<!fir.array<?xi32>>>, %{{[a-z0-9]+}}: !fir.ref<!fir.heap<!fir.array<?xi32>>>
+! FIR:  func.func private @_workshare_copy_i32(%{{[a-z0-9]+}}: !fir.ref<i32>, %{{[a-z0-9]+}}: !fir.ref<i32>
+
+! FIR:  func.func @_QPsb1
+! FIR:    omp.parallel {
+! FIR:      omp.single copyprivate(%{{[a-z0-9]+}} -> @_workshare_copy_i32 : !fir.ref<i32>, %{{[a-z0-9]+}} -> @_workshare_copy_heap_Uxi32 : !fir.ref<!fir.heap<!fir.array<?xi32>>>) {
+! FIR:        fir.allocmem
+! FIR:      omp.wsloop {
+! FIR:        omp.loop_nest
+! FIR:      omp.single nowait {
+! FIR:        fir.call @_FortranAAssign
+! FIR:        fir.freemem
+! FIR:        omp.terminator
+! FIR:      }
+! FIR:      omp.barrier
+! FIR:      omp.terminator
+! FIR:    }