[llvm] [mlir] [OpenMP][MLIR][OMPIRBuilder] Add a small optional constant alloca raise function pass to finalize, utilised in convertTarget (PR #78818)

Tue Jan 23 05:57:43 PST 2024

agozillon wrote:

> Could you share the HLFIR IR that needs this change?

Sure, here you go: 

```
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>, #dlti.dl_entry<"dlti.endianness", "little">>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu", omp.is_gpu = false, omp.is_target_device = false, omp.requires = #omp<clause_requires none>, omp.version = #omp.version<version = 11>} {
  func.func @_QQmain() attributes {fir.bindc_name = "main"} {
    %0 = fir.alloca !fir.box<!fir.heap<i32>> {bindc_name = "test", uniq_name = "_QFEtest"}
    %1 = fir.zero_bits !fir.heap<i32>
    %2 = fir.embox %1 : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
    fir.store %2 to %0 : !fir.ref<!fir.box<!fir.heap<i32>>>
    %3:2 = hlfir.declare %0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEtest"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
    %4 = fir.allocmem i32 {fir.must_be_heap = true, uniq_name = "_QFEtest.alloc"}
    %5 = fir.embox %4 : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
    fir.store %5 to %3#1 : !fir.ref<!fir.box<!fir.heap<i32>>>
    %6 = fir.load %3#1 : !fir.ref<!fir.box<!fir.heap<i32>>>
    %7 = fir.box_offset %3#1 base_addr : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> !fir.llvm_ptr<!fir.ref<i32>>
    %8 = fir.box_addr %6 : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
    %9 = omp.map_info var_ptr(%8 : !fir.heap<i32>, i32) var_ptr_ptr(%7 : !fir.llvm_ptr<!fir.ref<i32>>) map_clauses(tofrom) capture(ByRef) -> !fir.heap<i32> {name = "test"}
    %10 = omp.map_info var_ptr(%3#1 : !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.box<!fir.heap<i32>>) map_clauses(tofrom) capture(ByRef) members(%9 : !fir.heap<i32>) -> !fir.ref<!fir.box<!fir.heap<i32>>> {name = "test"}
    omp.target map_entries(%10 -> %arg0 : !fir.ref<!fir.box<!fir.heap<i32>>>) {
    ^bb0(%arg0: !fir.ref<!fir.box<!fir.heap<i32>>>):
      %20:2 = hlfir.declare %arg0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEtest"} : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> (!fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<!fir.box<!fir.heap<i32>>>)
      %c50_i32 = arith.constant 50 : i32
      hlfir.assign %c50_i32 to %20#0 realloc : i32, !fir.ref<!fir.box<!fir.heap<i32>>>
      omp.terminator
    }
    %c6_i32 = arith.constant 6 : i32
    %11 = fir.address_of(@_QQclXf0a9a187466a9dd5699bdae5c0b697a3) : !fir.ref<!fir.char<1,81>>
    %12 = fir.convert %11 : (!fir.ref<!fir.char<1,81>>) -> !fir.ref<i8>
    %c9_i32 = arith.constant 9 : i32
    %13 = fir.call @_FortranAioBeginExternalListOutput(%c6_i32, %12, %c9_i32) fastmath<contract> : (i32, !fir.ref<i8>, i32) -> !fir.ref<i8>
    %14 = fir.load %3#0 : !fir.ref<!fir.box<!fir.heap<i32>>>
    %15 = fir.box_addr %14 : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
    %16 = fir.load %15 : !fir.heap<i32>
    %c2_i32 = arith.constant 2 : i32
    %17 = arith.addi %16, %c2_i32 : i32
    %18 = fir.call @_FortranAioOutputInteger32(%13, %17) fastmath<contract> : (!fir.ref<i8>, i32) -> i1
    %19 = fir.call @_FortranAioEndIoStatement(%13) fastmath<contract> : (!fir.ref<i8>) -> i32
    return
  }
  fir.global linkonce @_QQclXf0a9a187466a9dd5699bdae5c0b697a3 constant : !fir.char<1,81> {
    %0 = fir.string_lit "/home/agozillo/git/flang-dev/work-dir/declare-target-map/single-value-alloca.f90\00"(81) : !fir.char<1,81>
    fir.has_value %0 : !fir.char<1,81>
  }
  func.func private @_FortranAioBeginExternalListOutput(i32, !fir.ref<i8>, i32) -> !fir.ref<i8> attributes {fir.io, fir.runtime}
  func.func private @_FortranAioOutputInteger32(!fir.ref<i8>, i32) -> i1 attributes {fir.io, fir.runtime}
  func.func private @_FortranAioEndIoStatement(!fir.ref<i8>) -> i32 attributes {fir.io, fir.runtime}
  fir.global @_QQEnvironmentDefaults constant : !fir.ref<tuple<i32, !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>> {
    %0 = fir.zero_bits !fir.ref<tuple<i32, !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>
    fir.has_value %0 : !fir.ref<tuple<i32, !fir.ref<!fir.array<0xtuple<!fir.ref<i8>, !fir.ref<i8>>>>>>
  }
}
```

The relevant location is inside the TargetOp; the rest is largely irrelevant for the device but is included for completeness. From my understanding, the HLFIR AssignOp will generate an allocation as part of its lowering (in this particular use case at least; there are different lowerings for it). That said, I wouldn't say the problem necessarily lies with the HLFIR operation; I think it is more a side effect of how we lower a target region to LLVM-IR showing up (maybe there's a way to make it more AMDGPU-runtime friendly, or a different runtime function for AMDGPU, but I'm doubtful that's the ideal solution). Even if we make sure to raise all AllocaOps to the top (this is currently done when we lower to the LLVM Dialect, but TargetOp doesn't generate new Blocks, so there is no isolated Entry Block as such), the problem would still persist: we inevitably embed the user code into a separate block in between some kernel entry code for the arguments, which will branch off either to a fail condition or to the user code block containing the allocations. A later pass (not sure which yet, unfortunately) will then try to do some magic and end up breaking the generated executable.

https://github.com/llvm/llvm-project/pull/78818