<table border="1" cellspacing="0" cellpadding="8">

    <tr>

        <th>Issue</th>

        <td>

            <a href=https://github.com/llvm/llvm-project/issues/59302>59302</a>

        </td>

    </tr>

    <tr>

        <th>Summary</th>

        <td>

            [MLIR] memref.alloca should use addrspace(5) when lowering gpu dialect to rocdl

        </td>

    </tr>

    <tr>

      <th>Labels</th>

      <td>

      </td>

    </tr>

    <tr>

      <th>Assignees</th>

      <td>

      </td>

    </tr>

    <tr>

      <th>Reporter</th>

      <td>

          Raphalex46

      </td>

    </tr>

</table>

<pre>

    Hi, 

When using `memref.alloca` in a gpu function like so:

```mlir

// File test.mlir

module attributes {gpu.container_module} {

 llvm.func @main() {

    %1 = arith.constant 1 : index

 gpu.launch_func @test_func::@test_func blocks in (%1, %1, %1) threads in (%1, %1, %1)

    llvm.return

  }

  gpu.module @test_func {

 gpu.func @test_func () kernel {

      %lb = arith.constant 0 : index

 %ub = arith.constant 1000 : index

      %buf = memref.alloca(): memref<1000xi8>

      %step = arith.constant 1 : index

      scf.for %iv = %lb to %ub step %step {

        %t = memref.load %buf[%iv] : memref<1000xi8>

        // Print the values in the allocated memref to be sure it isn't optimized out

        gpu.printf "%d\n" %t : i8

 }

      gpu.return

    }

  }

}

```

and lowering this code with the following command: 

```

mlir-opt -convert-scf-to-cf -convert-arith-to-llvm --pass-pipeline='gpu.module(convert-gpu-to-rocdl{runtime=HIP}, reconcile-unrealized-casts)' test.mlir -o test.rocdl.mlir

```

generates this code:

```mlir

// File test.rocdl.mlir

module attributes {gpu.container_module} {

  llvm.func @main() {

    %0 = llvm.mlir.constant(1 : index) : i64

    %1 = builtin.unrealized_conversion_cast %0 : i64 to index

    gpu.launch_func @test_func::@test_func blocks in (%1, %1, %1) threads in (%1, %1, %1) 

 llvm.return

  }

  gpu.module @test_func {

    llvm.mlir.global internal constant @printfFormat_0("%d\0A\00")

    llvm.func @__ockl_printf_append_string_n(i64, !llvm.ptr<i8>, i64, i32) -> i64

 llvm.func @__ockl_printf_append_args(i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64

    llvm.func @__ockl_printf_begin(i64) -> i64

 llvm.func @test_func() attributes {gpu.kernel, rocdl.kernel} {

      %0 = llvm.mlir.constant(1 : index) : i64

      %1 = llvm.mlir.constant(1000 : index) : i64

      %2 = llvm.mlir.constant(0 : index) : i64

      %3 = llvm.mlir.constant(1000 : index) : i64

      %4 = llvm.mlir.constant(1 : index) : i64

      %5 = llvm.mlir.null : !llvm.ptr<i8>

      %6 = llvm.getelementptr %5[%3] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>

      %7 = llvm.ptrtoint %6 : !llvm.ptr<i8> to i64

      %8 = llvm.alloca %7 x i8 : (i64) -> !llvm.ptr<i8>

      %9 = llvm.mlir.undef : !llvm.struct<(ptr<i8>, ptr<i8>, i64, array<1 x i64>, array<1 x i64>)>

      %10 = llvm.insertvalue %8, %9[0] : !llvm.struct<(ptr<i8>, ptr<i8>, i64, array<1 x i64>, array<1 x i64>)>

      %11 = llvm.insertvalue %8, %10[1] : !llvm.struct<(ptr<i8>, ptr<i8>, i64, array<1 x i64>, array<1 x i64>)>

 %12 = llvm.mlir.constant(0 : index) : i64

      %13 = llvm.insertvalue %12, %11[2] : !llvm.struct<(ptr<i8>, ptr<i8>, i64, array<1 x i64>, array<1 x i64>)>

      %14 = llvm.insertvalue %3, %13[3, 0] : !llvm.struct<(ptr<i8>, ptr<i8>, i64, array<1 x i64>, array<1 x i64>)>

      %15 = llvm.insertvalue %4, %14[4, 0] : !llvm.struct<(ptr<i8>, ptr<i8>, i64, array<1 x i64>, array<1 x i64>)>

      llvm.br ^bb1(%2 : i64)

    ^bb1(%16: i64):  // 2 preds: ^bb0, ^bb2

      %17 = llvm.icmp "slt" %16, %1 : i64

      llvm.cond_br %17, ^bb2, ^bb3

    ^bb2:  // pred: ^bb1

 %18 = llvm.getelementptr %8[%16] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>

 %19 = llvm.load %18 : !llvm.ptr<i8>

      %20 = llvm.mlir.constant(0 : i64) : i64

      %21 = llvm.call @__ockl_printf_begin(%20) : (i64) -> i64

      %22 = llvm.mlir.addressof @printfFormat_0 : !llvm.ptr<array<4 x i8>>

      %23 = llvm.mlir.constant(0 : i64) : i64

      %24 = llvm.getelementptr %22[%23, %23] : (!llvm.ptr<array<4 x i8>>, i64, i64) -> !llvm.ptr<i8>

      %25 = llvm.mlir.constant(4 : i64) : i64

      %26 = llvm.mlir.constant(1 : i32) : i32

      %27 = llvm.mlir.constant(0 : i32) : i32

      %28 = llvm.call @__ockl_printf_append_string_n(%21, %24, %25, %27) : (i64, !llvm.ptr<i8>, i64, i32) -> i64

      %29 = llvm.mlir.constant(1 : i32) : i32

      %30 = llvm.zext %19 : i8 to i64

      %31 = llvm.call @__ockl_printf_append_args(%28, %29, %30, %20, %20, %20, %20, %20, %20, %26) : (i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64

      %32 = llvm.add %16, %0  : i64

      llvm.br ^bb1(%32 : i64)

    ^bb3:  // pred: ^bb1

      llvm.return

    }

 }

}

```

The interesting part is the `llvm.alloca`, because for the AMDGPU backend, allocations like this need to be done using 

addrspace(5) (private memory).

Going further with LLVM 15.0.2, for example with this command (assuming the correct chip is `gfx906`): 

```

mlir-opt -gpu-to-hsaco='chip=gfx906' -gpu-to-llvm --reconcile-unrealized-casts | mlir-translate --mlir-to-llvmir -o test_issue.ll

```

results in a crash (which should be addressed by #59250).

Changing the `llvm.alloca` operation to use addrspace(5), for example like this:

```mlir

// replace this line:

%8 = llvm.alloca %7 x i8 : (i64) -> !llvm.ptr<i8>

// with this:

%buf = llvm.alloca %7 x i8 : (i64) -> !llvm.ptr<i8, 5>

%8 = llvm.addrspacecast %buf : !llvm.ptr<i8, 5> to !llvm.ptr<i8>

```

prevents the crash and everything works as expected.

Allocations like this should always be made in addrspace(5) for AMDGPU targets, so the `memref.alloca` operation should

probably be lowered to use this address space in this case.

For more detail, here's a small discussion linked to this issue: https://discourse.llvm.org/t/memref-alloca-in-amd-gpu-kernels-seem-to-lower-to-llvm-alloca-with-an-incorrect-address-space/66864

In this exchange, @krzysz00 suggests adding a `memref.addrspacecast` operation in order to make this change cleanly.

I hope I've been able to clearly describe the problem here. I'm afraid I'm not familiar enough with LLVM and MLIR (yet) to 

suggest a patch myself for now, but I'll do my best to help reach a satisfying solution.

Thank you for reading; let me know if you have any questions or remarks!

</pre>

<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzEWdtu4zjSfhrmpmBDpiwfLnKRw2SmgRlgMPj_3UuDksoWNxSpJakk7qdfFCnZkg9JpnuANBqxRLHOX1VRJeGc3GnEW5bds-zxRrS-Mvb2L9FUQuHbfHGTm3J_-5tk_AFY8siSu39XqKF1Uu-ALZIaa4vbqVDKFIItEpAaBOyaFratLrw0GpR8RnCGpXeRAVsk8X-tpO2W-BPjT_AkFYJH56eDR-FvbcpWIQjvrcxbjw7Y8n7XtNPCaC-kRruJW9jykR5FKlDqpZ6SIsDmSS2kZnzF-HqwAwAYz2bA0kcQVvqKODovtAdavAOpS3zrNpNAJVpdVJueKWkbbsi89G64ArkyxbMjjwSx2Sw4cfS7Bl9ZFOUHu47KBoss-tbqfpEtH_tLUrBz1UiTo72040x36NzyjFajGnsn-EfllxyUnDmI8ay9uHWWJOe7e-55uw00YywFlYgkLrP0gZi8yRVLfzlh4Dw2n4tg-OeK7XRrLFHKl0AXTfSmMyDy6_mOvREE-qG-yoiyM4OyiJiy7BE-oTrxCsD_00rtwVcIL0K1GNBAd9EVHsuOE6mYI7jWIkgP0mnGlx5M42Utv2MJpvVjARTvhrhvgXHOeFay7EEzznsz7kCu-ugdgdSTjpE2wtrh8ngxTu_hotAlKPOKlsqGr6SDwpQIr9JXwdCtUcq80sPC1LXQJSl2kR9VholpPEwKo1_Q-okrthNvJsX2uBRwQIuULjCZNMK5SSMbVFIjSx8ZXx5ThfFVT7drWqKypigVW97bVntZE8Fv3_4kM_kDWCyMLqTCSastCkVunxTCeUdw5ctj_YKJiTeB3bCkjS3aoUYrqKYdHPO3auUp-x-slZ8ulkkAf9hNQg_pxvhqmHBESTeL-YVSm7dSeamnRx9uYgycNHpD7uwlBQ6E-5M0_rJaDMPe8hOVuC_mwYc7ZXKhQGqPVgsFhxLG5klM3ydja-E3SVCsT-Tkjv4kYeG0SfRe2WxM8aw2kctGNA3qcuM8ZeKGQkwBCtbNAlnjLUsfYqniD9A9lSlJgAlLfxlE9GM5wu7cUUbgcuT5-Z-L0t83NMedPJj3vuZH3ETAn2dObIwh_UOydffj_Pn59BgkyGX6cRe9yoJfZ_Ep-vTnVZj_pBeyE3rdKhW2XsLpmHRxJN2hR4U1at_40O-z2KHTvkGHiF8Hfoebj2UujzIbb72hdt6pclHlUNDOjF4ducTOH1m_gVz12v49tdYnXmx1iduRTs7btvAsfWB8deKBy5VAWCv2dKIhtRbz7uGl1fW5PrNBckjt0Ppw3gm2dyV2zbL75BieL9Fy9qGWs4Rl97OvUZPE_2yGz9KrJs54b-OMZff8i0Mxv6pn2quZsuw-3Hw1bLKrus57Xecsu59_ta5BXm6BZb_k-SyedPgBKMODxHDHbDHYkd4d3l04NBZLF2yh7Umwla74qYMGVVIWNb1icad89zIyW_ROuoTZQFQYXW5yG3kNxPRX6YnmfKgnaXlQcjbIpdU7_WIV-8Vs8U81DGI2qMr9u-Ns9cnmxt85XiSDGF47GQxKWyGopV47NwVZPaOr56gD39OCJMrSonNme36CvWBpj9p56HVk9bnh7xxKPmP4_J0ocx7DzPuawq8fEC6qenJi_XR_5qfHnIFR808YtfjwlBVPzv3lCfnyQ5e-S776CEvnLxsBgr2T-6rIs_5ieQq4H3svOWh4ev75e_5JB7n2Hd_8IXnv6EB28QSXfphf4xej4Mbe-nV3kSb9yo9dLM69-I-_eUVrB1kvynJUxRO4WsZPWk_6Tu9JP6zgR7bXZ1UXRlWXhlT_V2F8A0fnpd5BI6wH6cKAii2SwbmcCPkD5FiI1iFsjQ2b7v54_PXP_4dcFM-oy9CO4wBPGu3iHDyMeDRi2Y3ySqOxn6bHSVlZWteIAhlfZSGQfNVY
-SI8Qo21sXvG19O491dDdNvW-gptHKb9_vu__oBZNk2mIeCkGb6JulGHYVsYMYUZG7EWzrV1HMohFMZaLDwUlWzIcLZIdtu3dbII9q4_MZPrBmiVE4WJczbixdLHjg9fHvZ0o7nr8zRgywcIrL0V2inywGQSFyL5cca2kc61OFXqon5_oWuVd_HbRGGFq8h0XwkPrjKtKikSXcfCEvI9MJ5ma54lR1_3nBolit5fZ6AA06AN4abwEjROw3kalAMoPjHys0F4h6E4yexp_rl3x07WASxDGf2g_sel8AfIBqJGWvee6qd_UdqFQ1HHJM7rr5kxRkBj8QW1d0CwD1APKKA7fEG79xXF9NXYZwfCAb41WHgsR7G_u5jLHYCEehV7RziqRYkBaaeJTGHvKoQXdofekSXO9FA6-5R2BFMU0pticpGrPckKI_VYSwhsQaEOxxBkxw8JlPLC4ciaJ2OhNhahRC9kmG1VaJHxpQMBrqbmVUpXtM7Fj3j6OQoK7EK6UXAq75uAkQAbIjCtdZSJL_XU2B3jT57xp2jbJNo2kXoi6jIUgjhJcxOHWIe0Jov6_O73ExgnQk-k7irUpDNy0jn4abFY9Y0m_v3W2Y1vRSX0DkNXmifP9vvefU8ScO1uh1RkvCGPgRhGYIjEcSCkBmNLtERWix4DUQQUCoVW-5Gbv0FlGoRvjC9fEHJEDSJXSPS03ao9lOgKK3MMOKDoKqy7WDwEwhrE1gpZdjfaeNiKWiopLKA27a4a1H6C9B-_f_uLUnGPPoy1TVe2O6NBQCN8UUG9d6i2AZjavIaO1voghYJvoCaQOU8MKlQNWBRFRegQXrrtnlLGGdWSZ6bjRir0M-xNG1hbFKXUO2Kv0EON8KzNK8ht2FGJFwSh9_DfltouJVegqQXlIuOzm_I2LdfpWtzg7WyxWK_Xq3TNb6rbcouZyOfpcrEo8mS5nGX5XCy3y_kW8yRZbG_kLU84n_GEzxZ8wdfTcpuu1mm2XGXzeVaslmyeYC2kOqD1JuD6NlunCb9RIkfl-i_j9jYgMm93js0TJZ13RzIvvQrf0Mnz9OowyuW-RlzqB_BaoT5-Gts1LZRSKOrC3sRp801r1e04z3bSV20-LUzN-BNp0f1MGmv-gwVlXLDEMf4UjPlfAAAA__-Z-xbG">