<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href="https://github.com/llvm/llvm-project/issues/90421">90421</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            [NVPTX] LLVM ERROR: Cannot select: intrinsic %llvm.nvvm.shfl.sync.bfly.i32 on targets between SM30 and SM70
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
            new issue
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          sasha0552
      </td>
    </tr>
</table>

<pre>
```
LLVM ERROR: Cannot select: intrinsic %llvm.nvvm.shfl.sync.bfly.i32
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
Stack dump:
0.      Program arguments: llvm-5e5a22ca-ubuntu-x64/bin/llc sum.ll -march=nvptx64 -mcpu sm_60 -mtriple=nvptx64-nvidia-cuda -o sum60.ptx
1.      Running pass 'Function Pass Manager' on module 'sum.ll'.
2.      Running pass 'NVPTX DAG->DAG Pattern Instruction Selection' on function '@test_sum_kernel_'
 #0 0x000000000201c9c7 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x201c9c7)
 #1 0x000000000201a47e llvm::sys::RunSignalHandlers() (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x201a47e)
 #2 0x000000000201d09f SignalHandler(int) Signals.cpp:0:0
 #3 0x00007fc72aa94420 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x14420)
 #4 0x00007fc72a55700b raise /build/glibc-wuryBv/glibc-2.31/signal/../sysdeps/unix/sysv/linux/raise.c:51:1
 #5 0x00007fc72a536859 abort /build/glibc-wuryBv/glibc-2.31/stdlib/abort.c:81:7
 #6 0x0000000001f9d190 llvm::report_fatal_error(llvm::Twine const&, bool) (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x1f9d190)
 #7 0x0000000001e360dd llvm::SelectionDAGISel::CannotYetSelect(llvm::SDNode*) (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x1e360dd)
 #8 0x0000000001e351ab llvm::SelectionDAGISel::SelectCodeCommon(llvm::SDNode*, unsigned char const*, unsigned int) (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x1e351ab)
 #9 0x0000000001e29b15 llvm::SelectionDAGISel::DoInstructionSelection() (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x1e29b15)
#10 0x0000000001e28a05 llvm::SelectionDAGISel::CodeGenAndEmitDAG() (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x1e28a05)
#11 0x0000000001e26446 llvm::SelectionDAGISel::SelectAllBasicBlocks(llvm::Function const&) (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x1e26446)
#12 0x0000000001e22cff llvm::SelectionDAGISel::runOnMachineFunction(llvm::MachineFunction&) (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x1e22cff)
#13 0x0000000001497ebf llvm::MachineFunctionPass::runOnFunction(llvm::Function&) (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x1497ebf)
#14 0x000000000198e5af llvm::FPPassManager::runOnFunction(llvm::Function&) (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x198e5af)
#15 0x00000000019950f1 llvm::FPPassManager::runOnModule(llvm::Module&) (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x19950f1)
#16 0x000000000198ecb1 llvm::legacy::PassManagerImpl::run(llvm::Module&) (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x198ecb1)
#17 0x00000000006eac2e compileModule(char**, llvm::LLVMContext&) llc.cpp:0:0
#18 0x00000000006e8a6d main (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x6e8a6d)
#19 0x00007fc72a538083 __libc_start_main /build/glibc-wuryBv/glibc-2.31/csu/../csu/libc-start.c:342:3
#20 0x00000000006e7b2e _start (llvm-5e5a22ca-ubuntu-x64/bin/llc+0x6e7b2e)
Aborted (core dumped)
```

<details>
<summary>sum.ll</summary>

```
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"

; Format string handed to @vprintf by @test_sum_kernel_: 32 bytes, ending in
; a newline (\0A) followed by a NUL terminator (\00). Its six conversions
; (%u, %u, %u, %1u, %s, %f) line up with the six fields of the argument
; struct the kernel builds on the stack.
@printfFormat_0 = internal constant [32 x i8] c"pid (%u, %u, %u) idx (%1u)%s%f\0A\00"
; Prefix text supplied for the %s conversion. The array is exactly the six
; characters of " sum: " and, unlike @printfFormat_0, carries no trailing \00.
; NOTE(review): confirm whether the missing terminator is intentional; it is
; unrelated to the instruction-selection failure this module reproduces.
@printfPrefix_0 = internal constant [6 x i8] c" sum: "
; Externally defined, zero-length byte array in address space 3. The kernel
; indexes it as an array of float.
; NOTE(review): addrspace(3) is presumably NVPTX shared memory (the name
; suggests so) -- confirm against the NVPTX backend documentation.
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]

; Function Attrs: nofree nounwind
; Declaration only: takes a format-string pointer and a pointer to a packed
; argument buffer, returns i32. No definition is present in this module.
declare noundef i32 @vprintf(ptr nocapture noundef readonly, ptr noundef) local_unnamed_addr #0

; Reproducer kernel: no parameters, returns void. It calls
; @llvm.nvvm.shfl.sync.bfly.i32, which is the intrinsic named in the
; "Cannot select" error reported for this module, then prints a value via
; @vprintf. Do not simplify this body without re-checking that the crash
; still reproduces.
define void @test_sum_kernel_() local_unnamed_addr !dbg !7 {
  ; %1 = tid.x; %3 = bit 1 of tid.x, %4 = bit 0 of tid.x.
  %1 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
  %2 = lshr i32 %1, 1, !dbg !10
  %3 = and i32 %2, 1, !dbg !10
  %4 = and i32 %1, 1, !dbg !10
  ; The call that fails instruction selection. All four operands are
  ; constants (-1, 0, 1, 31).
  ; NOTE(review): operand order is presumably (membermask, value, lane mask,
  ; clamp) as for PTX shfl.sync -- confirm against the NVPTX/PTX docs.
 %5 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 0, i32 1, i32 31), !dbg !11
  ; Reinterpret the shuffled bits as float and add 0.0.
  %6 = bitcast i32 %5 to float, !dbg !11
  %7 = fadd float %6, 0.000000e+00, !dbg !15
  ; Store the result as a <1 x float> at float index %3 of @global_smem.
  %8 = zext nneg i32 %3 to i64, !dbg !10
  %9 = getelementptr float, ptr addrspace(3) @global_smem, i64 %8, !dbg !10
  %10 = insertelement <1 x float> undef, float %7, i64 0, !dbg !10
  store <1 x float> %10, ptr addrspace(3) %9, align 4, !dbg !10
  ; Load a float back from float index %4 of the same array.
 %11 = zext nneg i32 %4 to i64, !dbg !10
  %12 = getelementptr float, ptr addrspace(3) @global_smem, i64 %11, !dbg !10
  %13 = load float, ptr addrspace(3) %12, align 4, !dbg !10
  ; Read %ctaid.x/.y/.z through inline PTX asm rather than intrinsics.
  %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !10
  %15 = tail call i32 asm "mov.u32 $0, %ctaid.y;", "=r"() #4, !dbg !10
  %16 = tail call i32 asm "mov.u32 $0, %ctaid.z;", "=r"() #4, !dbg !10
  ; Widen the loaded value to double for the %f conversion.
  %17 = fpext float %13 to double
  ; Build the vprintf argument buffer on the stack. Field order matches the
  ; conversions in @printfFormat_0: ctaid.x, ctaid.y, ctaid.z, %4 (low bit of
  ; tid.x), pointer to @printfPrefix_0, and the double %17.
  %18 = alloca { i32, i32, i32, i32, ptr, double }, align 8
 store i32 %14, ptr %18, align 8
  %19 = getelementptr inbounds { i32, i32, i32, i32, ptr, double }, ptr %18, i64 0, i32 1
  store i32 %15, ptr %19, align 4
  %20 = getelementptr inbounds { i32, i32, i32, i32, ptr, double }, ptr %18, i64 0, i32 2
  store i32 %16, ptr %20, align 8
  %21 = getelementptr inbounds { i32, i32, i32, i32, ptr, double }, ptr %18, i64 0, i32 3
  store i32 %4, ptr %21, align 4
  %22 = getelementptr inbounds { i32, i32, i32, i32, ptr, double }, ptr %18, i64 0, i32 4
  store ptr @printfPrefix_0, ptr %22, align 8
  %23 = getelementptr inbounds { i32, i32, i32, i32, ptr, double }, ptr %18, i64 0, i32 5
  store double %17, ptr %23, align 8
  ; The i32 result of vprintf (%24) is not used.
  %24 = call i32 @vprintf(ptr nonnull @printfFormat_0, ptr nonnull %18)
  ret void, !dbg !19
}

; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
; Intrinsic the kernel calls to obtain tid.x; takes no operands, returns i32.
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1

; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
; The intrinsic named in the "Cannot select" error: four i32 operands, i32
; result.
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2

; Attribute groups. #0 is attached to @vprintf, #1 to the tid.x intrinsic,
; #2 to the shfl.sync.bfly intrinsic, and #4 to the three inline-asm calls in
; @test_sum_kernel_. Nothing in this module as shown references #3.
attributes #0 = { nofree nounwind }
attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
attributes #3 = { convergent nocallback nounwind }
attributes #4 = { nounwind }

; Named metadata. Note that !nvvm.annotations lists !4 and !5 twice each.
!llvm.module.flags = !{!0, !1}
!llvm.dbg.cu = !{!2}
!nvvm.annotations = !{!4, !5, !5, !4}
!llvm.ident = !{!6}

; Module flags.
!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
; Compile unit: the producer string is "triton" and the source file is
; /tmp/trittest.py.
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!3 = !DIFile(filename: "trittest.py", directory: "/tmp")
; NVVM annotations on @test_sum_kernel_: "kernel" = 1 and "maxntidx" = 128.
!4 = !{ptr @test_sum_kernel_, !"kernel", i32 1}
!5 = !{ptr @test_sum_kernel_, !"maxntidx", i32 128}
; The ident string names clang 3.8.0 even though !2 gives "triton" as the
; producer.
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
!7 = distinct !DISubprogram(name: "test_sum_kernel_", linkageName: "test_sum_kernel_", scope: !3, file: !3, line: 13, type: !8, scopeLine: 13, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
!9 = !{}
; Debug locations. !11 (attached to the shfl.sync.bfly call) and !15 point
; into triton/language/standard.py and are marked as inlined into
; trittest.py line 15.
!10 = !DILocation(line: 16, column: 28, scope: !7)
!11 = !DILocation(line: 243, column: 36, scope: !12, inlinedAt: !14)
!12 = distinct !DILexicalBlockFile(scope: !7, file: !13, discriminator: 0)
!13 = !DIFile(filename: "standard.py", directory: "/mnt/ml/vllm/venv/lib/python3.11/site-packages/triton/language")
!14 = !DILocation(line: 15, column: 18, scope: !12)
!15 = !DILocation(line: 233, column: 15, scope: !16, inlinedAt: !17)
!16 = distinct !DILexicalBlockFile(scope: !12, file: !13, discriminator: 0)
!17 = !DILocation(line: 243, column: 36, scope: !16, inlinedAt: !18)
!18 = !DILocation(line: 15, column: 18, scope: !16)
!19 = !DILocation(line: 16, column: 4, scope: !7)
```
</details>

[LLVM build](https://tritonlang.blob.core.windows.net/llvm-builds/llvm-5e5a22ca-ubuntu-x64.tar.gz) @ https://github.com/llvm/llvm-project/tree/5e5a22caf88ac1ccfa8dc5720295fdeba0ad9372

<details>
<summary>LLVM version</summary>

```
LLVM (http://llvm.org/):
  LLVM version 18.0.0git
  Optimized build with assertions.
  Default target: x86_64-unknown-linux-gnu
  Host CPU: znver3

  Registered Targets:
    amdgcn  - AMD GCN GPUs
    nvptx   - NVIDIA PTX 32-bit
    nvptx64 - NVIDIA PTX 64-bit
    r600    - AMD GPUs HD2XXX-HD6XXX
    x86     - 32-bit X86: Pentium-Pro and above
    x86-64  - 64-bit X86: EM64T and AMD64
```
</details>

It compiles successfully when I specify:
1. SM30
2. SM70+

https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync
According to the PTX documentation linked above, `shfl.sync` should be supported on SM30 and later.

Related: vllm-project/vllm#4438
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJy8el1z2yjU_6dRbhhrJPRi-SIXjh13M_-2m2m6-89z5UGAbJ4i0ABKnH76ZwBJlhznrd3dTGtbCM75cd44B0Bas52g9DLIroJsfYFas5fqUiO9R1GWwYtSkqfLII-6f9E6iJafP__9BVx_-_bntyBZghUSQhqgKafY2AYmjGJCMwwCmHH-UIfi4aEO9b7ioX4SOCwr_hSyBHpqt5-vl3fXQLdlzQxAoGx3QNFGKgOMBHtjGh0kywBuArjZMbNvyxDLOoAbS7r7mjVK_q9lDzdM65bqAG4AEgQwgXlLKDB7CrBCeg9KhH8YhTANPfs7g_APQNq6sVxcUxQC93er5E6hGiC1a2sqjMUBHLuMZghCjGZt2QrTzg55GsBNyYTDg4Fu65BzMKuRwvsgWYuHxhzyFMxq3LRA19s8ArPaKNZwenw9Ew-MMDTDLUFgJi2VPAobc_Cw4g7Wt1YIJnagQVqDAM43rcCGSQFubcMXJNCOqgDOgRSglqTl1PbykAI47-YNX6D29e_b7_dgvfw0C5Lr9fITuEXGUCXAjdBGtZ7TnVM2k6JjU_UQAjgP0shQbba6rbc_qBKUb22rYwoCmEQgOkT9H4xivMBzJ1Ur_2Spn7T_cauYME473626AlgcOyn0uJXaKIrqAOYBXFmjC-ACdL3eUlAAr6JDxzuAiyO4-AQcSuf0HLhvrbhjO4H4H0gQTpUOYPEL_C35CX94wp9EiwpMOAWw6Obqm3WIG2u6kfs_EEo6QvMKzyFCizSFEdhuFdVGKrpVBjjEG87KAG4ORb7N0xlnoj3MdqL1LxqzVxSRUMswcohjS2WCN52wybJ5FJVAIaatzW3KlnFi3ZazEs8eW_V09TA8wjCJA7jRbhYB3IShfXrShDbWfVvBDr7hwaERrX10pEMcJMssDpJlfESSTZEkeZEtACqlm-n7kBjipeFGOSaFZTKy3XysnrhakHgRjczDh61thQziW6qUVBOr_f7IBAVYCm06qy2l5B81m47tRA3zCS6a5BEhI1yDu66Xn27uKPetPnD_DzX-9QTq3fqrJDSAyw-j88wn6IoTdFmMyjfR-daVJHQl69pGmvPwVqAVbg0jAO-R6sU7ffML0aHDOZnIYjoRuCjj7M2JrOUoco4C54fjRcdwQGTDVXSCqEDR24isUD9RsRTkumZmvfz0a2AsrwmY-ARMnqb5O_W85PwKaYavuMQ_9ETXw_J2dJwPQ7VIJlDhCVSIq-pNqKoVf4ovCO-ZoD2qCdRn734Fq4UywZpMsKaLOS3HWE-Y2iRgBPcszt8B6PlPAKYTgIuCZmgMcHNrMfV5yb8IzXOeQMum0BZZVMXvgPbFpU1T3XZNvwDLcZ3Ayk8lhssxLE53CD91adAR4E3dHC3xnwLneE_ATRaTKKcIQ7tq1Q3jdBCMjbUuyrpAe0Ria4OVFIYeelflHD9PUSyf4oRPgXICasTEh2bgx00msDhJBYqoSMB2a9f6rTZImW3H5V1pAdZtn534n-6lo-NyhCSF9nNgD6OTec1LSIFn_MGZ2ZHDzJY2K6HEksBSUVey0NHEpzVa95msCDWIcR0k10OTbusaqacgue6KgmRl05-hdULhhGxyBbwN3KxBkKxtwm9VvmbIRszePLqUSctWYbqtGKcC1bQbAM8MgBOeadTY5L_aSFUjs43cQCZsGYK4XweQMCDIrhIIDoAVQbYGOICwYcTntVlrzXL6vQCMHLrXsX0OYKYDmFVBtoqW9iM6Aukh3CpascNrEPIJAlu02TJxTGnHZYn4Vte0dmTooSPDJUZ82worHLJFhChgP3Tj653EYvaDLaOoZzTV8BUYVsilMcoVqUJWilIgZCsemSC-K6GYI-VbCa0ASyAI0ujBTzSARWMUEBKjxrSjbrYEkII_WUH6Hq7dufZz_LbAG-MjtLJJ
74O0mjlXGRYvEopJubNfcxDMr7okzKoydkK0Vg0w4ryfx3GnwRUtjTmEWtFdaBgJD56Pt4WebhyNiEJHlOu98vRgFtve8StDEjfE7TL4EfCtEenpiFd5BDDL3prqmU0VWNheM0fV_or6H0NLEj-XRTyCmTuuJTMYadNDzYCRoOISmVdGzt3IChHiuzpqtn8U-mhIbWSLTihkIwqFo_CTHgwQgu567onlzmyQfFG4CzdyRw3ltKbCWFsd8NqHZ641dU0nnTx1IF5hE_ehQFPVswJBsorBoWOXXIPOQ1ZHKcx78qeTH2i7wvwZJcfx5RnAzCkScbYT4CXpWBrxebmmb8s1hv-cYOPXvCP2DsUlIm_Rt6jenrfrl55xIaRrG6Br-RC2Tg5pp5UMG-TCRXJlA7hrg0GyVu6pK5KSV_mdc9m3-D39Br_8F_j9_A1-nYs31pIG646dfxLZlpyOO3tvRtzGdxvFgQtQq_NfjVH2y1MBwXx91HDREfU-0gfPtDcQx-p5b9d-LiwwUdpFTP8apAnHwaV9hJ34co8zG4-a-Oto_Yn-Q5zwPM58NApG5-UJ4_8QZ3IW51jtMH5BnOeC1r8FM53CdB1P88cxZviCaJP_EHM2xdwPsw4-hpq8ANVH1XFOcppGCtFyDp6l8scssuvg0fWbbEBR45LFkxDUFznzdyS_datNo-ROUa1dPst5ifCPY1Js86UhNwa6objlyCArgEfGuaKmVQLUtJbqKYCFkOJYhb2QRL8j-XTHC2-jx1I8ULWzWcUEe4d2QMUEwphqzUprLK7msKwfFTPP0b47b3zBvBz4SY2GjFGsbA3V_kzHlXbzq9PKAwwamw6IhwH_krZeYgwHxv-QoF9ilLyX0Qvj05FIT3v2ew2xU6g_4wsrjna6K7FjWzHBuM814-PIbgwpdyFup73huJezEndAgKx9nhDuM4Ts5Dt9xogRnx-PBufP5xGNevgFytMLIFzTst2BG1FJ8DdV2m0NwuMCMeIXnxBJj0TsbGaKVpxiM6vMzxGJiWy8dRCmDRPYZjbx-mbld77-EswdkSCxa9GOWjNY___t5-XXT9uVy_QZp77wj13QbJQkLXa7ijbJMoqZI3T9Z2NYzX5SYl8b1VLbrFphWE37aSZLH65pzbRt-H-2kk-WYNNy7qQy2vTqcmcHeMPcDl2_6zICYKvvsHnqUBCmKDbWyLv9CrgxdeN-HOmmI6F2C9vzGr4Xs294SbjZR0jV6CAMI4cxMViMyeVTqwogxFY54MGLDyRhEUYggIVBOx3Azbdrd9FgmxRRADcVc6eOC0t_RHR-xgDu2rLxNwFseBlJ9Bl4B5Uz8QPt6Nc3O2osm4nJnJoQZ8I9-yfzNPQuhuGfJ110s7FBwNnmzd2tfVjTignmz-bnq6H5aH_umIyZjvJE98XRpu7aUsnWMEG_WxiwwLjzgNVqK-zaznuMuiO1GJNajJU19tnoyOOzxKg_D-in5VJSLHlbO3-Axang5mMuxxBwlhpMkym5JD8l58tKJuwQsuylYuPdiMu5KPGZHhhG3J1fdQ54AnOiX68vwjRWrGYCGak6fx_xedOntUGCIEVe9elaGPvJA7h54Ly2X1Q8DEf_zZPZS5GEsT-MN3TWIGwN2PpMH7U2Q9ibRodjeDivvmwq7_iZ-qYGdwwR5_WXnOjP05_Qy8_qb2ol-cf15-3iowqc_7Y5np9OMeFS_K4O8gm5xYc8Mn3FIU-PDlYB3Dw7jvCf2ZW7VObPY7J1AIvpxS9viNYKw5LLMsRS0dAmRfJRh4Ka_h6YI6D7pzOnLKFBKtz97HapPn69zChKA7jpKVdFgXCMcYUKgrM5jOAiqwgtUYTIIpnDjxzGOAk89Iv_u49k3LBOYMNMXO4l1c49LYabbQCMmYC4CKMw2jHTvx2WBa8J8MjMHiCtqXJJYNj3W9MKtdwAg9SOOrPsbhC14oeQj2J0k6gb8YfUBqxu_7J9f9p8OBlPCYBvdMe0oYoS
8N0R1SPQAKCa7LAAYAaWX9bg0-or-HT7lz6-dzfogH3_9e-b9c0S3H6_BwmclcPchit44x55OuoBAFB5FAEwsLn9S4M_1vD-_n72xzq_v7_veh6KHPhungW4L3I7sVsqDGvr2a2SbpcflfKBHqkfinyWp3aY59sPu_6Sp9_dgOWXdZ5-2HtuTH88q4FuXbFStZw_gcc9FeDG1UysehoEGofg7kt_TAPtwzwK4NWY5NQviMQ69JcTO-fALUF28UAKcU75zN8Um9EDxa2PGBsmCD2Ee2NXnoQgg2a1fHD7GjMkyMxXRdYMZ-x4P0bPbGU6szXfMDe9ly0noKRAt03jD0GdYZo99ROZYv9GOTI-s7YL3sh5_fqXpGlSXJDLhCySBbqgl_E8TpM0SWN4sb_EKKpyRBDOKM5RXCWLKkeQZlkRpwjl-IJdwgimUQoLCJMohmGakBxFdBFntCLzBQnSiNaI8bB3wgt3J_VyEaUwvuCopFy7-7YQCvoI3Eu7rmbrC3XZRbGdtuU600YfqRhmuLuo665oBtka_FMXcYEUnStrUFLzSKlwgnU2aa3jolX88tev4rqZ_18AAAD__07gv4c">