[llvm] [amdgpu] Add llvm.amdgcn.init.whole.wave intrinsic (PR #105822)

Mon Sep 9 08:57:31 PDT 2024

================
@@ -0,0 +1,1127 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL12 %s
+; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL12 %s
+; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL10 %s
+; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL10 %s
+
+define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 inreg %exec, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %x, i32 %y) {
+; GISEL12-LABEL: basic:
+; GISEL12:       ; %bb.0: ; %entry
+; GISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GISEL12-NEXT:    s_wait_expcnt 0x0
+; GISEL12-NEXT:    s_wait_samplecnt 0x0
+; GISEL12-NEXT:    s_wait_bvhcnt 0x0
+; GISEL12-NEXT:    s_wait_kmcnt 0x0
+; GISEL12-NEXT:    s_or_saveexec_b32 s8, -1
+; GISEL12-NEXT:    s_mov_b32 s6, s3
+; GISEL12-NEXT:    s_mov_b32 s7, s4
+; GISEL12-NEXT:    s_wait_alu 0xfffe
+; GISEL12-NEXT:    s_and_saveexec_b32 s3, s8
+; GISEL12-NEXT:  ; %bb.1: ; %shader
+; GISEL12-NEXT:    v_add_nc_u32_e32 v12, 42, v12
+; GISEL12-NEXT:    v_add_nc_u32_e32 v8, 5, v8
+; GISEL12-NEXT:  ; %bb.2: ; %tail
+; GISEL12-NEXT:    s_wait_alu 0xfffe
+; GISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
----------------
nhaehnle wrote:

We could just use `s_mov_b32 exec_lo, -1` here. This is not particularly high priority and, if anything, it should happen as part of a more general micro-optimization that detects the equivalent condition in compute shaders with a known workgroup size.

https://github.com/llvm/llvm-project/pull/105822