[llvm] 0775f21 - AMDGPU: Add a scheduler test to demonstrate an upcoming change

Nicolai Hähnle via llvm-commits llvm-commits@lists.llvm.org
Mon Jan 23 12:43:27 PST 2023


Author: Nicolai Hähnle
Date: 2023-01-23T21:43:06+01:00
New Revision: 0775f21b62a5fe6422c3d021685a779e647bf8d1

URL: https://github.com/llvm/llvm-project/commit/0775f21b62a5fe6422c3d021685a779e647bf8d1
DIFF: https://github.com/llvm/llvm-project/commit/0775f21b62a5fe6422c3d021685a779e647bf8d1.diff

LOG: AMDGPU: Add a scheduler test to demonstrate an upcoming change

Added: 
    llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll

Modified: 
    

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll
new file mode 100644
index 000000000000..5a081a75b6b9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-lds.ll
@@ -0,0 +1,144 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1102 -mattr=+wavefrontsize32 -misched=gcn-max-occupancy -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
+
+; Provide a long sequence of 32 vec4 load/store pairs that ought to be fully
+; overlapped for latency hiding. Doing so requires using (at least) 128 VGPRs,
+; which currently looks to the scheduler like an occupancy reduction, even
+; though it's not. TODO: Fix!
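+;
+; Rough accounting, assuming each loaded <4 x i32> stays live in VGPRs until
+; its matching ds_store: 32 copies x 4 VGPRs per value = 128 VGPRs.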
+
+; 6 kB of LDS, allows 10 workgroups
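+; (384 x 16 B = 6144 B; assuming a 64 KiB LDS budget, floor(65536 / 6144) = 10
+;  workgroups can be resident at once.)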
+@lds = internal addrspace(3) global [384 x <4 x i32>] undef
+
+define internal void @copy(ptr addrspace(1) %src, i32 %ofs) alwaysinline {
+  %src.gep = getelementptr <4 x i32>, ptr addrspace(1) %src, i32 %ofs
+  %ld = load <4 x i32>, ptr addrspace(1) %src.gep
+  %dst.gep = getelementptr <4 x i32>, ptr addrspace(3) @lds, i32 %ofs
+  store <4 x i32> %ld, ptr addrspace(3) %dst.gep
+  ret void
+}
+
+define amdgpu_cs void @test(ptr addrspace(1) %src) "amdgpu-flat-work-group-size"="32,32" {
+; CHECK-LABEL: test:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_clause 0xa
+; CHECK-NEXT:    global_load_b128 v[2:5], v[0:1], off
+; CHECK-NEXT:    global_load_b128 v[6:9], v[0:1], off offset:16
+; CHECK-NEXT:    global_load_b128 v[10:13], v[0:1], off offset:32
+; CHECK-NEXT:    global_load_b128 v[14:17], v[0:1], off offset:48
+; CHECK-NEXT:    global_load_b128 v[18:21], v[0:1], off offset:64
+; CHECK-NEXT:    global_load_b128 v[22:25], v[0:1], off offset:80
+; CHECK-NEXT:    global_load_b128 v[26:29], v[0:1], off offset:96
+; CHECK-NEXT:    global_load_b128 v[30:33], v[0:1], off offset:112
+; CHECK-NEXT:    global_load_b128 v[34:37], v[0:1], off offset:128
+; CHECK-NEXT:    global_load_b128 v[38:41], v[0:1], off offset:144
+; CHECK-NEXT:    global_load_b128 v[42:45], v[0:1], off offset:160
+; CHECK-NEXT:    v_mov_b32_e32 v86, 0
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    global_load_b128 v[46:49], v[0:1], off offset:176
+; CHECK-NEXT:    global_load_b128 v[50:53], v[0:1], off offset:240
+; CHECK-NEXT:    global_load_b128 v[54:57], v[0:1], off offset:224
+; CHECK-NEXT:    global_load_b128 v[58:61], v[0:1], off offset:208
+; CHECK-NEXT:    global_load_b128 v[62:65], v[0:1], off offset:192
+; CHECK-NEXT:    global_load_b128 v[66:69], v[0:1], off offset:304
+; CHECK-NEXT:    global_load_b128 v[70:73], v[0:1], off offset:288
+; CHECK-NEXT:    global_load_b128 v[74:77], v[0:1], off offset:272
+; CHECK-NEXT:    global_load_b128 v[78:81], v[0:1], off offset:256
+; CHECK-NEXT:    s_waitcnt vmcnt(19)
+; CHECK-NEXT:    ds_store_b128 v86, v[2:5]
+; CHECK-NEXT:    s_waitcnt vmcnt(18)
+; CHECK-NEXT:    ds_store_b128 v86, v[6:9] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(17)
+; CHECK-NEXT:    ds_store_b128 v86, v[10:13] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(16)
+; CHECK-NEXT:    ds_store_b128 v86, v[14:17] offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(15)
+; CHECK-NEXT:    ds_store_b128 v86, v[18:21] offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(14)
+; CHECK-NEXT:    ds_store_b128 v86, v[22:25] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(13)
+; CHECK-NEXT:    ds_store_b128 v86, v[26:29] offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(12)
+; CHECK-NEXT:    ds_store_b128 v86, v[30:33] offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(11)
+; CHECK-NEXT:    ds_store_b128 v86, v[34:37] offset:128
+; CHECK-NEXT:    s_waitcnt vmcnt(10)
+; CHECK-NEXT:    ds_store_b128 v86, v[38:41] offset:144
+; CHECK-NEXT:    s_waitcnt vmcnt(9)
+; CHECK-NEXT:    ds_store_b128 v86, v[42:45] offset:160
+; CHECK-NEXT:    s_clause 0xb
+; CHECK-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:368
+; CHECK-NEXT:    global_load_b128 v[6:9], v[0:1], off offset:352
+; CHECK-NEXT:    global_load_b128 v[10:13], v[0:1], off offset:336
+; CHECK-NEXT:    global_load_b128 v[14:17], v[0:1], off offset:320
+; CHECK-NEXT:    global_load_b128 v[18:21], v[0:1], off offset:432
+; CHECK-NEXT:    global_load_b128 v[22:25], v[0:1], off offset:416
+; CHECK-NEXT:    global_load_b128 v[26:29], v[0:1], off offset:400
+; CHECK-NEXT:    global_load_b128 v[30:33], v[0:1], off offset:384
+; CHECK-NEXT:    global_load_b128 v[34:37], v[0:1], off offset:464
+; CHECK-NEXT:    global_load_b128 v[38:41], v[0:1], off offset:448
+; CHECK-NEXT:    global_load_b128 v[42:45], v[0:1], off offset:480
+; CHECK-NEXT:    global_load_b128 v[82:85], v[0:1], off offset:496
+; CHECK-NEXT:    s_waitcnt vmcnt(20)
+; CHECK-NEXT:    ds_store_b128 v86, v[46:49] offset:176
+; CHECK-NEXT:    s_waitcnt vmcnt(16)
+; CHECK-NEXT:    ds_store_b128 v86, v[62:65] offset:192
+; CHECK-NEXT:    ds_store_b128 v86, v[58:61] offset:208
+; CHECK-NEXT:    ds_store_b128 v86, v[54:57] offset:224
+; CHECK-NEXT:    ds_store_b128 v86, v[50:53] offset:240
+; CHECK-NEXT:    s_waitcnt vmcnt(12)
+; CHECK-NEXT:    ds_store_b128 v86, v[78:81] offset:256
+; CHECK-NEXT:    ds_store_b128 v86, v[74:77] offset:272
+; CHECK-NEXT:    ds_store_b128 v86, v[70:73] offset:288
+; CHECK-NEXT:    ds_store_b128 v86, v[66:69] offset:304
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
+; CHECK-NEXT:    ds_store_b128 v86, v[14:17] offset:320
+; CHECK-NEXT:    ds_store_b128 v86, v[10:13] offset:336
+; CHECK-NEXT:    ds_store_b128 v86, v[6:9] offset:352
+; CHECK-NEXT:    ds_store_b128 v86, v[2:5] offset:368
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    ds_store_b128 v86, v[30:33] offset:384
+; CHECK-NEXT:    ds_store_b128 v86, v[26:29] offset:400
+; CHECK-NEXT:    ds_store_b128 v86, v[22:25] offset:416
+; CHECK-NEXT:    ds_store_b128 v86, v[18:21] offset:432
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    ds_store_b128 v86, v[38:41] offset:448
+; CHECK-NEXT:    ds_store_b128 v86, v[34:37] offset:464
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    ds_store_b128 v86, v[42:45] offset:480
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    ds_store_b128 v86, v[82:85] offset:496
+; CHECK-NEXT:    s_endpgm
+  call void @copy(ptr addrspace(1) %src, i32 0)
+  call void @copy(ptr addrspace(1) %src, i32 1)
+  call void @copy(ptr addrspace(1) %src, i32 2)
+  call void @copy(ptr addrspace(1) %src, i32 3)
+  call void @copy(ptr addrspace(1) %src, i32 4)
+  call void @copy(ptr addrspace(1) %src, i32 5)
+  call void @copy(ptr addrspace(1) %src, i32 6)
+  call void @copy(ptr addrspace(1) %src, i32 7)
+  call void @copy(ptr addrspace(1) %src, i32 8)
+  call void @copy(ptr addrspace(1) %src, i32 9)
+  call void @copy(ptr addrspace(1) %src, i32 10)
+  call void @copy(ptr addrspace(1) %src, i32 11)
+  call void @copy(ptr addrspace(1) %src, i32 12)
+  call void @copy(ptr addrspace(1) %src, i32 13)
+  call void @copy(ptr addrspace(1) %src, i32 14)
+  call void @copy(ptr addrspace(1) %src, i32 15)
+  call void @copy(ptr addrspace(1) %src, i32 16)
+  call void @copy(ptr addrspace(1) %src, i32 17)
+  call void @copy(ptr addrspace(1) %src, i32 18)
+  call void @copy(ptr addrspace(1) %src, i32 19)
+  call void @copy(ptr addrspace(1) %src, i32 20)
+  call void @copy(ptr addrspace(1) %src, i32 21)
+  call void @copy(ptr addrspace(1) %src, i32 22)
+  call void @copy(ptr addrspace(1) %src, i32 23)
+  call void @copy(ptr addrspace(1) %src, i32 24)
+  call void @copy(ptr addrspace(1) %src, i32 25)
+  call void @copy(ptr addrspace(1) %src, i32 26)
+  call void @copy(ptr addrspace(1) %src, i32 27)
+  call void @copy(ptr addrspace(1) %src, i32 28)
+  call void @copy(ptr addrspace(1) %src, i32 29)
+  call void @copy(ptr addrspace(1) %src, i32 30)
+  call void @copy(ptr addrspace(1) %src, i32 31)
+  ret void
+}


        

