[llvm] 5fdc985 - AMDGPU/GlobalISel: Run the localizer pass

Mon Feb 17 07:38:24 PST 2020

Author: Matt Arsenault
Date: 2020-02-17T07:38:12-08:00
New Revision: 5fdc9851d06f46ed07cb4db2f29bca502211d127

URL: https://github.com/llvm/llvm-project/commit/5fdc9851d06f46ed07cb4db2f29bca502211d127
DIFF: https://github.com/llvm/llvm-project/commit/5fdc9851d06f46ed07cb4db2f29bca502211d127.diff

LOG: AMDGPU/GlobalISel: Run the localizer pass

While looking at the output on real sized programs, there is a lot of
extra SGPR spilling compared to the DAG path. This seems to largely be
from all constants being SGPRs in the entry block.

Added: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index d6c4ffe837d9..2151d379fb28 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -30,6 +30,7 @@
 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
 #include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/Localizer.h"
 #include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
 #include "llvm/CodeGen/MIRParser/MIParser.h"
 #include "llvm/CodeGen/Passes.h"
@@ -623,6 +624,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
   void addPreLegalizeMachineIR() override;
   bool addLegalizeMachineIR() override;
   bool addRegBankSelect() override;
+  void addPreGlobalInstructionSelect() override;
   bool addGlobalInstructionSelect() override;
   void addFastRegAlloc() override;
   void addOptimizedRegAlloc() override;
@@ -914,6 +916,12 @@ bool GCNPassConfig::addRegBankSelect() {
   return false;
 }
 
+void GCNPassConfig::addPreGlobalInstructionSelect() {
+  // FIXME: We should run this before legalizing globals, but for some reason
+  // this requires legalized and regbankselected.
+  addPass(new Localizer());
+}
+
 bool GCNPassConfig::addGlobalInstructionSelect() {
   addPass(new InstructionSelect());
   return false;

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
new file mode 100644
index 000000000000..117cae294cff
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -0,0 +1,206 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+
+; Test the localizer did something and we don't materialize all
+; constants in SGPRs in the entry block.
+
+define amdgpu_kernel void @localize_constants(i1 %cond) {
+; GFX9-LABEL: localize_constants:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_and_b32 s0, s0, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX9-NEXT:    s_cbranch_scc0 BB0_2
+; GFX9-NEXT:  ; %bb.1: ; %bb0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x1c8
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3e7
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3e8
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x1c7
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x5be6
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB0_2: ; %bb1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x5be6
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x1c7
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3e8
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x1c8
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3e7
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_endpgm
+entry:
+  br i1 %cond, label %bb0, label %bb1
+
+bb0:
+  store volatile i32 123, i32 addrspace(1)* undef
+  store volatile i32 456, i32 addrspace(1)* undef
+  store volatile i32 999, i32 addrspace(1)* undef
+  store volatile i32 1000, i32 addrspace(1)* undef
+  store volatile i32 455, i32 addrspace(1)* undef
+  store volatile i32 23526, i32 addrspace(1)* undef
+  br label %bb2
+
+bb1:
+  store volatile i32 23526, i32 addrspace(1)* undef
+  store volatile i32 455, i32 addrspace(1)* undef
+  store volatile i32 1000, i32 addrspace(1)* undef
+  store volatile i32 456, i32 addrspace(1)* undef
+  store volatile i32 999, i32 addrspace(1)* undef
+  store volatile i32 123, i32 addrspace(1)* undef
+  br label %bb2
+
+bb2:
+  ret void
+}
+
+; FIXME: These aren't localized because these were legalized before
+; the localizer, and are no longer G_GLOBAL_VALUE.
+@gv0 = addrspace(1) global i32 undef, align 4
+@gv1 = addrspace(1) global i32 undef, align 4
+@gv2 = addrspace(1) global i32 undef, align 4
+@gv3 = addrspace(1) global i32 undef, align 4
+
+define amdgpu_kernel void @localize_globals(i1 %cond) {
+; GFX9-LABEL: localize_globals:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX9-NEXT:    s_getpc_b64 s[2:3]
+; GFX9-NEXT:    s_add_u32 s2, s2, gv2@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s3, s3, gv2@gotpcrel32@hi+4
+; GFX9-NEXT:    s_getpc_b64 s[0:1]
+; GFX9-NEXT:    s_add_u32 s0, s0, gv3@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s1, s1, gv3@gotpcrel32@hi+4
+; GFX9-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-NEXT:    s_add_u32 s8, s8, gv0@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s9, s9, gv0@gotpcrel32@hi+4
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, gv1@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, gv1@gotpcrel32@hi+4
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX9-NEXT:    s_cbranch_scc0 BB1_2
+; GFX9-NEXT:  ; %bb.1: ; %bb0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    s_branch BB1_3
+; GFX9-NEXT:  BB1_2: ; %bb1
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:  BB1_3: ; %bb2
+; GFX9-NEXT:    global_store_dword v[0:1], v4, off
+; GFX9-NEXT:    global_store_dword v[2:3], v5, off
+; GFX9-NEXT:    s_endpgm
+entry:
+  br i1 %cond, label %bb0, label %bb1
+
+bb0:
+  store volatile i32 0, i32 addrspace(1)* @gv0
+  store volatile i32 1, i32 addrspace(1)* @gv1
+  br label %bb2
+
+bb1:
+  store volatile i32 0, i32 addrspace(1)* @gv2
+  store volatile i32 1, i32 addrspace(1)* @gv3
+  br label %bb2
+
+bb2:
+  ret void
+}
+
+@static.gv0 = internal addrspace(1) global i32 undef, align 4
+@static.gv1 = internal addrspace(1) global i32 undef, align 4
+@static.gv2 = internal addrspace(1) global i32 undef, align 4
+@static.gv3 = internal addrspace(1) global i32 undef, align 4
+
+define void @localize_internal_globals(i1 %cond) {
+; GFX9-LABEL: localize_internal_globals:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_getpc_b64 s[10:11]
+; GFX9-NEXT:    s_add_u32 s10, s10, static.gv2@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s11, s11, static.gv2@rel32@hi+4
+; GFX9-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-NEXT:    s_add_u32 s8, s8, static.gv3@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s9, s9, static.gv3@rel32@hi+4
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, static.gv0@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, static.gv0@rel32@hi+4
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, static.gv1@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, static.gv1@rel32@hi+4
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[12:13], 0, 1
+; GFX9-NEXT:    s_xor_b64 s[12:13], vcc, s[12:13]
+; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], s[12:13]
+; GFX9-NEXT:    s_xor_b64 s[12:13], exec, s[14:15]
+; GFX9-NEXT:    s_cbranch_execnz BB2_2
+; GFX9-NEXT:  ; %bb.1: ; %bb1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s10
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s11
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v2, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s9
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:  BB2_2: ; %Flow
+; GFX9-NEXT:    s_or_saveexec_b64 s[8:9], s[12:13]
+; GFX9-NEXT:    s_xor_b64 exec, exec, s[8:9]
+; GFX9-NEXT:    s_cbranch_execz BB2_4
+; GFX9-NEXT:  ; %bb.3: ; %bb0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s6
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s7
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:  BB2_4: ; %bb2
+; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  br i1 %cond, label %bb0, label %bb1
+
+bb0:
+  store volatile i32 0, i32 addrspace(1)* @static.gv0
+  store volatile i32 1, i32 addrspace(1)* @static.gv1
+  br label %bb2
+
+bb1:
+  store volatile i32 0, i32 addrspace(1)* @static.gv2
+  store volatile i32 1, i32 addrspace(1)* @static.gv3
+  br label %bb2
+
+bb2:
+  ret void
+}