[llvm] 21bc8e5 - AMDGPU: Make VReg_1 only include 1 artificial register

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 28 20:52:02 PDT 2019


Author: Matt Arsenault
Date: 2019-10-28T20:51:51-07:00
New Revision: 21bc8e5a137d76879223ac2d8ff1ba92e2ea3acb

URL: https://github.com/llvm/llvm-project/commit/21bc8e5a137d76879223ac2d8ff1ba92e2ea3acb
DIFF: https://github.com/llvm/llvm-project/commit/21bc8e5a137d76879223ac2d8ff1ba92e2ea3acb.diff

LOG: AMDGPU: Make VReg_1 only include 1 artificial register

When TableGen is inferring register classes from contexts, it uses a
sorting function based on the number of registers in the class. Since
VReg_1 was being treated as an alias of VGPR_32, the two classes had
exactly the same size. The sort used wasn't a stable sort, and even if
it were, I believe the tie breaker would effectively end up being the
alphabetical ordering of the class names. There appear to be issues
trying to use an empty set of registers, so add only one register so
VReg_1 will always sort to the end.

Also add a comment explaining how VReg_1 is a dirty hack for
SelectionDAG.

This does end up changing the behavior of i1 with inline asm and VGPR
constraints, but the existing behavior was already nonsensical and
inconsistent. It should probably be disallowed anyway.

Fixes bug 43699

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIRegisterInfo.td
    llvm/test/CodeGen/AMDGPU/inline-asm.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 82219cbdf3b2..6ea6ec00e742 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -682,7 +682,21 @@ def AReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
   let AllocationPriority = 8;
 }
 
-def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
+
+// This is not a real register. This is just to have a register to add
+// to VReg_1 that does not alias any real register that would
+// introduce inferred register classess.
+def ARTIFICIAL_VGPR : SIReg <"invalid vgpr", 0> {
+  let isArtificial = 1;
+}
+
+// FIXME: Should specify an empty set for this. No register should
+// ever be allocated using VReg_1. This is a hack for SelectionDAG
+// that should always be lowered by SILowerI1Copies. TableGen crashes
+// on an empty register set, but also sorts register classes based on
+// the number of registers in them. Add only one register so this is
+// sorted to the end and not preferred over VGPR_32.
+def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add ARTIFICIAL_VGPR)> {
   let Size = 1;
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
index a964dedb2713..cb06eb043e18 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll
@@ -198,8 +198,7 @@ entry:
 }
 
 ; CHECK-LABEL: {{^}}i1_imm_input_phys_vgpr:
-; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], -1
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, [[MASK]]
+; CHECK: v_mov_b32_e32 v0, 1{{$}}
 ; CHECK: ; use v0
 define amdgpu_kernel void @i1_imm_input_phys_vgpr() {
 entry:
@@ -207,14 +206,14 @@ entry:
   ret void
 }
 
+
+; FIXME: This behavior is nonsense. We should probably disallow i1 asm
+
 ; CHECK-LABEL: {{^}}i1_input_phys_vgpr:
 ; CHECK: {{buffer|flat}}_load_ubyte [[LOAD:v[0-9]+]]
-; CHECK: v_and_b32_e32 [[LOAD]], 1, [[LOAD]]
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, [[LOAD]]
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NOT: [[LOAD]]
 ; CHECK: ; use v0
-; CHECK: v_cmp_ne_u32_e32 vcc, 0, v1
-; CHECK: v_cndmask_b32_e64 [[STORE:v[0-9]+]], 0, 1, vcc
+; CHECK: v_and_b32_e32 [[STORE:v[0-9]+]], 1, v1
 ; CHECK: {{buffer|flat}}_store_byte [[STORE]],
 define amdgpu_kernel void @i1_input_phys_vgpr() {
 entry:
@@ -224,12 +223,12 @@ entry:
   ret void
 }
 
-; FIXME: Should be scheduled to shrink vcc
+; FIXME: Should probably be masking high bits of load.
 ; CHECK-LABEL: {{^}}i1_input_phys_vgpr_x2:
-; CHECK: v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK: v_cndmask_b32_e64 v0, 0, -1, vcc
-; CHECK: v_cmp_eq_u32_e32 vcc, 1, v1
-; CHECK: v_cndmask_b32_e64 v1, 0, -1, vcc
+; CHECK: buffer_load_ubyte v0
+; CHECK-NEXT: buffer_load_ubyte v1
+; CHECK-NEXT: s_waitcnt
+; CHECK-NEXT: ASMSTART
 define amdgpu_kernel void @i1_input_phys_vgpr_x2() {
 entry:
   %val0 = load volatile i1, i1 addrspace(1)* undef


        

