[llvm] r293540 - AMDGPU: Undo sub x, c -> add x, -c canonicalization

Mon Jan 30 11:30:25 PST 2017

Author: arsenm
Date: Mon Jan 30 13:30:24 2017
New Revision: 293540

URL: http://llvm.org/viewvc/llvm-project?rev=293540&view=rev
Log:
AMDGPU: Undo sub x, c -> add x, -c canonicalization

This is worse if the original constant is an inline immediate.

This should also be done for 64-bit adds, but requires fixing
operand folding bugs first.

Added:
    llvm/trunk/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
Modified:
    llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
    llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
    llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td
    llvm/trunk/test/CodeGen/AMDGPU/s_addk_i32.ll
    llvm/trunk/test/CodeGen/AMDGPU/sub.i16.ll

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td?rev=293540&r1=293539&r2=293540&view=diff
==============================================================================

--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td Mon Jan 30 13:30:24 2017
@@ -295,6 +295,19 @@ class VGPRImm <dag frag> : PatLeaf<frag,
   return Limit < 10;
 }]>;
 
+def NegateImm : SDNodeXForm<imm, [{
+  return CurDAG->getConstant(-N->getSExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+// TODO: When FP inline imm values work?
+def NegSubInlineConst32 : ImmLeaf<i32, [{
+  return Imm < -16 && Imm >= -64;
+}], NegateImm>;
+
+def NegSubInlineConst16 : ImmLeaf<i16, [{
+  return Imm < -16 && Imm >= -64;
+}], NegateImm>;
+
 //===----------------------------------------------------------------------===//
 // Custom Operands
 //===----------------------------------------------------------------------===//

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=293540&r1=293539&r2=293540&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Mon Jan 30 13:30:24 2017
@@ -1125,6 +1125,15 @@ def : SHA256MaPattern <V_BFI_B32, V_XOR_
 def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
 def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
 
+
+// Undo sub x, c -> add x, -c canonicalization since c is more likely
+// an inline immediate than -c.
+// TODO: Also do for 64-bit.
+def : Pat<
+  (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+  (S_SUB_I32 $src0, NegSubInlineConst32:$src1)
+>;
+
 //============================================================================//
 // Assembler aliases
 //============================================================================//

Modified: llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td?rev=293540&r1=293539&r2=293540&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td Mon Jan 30 13:30:24 2017
@@ -494,6 +494,14 @@ def : Pat <
   (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)
 >;
 
+// Undo sub x, c -> add x, -c canonicalization since c is more likely
+// an inline immediate than -c.
+// TODO: Also do for 64-bit.
+def : Pat<
+  (add i16:$src0, (i16 NegSubInlineConst16:$src1)),
+  (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
+>;
+
 } // End Predicates = [isVI]
 
 //===----------------------------------------------------------------------===//

Modified: llvm/trunk/test/CodeGen/AMDGPU/s_addk_i32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/s_addk_i32.ll?rev=293540&r1=293539&r2=293540&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/s_addk_i32.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/s_addk_i32.ll Mon Jan 30 13:30:24 2017
@@ -37,13 +37,22 @@ define void @s_addk_i32_k1(i32 addrspace
 }
 
 ; SI-LABEL: {{^}}s_addk_i32_k2:
-; SI: s_addk_i32 {{s[0-9]+}}, 0xffef{{$}}
+; SI: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, 17
 ; SI: s_endpgm
 define void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
   %add = add i32 %b, -17
   store i32 %add, i32 addrspace(1)* %out
   ret void
 }
+
+; SI-LABEL: {{^}}s_addk_i32_k3:
+; SI: s_addk_i32 {{s[0-9]+}}, 0xffbf{{$}}
+; SI: s_endpgm
+define void @s_addk_i32_k3(i32 addrspace(1)* %out, i32 %b) {
+  %add = add i32 %b, -65
+  store i32 %add, i32 addrspace(1)* %out
+  ret void
+}
 
 ; SI-LABEL: {{^}}s_addk_v2i32_k0:
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41

Added: llvm/trunk/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll?rev=293540&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll Mon Jan 30 13:30:24 2017
@@ -0,0 +1,186 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; Test that add/sub with a constant is swapped to sub/add with negated
+; constant to minimize code size.
+
+; GCN-LABEL: {{^}}v_test_i32_x_sub_64:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
+define void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 %x, 64
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_x_sub_64_multi_use:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[Y:v[0-9]+]]
+; GCN-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
+; GCN-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[Y]]
+define void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load volatile i32, i32 addrspace(1)* %gep
+  %y = load volatile i32, i32 addrspace(1)* %gep
+  %result0 = sub i32 %x, 64
+  %result1 = sub i32 %y, 64
+  store volatile i32 %result0, i32 addrspace(1)* %gep.out
+  store volatile i32 %result1, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_64_sub_x:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
+define void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 64, %x
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_x_sub_65:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xffffffbf, [[X]]
+define void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 %x, 65
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_65_sub_x:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0x41, [[X]]
+define void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 65, %x
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_x_sub_neg16:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 16, [[X]]
+define void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 %x, -16
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_neg16_sub_x:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, -16, [[X]]
+define void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 -16, %x
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_x_sub_neg17:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 17, [[X]]
+define void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 %x, -17
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i32_neg17_sub_x:
+; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
+; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0xffffffef, [[X]]
+define void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+  %x = load i32, i32 addrspace(1)* %gep
+  %result = sub i32 -17, %x
+  store i32 %result, i32 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_i32_x_sub_64:
+; GCN: s_load_dword [[X:s[0-9]+]]
+; GCN: s_sub_i32 s{{[0-9]+}}, [[X]], 64
+define void @s_test_i32_x_sub_64(i32 %x) #0 {
+  %result = sub i32 %x, 64
+  call void asm sideeffect "; use $0", "s"(i32 %result)
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i16_x_sub_64:
+; VI: {{buffer|flat}}_load_ushort [[X:v[0-9]+]]
+; VI: v_subrev_u16_e32 v{{[0-9]+}}, 64, [[X]]
+define void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
+  %x = load i16, i16 addrspace(1)* %gep
+  %result = sub i16 %x, 64
+  store i16 %result, i16 addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_i16_x_sub_64_multi_use:
+; GCN: {{buffer|flat}}_load_ushort [[X:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort [[Y:v[0-9]+]]
+; VI-DAG: v_subrev_u16_e32 v{{[0-9]+}}, 64, [[X]]
+; VI-DAG: v_subrev_u16_e32 v{{[0-9]+}}, 64, [[Y]]
+
+; SI-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
+; SI-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[Y]]
+define void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
+  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
+  %x = load volatile i16, i16 addrspace(1)* %gep
+  %y = load volatile i16, i16 addrspace(1)* %gep
+  %result0 = sub i16 %x, 64
+  %result1 = sub i16 %y, 64
+  store volatile i16 %result0, i16 addrspace(1)* %gep.out
+  store volatile i16 %result1, i16 addrspace(1)* %gep.out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }

Modified: llvm/trunk/test/CodeGen/AMDGPU/sub.i16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/sub.i16.ll?rev=293540&r1=293539&r2=293540&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sub.i16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/sub.i16.ll Mon Jan 30 13:30:24 2017
@@ -52,7 +52,7 @@ define void @v_test_sub_i16_neg_constant
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_sub_i16_inline_63:
 ; VI: flat_load_ushort [[A:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffffc1, [[A]]
+; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], 63, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
 define void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()