[llvm] r348601 - [AMDGPU] Shrink scalar AND, OR, XOR instructions
Graham Sellers via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 7 07:33:21 PST 2018
Author: gsellers
Date: Fri Dec 7 07:33:21 2018
New Revision: 348601
URL: http://llvm.org/viewvc/llvm-project?rev=348601&view=rev
Log:
[AMDGPU] Shrink scalar AND, OR, XOR instructions
This change attempts to shrink scalar AND, OR and XOR instructions which take an immediate that isn't inlineable.
It performs:
AND s0, s0, ~(1 << n) -> BITSET0 s0, n
OR s0, s0, (1 << n) -> BITSET1 s0, n
AND s0, s1, x -> ANDN2 s0, s1, ~x
OR s0, s1, x -> ORN2 s0, s1, ~x
XOR s0, s1, x -> XNOR s0, s1, ~x
In particular, this catches setting and clearing the sign bit for fabs (and x, 0x7fffffff -> bitset0 x, 31 and or x, 0x80000000 -> bitset1 x, 31).
Added:
llvm/trunk/test/CodeGen/AMDGPU/andorbitset.ll
llvm/trunk/test/CodeGen/AMDGPU/andorxorinvimm.ll
Modified:
llvm/trunk/lib/Target/AMDGPU/SIShrinkInstructions.cpp
llvm/trunk/test/CodeGen/AMDGPU/fabs.ll
llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.ll
llvm/trunk/test/CodeGen/AMDGPU/gep-address-space.ll
llvm/trunk/test/CodeGen/AMDGPU/local-64.ll
Modified: llvm/trunk/lib/Target/AMDGPU/SIShrinkInstructions.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIShrinkInstructions.cpp?rev=348601&r1=348600&r2=348601&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIShrinkInstructions.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIShrinkInstructions.cpp Fri Dec 7 07:33:21 2018
@@ -212,6 +212,82 @@ static void shrinkScalarCompare(const SI
}
}
+/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
+/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
+/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
+/// XNOR (as a ^ b == ~(a ^ ~b)).
+/// \returns true if the caller should continue the machine function iterator
+static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
+ MachineRegisterInfo &MRI,
+ const SIInstrInfo *TII,
+ MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
+ const MachineOperand *Dest = &MI.getOperand(0);
+ MachineOperand *Src0 = &MI.getOperand(1);
+ MachineOperand *Src1 = &MI.getOperand(2);
+ // Assumes the immediate (if any) sits in src1 — presumably relying on
+ // ISel canonicalizing constants to the second operand.
+ // NOTE(review): SrcImm/SrcReg are never reassigned, so SrcImm == Src1
+ // always holds and the "SrcImm == Src0" commute guard below can never
+ // fire as written — confirm whether a src0-immediate case is reachable.
+ MachineOperand *SrcReg = Src0;
+ MachineOperand *SrcImm = Src1;
+
+ // Only worth rewriting when the literal is NOT already inlineable;
+ // an inline constant costs nothing extra to encode.
+ if (SrcImm->isImm() &&
+ !AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) {
+ uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
+ // NewImm doubles as the "found a transform" flag: 0 means no rewrite.
+ // A genuinely computed NewImm of 0 would require an original literal of
+ // -1, -2 or 1, all of which are inlineable and filtered out above.
+ uint32_t NewImm = 0;
+
+ if (Opc == AMDGPU::S_AND_B32) {
+ if (isPowerOf2_32(~Imm)) {
+ // AND with ~(1 << n) clears a single bit: use S_BITSET0 n.
+ // Imm is all-ones below the cleared bit, so countTrailingOnes
+ // yields the bit index n.
+ NewImm = countTrailingOnes(Imm);
+ Opc = AMDGPU::S_BITSET0_B32;
+ } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+ // a & x == a &~ ~x, and ~x is an inline constant.
+ NewImm = ~Imm;
+ Opc = AMDGPU::S_ANDN2_B32;
+ }
+ } else if (Opc == AMDGPU::S_OR_B32) {
+ if (isPowerOf2_32(Imm)) {
+ // OR with (1 << n) sets a single bit: use S_BITSET1 n.
+ NewImm = countTrailingZeros(Imm);
+ Opc = AMDGPU::S_BITSET1_B32;
+ } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+ // a | x == a |~ ~x, and ~x is an inline constant.
+ NewImm = ~Imm;
+ Opc = AMDGPU::S_ORN2_B32;
+ }
+ } else if (Opc == AMDGPU::S_XOR_B32) {
+ if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+ // a ^ x == ~(a ^ ~x), i.e. XNOR with the inverted constant.
+ NewImm = ~Imm;
+ Opc = AMDGPU::S_XNOR_B32;
+ }
+ } else {
+ llvm_unreachable("unexpected opcode");
+ }
+
+ // ANDN2/ORN2 are not commutative (only src1 is inverted), so if the
+ // immediate were in src0 it must be moved to src1 first; abandon the
+ // rewrite (NewImm = 0) if the commute fails.
+ if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
+ SrcImm == Src0) {
+ if (!TII->commuteInstruction(MI, false, 1, 2))
+ NewImm = 0;
+ }
+
+ if (NewImm != 0) {
+ // For virtual registers, don't rewrite yet: only record allocation
+ // hints tying dest and source together, since the in-place rewrite
+ // below requires dest == src — presumably to be retried once the
+ // registers are assigned. Returning true tells the caller to
+ // continue to the next instruction.
+ if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
+ SrcReg->isReg()) {
+ MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
+ MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
+ return true;
+ }
+
+ // Physical registers: the tied forms (BITSET reads and writes its
+ // dest; the N2/XNOR rewrite reuses the operand slots) require the
+ // source and destination to already be the same register.
+ if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
+ MI.setDesc(TII->get(Opc));
+ if (Opc == AMDGPU::S_BITSET0_B32 ||
+ Opc == AMDGPU::S_BITSET1_B32) {
+ // BITSET takes only (dest, bit-index): put the index in src0
+ // and drop the now-surplus third operand.
+ Src0->ChangeToImmediate(NewImm);
+ MI.RemoveOperand(2);
+ } else {
+ // ANDN2/ORN2/XNOR keep the same shape; just swap in the
+ // inverted immediate.
+ SrcImm->setImm(NewImm);
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
@@ -512,6 +588,14 @@ bool SIShrinkInstructions::runOnMachineF
continue;
}
+ // Shrink scalar logic operations.
+ if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
+ MI.getOpcode() == AMDGPU::S_OR_B32 ||
+ MI.getOpcode() == AMDGPU::S_XOR_B32) {
+ if (shrinkScalarLogicOp(ST, MRI, TII, MI))
+ continue;
+ }
+
if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
continue;
Added: llvm/trunk/test/CodeGen/AMDGPU/andorbitset.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/andorbitset.ll?rev=348601&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/andorbitset.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/andorbitset.ll Fri Dec 7 07:33:21 2018
@@ -0,0 +1,49 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}s_clear_msb:
+; SI: s_bitset0_b32 s{{[0-9]+}}, 31
+define amdgpu_kernel void @s_clear_msb(i32 addrspace(1)* %out, i32 %in) {
+ %x = and i32 %in, 2147483647
+ store i32 %x, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_set_msb:
+; SI: s_bitset1_b32 s{{[0-9]+}}, 31
+define amdgpu_kernel void @s_set_msb(i32 addrspace(1)* %out, i32 %in) {
+ %x = or i32 %in, 2147483648
+ store i32 %x, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_clear_lsb:
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, -2
+define amdgpu_kernel void @s_clear_lsb(i32 addrspace(1)* %out, i32 %in) {
+ %x = and i32 %in, 4294967294
+ store i32 %x, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_set_lsb:
+; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+define amdgpu_kernel void @s_set_lsb(i32 addrspace(1)* %out, i32 %in) {
+ %x = or i32 %in, 1
+ store i32 %x, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_clear_midbit:
+; SI: s_bitset0_b32 s{{[0-9]+}}, 8
+define amdgpu_kernel void @s_clear_midbit(i32 addrspace(1)* %out, i32 %in) {
+ %x = and i32 %in, 4294967039
+ store i32 %x, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_set_midbit:
+; SI: s_bitset1_b32 s{{[0-9]+}}, 8
+define amdgpu_kernel void @s_set_midbit(i32 addrspace(1)* %out, i32 %in) {
+ %x = or i32 %in, 256
+ store i32 %x, i32 addrspace(1)* %out
+ ret void
+}
Added: llvm/trunk/test/CodeGen/AMDGPU/andorxorinvimm.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/andorxorinvimm.ll?rev=348601&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/andorxorinvimm.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/andorxorinvimm.ll Fri Dec 7 07:33:21 2018
@@ -0,0 +1,49 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}s_or_to_orn2:
+; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_or_to_orn2(i32 addrspace(1)* %out, i32 %in) {
+ %x = or i32 %in, -51
+ store i32 %x, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_or_to_orn2_imm0:
+; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_or_to_orn2_imm0(i32 addrspace(1)* %out, i32 %in) {
+ %x = or i32 -51, %in
+ store i32 %x, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_and_to_andn2:
+; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_and_to_andn2(i32 addrspace(1)* %out, i32 %in) {
+ %x = and i32 %in, -51
+ store i32 %x, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_and_to_andn2_imm0:
+; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_and_to_andn2_imm0(i32 addrspace(1)* %out, i32 %in) {
+ %x = and i32 -51, %in
+ store i32 %x, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_xor_to_xnor:
+; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_xor_to_xnor(i32 addrspace(1)* %out, i32 %in) {
+ %x = xor i32 %in, -51
+ store i32 %x, i32 addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: {{^}}s_xor_to_xnor_imm0:
+; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_xor_to_xnor_imm0(i32 addrspace(1)* %out, i32 %in) {
+ %x = xor i32 -51, %in
+ store i32 %x, i32 addrspace(1)* %out
+ ret void
+}
Modified: llvm/trunk/test/CodeGen/AMDGPU/fabs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fabs.ll?rev=348601&r1=348600&r2=348601&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fabs.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fabs.ll Fri Dec 7 07:33:21 2018
@@ -11,7 +11,8 @@
; R600-NOT: AND
; R600: |PV.{{[XYZW]}}|
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; VI: s_bitset0_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) {
%bc= bitcast i32 %in to float
%fabs = call float @fabs(float %bc)
@@ -23,7 +24,8 @@ define amdgpu_kernel void @s_fabs_fn_fre
; R600-NOT: AND
; R600: |PV.{{[XYZW]}}|
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; VI: s_bitset0_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) {
%bc= bitcast i32 %in to float
%fabs = call float @llvm.fabs.f32(float %bc)
@@ -34,7 +36,8 @@ define amdgpu_kernel void @s_fabs_free(f
; FUNC-LABEL: {{^}}s_fabs_f32:
; R600: |{{(PV|T[0-9])\.[XYZW]}}|
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; VI: s_bitset0_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @s_fabs_f32(float addrspace(1)* %out, float %in) {
%fabs = call float @llvm.fabs.f32(float %in)
store float %fabs, float addrspace(1)* %out
Modified: llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.ll?rev=348601&r1=348600&r2=348601&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.ll Fri Dec 7 07:33:21 2018
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
@@ -35,6 +35,7 @@ define amdgpu_kernel void @fneg_fabs_fmu
; R600: -PV
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; VI: s_bitset1_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
%bc = bitcast i32 %in to float
%fabs = call float @llvm.fabs.f32(float %bc)
Modified: llvm/trunk/test/CodeGen/AMDGPU/gep-address-space.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/gep-address-space.ll?rev=348601&r1=348600&r2=348601&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/gep-address-space.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/gep-address-space.ll Fri Dec 7 07:33:21 2018
@@ -14,7 +14,7 @@ define amdgpu_kernel void @use_gep_addre
; CHECK-LABEL: {{^}}use_gep_address_space_large_offset:
; The LDS offset will be 65536 bytes, which is larger than the size of LDS on
; SI, which is why it is being OR'd with the base pointer.
-; SI: s_or_b32
+; SI: s_bitset1_b32
; CI: s_add_i32
; CHECK: ds_write_b32
define amdgpu_kernel void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind {
Modified: llvm/trunk/test/CodeGen/AMDGPU/local-64.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/local-64.ll?rev=348601&r1=348600&r2=348601&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/local-64.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/local-64.ll Fri Dec 7 07:33:21 2018
@@ -48,7 +48,7 @@ define amdgpu_kernel void @local_i8_load
; The LDS offset will be 65536 bytes, which is larger than the size of LDS on
; SI, which is why it is being OR'd with the base pointer.
-; SI-DAG: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
+; SI-DAG: s_bitset1_b32 [[ADDR:s[0-9]+]], 16
; CI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
; VI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
; GFX9-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
More information about the llvm-commits
mailing list