[llvm] AMDGPU: Match and Select BITOP3 on gfx950 (PR #117843)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 26 22:28:43 PST 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/117843
>From c609205ba0205f996a033807d384e9148d30ea2e Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 2 Mar 2023 14:10:01 -0800
Subject: [PATCH] AMDGPU: Match and Select BITOP3 on gfx950
Co-authored-by: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 170 ++++++++
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 3 +
.../AMDGPU/AMDGPUInstructionSelector.cpp | 202 ++++++++++
.../Target/AMDGPU/AMDGPUInstructionSelector.h | 1 +
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 13 +
llvm/test/CodeGen/AMDGPU/bitop3.ll | 368 ++++++++++++++++++
6 files changed, 757 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/bitop3.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 7d78e9cd7eab6f..c0e01a020e0eb9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3552,6 +3552,176 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
return true;
}
+// Match a BITOP3 operation and return the number of matched instructions plus
+// the truth table.
+static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
+ SmallVectorImpl<SDValue> &Src) {
+ unsigned NumOpcodes = 0;
+ uint8_t LHSBits, RHSBits;
+
+ auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
+ // Define the truth table given the permutations of the Src0, Src1, Src2 bits:
+ // 0 0 0
+ // 0 0 1
+ // 0 1 0
+ // 0 1 1
+ // 1 0 0
+ // 1 0 1
+ // 1 1 0
+ // 1 1 1
+ const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
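+ // Each operand recorded in Src[I] contributes the column pattern SrcBits[I];
+ // parent expressions then combine those patterns bitwise. For example,
+ // Src[0] & Src[1] gives 0xf0 & 0xcc = 0xc0, and ~Src[0] & Src[1] gives
+ // 0x0f & 0xcc = 0x0c.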
+
+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->isAllOnes()) {
+ Bits = 0xff;
+ return true;
+ }
+ if (C->isZero()) {
+ Bits = 0;
+ return true;
+ }
+ }
+
+ for (unsigned I = 0; I < Src.size(); ++I) {
+ // Try to find existing reused operand
+ if (Src[I] == Op) {
+ Bits = SrcBits[I];
+ return true;
+ }
+ // Try to replace parent operator
+ if (Src[I] == In) {
+ Bits = SrcBits[I];
+ Src[I] = Op;
+ return true;
+ }
+ }
+
+ if (Src.size() == 3) {
+ // No room left for operands. Try one last time; there can be a 'not' of
+ // one of our source operands. In this case we can compute the bits
+ // without growing the Src vector.
+ if (Op.getOpcode() == ISD::XOR) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (C->isAllOnes()) {
+ SDValue LHS = Op.getOperand(0);
+ for (unsigned I = 0; I < Src.size(); ++I) {
+ if (Src[I] == LHS) {
+ Bits = ~SrcBits[I];
+ return true;
+ }
+ }
+ }
+ }
+ }
+
+ return false;
+ }
+
+ Bits = SrcBits[Src.size()];
+ Src.push_back(Op);
+ return true;
+ };
+
+ switch (In.getOpcode()) {
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ SDValue LHS = In.getOperand(0);
+ SDValue RHS = In.getOperand(1);
+
+ SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
+ if (!getOperandBits(LHS, LHSBits) ||
+ !getOperandBits(RHS, RHSBits)) {
+ Src = Backup;
+ return std::make_pair(0, 0);
+ }
+
+ // Recursion is naturally limited by the size of the operand vector.
+ auto Op = BitOp3_Op(LHS, Src);
+ if (Op.first) {
+ NumOpcodes += Op.first;
+ LHSBits = Op.second;
+ }
+
+ Op = BitOp3_Op(RHS, Src);
+ if (Op.first) {
+ NumOpcodes += Op.first;
+ RHSBits = Op.second;
+ }
+ break;
+ }
+ default:
+ return std::make_pair(0, 0);
+ }
+
+ uint8_t TTbl;
+ switch (In.getOpcode()) {
+ case ISD::AND:
+ TTbl = LHSBits & RHSBits;
+ break;
+ case ISD::OR:
+ TTbl = LHSBits | RHSBits;
+ break;
+ case ISD::XOR:
+ TTbl = LHSBits ^ RHSBits;
+ break;
+ default:
+ break;
+ }
+
+ return std::make_pair(NumOpcodes + 1, TTbl);
+}
+
+bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
+ SDValue &Src2, SDValue &Tbl) const {
+ SmallVector<SDValue, 3> Src;
+ uint8_t TTbl;
+ unsigned NumOpcodes;
+
+ std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
+
+ // The Src.empty() case can happen if all operands are all-zero or all-one
+ // constants. Normally this should have been optimized out before reaching here.
+ if (NumOpcodes < 2 || Src.empty())
+ return false;
+
+ // For a uniform case the threshold should be higher to account for moves
+ // between VGPRs and SGPRs. It needs one operand in a VGPR, the other two can
+ // be in SGPRs, and a readfirstlane afterwards.
+ if (NumOpcodes < 4 && !In->isDivergent())
+ return false;
+
+ if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
+ // Avoid using BITOP3 for OR3, XOR3, and AND_OR. This is not faster, but it
+ // makes the asm more readable. This cannot be modeled with AddedComplexity
+ // because the selector does not know how many operations we matched.
+ if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
+ (In.getOperand(0).getOpcode() == In.getOpcode() ||
+ In.getOperand(1).getOpcode() == In.getOpcode()))
+ return false;
+
+ if (In.getOpcode() == ISD::OR &&
+ (In.getOperand(0).getOpcode() == ISD::AND ||
+ In.getOperand(1).getOpcode() == ISD::AND))
+ return false;
+ }
+
+ // The last operand can be ignored, turning a ternary operation into a binary
+ // one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
+ // 'c' with 'a' here without changing the answer. In some pathological cases
+ // it should even be possible to get an operation with a single operand if the
+ // optimizer does not catch it.
+ while (Src.size() < 3)
+ Src.push_back(Src[0]);
+
+ Src0 = Src[0];
+ Src1 = Src[1];
+ Src2 = Src[2];
+
+ Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
+ return true;
+}
+
SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
if (In.isUndef())
return CurDAG->getUNDEF(MVT::i32);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 5ae0b179d7d0e6..7e61eb470622f1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -242,6 +242,9 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
SDValue &SrcMods) const;
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
+ SDValue &Tbl) const;
+
SDValue getHi16Elt(SDValue In) const;
SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 7ce7562cdcaa95..71d23f9fe30c49 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3643,6 +3643,206 @@ bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
return true;
}
+// Match a BITOP3 operation and return the number of matched instructions plus
+// the truth table.
+static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
+ SmallVectorImpl<Register> &Src,
+ const MachineRegisterInfo &MRI) {
+ unsigned NumOpcodes = 0;
+ uint8_t LHSBits, RHSBits;
+
+ auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
+ // Define the truth table given the permutations of the Src0, Src1, Src2 bits:
+ // 0 0 0
+ // 0 0 1
+ // 0 1 0
+ // 0 1 1
+ // 1 0 0
+ // 1 0 1
+ // 1 1 0
+ // 1 1 1
+ const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
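+ // Each operand recorded in Src[I] contributes the column pattern SrcBits[I];
+ // parent expressions then combine those patterns bitwise. For example,
+ // Src[0] & Src[1] gives 0xf0 & 0xcc = 0xc0, and ~Src[0] & Src[1] gives
+ // 0x0f & 0xcc = 0x0c.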
+
+ if (mi_match(Op, MRI, m_AllOnesInt())) {
+ Bits = 0xff;
+ return true;
+ }
+ if (mi_match(Op, MRI, m_ZeroInt())) {
+ Bits = 0;
+ return true;
+ }
+
+ for (unsigned I = 0; I < Src.size(); ++I) {
+ // Try to find existing reused operand
+ if (Src[I] == Op) {
+ Bits = SrcBits[I];
+ return true;
+ }
+ // Try to replace parent operator
+ if (Src[I] == R) {
+ Bits = SrcBits[I];
+ Src[I] = Op;
+ return true;
+ }
+ }
+
+ if (Src.size() == 3) {
+ // No room left for operands. Try one last time; there can be a 'not' of
+ // one of our source operands. In this case we can compute the bits
+ // without growing the Src vector.
+ Register LHS;
+ if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
+ LHS = getSrcRegIgnoringCopies(LHS, MRI);
+ for (unsigned I = 0; I < Src.size(); ++I) {
+ if (Src[I] == LHS) {
+ Bits = ~SrcBits[I];
+ return true;
+ }
+ }
+ }
+
+ return false;
+ }
+
+ Bits = SrcBits[Src.size()];
+ Src.push_back(Op);
+ return true;
+ };
+
+ MachineInstr *MI = MRI.getVRegDef(R);
+ switch (MI->getOpcode()) {
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_OR:
+ case TargetOpcode::G_XOR: {
+ Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
+ Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
+
+ SmallVector<Register, 3> Backup(Src.begin(), Src.end());
+ if (!getOperandBits(LHS, LHSBits) ||
+ !getOperandBits(RHS, RHSBits)) {
+ Src = Backup;
+ return std::make_pair(0, 0);
+ }
+
+ // Recursion is naturally limited by the size of the operand vector.
+ auto Op = BitOp3_Op(LHS, Src, MRI);
+ if (Op.first) {
+ NumOpcodes += Op.first;
+ LHSBits = Op.second;
+ }
+
+ Op = BitOp3_Op(RHS, Src, MRI);
+ if (Op.first) {
+ NumOpcodes += Op.first;
+ RHSBits = Op.second;
+ }
+ break;
+ }
+ default:
+ return std::make_pair(0, 0);
+ }
+
+ uint8_t TTbl;
+ switch (MI->getOpcode()) {
+ case TargetOpcode::G_AND:
+ TTbl = LHSBits & RHSBits;
+ break;
+ case TargetOpcode::G_OR:
+ TTbl = LHSBits | RHSBits;
+ break;
+ case TargetOpcode::G_XOR:
+ TTbl = LHSBits ^ RHSBits;
+ break;
+ default:
+ break;
+ }
+
+ return std::make_pair(NumOpcodes + 1, TTbl);
+}
+
+bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
+ if (!Subtarget->hasBitOp3Insts())
+ return false;
+
+ SmallVector<Register, 3> Src;
+ uint8_t TTbl;
+ unsigned NumOpcodes;
+ Register DstReg = MI.getOperand(0).getReg();
+
+ std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
+
+ // The Src.empty() case can happen if all operands are all-zero or all-one
+ // constants. Normally this should have been optimized out before reaching here.
+ if (NumOpcodes < 2 || Src.empty())
+ return false;
+
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
+
+ // For a uniform case the threshold should be higher to account for moves
+ // between VGPRs and SGPRs. It needs one operand in a VGPR, the other two can
+ // be in SGPRs, and a readfirstlane afterwards.
+ if (NumOpcodes < 4 && !IsVALU)
+ return false;
+
+ bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
+ if (NumOpcodes == 2 && IsB32) {
+ // Avoid using BITOP3 for OR3, XOR3, and AND_OR. This is not faster, but it
+ // makes the asm more readable. This cannot be modeled with AddedComplexity
+ // because the selector does not know how many operations we matched.
+ if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
+ mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
+ mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
+ return false;
+ }
+
+ unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
+ unsigned CBL = STI.getConstantBusLimit(Opc);
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
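+ // Keep SGPR sources on the constant bus while the limit allows; any further
+ // SGPR sources are copied to VGPRs so the instruction stays legal.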
+ for (unsigned I = 0; I < Src.size(); ++I) {
+ const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
+ if (RB->getID() != AMDGPU::SGPRRegBankID)
+ continue;
+ if (CBL > 0) {
+ --CBL;
+ continue;
+ }
+ Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
+ .addReg(Src[I]);
+ Src[I] = NewReg;
+ }
+
+ // The last operand can be ignored, turning a ternary operation into a binary
+ // one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
+ // 'c' with 'a' here without changing the answer. In some pathological cases
+ // it should even be possible to get an operation with a single operand if the
+ // optimizer does not catch it.
+ while (Src.size() < 3)
+ Src.push_back(Src[0]);
+
+ auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
+ if (!IsB32)
+ MIB.addImm(0); // src_mod0
+ MIB.addReg(Src[0]);
+ if (!IsB32)
+ MIB.addImm(0); // src_mod1
+ MIB.addReg(Src[1]);
+ if (!IsB32)
+ MIB.addImm(0); // src_mod2
+ MIB.addReg(Src[2])
+ .addImm(TTbl);
+ if (!IsB32)
+ MIB.addImm(0); // op_sel
+
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ MI.eraseFromParent();
+
+ return true;
+}
+
bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
Register SrcReg = MI.getOperand(0).getReg();
if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
@@ -3682,6 +3882,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_AND:
case TargetOpcode::G_OR:
case TargetOpcode::G_XOR:
+ if (selectBITOP3(I))
+ return true;
if (selectImpl(I, *CoverageInfo))
return true;
return selectG_AND_OR_XOR(I);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index a81f1579fb9f33..d294300be40497 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -147,6 +147,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectSMFMACIntrin(MachineInstr &I) const;
bool selectPermlaneSwapIntrin(MachineInstr &I, Intrinsic::ID IntrID) const;
bool selectWaveAddress(MachineInstr &I) const;
+ bool selectBITOP3(MachineInstr &I) const;
bool selectStackRestore(MachineInstr &MI) const;
bool selectNamedBarrierInit(MachineInstr &I, Intrinsic::ID IID) const;
bool selectNamedBarrierInst(MachineInstr &I, Intrinsic::ID IID) const;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 34850e42a3d605..c8c36714909adf 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -6,6 +6,9 @@
//
//===----------------------------------------------------------------------===//
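+// Complex patterns matched by AMDGPUDAGToDAGISel::SelectBITOP3. Each returns
+// the three sources plus the 8-bit truth table immediate.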
+def BITOP3_32 : ComplexPattern<i32, 4, "SelectBITOP3", [and, or, xor]>;
+def BITOP3_16 : ComplexPattern<i16, 4, "SelectBITOP3", [and, or, xor]>;
+
// Special case for v_div_fmas_{f32|f64}, since it seems to be the
// only VOP instruction that implicitly reads VCC.
let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod" in {
@@ -1275,6 +1278,16 @@ let SubtargetPredicate = HasBitOp3Insts in {
(i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i8:$bitop3)),
(i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
>;
+
+ def : GCNPat<
+ (i32 (BITOP3_32 i32:$src0, i32:$src1, i32:$src2, i8:$bitop3)),
+ (i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3))
+ >;
+
+ def : GCNPat<
+ (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i8:$bitop3)),
+ (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
+ >;
} // End SubtargetPredicate = HasBitOp3Insts
class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat<
diff --git a/llvm/test/CodeGen/AMDGPU/bitop3.ll b/llvm/test/CodeGen/AMDGPU/bitop3.ll
new file mode 100644
index 00000000000000..dd608ef0e5a53d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bitop3.ll
@@ -0,0 +1,368 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950,GFX950-SDAG %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950,GFX950-GISEL %s
+
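+; The bitop3 immediate is the 8-bit truth table of the selected expression over
+; the column patterns src0 = 0xf0, src1 = 0xcc, src2 = 0xaa (see SelectBITOP3),
+; e.g. ~src0 & ~src1 & ~src2 = 0x0f & 0x33 & 0x55 = 1.
+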
+; ========= Single bit functions =========
+
+define amdgpu_ps float @not_and_not_and_not_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: not_and_not_and_not_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:1
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %notb = xor i32 %b, -1
+ %notc = xor i32 %c, -1
+ %and1 = and i32 %nota, %notc
+ %and2 = and i32 %and1, %notb
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @not_and_not_and_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: not_and_not_and_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:2
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %notb = xor i32 %b, -1
+ %and1 = and i32 %nota, %c
+ %and2 = and i32 %and1, %notb
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @not_and_and_not_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: not_and_and_not_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:4
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %notc = xor i32 %c, -1
+ %and1 = and i32 %nota, %notc
+ %and2 = and i32 %and1, %b
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @not_and_and_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: not_and_and_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:8
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %and1 = and i32 %nota, %c
+ %and2 = and i32 %and1, %b
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @and_not_and_not_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: and_not_and_not_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x10
+; GCN-NEXT: ; return to shader part epilog
+ %notb = xor i32 %b, -1
+ %notc = xor i32 %c, -1
+ %and1 = and i32 %a, %notc
+ %and2 = and i32 %and1, %notb
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @and_not_and_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: and_not_and_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x20
+; GCN-NEXT: ; return to shader part epilog
+ %notb = xor i32 %b, -1
+ %and1 = and i32 %a, %c
+ %and2 = and i32 %and1, %notb
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @and_and_not_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: and_and_not_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x40
+; GCN-NEXT: ; return to shader part epilog
+ %notc = xor i32 %c, -1
+ %and1 = and i32 %a, %notc
+ %and2 = and i32 %and1, %b
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @and_and_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: and_and_and:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80
+; GCN-NEXT: ; return to shader part epilog
+ %and1 = and i32 %a, %c
+ %and2 = and i32 %and1, %b
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+; ========= Multi bit functions =========
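+; Compound expressions combine the column patterns bitwise, e.g. ~a & b in
+; test_12 gives ~0xf0 & 0xcc = 0xc.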
+
+define amdgpu_ps float @test_12(i32 %a, i32 %b) {
+; GCN-LABEL: test_12:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %and1 = and i32 %nota, %b
+ %ret_cast = bitcast i32 %and1 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @test_63(i32 %a, i32 %b) {
+; GCN-LABEL: test_63:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0x3f
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %notb = xor i32 %b, -1
+ %or = or i32 %nota, %notb
+ %ret_cast = bitcast i32 %or to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @test_59(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_59:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v1, v2 bitop3:0x3b
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %notb = xor i32 %b, -1
+ %and1 = and i32 %nota, %c
+ %or = or i32 %and1, %notb
+ %ret_cast = bitcast i32 %or to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @test_126(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_126:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v0, v0, v2, v1 bitop3:0x7e
+; GCN-NEXT: ; return to shader part epilog
+ %xor1 = xor i32 %a, %b
+ %xor2 = xor i32 %a, %c
+ %or = or i32 %xor1, %xor2
+ %ret_cast = bitcast i32 %or to float
+ ret float %ret_cast
+}
+
+; The Src vector is exhausted during the search but recovered using the 'not'
+; lookahead. GlobalISel sees slightly different input, so this does not happen
+; there.
+
+; FIXME: Improve global isel code.
+
+define amdgpu_ps float @test_12_src_overflow(i32 %a, i32 %b, i32 %c) {
+; GFX950-SDAG-LABEL: test_12_src_overflow:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: test_12_src_overflow:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_bitop3_b32 v3, v0, v2, v0 bitop3:0xc
+; GFX950-GISEL-NEXT: v_bitop3_b32 v0, v0, v2, v0 bitop3:3
+; GFX950-GISEL-NEXT: v_bitop3_b32 v0, v3, v1, v0 bitop3:0xc8
+; GFX950-GISEL-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %notc = xor i32 %c, -1
+ %and1 = and i32 %nota, %c
+ %and2 = and i32 %and1, %b
+ %and3 = and i32 %nota, %notc
+ %and4 = and i32 %and3, %b
+ %or = or i32 %and2, %and4
+ %ret_cast = bitcast i32 %or to float
+ ret float %ret_cast
+}
+
+; This could be a single BITOP3 operation with tbl = 100 (0x64), but the Src
+; vector is exhausted during the search.
+
+define amdgpu_ps float @test_100_src_overflow(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_100_src_overflow:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b32 v3, v1, v2, v0 bitop3:0x10
+; GCN-NEXT: v_bitop3_b32 v4, v0, v2, v1 bitop3:0x40
+; GCN-NEXT: v_bitop3_b32 v0, v1, v2, v0 bitop3:0x20
+; GCN-NEXT: v_or3_b32 v0, v3, v4, v0
+; GCN-NEXT: ; return to shader part epilog
+ %or1 = or i32 %c, %a
+ %not1 = xor i32 %or1, -1
+ %and1 = and i32 %b, %not1
+ %not2 = xor i32 %b, -1
+ %and2 = and i32 %a, %not2
+ %and3 = and i32 %and2, %c
+ %and4 = and i32 %b, %a
+ %not3 = xor i32 %c, -1
+ %and5 = and i32 %and4, %not3
+ %or2 = or i32 %and1, %and3
+ %or3 = or i32 %or2, %and5
+ %ret_cast = bitcast i32 %or3 to float
+ ret float %ret_cast
+}
+
+; ========= Ternary logical operations take precedence =========
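+; Two-op 32-bit cases that already map to v_xor3, v_or3, or v_and_or keep the
+; dedicated instructions for readability (see SelectBITOP3).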
+
+define amdgpu_ps float @test_xor3(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_xor3:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_xor_b32_e32 v0, v0, v1
+; GCN-NEXT: v_xor_b32_e32 v0, v0, v2
+; GCN-NEXT: ; return to shader part epilog
+ %xor1 = xor i32 %a, %b
+ %xor2 = xor i32 %xor1, %c
+ %ret_cast = bitcast i32 %xor2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @test_or3(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_or3:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_or3_b32 v0, v0, v1, v2
+; GCN-NEXT: ; return to shader part epilog
+ %or1 = or i32 %a, %b
+ %or2 = or i32 %or1, %c
+ %ret_cast = bitcast i32 %or2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @test_and_or(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_and_or:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_and_or_b32 v0, v0, v1, v2
+; GCN-NEXT: ; return to shader part epilog
+ %and1 = and i32 %a, %b
+ %or1 = or i32 %and1, %c
+ %ret_cast = bitcast i32 %or1 to float
+ ret float %ret_cast
+}
+
+; ========= Uniform cases =========
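+; A uniform expression is only turned into BITOP3 when at least 4 operations
+; are matched, to account for the moves between SGPRs and VGPRs (see the
+; NumOpcodes < 4 check in SelectBITOP3).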
+
+define amdgpu_ps float @uniform_3_op(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
+; GCN-LABEL: uniform_3_op:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_andn2_b32 s0, s2, s0
+; GCN-NEXT: s_and_b32 s0, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %and1 = and i32 %nota, %c
+ %and2 = and i32 %and1, %b
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+define amdgpu_ps float @uniform_4_op(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
+; GCN-LABEL: uniform_4_op:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_bitop3_b32 v0, s0, v0, v1 bitop3:2
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i32 %a, -1
+ %notb = xor i32 %b, -1
+ %and1 = and i32 %nota, %c
+ %and2 = and i32 %and1, %notb
+ %ret_cast = bitcast i32 %and2 to float
+ ret float %ret_cast
+}
+
+; ========= 16 bit tests =========
+
+define amdgpu_ps half @not_and_not_and_not_and_b16(i16 %a, i16 %b, i16 %c) {
+; GCN-LABEL: not_and_not_and_not_and_b16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b16 v0, v0, v1, v2 bitop3:1
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i16 %a, -1
+ %notb = xor i16 %b, -1
+ %notc = xor i16 %c, -1
+ %and1 = and i16 %nota, %notc
+ %and2 = and i16 %and1, %notb
+ %ret_cast = bitcast i16 %and2 to half
+ ret half %ret_cast
+}
+
+define amdgpu_ps half @not_and_not_and_and_b16(i16 %a, i16 %b, i16 %c) {
+; GCN-LABEL: not_and_not_and_and_b16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b16 v0, v0, v1, v2 bitop3:2
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i16 %a, -1
+ %notb = xor i16 %b, -1
+ %and1 = and i16 %nota, %c
+ %and2 = and i16 %and1, %notb
+ %ret_cast = bitcast i16 %and2 to half
+ ret half %ret_cast
+}
+
+define amdgpu_ps half @not_and_and_not_and_b16(i16 %a, i16 %b, i16 %c) {
+; GCN-LABEL: not_and_and_not_and_b16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bitop3_b16 v0, v0, v1, v2 bitop3:4
+; GCN-NEXT: ; return to shader part epilog
+ %nota = xor i16 %a, -1
+ %notc = xor i16 %c, -1
+ %and1 = and i16 %nota, %notc
+ %and2 = and i16 %and1, %b
+ %ret_cast = bitcast i16 %and2 to half
+ ret half %ret_cast
+}
+
+define amdgpu_ps half @test_xor3_b16(i16 %a, i16 %b, i16 %c) {
+; GFX950-SDAG-LABEL: test_xor3_b16:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_bitop3_b16 v0, v0, v2, v1 bitop3:0x96
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: test_xor3_b16:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
+; GFX950-GISEL-NEXT: ; return to shader part epilog
+ %xor1 = xor i16 %a, %b
+ %xor2 = xor i16 %xor1, %c
+ %ret_cast = bitcast i16 %xor2 to half
+ ret half %ret_cast
+}
+
+define amdgpu_ps half @test_or3_b16(i16 %a, i16 %b, i16 %c) {
+; GFX950-SDAG-LABEL: test_or3_b16:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_bitop3_b16 v0, v0, v2, v1 bitop3:0xfe
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: test_or3_b16:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_or3_b32 v0, v0, v1, v2
+; GFX950-GISEL-NEXT: ; return to shader part epilog
+ %or1 = or i16 %a, %b
+ %or2 = or i16 %or1, %c
+ %ret_cast = bitcast i16 %or2 to half
+ ret half %ret_cast
+}
+
+define amdgpu_ps half @test_and_or_b16(i16 %a, i16 %b, i16 %c) {
+; GFX950-SDAG-LABEL: test_and_or_b16:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_bitop3_b16 v0, v0, v2, v1 bitop3:0xec
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: test_and_or_b16:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_and_or_b32 v0, v0, v1, v2
+; GFX950-GISEL-NEXT: ; return to shader part epilog
+ %and1 = and i16 %a, %b
+ %or1 = or i16 %and1, %c
+ %ret_cast = bitcast i16 %or1 to half
+ ret half %ret_cast
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX950: {{.*}}