[llvm] AMDGPU: Match and Select BITOP3 on gfx950 (PR #117843)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 26 22:28:43 PST 2024


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/117843

>From c609205ba0205f996a033807d384e9148d30ea2e Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 2 Mar 2023 14:10:01 -0800
Subject: [PATCH] AMDGPU: Match and Select BITOP3 on gfx950

Co-authored-by: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 170 ++++++++
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h   |   3 +
 .../AMDGPU/AMDGPUInstructionSelector.cpp      | 202 ++++++++++
 .../Target/AMDGPU/AMDGPUInstructionSelector.h |   1 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td    |  13 +
 llvm/test/CodeGen/AMDGPU/bitop3.ll            | 368 ++++++++++++++++++
 6 files changed, 757 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/bitop3.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 7d78e9cd7eab6f..c0e01a020e0eb9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3552,6 +3552,176 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
   return true;
 }
 
+// Match a BITOP3 operation and return the number of matched instructions plus
+// the truth table.
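+// For example, (or (and a, b), c) matches two instructions; the sources end
+// up as {a, c, b} and the table is (0xf0 & 0xaa) | 0xcc = 0xec.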
+static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
+                                              SmallVectorImpl<SDValue> &Src) {
+  unsigned NumOpcodes = 0;
+  uint8_t LHSBits, RHSBits;
+
+  auto getOperandBits = [&Src, In](SDValue Op, uint8_t &Bits) -> bool {
+    // Define truth table given Src0, Src1, Src2 bits permutations:
+    //                          0     0     0
+    //                          0     0     1
+    //                          0     1     0
+    //                          0     1     1
+    //                          1     0     0
+    //                          1     0     1
+    //                          1     1     0
+    //                          1     1     1
+    const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
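+    // 0xf0, 0xcc and 0xaa are the Src0, Src1 and Src2 columns of this table
+    // read as 8-bit masks: bit I of SrcBits[N] is set iff SrcN is 1 in row I.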
+
+    if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->isAllOnes()) {
+        Bits = 0xff;
+        return true;
+      }
+      if (C->isZero()) {
+        Bits = 0;
+        return true;
+      }
+    }
+
+    for (unsigned I = 0; I < Src.size(); ++I) {
+      // Try to find existing reused operand
+      if (Src[I] == Op) {
+        Bits = SrcBits[I];
+        return true;
+      }
+      // Try to replace parent operator
+      if (Src[I] == In) {
+        Bits = SrcBits[I];
+        Src[I] = Op;
+        return true;
+      }
+    }
+
+    if (Src.size() == 3) {
+      // No room left for operands. Try one last time: there can be a 'not' of
+      // one of our source operands, in which case we can compute the bits
+      // without growing the Src vector.
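+      // For example, with Src = {a, b, c} and Op = (xor a, -1) the result is
+      // ~SrcBits[0] = 0x0f, without using a fourth slot.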
+      if (Op.getOpcode() == ISD::XOR) {
+        if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+          if (C->isAllOnes()) {
+            SDValue LHS = Op.getOperand(0);
+            for (unsigned I = 0; I < Src.size(); ++I) {
+              if (Src[I] == LHS) {
+                Bits = ~SrcBits[I];
+                return true;
+              }
+            }
+          }
+        }
+      }
+
+      return false;
+    }
+
+    Bits = SrcBits[Src.size()];
+    Src.push_back(Op);
+    return true;
+  };
+
+  switch (In.getOpcode()) {
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR: {
+    SDValue LHS = In.getOperand(0);
+    SDValue RHS = In.getOperand(1);
+
+    SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
+    if (!getOperandBits(LHS, LHSBits) ||
+        !getOperandBits(RHS, RHSBits)) {
+      Src = Backup;
+      return std::make_pair(0, 0);
+    }
+
+    // Recursion is naturally limited by the size of the operand vector.
+    auto Op = BitOp3_Op(LHS, Src);
+    if (Op.first) {
+      NumOpcodes += Op.first;
+      LHSBits = Op.second;
+    }
+
+    Op = BitOp3_Op(RHS, Src);
+    if (Op.first) {
+      NumOpcodes += Op.first;
+      RHSBits = Op.second;
+    }
+    break;
+  }
+  default:
+    return std::make_pair(0, 0);
+  }
+
+  uint8_t TTbl;
+  switch (In.getOpcode()) {
+  case ISD::AND:
+    TTbl = LHSBits & RHSBits;
+    break;
+  case ISD::OR:
+    TTbl = LHSBits | RHSBits;
+    break;
+  case ISD::XOR:
+    TTbl = LHSBits ^ RHSBits;
+    break;
+  default:
+    break;
+  }
+
+  return std::make_pair(NumOpcodes + 1, TTbl);
+}
+
+bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
+                                      SDValue &Src2, SDValue &Tbl) const {
+  SmallVector<SDValue, 3> Src;
+  uint8_t TTbl;
+  unsigned NumOpcodes;
+
+  std::tie(NumOpcodes, TTbl) = BitOp3_Op(In, Src);
+
+  // The Src.empty() case can happen if all operands are constant zeros or
+  // ones. Normally such an expression should have been optimized out before
+  // reaching this point.
+  if (NumOpcodes < 2 || Src.empty())
+    return false;
+
+  // For a uniform case the threshold should be higher to account for moves
+  // between VGPRs and SGPRs. One operand needs to be in a VGPR, the other two
+  // can stay in SGPRs, and a readfirstlane is needed afterwards.
+  if (NumOpcodes < 4 && !In->isDivergent())
+    return false;
+
+  if (NumOpcodes == 2 && In.getValueType() == MVT::i32) {
+    // Avoid using BITOP3 for OR3, XOR3 and AND_OR. It is not faster, but it
+    // keeps the asm more readable. This cannot be modeled with AddedComplexity
+    // because the selector does not know how many operations we matched.
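+    // For example, OR3 would otherwise be selected as V_BITOP3_B32 with the
+    // table 0xfe.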
+    if ((In.getOpcode() == ISD::XOR || In.getOpcode() == ISD::OR) &&
+        (In.getOperand(0).getOpcode() == In.getOpcode() ||
+         In.getOperand(1).getOpcode() == In.getOpcode()))
+      return false;
+
+    if (In.getOpcode() == ISD::OR &&
+        (In.getOperand(0).getOpcode() == ISD::AND ||
+         In.getOperand(1).getOpcode() == ISD::AND))
+      return false;
+  }
+
+  // The last operand can be ignored, turning a ternary operation into a
+  // binary one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can
+  // replace 'c' with 'a' here without changing the answer. In some
+  // pathological cases a single-operand operation is also possible if the
+  // optimizer does not catch it.
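+  // The truth table never depends on the bits of an unused slot, so any
+  // register can fill it; reusing Src[0] avoids introducing a use of yet
+  // another value.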
+  while (Src.size() < 3)
+    Src.push_back(Src[0]);
+
+  Src0 = Src[0];
+  Src1 = Src[1];
+  Src2 = Src[2];
+
+  Tbl = CurDAG->getTargetConstant(TTbl, SDLoc(In), MVT::i32);
+  return true;
+}
+
 SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
   if (In.isUndef())
     return CurDAG->getUNDEF(MVT::i32);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 5ae0b179d7d0e6..7e61eb470622f1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -242,6 +242,9 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
                                 SDValue &SrcMods) const;
   bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
 
+  bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
+                    SDValue &Tbl) const;
+
   SDValue getHi16Elt(SDValue In) const;
 
   SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 7ce7562cdcaa95..71d23f9fe30c49 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3643,6 +3643,206 @@ bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
   return true;
 }
 
+// Match a BITOP3 operation and return the number of matched instructions plus
+// the truth table.
+static std::pair<unsigned, uint8_t> BitOp3_Op(Register R,
+                                              SmallVectorImpl<Register> &Src,
+                                              const MachineRegisterInfo &MRI) {
+  unsigned NumOpcodes = 0;
+  uint8_t LHSBits, RHSBits;
+
+  auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool {
+    // Define truth table given Src0, Src1, Src2 bits permutations:
+    //                          0     0     0
+    //                          0     0     1
+    //                          0     1     0
+    //                          0     1     1
+    //                          1     0     0
+    //                          1     0     1
+    //                          1     1     0
+    //                          1     1     1
+    const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
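+    // For example, ~Src0 & Src1 has the table ~0xf0 & 0xcc = 0x0c.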
+
+    if (mi_match(Op, MRI, m_AllOnesInt())) {
+      Bits = 0xff;
+      return true;
+    }
+    if (mi_match(Op, MRI, m_ZeroInt())) {
+      Bits = 0;
+      return true;
+    }
+
+    for (unsigned I = 0; I < Src.size(); ++I) {
+      // Try to find existing reused operand
+      if (Src[I] == Op) {
+        Bits = SrcBits[I];
+        return true;
+      }
+      // Try to replace parent operator
+      if (Src[I] == R) {
+        Bits = SrcBits[I];
+        Src[I] = Op;
+        return true;
+      }
+    }
+
+    if (Src.size() == 3) {
+      // No room left for operands. Try one last time: there can be a 'not' of
+      // one of our source operands, in which case we can compute the bits
+      // without growing the Src vector.
+      Register LHS;
+      if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) {
+        LHS = getSrcRegIgnoringCopies(LHS, MRI);
+        for (unsigned I = 0; I < Src.size(); ++I) {
+          if (Src[I] == LHS) {
+            Bits = ~SrcBits[I];
+            return true;
+          }
+        }
+      }
+
+      return false;
+    }
+
+    Bits = SrcBits[Src.size()];
+    Src.push_back(Op);
+    return true;
+  };
+
+  MachineInstr *MI = MRI.getVRegDef(R);
+  switch (MI->getOpcode()) {
+  case TargetOpcode::G_AND:
+  case TargetOpcode::G_OR:
+  case TargetOpcode::G_XOR: {
+    Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI);
+    Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI);
+
+    SmallVector<Register, 3> Backup(Src.begin(), Src.end());
+    if (!getOperandBits(LHS, LHSBits) ||
+        !getOperandBits(RHS, RHSBits)) {
+      Src = Backup;
+      return std::make_pair(0, 0);
+    }
+
+    // Recursion is naturally limited by the size of the operand vector.
+    auto Op = BitOp3_Op(LHS, Src, MRI);
+    if (Op.first) {
+      NumOpcodes += Op.first;
+      LHSBits = Op.second;
+    }
+
+    Op = BitOp3_Op(RHS, Src, MRI);
+    if (Op.first) {
+      NumOpcodes += Op.first;
+      RHSBits = Op.second;
+    }
+    break;
+  }
+  default:
+    return std::make_pair(0, 0);
+  }
+
+  uint8_t TTbl;
+  switch (MI->getOpcode()) {
+  case TargetOpcode::G_AND:
+    TTbl = LHSBits & RHSBits;
+    break;
+  case TargetOpcode::G_OR:
+    TTbl = LHSBits | RHSBits;
+    break;
+  case TargetOpcode::G_XOR:
+    TTbl = LHSBits ^ RHSBits;
+    break;
+  default:
+    break;
+  }
+
+  return std::make_pair(NumOpcodes + 1, TTbl);
+}
+
+bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
+  if (!Subtarget->hasBitOp3Insts())
+    return false;
+
+  SmallVector<Register, 3> Src;
+  uint8_t TTbl;
+  unsigned NumOpcodes;
+  Register DstReg = MI.getOperand(0).getReg();
+
+  std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);
+
+  // The Src.empty() case can happen if all operands are constant zeros or
+  // ones. Normally such an expression should have been optimized out before
+  // reaching this point.
+  if (NumOpcodes < 2 || Src.empty())
+    return false;
+
+  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
+
+  // For a uniform case the threshold should be higher to account for moves
+  // between VGPRs and SGPRs. One operand needs to be in a VGPR, the other two
+  // can stay in SGPRs, and a readfirstlane is needed afterwards.
+  if (NumOpcodes < 4 && !IsVALU)
+    return false;
+
+  bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32);
+  if (NumOpcodes == 2 && IsB32) {
+    // Avoid using BITOP3 for OR3, XOR3 and AND_OR. It is not faster, but it
+    // keeps the asm more readable. This cannot be modeled with AddedComplexity
+    // because the selector does not know how many operations we matched.
+    if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) ||
+        mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) ||
+        mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg())))
+      return false;
+  }
+
+  unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
+  unsigned CBL = STI.getConstantBusLimit(Opc);
+  MachineBasicBlock *MBB = MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+
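+  // Only a limited number of SGPR sources (the constant bus limit) can be
+  // read directly; copy any remaining SGPR sources into VGPRs.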
+  for (unsigned I = 0; I < Src.size(); ++I) {
+    const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
+    if (RB->getID() != AMDGPU::SGPRRegBankID)
+      continue;
+    if (CBL > 0) {
+      --CBL;
+      continue;
+    }
+    Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg)
+        .addReg(Src[I]);
+    Src[I] = NewReg;
+  }
+
+  // The last operand can be ignored, turning a ternary operation into a
+  // binary one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can
+  // replace 'c' with 'a' here without changing the answer. In some
+  // pathological cases a single-operand operation is also possible if the
+  // optimizer does not catch it.
+  while (Src.size() < 3)
+    Src.push_back(Src[0]);
+
+  auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg);
+  if (!IsB32)
+    MIB.addImm(0); // src_mod0
+  MIB.addReg(Src[0]);
+  if (!IsB32)
+    MIB.addImm(0); // src_mod1
+  MIB.addReg(Src[1]);
+  if (!IsB32)
+    MIB.addImm(0); // src_mod2
+  MIB.addReg(Src[2])
+     .addImm(TTbl);
+  if (!IsB32)
+    MIB.addImm(0); // op_sel
+
+  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+  MI.eraseFromParent();
+
+  return true;
+}
+
 bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
   Register SrcReg = MI.getOperand(0).getReg();
   if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
@@ -3682,6 +3882,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
   case TargetOpcode::G_AND:
   case TargetOpcode::G_OR:
   case TargetOpcode::G_XOR:
+    if (selectBITOP3(I))
+      return true;
     if (selectImpl(I, *CoverageInfo))
       return true;
     return selectG_AND_OR_XOR(I);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index a81f1579fb9f33..d294300be40497 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -147,6 +147,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool selectSMFMACIntrin(MachineInstr &I) const;
   bool selectPermlaneSwapIntrin(MachineInstr &I, Intrinsic::ID IntrID) const;
   bool selectWaveAddress(MachineInstr &I) const;
+  bool selectBITOP3(MachineInstr &I) const;
   bool selectStackRestore(MachineInstr &MI) const;
   bool selectNamedBarrierInit(MachineInstr &I, Intrinsic::ID IID) const;
   bool selectNamedBarrierInst(MachineInstr &I, Intrinsic::ID IID) const;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 34850e42a3d605..c8c36714909adf 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -6,6 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
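+// Matched by AMDGPUDAGToDAGISel::SelectBITOP3; produces the three source
+// operands and the 8-bit truth table immediate.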
+def BITOP3_32 : ComplexPattern<i32, 4, "SelectBITOP3", [and, or, xor]>;
+def BITOP3_16 : ComplexPattern<i16, 4, "SelectBITOP3", [and, or, xor]>;
+
 // Special case for v_div_fmas_{f32|f64}, since it seems to be the
 // only VOP instruction that implicitly reads VCC.
 let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod" in {
@@ -1275,6 +1278,16 @@ let SubtargetPredicate = HasBitOp3Insts  in {
     (i16 (int_amdgcn_bitop3 i16:$src0, i16:$src1, i16:$src2, i8:$bitop3)),
     (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
   >;
+
+  def : GCNPat<
+    (i32 (BITOP3_32 i32:$src0, i32:$src1, i32:$src2, i8:$bitop3)),
+    (i32 (V_BITOP3_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2, timm:$bitop3))
+  >;
+
+  def : GCNPat<
+    (i16 (BITOP3_16 i16:$src0, i16:$src1, i16:$src2, i8:$bitop3)),
+    (i16 (V_BITOP3_B16_e64 0, VSrc_b16:$src0, 0, VSrc_b16:$src1, 0, VSrc_b16:$src2, timm:$bitop3, 0))
+  >;
 } // End SubtargetPredicate = HasBitOp3Insts
 
 class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat<
diff --git a/llvm/test/CodeGen/AMDGPU/bitop3.ll b/llvm/test/CodeGen/AMDGPU/bitop3.ll
new file mode 100644
index 00000000000000..dd608ef0e5a53d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bitop3.ll
@@ -0,0 +1,368 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950,GFX950-SDAG %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX950,GFX950-GISEL %s
+
+; ========= Single bit functions =========
+
+define amdgpu_ps float @not_and_not_and_not_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: not_and_not_and_not_and:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:1
+; GCN-NEXT:    ; return to shader part epilog
+  %nota = xor i32 %a, -1
+  %notb = xor i32 %b, -1
+  %notc = xor i32 %c, -1
+  %and1 = and i32 %nota, %notc
+  %and2 = and i32 %and1, %notb
+  %ret_cast = bitcast i32 %and2 to float
+  ret float %ret_cast
+}
+
+define amdgpu_ps float @not_and_not_and_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: not_and_not_and_and:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:2
+; GCN-NEXT:    ; return to shader part epilog
+  %nota = xor i32 %a, -1
+  %notb = xor i32 %b, -1
+  %and1 = and i32 %nota, %c
+  %and2 = and i32 %and1, %notb
+  %ret_cast = bitcast i32 %and2 to float
+  ret float %ret_cast
+}
+
+define amdgpu_ps float @not_and_and_not_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: not_and_and_not_and:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:4
+; GCN-NEXT:    ; return to shader part epilog
+  %nota = xor i32 %a, -1
+  %notc = xor i32 %c, -1
+  %and1 = and i32 %nota, %notc
+  %and2 = and i32 %and1, %b
+  %ret_cast = bitcast i32 %and2 to float
+  ret float %ret_cast
+}
+
+define amdgpu_ps float @not_and_and_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: not_and_and_and:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:8
+; GCN-NEXT:    ; return to shader part epilog
+  %nota = xor i32 %a, -1
+  %and1 = and i32 %nota, %c
+  %and2 = and i32 %and1, %b
+  %ret_cast = bitcast i32 %and2 to float
+  ret float %ret_cast
+}
+
+define amdgpu_ps float @and_not_and_not_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: and_not_and_not_and:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x10
+; GCN-NEXT:    ; return to shader part epilog
+  %notb = xor i32 %b, -1
+  %notc = xor i32 %c, -1
+  %and1 = and i32 %a, %notc
+  %and2 = and i32 %and1, %notb
+  %ret_cast = bitcast i32 %and2 to float
+  ret float %ret_cast
+}
+
+define amdgpu_ps float @and_not_and_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: and_not_and_and:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x20
+; GCN-NEXT:    ; return to shader part epilog
+  %notb = xor i32 %b, -1
+  %and1 = and i32 %a, %c
+  %and2 = and i32 %and1, %notb
+  %ret_cast = bitcast i32 %and2 to float
+  ret float %ret_cast
+}
+
+define amdgpu_ps float @and_and_not_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: and_and_not_and:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x40
+; GCN-NEXT:    ; return to shader part epilog
+  %notc = xor i32 %c, -1
+  %and1 = and i32 %a, %notc
+  %and2 = and i32 %and1, %b
+  %ret_cast = bitcast i32 %and2 to float
+  ret float %ret_cast
+}
+
+define amdgpu_ps float @and_and_and(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: and_and_and:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80
+; GCN-NEXT:    ; return to shader part epilog
+  %and1 = and i32 %a, %c
+  %and2 = and i32 %and1, %b
+  %ret_cast = bitcast i32 %and2 to float
+  ret float %ret_cast
+}
+
+; ========= Multi bit functions =========
+
+define amdgpu_ps float @test_12(i32 %a, i32 %b) {
+; GCN-LABEL: test_12:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
+; GCN-NEXT:    ; return to shader part epilog
+  %nota = xor i32 %a, -1
+  %and1 = and i32 %nota, %b
+  %ret_cast = bitcast i32 %and1 to float
+  ret float %ret_cast
+}
+
+define amdgpu_ps float @test_63(i32 %a, i32 %b) {
+; GCN-LABEL: test_63:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v0 bitop3:0x3f
+; GCN-NEXT:    ; return to shader part epilog
+  %nota = xor i32 %a, -1
+  %notb = xor i32 %b, -1
+  %or = or i32 %nota, %notb
+  %ret_cast = bitcast i32 %or to float
+  ret float %ret_cast
+}
+
+define amdgpu_ps float @test_59(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_59:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x3b
+; GCN-NEXT:    ; return to shader part epilog
+  %nota = xor i32 %a, -1
+  %notb = xor i32 %b, -1
+  %and1 = and i32 %nota, %c
+  %or = or i32 %and1, %notb
+  %ret_cast = bitcast i32 %or to float
+  ret float %ret_cast
+}
+
+define amdgpu_ps float @test_126(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_126:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bitop3_b32 v0, v0, v2, v1 bitop3:0x7e
+; GCN-NEXT:    ; return to shader part epilog
+  %xor1 = xor i32 %a, %b
+  %xor2 = xor i32 %a, %c
+  %or = or i32 %xor1, %xor2
+  %ret_cast = bitcast i32 %or to float
+  ret float %ret_cast
+}
+
+; The Src vector is exhausted during the search but recovered using the 'not'
+; lookahead. GlobalISel sees slightly different input, so this does not happen
+; there.
+
+; FIXME: Improve global isel code.
+
+define amdgpu_ps float @test_12_src_overflow(i32 %a, i32 %b, i32 %c) {
+; GFX950-SDAG-LABEL: test_12_src_overflow:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
+; GFX950-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: test_12_src_overflow:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_bitop3_b32 v3, v0, v2, v0 bitop3:0xc
+; GFX950-GISEL-NEXT:    v_bitop3_b32 v0, v0, v2, v0 bitop3:3
+; GFX950-GISEL-NEXT:    v_bitop3_b32 v0, v3, v1, v0 bitop3:0xc8
+; GFX950-GISEL-NEXT:    ; return to shader part epilog
+  %nota = xor i32 %a, -1
+  %notc = xor i32 %c, -1
+  %and1 = and i32 %nota, %c
+  %and2 = and i32 %and1, %b
+  %and3 = and i32 %nota, %notc
+  %and4 = and i32 %and3, %b
+  %or = or i32 %and2, %and4
+  %ret_cast = bitcast i32 %or to float
+  ret float %ret_cast
+}
+
+; This could be a single BITOP3 operation with tbl = 100 (0x64), but the Src
+; vector is exhausted during the search.
+
+define amdgpu_ps float @test_100_src_overflow(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_100_src_overflow:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bitop3_b32 v3, v1, v2, v0 bitop3:0x10
+; GCN-NEXT:    v_bitop3_b32 v4, v0, v2, v1 bitop3:0x40
+; GCN-NEXT:    v_bitop3_b32 v0, v1, v2, v0 bitop3:0x20
+; GCN-NEXT:    v_or3_b32 v0, v3, v4, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %or1 = or i32 %c, %a
+  %not1 = xor i32 %or1, -1
+  %and1 = and i32 %b, %not1
+  %not2 = xor i32 %b, -1
+  %and2 = and i32 %a, %not2
+  %and3 = and i32 %and2, %c
+  %and4 = and i32 %b, %a
+  %not3 = xor i32 %c, -1
+  %and5 = and i32 %and4, %not3
+  %or2 = or i32 %and1, %and3
+  %or3 = or i32 %or2, %and5
+  %ret_cast = bitcast i32 %or3 to float
+  ret float %ret_cast
+}
+
+; ========= Ternary logical operations take precedence =========
+
+define amdgpu_ps float @test_xor3(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_xor3:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GCN-NEXT:    ; return to shader part epilog
+  %xor1 = xor i32 %a, %b
+  %xor2 = xor i32 %xor1, %c
+  %ret_cast = bitcast i32 %xor2 to float
+  ret float %ret_cast
+}
+
+define amdgpu_ps float @test_or3(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_or3:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GCN-NEXT:    ; return to shader part epilog
+  %or1 = or i32 %a, %b
+  %or2 = or i32 %or1, %c
+  %ret_cast = bitcast i32 %or2 to float
+  ret float %ret_cast
+}
+
+define amdgpu_ps float @test_and_or(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: test_and_or:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_and_or_b32 v0, v0, v1, v2
+; GCN-NEXT:    ; return to shader part epilog
+  %and1 = and i32 %a, %b
+  %or1 = or i32 %and1, %c
+  %ret_cast = bitcast i32 %or1 to float
+  ret float %ret_cast
+}
+
+; ========= Uniform cases =========
+
+define amdgpu_ps float @uniform_3_op(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
+; GCN-LABEL: uniform_3_op:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_andn2_b32 s0, s2, s0
+; GCN-NEXT:    s_and_b32 s0, s0, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+  %nota = xor i32 %a, -1
+  %and1 = and i32 %nota, %c
+  %and2 = and i32 %and1, %b
+  %ret_cast = bitcast i32 %and2 to float
+  ret float %ret_cast
+}
+
+define amdgpu_ps float @uniform_4_op(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
+; GCN-LABEL: uniform_4_op:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s2
+; GCN-NEXT:    v_bitop3_b32 v0, s0, v0, v1 bitop3:2
+; GCN-NEXT:    ; return to shader part epilog
+  %nota = xor i32 %a, -1
+  %notb = xor i32 %b, -1
+  %and1 = and i32 %nota, %c
+  %and2 = and i32 %and1, %notb
+  %ret_cast = bitcast i32 %and2 to float
+  ret float %ret_cast
+}
+
+; ========= 16 bit tests =========
+
+define amdgpu_ps half @not_and_not_and_not_and_b16(i16 %a, i16 %b, i16 %c) {
+; GCN-LABEL: not_and_not_and_not_and_b16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bitop3_b16 v0, v0, v1, v2 bitop3:1
+; GCN-NEXT:    ; return to shader part epilog
+  %nota = xor i16 %a, -1
+  %notb = xor i16 %b, -1
+  %notc = xor i16 %c, -1
+  %and1 = and i16 %nota, %notc
+  %and2 = and i16 %and1, %notb
+  %ret_cast = bitcast i16 %and2 to half
+  ret half %ret_cast
+}
+
+define amdgpu_ps half @not_and_not_and_and_b16(i16 %a, i16 %b, i16 %c) {
+; GCN-LABEL: not_and_not_and_and_b16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bitop3_b16 v0, v0, v1, v2 bitop3:2
+; GCN-NEXT:    ; return to shader part epilog
+  %nota = xor i16 %a, -1
+  %notb = xor i16 %b, -1
+  %and1 = and i16 %nota, %c
+  %and2 = and i16 %and1, %notb
+  %ret_cast = bitcast i16 %and2 to half
+  ret half %ret_cast
+}
+
+define amdgpu_ps half @not_and_and_not_and_b16(i16 %a, i16 %b, i16 %c) {
+; GCN-LABEL: not_and_and_not_and_b16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bitop3_b16 v0, v0, v1, v2 bitop3:4
+; GCN-NEXT:    ; return to shader part epilog
+  %nota = xor i16 %a, -1
+  %notc = xor i16 %c, -1
+  %and1 = and i16 %nota, %notc
+  %and2 = and i16 %and1, %b
+  %ret_cast = bitcast i16 %and2 to half
+  ret half %ret_cast
+}
+
+define amdgpu_ps half @test_xor3_b16(i16 %a, i16 %b, i16 %c) {
+; GFX950-SDAG-LABEL: test_xor3_b16:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_bitop3_b16 v0, v0, v2, v1 bitop3:0x96
+; GFX950-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: test_xor3_b16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX950-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX950-GISEL-NEXT:    ; return to shader part epilog
+  %xor1 = xor i16 %a, %b
+  %xor2 = xor i16 %xor1, %c
+  %ret_cast = bitcast i16 %xor2 to half
+  ret half %ret_cast
+}
+
+define amdgpu_ps half @test_or3_b16(i16 %a, i16 %b, i16 %c) {
+; GFX950-SDAG-LABEL: test_or3_b16:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_bitop3_b16 v0, v0, v2, v1 bitop3:0xfe
+; GFX950-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: test_or3_b16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX950-GISEL-NEXT:    ; return to shader part epilog
+  %or1 = or i16 %a, %b
+  %or2 = or i16 %or1, %c
+  %ret_cast = bitcast i16 %or2 to half
+  ret half %ret_cast
+}
+
+define amdgpu_ps half @test_and_or_b16(i16 %a, i16 %b, i16 %c) {
+; GFX950-SDAG-LABEL: test_and_or_b16:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_bitop3_b16 v0, v0, v2, v1 bitop3:0xec
+; GFX950-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: test_and_or_b16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_and_or_b32 v0, v0, v1, v2
+; GFX950-GISEL-NEXT:    ; return to shader part epilog
+  %and1 = and i16 %a, %b
+  %or1 = or i16 %and1, %c
+  %ret_cast = bitcast i16 %or1 to half
+  ret half %ret_cast
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX950: {{.*}}


