[PATCH] R600: Remove AMDILPeepholeOptimizer and replace optimizations with tablegen patterns
Christian König
deathsimple at vodafone.de
Tue May 7 01:48:53 PDT 2013
Am 06.05.2013 18:48, schrieb Tom Stellard:
> From: Tom Stellard <thomas.stellard at amd.com>
>
> The BFE optimization was the only one we were actually using, and it was
> emitting an intrinsic that we don't support.
>
> https://bugs.freedesktop.org/show_bug.cgi?id=64201
The patch has my rb.
I'm wondering if we shouldn't get rid of all those AMDIL* files, since
we obviously don't use that namespace anymore, and at least I sometimes
wonder why we still have AMDILISelLowering.cpp and AMDGPUISelLowering.cpp.
Christian.
> ---
> lib/Target/R600/AMDGPUInstructions.td | 11 +
> lib/Target/R600/AMDGPUTargetMachine.cpp | 1 -
> lib/Target/R600/AMDILPeepholeOptimizer.cpp | 1215 ----------------------------
> lib/Target/R600/CMakeLists.txt | 1 -
> lib/Target/R600/R600Instructions.td | 1 +
> test/CodeGen/R600/bfe_uint.ll | 26 +
> 6 files changed, 38 insertions(+), 1217 deletions(-)
> delete mode 100644 lib/Target/R600/AMDILPeepholeOptimizer.cpp
> create mode 100644 test/CodeGen/R600/bfe_uint.ll
>
> diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
> index b44d248..d2620b2 100644
> --- a/lib/Target/R600/AMDGPUInstructions.td
> +++ b/lib/Target/R600/AMDGPUInstructions.td
> @@ -284,6 +284,17 @@ class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : Pat <
> (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y)
> >;
>
> +// Bitfield extract patterns
> +
> +def legalshift32 : ImmLeaf <i32, [{return Imm >=0 && Imm < 32;}]>;
> +def bfemask : PatLeaf <(imm), [{return isMask_32(N->getZExtValue());}],
> + SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(CountTrailingOnes_32(N->getZExtValue()), MVT::i32);}]>>;
> +
> +class BFEPattern <Instruction BFE> : Pat <
> + (and (srl i32:$x, legalshift32:$y), bfemask:$z),
> + (BFE $x, $y, $z)
> +>;
> +
> include "R600Instructions.td"
>
> include "SIInstrInfo.td"
> diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
> index 0ec67ce..31fbf32 100644
> --- a/lib/Target/R600/AMDGPUTargetMachine.cpp
> +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
> @@ -115,7 +115,6 @@ AMDGPUPassConfig::addPreISel() {
> }
>
> bool AMDGPUPassConfig::addInstSelector() {
> - addPass(createAMDGPUPeepholeOpt(*TM));
> addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
>
> const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
> diff --git a/lib/Target/R600/AMDILPeepholeOptimizer.cpp b/lib/Target/R600/AMDILPeepholeOptimizer.cpp
> deleted file mode 100644
> index 3a28038..0000000
> --- a/lib/Target/R600/AMDILPeepholeOptimizer.cpp
> +++ /dev/null
> @@ -1,1215 +0,0 @@
> -//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===//
> -//
> -// The LLVM Compiler Infrastructure
> -//
> -// This file is distributed under the University of Illinois Open Source
> -// License. See LICENSE.TXT for details.
> -//
> -/// \file
> -//==-----------------------------------------------------------------------===//
> -
> -#define DEBUG_TYPE "PeepholeOpt"
> -#ifdef DEBUG
> -#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
> -#else
> -#define DEBUGME 0
> -#endif
> -
> -#include "AMDILDevices.h"
> -#include "AMDGPUInstrInfo.h"
> -#include "llvm/ADT/Statistic.h"
> -#include "llvm/ADT/StringExtras.h"
> -#include "llvm/ADT/StringRef.h"
> -#include "llvm/ADT/Twine.h"
> -#include "llvm/IR/Constants.h"
> -#include "llvm/CodeGen/MachineFunction.h"
> -#include "llvm/CodeGen/MachineFunctionAnalysis.h"
> -#include "llvm/IR/Function.h"
> -#include "llvm/IR/Instructions.h"
> -#include "llvm/IR/Module.h"
> -#include "llvm/Support/Debug.h"
> -#include "llvm/Support/MathExtras.h"
> -
> -#include <sstream>
> -
> -#if 0
> -STATISTIC(PointerAssignments, "Number of dynamic pointer "
> - "assigments discovered");
> -STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
> -#endif
> -
> -using namespace llvm;
> -// The Peephole optimization pass is used to do simple last minute optimizations
> -// that are required for correct code or to remove redundant functions
> -namespace {
> -
> -class OpaqueType;
> -
> -class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
> -public:
> - TargetMachine &TM;
> - static char ID;
> - AMDGPUPeepholeOpt(TargetMachine &tm);
> - ~AMDGPUPeepholeOpt();
> - const char *getPassName() const;
> - bool runOnFunction(Function &F);
> - bool doInitialization(Module &M);
> - bool doFinalization(Module &M);
> - void getAnalysisUsage(AnalysisUsage &AU) const;
> -protected:
> -private:
> - // Function to initiate all of the instruction level optimizations.
> - bool instLevelOptimizations(BasicBlock::iterator *inst);
> - // Quick check to see if we need to dump all of the pointers into the
> - // arena. If this is correct, then we set all pointers to exist in arena. This
> - // is a workaround for aliasing of pointers in a struct/union.
> - bool dumpAllIntoArena(Function &F);
> - // Because I don't want to invalidate any pointers while in the
> - // safeNestedForEachFunction. I push atomic conversions to a vector and handle
> - // it later. This function does the conversions if required.
> - void doAtomicConversionIfNeeded(Function &F);
> - // Because __amdil_is_constant cannot be properly evaluated if
> - // optimizations are disabled, the call's are placed in a vector
> - // and evaluated after the __amdil_image* functions are evaluated
> - // which should allow the __amdil_is_constant function to be
> - // evaluated correctly.
> - void doIsConstCallConversionIfNeeded();
> - bool mChanged;
> - bool mDebug;
> - bool mConvertAtomics;
> - CodeGenOpt::Level optLevel;
> - // Run a series of tests to see if we can optimize a CALL instruction.
> - bool optimizeCallInst(BasicBlock::iterator *bbb);
> - // A peephole optimization to optimize bit extract sequences.
> - bool optimizeBitExtract(Instruction *inst);
> - // A peephole optimization to optimize bit insert sequences.
> - bool optimizeBitInsert(Instruction *inst);
> - bool setupBitInsert(Instruction *base,
> - Instruction *&src,
> - Constant *&mask,
> - Constant *&shift);
> - // Expand the bit field insert instruction on versions of OpenCL that
> - // don't support it.
> - bool expandBFI(CallInst *CI);
> - // Expand the bit field mask instruction on version of OpenCL that
> - // don't support it.
> - bool expandBFM(CallInst *CI);
> - // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in
> - // this case we need to expand them. These functions check for 24bit functions
> - // and then expand.
> - bool isSigned24BitOps(CallInst *CI);
> - void expandSigned24BitOps(CallInst *CI);
> - // One optimization that can occur is that if the required workgroup size is
> - // specified then the result of get_local_size is known at compile time and
> - // can be returned accordingly.
> - bool isRWGLocalOpt(CallInst *CI);
> - // On northern island cards, the division is slightly less accurate than on
> - // previous generations, so we need to utilize a more accurate division. So we
> - // can translate the accurate divide to a normal divide on all other cards.
> - bool convertAccurateDivide(CallInst *CI);
> - void expandAccurateDivide(CallInst *CI);
> - // If the alignment is set incorrectly, it can produce really inefficient
> - // code. This checks for this scenario and fixes it if possible.
> - bool correctMisalignedMemOp(Instruction *inst);
> -
> - // If we are in no opt mode, then we need to make sure that
> - // local samplers are properly propagated as constant propagation
> - // doesn't occur and we need to know the value of kernel defined
> - // samplers at compile time.
> - bool propagateSamplerInst(CallInst *CI);
> -
> - // Helper functions
> -
> - // Group of functions that recursively calculate the size of a structure based
> - // on it's sub-types.
> - size_t getTypeSize(Type * const T, bool dereferencePtr = false);
> - size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
> - size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
> - size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false);
> - size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
> - size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
> - size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
> - size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);
> -
> - LLVMContext *mCTX;
> - Function *mF;
> - const AMDGPUSubtarget *mSTM;
> - SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
> - SmallVector<CallInst *, 16> isConstVec;
> -}; // class AMDGPUPeepholeOpt
> - char AMDGPUPeepholeOpt::ID = 0;
> -
> -// A template function that has two levels of looping before calling the
> -// function with a pointer to the current iterator.
> -template<class InputIterator, class SecondIterator, class Function>
> -Function safeNestedForEach(InputIterator First, InputIterator Last,
> - SecondIterator S, Function F) {
> - for ( ; First != Last; ++First) {
> - SecondIterator sf, sl;
> - for (sf = First->begin(), sl = First->end();
> - sf != sl; ) {
> - if (!F(&sf)) {
> - ++sf;
> - }
> - }
> - }
> - return F;
> -}
> -
> -} // anonymous namespace
> -
> -namespace llvm {
> - FunctionPass *
> - createAMDGPUPeepholeOpt(TargetMachine &tm) {
> - return new AMDGPUPeepholeOpt(tm);
> - }
> -} // llvm namespace
> -
> -AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
> - : FunctionPass(ID), TM(tm) {
> - mDebug = DEBUGME;
> - optLevel = TM.getOptLevel();
> -
> -}
> -
> -AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() {
> -}
> -
> -const char *
> -AMDGPUPeepholeOpt::getPassName() const {
> - return "AMDGPU PeepHole Optimization Pass";
> -}
> -
> -bool
> -containsPointerType(Type *Ty) {
> - if (!Ty) {
> - return false;
> - }
> - switch(Ty->getTypeID()) {
> - default:
> - return false;
> - case Type::StructTyID: {
> - const StructType *ST = dyn_cast<StructType>(Ty);
> - for (StructType::element_iterator stb = ST->element_begin(),
> - ste = ST->element_end(); stb != ste; ++stb) {
> - if (!containsPointerType(*stb)) {
> - continue;
> - }
> - return true;
> - }
> - break;
> - }
> - case Type::VectorTyID:
> - case Type::ArrayTyID:
> - return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
> - case Type::PointerTyID:
> - return true;
> - };
> - return false;
> -}
> -
> -bool
> -AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) {
> - bool dumpAll = false;
> - for (Function::const_arg_iterator cab = F.arg_begin(),
> - cae = F.arg_end(); cab != cae; ++cab) {
> - const Argument *arg = cab;
> - const PointerType *PT = dyn_cast<PointerType>(arg->getType());
> - if (!PT) {
> - continue;
> - }
> - Type *DereferencedType = PT->getElementType();
> - if (!dyn_cast<StructType>(DereferencedType)
> - ) {
> - continue;
> - }
> - if (!containsPointerType(DereferencedType)) {
> - continue;
> - }
> - // FIXME: Because a pointer inside of a struct/union may be aliased to
> - // another pointer we need to take the conservative approach and place all
> - // pointers into the arena until more advanced detection is implemented.
> - dumpAll = true;
> - }
> - return dumpAll;
> -}
> -void
> -AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() {
> - if (isConstVec.empty()) {
> - return;
> - }
> - for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
> - CallInst *CI = isConstVec[x];
> - Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
> - Type *aType = Type::getInt32Ty(*mCTX);
> - Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
> - : ConstantInt::get(aType, 0);
> - CI->replaceAllUsesWith(Val);
> - CI->eraseFromParent();
> - }
> - isConstVec.clear();
> -}
> -void
> -AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) {
> - // Don't do anything if we don't have any atomic operations.
> - if (atomicFuncs.empty()) {
> - return;
> - }
> - // Change the function name for the atomic if it is required
> - uint32_t size = atomicFuncs.size();
> - for (uint32_t x = 0; x < size; ++x) {
> - atomicFuncs[x].first->setOperand(
> - atomicFuncs[x].first->getNumOperands()-1,
> - atomicFuncs[x].second);
> -
> - }
> - mChanged = true;
> - if (mConvertAtomics) {
> - return;
> - }
> -}
> -
> -bool
> -AMDGPUPeepholeOpt::runOnFunction(Function &MF) {
> - mChanged = false;
> - mF = &MF;
> - mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
> - if (mDebug) {
> - MF.dump();
> - }
> - mCTX = &MF.getType()->getContext();
> - mConvertAtomics = true;
> - safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
> - std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
> - this));
> -
> - doAtomicConversionIfNeeded(MF);
> - doIsConstCallConversionIfNeeded();
> -
> - if (mDebug) {
> - MF.dump();
> - }
> - return mChanged;
> -}
> -
> -bool
> -AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) {
> - Instruction *inst = (*bbb);
> - CallInst *CI = dyn_cast<CallInst>(inst);
> - if (!CI) {
> - return false;
> - }
> - if (isSigned24BitOps(CI)) {
> - expandSigned24BitOps(CI);
> - ++(*bbb);
> - CI->eraseFromParent();
> - return true;
> - }
> - if (propagateSamplerInst(CI)) {
> - return false;
> - }
> - if (expandBFI(CI) || expandBFM(CI)) {
> - ++(*bbb);
> - CI->eraseFromParent();
> - return true;
> - }
> - if (convertAccurateDivide(CI)) {
> - expandAccurateDivide(CI);
> - ++(*bbb);
> - CI->eraseFromParent();
> - return true;
> - }
> -
> - StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
> - if (calleeName.startswith("__amdil_is_constant")) {
> - // If we do not have optimizations, then this
> - // cannot be properly evaluated, so we add the
> - // call instruction to a vector and process
> - // them at the end of processing after the
> - // samplers have been correctly handled.
> - if (optLevel == CodeGenOpt::None) {
> - isConstVec.push_back(CI);
> - return false;
> - } else {
> - Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
> - Type *aType = Type::getInt32Ty(*mCTX);
> - Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
> - : ConstantInt::get(aType, 0);
> - CI->replaceAllUsesWith(Val);
> - ++(*bbb);
> - CI->eraseFromParent();
> - return true;
> - }
> - }
> -
> - if (calleeName.equals("__amdil_is_asic_id_i32")) {
> - ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
> - Type *aType = Type::getInt32Ty(*mCTX);
> - Value *Val = CV;
> - if (Val) {
> - Val = ConstantInt::get(aType,
> - mSTM->device()->getDeviceFlag() & CV->getZExtValue());
> - } else {
> - Val = ConstantInt::get(aType, 0);
> - }
> - CI->replaceAllUsesWith(Val);
> - ++(*bbb);
> - CI->eraseFromParent();
> - return true;
> - }
> - Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
> - if (!F) {
> - return false;
> - }
> - if (F->getName().startswith("__atom") && !CI->getNumUses()
> - && F->getName().find("_xchg") == StringRef::npos) {
> - std::string buffer(F->getName().str() + "_noret");
> - F = dyn_cast<Function>(
> - F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
> - atomicFuncs.push_back(std::make_pair(CI, F));
> - }
> -
> - if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
> - && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
> - return false;
> - }
> - if (!mConvertAtomics) {
> - return false;
> - }
> - StringRef name = F->getName();
> - if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
> - mConvertAtomics = false;
> - }
> - return false;
> -}
> -
> -bool
> -AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
> - Instruction *&src,
> - Constant *&mask,
> - Constant *&shift) {
> - if (!base) {
> - if (mDebug) {
> - dbgs() << "Null pointer passed into function.\n";
> - }
> - return false;
> - }
> - bool andOp = false;
> - if (base->getOpcode() == Instruction::Shl) {
> - shift = dyn_cast<Constant>(base->getOperand(1));
> - } else if (base->getOpcode() == Instruction::And) {
> - mask = dyn_cast<Constant>(base->getOperand(1));
> - andOp = true;
> - } else {
> - if (mDebug) {
> - dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
> - }
> - // If the base is neither a Shl or a And, we don't fit any of the patterns above.
> - return false;
> - }
> - src = dyn_cast<Instruction>(base->getOperand(0));
> - if (!src) {
> - if (mDebug) {
> - dbgs() << "Failed setup since the base operand is not an instruction!\n";
> - }
> - return false;
> - }
> - // If we find an 'and' operation, then we don't need to
> - // find the next operation as we already know the
> - // bits that are valid at this point.
> - if (andOp) {
> - return true;
> - }
> - if (src->getOpcode() == Instruction::Shl && !shift) {
> - shift = dyn_cast<Constant>(src->getOperand(1));
> - src = dyn_cast<Instruction>(src->getOperand(0));
> - } else if (src->getOpcode() == Instruction::And && !mask) {
> - mask = dyn_cast<Constant>(src->getOperand(1));
> - }
> - if (!mask && !shift) {
> - if (mDebug) {
> - dbgs() << "Failed setup since both mask and shift are NULL!\n";
> - }
> - // Did not find a constant mask or a shift.
> - return false;
> - }
> - return true;
> -}
> -bool
> -AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) {
> - if (!inst) {
> - return false;
> - }
> - if (!inst->isBinaryOp()) {
> - return false;
> - }
> - if (inst->getOpcode() != Instruction::Or) {
> - return false;
> - }
> - if (optLevel == CodeGenOpt::None) {
> - return false;
> - }
> - // We want to do an optimization on a sequence of ops that in the end equals a
> - // single ISA instruction.
> - // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
> - // Some simplified versions of this pattern are as follows:
> - // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
> - // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
> - // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
> - // (A & B) | (D << F) when (1 << F) >= B
> - // (A << C) | (D & E) when (1 << C) >= E
> - if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
> - // The HD4XXX hardware doesn't support the ubit_insert instruction.
> - return false;
> - }
> - Type *aType = inst->getType();
> - bool isVector = aType->isVectorTy();
> - int numEle = 1;
> - // This optimization only works on 32bit integers.
> - if (aType->getScalarType()
> - != Type::getInt32Ty(inst->getContext())) {
> - return false;
> - }
> - if (isVector) {
> - const VectorType *VT = dyn_cast<VectorType>(aType);
> - numEle = VT->getNumElements();
> - // We currently cannot support more than 4 elements in a intrinsic and we
> - // cannot support Vec3 types.
> - if (numEle > 4 || numEle == 3) {
> - return false;
> - }
> - }
> - // TODO: Handle vectors.
> - if (isVector) {
> - if (mDebug) {
> - dbgs() << "!!! Vectors are not supported yet!\n";
> - }
> - return false;
> - }
> - Instruction *LHSSrc = NULL, *RHSSrc = NULL;
> - Constant *LHSMask = NULL, *RHSMask = NULL;
> - Constant *LHSShift = NULL, *RHSShift = NULL;
> - Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
> - Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
> - if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
> - if (mDebug) {
> - dbgs() << "Found an OR Operation that failed setup!\n";
> - inst->dump();
> - if (LHS) { LHS->dump(); }
> - if (LHSSrc) { LHSSrc->dump(); }
> - if (LHSMask) { LHSMask->dump(); }
> - if (LHSShift) { LHSShift->dump(); }
> - }
> - // There was an issue with the setup for BitInsert.
> - return false;
> - }
> - if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
> - if (mDebug) {
> - dbgs() << "Found an OR Operation that failed setup!\n";
> - inst->dump();
> - if (RHS) { RHS->dump(); }
> - if (RHSSrc) { RHSSrc->dump(); }
> - if (RHSMask) { RHSMask->dump(); }
> - if (RHSShift) { RHSShift->dump(); }
> - }
> - // There was an issue with the setup for BitInsert.
> - return false;
> - }
> - if (mDebug) {
> - dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
> - dbgs() << "Op: "; inst->dump();
> - dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
> - dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
> - dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
> - dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
> - dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
> - dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
> - dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
> - dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
> - }
> - Constant *offset = NULL;
> - Constant *width = NULL;
> - uint32_t lhsMaskVal = 0, rhsMaskVal = 0;
> - uint32_t lhsShiftVal = 0, rhsShiftVal = 0;
> - uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
> - uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
> - lhsMaskVal = (LHSMask
> - ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
> - rhsMaskVal = (RHSMask
> - ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
> - lhsShiftVal = (LHSShift
> - ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
> - rhsShiftVal = (RHSShift
> - ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
> - lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
> - rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
> - lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
> - rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
> - // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks).
> - if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
> - return false;
> - }
> - if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
> - offset = ConstantInt::get(aType, lhsMaskOffset, false);
> - width = ConstantInt::get(aType, lhsMaskWidth, false);
> - RHSSrc = RHS;
> - if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
> - return false;
> - }
> - if (!LHSShift) {
> - LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
> - "MaskShr", LHS);
> - } else if (lhsShiftVal != lhsMaskOffset) {
> - LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
> - "MaskShr", LHS);
> - }
> - if (mDebug) {
> - dbgs() << "Optimizing LHS!\n";
> - }
> - } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
> - offset = ConstantInt::get(aType, rhsMaskOffset, false);
> - width = ConstantInt::get(aType, rhsMaskWidth, false);
> - LHSSrc = RHSSrc;
> - RHSSrc = LHS;
> - if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
> - return false;
> - }
> - if (!RHSShift) {
> - LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
> - "MaskShr", RHS);
> - } else if (rhsShiftVal != rhsMaskOffset) {
> - LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
> - "MaskShr", RHS);
> - }
> - if (mDebug) {
> - dbgs() << "Optimizing RHS!\n";
> - }
> - } else {
> - if (mDebug) {
> - dbgs() << "Failed constraint 3!\n";
> - }
> - return false;
> - }
> - if (mDebug) {
> - dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
> - dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
> - dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
> - dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
> - }
> - if (!offset || !width) {
> - if (mDebug) {
> - dbgs() << "Either width or offset are NULL, failed detection!\n";
> - }
> - return false;
> - }
> - // Lets create the function signature.
> - std::vector<Type *> callTypes;
> - callTypes.push_back(aType);
> - callTypes.push_back(aType);
> - callTypes.push_back(aType);
> - callTypes.push_back(aType);
> - FunctionType *funcType = FunctionType::get(aType, callTypes, false);
> - std::string name = "__amdil_ubit_insert";
> - if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
> - Function *Func =
> - dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
> - getOrInsertFunction(StringRef(name), funcType));
> - Value *Operands[4] = {
> - width,
> - offset,
> - LHSSrc,
> - RHSSrc
> - };
> - CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
> - if (mDebug) {
> - dbgs() << "Old Inst: ";
> - inst->dump();
> - dbgs() << "New Inst: ";
> - CI->dump();
> - dbgs() << "\n\n";
> - }
> - CI->insertBefore(inst);
> - inst->replaceAllUsesWith(CI);
> - return true;
> -}
> -
> -bool
> -AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) {
> - if (!inst) {
> - return false;
> - }
> - if (!inst->isBinaryOp()) {
> - return false;
> - }
> - if (inst->getOpcode() != Instruction::And) {
> - return false;
> - }
> - if (optLevel == CodeGenOpt::None) {
> - return false;
> - }
> - // We want to do some simple optimizations on Shift right/And patterns. The
> - // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a
> - // value smaller than 32 and C is a mask. If C is a constant value, then the
> - // following transformation can occur. For signed integers, it turns into the
> - // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned
> - // integers, it turns into the function call dst =
> - // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract
> - // can be found in Section 7.9 of the ATI IL spec of the stream SDK for
> - // Evergreen hardware.
> - if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
> - // This does not work on HD4XXX hardware.
> - return false;
> - }
> - Type *aType = inst->getType();
> - bool isVector = aType->isVectorTy();
> -
> - // XXX Support vector types
> - if (isVector) {
> - return false;
> - }
> - int numEle = 1;
> - // This only works on 32bit integers
> - if (aType->getScalarType()
> - != Type::getInt32Ty(inst->getContext())) {
> - return false;
> - }
> - if (isVector) {
> - const VectorType *VT = dyn_cast<VectorType>(aType);
> - numEle = VT->getNumElements();
> - // We currently cannot support more than 4 elements in a intrinsic and we
> - // cannot support Vec3 types.
> - if (numEle > 4 || numEle == 3) {
> - return false;
> - }
> - }
> - BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
> - // If the first operand is not a shift instruction, then we can return as it
> - // doesn't match this pattern.
> - if (!ShiftInst || !ShiftInst->isShift()) {
> - return false;
> - }
> - // If we are a shift left, then we need don't match this pattern.
> - if (ShiftInst->getOpcode() == Instruction::Shl) {
> - return false;
> - }
> - bool isSigned = ShiftInst->isArithmeticShift();
> - Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
> - Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
> - // Lets make sure that the shift value and the and mask are constant integers.
> - if (!AndMask || !ShrVal) {
> - return false;
> - }
> - Constant *newMaskConst;
> - Constant *shiftValConst;
> - if (isVector) {
> - // Handle the vector case
> - std::vector<Constant *> maskVals;
> - std::vector<Constant *> shiftVals;
> - ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
> - ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
> - Type *scalarType = AndMaskVec->getType()->getScalarType();
> - assert(AndMaskVec->getNumOperands() ==
> - ShrValVec->getNumOperands() && "cannot have a "
> - "combination where the number of elements to a "
> - "shift and an and are different!");
> - for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
> - ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
> - ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
> - if (!AndCI || !ShiftIC) {
> - return false;
> - }
> - uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
> - if (!isMask_32(maskVal)) {
> - return false;
> - }
> - maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
> - uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
> - // If the mask or shiftval is greater than the bitcount, then break out.
> - if (maskVal >= 32 || shiftVal >= 32) {
> - return false;
> - }
> - // If the mask val is greater than the the number of original bits left
> - // then this optimization is invalid.
> - if (maskVal > (32 - shiftVal)) {
> - return false;
> - }
> - maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
> - shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
> - }
> - newMaskConst = ConstantVector::get(maskVals);
> - shiftValConst = ConstantVector::get(shiftVals);
> - } else {
> - // Handle the scalar case
> - uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
> - // This must be a mask value where all lower bits are set to 1 and then any
> - // bit higher is set to 0.
> - if (!isMask_32(maskVal)) {
> - return false;
> - }
> - maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
> - // Count the number of bits set in the mask, this is the width of the
> - // resulting bit set that is extracted from the source value.
> - uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
> - // If the mask or shift val is greater than the bitcount, then break out.
> - if (maskVal >= 32 || shiftVal >= 32) {
> - return false;
> - }
> - // If the mask val is greater than the the number of original bits left then
> - // this optimization is invalid.
> - if (maskVal > (32 - shiftVal)) {
> - return false;
> - }
> - newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
> - shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
> - }
> - // Lets create the function signature.
> - std::vector<Type *> callTypes;
> - callTypes.push_back(aType);
> - callTypes.push_back(aType);
> - callTypes.push_back(aType);
> - FunctionType *funcType = FunctionType::get(aType, callTypes, false);
> - std::string name = "llvm.AMDGPU.bit.extract.u32";
> - if (isVector) {
> - name += ".v" + itostr(numEle) + "i32";
> - } else {
> - name += ".";
> - }
> - // Lets create the function.
> - Function *Func =
> - dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
> - getOrInsertFunction(StringRef(name), funcType));
> - Value *Operands[3] = {
> - ShiftInst->getOperand(0),
> - shiftValConst,
> - newMaskConst
> - };
> - // Lets create the Call with the operands
> - CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
> - CI->setDoesNotAccessMemory();
> - CI->insertBefore(inst);
> - inst->replaceAllUsesWith(CI);
> - return true;
> -}
> -
> -bool
> -AMDGPUPeepholeOpt::expandBFI(CallInst *CI) {
> - if (!CI) {
> - return false;
> - }
> - Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
> - if (!LHS->getName().startswith("__amdil_bfi")) {
> - return false;
> - }
> - Type* type = CI->getOperand(0)->getType();
> - Constant *negOneConst = NULL;
> - if (type->isVectorTy()) {
> - std::vector<Constant *> negOneVals;
> - negOneConst = ConstantInt::get(CI->getContext(),
> - APInt(32, StringRef("-1"), 10));
> - for (size_t x = 0,
> - y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
> - negOneVals.push_back(negOneConst);
> - }
> - negOneConst = ConstantVector::get(negOneVals);
> - } else {
> - negOneConst = ConstantInt::get(CI->getContext(),
> - APInt(32, StringRef("-1"), 10));
> - }
> - // __amdil_bfi => (A & B) | (~A & C)
> - BinaryOperator *lhs =
> - BinaryOperator::Create(Instruction::And, CI->getOperand(0),
> - CI->getOperand(1), "bfi_and", CI);
> - BinaryOperator *rhs =
> - BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
> - "bfi_not", CI);
> - rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
> - "bfi_and", CI);
> - lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
> - CI->replaceAllUsesWith(lhs);
> - return true;
> -}
> -
> -bool
> -AMDGPUPeepholeOpt::expandBFM(CallInst *CI) {
> - if (!CI) {
> - return false;
> - }
> - Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
> - if (!LHS->getName().startswith("__amdil_bfm")) {
> - return false;
> - }
> - // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
> - Constant *newMaskConst = NULL;
> - Constant *newShiftConst = NULL;
> - Type* type = CI->getOperand(0)->getType();
> - if (type->isVectorTy()) {
> - std::vector<Constant*> newMaskVals, newShiftVals;
> - newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
> - newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
> - for (size_t x = 0,
> - y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
> - newMaskVals.push_back(newMaskConst);
> - newShiftVals.push_back(newShiftConst);
> - }
> - newMaskConst = ConstantVector::get(newMaskVals);
> - newShiftConst = ConstantVector::get(newShiftVals);
> - } else {
> - newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
> - newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
> - }
> - BinaryOperator *lhs =
> - BinaryOperator::Create(Instruction::And, CI->getOperand(0),
> - newMaskConst, "bfm_mask", CI);
> - lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
> - lhs, "bfm_shl", CI);
> - lhs = BinaryOperator::Create(Instruction::Sub, lhs,
> - newShiftConst, "bfm_sub", CI);
> - BinaryOperator *rhs =
> - BinaryOperator::Create(Instruction::And, CI->getOperand(1),
> - newMaskConst, "bfm_mask", CI);
> - lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
> - CI->replaceAllUsesWith(lhs);
> - return true;
> -}
> -
> -bool
> -AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) {
> - Instruction *inst = (*bbb);
> - if (optimizeCallInst(bbb)) {
> - return true;
> - }
> - if (optimizeBitExtract(inst)) {
> - return false;
> - }
> - if (optimizeBitInsert(inst)) {
> - return false;
> - }
> - if (correctMisalignedMemOp(inst)) {
> - return false;
> - }
> - return false;
> -}
> -bool
> -AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) {
> - LoadInst *linst = dyn_cast<LoadInst>(inst);
> - StoreInst *sinst = dyn_cast<StoreInst>(inst);
> - unsigned alignment;
> - Type* Ty = inst->getType();
> - if (linst) {
> - alignment = linst->getAlignment();
> - Ty = inst->getType();
> - } else if (sinst) {
> - alignment = sinst->getAlignment();
> - Ty = sinst->getValueOperand()->getType();
> - } else {
> - return false;
> - }
> - unsigned size = getTypeSize(Ty);
> - if (size == alignment || size < alignment) {
> - return false;
> - }
> - if (!Ty->isStructTy()) {
> - return false;
> - }
> - if (alignment < 4) {
> - if (linst) {
> - linst->setAlignment(0);
> - return true;
> - } else if (sinst) {
> - sinst->setAlignment(0);
> - return true;
> - }
> - }
> - return false;
> -}
> -bool
> -AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) {
> - if (!CI) {
> - return false;
> - }
> - Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
> - std::string namePrefix = LHS->getName().substr(0, 14);
> - if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
> - && namePrefix != "__amdil__imul24_high") {
> - return false;
> - }
> - if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
> - return false;
> - }
> - return true;
> -}
> -
> -void
> -AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) {
> - assert(isSigned24BitOps(CI) && "Must be a "
> - "signed 24 bit operation to call this function!");
> - Value *LHS = CI->getOperand(CI->getNumOperands()-1);
> - // On 7XX and 8XX we do not have signed 24bit, so we need to
> - // expand it to the following:
> - // imul24 turns into 32bit imul
> - // imad24 turns into 32bit imad
> - // imul24_high turns into 32bit imulhigh
> - if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
> - Type *aType = CI->getOperand(0)->getType();
> - bool isVector = aType->isVectorTy();
> - int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
> - std::vector<Type*> callTypes;
> - callTypes.push_back(CI->getOperand(0)->getType());
> - callTypes.push_back(CI->getOperand(1)->getType());
> - callTypes.push_back(CI->getOperand(2)->getType());
> - FunctionType *funcType =
> - FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
> - std::string name = "__amdil_imad";
> - if (isVector) {
> - name += "_v" + itostr(numEle) + "i32";
> - } else {
> - name += "_i32";
> - }
> - Function *Func = dyn_cast<Function>(
> - CI->getParent()->getParent()->getParent()->
> - getOrInsertFunction(StringRef(name), funcType));
> - Value *Operands[3] = {
> - CI->getOperand(0),
> - CI->getOperand(1),
> - CI->getOperand(2)
> - };
> - CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
> - nCI->insertBefore(CI);
> - CI->replaceAllUsesWith(nCI);
> - } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
> - BinaryOperator *mulOp =
> - BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
> - CI->getOperand(1), "imul24", CI);
> - CI->replaceAllUsesWith(mulOp);
> - } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
> - Type *aType = CI->getOperand(0)->getType();
> -
> - bool isVector = aType->isVectorTy();
> - int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
> - std::vector<Type*> callTypes;
> - callTypes.push_back(CI->getOperand(0)->getType());
> - callTypes.push_back(CI->getOperand(1)->getType());
> - FunctionType *funcType =
> - FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
> - std::string name = "__amdil_imul_high";
> - if (isVector) {
> - name += "_v" + itostr(numEle) + "i32";
> - } else {
> - name += "_i32";
> - }
> - Function *Func = dyn_cast<Function>(
> - CI->getParent()->getParent()->getParent()->
> - getOrInsertFunction(StringRef(name), funcType));
> - Value *Operands[2] = {
> - CI->getOperand(0),
> - CI->getOperand(1)
> - };
> - CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
> - nCI->insertBefore(CI);
> - CI->replaceAllUsesWith(nCI);
> - }
> -}
> -
> -bool
> -AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) {
> - return (CI != NULL
> - && CI->getOperand(CI->getNumOperands() - 1)->getName()
> - == "__amdil_get_local_size_int");
> -}
> -
> -bool
> -AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) {
> - if (!CI) {
> - return false;
> - }
> - if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
> - && (mSTM->getDeviceName() == "cayman")) {
> - return false;
> - }
> - return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
> - == "__amdil_improved_div";
> -}
> -
> -void
> -AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) {
> - assert(convertAccurateDivide(CI)
> - && "expanding accurate divide can only happen if it is expandable!");
> - BinaryOperator *divOp =
> - BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
> - CI->getOperand(1), "fdiv32", CI);
> - CI->replaceAllUsesWith(divOp);
> -}
> -
> -bool
> -AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) {
> - if (optLevel != CodeGenOpt::None) {
> - return false;
> - }
> -
> - if (!CI) {
> - return false;
> - }
> -
> - unsigned funcNameIdx = 0;
> - funcNameIdx = CI->getNumOperands() - 1;
> - StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
> - if (calleeName != "__amdil_image2d_read_norm"
> - && calleeName != "__amdil_image2d_read_unnorm"
> - && calleeName != "__amdil_image3d_read_norm"
> - && calleeName != "__amdil_image3d_read_unnorm") {
> - return false;
> - }
> -
> - unsigned samplerIdx = 2;
> - samplerIdx = 1;
> - Value *sampler = CI->getOperand(samplerIdx);
> - LoadInst *lInst = dyn_cast<LoadInst>(sampler);
> - if (!lInst) {
> - return false;
> - }
> -
> - if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
> - return false;
> - }
> -
> - GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
> - // If we are loading from what is not a global value, then we
> - // fail and return.
> - if (!gv) {
> - return false;
> - }
> -
> - // If we don't have an initializer or we have an initializer and
> - // the initializer is not a 32bit integer, we fail.
> - if (!gv->hasInitializer()
> - || !gv->getInitializer()->getType()->isIntegerTy(32)) {
> - return false;
> - }
> -
> - // Now that we have the global variable initializer, lets replace
> - // all uses of the load instruction with the samplerVal and
> - // reparse the __amdil_is_constant() function.
> - Constant *samplerVal = gv->getInitializer();
> - lInst->replaceAllUsesWith(samplerVal);
> - return true;
> -}
> -
> -bool
> -AMDGPUPeepholeOpt::doInitialization(Module &M) {
> - return false;
> -}
> -
> -bool
> -AMDGPUPeepholeOpt::doFinalization(Module &M) {
> - return false;
> -}
> -
> -void
> -AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const {
> - AU.addRequired<MachineFunctionAnalysis>();
> - FunctionPass::getAnalysisUsage(AU);
> - AU.setPreservesAll();
> -}
> -
> -size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
> - size_t size = 0;
> - if (!T) {
> - return size;
> - }
> - switch (T->getTypeID()) {
> - case Type::X86_FP80TyID:
> - case Type::FP128TyID:
> - case Type::PPC_FP128TyID:
> - case Type::LabelTyID:
> - assert(0 && "These types are not supported by this backend");
> - default:
> - case Type::FloatTyID:
> - case Type::DoubleTyID:
> - size = T->getPrimitiveSizeInBits() >> 3;
> - break;
> - case Type::PointerTyID:
> - size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
> - break;
> - case Type::IntegerTyID:
> - size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
> - break;
> - case Type::StructTyID:
> - size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
> - break;
> - case Type::ArrayTyID:
> - size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
> - break;
> - case Type::FunctionTyID:
> - size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
> - break;
> - case Type::VectorTyID:
> - size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
> - break;
> - };
> - return size;
> -}
> -
> -size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
> - bool dereferencePtr) {
> - size_t size = 0;
> - if (!ST) {
> - return size;
> - }
> - Type *curType;
> - StructType::element_iterator eib;
> - StructType::element_iterator eie;
> - for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
> - curType = *eib;
> - size += getTypeSize(curType, dereferencePtr);
> - }
> - return size;
> -}
> -
> -size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
> - bool dereferencePtr) {
> - return IT ? (IT->getBitWidth() >> 3) : 0;
> -}
> -
> -size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
> - bool dereferencePtr) {
> - assert(0 && "Should not be able to calculate the size of an function type");
> - return 0;
> -}
> -
> -size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
> - bool dereferencePtr) {
> - return (size_t)(AT ? (getTypeSize(AT->getElementType(),
> - dereferencePtr) * AT->getNumElements())
> - : 0);
> -}
> -
> -size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
> - bool dereferencePtr) {
> - return VT ? (VT->getBitWidth() >> 3) : 0;
> -}
> -
> -size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
> - bool dereferencePtr) {
> - if (!PT) {
> - return 0;
> - }
> - Type *CT = PT->getElementType();
> - if (CT->getTypeID() == Type::StructTyID &&
> - PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
> - return getTypeSize(dyn_cast<StructType>(CT));
> - } else if (dereferencePtr) {
> - size_t size = 0;
> - for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
> - size += getTypeSize(PT->getContainedType(x), dereferencePtr);
> - }
> - return size;
> - } else {
> - return 4;
> - }
> -}
> -
> -size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
> - bool dereferencePtr) {
> - //assert(0 && "Should not be able to calculate the size of an opaque type");
> - return 4;
> -}
> diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
> index 2ad2047..97f0a40 100644
> --- a/lib/Target/R600/CMakeLists.txt
> +++ b/lib/Target/R600/CMakeLists.txt
> @@ -21,7 +21,6 @@ add_llvm_target(R600CodeGen
> AMDILISelDAGToDAG.cpp
> AMDILISelLowering.cpp
> AMDILNIDevice.cpp
> - AMDILPeepholeOptimizer.cpp
> AMDILSIDevice.cpp
> AMDGPUAsmPrinter.cpp
> AMDGPUFrameLowering.cpp
> diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
> index 8a60add..8f47523 100644
> --- a/lib/Target/R600/R600Instructions.td
> +++ b/lib/Target/R600/R600Instructions.td
> @@ -1615,6 +1615,7 @@ let Predicates = [isEGorCayman] in {
> i32:$src2))],
> VecALU
> >;
> + def : BFEPattern <BFE_UINT_eg>;
>
> def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", [], VecALU>;
> defm : BFIPatterns <BFI_INT_eg>;
> diff --git a/test/CodeGen/R600/bfe_uint.ll b/test/CodeGen/R600/bfe_uint.ll
> new file mode 100644
> index 0000000..92570c3
> --- /dev/null
> +++ b/test/CodeGen/R600/bfe_uint.ll
> @@ -0,0 +1,26 @@
> +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> +
> +; CHECK: @bfe_def
> +; CHECK: BFE_UINT
> +define void @bfe_def(i32 addrspace(1)* %out, i32 %x) {
> +entry:
> + %0 = lshr i32 %x, 5
> + %1 = and i32 %0, 15 ; 0xf
> + store i32 %1, i32 addrspace(1)* %out
> + ret void
> +}
> +
> +; This program could be implemented using a BFE_UINT instruction, however
> +; since the lshr constant + number of bits in the mask is >= 32, it can also be
> +; implemented with a LSHR instruction, which is better, because LSHR has fewer
> +; operands and requires fewer constants.
> +
> +; CHECK: @bfe_shift
> +; CHECK-NOT: BFE_UINT
> +define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) {
> +entry:
> + %0 = lshr i32 %x, 16
> + %1 = and i32 %0, 65535 ; 0xffff
> + store i32 %1, i32 addrspace(1)* %out
> + ret void
> +}
More information about the llvm-commits
mailing list