[clang-tools-extra] [llvm] [clang] [LLVM] Add IRNormalizer Pass (PR #68176)
Nikita Popov via llvm-commits
llvm-commits at lists.llvm.org
Sat Nov 4 04:36:19 PDT 2023
================
@@ -0,0 +1,637 @@
+//===--------------- IRNormalizer.cpp - IR Normalizer ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the IRNormalizer class which aims to transform LLVM
+/// Modules into a canonical form by reordering and renaming instructions while
+/// preserving the same semantics. The normalizer makes it easier to spot
+/// semantic differences while diffing two modules which have undergone
+/// different passes.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/IRNormalizer.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils.h"
+#include <algorithm>
+#include <vector>
+
+#define DEBUG_TYPE "normalize"
+
+using namespace llvm;
+
+namespace {
+/// IRNormalizer aims to transform LLVM IR into canonical form.
+///
+/// The class groups the command-line flags that steer normalization, the
+/// per-function entry point, and the private naming / reordering helpers.
+class IRNormalizer {
+public:
+  /// \name Normalizer flags.
+  /// @{
+  /// Preserves original order of instructions.
+  static cl::opt<bool> PreserveOrder;
+  /// Renames all instructions (including user-named).
+  static cl::opt<bool> RenameAll;
+  /// Folds all regular instructions (including pre-outputs).
+  static cl::opt<bool> FoldPreoutputs;
+  /// Sorts and reorders operands in commutative instructions.
+  static cl::opt<bool> ReorderOperands;
+  /// @}
+
+  /// Normalizes the function \p F in place.
+  bool runOnFunction(Function &F);
+
+private:
+  /// Random constant for hashing, so the state isn't zero. It is the same for
+  /// every normalizer, hence a compile-time class constant instead of a
+  /// per-object data member.
+  static constexpr uint64_t MagicHashConstant = 0x6acaa36bef8325c5ULL;
+  /// Instructions that nameInstruction() has already processed.
+  DenseSet<const Instruction *> NamedInstructions;
+
+  /// \name Naming.
+  /// @{
+  void nameFunctionArguments(Function &F);
+  void nameBasicBlocks(Function &F);
+  void nameInstruction(Instruction *I);
+  void nameAsInitialInstruction(Instruction *I);
+  void nameAsRegularInstruction(Instruction *I);
+  void foldInstructionName(Instruction *I);
+  /// @}
+
+  /// \name Reordering.
+  /// @{
+  void reorderInstructions(SmallVector<Instruction *, 16> &Outputs);
+  void reorderInstruction(Instruction *Used, Instruction *User,
+                          SmallPtrSet<const Instruction *, 32> &Visited);
+  void reorderInstructionOperandsByNames(Instruction *I);
+  void reorderPHIIncomingValues(PHINode *PN);
+  /// @}
+
+  /// \name Utility methods.
+  /// @{
+  SmallVector<Instruction *, 16> collectOutputInstructions(Function &F);
+  bool isOutput(const Instruction *I);
+  bool isInitialInstruction(const Instruction *I);
+  bool hasOnlyImmediateOperands(const Instruction *I);
+  SetVector<int>
+  getOutputFootprint(Instruction *I,
+                     SmallPtrSet<const Instruction *, 32> &Visited);
+  /// @}
+};
+} // namespace
+
+// Definitions of the static flag members declared in IRNormalizer. Each one
+// is a boolean command-line option registered as cl::Hidden.
+
+// -preserve-order: consulted by runOnFunction() to skip every reordering step.
+cl::opt<bool> IRNormalizer::PreserveOrder(
+ "preserve-order", cl::Hidden,
+ cl::desc("Preserves original instruction order"));
+// -rename-all: also rename values that already carry a name.
+cl::opt<bool> IRNormalizer::RenameAll(
+ "rename-all", cl::Hidden,
+ cl::desc("Renames all instructions (including user-named)"));
+// -fold-all: note that the flag spelling differs from the member name
+// (FoldPreoutputs).
+cl::opt<bool> IRNormalizer::FoldPreoutputs(
+ "fold-all", cl::Hidden,
+ cl::desc("Folds all regular instructions (including pre-outputs)"));
+// -reorder-operands: only has an effect when instruction order is not
+// preserved (see the check in runOnFunction()).
+cl::opt<bool> IRNormalizer::ReorderOperands(
+ "reorder-operands", cl::Hidden,
+ cl::desc("Sorts and reorders operands in commutative instructions"));
+
+/// Entry method to the IRNormalizer.
+///
+/// Names the arguments and basic blocks of \p F, collects its output
+/// instructions, optionally reorders instructions around those outputs, names
+/// every output (the naming helpers walk down to the instructions they use),
+/// and finally visits each instruction once to reorder operands / PHI
+/// incoming values and to fold the generated names.
+///
+/// \param F Function to normalize.
+/// \returns Always true.
+bool IRNormalizer::runOnFunction(Function &F) {
+  nameFunctionArguments(F);
+  nameBasicBlocks(F);
+
+  SmallVector<Instruction *, 16> Outputs = collectOutputInstructions(F);
+
+  // Every reordering step below is disabled when the order must be preserved.
+  const bool MayReorder = !PreserveOrder;
+  if (MayReorder)
+    reorderInstructions(Outputs);
+
+  for (Instruction *Output : Outputs)
+    nameInstruction(Output);
+
+  for (Instruction &Inst : instructions(F)) {
+    if (MayReorder && ReorderOperands && Inst.isCommutative())
+      reorderInstructionOperandsByNames(&Inst);
+
+    if (MayReorder)
+      if (auto *Phi = dyn_cast<PHINode>(&Inst))
+        reorderPHIIncomingValues(Phi);
+
+    foldInstructionName(&Inst);
+  }
+
+  return true;
+}
+
+/// Numbers arguments.
+///
+/// Renamed arguments receive the names a0, a1, ... in argument order. An
+/// argument that already has a name keeps it (and does not consume a number)
+/// unless RenameAll is set.
+///
+/// \param F Function whose arguments will be renamed.
+void IRNormalizer::nameFunctionArguments(Function &F) {
+  int NextIndex = 0;
+  for (auto &Arg : F.args()) {
+    const bool KeepName = !RenameAll && !Arg.getName().empty();
+    if (KeepName)
+      continue;
+
+    Arg.setName("a" + Twine(NextIndex));
+    ++NextIndex;
+  }
+}
+
+/// Names basic blocks using a hash generated for each basic block of a
+/// function from the opcodes of its output instructions, in block order.
+///
+/// A block that already has a name keeps it unless RenameAll is set.
+///
+/// \param F Function containing basic blocks to rename.
+void IRNormalizer::nameBasicBlocks(Function &F) {
+  for (auto &Block : F) {
+    // Seed with the magic constant so the state isn't zero, then mix in the
+    // opcode of every output instruction of the block.
+    uint64_t BlockHash = MagicHashConstant;
+    for (auto &Inst : Block) {
+      if (!isOutput(&Inst))
+        continue;
+      BlockHash = hashing::detail::hash_16_bytes(BlockHash, Inst.getOpcode());
+    }
+
+    if (!RenameAll && !Block.getName().empty())
+      continue;
+
+    // Keep only a short prefix of the decimal hash to make diffs more
+    // readable.
+    const std::string Digits = std::to_string(BlockHash);
+    Block.setName("bb" + Digits.substr(0, 5));
+  }
+}
+
+/// Names an instruction, dispatching to the initial-instruction or the
+/// regular-instruction naming scheme. Together with
+/// nameAsRegularInstruction() this forms a recursive, depth-first walk from
+/// the output instructions down to the initial instructions (defs).
+///
+/// Each instruction is processed at most once. This prevents renaming an
+/// instruction twice and stops the walk when instructions are used before
+/// their definition (in phi nodes).
+///
+/// \param I Instruction to be renamed.
+void IRNormalizer::nameInstruction(Instruction *I) {
+  // insert() reports whether I was newly added; a repeated visit is a no-op.
+  if (!NamedInstructions.insert(I).second)
+    return;
+
+  if (isInitialInstruction(I))
+    nameAsInitialInstruction(I);
+  else
+    nameAsRegularInstruction(I);
+}
+
+/// Names instruction following the scheme:
+/// vl00000Callee(Operands)
+///
+/// Where 00000 is a hash calculated considering instruction's opcode and output
+/// footprint. Callee's name is only included when the instruction is a
+/// CallInst with a known called function. Operands are printed as operands
+/// (functions are skipped); for commutative instructions the list is sorted.
+///
+/// Renames instruction only when RenameAll flag is raised or instruction is
+/// unnamed. Instructions of void type are never named.
+///
+/// \see getOutputFootprint()
+/// \param I Instruction to be renamed.
+void IRNormalizer::nameAsInitialInstruction(Instruction *I) {
+  const bool KeepUserName = !I->getName().empty() && !RenameAll;
+  if (I->getType()->isVoidTy() || KeepUserName)
+    return;
+
+  // Textual form of every non-function operand.
+  SmallVector<SmallString<64>, 4> OperandTexts;
+  for (auto &Op : I->operands()) {
+    if (isa<Function>(Op))
+      continue;
+
+    std::string Text;
+    raw_string_ostream OS(Text);
+    Op->printAsOperand(OS, false);
+    OperandTexts.push_back(StringRef(OS.str()));
+  }
+
+  if (I->isCommutative())
+    llvm::sort(OperandTexts);
+
+  // Start from the magic constant (so the state isn't zero) and mix in the
+  // instruction's opcode.
+  uint64_t Hash =
+      hashing::detail::hash_16_bytes(MagicHashConstant, I->getOpcode());
+
+  // Mix in the output footprint of I.
+  SmallPtrSet<const Instruction *, 32> Visited;
+  SetVector<int> Footprint = getOutputFootprint(I, Visited);
+  for (const int Output : Footprint)
+    Hash = hashing::detail::hash_16_bytes(Hash, Output);
+
+  // Assemble "vl<hash><callee>(<operands>)".
+  SmallString<256> Name;
+  Name.append("vl" + std::to_string(Hash).substr(0, 5));
+
+  if (const auto *Call = dyn_cast<CallInst>(I))
+    if (const Function *Callee = Call->getCalledFunction())
+      Name.append(Callee->getName());
+
+  Name.append("(");
+  StringRef Separator = "";
+  for (const auto &Text : OperandTexts) {
+    Name.append(Separator);
+    Name.append(Text.str());
+    Separator = ", ";
+  }
+  Name.append(")");
+
+  I->setName(Name);
+}
+
+/// Names instruction following the scheme:
+/// op00000Callee(Operands)
+///
+/// Where 00000 is a hash calculated considering instruction's opcode, its
+/// operands' opcodes and order. Callee's name is only included when the
+/// instruction is a CallInst with a known called function. For commutative
+/// instructions both the operand list and the operand opcodes are sorted.
+///
+/// Instruction operands are named first (recursively, via nameInstruction()),
+/// because the name of a regular instruction is built from the names of its
+/// operands.
+///
+/// Renames instruction only when RenameAll flag is raised or instruction is
+/// unnamed, and never when it is of void type. Operands are visited either
+/// way.
+///
+/// \param I Instruction to be renamed.
+void IRNormalizer::nameAsRegularInstruction(Instruction *I) {
+  // Operand names (or textual form for non-instructions) and the opcodes of
+  // the instruction operands, both in operand order.
+  SmallVector<SmallString<128>, 4> OperandNames;
+  SmallVector<int, 4> OperandOpcodes;
+
+  for (auto &Op : I->operands()) {
+    if (auto *Def = dyn_cast<Instruction>(Op)) {
+      // Walk down the use-def chain so the operand has its final name.
+      nameInstruction(Def);
+      OperandNames.push_back(Def->getName());
+      OperandOpcodes.push_back(Def->getOpcode());
+      continue;
+    }
+
+    if (isa<Function>(Op))
+      continue;
+
+    // Anything else is printed the way it appears as an operand.
+    std::string Text;
+    raw_string_ostream OS(Text);
+    Op->printAsOperand(OS, false);
+    OperandNames.push_back(StringRef(OS.str()));
+  }
+
+  if (I->isCommutative()) {
+    llvm::sort(OperandNames.begin(), OperandNames.end());
+    llvm::sort(OperandOpcodes.begin(), OperandOpcodes.end());
+  }
+
+  // Start from the magic constant (so the state isn't zero), then mix in the
+  // instruction's opcode followed by the operand opcodes.
+  uint64_t Hash =
+      hashing::detail::hash_16_bytes(MagicHashConstant, I->getOpcode());
+  for (const int Opcode : OperandOpcodes)
+    Hash = hashing::detail::hash_16_bytes(Hash, Opcode);
+
+  // Assemble "op<hash><callee>(<operands>)".
+  SmallString<512> Name;
+  Name.append("op" + std::to_string(Hash).substr(0, 5));
+
+  if (const auto *Call = dyn_cast<CallInst>(I))
+    if (const Function *Callee = Call->getCalledFunction())
+      Name.append(Callee->getName());
+
+  Name.append("(");
+  StringRef Separator = "";
+  for (const auto &OperandName : OperandNames) {
+    Name.append(Separator);
+    Name.append(OperandName.str());
+    Separator = ", ";
+  }
+  Name.append(")");
+
+  const bool MayRename = I->getName().empty() || RenameAll;
+  if (MayRename && !I->getType()->isVoidTy())
+    I->setName(Name);
+}
+
+/// Shortens instruction's name. The folded name consists of the first seven
+/// characters of the current name (the "op" prefix plus hash), followed by
+/// the parenthesised list of the instruction operands' names. An operand with
+/// a generated ("op" / "vl") name is shortened to its first seven characters
+/// as well; any other operand name is kept in full. Operands that are not
+/// instructions do not appear in the folded name.
+///
+/// Example:
+///   op00000Callee(op00001Callee(...), vl00000Callee(1, 2), 5) ->
+///   op00000(op00001, vl00000)
+///
+/// Only instructions whose name starts with "op" are folded. Output
+/// instructions are never folded, and pre-output instructions (instructions
+/// directly used by an output instruction) are folded only when
+/// FoldPreoutputs is set.
+///
+/// \param I Instruction whose name will be folded.
+void IRNormalizer::foldInstructionName(Instruction *I) {
+  // Unless asked to fold everything, leave pre-outputs untouched: bail out
+  // if any user of I is an output instruction.
+  if (!FoldPreoutputs) {
+    for (auto *U : I->users())
+      if (auto *IU = dyn_cast<Instruction>(U))
+        if (isOutput(IU))
+          return;
+  }
+
+  // Don't fold if it is an output instruction or has no op prefix.
+  if (isOutput(I) || I->getName().substr(0, 2) != "op")
+    return;
+
+  // Names of the instruction operands, in operand order.
+  SmallVector<SmallString<64>, 4> Operands;
+
+  for (auto &OP : I->operands()) {
+    if (const auto *IOP = dyn_cast<Instruction>(OP)) {
+      // The decision must be based on the operand's own name: I is already
+      // known to start with "op" here, so testing I would truncate every
+      // operand name, including user-provided ones.
+      const StringRef OperandName = IOP->getName();
+      const StringRef Prefix = OperandName.substr(0, 2);
+      const bool HasCanonicalName = Prefix == "op" || Prefix == "vl";
+
+      Operands.push_back(HasCanonicalName ? OperandName.substr(0, 7)
+                                          : OperandName);
+    }
+  }
+
+  if (I->isCommutative())
+    llvm::sort(Operands.begin(), Operands.end());
+
+  SmallString<256> Name;
+  Name.append(I->getName().substr(0, 7));
+
+  Name.append("(");
+  StringRef Separator = "";
+  for (const auto &Operand : Operands) {
+    Name.append(Separator);
+    Name.append(Operand.str());
+    Separator = ", ";
+  }
+  Name.append(")");
+
+  I->setName(Name);
+}
+
+/// Reorders instructions by walking up the tree from each operand of an output
+/// instruction and reducing the def-use distance.
+/// This method assumes that output instructions were collected top-down,
+/// otherwise the def-use chain may be broken.
+/// This method is a wrapper for recursive reorderInstruction(); a single
+/// visited set is shared across all outputs.
+///
+/// \see reorderInstruction()
+/// \param Outputs Vector of pointers to output instructions collected top-down.
+void IRNormalizer::reorderInstructions(
+    SmallVector<Instruction *, 16> &Outputs) {
+  SmallPtrSet<const Instruction *, 32> Visited;
+
+  for (Instruction *Output : Outputs) {
+    for (auto &Operand : Output->operands()) {
+      auto *Def = dyn_cast<Instruction>(Operand);
+      if (!Def)
+        continue;
+
+      // Walk up the tree starting at this operand of the output.
+      reorderInstruction(Def, Output, Visited);
+    }
+  }
+}
+
+/// Reduces def-use distance or places instruction at the end of the basic
+/// block. Continues to walk up the def-use tree recursively. Used by
+/// reorderInstructions().
+///
+/// PHI nodes are neither moved nor walked through. Instructions that are
+/// terminators or EH pads, may have side effects, or may read or write memory
+/// are left where they are, because changing their position relative to other
+/// such instructions could change the behaviour of the function; the walk
+/// still continues through their operands.
+///
+/// \see reorderInstructions()
+/// \param Used Pointer to the instruction whose value is used by the \p User.
+/// \param User Pointer to the instruction which uses the \p Used.
+/// \param Visited Set of visited instructions.
+void IRNormalizer::reorderInstruction(
+    Instruction *Used, Instruction *User,
+    SmallPtrSet<const Instruction *, 32> &Visited) {
+  if (isa<PHINode>(Used))
+    return;
+  // insert() reports whether Used was newly added; handle each instruction
+  // only once.
+  if (!Visited.insert(Used).second)
+    return;
+
+  // Only instructions without observable effects and without positional
+  // requirements may be relocated.
+  const bool MustStayInPlace = Used->isTerminator() || Used->isEHPad() ||
+                               Used->mayHaveSideEffects() ||
+                               Used->mayReadOrWriteMemory();
+
+  if (!MustStayInPlace) {
+    if (Used->getParent() == User->getParent()) {
+      // If Used and User share the same basic block move Used just before
+      // User.
+      Used->moveBefore(User);
+    } else {
+      // Otherwise move Used to the very end of its basic block (just before
+      // the last instruction).
+      Used->moveBefore(&Used->getParent()->back());
+    }
+  }
+
+  for (auto &OP : Used->operands()) {
+    if (auto *IOP = dyn_cast<Instruction>(OP)) {
+      // Walk up the def-use tree.
+      reorderInstruction(IOP, Used, Visited);
+    }
+  }
+}
----------------
nikic wrote:
Am I missing something, or can this reorder side-effecting instructions?
https://github.com/llvm/llvm-project/pull/68176
More information about the llvm-commits
mailing list