//===- lib/Orca/OrcaInlinerPassManager.h - ----------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
// 
//===----------------------------------------------------------------------===//
// Copyright 2013-2019 Azul Systems, Inc.  All Rights Reserved.
// http://www.azul.com
// Azul Systems is a contributor to the LLVM Team.
// Distributed under the same license terms detailed in LICENSE.TXT above.
//===----------------------------------------------------------------------===//
//
// Here's an overview of the design which allows us to use `ModulePassManagers`
// as simplification PMs in inliner:
// 0. Why do we need that? There are optimizations that have to modify the module
//    (e.g. add some global values) which means they can't be done by function
//    passes, but we still want to use them as a part of simplification
//    pipeline. Such optimizations can range from simple stuff like
//    OptimizeKnownValues and up to inlining into inline candidates.
// 1. We have an `InlinerStack` stored in `AzulState` object. That also means
//    a valid `AzulState` has to be "installed" for the inliner to function.
//    a) Why does it have to be a stack?
//       That's because, depending on configuration, in addition to inlining
//       into the top-level method, we can inline into inline candidates (to
//       simplify them) and even into candidates for inlining into top-level's
//       inline candidates. Inliners can be nested, so using a stack is a
//       natural way to represent their "current function" states.
//    b) Why does the stack have to be stored in the `AzulState` object?
//       It has to be a location accessible from each inliner object and every
//       module pass, so we can't pass it as a parameter to passes'
//       constructors (without modifying a lot of upstream code), which means it
//       has to be some sort of a global value.
//       `AzulState` is already a thread-local value accessible from
//       LLVMContext, which makes it a convenient place to store the stack.
// 2. When an inliner starts working on a function, it adds that function to
//    InlinerStack. "Working on a function" here means inlining into it or
//    simplifying it.
// 3. We expect all module passes that are in simplification pass managers to
//    run only on the function that is on top of the stack. For function passes
//    InlinerModuleToFunctionPassAdaptor must be used.
//    To achieve the desired behavior it is strongly recommended to use
//    `runWithInlinerStackSupport` when implementing `::run(M, MAM)` method
//    of your module pass.
//
//===----------------------------------------------------------------------===//

#ifndef ORCAINLINERPASSMANAGER_H
#define ORCAINLINERPASSMANAGER_H

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/Orca/AvailableDefTracker.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Orca/InlinerUtils.h"
#include "llvm/Orca/Utils.h"
#include "llvm/Support/Allocator.h"
#include <functional>
#include <memory>
#include <optional>
#include <set>

namespace azul {
namespace orca {
// Forward declaration: the visible part of this header only refers to
// OrcaPipeline through pointers, so the full definition is not needed here.
class OrcaPipeline;
}
} // namespace azul
namespace llvm {

// These options are used in InlinerOptions which handles their initialization.
// It either assigns a default value for them or uses corresponding values from
// Orca features.
// They are only declared here; the definitions live in another translation
// unit.
extern cl::opt<bool> CollectAvailableDefs;
extern cl::opt<azul::orca::SimplifyWithInlining> SimplifyCandidatesWithInlining;
extern cl::opt<double> OrcaInlinerIRSizeCap;
// NOTE(review): "Orcal" looks like a misspelling of "Orca". Renaming would
// require updating the definition and every use (including the
// INLINER_OPTIONS_DO list), so it is left as is here.
extern cl::opt<unsigned> OrcalInlinerIRSizeThresholdPower;
extern cl::opt<unsigned> OrcaInlinerIRSizeThresholdMul;
extern cl::opt<unsigned> OrcaInlinerMaxIterations;
extern cl::opt<bool> NestedSimplifyWithInlining;
extern cl::opt<int> AzulInlineThreshold;

// Forward declarations of types that this header only uses by reference or
// pointer.
class InlineFunctionInfo;
class Module;
class ProfileSummaryInfo;

/// InlinerStack-aware replacement for ModuleToFunctionPassAdaptor.
///
/// With an empty InlinerStack it behaves like the regular adaptor and runs the
/// wrapped function pass across every function in the module. With a non-empty
/// InlinerStack it runs the function pass only on the function that is on top
/// of the stack.
///
/// Why not the regular adaptor plus a PassInstrumentation callback that lets
/// passes run on one function only? The regular adaptor would still run passes
/// with `isRequired() == true` on all functions, and one of those passes is
/// FunctionToLoopPassAdaptor. That adaptor triggers computation of many
/// analyses before running its pass, so it spends a lot of compile time even
/// if that pass is disabled. This adaptor does not run any passes on the other
/// functions at all.
class InlinerModuleToFunctionPassAdaptor
    : public PassInfoMixin<InlinerModuleToFunctionPassAdaptor> {
public:
  /// Type-erased interface of the wrapped function pass.
  using PassConceptT = detail::PassConcept<Function, FunctionAnalysisManager>;

  /// Takes ownership of \p Pass. \p EagerlyInvalidate is stored for run() and
  /// is reflected in the printed pipeline.
  InlinerModuleToFunctionPassAdaptor(std::unique_ptr<PassConceptT> Pass,
                                     bool EagerlyInvalidate)
      : Pass(std::move(Pass)), EagerlyInvalidate(EagerlyInvalidate) {}

  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);

  /// Prints the wrapped pass as if it was nested in the regular `function`
  /// adaptor: the pipeline parser isn't aware of this adaptor at the moment.
  void printPipeline(raw_ostream &OS,
                     function_ref<StringRef(StringRef)> MapClassName2PassName) {
    OS << "function" << (EagerlyInvalidate ? "<eager-inv>" : "") << "(";
    Pass->printPipeline(OS, MapClassName2PassName);
    OS << ")";
  }

  /// The adaptor itself is always required.
  static bool isRequired() { return true; }

private:
  std::unique_ptr<PassConceptT> Pass;
  bool EagerlyInvalidate;
};

/// Deduces the type of the given function pass and wraps it into an
/// InlinerModuleToFunctionPassAdaptor.
///
/// \param Pass the function pass to wrap; it is forwarded into the adaptor.
/// \param EagerlyInvalidate passed through to the adaptor's constructor.
template <typename FunctionPassT>
InlinerModuleToFunctionPassAdaptor
createInlinerModuleToFunctionPassAdaptor(FunctionPassT &&Pass,
                                         bool EagerlyInvalidate = false) {
  using ConceptT = InlinerModuleToFunctionPassAdaptor::PassConceptT;
  using ModelT = detail::PassModel<Function, FunctionPassT, PreservedAnalyses,
                                   FunctionAnalysisManager>;
  // Do not use make_unique, it causes too many template instantiations,
  // causing terrible compile times.
  std::unique_ptr<ConceptT> Wrapped(
      new ModelT(std::forward<FunctionPassT>(Pass)));
  return InlinerModuleToFunctionPassAdaptor(std::move(Wrapped),
                                            EagerlyInvalidate);
}

/// Cost and Benefit of inlining serve as an input to various inliner
/// decisions. The pair is carried by an InlineCost (cost + threshold, where
/// the threshold currently doubles as the benefit) together with an
/// instruction count that is only meaningful for variable costs.
struct CostBenefit {
  InlineCost IC = InlineCost::get(0, 0);
  unsigned NumInstructions = 0;

  CostBenefit() = default;

  /// Builds a variable cost from an explicit cost/benefit pair.
  CostBenefit(int Cost, int Benefit, unsigned NumInstructions)
      : IC(InlineCost::get(Cost, Benefit)), NumInstructions(NumInstructions) {}

  /// Wraps an existing InlineCost. A non-variable (always/never) cost must
  /// come with a zero instruction count.
  CostBenefit(InlineCost IC, unsigned NumInstructions)
      : IC(IC), NumInstructions(NumInstructions) {
    assert((NumInstructions == 0 || isVariable()) &&
           "Num intructions must be 0 for non-variable costs");
  }

  /// Pure cost, no benefit.
  static CostBenefit getCost(int Cost) { return CostBenefit(Cost, 0, 0); }

  /// Pure benefit, no cost.
  static CostBenefit getBenefit(int Benefit) {
    return CostBenefit(0, Benefit, 0);
  }

  /// "Always inline" with the given reason.
  /// NOTE(review): only the raw data() pointer of \p Reason is passed on, so
  /// the caller presumably has to keep the string alive (and, if it is read as
  /// a C string, null-terminated) — confirm against InlineCost::getAlways.
  static CostBenefit getAlways(StringRef Reason) {
    return CostBenefit(InlineCost::getAlways(Reason.data()), 0);
  }

  /// "Never inline" with the given reason. Same caveat as for getAlways().
  static CostBenefit getNever(StringRef Reason) {
    return CostBenefit(InlineCost::getNever(Reason.data()), 0);
  }

  /// The decision InlineCost itself makes (its conversion to bool).
  bool standardDecision() const { return static_cast<bool>(IC); }

  bool isVariable() const { return IC.isVariable(); }
  bool isNever() const { return IC.isNever(); }
  bool isAlways() const { return IC.isAlways(); }
  int getCost() const { return IC.getCost(); }
  int getStandardThreshold() const { return IC.getThreshold(); }

  /// Benefit is our synthetic estimate of how "beneficial" the inlining is in
  /// terms of final run-time savings for the caller. The intent is to use
  /// benefit in all the places where we need to differentiate between
  /// call-sites/inlining decisions.
  int getBenefit() const {
    // FIXME: For now we treat the threshold as the measure of benefits.
    // InlineCost has extensive logic for adjusting the threshold to account
    // for benefits of inlining. E.g. InlineCost scales the threshold for the
    // frequency of the call site. It also applies various bonuses to the
    // threshold, when a beneficial simplification is encountered.
    //
    // Treating threshold as benefit is a transition step. Moving forward we
    // should separate the two.
    return IC.getThreshold();
  }

  /// Instruction count; only valid for variable costs.
  int getNumInstructions() const {
    assert(IC.isVariable() && "only valid for variable cost");
    return NumInstructions;
  }

  /// Benefit-to-cost ratio. "Never" maps to -infinity; "always" and
  /// non-positive costs map to +infinity.
  double getPriority() const {
    constexpr double Inf = std::numeric_limits<double>::infinity();
    if (isNever())
      return -Inf;
    // The cost is only queried when this is not an always-inline decision.
    const bool Unbounded = isAlways() || getCost() <= 0;
    if (Unbounded)
      return Inf;
    return static_cast<double>(getBenefit()) / getCost();
  }

  /// Component-wise accumulation; both operands must be variable costs.
  CostBenefit &operator+=(const CostBenefit &RHS) {
    assert(isVariable() && "Doesn't make sense for non-variable!");
    assert(RHS.isVariable() && "Doesn't make sense to add non-variable!");
    const int SumCost = IC.getCost() + RHS.IC.getCost();
    const int SumThreshold = IC.getThreshold() + RHS.IC.getThreshold();
    IC = InlineCost::get(SumCost, SumThreshold);
    NumInstructions += RHS.NumInstructions;
    return *this;
  }

  /// Component-wise subtraction; both operands must be variable costs and the
  /// instruction count must not underflow.
  CostBenefit &operator-=(const CostBenefit &RHS) {
    assert(isVariable() && "Doesn't make sense for non-variable!");
    assert(RHS.isVariable() && "Doesn't make sense to sub non-variable!");
    const int DiffCost = IC.getCost() - RHS.IC.getCost();
    const int DiffThreshold = IC.getThreshold() - RHS.IC.getThreshold();
    IC = InlineCost::get(DiffCost, DiffThreshold);
    assert(NumInstructions >= RHS.NumInstructions &&
           "NumInstructions can't become negative!");
    NumInstructions -= RHS.NumInstructions;
    return *this;
  }

  void print(raw_ostream &OS) const;
};

// Forward declarations: these three classes refer to each other, so the owning
// pointer aliases below must be available before any of them is defined.
class ClusterCallSite;
class MethodCallSites;
class CandidateCluster;

// Owning pointer aliases built on azul::UniqueBumpPtrAllocatorPtr, which is
// declared outside this file. Objects behind them are created with
// azul::make_unique_bump_ptr_alloc elsewhere in this header.
using ClusterCallSiteUniquePtrT =
    azul::UniqueBumpPtrAllocatorPtr<ClusterCallSite>;
using MethodCallSitesUniquePtrT =
    azul::UniqueBumpPtrAllocatorPtr<MethodCallSites>;
using CandidateClusterUniquePtrT =
    azul::UniqueBumpPtrAllocatorPtr<CandidateCluster>;

/// A value together with the place in a cluster where it appears.
///
/// The cluster context is a pointer to the MethodCallSites of the
/// corresponding ClusterCallSite. The same 'Value *' can show up in several
/// contexts of one cluster if the cluster inlines the parent function of the
/// value more than once, hence identity is the (value, context) pair.
template <class T> struct ClusterValue {
  T *V;
  MethodCallSites *ClusterContext;

  ClusterValue(T *V, MethodCallSites *ClusterContext)
      : V(V), ClusterContext(ClusterContext) {}

  /// Converting constructor: allowed whenever `OtherT *` converts to `T *`.
  template <class OtherT>
  ClusterValue(const ClusterValue<OtherT> &Other)
      : V(Other.V), ClusterContext(Other.ClusterContext) {}

  /// Lexicographic order on (V, ClusterContext).
  bool operator<(const ClusterValue &Other) const {
    if (V != Other.V)
      return V < Other.V;
    return ClusterContext < Other.ClusterContext;
  }

  /// Equal only if both the value and the context match.
  bool operator==(const ClusterValue &Other) const {
    return ClusterContext == Other.ClusterContext && V == Other.V;
  }

  bool operator!=(const ClusterValue &Other) const {
    return V != Other.V || ClusterContext != Other.ClusterContext;
  }
};

/// Streams a ClusterValue as "<value> at <context pointer>".
template <class T>
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
                              const ClusterValue<T> &CV) {
  return OS << *CV.V << " at " << CV.ClusterContext;
}

class MethodCallSites {
  BumpPtrAllocator &Allocator;
  
  SmallDenseMap<AssertingVH<CallBase>, ClusterCallSiteUniquePtrT, 1>
      CallSites;

  // Holds a pointer to a parent function call and its MethodCallSites that
  // belongs to the corresponding ClusterCallSite.
  // This field is empty for MethodCallSites that represent callsites in a
  // top-level function.
  std::optional<ClusterValue<CallBase>> Parent;

  static ClusterCallSiteUniquePtrT &
  getClusterCallSite(decltype(CallSites)::iterator::value_type &It) {
     return It.second;
  }

  static const ClusterCallSiteUniquePtrT &
  getClusterCallSiteConst(const decltype(CallSites)::iterator::value_type &It) {
     return It.second;
  }

  ClusterCallSiteUniquePtrT &getCallSite(CallBase *Call) {
    assert(contains(Call));
    auto It = CallSites.find(Call);
    assert(It != CallSites.end());
    return It->second;
  }

public:
  MethodCallSites(BumpPtrAllocator &Allocator) : Allocator(Allocator) {}

  MethodCallSites(MethodCallSites &Other);

  MethodCallSites &operator=(const MethodCallSites &Other);

  using iterator = mapped_iterator<decltype(CallSites)::iterator,
                                   decltype(&getClusterCallSite)>;
  using const_iterator = mapped_iterator<decltype(CallSites)::const_iterator,
                                         decltype(&getClusterCallSiteConst)>;

  auto begin() { return iterator(CallSites.begin(), &getClusterCallSite); }
  auto end() { return iterator(CallSites.end(), &getClusterCallSite); }

  auto begin() const {
    return const_iterator(CallSites.begin(), &getClusterCallSiteConst);
  }
  auto end() const {
    return const_iterator(CallSites.end(), &getClusterCallSiteConst);
  }

  unsigned size() const { return CallSites.size(); }

  ClusterCallSite &operator[](CallBase *Call) {
    return *getCallSite(Call);
  }

  const ClusterCallSite &operator[](CallBase *Call) const {
    return const_cast<MethodCallSites *>(this)->operator[](Call);
  }

  ClusterCallSite &try_emplace(ClusterCallSiteUniquePtrT &&CCS);

  ClusterCallSite &try_emplace(CallBase *Call, CostBenefit CB = CostBenefit(),
                               std::optional<double> Frequency = std::nullopt,
                               bool InitializeCallSites = true);

  // Gets the callsite for the given call instruction and erases it from this
  // MethodCallSites transferring ownership of the callsite to the caller of
  // this function.
  ClusterCallSiteUniquePtrT extractCallSite(CallBase *Call) {
    auto CallSite = std::move(getCallSite(Call));
    erase(Call);
    return CallSite;
  }

  void registerNestedCallSites(ClusterCallSite &CS);

  std::optional<ClusterValue<CallBase>> getParent() const { return Parent; }

  void getClusterStack(SmallVectorImpl<CallBase *> &Result) const {
    if (!Parent.has_value())
      return;
    // Recursion here is limited by MaxClusterDepth.
    Parent->ClusterContext->getClusterStack(Result);
    Result.push_back(Parent->V);
  }

  // Moves all call sites into the given vector. Should be used before inlining
  // the call sites. This way the asserting value handles that key the
  // underlying map will not be triggered once the call sites are inlined.
  void
  extractCallSites(SmallVectorImpl<ClusterCallSiteUniquePtrT> &Result) {
    for (auto &It : CallSites)
      Result.emplace_back(std::move(It.second));
    CallSites.clear();
  }

  bool contains(CallBase *Call) const { return CallSites.count(Call); }

  void erase(CallBase *Call) { CallSites.erase(Call); }

#ifndef NDEBUG
  void verify(Function *TopLevelFunction, Function *Caller,
              unsigned Depth) const;
#endif
};

/// Represents a single call site in a CandidateCluster.
class ClusterCallSite {
  BumpPtrAllocator &Allocator;

  // As we inline other call sites in the worklist, the pointer to the call
  // should survive. According to the documentation of InlineFunction, it is
  // not allowed to modify existing instructions in the caller function.
  // Wrap the call into poisoning value handle, so as to catch inadvertent
  // modifications.
  PoisoningVH<CallBase> CallVH;

  // The cost/benefit contribution of this particular call site into the whole
  // cluster.
  CostBenefit CB;

  // The frequency of the call site at the moment of enqueueing. Used only in
  // getCallSiteRemark.
  std::optional<double> Frequency;

  MethodCallSitesUniquePtrT CallSites;

public:
  ClusterCallSite(BumpPtrAllocator &Allocator, CallBase *Call, CostBenefit CB,
                  std::optional<double> Frequency, bool InitializeCallSites)
      : Allocator(Allocator), CallVH(Call), CB(CB), Frequency(Frequency) {
    if (InitializeCallSites)
      CallSites = azul::make_unique_bump_ptr_alloc<MethodCallSites>(Allocator,
                                                                    Allocator);
  }
  ClusterCallSite(const ClusterCallSite &Other) : Allocator(Other.Allocator) {
    CallVH = Other.CallVH;
    CB = Other.CB;
    Frequency = Other.Frequency;
    if (Other.CallSites) {
      // Deep copy the callsites from the other ClusterCallSite.
      MethodCallSites &OtherMCS = *Other.CallSites.get();
      CallSites = azul::make_unique_bump_ptr_alloc<MethodCallSites>(Allocator,
                                                                    OtherMCS);
    }
  }
  ClusterCallSite(ClusterCallSite &&Other) = default;
  ClusterCallSite &operator=(ClusterCallSite &&Other) = default;

  CallBase *getCallInstruction() const {
    return CallVH;
  }

  const CostBenefit &getCostBenefit() const {
    return CB;
  }

  MethodCallSites *getCallSites() {
    return CallSites.get();
  }

  std::string getCallSiteRemark(const std::string &Msg = std::string()) const;
  static std::string getCallSiteRemark(const InlineCost &IC,
                                       std::optional<double> Frequency);

  void setCallInstruction(CallBase *Call) {
    CallVH = Call;
  }

  void setCostBenefit(CostBenefit &NewCB) {
    CB = NewCB;
  }

  void print(raw_ostream &OS, unsigned Indent = 0) const;
#ifndef NDEBUG
  void verify(Function *TopLevelFunction, unsigned Depth) const;
#endif
};

/// Represents a unit of work for InlineCostBasedWorklist. This unit of work 
/// can be either a single call site or a cluster of call sites which need to
/// be inlined together.
class CandidateCluster {
  BumpPtrAllocator &Allocator;
  
  // Cost/Benefit for inlining of this cluster.
  CostBenefit CB;

  // Just an ordinal number that helps ordering candidates in the absence
  // of other information.
  unsigned Ordinal;

  std::string Remark;

  MethodCallSitesUniquePtrT CallSites;

  /// These methods are hidden to draw user's attention to the fact that the
  /// presence of an ordinal number restricts how a copy can be used.
  CandidateCluster(const CandidateCluster &Other)
      : Allocator(Other.Allocator), CB(Other.CB) {
    // Deep copy the callsites from the other cluster.
    MethodCallSites &OtherMCS = *Other.CallSites.get();
    CallSites =
        azul::make_unique_bump_ptr_alloc<MethodCallSites>(Allocator, OtherMCS);
  }

  CandidateCluster &operator=(const CandidateCluster &Other) {
    CB = Other.CB;
    // Deep copy the callsites from the other cluster.
    MethodCallSites &OtherMCS = *Other.CallSites.get();
    CallSites =
        azul::make_unique_bump_ptr_alloc<MethodCallSites>(Allocator, OtherMCS);
    return *this;
  }

  // Make the allocation function friend so it has access to the hidden copy
  // constructor.
  friend CandidateClusterUniquePtrT
  azul::make_unique_bump_ptr_alloc<CandidateCluster>(llvm::BumpPtrAllocator &,
                                                     const CandidateCluster &);

public:
  CandidateCluster(BumpPtrAllocator &Allocator, unsigned Ordinal,
                   std::string Remark = std::string())
      : Allocator(Allocator), Ordinal(Ordinal), Remark(Remark),
        CallSites(azul::make_unique_bump_ptr_alloc<MethodCallSites>(
            Allocator, Allocator)) {}

  /// Single call cluster constructor.
  CandidateCluster(BumpPtrAllocator &Allocator, CallBase *Call, CostBenefit CB,
                   unsigned Ordinal,
                   std::optional<double> Frequency = std::nullopt)
      : Allocator(Allocator), CB(CB), Ordinal(Ordinal), Remark(std::string()),
        CallSites(azul::make_unique_bump_ptr_alloc<MethodCallSites>(
            Allocator, Allocator)) {
    CallSites->try_emplace(Call, CB, Frequency,
                           /*InitializeCallSites = */ false);
    assert(!CB.isNever() && "Should not be enqueued!");
  }

  CandidateCluster(CandidateCluster &&) = default;
  CandidateCluster &operator=(CandidateCluster &&) = default;

  /// Thanks to the different ordinal number, the cluster returned by this
  /// function can be enqueued alongside the original cluster.
  CandidateClusterUniquePtrT
  duplicate(unsigned NewOrdinal, std::string NewRemark = std::string()) const {
    assert(Ordinal != NewOrdinal);
    // Create a deep copy of this cluster.
    const CandidateCluster &ThisCluster = *this;
    auto Copy = azul::make_unique_bump_ptr_alloc<CandidateCluster>(Allocator,
                                                                   ThisCluster);
    Copy->Ordinal = NewOrdinal;
    Copy->Remark = std::move(NewRemark);
    return Copy;
  }

  /// Comparing two candidate call-sites - return true if \p this call-site
  /// is smaller in priority that \p Other call-site.
  bool operator<(const CandidateCluster &Other) const;

  const CostBenefit &getCostBenefit() const {
    return CB;
  }

  CostBenefit &getCostBenefit() {
    return CB;
  }

  bool isNeverInline() const { return CB.isNever(); }

  bool isAlwaysInline() const {
    return CB.isAlways();
  }

  std::string getClusterRemark(const std::string &Msg = std::string()) const {
    std::string Result;    
    if (!Remark.empty()) {
      Result.append(Remark).append("; ");
      raw_string_ostream ResultOS(Result);
      CB.print(ResultOS);
      ResultOS.str();
    }
    if (!Msg.empty())
      Result.append("; ").append(Msg);
    return Result;
  }

  void print(raw_ostream &OS) const;

  double getPriority() const {
    assert(!CB.isNever() && "Should not be enqueued!");
    return CB.getPriority();
  }

  unsigned getOrdinal() const {
    return Ordinal;
  }

  unsigned getNumCallSites() const {
    return CallSites->size();
  }

  auto callsites_begin() const {
    return CallSites->begin();
  }

  auto callsites_end() const {
    return CallSites->end();
  }

  auto callsites() const {
    return make_range(CallSites->begin(), CallSites->end());
  }

  auto callsites() {
    return make_range(CallSites->begin(), CallSites->end());
  }

  MethodCallSites &getCallSites() {
    return *CallSites.get();
  }

  void callSiteInlined(
      ClusterCallSite &InlinedCallSite, InlineFunctionInfo *IFI,
      function_ref<CostBenefit(CallBase *, const CostBenefit &CB)>
        GetCostBenefit);

  ClusterCallSite &operator[](CallBase *Call) { return (*CallSites)[Call]; }

  const ClusterCallSite &operator[](CallBase *Call) const {
    return const_cast<CandidateCluster *>(this)->operator[](Call);
  }

  ClusterCallSiteUniquePtrT extractCallSite(CallBase *Call) {
    return CallSites->extractCallSite(Call);
  }

  bool contains(CallBase *Call) const { return CallSites->contains(Call); }

  void erase(CallBase *Call) { CallSites->erase(Call); }

  void addCostBenefit(CostBenefit AddCB) {
    CB += AddCB;
  }

#ifndef NDEBUG
  void verify(Function *TopLevelFunction) const;
#endif
};

/// A priority queue for CandidateClusters. It also maintains a mapping
/// between CallBase instructions and the lists of clusters that contain
/// this instruction.
class PriorityQueue {
  BumpPtrAllocator &Allocator;

  /// We use std::set as a priority queue. The default ordering in std::set
  /// is ascending, i.e. begin() points to the smallest element. We want the
  /// descending order here, i.e. begin() points to the best cluster. Use a
  /// custom comparator to inverse the ordering of an std::set.
  ///
  /// The inversion is done by swapping the operands rather than negating the
  /// result: `!(A < B)` means `A >= B`, which is true for equal elements and
  /// therefore is not a strict weak ordering as std::set requires.
  std::function<bool(const CandidateClusterUniquePtrT &A,
                     const CandidateClusterUniquePtrT &B)>
      QueuePred =
          [](const CandidateClusterUniquePtrT &A,
             const CandidateClusterUniquePtrT &B) { return *B < *A; };

  using WorklistTy =
      std::set<CandidateClusterUniquePtrT, decltype(QueuePred)>;
  WorklistTy Worklist;

  /// The mapping between a CallBase instruction and the list of clusters that
  /// contain this instruction. We store the worklist iterator here so we can
  /// modify the worklist, e.g. remove the clusters.
  DenseMap<AssertingVH<CallBase>, SmallVector<WorklistTy::iterator, 4>> CallMap;

  Function *CallerFunction;

public:
  PriorityQueue(BumpPtrAllocator &Allocator, Function *CallerFunction)
      : Allocator(Allocator), Worklist(QueuePred),
        CallerFunction(CallerFunction) {}

  /// Drops all clusters and the call-to-cluster mapping.
  void reset() {
    Worklist.clear();
    CallMap.shrink_and_clear();
  }

  bool empty() const {
    return Worklist.empty();
  }

  auto begin() const {
    return Worklist.begin();
  }

  auto end() const {
    return Worklist.end();
  }

  /// True if at least one queued cluster is registered for \p Call.
  bool contains(CallBase *Call) const {
    return CallMap.count(Call);
  }

  /// Removes the best cluster (the one at begin()) from the queue and returns
  /// it. The queue must not be empty.
  CandidateClusterUniquePtrT pop() {
    assert(!Worklist.empty() && "Can't pop from an empty queue!");
    auto TopIt = Worklist.begin();
    // Unregister first: the cluster is still owned by the worklist here.
    forgetCluster(TopIt);
    // Transfer the ownership from the Worklist. Set elements are const, hence
    // the const_cast; the moved-from element is erased right away.
    CandidateClusterUniquePtrT Cluster =
        std::move(const_cast<CandidateClusterUniquePtrT &>(*TopIt));
    Worklist.erase(TopIt);
#ifndef NDEBUG
    verify();
#endif
    return Cluster;
  }

  /// Enqueues an already allocated cluster and registers it in the
  /// call-to-cluster mapping.
  void emplace(CandidateClusterUniquePtrT &&Cluster) {
    auto Inserted = Worklist.emplace(std::move(Cluster));
    assert(Inserted.second && "Must insert!");
    registerCluster(Inserted.first);
#ifndef NDEBUG
    verify();
#endif
  }

  /// Allocates a CandidateCluster from the given constructor arguments and
  /// enqueues it.
  template <class... ArgTys>
  void emplace(ArgTys&&... Args) {
    auto Cluster = azul::make_unique_bump_ptr_alloc<CandidateCluster>(
        Allocator, std::forward<ArgTys>(Args)...);
    emplace(std::move(Cluster));
  }

  /// Given the CallBase instruction, extracts all clusters that contain this
  /// call. These clusters are removed from the worklist and returned to the
  /// caller. For every returned cluster the ClusterCallSite corresponding to
  /// the call instruction is extracted from the cluster and returned in a
  /// separate field of a pair.
  ///
  /// To be used before the call is inlined, so as to invalidate the clusters
  /// that contain the call.
  SmallVector<std::pair<CandidateClusterUniquePtrT,
                        ClusterCallSiteUniquePtrT>,
              4>
  extractClusters(CallBase *Call);

  /// Prints every queued cluster in queue order.
  void print(raw_ostream &OS) const {
    for (auto &C : Worklist)
      C->print(OS);
  }

  /// Prints the clusters registered for \p Call; prints nothing if the call is
  /// not known to the queue.
  void printClustersForCall(CallBase *Call, raw_ostream &OS) const {
    auto CallToClusterIt = CallMap.find(Call);
    if (CallToClusterIt == CallMap.end())
      return;
    auto &Clusters = CallToClusterIt->second;
    for (auto &ClusterIt : Clusters) {
      const CandidateClusterUniquePtrT &Cluster = *ClusterIt;
      Cluster->print(OS);
    }
  }

private:
  void registerCluster(WorklistTy::iterator ClusterIt);
  void forgetCluster(WorklistTy::iterator ClusterIt);
  void verify();
};

/// Module pass that holds a reference to an output stream.
/// NOTE(review): judging by the name it presumably prints the inliner's
/// priority worklist to that stream; run() is defined out of line, so confirm
/// there.
class PriorityWorklistPrinter : public PassInfoMixin<PriorityWorklistPrinter> {
  // Stream supplied at construction; stored by reference, so it must outlive
  // the pass.
  raw_ostream &OS;

public:
  explicit PriorityWorklistPrinter(raw_ostream &OS) : OS(OS) {}
  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};

/// Function analysis whose result is a cache of escape tracking results keyed
/// by value. run() itself computes nothing: it hands out an empty cache.
class OrcaInlinerEscapeTrackerCache
    : public AnalysisInfoMixin<OrcaInlinerEscapeTrackerCache> {
public:
  /// Outcome of escape tracking for one value.
  struct EscapeTrackerResult {
    bool TooManyUses = false;
    bool EscapesThroughUnhandledUse = false;
    SmallVector<const Use *, 4> EscapeUses;
    SmallVector<const Use *, 4> MonitorUses;

    /// Member-wise equality.
    bool operator==(const EscapeTrackerResult &Other) const {
      // Compare the cheap flags first, then the use lists.
      if (TooManyUses != Other.TooManyUses ||
          EscapesThroughUnhandledUse != Other.EscapesThroughUnhandledUse)
        return false;
      return EscapeUses == Other.EscapeUses && MonitorUses == Other.MonitorUses;
    }
  };

  /// The analysis result: a cache of EscapeTrackerResults keyed by value.
  struct Result {
    DenseMap<Value *, EscapeTrackerResult> ResultCache;

    /// Instead of returning a reference to the EscapeTrackerResult in
    /// ResultCache, return a copy. This way we don't need to worry about
    /// reference stability of the underlying cache data structure.
    /// Hopefully the returned EscapeTrackerResult is rather small.
    EscapeTrackerResult getEscapeTrackerResult(Value *V, const DataLayout &DL);
  };

  /// Returns a fresh, empty cache; neither argument is used.
  Result run(Function &F, FunctionAnalysisManager &FAM) { return Result(); }

private:
  friend AnalysisInfoMixin<OrcaInlinerEscapeTrackerCache>;
  static AnalysisKey Key;
};

// It's a simple adapter for OrcaNestedCandidateInliner that turns it into a
// module pass that can be used in inliner simplification PMs.
// The per-function work lives in runOnOneFunction (defined out of line);
// run() only dispatches through runWithInlinerStackSupport.
class OrcaNestedCandidateInlinerPass :
    public PassInfoMixin<OrcaNestedCandidateInlinerPass> {
  // Pass managers taken by value in the constructor and owned by this pass.
  ModulePassManager SimplificationPM;
  // Empty optional when no custom pass manager was supplied.
  std::optional<ModulePassManager> MaybeCustomInlineDevirtPM;

  // Not owned; stored as passed to the constructor.
  azul::orca::OrcaPipeline *Pipeline = nullptr;

public:
  OrcaNestedCandidateInlinerPass(
      ModulePassManager SimplificationPM,
      std::optional<ModulePassManager> MaybeCustomInlineDevirtPM,
      azul::orca::OrcaPipeline *Pipeline)
      : SimplificationPM(std::move(SimplificationPM)),
        MaybeCustomInlineDevirtPM(std::move(MaybeCustomInlineDevirtPM)),
        Pipeline(Pipeline) {}

  PreservedAnalyses runOnOneFunction(Function &F, ModuleAnalysisManager &MAM);

  // Module-pass entry point.
  // NOTE(review): MAM is passed twice to the helper; the helper is declared
  // elsewhere, so confirm that both parameters are meant to receive it.
  PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM) {
    return runWithInlinerStackSupport(*this, M, MAM, MAM);
  }
};

// It's a simple adapter for OrcaCandidateInliner that turns it into a module
// pass that can be used in inliner simplification PMs.
// The per-function work lives in runOnOneFunction (defined out of line);
// run() only dispatches through runWithInlinerStackSupport.
class OrcaCandidateInlinerPass
    : public PassInfoMixin<OrcaCandidateInlinerPass> {
  // Pass managers taken by value in the constructor and owned by this pass.
  ModulePassManager SimplificationPM;
  // Empty optional when no custom pass manager was supplied.
  std::optional<ModulePassManager> MaybeCustomInlineDevirtPM;
  ModulePassManager NestedCandidateInlinerPM;

  // Not owned; stored as passed to the constructor.
  azul::orca::OrcaPipeline *Pipeline = nullptr;

public:
  OrcaCandidateInlinerPass(
      ModulePassManager SimplificationPM,
      std::optional<ModulePassManager> MaybeCustomInlineDevirtPM,
      ModulePassManager NestedCandidateInlinerPM,
      azul::orca::OrcaPipeline *Pipeline)
      : SimplificationPM(std::move(SimplificationPM)),
        MaybeCustomInlineDevirtPM(std::move(MaybeCustomInlineDevirtPM)),
        NestedCandidateInlinerPM(std::move(NestedCandidateInlinerPM)),
        Pipeline(Pipeline) {}

  PreservedAnalyses runOnOneFunction(Function &F, ModuleAnalysisManager &MAM);

  // Module-pass entry point.
  // NOTE(review): MAM is passed twice to the helper; the helper is declared
  // elsewhere, so confirm that both parameters are meant to receive it.
  PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM) {
    return runWithInlinerStackSupport(*this, M, MAM, MAM);
  }

  // OrcaCandidateInlinerPass is an auxiliary pass for
  // OrcaInlinerPassManager.
  // It is created in the constructor of OrcaInlinerPassManager, is added to
  // the candidate simplification pipeline and needs some settings to work.
  // Right now we don't want to launch it separately.
  // Let's hide it from PassBuilder and print only SimplificationPM which
  // can be used as CandidateSimplificationPM in OrcaInlinerPassManager
  // constructor.
  void printPipeline(raw_ostream &OS,
                     function_ref<StringRef(StringRef)> MapClassName2PassName) {
    SimplificationPM.printPipeline(OS, MapClassName2PassName);
  }
};

/// A bundle of inliner settings. Every option listed in INLINER_OPTIONS_DO
/// becomes a private field with a chainable setter and a getter (see
/// DECLARE_INLINER_OPTION below).
class InlinerOptions {

// X-macro list of the options. Each entry is
//   (field type, option name, command-line option, Orca feature id).
// Neither macro is #undef'ed in this header.
#define INLINER_OPTIONS_DO(MACRO)                                              \
  MACRO(bool, CollectAvailableDefs, ::CollectAvailableDefs,                    \
        orca::OrcaFeatureID::InlinerCollectAvailableDefs)                      \
  MACRO(azul::orca::SimplifyWithInlining, SimplifyCandidatesWithInlining,      \
        ::SimplifyCandidatesWithInlining,                                      \
        orca::OrcaFeatureID::SimplifyCandidatesWithInlining)                   \
  MACRO(double, IRSizeCap, ::OrcaInlinerIRSizeCap,                             \
        orca::OrcaFeatureID::InlinerIRSizeCap)                                 \
  MACRO(unsigned, IRSizeThresholdPower, ::OrcalInlinerIRSizeThresholdPower,    \
        orca::OrcaFeatureID::InlinerIRSizeThresholdPower)                      \
  MACRO(unsigned, IRSizeThresholdMul, ::OrcaInlinerIRSizeThresholdMul,         \
        orca::OrcaFeatureID::InlinerIRSizeThresholdMul)                        \
  MACRO(unsigned, InlinerMaxIterations, ::OrcaInlinerMaxIterations,            \
        orca::OrcaFeatureID::InlinerMaxIterations)                             \
  MACRO(bool, NestedSimplifyWithInlining, ::NestedSimplifyWithInlining,        \
        orca::OrcaFeatureID::NestedSimplifyWithInlining)                       \
  MACRO(int, AzulInlineThreshold, ::AzulInlineThreshold,                       \
        orca::OrcaFeatureID::AzulInlineThreshold)

// For one option declares:
//   - a private field `Name` of type `Type`;
//   - `set<Name>(std::optional<Type>)`, which overwrites the field only when
//     the optional holds a value and returns *this for chaining;
//   - `get<Name>()`, which returns the field by value.
// DefaultValue and OrcaFeatureID are not used by this particular expansion.
#define DECLARE_INLINER_OPTION(Type, Name, DefaultValue, OrcaFeatureID)        \
private:                                                                       \
  Type Name;                                                                   \
                                                                               \
public:                                                                        \
  InlinerOptions &set##Name(std::optional<Type> Opt) {                         \
    if (Opt)                                                                   \
      this->Name = *Opt;                                                       \
    return *this;                                                              \
  }                                                                            \
  Type get##Name() const { return this->Name; }

  INLINER_OPTIONS_DO(DECLARE_INLINER_OPTION)

public:
  // Both constructors are defined out of line. The fields above have no
  // in-class initializers, so the constructors are presumably responsible for
  // initializing every option — verify against the definitions.
  InlinerOptions();

  explicit InlinerOptions(azul::orca::OrcaPipeline *Pipeline);
};

class OrcaInlinerWorklist;

/// Common base class of the inliners.
///
/// Holds the state shared by every inliner (module, analysis managers,
/// options, allocator, accumulated PreservedAnalyses) and the candidate
/// bookkeeping helpers. The inlining policy itself is supplied by subclasses
/// through the pure virtual methods at the bottom of the class.
class OrcaInliner {
  // Depth this inliner was constructed with. It is recorded on a candidate by
  // markSimplified() and compared against in isSimplified().
  // NOTE(review): presumably the nesting level of this inliner -- confirm.
  const unsigned Depth;

protected:
  /// Stores the arguments and looks up the analysis handles used later.
  /// FAM and PI are obtained with getResult; PSI is obtained with
  /// getCachedResult, i.e. ProfileSummaryAnalysis is not computed here and
  /// PSI is null when no cached result exists for \p M.
  /// Opts is built from \p Pipeline.
  OrcaInliner(unsigned Depth, azul::orca::OrcaPipeline *Pipeline, Module &M,
              ModuleAnalysisManager &AM)
      : Depth(Depth),
        Pipeline(Pipeline), M(M), AM(AM),
        FAM(AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager()),
        PI(AM.getResult<PassInstrumentationAnalysis>(M)),
        PSI(AM.getCachedResult<ProfileSummaryAnalysis>(M)),
        Opts(Pipeline) {}

public:
  virtual ~OrcaInliner() = default;

  // Entry points; defined out of line.
  void run();

  void runOnFunction(Function &F);

  /// Returns a copy of the accumulated ResultPA (initially "all preserved").
  PreservedAnalyses getPreservedAnalyses() {
    return ResultPA;
  }

  // The accessors below simply forward to the corresponding InlinerOptions
  // getter on Opts.
  azul::orca::SimplifyWithInlining getSimplifyCandidatesWithInlining() const {
    return Opts.getSimplifyCandidatesWithInlining();
  }

  double getIRSizeCap() const {
    return Opts.getIRSizeCap();
  }

  unsigned getIRSizeThresholdPower() const {
    return Opts.getIRSizeThresholdPower();
  }

  unsigned getIRSizeThresholdMul() const {
    return Opts.getIRSizeThresholdMul();
  }

  unsigned getInlinerMaxIterations() const {
    return Opts.getInlinerMaxIterations();
  }

  bool getNestedSimplifyWithInlining() const {
    return Opts.getNestedSimplifyWithInlining();
  }

  int getAzulInlineThreshold() const {
    return Opts.getAzulInlineThreshold();
  }

  FunctionAnalysisManager &getFunctionAnalysisManager() const {
    return FAM;
  }
  /// May return null, see the constructor.
  ProfileSummaryInfo *getProfileSummaryInfo() const {
    return PSI;
  }

  /// Returns a callable mapping a function to its BlockFrequencyInfo, obtained
  /// from FAM on each call. The lambda refers back to this inliner, so it
  /// must not be used after the inliner is destroyed.
  auto getBFI() {
    return [&](Function &F) -> BlockFrequencyInfo & {
      return FAM.getResult<BlockFrequencyAnalysis>(F);
    };
  }

  /// Forwards \p PA to the module analysis manager's invalidate() for M.
  void invalidateModuleAnalyses(const PreservedAnalyses &PA) {
    AM.invalidate(M, PA);
  }

  /// Delegates to azul::Utils::getCallSiteFrequency, passing the getBFI()
  /// callable as the source of block frequencies.
  std::optional<double> getCallSiteFrequency(CallBase &Call) {
    return azul::Utils::getCallSiteFrequency(Call, {getBFI()});
  }

  /// Allocator owned by this inliner (handed e.g. to InlineCostBasedWorklist's
  /// priority queue).
  BumpPtrAllocator &getAllocator() { return Allocator; }

  /// If orca-inliner-use-vm-inlining-advice is enabled, asks VM's
  /// advice on inlining decision for the given call site.
  static std::optional<bool> getInliningAdvice(const CallBase *Call);

  /// Computes the inline cost of \p Call; defined out of line.
  InlineCost getInlineCost(CallBase *Call,
      const azul::AvailableDefTracker &DefTracker,
      InlineParams &Params);

  /// Generates the inlining candidate for the call.
  void generateCandidate(CallBase *Call);

  // Simplifications to be done on a candidate before attempting to inline
  void simplifyCandidate(Function *Candidate);

  /// True iff the candidate's recorded state is Simplified and the depth
  /// recorded when it was simplified does not exceed this inliner's Depth.
  bool isSimplified(Function *Candidate) const {
    return getFunctionState(Candidate) == FunctionState::Simplified &&
           getFunctionSimplifiedAtDepth(Candidate) <= Depth;
  }

  // Not marking it as `const` because the fact it doesn't modify the state of
  // this class is an implementation detail. So it makes sense to prohibit such
  // modification in read-only contexts.
  /// Records the Simplified state and this inliner's Depth on the candidate.
  void markSimplified(Function *Candidate) {
    setFunctionState(Candidate, FunctionState::Simplified);
    setFunctionSimplifiedAtDepth(Candidate, Depth);
  }

  // Returns whether the function was simplified after the last call was inlined
  // into it. The flag is stored as a function attribute named by
  // azul::FunctionSimplifiedAfterInliningAttributeName; the three helpers
  // below query, set and clear that attribute.
  static bool isFunctionSimplifiedAfterInlining(Function *F) {
    return F->hasFnAttribute(
        azul::FunctionSimplifiedAfterInliningAttributeName);
  }
  static void markFunctionSimplifiedAfterInlining(Function *F) {
    F->addFnAttr(azul::FunctionSimplifiedAfterInliningAttributeName);
  }
  static void dropFunctionSimplifiedAfterInlining(Function *F) {
    F->removeFnAttr(azul::FunctionSimplifiedAfterInliningAttributeName);
  }

  /// Per-function candidate state. NumFunctionStates is not a state itself;
  /// its value (3) equals the number of enumerators preceding it.
  enum class FunctionState : unsigned {
    None = 0, Generated, Simplified, NumFunctionStates
  };

protected:
  azul::orca::OrcaPipeline *Pipeline;

  Module &M;

  ModuleAnalysisManager &AM;
  FunctionAnalysisManager &FAM;
  PassInstrumentation &PI;
  // Null when ProfileSummaryAnalysis was not cached at construction time.
  ProfileSummaryInfo *PSI;

  // Starts out as "everything preserved".
  PreservedAnalyses ResultPA = PreservedAnalyses::all();

  const InlinerOptions Opts;

  BumpPtrAllocator Allocator;

  // Storage accessors behind isSimplified()/markSimplified(); defined out of
  // line. NOTE(review): where the state is stored is not visible here.
  static FunctionState getFunctionState(Function *F);
  static unsigned getFunctionSimplifiedAtDepth(Function *F);
  static void setFunctionState(Function *F, FunctionState State);
  static void setFunctionSimplifiedAtDepth(Function *F, unsigned Depth);

  // NOTE(review): the return value is presumably the number of devirtualized
  // calls -- confirm against the definition.
  unsigned devirtualizeCalls(Function &F, OrcaInlinerWorklist &Worklist,
                             const azul::AvailableDefTracker &DefTracker);

  /// Returns true if a change was made.
  bool simplifyFunction(Function &F, ModulePassManager *PM);

public:
  // Inlining policy methods. The policy depends on the current depth.

  /// The worklist responsible for prioritization and inlining heuristics.
  virtual std::unique_ptr<OrcaInlinerWorklist> createWorklist(Function *F) = 0;

  /// Passes we will use to simplify the inline candidates. This may or may not
  /// include other inliners.
  virtual ModulePassManager &getCandidateSimplificationPM() = 0;

  /// Should the given call site be considered for generation/simplification/
  /// inlining?
  virtual bool shouldConsider(CallBase *Call) = 0;

  /// Simplification passes to run on function after all inlining into this
  /// function is done.
  virtual ModulePassManager *getPostInliningPM() = 0;

  /// Simplification passes to run on function during inline-devirtualize
  /// iteration. The purpose of this pipeline is to uncover new inline and
  /// devirtualize opportunities after inlining.
  virtual ModulePassManager *getInlineDevirtualizePM() = 0;

  // Maximum devirtualize-inline iterations to perform.
  virtual unsigned getMaxIterations() = 0;

  // Should available defs be used to compute InlineCost?
  virtual bool useAvailableDefs() = 0;
};

/// The abstract interface for the inliner worklist.
///
/// Concrete implementation of this class knows how to prioritize the call
/// sites and decides what to inline and what not. Note, prioritization
/// and inlining heuristics are usually tightly coupled, that's why these
/// aspects are managed by the same entity.
class OrcaInlinerWorklist {
protected:
  /// The inliner this worklist was created for.
  OrcaInliner &Inliner;
  /// The function passed at construction.
  /// NOTE(review): presumably the caller the call sites are inlined into --
  /// confirm against the implementation.
  Function *CallerFunction;

  /// Shared helper for implementations; defined out of line.
  /// NOTE(review): presumably returns true when \p Call was inlined and
  /// updates \p IRSizeEstimate accordingly -- confirm.
  bool inlineCallIfPossible(CallBase *Call,
    std::string Remark, unsigned &IRSizeEstimate,
    azul::AvailableDefTracker &DefTracker, InlineFunctionInfo &IFI);

public:
  /// Generates and simplifies the callee. Returns a pair:
  ///   first - whether the call should be considered for inlining.
  ///   second - the message if the call should not be considered for inlining.
  std::pair<bool, std::string> prepareCallee(CallBase *Call);

  /// Binds the worklist to \p Inliner and remembers \p F as CallerFunction.
  OrcaInlinerWorklist(OrcaInliner &Inliner, Function *F) :
    Inliner(Inliner), CallerFunction(F) {}

  // Defaulted (rather than an empty body) to match OrcaInliner's destructor.
  virtual ~OrcaInlinerWorklist() = default;

  /// Returns true if the worklist holds no enqueued work.
  virtual bool empty() = 0;

  /// Resets the worklist. NOTE(review): presumably drops all enqueued call
  /// sites -- confirm against the implementations.
  virtual void reset() = 0;

  /// Initialize the worklist for the given function.
  /// Returns true if at least one call site was enqueued, false otherwise.
  bool initializeWorklist(const azul::AvailableDefTracker &DefTracker);

  /// Adds a new call site into the worklist.
  /// Returns true if the call site was enqueued, false otherwise.
  virtual bool enqueueCallSite(CallBase *Call,
                               const azul::AvailableDefTracker &DefTracker) = 0;

  /// Iterates over the worklist in priority order attempting to inline enqueued
  /// call sites. Upon return the worklist is empty. Returns the number of
  /// successfully inlined call sites.
  virtual unsigned
  inlineCallSitesInWorklist(unsigned &IRSizeEstimate,
                            azul::AvailableDefTracker &DefTracker) = 0;

  /// Prints the worklist to \p OS.
  virtual void print(raw_ostream &OS) = 0;
};

/// OrcaInlinerWorklist implementation backed by a PriorityQueue of candidate
/// clusters (see CandidateWorklist).
class InlineCostBasedWorklist : public OrcaInlinerWorklist {
  // Counter exposed through getNumCandidateClusters(); starts at zero.
  // NOTE(review): presumably bumped by emplaceCandidateCluster() -- confirm.
  unsigned NumCandidateClusters = 0;

public:
  /// Binds the worklist to \p Inliner / \p F and creates the priority queue on
  /// the inliner's allocator.
  InlineCostBasedWorklist(OrcaInliner &Inliner, Function *F)
      : OrcaInlinerWorklist(Inliner, F),
        CandidateWorklist(Inliner.getAllocator(), F) {}

  bool empty() override;

  void reset() override;

  /// Adds a new call site into the worklist.
  /// Returns true if the call site was enqueued, false otherwise.
  bool enqueueCallSite(CallBase *Call,
                       const azul::AvailableDefTracker &DefTracker) override;

  /// Iterates over the worklist in priority order attempting to inline enqueued
  /// call sites. Upon return the worklist is empty. Returns the number of
  /// successfully inlined call sites.
  unsigned
  inlineCallSitesInWorklist(unsigned &IRSizeEstimate,
                            azul::AvailableDefTracker &DefTracker) override;

  MethodCallSites *tryInsertIntoCluster(MethodCallSites &CallSites,
                                        CallBase *Call);

  bool updateClusterCostBenefit(
    CandidateCluster &Cluster, MethodCallSites &CurrentCallSites,
    const azul::AvailableDefTracker &DefTracker);

  /// Read-only accessor; const so it is usable through const references, like
  /// getPriorityQueue().
  unsigned getNumCandidateClusters() const { return NumCandidateClusters; }

  void emplaceCandidateCluster(CandidateCluster &&Cluster);

  void print(raw_ostream &OS) override;

  /// Resets the worklist before the members are destroyed.
  /// Note: inside a destructor the call binds to this class's reset(); an
  /// override in a further-derived class would not be reached.
  ~InlineCostBasedWorklist() override { reset(); }

  const PriorityQueue &getPriorityQueue() const {
    return CandidateWorklist;
  }

private:
  bool isOptimizableCall(CallBase *Call) const;

  std::optional<double> getCallSiteFrequency(CallBase &Call);

  // The queue of candidates, allocated from the inliner's allocator.
  PriorityQueue CandidateWorklist;

  // Enqueueing / cluster-collection helpers; all defined out of line.
  bool canInlineInCluster(CallBase *Call,
                          const SmallVectorImpl<CallBase *> &ClusterStack);
  bool enqueueSingleCallSite(CallBase *Call,
                             const azul::AvailableDefTracker &DefTracker);
  bool canOptimizeWithImprovedValueInfo(
    CallBase *Call, const DenseMap<Value *, azul::ValueInfo> &ImprovedValueInfo)
    const;
  bool enqueueNewAllocationClusters(CallBase *Call,
                                    const azul::AvailableDefTracker &DefTracker);
  bool enqueueExposeReturnedNewAllocationCluster(
      CandidateClusterUniquePtrT &&NewAllocationCluster,
      const azul::AvailableDefTracker &DefTracker);
  bool enqueueReturnedUnescapedNewAllocationCluster(
      ClusterValue<Value> CV, CandidateClusterUniquePtrT &&Cluster,
      const azul::AvailableDefTracker &DefTracker, const DataLayout &DL);
  bool enqueueNewAllocationUnescapedCluster(
    CallBase *NewI, const azul::AvailableDefTracker &DefTracker);
  int computeUnescapedAllocationBenefit(
      ClusterValue<Value> Allocation,
      const SmallVectorImpl<ClusterValue<CallBase>> &MonitorUsers,
      const DataLayout &DL);
  bool
  collectNoEscapeCluster(ClusterValue<Value> CV,
                         SmallVectorImpl<ClusterValue<CallBase>> &MonitorUsers,
                         const DataLayout &DL);
  std::optional<ClusterValue<Value>> collectExposeReturnedNewAllocationCluster(
      ClusterValue<Value> CV, const DataLayout &DL, unsigned Depth);

  // Inlining decision and execution helpers; all defined out of line.
  bool shouldInline(unsigned NumTopLevelCallSites, CostBenefit CB,
                    unsigned SizeEstimateBeforeInlining, std::string &Reason);

  unsigned tryInliningCluster(CandidateCluster &CS,
                              unsigned &IRSizeEstimate,
                              azul::AvailableDefTracker &DefTracker);
  bool tryInliningCallSite(
      ClusterCallSite &CCS, std::string ClusterRemark, unsigned &IRSizeEstimate,
      azul::AvailableDefTracker &DefTracker,
      SmallVectorImpl<ClusterCallSiteUniquePtrT> &Worklist);

  // Cost/benefit computation helpers; all defined out of line.
  CostBenefit getCostBenefitInCaller(CallBase *Call,
                                     MethodCallSites *MCS = nullptr) const;
  CostBenefit
  getCostBenefitInCluster(CallBase *CallI,
                          const azul::AvailableDefTracker &DefTracker,
                          MethodCallSites *MCS = nullptr) const;
  CostBenefit getCostBenefit(CallBase *Call,
                             const azul::AvailableDefTracker &DefTracker,
                             MethodCallSites *MCS = nullptr) const;
  double calculateRelFrequency(CallBase *Call) const;
};

// OrcaInlinerPassManager is a pass manager which performs inlining into the top
// level compile method. In order to make smarter inlining decisions this pass
// manager also simplifies inlining candidates and the top level method.
//
// The constructor is private (it sits in the class's default-access section);
// tryBuildInliner() below is the public way to obtain an instance.
class OrcaInlinerPassManager : public PassInfoMixin<OrcaInlinerPassManager> {
  // Important note: these ModulePassManagers can contain only such module
  // passes that are aware of the InlinerStack, i.e. can be told to run only on
  // the function inliner is currently working on.
  // NOTE(review): the constructor takes no ModulePassManagers; the note
  // presumably refers to the TopLevelSimplificationPM and
  // CandidateSimplificationPM members -- confirm and reword.
  explicit OrcaInlinerPassManager(azul::orca::OrcaPipeline *Pipeline);

public:
  /// Pass entry point; defined out of line.
  PreservedAnalyses run(Module &M, ModuleAnalysisManager &);

  /// Callback producing a fresh ModulePassManager, or std::nullopt on failure.
  using GetMPMFunc = std::function<std::optional<ModulePassManager>()>;

  /// The note from the constructor also applies to ModulePassManagers returned
  /// by these callbacks.
  /// This function is a workaround for the fact that we can't simply copy
  /// ModulePassManagers. To construct inliner we may need way more than just
  /// two pipelines, so to avoid cluttering the interface with something like
  /// "we need 3 copies of top-level PM and 4 copies of candidate PM" we just
  /// take the callbacks and call them as many times as we need.
  /// Returns std::nullopt only when either of the callbacks returns
  ///  std::nullopt, otherwise it is guaranteed to return a properly constructed
  ///  OrcaInlinerPassManager.
  static std::optional<OrcaInlinerPassManager>
  tryBuildInliner(GetMPMFunc GetTopLevelPM, GetMPMFunc GetCandidatePM,
                  azul::orca::OrcaPipeline *Pipeline);

  /// Prints the textual pipeline of this pass manager; defined out of line.
  void printPipeline(raw_ostream &OS,
                     function_ref<StringRef(StringRef)> MapClassName2PassName);
  /// Builds a candidate pipeline using \p GetCandidatePM; defined out of line.
  /// NOTE(review): presumably yields std::nullopt when the callback does --
  /// confirm against the definition.
  static std::optional<ModulePassManager>
  tryBuildCandidateInlinerPM(const GetMPMFunc &GetCandidatePM,
                             azul::orca::OrcaPipeline *Pipeline);

  /// Nested counterpart of tryBuildCandidateInlinerPM; defined out of line.
  /// NOTE(review): how the nested pipeline differs is not visible here.
  static std::optional<ModulePassManager>
  tryBuildNestedCandidateInlinerPM(const GetMPMFunc &GetCandidatePM,
                                   azul::orca::OrcaPipeline *Pipeline);

private:
  ModulePassManager TopLevelSimplificationPM;
  ModulePassManager CandidateSimplificationPM;

  // Empty unless a custom pipeline was supplied.
  // NOTE(review): who sets it and when it is used is not visible here.
  std::optional<ModulePassManager> MaybeCustomInlineDevirtPM;

  azul::orca::OrcaPipeline *Pipeline;
};

/// Populates an InlineCostBasedWorklist for the given function and then invokes
/// the visitor callback. This function should only be used for diagnostic
/// purposes and should not be used in the normal optimization pipeline.
///
/// \param F       function the worklist is populated for.
/// \param AM      module analysis manager handed to the implementation.
/// \param Visitor callback invoked with a pointer to the populated worklist.
///                NOTE(review): the pointer is presumably only valid for the
///                duration of the callback -- confirm in the definition.
void visitDummyInlineCostBasedWorklist(
  llvm::Function &F, llvm::ModuleAnalysisManager &AM,
  llvm::function_ref<void(llvm::InlineCostBasedWorklist *)> Visitor);
} // namespace llvm

#endif /* ORCAINLINERPASSMANAGER_H */

