NNVM Optimization

Overview

  • The goal of the NNVM optimizer is to reduce memory usage and device allocations while preserving the original computational semantics.
  • It transforms the computation graph to minimize memory utilization, optimize data layout, and fuse computation patterns for different hardware backends.

Implementation

NNVM uses Symbol to abstract both the Operator and the Operand in the computation graph.

The Symbol abstraction represents a procedure with multiple inputs and outputs. A Symbol object contains nothing but a vector<NodeEntry> outputs member, which records all of its output entries in the graph; each Node, in turn, records its own attributes and all of its inputs. The object that actually builds the graph is therefore Node, which can see all of its inputs; the abstractions defined on top of the computation graph, such as Operand and Operator, are expressed in terms of Symbol.

In this graph representation, an Operator composes several Operands into a new Operand, which is a node in the graph. In the end we are left with a single Symbol, from which the whole graph can be traced back and converted into a Graph object, completing graph construction.
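
As a rough illustration of this composition, the sketch below uses NNVM's Python frontend to compose two Operand symbols with a dense Operator and then recover the Graph from the resulting Symbol. The exact helpers (sym.Variable, sym.dense, nnvm.graph.create, list_input_names) follow the historical NNVM Python API and should be read as assumptions, not as part of the text above.

import nnvm.symbol as sym
import nnvm.graph as nnvm_graph

# Operands: two variable symbols
x = sym.Variable("x")
w = sym.Variable("w")
# An Operator composes the Operands into a new Operand, i.e. a node in the graph
y = sym.dense(data=x, weight=w, units=16, use_bias=False, name="fc")
print(y.list_input_names())        # ['x', 'w']
# The single output Symbol is enough to trace back and build the whole graph
g = nnvm_graph.create(y)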

optimize

def optimize(graph, shape, dtype="float32"):
    # Perform target- and parameter-invariant graph optimizations
    # graph: the graph to be optimized
    # returns the optimized graph
    cfg = BuildConfig.current
    if cfg.pass_enabled("SimplifyInference"):  # simplify inference
        # set the shapes of the graph's input nodes as a graph attribute
        graph = graph_attr.set_shape_inputs(graph, shape)
        # apply the listed passes to the graph and return the transformed graph
        graph = graph.apply(["InferShape", "SimplifyInference"])

    if cfg.pass_enabled("FoldScaleAxis"):  # fold scale axes
        graph = graph_attr.set_shape_inputs(graph, shape)
        graph = graph.apply(["InferShape", "FoldScaleAxis"])
    return graph
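
The passes wired up in optimize can also be applied by hand through Graph.apply. A minimal sketch, assuming the nnvm.symbol / nnvm.graph Python modules and the graph_attr helper already used above:

import nnvm.symbol as sym
import nnvm.graph as nnvm_graph
from nnvm.compiler import graph_attr

data = sym.Variable("data")
net = sym.batch_norm(data=data, name="bn")
g = nnvm_graph.create(net)
# shapes must be known before SimplifyInference can unpack batch_norm
g = graph_attr.set_shape_inputs(g, {"data": (1, 16, 8, 8)})
g = g.apply(["InferShape", "SimplifyInference"])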

SimplifyInference

SimplifyInference rewrites operators that only matter during training into their inference-time form. Two operators are handled:

  • Batch Normalization: unpacked into the elementwise scale-and-shift computation it reduces to at inference time
  • Dropout: removed entirely (the node is replaced by its input)
Graph SimplifyInference(nnvm::Graph src) {
  // Get attributes from the graph
  const IndexedGraph& idx = src.indexed_graph();
  const ShapeVector& shape_vec = src.GetAttr<ShapeVector>("shape");
  auto transform = [&](uint32_t nid, const NodePtr& n, std::vector<NodeEntry>* ret) {
    if (n->is_variable()) return false;
    static const Op* bn_op = Op::Get("batch_norm");  // batch normalization
    static const Op* dropout_op = Op::Get("dropout");  // dropout
    if (n->op() == bn_op) {
      *ret = BatchNormToInferUnpack(
          n->attrs,
          n->inputs[0],
          n->inputs[1],
          n->inputs[2],
          n->inputs[3],
          n->inputs[4],
          shape_vec[idx.entry_id(nid, 0)]);
      return true;
    } else if (n->op() == dropout_op) {
      NodeEntry undef = MakeNode("__undef__", "undef", {});
      *ret = {n->inputs[0], undef};
      return true;
    } else {
      return false;
    }
  };
  return GraphTransform(src, transform);
}
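
BatchNormToInferUnpack replaces the batch_norm node with the plain scale-and-shift arithmetic it reduces to once the moving statistics are fixed. A minimal numpy sketch of that arithmetic (the function name and the eps default are illustrative only, not NNVM API):

import numpy as np

def batch_norm_inference(data, gamma, beta, moving_mean, moving_var, eps=1e-5):
    # at inference time batch_norm is just a per-channel scale and shift
    scale = gamma / np.sqrt(moving_var + eps)
    shift = beta - moving_mean * scale
    return data * scale + shift

# data laid out as (batch, channel); for NCHW the scale and shift would be
# broadcast along the channel axis instead
x = np.random.randn(4, 8).astype("float32")
gamma, beta = np.ones(8, "float32"), np.zeros(8, "float32")
y = batch_norm_inference(x, gamma, beta, x.mean(axis=0), x.var(axis=0))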

FoldScaleAxis

enum FoldScaleKind {
  // No folding is applied
  kNone,
  // The folding decision is pending
  kPending,
  // The original operator that contains the scale.
  kProvider,
  // Pass through the scale to parent/child to the first axis.
  kPassTroughFirst,
  // The final consumer of axis scale using multiply
  // Likely be a conv or dense operator.
  kMulConsumer,
  // The final consumer of axis scale using division
  kDivConsumer
};
Graph FoldScaleAxis(Graph src) {
  // Operator pattern
  static auto& fbackward =
      nnvm::Op::GetAttr<FScaleAxisBackward>("FScaleAxisBackward");
  const IndexedGraph& idx = src.indexed_graph();
  const ShapeVector& shape_vec = src.GetAttr<ShapeVector>("shape");
  std::vector<uint32_t> ref_count = GetNodeRefCounts(idx);
  std::vector<FoldChainEntry> bwd_chain(idx.num_nodes());
  // shape hint for the inference.
  std::vector<TShape> in_shape, out_shape;
  // perform backward folding
  for (uint32_t i = idx.num_nodes(); i != 0; --i) {
    uint32_t nid = i - 1;
    const auto& inode = idx[nid];
    if (inode.source->is_variable()) continue;
    if (DetectScaleAxis(idx, nid, shape_vec,
                        ref_count, false, &bwd_chain)) continue;
    if (bwd_chain[nid].kind != kPending) continue;
    if (ref_count[nid] != 1 || !fbackward.count(inode.source->op())) {
      bwd_chain[nid].kind = kNone; continue;
    }
    // get input shape and output shape.
    in_shape.clear(); out_shape.clear();
    for (const IndexedGraph::NodeEntry& e : inode.inputs) {
      in_shape.push_back(shape_vec[idx.entry_id(e)]);
    }
    for (uint32_t i = 0; i < inode.source->num_outputs(); ++i) {
      out_shape.push_back(shape_vec[idx.entry_id(nid, i)]);
    }
    std::vector<std::pair<uint32_t, int> > in_axis;
    FoldScaleKind kind =
        fbackward[inode.source->op()](
            inode.source->attrs, bwd_chain[nid].axis,
            in_shape, out_shape, &in_axis);
    bwd_chain[nid].kind = kind;
    if (kind == kNone) continue;
    CHECK_GE(in_axis.size(), 1U);
    CHECK(kind == kPassTroughFirst || kind == kMulConsumer);
    // back-propagate the folding decision to the inputs
    bool can_prop = true;
    for (size_t i = 0; i < in_axis.size(); ++i) {
      const IndexedGraph::NodeEntry& e = inode.inputs[in_axis[i].first];
      if (ref_count[e.node_id] != 1 ||
          idx[e.node_id].source->num_outputs() != 1) {
        can_prop = false; break;
      }
    }
    if (!can_prop) continue;
    for (size_t i = 0; i < in_axis.size(); ++i) {
      const IndexedGraph::NodeEntry& e = inode.inputs[in_axis[i].first];
      if (kind == kPassTroughFirst && i == 0) {
        bwd_chain[e.node_id].kind = kPending;
      } else {
        bwd_chain[nid].kind = kNone;
        bwd_chain[e.node_id].kind = kMulConsumer;
      }
      bwd_chain[e.node_id].axis = in_axis[i].second;
      bwd_chain[e.node_id].source = bwd_chain[nid].source;
    }
    if (kind == kMulConsumer) {
      bwd_chain[bwd_chain[nid].source].kind = kProvider;
    }
  }
  auto transform = [&](uint32_t nid, const NodePtr& n, std::vector<NodeEntry>* ret) {
    const FoldChainEntry& e = bwd_chain[nid];
    if (e.kind == kMulConsumer && bwd_chain[e.source].kind == kProvider) {
      const FoldChainEntry& se = bwd_chain[e.source];
      CHECK_EQ(n->num_outputs(), 1);
      NodeEntry scale = ExpandBiasToMatchAxis(
          se.scale_entry,
          shape_vec[idx.entry_id(nid, 0)].ndim(),
          shape_vec[idx.entry_id(se.scale_entry)].ndim(),
          e.axis);
      *ret = {MakeNode("broadcast_mul", n->attrs.name + "_sc",
                       {NodeEntry{n, 0, 0}, scale})};
      return true;
    } else if (e.kind == kProvider) {
      *ret = {n->inputs[e.fold_input_index]};
      return true;
    } else {
      return false;
    }
  };
  return GraphTransform(src, transform);
}
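
The algebraic fact the pass relies on is that a per-channel scale applied next to a linear operator such as dense or conv2d can be folded into that operator's weights. A small numpy check of the identity (purely illustrative, not NNVM code):

import numpy as np

x = np.random.randn(4, 8).astype("float32")    # activations, shape (batch, in_channel)
w = np.random.randn(16, 8).astype("float32")   # dense weight, shape (out_channel, in_channel)
s = np.random.randn(8).astype("float32")       # per-channel scale on the input axis

scaled_input = (x * s).dot(w.T)    # broadcast_mul followed by dense
folded_weight = x.dot((w * s).T)   # the same scale folded into the weight

np.testing.assert_allclose(scaled_input, folded_weight, rtol=1e-4)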

build

  • Optimize the graph and compile it
  • The compiler may split the graph in order to precompute certain values

Parameters:

  • graph: the graph to be built
  • target: the build target
  • shape: optional dict of the graph's input shapes
  • dtype: the graph's input type(s)
  • params: parameters used for the precompute-folding optimization
  • target_host: the host-side code generation target
def build(graph, target=None, shape=None, dtype="float32", params=None, target_host=None):
    target = target if target else tvm.target.current_target()
    if target is None:
        raise ValueError("Target is not set in env or passed as argument.")
    target = tvm.target.create(target)

    shape = shape if shape else {}
    if not isinstance(shape, dict):
        raise TypeError("require shape to be dict")
    cfg = BuildConfig.current
    graph = graph if isinstance(graph, _graph.Graph) else _graph.create(graph)
    shape, dtype = _update_shape_dtype(shape, dtype, params)
    # Initial pass do shape type inference
    ishape, _ = graph_util.infer_shape(graph, **shape)
    shape.update(zip(graph.index.input_names, ishape))
    if not isinstance(dtype, str):
        idtype, _ = graph_util.infer_dtype(graph, **dtype)
        dtype.update(zip(graph.index.input_names, idtype))
    # run the optimization passes
    graph = optimize(graph, shape, dtype)
    # precompute pruning
    if params and cfg.pass_enabled("PrecomputePrune"):
        graph, params = precompute_prune(graph, params)
        shape, dtype = _update_shape_dtype(shape, dtype, params)
    # operator fusion and code generation
    graph = graph_attr.set_shape_inputs(graph, shape)
    graph = graph_attr.set_dtype_inputs(graph, dtype)
    graph._set_json_attr("target", str(target), "str")
    if target_host is not None:
        graph._set_json_attr("target_host", str(target_host), "str")
    if cfg.pass_enabled("OpFusion"):
        graph._set_json_attr("opt_level", 1, "int")
    else:
        graph._set_json_attr("opt_level", 0, "int")
    graph = graph.apply("InferShape").apply("InferType")
    with target:
        graph = graph.apply("GraphFusePartition").apply("GraphFuseCompile")
    libmod = graph_attr._move_out_module(graph, "module")
    return graph, libmod, params
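
A typical call, sketched against the historical nnvm.compiler API (the toy network and the input shape are placeholders):

import nnvm.compiler
import nnvm.symbol as sym

data = sym.Variable("data")
net = sym.dense(data=data, units=10, name="fc")

graph, lib, params = nnvm.compiler.build(
    net, target="llvm",
    shape={"data": (1, 100)}, dtype="float32")
# graph: the final fused graph, lib: the compiled tvm module,
# params: folded parameters (None here, since no params were passed in)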

Appendix: Node and Graph structures

Node

using NodePtr = std::shared_ptr<Node>;
// an entry that represents one output of a node; a NodeEntry is described from the viewpoint of one particular output of the node
struct NodeEntry {
  // the source node of the data
  NodePtr node;
  // the index of this output on the source node
  uint32_t index;
  // version of the input variable
  // this can only be non-zero when node is a variable node; the version is incremented
  // every time the variable takes part in a mutating Op, which helps determine the
  // order of operations when a sequence of mutations occurs
  uint32_t version;
};
struct NodeAttrs {
  const Op *op{nullptr};
  std::string name;
  // vector representation of positional attributes
  std::vector<double> scalars;
  // dictionary representation of the attributes
  std::unordered_map<std::string, std::string> dict;
  // parsed holds an arbitrary type (any); it is generated when OpProperty.attr_parser
  // is registered and gives fast access to the parsed attributes
  any parsed;
};
class Node {
 public:
  NodeAttrs attrs;
  std::vector<NodeEntry> inputs;
  // operations that must be completed before the operation of this node runs
  std::vector<NodePtr> control_deps;
  ~Node();
  inline const Op* op() const;
  inline bool is_variable() const;
  inline uint32_t num_outputs() const;
  inline uint32_t num_inputs() const;
  static NodePtr Create();
};
class any {
 public:
  inline any() = default;
  inline any(any&& other);  // NOLINT(*)
  inline any(const any& other);  // NOLINT(*)
  template<typename T>
  inline any(T&& other);  // NOLINT(*)
  inline ~any();
  inline any& operator=(any&& other);
  inline any& operator=(const any& other);
  template<typename T>
  inline any& operator=(T&& other);
  inline bool empty() const;
  inline void clear();
  inline void swap(any& other); // NOLINT(*)
  inline const std::type_info& type() const;

 private:
  template<typename T>
  class TypeOnHeap;
  template<typename T>
  class TypeOnStack;
  template<typename T>
  class TypeInfo;
  // size of the in-place stack space; a whole any object occupies 32 bytes on a 64-bit platform
  static const size_t kStack = sizeof(void*) * 3;
  static const size_t kAlign = sizeof(void*);
  // the container uses dynamic (heap) storage only when the data is too large
  // to fit into the in-place stack space
  union Data {
    std::aligned_storage<kStack, kAlign>::type stack;  // in-place stack storage
    void* pheap;  // pointer to heap-allocated storage
  };
  struct Type {
    void (*destroy)(Data* data);
    void (*create_from_data)(Data* dst, const Data& src);
    const std::type_info* ptype_info;
  };
  // compile-time constant that is true when the data can be stored in the in-place stack space
  template<typename T>
  struct data_on_stack {
    static const bool value = alignof(T) <= kAlign && sizeof(T) <= kStack;
  };
  template<typename T>
  friend T& get(any& src);  // NOLINT(*)
  template<typename T>
  friend const T& get(const any& src);
  inline void construct(any&& other);
  inline void construct(const any& other);
  template<typename T>
  inline void check_type() const;
  const Type* type_{nullptr};
  // the actual data
  Data data_;
};

Symbol

// Symbol is the frontend-facing abstraction: it receives the network defined by the frontend and is later converted in the backend into the computation graph (Graph) that the optimizations operate on
class Symbol {
 public:
  enum ListAttrOption {
    kRecursive = 0,
    kShallow = 1
  };
  enum ListInputOption {
    kAll = 0,
    kReadOnlyArgs = 1,
    kAuxiliaryStates = 2
  };

  std::vector<NodeEntry> outputs;  // output entries, corresponding to the original heads_

  Symbol Copy() const;
  void Print(std::ostream &os) const; // NOLINT(*)
  Symbol operator[] (size_t index) const;
  std::vector<NodePtr> ListInputs(ListInputOption option) const;
  std::vector<std::string> ListInputNames(ListInputOption option) const;
  std::vector<std::string> ListOutputNames() const;
  void Compose(const array_view<const Symbol*>& args,
               const std::unordered_map<std::string, const Symbol*>& kwargs,
               const std::string& name);
  Symbol operator () (const array_view<const Symbol*>& args,
                      const std::unordered_map<std::string, const Symbol*>& kwargs,
                      const std::string& name) const;
  void AddControlDeps(const Symbol& src);
  Symbol GetInternals() const;
  Symbol GetChildren() const;
  void SetAttrs(const std::vector<std::pair<std::string, std::string> >& attrs);
  bool GetAttr(const std::string& key, std::string* out) const;
  std::unordered_map<std::string, std::string> ListAttrs(ListAttrOption option) const;
  std::vector<std::tuple<std::string, std::string, std::string> >
      ListAttrsRecursive() const;
  // creation of the three kinds of symbol components: functor, variable and group
  static Symbol CreateFunctor(const Op* op,
                              std::unordered_map<std::string, std::string> attrs);
  static Symbol CreateFunctor(const NodeAttrs& attrs);
  static Symbol CreateVariable(const std::string& name);
  static Symbol CreateGroup(const std::vector<Symbol>& symbols);
};

Graph

class Graph {
 public:
  std::vector<NodeEntry> outputs;
  std::unordered_map<std::string, std::shared_ptr<any> > attrs;
  template<typename T>
  inline const T& GetAttr(const std::string& attr_name) const;
  inline bool HasAttr(const std::string& attr_name) const;
  // get a move copy of the attribute, enabling copy-on-write; when the reference count is 1 the attribute is erased from attrs after the call
  template<typename T>
  inline T MoveCopyAttr(const std::string& attr_name);
  const IndexedGraph& indexed_graph() const;

 private:
  // internal structure of the indexed graph
  mutable std::shared_ptr<const IndexedGraph> indexed_graph_;
};
