NNVM Optimization
Overview
- The goal of the NNVM optimizer is to reduce memory usage and device allocations while preserving the original computational semantics.
- It transforms the computation graph to minimize memory utilization, optimize the data layout, and fuse compute patterns for different hardware backends.
Implementation
NNVM uses Symbol to abstract both the Operator and the Operand of the computation graph. A Symbol represents a procedure with multiple inputs and outputs. A Symbol object contains only a single vector<NodeEntry> outputs member that records all of its output entries in the graph, whereas each Node records its own attributes and all of its inputs. The object that actually builds the graph is Node, which can see all of its inputs; the abstractions defined on top of the computation graph, such as Operand and Operator, are then expressed in terms of Symbol.
In the graph representation, an Operator can compose several Operands into a new Operand, i.e. a node of the graph. In the end we obtain a Symbol, from which the entire graph can be traced back and converted into a Graph object to complete graph construction.
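For example, a minimal sketch of this composition using NNVM's Python frontend; the concrete operators (dense, relu) and input names are illustrative choices, not something the text above prescribes:

import nnvm.symbol as sym
import nnvm.graph as graph

x = sym.Variable("data")                   # an Operand, represented by a Symbol
w = sym.Variable("weight")
y = sym.dense(data=x, weight=w, units=16, use_bias=False)  # an Operator composes Operands
z = sym.relu(y)                            # the result is again an Operand / Symbol

# From the final Symbol the whole graph can be traced back and turned into a Graph.
g = graph.create(z)
print(g.ir())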
Optimization
def optimize(graph, shape, dtype="float32"):
    # Perform target- and parameter-invariant graph optimizations
    # graph: the graph to be optimized
    # returns the optimized graph
    cfg = BuildConfig.current
    if cfg.pass_enabled("SimplifyInference"):
        # Set the shapes of the input nodes as a graph attribute
        graph = graph_attr.set_shape_inputs(graph, shape)
        # Apply the passes to the graph and return the transformed graph
        graph = graph.apply(["InferShape", "SimplifyInference"])
    if cfg.pass_enabled("FoldScaleAxis"):
        graph = graph_attr.set_shape_inputs(graph, shape)
        graph = graph.apply(["InferShape", "FoldScaleAxis"])
    return graph
SimplifyInference
This pass rewrites training-only operators into their inference-time form. As the transform below shows, it currently handles:
- Batch Normalization (unpacked into elementwise scale-and-shift operations on the stored statistics)
- Dropout (removed; the data input is passed through unchanged)
Graph SimplifyInference(nnvm::Graph src) {
  // Get attributes from the graph
  const IndexedGraph& idx = src.indexed_graph();
  const ShapeVector& shape_vec = src.GetAttr<ShapeVector>("shape");
  auto transform = [&](uint32_t nid, const NodePtr& n, std::vector<NodeEntry>* ret) {
    if (n->is_variable()) return false;
    static const Op* bn_op = Op::Get("batch_norm");   // batch normalization
    static const Op* dropout_op = Op::Get("dropout"); // dropout
    if (n->op() == bn_op) {
      // Unpack batch_norm into elementwise scale/shift operations for inference
      *ret = BatchNormToInferUnpack(
          n->attrs,
          n->inputs[0],
          n->inputs[1],
          n->inputs[2],
          n->inputs[3],
          n->inputs[4],
          shape_vec[idx.entry_id(nid, 0)]);
      return true;
    } else if (n->op() == dropout_op) {
      // Dropout is an identity at inference time: forward the data input
      NodeEntry undef = MakeNode("__undef__", "undef", {});
      *ret = {n->inputs[0], undef};
      return true;
    } else {
      return false;
    }
  };
  return GraphTransform(src, transform);
}
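Numerically, BatchNormToInferUnpack amounts to replacing the batch_norm node with a per-channel scale and shift computed from the stored statistics. A small numpy sketch of the equivalent arithmetic (the names and the eps default are illustrative, not the helper's actual signature):

import numpy as np

def batch_norm_inference(x, gamma, beta, moving_mean, moving_var, eps=1e-5):
    # fold the normalization into one elementwise scale and one shift
    scale = gamma / np.sqrt(moving_var + eps)
    shift = beta - moving_mean * scale
    return x * scale + shift

x = np.random.randn(8, 4).astype("float32")
gamma, beta = np.ones(4, "float32"), np.zeros(4, "float32")
mean, var = x.mean(axis=0), x.var(axis=0)
out = batch_norm_inference(x, gamma, beta, mean, var)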
FoldScaleAxis
This pass folds a per-axis scaling (for example, the broadcast multiply left behind by the batch-norm unpacking above) into an adjacent conv/dense operator so that the separate multiply can be eliminated. Each node is assigned one of the following folding states:
enum FoldScaleKind {
  // No folding is applied
  kNone,
  // The folding decision is pending
  kPending,
  // The original operator that contains the scale
  kProvider,
  // Pass the scale through to the first-axis input
  kPassTroughFirst,
  // The final consumer of the axis scale, using multiplication
  // (likely a conv or dense operator)
  kMulConsumer,
  // The final consumer of the axis scale, using division
  kDivConsumer
};
Graph FoldScaleAxis(Graph src) {
  // Operator pattern
  static auto& fbackward =
      nnvm::Op::GetAttr<FScaleAxisBackward>("FScaleAxisBackward");
  const IndexedGraph& idx = src.indexed_graph();
  const ShapeVector& shape_vec = src.GetAttr<ShapeVector>("shape");
  std::vector<uint32_t> ref_count = GetNodeRefCounts(idx);
  std::vector<FoldChainEntry> bwd_chain(idx.num_nodes());
  // shape hint for the inference.
  std::vector<TShape> in_shape, out_shape;
  // Perform backward folding over the graph
  for (uint32_t i = idx.num_nodes(); i != 0; --i) {
    uint32_t nid = i - 1;
    const auto& inode = idx[nid];
    if (inode.source->is_variable()) continue;
    if (DetectScaleAxis(idx, nid, shape_vec,
                        ref_count, false, &bwd_chain)) continue;
    if (bwd_chain[nid].kind != kPending) continue;
    if (ref_count[nid] != 1 || !fbackward.count(inode.source->op())) {
      bwd_chain[nid].kind = kNone; continue;
    }
    // get input shape and output shape.
    in_shape.clear(); out_shape.clear();
    for (const IndexedGraph::NodeEntry& e : inode.inputs) {
      in_shape.push_back(shape_vec[idx.entry_id(e)]);
    }
    for (uint32_t i = 0; i < inode.source->num_outputs(); ++i) {
      out_shape.push_back(shape_vec[idx.entry_id(nid, i)]);
    }
    std::vector<std::pair<uint32_t, int> > in_axis;
    FoldScaleKind kind =
        fbackward[inode.source->op()](
            inode.source->attrs, bwd_chain[nid].axis,
            in_shape, out_shape, &in_axis);
    bwd_chain[nid].kind = kind;
    if (kind == kNone) continue;
    CHECK_GE(in_axis.size(), 1U);
    CHECK(kind == kPassTroughFirst || kind == kMulConsumer);
    // Propagate the scale backward along the chain
    bool can_prop = true;
    for (size_t i = 0; i < in_axis.size(); ++i) {
      const IndexedGraph::NodeEntry& e = inode.inputs[in_axis[0].first];
      if (ref_count[e.node_id] != 1 ||
          idx[e.node_id].source->num_outputs() != 1) {
        can_prop = false; break;
      }
    }
    if (!can_prop) continue;
    for (size_t i = 0; i < in_axis.size(); ++i) {
      const IndexedGraph::NodeEntry& e = inode.inputs[in_axis[i].first];
      if (kind == kPassTroughFirst && i == 0) {
        bwd_chain[e.node_id].kind = kPending;
      } else {
        bwd_chain[nid].kind = kNone;
        bwd_chain[e.node_id].kind = kMulConsumer;
      }
      bwd_chain[e.node_id].axis = in_axis[i].second;
      bwd_chain[e.node_id].source = bwd_chain[nid].source;
    }
    if (kind == kMulConsumer) {
      bwd_chain[bwd_chain[nid].source].kind = kProvider;
    }
  }
  auto transform = [&](uint32_t nid, const NodePtr& n, std::vector<NodeEntry>* ret) {
    const FoldChainEntry& e = bwd_chain[nid];
    if (e.kind == kMulConsumer && bwd_chain[e.source].kind == kProvider) {
      // Multiply the consumer's output by the provider's scale,
      // broadcast to the recorded axis
      const FoldChainEntry& se = bwd_chain[e.source];
      CHECK_EQ(n->num_outputs(), 1);
      NodeEntry scale = ExpandBiasToMatchAxis(
          se.scale_entry,
          shape_vec[idx.entry_id(nid, 0)].ndim(),
          shape_vec[idx.entry_id(se.scale_entry)].ndim(),
          e.axis);
      *ret = {MakeNode("broadcast_mul", n->attrs.name + "_sc",
                       {NodeEntry{n, 0, 0}, scale})};
      return true;
    } else if (e.kind == kProvider) {
      // The provider node itself is removed; forward its folded input
      *ret = {n->inputs[e.fold_input_index]};
      return true;
    } else {
      return false;
    }
  };
  return GraphTransform(src, transform);
}
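The identity behind this folding is easy to check: multiplying the input of a dense (or conv) operator by a per-axis scale is the same as scaling the matching weight axis, so the separate broadcast_mul can be absorbed into the weights. A small numpy sketch (shapes are illustrative):

import numpy as np

x = np.random.randn(8, 16).astype("float32")   # batch of inputs
w = np.random.randn(32, 16).astype("float32")  # dense weight, shape (units, in_dim)
s = np.random.randn(16).astype("float32")      # per-axis scale

y_scaled_input = (x * s) @ w.T       # dense applied after an explicit scale
y_folded_weight = x @ (w * s).T      # the same scale folded into the weights
assert np.allclose(y_scaled_input, y_folded_weight, atol=1e-4)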
build
- Optimize the graph and compile it.
- The compiler may split the graph in order to precompute some values ahead of time.
Parameters:

Parameter | Description
---|---
graph | the graph to be built
target | the build target
shape | optional input shapes of the graph
dtype | input data type of the graph
params | parameters used for the precompute-folding optimization
target_host | host-side code generation target
def build(graph, target=None, shape=None, dtype="float32", params=None, target_host=None):
    target = target if target else tvm.target.current_target()
    if target is None:
        raise ValueError("Target is not set in env or passed as argument.")
    target = tvm.target.create(target)
    shape = shape if shape else {}
    if not isinstance(shape, dict):
        raise TypeError("require shape to be dict")
    cfg = BuildConfig.current
    graph = graph if isinstance(graph, _graph.Graph) else _graph.create(graph)
    shape, dtype = _update_shape_dtype(shape, dtype, params)
    # Initial pass: shape/type inference
    ishape, _ = graph_util.infer_shape(graph, **shape)
    shape.update(zip(graph.index.input_names, ishape))
    if not isinstance(dtype, str):
        idtype, _ = graph_util.infer_dtype(graph, **dtype)
        dtype.update(zip(graph.index.input_names, idtype))
    # Perform the optimizations
    graph = optimize(graph, shape, dtype)
    # Precompute pruning
    if params and cfg.pass_enabled("PrecomputePrune"):
        graph, params = precompute_prune(graph, params)
        shape, dtype = _update_shape_dtype(shape, dtype, params)
    # Operator fusion and code generation
    graph = graph_attr.set_shape_inputs(graph, shape)
    graph = graph_attr.set_dtype_inputs(graph, dtype)
    graph._set_json_attr("target", str(target), "str")
    if target_host is not None:
        graph._set_json_attr("target_host", str(target_host), "str")
    if cfg.pass_enabled("OpFusion"):
        graph._set_json_attr("opt_level", 1, "int")
    else:
        graph._set_json_attr("opt_level", 0, "int")
    graph = graph.apply("InferShape").apply("InferType")
    with target:
        graph = graph.apply("GraphFusePartition").apply("GraphFuseCompile")
    libmod = graph_attr._move_out_module(graph, "module")
    return graph, libmod, params
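A hedged usage sketch of this build pipeline through the public nnvm.compiler entry point; the operator, the shapes, and the "llvm" target are illustrative assumptions:

import nnvm.compiler
import nnvm.symbol as sym

x = sym.Variable("data")
net = sym.dense(data=x, units=10)

shape_dict = {"data": (1, 16)}
params = {}  # pretrained weights could be supplied here to enable PrecomputePrune

# returns the fused graph, the compiled module, and the (possibly pruned) params
graph, lib, params = nnvm.compiler.build(
    net, target="llvm", shape=shape_dict, dtype="float32", params=params)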
Appendix: Node and Graph structures
Node
using NodePtr = std::shared_ptr<Node>;
// An entry that represents the output data of a node: a NodeEntry describes the graph
// from the viewpoint of one particular output of its node
struct NodeEntry {
  // the source node of the data
  NodePtr node;
  // the index of this output on the source node
  uint32_t index;
  // version of the input variable:
  // it can only be nonzero when node is a variable node; the version is increased
  // by one each time the variable takes part in a mutating op, which helps decide
  // the ordering of operations when a sequence of mutations occurs
  uint32_t version;
};
struct NodeAttrs {
  const Op *op{nullptr};
  std::string name;
  // vector representation of the positional attributes
  std::vector<double> scalars;
  // dictionary representation of the attributes
  std::unordered_map<std::string, std::string> dict;
  // parsed attributes held in an any object; it is produced when OpProperty.attr_parser
  // is registered and gives fast access to the attributes
  any parsed;
};
class Node {
 public:
  NodeAttrs attrs;
  std::vector<NodeEntry> inputs;
  // operations that must finish before this node's operation executes
  std::vector<NodePtr> control_deps;
  ~Node();
  inline const Op* op() const;
  inline bool is_variable() const;
  inline uint32_t num_outputs() const;
  inline uint32_t num_inputs() const;
  static NodePtr Create();
};
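To make these relationships concrete, here is a tiny Python analogue (an illustrative sketch, not NNVM code) of how NodeEntry values reference a specific output of a Node and how chained inputs form the DAG:

from dataclasses import dataclass, field
from typing import List

@dataclass
class Node:
    name: str
    inputs: List["NodeEntry"] = field(default_factory=list)

@dataclass
class NodeEntry:
    node: Node        # source node of the data
    index: int = 0    # which output of the source node this entry refers to
    version: int = 0  # bumped when a variable node is mutated

# x --> dense --> relu : each consumer holds entries that point into its producers
x = Node("x")
dense = Node("dense", inputs=[NodeEntry(x)])
relu = Node("relu", inputs=[NodeEntry(dense)])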
// any is a type-erased container (similar to std::any), used above to hold the
// parsed attributes
class any {
 public:
  inline any() = default;
  inline any(any&& other); // NOLINT(*)
  inline any(const any& other); // NOLINT(*)
  template<typename T>
  inline any(T&& other); // NOLINT(*)
  inline ~any();
  inline any& operator=(any&& other);
  inline any& operator=(const any& other);
  template<typename T>
  inline any& operator=(T&& other);
  inline bool empty() const;
  inline void clear();
  inline void swap(any& other); // NOLINT(*)
  inline const std::type_info& type() const;

 private:
  template<typename T>
  class TypeOnHeap;
  template<typename T>
  class TypeOnStack;
  template<typename T>
  class TypeInfo;
  // size of the in-place stack storage (three pointers wide)
  static const size_t kStack = sizeof(void*) * 3;
  static const size_t kAlign = sizeof(void*);
  // the container uses dynamic (heap) storage only when the data does not fit
  // into the stack space
  union Data {
    std::aligned_storage<kStack, kAlign>::type stack; // in-place stack storage
    void* pheap;                                      // pointer to heap storage
  };
  struct Type {
    void (*destroy)(Data* data);
    void (*create_from_data)(Data* dst, const Data& src);
    const std::type_info* ptype_info;
  };
  // compile-time check of whether a type can be stored in the in-place stack space
  template<typename T>
  struct data_on_stack {
    static const bool value = alignof(T) <= kAlign && sizeof(T) <= kStack;
  };
  template<typename T>
  friend T& get(any& src); // NOLINT(*)
  template<typename T>
  friend const T& get(const any& src);
  inline void construct(any&& other);
  inline void construct(const any& other);
  template<typename T>
  inline void check_type() const;
  const Type* type_{nullptr};
  // the underlying data
  Data data_;
};
Symbol
// Symbol is the frontend-facing representation: it receives the network defined by
// the frontend and is later converted into the Graph used by the backend optimizations
class Symbol {
 public:
  enum ListAttrOption {
    kRecursive = 0,
    kShallow = 1
  };
  enum ListInputOption {
    kAll = 0,
    kReadOnlyArgs = 1,
    kAuxiliaryStates = 2
  };
  // output entries, corresponding to the former heads_ field
  std::vector<NodeEntry> outputs;
  Symbol Copy() const;
  void Print(std::ostream &os) const; // NOLINT(*)
  Symbol operator[] (size_t index) const;
  std::vector<NodePtr> ListInputs(ListInputOption option) const;
  std::vector<std::string> ListInputNames(ListInputOption option) const;
  std::vector<std::string> ListOutputNames() const;
  void Compose(const array_view<const Symbol*>& args,
               const std::unordered_map<std::string, const Symbol*>& kwargs,
               const std::string& name);
  Symbol operator () (const array_view<const Symbol*>& args,
                      const std::unordered_map<std::string, const Symbol*>& kwargs,
                      const std::string& name) const;
  void AddControlDeps(const Symbol& src);
  Symbol GetInternals() const;
  Symbol GetChildren() const;
  void SetAttrs(const std::vector<std::pair<std::string, std::string> >& attrs);
  bool GetAttr(const std::string& key, std::string* out) const;
  std::unordered_map<std::string, std::string> ListAttrs(ListAttrOption option) const;
  std::vector<std::tuple<std::string, std::string, std::string> >
      ListAttrsRecursive() const;
  // creation of the three kinds of Symbol components: functor, variable and group
  static Symbol CreateFunctor(const Op* op,
                              std::unordered_map<std::string, std::string> attrs);
  static Symbol CreateFunctor(const NodeAttrs& attrs);
  static Symbol CreateVariable(const std::string& name);
  static Symbol CreateGroup(const std::vector<Symbol>& symbols);
};
Graph
class Graph {
 public:
  // outputs of the computation graph
  std::vector<NodeEntry> outputs;
  // named attributes attached to the graph
  std::unordered_map<std::string, std::shared_ptr<any> > attrs;
  template<typename T>
  inline const T& GetAttr(const std::string& attr_name) const;
  inline bool HasAttr(const std::string& attr_name) const;
  // Get a move copy of the attribute, enabling copy-on-write: when the reference
  // count is 1, the attribute is erased from attrs after the call.
  template<typename T>
  inline T MoveCopyAttr(const std::string& attr_name);
  const IndexedGraph& indexed_graph() const;

 private:
  // internal cached structure of the indexed graph
  mutable std::shared_ptr<const IndexedGraph> indexed_graph_;
};
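On the Python side these graph attributes are what the passes read and write. A hedged sketch of inspecting them through the frontend, reusing graph_attr.set_shape_inputs from the optimize() code above (the relu operator and the input name are illustrative):

import nnvm.symbol as sym
import nnvm.graph as graph
from nnvm.compiler import graph_attr

x = sym.Variable("data")
g = graph.create(sym.relu(x))

# attach input shapes as a graph attribute, run a pass, then read the result back
g = graph_attr.set_shape_inputs(g, {"data": (1, 16)})
g = g.apply("InferShape")
print(g.json_attr("shape"))   # per-entry shapes computed by the InferShape pass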