From 334d50ed008e95a0295a932b8bf2b70e875cd361 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Sat, 11 Mar 2017 03:28:57 +0800 Subject: [PATCH 1/5] Copy mxnet.cpp to cpp-package sub directory --- cpp-package/.travis.yml | 48 + cpp-package/LICENSE | 13 + cpp-package/Makefile | 11 + cpp-package/README.md | 8 + cpp-package/example/Makefile | 72 + cpp-package/example/alexnet.cpp | 301 + cpp-package/example/charRNN.cpp | 714 ++ cpp-package/example/feature_extract/Makefile | 26 + .../feature_extract/feature_extract.cpp | 120 + .../prepare_data_with_opencv.cpp | 37 + cpp-package/example/feature_extract/run.sh | 12 + cpp-package/example/googlenet.cpp | 163 + cpp-package/example/inception_bn.cpp | 188 + cpp-package/example/lenet.cpp | 233 + cpp-package/example/lenet_with_mxdataiter.cpp | 119 + cpp-package/example/mlp.cpp | 162 + cpp-package/example/resnet.cpp | 191 + .../example/run_lenet_with_mxdataiter.sh | 6 + cpp-package/include/mxnet-cpp/CPPLINT.cfg | 2 + cpp-package/include/mxnet-cpp/MxNetCpp.h | 23 + cpp-package/include/mxnet-cpp/base.h | 38 + cpp-package/include/mxnet-cpp/executor.h | 137 + cpp-package/include/mxnet-cpp/executor.hpp | 92 + cpp-package/include/mxnet-cpp/initializer.h | 130 + cpp-package/include/mxnet-cpp/io.h | 128 + cpp-package/include/mxnet-cpp/io.hpp | 87 + cpp-package/include/mxnet-cpp/kvstore.h | 49 + cpp-package/include/mxnet-cpp/kvstore.hpp | 178 + cpp-package/include/mxnet-cpp/metric.h | 91 + cpp-package/include/mxnet-cpp/model.h | 58 + cpp-package/include/mxnet-cpp/ndarray.h | 407 + cpp-package/include/mxnet-cpp/ndarray.hpp | 331 + cpp-package/include/mxnet-cpp/op.h | 7629 +++++++++++++++++ cpp-package/include/mxnet-cpp/op_map.h | 92 + cpp-package/include/mxnet-cpp/op_suppl.h | 188 + cpp-package/include/mxnet-cpp/operator.h | 188 + cpp-package/include/mxnet-cpp/operator.hpp | 155 + cpp-package/include/mxnet-cpp/optimizer.h | 122 + cpp-package/include/mxnet-cpp/optimizer.hpp | 134 + cpp-package/include/mxnet-cpp/shape.h | 389 + cpp-package/include/mxnet-cpp/symbol.h | 257 + cpp-package/include/mxnet-cpp/symbol.hpp | 339 + cpp-package/scripts/lint.py | 174 + .../OpWrapperGenerator/OpWrapperGenerator.py | 367 + .../OpWrapperGenerator.pyproj | 28 + .../OpWrapperGenerator/OpWrapperGenerator.sln | 20 + cpp-package/src/OpWrapperGenerator/README.md | 1 + cpp-package/tests/travis/run_test.sh | 24 + cpp-package/tests/travis/setup.sh | 5 + 49 files changed, 14287 insertions(+) create mode 100644 cpp-package/.travis.yml create mode 100755 cpp-package/LICENSE create mode 100644 cpp-package/Makefile create mode 100644 cpp-package/README.md create mode 100644 cpp-package/example/Makefile create mode 100644 cpp-package/example/alexnet.cpp create mode 100644 cpp-package/example/charRNN.cpp create mode 100644 cpp-package/example/feature_extract/Makefile create mode 100644 cpp-package/example/feature_extract/feature_extract.cpp create mode 100644 cpp-package/example/feature_extract/prepare_data_with_opencv.cpp create mode 100755 cpp-package/example/feature_extract/run.sh create mode 100644 cpp-package/example/googlenet.cpp create mode 100644 cpp-package/example/inception_bn.cpp create mode 100644 cpp-package/example/lenet.cpp create mode 100644 cpp-package/example/lenet_with_mxdataiter.cpp create mode 100644 cpp-package/example/mlp.cpp create mode 100644 cpp-package/example/resnet.cpp create mode 100755 cpp-package/example/run_lenet_with_mxdataiter.sh create mode 100644 cpp-package/include/mxnet-cpp/CPPLINT.cfg create mode 100644 cpp-package/include/mxnet-cpp/MxNetCpp.h create mode 100644 
cpp-package/include/mxnet-cpp/base.h create mode 100644 cpp-package/include/mxnet-cpp/executor.h create mode 100644 cpp-package/include/mxnet-cpp/executor.hpp create mode 100644 cpp-package/include/mxnet-cpp/initializer.h create mode 100644 cpp-package/include/mxnet-cpp/io.h create mode 100644 cpp-package/include/mxnet-cpp/io.hpp create mode 100644 cpp-package/include/mxnet-cpp/kvstore.h create mode 100644 cpp-package/include/mxnet-cpp/kvstore.hpp create mode 100644 cpp-package/include/mxnet-cpp/metric.h create mode 100644 cpp-package/include/mxnet-cpp/model.h create mode 100644 cpp-package/include/mxnet-cpp/ndarray.h create mode 100644 cpp-package/include/mxnet-cpp/ndarray.hpp create mode 100644 cpp-package/include/mxnet-cpp/op.h create mode 100644 cpp-package/include/mxnet-cpp/op_map.h create mode 100644 cpp-package/include/mxnet-cpp/op_suppl.h create mode 100644 cpp-package/include/mxnet-cpp/operator.h create mode 100644 cpp-package/include/mxnet-cpp/operator.hpp create mode 100644 cpp-package/include/mxnet-cpp/optimizer.h create mode 100644 cpp-package/include/mxnet-cpp/optimizer.hpp create mode 100644 cpp-package/include/mxnet-cpp/shape.h create mode 100644 cpp-package/include/mxnet-cpp/symbol.h create mode 100644 cpp-package/include/mxnet-cpp/symbol.hpp create mode 100644 cpp-package/scripts/lint.py create mode 100755 cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.py create mode 100755 cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.pyproj create mode 100755 cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.sln create mode 100644 cpp-package/src/OpWrapperGenerator/README.md create mode 100755 cpp-package/tests/travis/run_test.sh create mode 100755 cpp-package/tests/travis/setup.sh diff --git a/cpp-package/.travis.yml b/cpp-package/.travis.yml new file mode 100644 index 000000000000..e7a332d09125 --- /dev/null +++ b/cpp-package/.travis.yml @@ -0,0 +1,48 @@ +sudo: false + +language: cpp + +os: + - linux +# disable for now since clang doesn't support openmp +# - osx + +env: + # code analysis + - TASK=lint + # TODO: build example + - TASK=build + +# dependent apt packages +addons: + apt: + sources: + - ubuntu-toolchain-r-test + packages: + - gcc-4.8 + - g++-4.8 +# - wget +# - git +# - libcurl4-openssl-dev +# - unzip +# - libatlas-dev +# - libopencv-dev + +before_install: + +install: + - source tests/travis/setup.sh + +script: + - tests/travis/run_test.sh + +cache: + directories: + - ${HOME}/.cache/usr + +notifications: +# Emails are sent to the committer's git-configured email address by default, + email: + on_success: change + on_failure: always + #slack: dmlc:NmroCzntCiWOuxUZpii40USd diff --git a/cpp-package/LICENSE b/cpp-package/LICENSE new file mode 100755 index 000000000000..2525650c621b --- /dev/null +++ b/cpp-package/LICENSE @@ -0,0 +1,13 @@ +Copyright (c) 2015 by Contributors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/cpp-package/Makefile b/cpp-package/Makefile
new file mode 100644
index 000000000000..2c40d952f4d1
--- /dev/null
+++ b/cpp-package/Makefile
@@ -0,0 +1,11 @@
+ifndef LINT_LANG
+	LINT_LANG="all"
+endif
+
+.PHONY: lint example
+
+lint:
+	python scripts/lint.py dmlc ${LINT_LANG} include example
+
+example:
+	make -C example travis
diff --git a/cpp-package/README.md b/cpp-package/README.md
new file mode 100644
index 000000000000..dcfcbc81f3a7
--- /dev/null
+++ b/cpp-package/README.md
@@ -0,0 +1,8 @@
+# MxNet C++ Package
+
+[![Build Status](https://travis-ci.org/dmlc/MXNet.cpp.svg?branch=master)](https://travis-ci.org/dmlc/MXNet.cpp)
+[![Build status](https://ci.appveyor.com/api/projects/status/ckfq6j53sg5ll01d/branch/master?svg=true)](https://ci.appveyor.com/project/lx75249/mxnet-cpp/branch/master)
+
+The examples dir contains examples to get you started.
+The lib dir should contain the compiled mxnet library.
+The Windows dir contains Visual C++ solution files and project files.
diff --git a/cpp-package/example/Makefile b/cpp-package/example/Makefile
new file mode 100644
index 000000000000..45f0c896aab9
--- /dev/null
+++ b/cpp-package/example/Makefile
@@ -0,0 +1,72 @@
+BLAS=-L /opt/openblas/lib -lopenblas -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0
+CUDA=-DMSHADOW_USE_CUDA=1
+
+#COMMFLAGS=-static -static-libgcc -static-libstdc++
+
+CFLAGS=$(COMMFLAGS) -I ../include -Wall -O3 -msse3 -funroll-loops -Wno-unused-parameter -Wno-unknown-pragmas -I ../../nnvm/include -I ../../include -I ../../dmlc-core/include
+LDFLAGS=$(COMMFLAGS) -L ../lib/linux -lmxnet $(BLAS) $(CUDA) -pthread
+
+ifneq ($(OS), Windows_NT)
+	OS := $(shell uname)
+endif
+ifneq ($(OS), Darwin)
+	CFLAGS += -fopenmp
+	LDFLAGS += -lgomp
+endif
+
+all: mlp lenet lenet_with_mxdataiter alexnet googlenet inception_bn resnet
+
+lenet_with_mxdataiter: ./lenet_with_mxdataiter.cpp
+	$(CXX) -c -std=c++11 $(CFLAGS) $^
+	$(CXX) $(basename $@).o -o $@ $(LDFLAGS)
+	-rm -f $(basename $@).o
+
+lenet: ./lenet.cpp
+	$(CXX) -c -std=c++11 $(CFLAGS) $^
+	$(CXX) $(basename $@).o -o $@ $(LDFLAGS)
+	-rm -f $(basename $@).o
+
+mlp: ./mlp.cpp
+	$(CXX) -c -std=c++11 $(CFLAGS) $^
+	$(CXX) $(basename $@).o -o $@ $(LDFLAGS)
+	-rm -f $(basename $@).o
+
+alexnet: ./alexnet.cpp
+	$(CXX) -c -std=c++11 $(CFLAGS) $^
+	$(CXX) $(basename $@).o -o $@ $(LDFLAGS)
+	-rm -f $(basename $@).o
+
+googlenet: ./googlenet.cpp
+	$(CXX) -c -std=c++11 $(CFLAGS) $^
+	$(CXX) $(basename $@).o -o $@ $(LDFLAGS)
+	-rm -f $(basename $@).o
+
+inception_bn: ./inception_bn.cpp
+	$(CXX) -c -std=c++11 $(CFLAGS) $^
+	$(CXX) $(basename $@).o -o $@ $(LDFLAGS)
+	-rm -f $(basename $@).o
+
+resnet: ./resnet.cpp
+	$(CXX) -c -std=c++11 $(CFLAGS) $^
+	$(CXX) $(basename $@).o -o $@ $(LDFLAGS)
+	-rm -f $(basename $@).o
+
+# For simplicity, no link here
+travis:
+	$(CXX) -c -std=c++11 $(CFLAGS) ./mlp.cpp && rm -f mlp.o
+	$(CXX) -c -std=c++11 $(CFLAGS) ./lenet.cpp && rm -f lenet.o
+	$(CXX) -c -std=c++11 $(CFLAGS) ./lenet_with_mxdataiter.cpp && rm -f lenet_with_mxdataiter.o
+	$(CXX) -c -std=c++11 $(CFLAGS) ./alexnet.cpp && rm -f alexnet.o
+	$(CXX) -c -std=c++11 $(CFLAGS) ./googlenet.cpp && rm -f googlenet.o
+	$(CXX) -c -std=c++11 $(CFLAGS) ./inception_bn.cpp && rm -f inception_bn.o
+	$(CXX) -c -std=c++11 $(CFLAGS) ./resnet.cpp && rm -f resnet.o
+
+
+clean:
+	-rm -f mlp
+	-rm -f lenet
+	-rm -f lenet_with_mxdataiter
+	-rm -f alexnet
+	-rm -f googlenet
+	-rm -f inception_bn
+	-rm -f resnet
diff --git a/cpp-package/example/alexnet.cpp b/cpp-package/example/alexnet.cpp
new file mode 100644
index
000000000000..435c4553c2c4 --- /dev/null +++ b/cpp-package/example/alexnet.cpp @@ -0,0 +1,301 @@ +/*! + * Copyright (c) 2016 by Contributors + */ +#include +#include +#include +#include "mxnet-cpp/MxNetCpp.h" + +using namespace std; +using namespace mxnet::cpp; + +Symbol AlexnetSymbol(int num_classes) { + auto input_data = Symbol::Variable("data"); + auto target_label = Symbol::Variable("label"); + /*stage 1*/ + auto conv1 = Operator("Convolution") + .SetParam("kernel", Shape(11, 11)) + .SetParam("num_filter", 96) + .SetParam("stride", Shape(4, 4)) + .SetParam("dilate", Shape(1, 1)) + .SetParam("pad", Shape(0, 0)) + .SetParam("num_group", 1) + .SetParam("workspace", 512) + .SetParam("no_bias", false) + .SetInput("data", input_data) + .CreateSymbol("conv1"); + auto relu1 = Operator("Activation") + .SetParam("act_type", "relu") /*relu,sigmoid,softrelu,tanh */ + .SetInput("data", conv1) + .CreateSymbol("relu1"); + auto pool1 = Operator("Pooling") + .SetParam("kernel", Shape(3, 3)) + .SetParam("pool_type", "max") /*avg,max,sum */ + .SetParam("global_pool", false) + .SetParam("stride", Shape(2, 2)) + .SetParam("pad", Shape(0, 0)) + .SetInput("data", relu1) + .CreateSymbol("pool1"); + auto lrn1 = Operator("LRN") + .SetParam("nsize", 5) + .SetParam("alpha", 0.0001) + .SetParam("beta", 0.75) + .SetParam("knorm", 1) + .SetInput("data", pool1) + .CreateSymbol("lrn1"); + /*stage 2*/ + auto conv2 = Operator("Convolution") + .SetParam("kernel", Shape(5, 5)) + .SetParam("num_filter", 256) + .SetParam("stride", Shape(1, 1)) + .SetParam("dilate", Shape(1, 1)) + .SetParam("pad", Shape(2, 2)) + .SetParam("num_group", 1) + .SetParam("workspace", 512) + .SetParam("no_bias", false) + .SetInput("data", lrn1) + .CreateSymbol("conv2"); + auto relu2 = Operator("Activation") + .SetParam("act_type", "relu") /*relu,sigmoid,softrelu,tanh */ + .SetInput("data", conv2) + .CreateSymbol("relu2"); + auto pool2 = Operator("Pooling") + .SetParam("kernel", Shape(3, 3)) + .SetParam("pool_type", "max") /*avg,max,sum */ + .SetParam("global_pool", false) + .SetParam("stride", Shape(2, 2)) + .SetParam("pad", Shape(0, 0)) + .SetInput("data", relu2) + .CreateSymbol("pool2"); + auto lrn2 = Operator("LRN") + .SetParam("nsize", 5) + .SetParam("alpha", 0.0001) + .SetParam("beta", 0.75) + .SetParam("knorm", 1) + .SetInput("data", pool2) + .CreateSymbol("lrn2"); + /*stage 3*/ + auto conv3 = Operator("Convolution") + .SetParam("kernel", Shape(3, 3)) + .SetParam("num_filter", 384) + .SetParam("stride", Shape(1, 1)) + .SetParam("dilate", Shape(1, 1)) + .SetParam("pad", Shape(1, 1)) + .SetParam("num_group", 1) + .SetParam("workspace", 512) + .SetParam("no_bias", false) + .SetInput("data", lrn2) + .CreateSymbol("conv3"); + auto relu3 = Operator("Activation") + .SetParam("act_type", "relu") /*relu,sigmoid,softrelu,tanh */ + .SetInput("data", conv3) + .CreateSymbol("relu3"); + auto conv4 = Operator("Convolution") + .SetParam("kernel", Shape(3, 3)) + .SetParam("num_filter", 384) + .SetParam("stride", Shape(1, 1)) + .SetParam("dilate", Shape(1, 1)) + .SetParam("pad", Shape(1, 1)) + .SetParam("num_group", 1) + .SetParam("workspace", 512) + .SetParam("no_bias", false) + .SetInput("data", relu3) + .CreateSymbol("conv4"); + auto relu4 = Operator("Activation") + .SetParam("act_type", "relu") /*relu,sigmoid,softrelu,tanh */ + .SetInput("data", conv4) + .CreateSymbol("relu4"); + auto conv5 = Operator("Convolution") + .SetParam("kernel", Shape(3, 3)) + .SetParam("num_filter", 256) + .SetParam("stride", Shape(1, 1)) + .SetParam("dilate", Shape(1, 1)) + 
.SetParam("pad", Shape(1, 1)) + .SetParam("num_group", 1) + .SetParam("workspace", 512) + .SetParam("no_bias", false) + .SetInput("data", relu4) + .CreateSymbol("conv5"); + auto relu5 = Operator("Activation") + .SetParam("act_type", "relu") + .SetInput("data", conv5) + .CreateSymbol("relu5"); + auto pool3 = Operator("Pooling") + .SetParam("kernel", Shape(3, 3)) + .SetParam("pool_type", "max") + .SetParam("global_pool", false) + .SetParam("stride", Shape(2, 2)) + .SetParam("pad", Shape(0, 0)) + .SetInput("data", relu5) + .CreateSymbol("pool3"); + /*stage4*/ + auto flatten = + Operator("Flatten").SetInput("data", pool3).CreateSymbol("flatten"); + auto fc1 = Operator("FullyConnected") + .SetParam("num_hidden", 4096) + .SetParam("no_bias", false) + .SetInput("data", flatten) + .CreateSymbol("fc1"); + auto relu6 = Operator("Activation") + .SetParam("act_type", "relu") + .SetInput("data", fc1) + .CreateSymbol("relu6"); + auto dropout1 = Operator("Dropout") + .SetParam("p", 0.5) + .SetInput("data", relu6) + .CreateSymbol("dropout1"); + /*stage5*/ + auto fc2 = Operator("FullyConnected") + .SetParam("num_hidden", 4096) + .SetParam("no_bias", false) + .SetInput("data", dropout1) + .CreateSymbol("fc2"); + auto relu7 = Operator("Activation") + .SetParam("act_type", "relu") + .SetInput("data", fc2) + .CreateSymbol("relu7"); + auto dropout2 = Operator("Dropout") + .SetParam("p", 0.5) + .SetInput("data", relu7) + .CreateSymbol("dropout2"); + /*stage6*/ + auto fc3 = Operator("FullyConnected") + .SetParam("num_hidden", num_classes) + .SetParam("no_bias", false) + .SetInput("data", dropout2) + .CreateSymbol("fc3"); + auto softmax = Operator("SoftmaxOutput") + .SetParam("grad_scale", 1) + .SetParam("ignore_label", -1) + .SetParam("multi_output", false) + .SetParam("use_ignore", false) + .SetParam("normalization", "null") /*batch,null,valid */ + .SetInput("data", fc3) + .SetInput("label", target_label) + .CreateSymbol("softmax"); + return softmax; +} + +int main(int argc, char const *argv[]) { + /*basic config*/ + int batch_size = 256; + int max_epo = 100; + float learning_rate = 1e-4; + float weight_decay = 1e-4; + + /*context and net symbol*/ + auto ctx = Context::gpu(); + auto Net = AlexnetSymbol(10); + + /*args_map and aux_map is used for parameters' saving*/ + map args_map; + map aux_map; + + /*we should tell mxnet the shape of data and label*/ + args_map["data"] = NDArray(Shape(batch_size, 3, 256, 256), ctx); + args_map["label"] = NDArray(Shape(batch_size), ctx); + + /*with data and label, executor can be generated automatically*/ + auto *exec = Net.SimpleBind(ctx, args_map); + aux_map = exec->aux_dict(); + args_map = exec->arg_dict(); + + /*if fine tune from some pre-trained model, we should load the parameters*/ + // NDArray::Load("./model/alex_params_3", nullptr, &args_map); + /*else, we should use initializer Xavier to init the params*/ + Xavier xavier = Xavier(Xavier::gaussian, Xavier::in, 2.34); + for (auto &arg : args_map) { + /*be careful here, the arg's name must has some specific ends or starts for + * initializer to call*/ + xavier(arg.first, &arg.second); + } + /*print out to check the shape of the net*/ + for (const auto &s : Net.ListArguments()) { + LG << s; + const auto &k = args_map[s].GetShape(); + for (const auto &i : k) { + cout << i << " "; + } + cout << endl; + } + + /*these binary files should be generated using im2rc tools, which can be found + * in mxnet/bin*/ + auto train_iter = MXDataIter("ImageRecordIter") + .SetParam("path_imglist", "./data/train_rec.lst") + 
.SetParam("path_imgrec", "./data/train_rec.bin") + .SetParam("data_shape", Shape(3, 256, 256)) + .SetParam("batch_size", batch_size) + .SetParam("shuffle", 1) + .CreateDataIter(); + auto val_iter = MXDataIter("ImageRecordIter") + .SetParam("path_imglist", "./data/val_rec.lst") + .SetParam("path_imgrec", "./data/val_rec.bin") + .SetParam("data_shape", Shape(3, 256, 256)) + .SetParam("batch_size", batch_size) + .CreateDataIter(); + + Optimizer* opt = OptimizerRegistry::Find("ccsgd"); + opt->SetParam("momentum", 0.9) + ->SetParam("rescale_grad", 1.0 / batch_size) + ->SetParam("clip_gradient", 10); + + Accuracy acu_train, acu_val; + LogLoss logloss_val; + for (int iter = 0; iter < max_epo; ++iter) { + LG << "Train Epoch: " << iter; + /*reset the metric every epoch*/ + acu_train.Reset(); + /*reset the data iter every epoch*/ + train_iter.Reset(); + while (train_iter.Next()) { + auto batch = train_iter.GetDataBatch(); + LG << train_iter.GetDataBatch().index.size(); + /*use copyto to feed new data and label to the executor*/ + batch.data.CopyTo(&args_map["data"]); + batch.label.CopyTo(&args_map["label"]); + exec->Forward(true); + exec->Backward(); + exec->UpdateAll(opt, learning_rate, weight_decay); + NDArray::WaitAll(); + acu_train.Update(batch.label, exec->outputs[0]); + } + LG << "ITER: " << iter << " Train Accuracy: " << acu_train.Get(); + + LG << "Val Epoch: " << iter; + acu_val.Reset(); + val_iter.Reset(); + logloss_val.Reset(); + while (val_iter.Next()) { + auto batch = val_iter.GetDataBatch(); + LG << val_iter.GetDataBatch().index.size(); + batch.data.CopyTo(&args_map["data"]); + batch.label.CopyTo(&args_map["label"]); + exec->Forward(false); + NDArray::WaitAll(); + acu_val.Update(batch.label, exec->outputs[0]); + logloss_val.Update(batch.label, exec->outputs[0]); + } + LG << "ITER: " << iter << " Val Accuracy: " << acu_val.Get(); + LG << "ITER: " << iter << " Val LogLoss: " << logloss_val.Get(); + + /*save the parameters*/ + stringstream ss; + ss << iter; + string iter_str; + ss >> iter_str; + string save_path_param = "./model/alex_param_" + iter_str; + auto save_args = args_map; + /*we do not want to save the data and label*/ + save_args.erase(save_args.find("data")); + save_args.erase(save_args.find("label")); + /*the alexnet does not get any aux array, so we do not need to save + * aux_map*/ + LG << "ITER: " << iter << " Saving to..." << save_path_param; + NDArray::Save(save_path_param, save_args); + } + /*don't foget to release the executor*/ + delete exec; + MXNotifyShutdown(); + return 0; +} diff --git a/cpp-package/example/charRNN.cpp b/cpp-package/example/charRNN.cpp new file mode 100644 index 000000000000..bce6f9a1a357 --- /dev/null +++ b/cpp-package/example/charRNN.cpp @@ -0,0 +1,714 @@ +/*! + * Copyright (c) 2016 by Contributors + * Hua Zhang mz24cn@hotmail.com + * The code implements C++ version charRNN for mxnet\example\rnn\char-rnn.ipynb with MXNet.cpp API. + * The generated params file is compatiable with python version. + * train() and predict() has been verified with original data samples. + * 2017/1/23: + * Add faster version charRNN based on built-in cuDNN RNN operator, 10 times faster. + * Add time major computation graph, although no substantial performance difference. + * Support continuing training from last params file. + * Rename params file epoch number starts from zero. + */ + +#pragma warning(disable: 4996) // VS2015 complains on 'std::copy' ... 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mxnet-cpp/MxNetCpp.h" + +using namespace std; +using namespace mxnet::cpp; + +struct LSTMState { + Symbol C; + Symbol h; +}; + +struct LSTMParam { + Symbol i2h_weight; + Symbol i2h_bias; + Symbol h2h_weight; + Symbol h2h_bias; +}; + +bool TIME_MAJOR = true; + +// LSTM Cell symbol +LSTMState LSTM(int num_hidden, const Symbol& indata, const LSTMState& prev_state, + const LSTMParam& param, int seqidx, int layeridx, mx_float dropout = 0) { + auto input = dropout > 0? Dropout(indata, dropout) : indata; + auto prefix = string("t") + to_string(seqidx) + "_l" + to_string(layeridx); + auto i2h = FullyConnected(prefix + "_i2h", input, param.i2h_weight, param.i2h_bias, + num_hidden * 4); + auto h2h = FullyConnected(prefix + "_h2h", prev_state.h, param.h2h_weight, param.h2h_bias, + num_hidden * 4); + auto gates = i2h + h2h; + auto slice_gates = SliceChannel(prefix + "_slice", gates, 4); + auto in_gate = Activation(slice_gates[0], ActivationActType::sigmoid); + auto in_transform = Activation(slice_gates[1], ActivationActType::tanh); + auto forget_gate = Activation(slice_gates[2], ActivationActType::sigmoid); + auto out_gate = Activation(slice_gates[3], ActivationActType::sigmoid); + + LSTMState state; + state.C = (forget_gate * prev_state.C) + (in_gate * in_transform); + state.h = out_gate * Activation(state.C, ActivationActType::tanh); + return state; +} + +Symbol LSTMUnroll(int num_lstm_layer, int sequence_length, int input_dim, + int num_hidden, int num_embed, mx_float dropout = 0) { + auto isTrain = sequence_length > 1; + auto data = Symbol::Variable("data"); + if (TIME_MAJOR && isTrain) + data = transpose(data); + auto embed_weight = Symbol::Variable("embed_weight"); + auto embed = Embedding("embed", data, embed_weight, input_dim, num_embed); + auto wordvec = isTrain? SliceChannel(embed, sequence_length, TIME_MAJOR? 0 : 1, true) : embed; + + vector last_states; + vector param_cells; + for (int l = 0; l < num_lstm_layer; l++) { + string layer = "l" + to_string(l); + LSTMParam param; + param.i2h_weight = Symbol::Variable(layer + "_i2h_weight"); + param.i2h_bias = Symbol::Variable(layer + "_i2h_bias"); + param.h2h_weight = Symbol::Variable(layer + "_h2h_weight"); + param.h2h_bias = Symbol::Variable(layer + "_h2h_bias"); + param_cells.push_back(param); + LSTMState state; + state.C = Symbol::Variable(layer + "_init_c"); + state.h = Symbol::Variable(layer + "_init_h"); + last_states.push_back(state); + } + + vector hidden_all; + for (int i = 0; i < sequence_length; i++) { + auto hidden = wordvec[i]; + for (int layer = 0; layer < num_lstm_layer; layer++) { + double dp_ratio = layer == 0? 0 : dropout; + auto next_state = LSTM(num_hidden, hidden, last_states[layer], param_cells[layer], + i, layer, dp_ratio); + hidden = next_state.h; + last_states[layer] = next_state; + } + if (dropout > 0) + hidden = Dropout(hidden, dropout); + hidden_all.push_back(hidden); + } + + auto hidden_concat = isTrain? 
Concat(hidden_all, hidden_all.size(), 0) : hidden_all[0];
+  auto cls_weight = Symbol::Variable("cls_weight");
+  auto cls_bias = Symbol::Variable("cls_bias");
+  auto pred = FullyConnected("pred", hidden_concat, cls_weight, cls_bias, input_dim);
+
+  auto label = Symbol::Variable("softmax_label");
+  label = transpose(label);
+  label = Reshape(label, Shape(), false, Shape(-1));  // -1: infer from graph
+  auto sm = SoftmaxOutput("softmax", pred, label);
+  if (isTrain)
+    return sm;
+
+  vector<Symbol> outputs = { sm };
+  for (auto& state : last_states) {
+    outputs.push_back(state.C);
+    outputs.push_back(state.h);
+  }
+  return Symbol::Group(outputs);
+}
+
+// Currently the mxnet GPU version of the RNN operator is implemented via the *fast* NVIDIA cuDNN.
+Symbol LSTMWithBuiltInRNNOp(int num_lstm_layer, int sequence_length, int input_dim,
+                            int num_hidden, int num_embed, mx_float dropout = 0) {
+  auto isTrain = sequence_length > 1;
+  auto data = Symbol::Variable("data");
+  if (TIME_MAJOR && isTrain)
+    data = transpose(data);
+
+  auto embed_weight = Symbol::Variable("embed_weight");
+  auto embed = Embedding("embed", data, embed_weight, input_dim, num_embed);
+  auto label = Symbol::Variable("softmax_label");
+  label = transpose(label);
+  label = Reshape(label, Shape(), false, Shape(-1));  // FullyConnected requires one dimension
+  if (!TIME_MAJOR && isTrain)
+    embed = SwapAxis(embed, 0, 1);  // Change to time-major as cuDNN requires
+
+  // We need not do the SwapAxis op as the python version does. Direct and better performance in C++!
+  auto rnn_h_init = Symbol::Variable("LSTM_init_h");
+  auto rnn_c_init = Symbol::Variable("LSTM_init_c");
+  auto rnn_params = Symbol::Variable("LSTM_parameters");  // See explanations near RNNXavier class
+  auto rnn = RNN(embed, rnn_params, rnn_h_init, rnn_c_init, num_hidden, num_lstm_layer,
+                 RNNMode::lstm, false, dropout, !isTrain);
+  auto hidden = Reshape(rnn[0], Shape(), false, Shape(-1, num_hidden));
+
+  auto cls_weight = Symbol::Variable("cls_weight");
+  auto cls_bias = Symbol::Variable("cls_bias");
+  auto pred = FullyConnected("pred", hidden, cls_weight, cls_bias, input_dim);
+  /*In rnn-time-major/rnn_cell_demo.py, the author claimed the time-major version speeds up
+   * 1.5~2 times versus the batch version. I have doubts about that conclusion. In my tests, the
+   * performance of the two codes is almost the same. In fact, there are no substantial
+   * differences between them. They are both based on time-major cuDNN; the computation graphs
+   * only differ slightly in where the Reshape/SwapAxis/transpose operations are placed. Here I
+   * don't use Reshape on pred and keep the label shape on SoftmaxOutput like the time-major
+   * version code, but Reshape the label for simplification. It doesn't influence performance.
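+ * (For reference: "time major" lays the RNN input out as [sequence_length, batch_size, embed_dim],
+ * while "batch major" uses [batch_size, sequence_length, embed_dim]. The cuDNN RNN kernel consumes
+ * time-major data, which is why the SwapAxis/transpose calls appear above.)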
+ */
+  auto sm = SoftmaxOutput("softmax", pred, label);
+  if (isTrain)
+    return sm;
+  else
+    return Symbol::Group({ sm, rnn[1/*RNNOpOutputs::kStateOut=1*/],
+                           rnn[2/*RNNOpOutputs::kStateCellOut=2*/] });
+}
+
+class Shuffler {
+  vector<int> sequence;
+ public:
+  explicit Shuffler(int size) : sequence(size) {
+    int* p = sequence.data();
+    for (int i = 0; i < size; i++)
+      *p++ = i;
+  }
+  void shuffle(function<void(int, int)> lambda = nullptr) {
+    random_shuffle(sequence.begin(), sequence.end());
+    int n = 0;
+    if (lambda != nullptr)
+      for (int i : sequence)
+        lambda(n++, i);
+  }
+  const int* data() {
+    return sequence.data();
+  }
+};
+
+class BucketSentenceIter : public DataIter {
+  Shuffler* random;
+  int batch, current, end, sequence_length;
+  Context device;
+  vector<vector<mx_float>> sequences;
+  vector<wchar_t> index2chars;
+  unordered_map<wchar_t, mx_float> charIndices;
+
+ public:
+  BucketSentenceIter(string filename, int minibatch, Context context) : batch(minibatch),
+      current(-1), device(context) {
+    auto content = readContent(filename);
+    buildCharIndex(content);
+    sequences = convertTextToSequences(content, '\n');
+
+    int N = sequences.size() / batch * batch;  // total used samples
+    sequences.resize(N);
+    sort(sequences.begin(), sequences.end(), [](const vector<mx_float>& a,
+        const vector<mx_float>& b) { return a.size() < b.size(); });
+
+    sequence_length = sequences.back().size();
+    random = new Shuffler(N);
+    // We can still get random results if Reset() is called first
+//    vector<vector<mx_float>>* target = &sequences;
+//    random->shuffle([target](int n, int i) { (*target)[n].swap((*target)[i]); });
+    end = N / batch;
+  }
+  virtual ~BucketSentenceIter() {
+    delete random;
+  }
+
+  unsigned int maxSequenceLength() {
+    return sequence_length;
+  }
+
+  size_t characterSize() {
+    return charIndices.size();
+  }
+
+  virtual bool Next(void) {
+    return ++current < end;
+  }
+  virtual NDArray GetData(void) {
+    const int* indices = random->data();
+    mx_float *data = new mx_float[sequence_length * batch], *pdata = data;
+
+    for (int i = current * batch, end = i + batch; i < end; i++) {
+      memcpy(pdata, sequences[indices[i]].data(), sequences[indices[i]].size() * sizeof(mx_float));
+      if (sequences[indices[i]].size() < sequence_length)
+        memset(pdata + sequences[indices[i]].size(), 0,
+               (sequence_length - sequences[indices[i]].size()) * sizeof(mx_float));
+      pdata += sequence_length;
+    }
+    NDArray array(Shape(batch, sequence_length), device, false);
+    array.SyncCopyFromCPU(data, batch * sequence_length);
+    return array;
+  }
+  virtual NDArray GetLabel(void) {
+    const int* indices = random->data();
+    mx_float *label = new mx_float[sequence_length * batch], *plabel = label;
+
+    for (int i = current * batch, end = i + batch; i < end; i++) {
+      memcpy(plabel, sequences[indices[i]].data() + 1,
+             (sequences[indices[i]].size() - 1) * sizeof(mx_float));
+      memset(plabel + sequences[indices[i]].size() - 1, 0,
+             (sequence_length - sequences[indices[i]].size() + 1) * sizeof(mx_float));
+      plabel += sequence_length;
+    }
+    NDArray array(Shape(batch, sequence_length), device, false);
+    array.SyncCopyFromCPU(label, batch * sequence_length);
+    return array;
+  }
+  virtual int GetPadNum(void) {
+    return sequence_length - sequences[random->data()[current * batch]].size();
+  }
+  virtual std::vector<int> GetIndex(void) {
+    const int* indices = random->data();
+    vector<int> list(indices + current * batch, indices + current * batch + batch);
+    return list;
+  }
+  virtual void BeforeFirst(void) {
+    current = -1;
+    random->shuffle(nullptr);
+  }
+
+  wstring readContent(const string file) {
+    wifstream ifs(file, ios::binary);
+    if (ifs) {
+      wostringstream os;
+      os << ifs.rdbuf();
+      return os.str();
+    }
+    return L"";
+  }
+
+  void buildCharIndex(const wstring& content) {
+  // This version of buildCharIndex() is compatible with the python version char_rnn dictionary
+    int n = 1;
+    charIndices['\0'] = 0;  // padding character
+    index2chars.push_back(0);  // padding character index
+    for (auto c : content)
+      if (charIndices.find(c) == charIndices.end()) {
+        charIndices[c] = n++;
+        index2chars.push_back(c);
+      }
+  }
+//  void buildCharIndex(wstring& content) {
+//    for (auto c : content)
+//      charIndices[c]++;  // char-frequency map; then char-index map
+//    vector<tuple<wchar_t, mx_float>> characters;
+//    for (auto& iter : charIndices)
+//      characters.push_back(make_tuple(iter.first, iter.second));
+//    sort(characters.begin(), characters.end(), [](const tuple<wchar_t, mx_float>& a,
+//        const tuple<wchar_t, mx_float>& b) { return get<1>(a) > get<1>(b); });
+//    mx_float index = 1;  // 0 is left for zero-padding
+//    index2chars.clear();
+//    index2chars.push_back(0);  // zero-padding
+//    for (auto& t : characters) {
+//      charIndices[get<0>(t)] = index++;
+//      index2chars.push_back(get<0>(t));
+//    }
+//  }
+
+  inline wchar_t character(int i) {
+    return index2chars[i];
+  }
+
+  inline mx_float index(wchar_t c) {
+    return charIndices[c];
+  }
+
+  void saveCharIndices(const string file) {
+    wofstream ofs(file, ios::binary);
+    if (ofs) {
+      ofs.write(index2chars.data() + 1, index2chars.size() - 1);
+      ofs.close();
+    }
+  }
+
+  static tuple<unordered_map<wchar_t, mx_float>, vector<wchar_t>> loadCharIndices(
+      const string file) {
+    wifstream ifs(file, ios::binary);
+    unordered_map<wchar_t, mx_float> map;
+    vector<wchar_t> chars;
+    if (ifs) {
+      wostringstream os;
+      os << ifs.rdbuf();
+      int n = 1;
+      map[L'\0'] = 0;
+      chars.push_back(L'\0');
+      for (auto c : os.str()) {
+        map[c] = (mx_float) n++;
+        chars.push_back(c);
+      }
+    }
+    return {map, chars};
+  }
+
+  vector<vector<mx_float>> convertTextToSequences(const wstring& content, wchar_t splitter) {
+    vector<vector<mx_float>> sequences;
+    sequences.push_back(vector<mx_float>());
+    for (auto c : content)
+      if (c == splitter && !sequences.back().empty())
+        sequences.push_back(vector<mx_float>());
+      else
+        sequences.back().push_back(charIndices[c]);
+    return sequences;
+  }
+};
+
+void OutputPerplexity(NDArray* labels, NDArray* output) {
+  vector<mx_float> charIndices, a;
+  labels->SyncCopyToCPU(&charIndices, 0L);  // 0L indicates all
+  output->SyncCopyToCPU(&a, 0L)/*4128*84*/;
+  mx_float loss = 0;
+  int batchSize = labels->GetShape()[0]/*32*/, sequenceLength = labels->GetShape()[1]/*129*/,
+      nSamples = output->GetShape()[0]/*4128*/, vocabSize = output->GetShape()[1]/*84*/;
+  for (int n = 0; n < nSamples; n++) {
+    int row = n % batchSize, column = n / batchSize, labelOffset = column +
+        row * sequenceLength;  // Search based on column storage: labels.T
+    mx_float safe_value = max(1e-10f, a[vocabSize * n +
+        static_cast<int>(charIndices[labelOffset])]);
+    loss += -log(safe_value);  // Calculate negative log-likelihood
+  }
+  loss = exp(loss / nSamples);  // perplexity = exp(mean negative log-likelihood)
+  cout << "Train-Perplexity=" << loss << endl;
+}
+
+void SaveCheckpoint(const string filepath, Symbol net, Executor* exe) {
+  map<string, NDArray> params;
+  for (auto iter : exe->arg_dict())
+    if (iter.first.find("_init_") == string::npos
+        && iter.first.rfind("data") != iter.first.length() - 4
+        && iter.first.rfind("label") != iter.first.length() - 5)
+      params.insert({"arg:" + iter.first, iter.second});
+  for (auto iter : exe->aux_dict())
+    params.insert({"aux:" + iter.first, iter.second});
+  NDArray::Save(filepath, params);
+}
+
+void LoadCheckpoint(const string filepath, Executor* exe) {
+  map<string, NDArray> params = NDArray::LoadToMap(filepath);
+  for (auto iter : params) {
+    string type = iter.first.substr(0, 4);
+    string name = iter.first.substr(4);
+    NDArray target;
+    if (type == "arg:")
+      target = exe->arg_dict()[name];
+    else if (type == "aux:")
+      target = exe->aux_dict()[name];
+    else
+      continue;
+    iter.second.CopyTo(&target);
+  }
+}
+
+int input_dim = 0;/*84*/
+int sequence_length_max = 0;/*129*/
+int num_embed = 256;
+int num_lstm_layer = 3;
+int num_hidden = 512;
+mx_float dropout = 0.2;
+void train(const string file, int batch_size, int max_epoch, int start_epoch) {
+  Context device(DeviceType::kGPU, 0);
+  BucketSentenceIter dataIter(file, batch_size, device);
+  string prefix = file.substr(0, file.rfind("."));
+  dataIter.saveCharIndices(prefix + ".dictionary");
+
+  input_dim = static_cast<int>(dataIter.characterSize());
+  sequence_length_max = dataIter.maxSequenceLength();
+
+  auto RNN = LSTMUnroll(num_lstm_layer, sequence_length_max, input_dim, num_hidden,
+                        num_embed, dropout);
+  map<string, NDArray> args_map;
+  args_map["data"] = NDArray(Shape(batch_size, sequence_length_max), device, false);
+  args_map["softmax_label"] = NDArray(Shape(batch_size, sequence_length_max), device, false);
+  for (int i = 0; i < num_lstm_layer; i++) {
+    string key = "l" + to_string(i) + "_init_";
+    args_map[key + "c"] = NDArray(Shape(batch_size, num_hidden), device, false);
+    args_map[key + "h"] = NDArray(Shape(batch_size, num_hidden), device, false);
+  }
+  vector<mx_float> zeros(batch_size * num_hidden, 0);
+  // RNN.SimpleBind(device, args_map, {}, {{"data", kNullOp}});
+  Executor* exe = RNN.SimpleBind(device, args_map);
+
+  if (start_epoch == -1) {
+    Xavier xavier = Xavier(Xavier::gaussian, Xavier::in, 2.34);
+    for (auto &arg : exe->arg_dict())
+      xavier(arg.first, &arg.second);
+  } else {
+    LoadCheckpoint(prefix + "-" + to_string(start_epoch) + ".params", exe);
+  }
+  start_epoch++;
+
+  mx_float learning_rate = 0.0002;
+  mx_float weight_decay = 0.000002;
+  Optimizer* opt = OptimizerRegistry::Find("ccsgd");
+//  opt->SetParam("momentum", 0.9)->SetParam("rescale_grad", 1.0 / batch_size)
+//     ->SetParam("clip_gradient", 10);
+
+  for (int epoch = start_epoch; epoch < max_epoch; ++epoch) {
+    dataIter.Reset();
+    auto tic = chrono::system_clock::now();
+    while (dataIter.Next()) {
+      auto data_batch = dataIter.GetDataBatch();
+      data_batch.data.CopyTo(&exe->arg_dict()["data"]);
+      data_batch.label.CopyTo(&exe->arg_dict()["softmax_label"]);
+      for (int l = 0; l < num_lstm_layer; l++) {
+        string key = "l" + to_string(l) + "_init_";
+        exe->arg_dict()[key + "c"].SyncCopyFromCPU(zeros);
+        exe->arg_dict()[key + "h"].SyncCopyFromCPU(zeros);
+      }
+      NDArray::WaitAll();
+
+      exe->Forward(true);
+      exe->Backward();
+      exe->UpdateAll(opt, learning_rate, weight_decay);
+      NDArray::WaitAll();
+    }
+    auto toc = chrono::system_clock::now();
+    cout << "Epoch[" << epoch << "] Time Cost:" <<
+        chrono::duration_cast<chrono::seconds>(toc - tic).count() << " seconds ";
+    OutputPerplexity(&exe->arg_dict()["softmax_label"], &exe->outputs[0]);
+    string filepath = prefix + "-" + to_string(epoch) + ".params";
+    SaveCheckpoint(filepath, RNN, exe);
+  }
+}
+
+/*The original example, rnn_cell_demo.py, uses the default Xavier as initializer, which relies
+ * on the variable name and cannot initialize LSTM_parameters. Thus the variable was renamed to
+ * LSTM_bias, which can be initialized as zero. But then it cannot converge after 100 epochs on
+ * this corpus example. Using RNNXavier, after 15 oscillating epochs it rapidly converges like
+ * the old LSTMUnroll version.
*/ +class RNNXavier : public Xavier { + public: + RNNXavier(RandType rand_type = gaussian, FactorType factor_type = avg, + float magnitude = 3) : Xavier(rand_type, factor_type, magnitude) { + } + virtual ~RNNXavier() {} + protected: + virtual void InitDefault(NDArray* arr) { + Xavier::InitWeight(arr); + } +}; + +void trainWithBuiltInRNNOp(const string file, int batch_size, int max_epoch, int start_epoch) { + Context device(DeviceType::kGPU, 0); + BucketSentenceIter dataIter(file, batch_size, device); + string prefix = file.substr(0, file.rfind(".")); + dataIter.saveCharIndices(prefix + ".dictionary"); + + input_dim = static_cast(dataIter.characterSize()); + sequence_length_max = dataIter.maxSequenceLength(); + + auto RNN = LSTMWithBuiltInRNNOp(num_lstm_layer, sequence_length_max, input_dim, num_hidden, + num_embed, dropout); + map args_map; + args_map["data"] = NDArray(Shape(batch_size, sequence_length_max), device, false); + // Avoiding SwapAxis, batch_size is of second dimension. + args_map["LSTM_init_c"] = NDArray(Shape(num_lstm_layer, batch_size, num_hidden), device, false); + args_map["LSTM_init_h"] = NDArray(Shape(num_lstm_layer, batch_size, num_hidden), device, false); + args_map["softmax_label"] = NDArray(Shape(batch_size, sequence_length_max), device, false); + vector zeros(batch_size * num_lstm_layer * num_hidden, 0); + Executor* exe = RNN.SimpleBind(device, args_map); + + if (start_epoch == -1) { + RNNXavier xavier = RNNXavier(Xavier::gaussian, Xavier::in, 2.34); + for (auto &arg : exe->arg_dict()) + xavier(arg.first, &arg.second); + } else { + LoadCheckpoint(prefix + "-" + to_string(start_epoch) + ".params", exe); + } + start_epoch++; + + mx_float learning_rate = 0.0002; + mx_float weight_decay = 0.000002; + Optimizer* opt = OptimizerRegistry::Find("ccsgd"); +// opt->SetParam("momentum", 0.9)->SetParam("rescale_grad", 1.0 / batch_size) +// ->SetParam("clip_gradient", 10); + + for (int epoch = start_epoch; epoch < max_epoch; ++epoch) { + dataIter.Reset(); + auto tic = chrono::system_clock::now(); + while (dataIter.Next()) { + auto data_batch = dataIter.GetDataBatch(); + data_batch.data.CopyTo(&exe->arg_dict()["data"]); + data_batch.label.CopyTo(&exe->arg_dict()["softmax_label"]); + exe->arg_dict()["LSTM_init_c"].SyncCopyFromCPU(zeros); + exe->arg_dict()["LSTM_init_h"].SyncCopyFromCPU(zeros); + NDArray::WaitAll(); + + exe->Forward(true); + exe->Backward(); + exe->UpdateAll(opt, learning_rate, weight_decay); + NDArray::WaitAll(); + } + auto toc = chrono::system_clock::now(); + cout << "Epoch[" << epoch << "] Time Cost:" << + chrono::duration_cast(toc - tic).count() << " seconds "; + OutputPerplexity(&exe->arg_dict()["softmax_label"], &exe->outputs[0]); + string filepath = prefix + "-" + to_string(epoch) + ".params"; + SaveCheckpoint(filepath, RNN, exe); + } +} + +void predict(wstring* ptext, int sequence_length, const string param_file, + const string dictionary_file) { + Context device(DeviceType::kGPU, 0); + auto results = BucketSentenceIter::loadCharIndices(dictionary_file); + auto dictionary = get<0>(results); + auto charIndices = get<1>(results); + input_dim = static_cast(charIndices.size()); + auto RNN = LSTMUnroll(num_lstm_layer, 1, input_dim, num_hidden, num_embed, 0); + + map args_map; + args_map["data"] = NDArray(Shape(1, 1), device, false); + args_map["softmax_label"] = NDArray(Shape(1, 1), device, false); + vector zeros(1 * num_hidden, 0); + for (int l = 0; l < num_lstm_layer; l++) { + string key = "l" + to_string(l) + "_init_"; + args_map[key + "c"] = NDArray(Shape(1, 
num_hidden), device, false); + args_map[key + "h"] = NDArray(Shape(1, num_hidden), device, false); + args_map[key + "c"].SyncCopyFromCPU(zeros); + args_map[key + "h"].SyncCopyFromCPU(zeros); + } + Executor* exe = RNN.SimpleBind(device, args_map); + LoadCheckpoint(param_file, exe); + + mx_float index; + wchar_t next; + vector softmax; + softmax.resize(input_dim); + for (auto c : *ptext) { + exe->arg_dict()["data"].SyncCopyFromCPU(&dictionary[c], 1); + exe->Forward(false); + + exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim); + for (int l = 0; l < num_lstm_layer; l++) { + string key = "l" + to_string(l) + "_init_"; + exe->outputs[l * 2 + 1].CopyTo(&args_map[key + "c"]); + exe->outputs[l * 2 + 2].CopyTo(&args_map[key + "h"]); + } + + size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin(); + index = (mx_float) n; + next = charIndices[n]; + } + ptext->push_back(next); + + for (int i = 0; i < sequence_length; i++) { + exe->arg_dict()["data"].SyncCopyFromCPU(&index, 1); + exe->Forward(false); + + exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim); + for (int l = 0; l < num_lstm_layer; l++) { + string key = "l" + to_string(l) + "_init_"; + exe->outputs[l * 2 + 1].CopyTo(&args_map[key + "c"]); + exe->outputs[l * 2 + 2].CopyTo(&args_map[key + "h"]); + } + + size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin(); + index = (mx_float) n; + next = charIndices[n]; + ptext->push_back(next); + } +} + +void predictWithBuiltInRNNOp(wstring* ptext, int sequence_length, const string param_file, + const string dictionary_file) { + Context device(DeviceType::kGPU, 0); + auto results = BucketSentenceIter::loadCharIndices(dictionary_file); + auto dictionary = get<0>(results); + auto charIndices = get<1>(results); + input_dim = static_cast(charIndices.size()); + auto RNN = LSTMWithBuiltInRNNOp(num_lstm_layer, 1, input_dim, num_hidden, num_embed, 0); + + map args_map; + args_map["data"] = NDArray(Shape(1, 1), device, false); + args_map["softmax_label"] = NDArray(Shape(1, 1), device, false); + vector zeros(1 * num_lstm_layer * num_hidden, 0); + // Avoiding SwapAxis, batch_size=1 is of second dimension. 
+ args_map["LSTM_init_c"] = NDArray(Shape(num_lstm_layer, 1, num_hidden), device, false); + args_map["LSTM_init_h"] = NDArray(Shape(num_lstm_layer, 1, num_hidden), device, false); + args_map["LSTM_init_c"].SyncCopyFromCPU(zeros); + args_map["LSTM_init_h"].SyncCopyFromCPU(zeros); + Executor* exe = RNN.SimpleBind(device, args_map); + LoadCheckpoint(param_file, exe); + + mx_float index; + wchar_t next; + vector softmax; + softmax.resize(input_dim); + for (auto c : *ptext) { + exe->arg_dict()["data"].SyncCopyFromCPU(&dictionary[c], 1); + exe->Forward(false); + + exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim); + exe->outputs[1].CopyTo(&args_map["LSTM_init_h"]); + exe->outputs[2].CopyTo(&args_map["LSTM_init_c"]); + + size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin(); + index = (mx_float) n; + next = charIndices[n]; + } + ptext->push_back(next); + + for (int i = 0; i < sequence_length; i++) { + exe->arg_dict()["data"].SyncCopyFromCPU(&index, 1); + exe->Forward(false); + + exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim); + exe->outputs[1].CopyTo(&args_map["LSTM_init_h"]); + exe->outputs[2].CopyTo(&args_map["LSTM_init_c"]); + + size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin(); + index = (mx_float) n; + next = charIndices[n]; + ptext->push_back(next); + } +} + +int main(int argc, char** argv) { + if (argc < 5) { + cout << "Usage for training: charRNN train[BuiltIn][TimeMajor] {corpus file}" + " {batch size} {max epoch} [{starting epoch}]" << endl; + cout <<"Usage for prediction: charRNN predict[BuiltIn][TimeMajor] {params file}" + " {dictionary file} {beginning of text}" << endl; + cout <<"Note: The {params file} of train/trainBuiltIn/trainTimeMajor/trainBuiltInTimeMajor" + " are not compatible with each other." << endl; + return 0; + } + + string task = argv[1]; + bool builtIn = task.find("BuiltIn") != string::npos; + TIME_MAJOR = task.find("TimeMajor") != string::npos; + cout << "use BuiltIn cuDNN RNN: " << builtIn << endl + << "use data as TimeMajor: " << TIME_MAJOR << endl; + if (task.find("train") == 0) { + cout << "train batch size: " << argv[3] << endl + << "train max epoch: " << argv[4] << endl; + int start_epoch = argc > 5? atoi(argv[5]) : -1; + // this function will generate dictionary file and params file. + if (builtIn) + trainWithBuiltInRNNOp(argv[2], atoi(argv[3]), atoi(argv[4]), start_epoch); + else + train(argv[2], atoi(argv[3]), atoi(argv[4]), start_epoch); // ditto + } else if (task.find("predict") == 0) { + wstring text; // = L"If there is anyone out there who still doubts "; + // Considering of extending to Chinese samples in future, use wchar_t instead of char + for (char c : string(argv[4])) + text.push_back((wchar_t) c); + /*Python version predicts text default to random selecltions. Here I didn't write the random + code, always choose the 'best' character. So the text length reduced to 600. 
Longer size often + leads to repeated sentances, since training sequence length is only 129 for obama corpus.*/ + if (builtIn) + predictWithBuiltInRNNOp(&text, 600, argv[2], argv[3]); + else + predict(&text, 600, argv[2], argv[3]); + wcout << text << endl; + } + + MXNotifyShutdown(); + return 0; +} diff --git a/cpp-package/example/feature_extract/Makefile b/cpp-package/example/feature_extract/Makefile new file mode 100644 index 000000000000..808f2613b001 --- /dev/null +++ b/cpp-package/example/feature_extract/Makefile @@ -0,0 +1,26 @@ +CXX=g++ +BLAS=-L /opt/openblas/lib -lopenblas -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0 +CUDA=-DMSHADOW_USE_CUDA=1 +OPENCV_CFLAGS=`pkg-config --cflags opencv` +OPENCV_LDFLAGS=`pkg-config --libs opencv` + +#COMMFLAGS=-static -static-libgcc -static-libstdc++ + +CFLAGS=$(COMMFLAGS) -I ../../include -Wall -O3 -msse3 -funroll-loops -Wno-unused-parameter -Wno-unknown-pragmas -fopenmp +LDFLAGS=$(COMMFLAGS) -L ../../lib/linux -lmxnet $(BLAS) $(CUDA) -lgomp -pthread + +all: feature_extract prepare_data_with_opencv + +feature_extract: ./feature_extract.cpp + $(CXX) -c -std=c++0x $(CFLAGS) $^ + $(CXX) $(basename $@).o -o $@ $(LDFLAGS) + -rm -f $(basename $@).o + +prepare_data_with_opencv: ./prepare_data_with_opencv.cpp + $(CXX) -c -std=c++0x $(OPENCV_CFLAGS) $^ + $(CXX) $(basename $@).o -o $@ $(OPENCV_LDFLAGS) + -rm -f $(basename $@).o + +clean: + -rm -f feature_extract + -rm -f prepare_data_with_opencv diff --git a/cpp-package/example/feature_extract/feature_extract.cpp b/cpp-package/example/feature_extract/feature_extract.cpp new file mode 100644 index 000000000000..21853a3912e7 --- /dev/null +++ b/cpp-package/example/feature_extract/feature_extract.cpp @@ -0,0 +1,120 @@ +/*! + * Copyright (c) 2015 by Contributors + */ +#include +#include +#include +#include +#include +#include "mxnet-cpp/MxNetCpp.h" +using namespace std; +using namespace mxnet::cpp; + +/* + * This example shows how to extract features with a pretrained model. 
+ * Get the model here:
+ * https://github.com/dmlc/mxnet-model-gallery
+ * */
+
+/*The global context; change it if necessary*/
+Context global_ctx(kGPU, 0);
+// Context global_ctx(kCPU,0);
+
+class FeatureExtractor {
+ private:
+  /*the mean image, taken from the pretrained model*/
+  NDArray mean_img;
+  /*the following two maps store all the parameters needed by the model*/
+  map<string, NDArray> args_map;
+  map<string, NDArray> aux_map;
+  Symbol net;
+  Executor *executor;
+  /*Get the feature layer we want to extract*/
+  void GetFeatureSymbol() {
+    /*
+     * use the following to check all the layers' names:
+     * */
+    /*
+    net=Symbol::Load("./model/Inception_BN-symbol.json").GetInternals();
+    for(const auto & layer_name:net.ListOutputs()){
+      LG<<layer_name;
+    }
+    * */
+    net = Symbol::Load("./model/Inception_BN-symbol.json")
+              .GetInternals()["global_pool_output"];
+  }
+  void LoadParameters() {
+    map<string, NDArray> parameters;
+    NDArray::Load("./model/Inception_BN-0039.params", 0, &parameters);
+    for (const auto &k : parameters) {
+      if (k.first.substr(0, 4) == "aux:") {
+        auto name = k.first.substr(4, k.first.size() - 4);
+        aux_map[name] = k.second.Copy(global_ctx);
+      }
+      if (k.first.substr(0, 4) == "arg:") {
+        auto name = k.first.substr(4, k.first.size() - 4);
+        args_map[name] = k.second.Copy(global_ctx);
+      }
+    }
+    /*WaitAll is needed when we copy data between the GPU and the main memory*/
+    NDArray::WaitAll();
+  }
+  void GetMeanImg() {
+    mean_img = NDArray(Shape(1, 3, 224, 224), global_ctx, false);
+    mean_img.SyncCopyFromCPU(
+        NDArray::LoadToMap("./model/mean_224.nd")["mean_img"].GetData(),
+        1 * 3 * 224 * 224);
+    NDArray::WaitAll();
+  }
+
+ public:
+  FeatureExtractor() {
+    /*prepare the model, fill in the pretrained parameters, get the mean image*/
+    GetFeatureSymbol();
+    LoadParameters();
+    GetMeanImg();
+  }
+
+  void Extract(NDArray data) {
+    /*Normalize the pictures*/
+    data.Slice(0, 1) -= mean_img;
+    data.Slice(1, 2) -= mean_img;
+    args_map["data"] = data;
+    /*bind the executor*/
+    executor = net.SimpleBind(global_ctx, args_map, map<string, NDArray>(),
+                              map<string, OpReqType>(), aux_map);
+    executor->Forward(false);
+    /*print out the features*/
+    auto array = executor->outputs[0].Copy(Context(kCPU, 0));
+    NDArray::WaitAll();
+    for (int i = 0; i < 1024; ++i) {
+      cout << array.At(0, i) << ",";
+    }
+    cout << endl;
+  }
+};
+
+NDArray Data2NDArray() {
+  NDArray ret(Shape(2, 3, 224, 224), global_ctx, false);
+  ifstream inf("./img.dat", ios::binary);
+  vector<float> data(2 * 3 * 224 * 224);
+  inf.read(reinterpret_cast<char*>(data.data()), 2 * 3 * 224 * 224 * sizeof(float));
+  inf.close();
+  ret.SyncCopyFromCPU(data.data(), 2 * 3 * 224 * 224);
+  NDArray::WaitAll();
+  return ret;
+}
+
+int main() {
+  /*
+   * get the data from the binary file ./img.dat
+   * this file is generated by ./prepare_data_with_opencv
+   * it stores 2 pictures in NDArray format
+   *
+   */
+  auto data = Data2NDArray();
+  FeatureExtractor fe;
+  fe.Extract(data);
+  return 0;
+}
diff --git a/cpp-package/example/feature_extract/prepare_data_with_opencv.cpp b/cpp-package/example/feature_extract/prepare_data_with_opencv.cpp
new file mode 100644
index 000000000000..20cbe140fc09
--- /dev/null
+++ b/cpp-package/example/feature_extract/prepare_data_with_opencv.cpp
@@ -0,0 +1,37 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ */
+#include <opencv2/opencv.hpp>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+using namespace std;
+
+/*read images and store them in the NDArray format that MXNet.cpp can handle*/
+void Mat2Array() {
+  string file_name_list[] = {"./1.jpg", "./2.jpg"};
+
+  std::vector<float> array;
+  for (auto &t : file_name_list) {
+    cv::Mat mat = cv::imread(t);
+    /*resize pictures to (224, 224) according to the pretrained model*/
+    cv::resize(mat, mat, cv::Size(224, 224));
+    for (int c = 0; c < 3; ++c) {
+      for (int i = 0; i < 224; ++i) {
+        for (int j = 0; j < 224; ++j) {
+          array.push_back(static_cast<float>(mat.data[(i * 224 + j) * 3 + c]));
+        }
+      }
+    }
+  }
+  ofstream outf("./img.dat", ios::binary);
+  outf.write(reinterpret_cast<char*>(array.data()), array.size() * sizeof(float));
+  outf.close();
+}
+
+int main(int argc, char *argv[]) {
+  Mat2Array();
+  return 0;
+}
diff --git a/cpp-package/example/feature_extract/run.sh b/cpp-package/example/feature_extract/run.sh
new file mode 100755
index 000000000000..afac492b0a9d
--- /dev/null
+++ b/cpp-package/example/feature_extract/run.sh
@@ -0,0 +1,12 @@
+### To run this example,
+###
+### 1.
+### Get the Inception-BN model first, from here:
+### https://github.com/dmlc/mxnet-model-gallery
+###
+### 2.
+### Then prepare 2 pictures, 1.jpg and 2.jpg, to extract features from
+
+make
+./prepare_data_with_opencv
+LD_LIBRARY_PATH=../../lib/linux ./feature_extract
diff --git a/cpp-package/example/googlenet.cpp b/cpp-package/example/googlenet.cpp
new file mode 100644
index 000000000000..2a70ab39b21c
--- /dev/null
+++ b/cpp-package/example/googlenet.cpp
@@ -0,0 +1,163 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ */
+#include <iostream>
+#include <map>
+#include <string>
+
+#include "mxnet-cpp/MxNetCpp.h"
+
+using namespace mxnet::cpp;
+
+Symbol ConvFactory(Symbol data, int num_filter,
+                   Shape kernel,
+                   Shape stride = Shape(1, 1),
+                   Shape pad = Shape(0, 0),
+                   const std::string & name = "",
+                   const std::string & suffix = "") {
+  Symbol conv_w("conv_" + name + suffix + "_w"), conv_b("conv_" + name + suffix + "_b");
+
+  Symbol conv = Convolution("conv_" + name + suffix, data,
+                            conv_w, conv_b, kernel,
+                            num_filter, stride, Shape(1, 1), pad);
+  return Activation("relu_" + name + suffix, conv, "relu");
+}
+
+Symbol InceptionFactory(Symbol data, int num_1x1, int num_3x3red,
+                        int num_3x3, int num_d5x5red, int num_d5x5,
+                        PoolingPoolType pool, int proj, const std::string & name) {
+  Symbol c1x1 = ConvFactory(data, num_1x1, Shape(1, 1),
+                            Shape(1, 1), Shape(0, 0), name + "_1x1");
+
+  Symbol c3x3r = ConvFactory(data, num_3x3red, Shape(1, 1),
+                             Shape(1, 1), Shape(0, 0), name + "_3x3", "_reduce");
+
+  Symbol c3x3 = ConvFactory(c3x3r, num_3x3, Shape(3, 3),
+                            Shape(1, 1), Shape(1, 1), name + "_3x3");
+
+  Symbol cd5x5r = ConvFactory(data, num_d5x5red, Shape(1, 1),
+                              Shape(1, 1), Shape(0, 0), name + "_5x5", "_reduce");
+
+  Symbol cd5x5 = ConvFactory(cd5x5r, num_d5x5, Shape(5, 5),
+                             Shape(1, 1), Shape(2, 2), name + "_5x5");
+
+  Symbol pooling = Pooling(name + "_pool", data, Shape(3, 3), pool,
+                           false, PoolingPoolingConvention::valid,
+                           Shape(1, 1), Shape(1, 1));
+
+  Symbol cproj = ConvFactory(pooling, proj, Shape(1, 1),
+                             Shape(1, 1), Shape(0, 0), name + "_proj");
+
+  std::vector<Symbol> lst;
+  lst.push_back(c1x1);
+  lst.push_back(c3x3);
+  lst.push_back(cd5x5);
+  lst.push_back(cproj);
+  return Concat("ch_concat_" + name + "_chconcat", lst, lst.size());
+}
+
+Symbol GoogleNetSymbol(int num_classes) {
+  // data and label
+  Symbol data = Symbol::Variable("data");
+  Symbol data_label = Symbol::Variable("data_label");
+
+  Symbol conv1 =
ConvFactory(data, 64, Shape(7, 7), Shape(2, 2), Shape(3, 3), "conv1"); + Symbol pool1 = Pooling("pool1", conv1, Shape(3, 3), PoolingPoolType::max, + false, PoolingPoolingConvention::valid, Shape(2, 2)); + Symbol conv2 = ConvFactory(pool1, 64, Shape(1, 1), Shape(1, 1), + Shape(0, 0), "conv2"); + Symbol conv3 = ConvFactory(conv2, 192, Shape(3, 3), Shape(1, 1), Shape(1, 1), "conv3"); + Symbol pool3 = Pooling("pool3", conv3, Shape(3, 3), PoolingPoolType::max, + false, PoolingPoolingConvention::valid, Shape(2, 2)); + + Symbol in3a = InceptionFactory(pool3, 64, 96, 128, 16, 32, PoolingPoolType::max, 32, "in3a"); + Symbol in3b = InceptionFactory(in3a, 128, 128, 192, 32, 96, PoolingPoolType::max, 64, "in3b"); + Symbol pool4 = Pooling("pool4", in3b, Shape(3, 3), PoolingPoolType::max, + false, PoolingPoolingConvention::valid, Shape(2, 2)); + Symbol in4a = InceptionFactory(pool4, 192, 96, 208, 16, 48, PoolingPoolType::max, 64, "in4a"); + Symbol in4b = InceptionFactory(in4a, 160, 112, 224, 24, 64, PoolingPoolType::max, 64, "in4b"); + Symbol in4c = InceptionFactory(in4b, 128, 128, 256, 24, 64, PoolingPoolType::max, 64, "in4c"); + Symbol in4d = InceptionFactory(in4c, 112, 144, 288, 32, 64, PoolingPoolType::max, 64, "in4d"); + Symbol in4e = InceptionFactory(in4d, 256, 160, 320, 32, 128, PoolingPoolType::max, 128, "in4e"); + Symbol pool5 = Pooling("pool5", in4e, Shape(3, 3), PoolingPoolType::max, + false, PoolingPoolingConvention::valid, Shape(2, 2)); + Symbol in5a = InceptionFactory(pool5, 256, 160, 320, 32, 128, PoolingPoolType::max, 128, "in5a"); + Symbol in5b = InceptionFactory(in5a, 384, 192, 384, 48, 128, PoolingPoolType::max, 128, "in5b"); + Symbol pool6 = Pooling("pool6", in5b, Shape(7, 7), PoolingPoolType::avg, + false, PoolingPoolingConvention::valid, Shape(1, 1)); + + Symbol flatten = Flatten("flatten", pool6); + + Symbol fc1_w("fc1_w"), fc1_b("fc1_b"); + Symbol fc1 = FullyConnected("fc1", flatten, fc1_w, fc1_b, num_classes); + + return SoftmaxOutput("softmax", fc1, data_label); +} + +int main(int argc, char const *argv[]) { + int batch_size = 50; + int max_epoch = 100; + float learning_rate = 1e-4; + float weight_decay = 1e-4; + + auto googlenet = GoogleNetSymbol(10); + std::map args_map; + std::map aux_map; + + args_map["data"] = NDArray(Shape(batch_size, 3, 256, 256), Context::gpu()); + args_map["data_label"] = NDArray(Shape(batch_size), Context::gpu()); + googlenet.InferArgsMap(Context::gpu(), &args_map, args_map); + + auto train_iter = MXDataIter("ImageRecordIter") + .SetParam("path_imglist", "./train.lst") + .SetParam("path_imgrec", "./train.rec") + .SetParam("data_shape", Shape(3, 256, 256)) + .SetParam("batch_size", batch_size) + .SetParam("shuffle", 1) + .CreateDataIter(); + + auto val_iter = MXDataIter("ImageRecordIter") + .SetParam("path_imglist", "./val.lst") + .SetParam("path_imgrec", "./_val.rec") + .SetParam("data_shape", Shape(3, 256, 256)) + .SetParam("batch_size", batch_size) + .CreateDataIter(); + + Optimizer* opt = OptimizerRegistry::Find("ccsgd"); + opt->SetParam("momentum", 0.9) + ->SetParam("rescale_grad", 1.0 / batch_size) + ->SetParam("clip_gradient", 10); + + for (int iter = 0; iter < max_epoch; ++iter) { + LG << "Epoch: " << iter; + train_iter.Reset(); + while (train_iter.Next()) { + auto data_batch = train_iter.GetDataBatch(); + args_map["data"] = data_batch.data.Copy(Context::gpu()); + args_map["data_label"] = data_batch.label.Copy(Context::gpu()); + NDArray::WaitAll(); + auto *exec = googlenet.SimpleBind(Context::gpu(), args_map); + exec->Forward(true); + 
+      exec->Backward();
+      exec->UpdateAll(opt, learning_rate, weight_decay);
+      delete exec;
+    }
+
+    Accuracy acu;
+    val_iter.Reset();
+    while (val_iter.Next()) {
+      auto data_batch = val_iter.GetDataBatch();
+      args_map["data"] = data_batch.data.Copy(Context::gpu());
+      args_map["data_label"] = data_batch.label.Copy(Context::gpu());
+      NDArray::WaitAll();
+      auto *exec = googlenet.SimpleBind(Context::gpu(), args_map);
+      exec->Forward(false);
+      NDArray::WaitAll();
+      acu.Update(data_batch.label, exec->outputs[0]);
+      delete exec;
+    }
+    LG << "Accuracy: " << acu.Get();
+  }
+  MXNotifyShutdown();
+  return 0;
+}
diff --git a/cpp-package/example/inception_bn.cpp b/cpp-package/example/inception_bn.cpp
new file mode 100644
index 000000000000..e3a87f2c3dfa
--- /dev/null
+++ b/cpp-package/example/inception_bn.cpp
@@ -0,0 +1,188 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ */
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/MxNetCpp.h"
+
+using namespace mxnet::cpp;
+
+
+Symbol ConvFactoryBN(Symbol data, int num_filter,
+                     Shape kernel, Shape stride, Shape pad,
+                     const std::string & name,
+                     const std::string & suffix = "") {
+  Symbol conv_w("conv_" + name + suffix + "_w"), conv_b("conv_" + name + suffix + "_b");
+
+  Symbol conv = Convolution("conv_" + name + suffix, data,
+                            conv_w, conv_b, kernel,
+                            num_filter, stride, Shape(1, 1), pad);
+  Symbol bn = BatchNorm("bn_" + name + suffix, conv);
+  return Activation("relu_" + name + suffix, bn, "relu");
+}
+
+Symbol InceptionFactoryA(Symbol data, int num_1x1, int num_3x3red,
+                         int num_3x3, int num_d3x3red, int num_d3x3,
+                         PoolingPoolType pool, int proj,
+                         const std::string & name) {
+  Symbol c1x1 = ConvFactoryBN(data, num_1x1, Shape(1, 1), Shape(1, 1),
+                              Shape(0, 0), name + "1x1");
+  Symbol c3x3r = ConvFactoryBN(data, num_3x3red, Shape(1, 1), Shape(1, 1),
+                               Shape(0, 0), name + "_3x3r");
+  Symbol c3x3 = ConvFactoryBN(c3x3r, num_3x3, Shape(3, 3), Shape(1, 1),
+                              Shape(1, 1), name + "_3x3");
+  Symbol cd3x3r = ConvFactoryBN(data, num_d3x3red, Shape(1, 1), Shape(1, 1),
+                                Shape(0, 0), name + "_double_3x3", "_reduce");
+  Symbol cd3x3 = ConvFactoryBN(cd3x3r, num_d3x3, Shape(3, 3), Shape(1, 1),
+                               Shape(1, 1), name + "_double_3x3_0");
+  cd3x3 = ConvFactoryBN(cd3x3, num_d3x3, Shape(3, 3), Shape(1, 1),
+                        Shape(1, 1), name + "_double_3x3_1");
+  // the pooling branch operates on the block input, not on cd3x3
+  Symbol pooling = Pooling(name + "_pool", data,
+                           Shape(3, 3), pool, false,
+                           PoolingPoolingConvention::valid,
+                           Shape(1, 1), Shape(1, 1));
+  Symbol cproj = ConvFactoryBN(pooling, proj, Shape(1, 1), Shape(1, 1),
+                               Shape(0, 0), name + "_proj");
+  std::vector<Symbol> lst;
+  lst.push_back(c1x1);
+  lst.push_back(c3x3);
+  lst.push_back(cd3x3);
+  lst.push_back(cproj);
+  return Concat("ch_concat_" + name + "_chconcat", lst, lst.size());
+}
+
+Symbol InceptionFactoryB(Symbol data, int num_3x3red, int num_3x3,
+                         int num_d3x3red, int num_d3x3, const std::string & name) {
+  Symbol c3x3r = ConvFactoryBN(data, num_3x3red, Shape(1, 1),
+                               Shape(1, 1), Shape(0, 0),
+                               name + "_3x3", "_reduce");
+  Symbol c3x3 = ConvFactoryBN(c3x3r, num_3x3, Shape(3, 3), Shape(2, 2),
+                              Shape(1, 1), name + "_3x3");
+  Symbol cd3x3r = ConvFactoryBN(data, num_d3x3red, Shape(1, 1), Shape(1, 1),
+                                Shape(0, 0), name + "_double_3x3", "_reduce");
+  Symbol cd3x3 = ConvFactoryBN(cd3x3r, num_d3x3, Shape(3, 3), Shape(1, 1),
+                               Shape(1, 1), name + "_double_3x3_0");
+  cd3x3 = ConvFactoryBN(cd3x3, num_d3x3, Shape(3, 3), Shape(2, 2),
+                        Shape(1, 1), name + "_double_3x3_1");
+  Symbol pooling = Pooling("max_pool_" + name + "_pool", data,
+                           Shape(3, 3), PoolingPoolType::max,
+                           false,
+                           PoolingPoolingConvention::valid, Shape(2, 2));
+  std::vector<Symbol> lst;
+  lst.push_back(c3x3);
+  lst.push_back(cd3x3);
+  lst.push_back(pooling);
+  return Concat("ch_concat_" + name + "_chconcat", lst, lst.size());
+}
+
+Symbol InceptionSymbol(int num_classes) {
+  // data and label
+  Symbol data = Symbol::Variable("data");
+  Symbol data_label = Symbol::Variable("data_label");
+
+  // stage 1
+  Symbol conv1 = ConvFactoryBN(data, 64, Shape(7, 7), Shape(2, 2), Shape(3, 3), "conv1");
+  Symbol pool1 = Pooling("pool1", conv1, Shape(3, 3), PoolingPoolType::max,
+                         false, PoolingPoolingConvention::valid, Shape(2, 2));
+
+  // stage 2
+  Symbol conv2red = ConvFactoryBN(pool1, 64, Shape(1, 1), Shape(1, 1), Shape(0, 0), "conv2red");
+  Symbol conv2 = ConvFactoryBN(conv2red, 192, Shape(3, 3), Shape(1, 1), Shape(1, 1), "conv2");
+  Symbol pool2 = Pooling("pool2", conv2, Shape(3, 3), PoolingPoolType::max,
+                         false, PoolingPoolingConvention::valid, Shape(2, 2));
+
+  // stage 3
+  Symbol in3a = InceptionFactoryA(pool2, 64, 64, 64, 64, 96, PoolingPoolType::avg, 32, "3a");
+  Symbol in3b = InceptionFactoryA(in3a, 64, 64, 96, 64, 96, PoolingPoolType::avg, 64, "3b");
+  Symbol in3c = InceptionFactoryB(in3b, 128, 160, 64, 96, "3c");
+
+  // stage 4
+  Symbol in4a = InceptionFactoryA(in3c, 224, 64, 96, 96, 128, PoolingPoolType::avg, 128, "4a");
+  Symbol in4b = InceptionFactoryA(in4a, 192, 96, 128, 96, 128, PoolingPoolType::avg, 128, "4b");
+  Symbol in4c = InceptionFactoryA(in4b, 160, 128, 160, 128, 160, PoolingPoolType::avg, 128, "4c");
+  Symbol in4d = InceptionFactoryA(in4c, 96, 128, 192, 160, 192, PoolingPoolType::avg, 128, "4d");
+  Symbol in4e = InceptionFactoryB(in4d, 128, 192, 192, 256, "4e");
+
+  // stage 5
+  Symbol in5a = InceptionFactoryA(in4e, 352, 192, 320, 160, 224, PoolingPoolType::avg, 128, "5a");
+  Symbol in5b = InceptionFactoryA(in5a, 352, 192, 320, 192, 224, PoolingPoolType::max, 128, "5b");
+
+  // average pooling
+  Symbol avg = Pooling("global_pool", in5b, Shape(7, 7), PoolingPoolType::avg);
+
+  // classifier
+  Symbol flatten = Flatten("flatten", avg);
+  Symbol fc1_w("fc1_w"), fc1_b("fc1_b");
+  Symbol fc1 = FullyConnected("fc1", flatten, fc1_w, fc1_b, num_classes);
+  return SoftmaxOutput("softmax", fc1, data_label);
+}
+
+int main(int argc, char const *argv[]) {
+  int batch_size = 40;
+  int max_epoch = 100;
+  float learning_rate = 1e-4;
+  float weight_decay = 1e-4;
+
+  auto inception_bn_net = InceptionSymbol(10);
+  std::map<std::string, NDArray> args_map;
+  std::map<std::string, NDArray> aux_map;
+
+  args_map["data"] = NDArray(Shape(batch_size, 3, 224, 224), Context::gpu());
+  args_map["data_label"] = NDArray(Shape(batch_size), Context::gpu());
+  inception_bn_net.InferArgsMap(Context::gpu(), &args_map, args_map);
+
+  auto train_iter = MXDataIter("ImageRecordIter")
+      .SetParam("path_imglist", "./train.lst")
+      .SetParam("path_imgrec", "./train.rec")
+      .SetParam("data_shape", Shape(3, 224, 224))
+      .SetParam("batch_size", batch_size)
+      .SetParam("shuffle", 1)
+      .CreateDataIter();
+
+  auto val_iter = MXDataIter("ImageRecordIter")
+      .SetParam("path_imglist", "./val.lst")
+      .SetParam("path_imgrec", "./val.rec")
+      .SetParam("data_shape", Shape(3, 224, 224))
+      .SetParam("batch_size", batch_size)
+      .CreateDataIter();
+
+  Optimizer* opt = OptimizerRegistry::Find("ccsgd");
+  opt->SetParam("momentum", 0.9)
+      ->SetParam("rescale_grad", 1.0 / batch_size)
+      ->SetParam("clip_gradient", 10);
+
+  auto *exec = inception_bn_net.SimpleBind(Context::gpu(), args_map);
+
+  for (int iter = 0; iter < max_epoch; ++iter) {
+    LG << "Epoch: " << iter;
+    train_iter.Reset();
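+    // Note that this example binds the executor once, outside the batch
+    // loop, and copies each batch into the already-bound NDArrays with
+    // CopyTo(); the googlenet example above instead calls SimpleBind() per
+    // batch. Binding once avoids repeated allocation when shapes never
+    // change.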
+    while (train_iter.Next()) {
+      auto data_batch = train_iter.GetDataBatch();
+      data_batch.data.CopyTo(&args_map["data"]);
+      data_batch.label.CopyTo(&args_map["data_label"]);
+      NDArray::WaitAll();
+
+      exec->Forward(true);
+      exec->Backward();
+      exec->UpdateAll(opt, learning_rate, weight_decay);
+      NDArray::WaitAll();
+    }
+
+    Accuracy acu;
+    val_iter.Reset();
+    while (val_iter.Next()) {
+      auto data_batch = val_iter.GetDataBatch();
+      data_batch.data.CopyTo(&args_map["data"]);
+      data_batch.label.CopyTo(&args_map["data_label"]);
+      NDArray::WaitAll();
+      exec->Forward(false);
+      NDArray::WaitAll();
+      acu.Update(data_batch.label, exec->outputs[0]);
+    }
+    LG << "Accuracy: " << acu.Get();
+  }
+  delete exec;
+  MXNotifyShutdown();
+  return 0;
+}
diff --git a/cpp-package/example/lenet.cpp b/cpp-package/example/lenet.cpp
new file mode 100644
index 000000000000..cdd601b12996
--- /dev/null
+++ b/cpp-package/example/lenet.cpp
@@ -0,0 +1,233 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ */
+#include <fstream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/MxNetCpp.h"
+using namespace std;
+using namespace mxnet::cpp;
+
+class Lenet {
+ public:
+  Lenet()
+      : ctx_cpu(Context(DeviceType::kCPU, 0)),
+        ctx_dev(Context(DeviceType::kGPU, 0)) {}
+  void Run() {
+    /*
+     * LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner.
+     * "Gradient-based learning applied to document recognition."
+     * Proceedings of the IEEE (1998)
+     * */
+
+    /*define the symbolic net*/
+    Symbol data = Symbol::Variable("data");
+    Symbol data_label = Symbol::Variable("data_label");
+    Symbol conv1_w("conv1_w"), conv1_b("conv1_b");
+    Symbol conv2_w("conv2_w"), conv2_b("conv2_b");
+    Symbol conv3_w("conv3_w"), conv3_b("conv3_b");
+    Symbol fc1_w("fc1_w"), fc1_b("fc1_b");
+    Symbol fc2_w("fc2_w"), fc2_b("fc2_b");
+
+    Symbol conv1 =
+        Convolution("conv1", data, conv1_w, conv1_b, Shape(5, 5), 20);
+    Symbol tanh1 = Activation("tanh1", conv1, ActivationActType::tanh);
+    Symbol pool1 = Pooling("pool1", tanh1, Shape(2, 2), PoolingPoolType::max,
+                           false, PoolingPoolingConvention::valid, Shape(2, 2));
+
+    Symbol conv2 = Convolution("conv2", pool1, conv2_w, conv2_b,
+                               Shape(5, 5), 50);
+    Symbol tanh2 = Activation("tanh2", conv2, ActivationActType::tanh);
+    Symbol pool2 = Pooling("pool2", tanh2, Shape(2, 2), PoolingPoolType::max,
+                           false, PoolingPoolingConvention::valid, Shape(2, 2));
+
+    Symbol conv3 = Convolution("conv3", pool2, conv3_w, conv3_b,
+                               Shape(2, 2), 500);
+    Symbol tanh3 = Activation("tanh3", conv3, ActivationActType::tanh);
+    Symbol pool3 = Pooling("pool3", tanh3, Shape(2, 2), PoolingPoolType::max,
+                           false, PoolingPoolingConvention::valid, Shape(1, 1));
+
+    Symbol flatten = Flatten("flatten", pool3);
+    Symbol fc1 = FullyConnected("fc1", flatten, fc1_w, fc1_b, 500);
+    Symbol tanh4 = Activation("tanh4", fc1, ActivationActType::tanh);
+    Symbol fc2 = FullyConnected("fc2", tanh4, fc2_w, fc2_b, 10);
+
+    Symbol lenet = SoftmaxOutput("softmax", fc2, data_label);
+
+    for (auto s : lenet.ListArguments()) {
+      LG << s;
+    }
+
+    /*setup basic configs*/
+    int val_fold = 1;
+    int W = 28;
+    int H = 28;
+    int batch_size = 42;
+    int max_epoch = 100000;
+    float learning_rate = 1e-4;
+    float weight_decay = 1e-4;
+
+    /*prepare the data*/
+    vector<float> data_vec, label_vec;
+    size_t data_count = GetData(&data_vec, &label_vec);
+    const float *dptr = data_vec.data();
+    const float *lptr = label_vec.data();
+    NDArray data_array = NDArray(Shape(data_count, 1, W, H), ctx_cpu,
+                                 false);  // store in main memory, and copy to
+                                          // device memory while training
+    NDArray label_array =
+        NDArray(Shape(data_count), ctx_cpu,
+                false);  // it's also ok to just store them all in device memory
+    data_array.SyncCopyFromCPU(dptr, data_count * W * H);
+    label_array.SyncCopyFromCPU(lptr, data_count);
+    data_array.WaitToRead();
+    label_array.WaitToRead();
+
+    size_t train_num = data_count * (1 - val_fold / 10.0);
+    train_data = data_array.Slice(0, train_num);
+    train_label = label_array.Slice(0, train_num);
+    val_data = data_array.Slice(train_num, data_count);
+    val_label = label_array.Slice(train_num, data_count);
+
+    LG << "data reading finished";
+
+    /*init some of the args*/
+    // map<string, NDArray> args_map;
+    args_map["data"] = data_array.Slice(0, batch_size).Copy(ctx_dev);
+    args_map["data_label"] = label_array.Slice(0, batch_size).Copy(ctx_dev);
+    NDArray::WaitAll();
+
+    LG << "input slices ready";
+    /*
+     * we can also feed in some of the args other than the input all by
+     * ourselves,
+     * fc1_w, fc2_b for example:
+     * */
+    // args_map["fc1_w"] =
+    //     NDArray(mshadow::Shape2(500, 4 * 4 * 50), ctx_dev, false);
+    // NDArray::SampleGaussian(0, 1, &args_map["fc1_w"]);
+    // args_map["fc2_b"] = NDArray(mshadow::Shape1(10), ctx_dev, false);
+    // args_map["fc2_b"] = 0;
+
+    lenet.InferArgsMap(ctx_dev, &args_map, args_map);
+    Optimizer* opt = OptimizerRegistry::Find("ccsgd");
+    opt->SetParam("momentum", 0.9)
+        ->SetParam("rescale_grad", 1.0)
+        ->SetParam("clip_gradient", 10);
+
+    for (int ITER = 0; ITER < max_epoch; ++ITER) {
+      size_t start_index = 0;
+      while (start_index < train_num) {
+        if (start_index + batch_size > train_num) {
+          start_index = train_num - batch_size;
+        }
+        args_map["data"] =
+            train_data.Slice(start_index, start_index + batch_size)
+                .Copy(ctx_dev);
+        args_map["data_label"] =
+            train_label.Slice(start_index, start_index + batch_size)
+                .Copy(ctx_dev);
+        start_index += batch_size;
+        NDArray::WaitAll();
+
+        Executor *exe = lenet.SimpleBind(ctx_dev, args_map);
+        exe->Forward(true);
+        exe->Backward();
+        exe->UpdateAll(opt, learning_rate, weight_decay);
+
+        delete exe;
+      }
+
+      LG << "Iter " << ITER
+         << ", accuracy: " << ValAccuracy(batch_size * 10, lenet);
+    }
+  }
+
+ private:
+  Context ctx_cpu;
+  Context ctx_dev;
+  map<string, NDArray> args_map;
+  NDArray train_data;
+  NDArray train_label;
+  NDArray val_data;
+  NDArray val_label;
+
+  size_t GetData(vector<float> *data, vector<float> *label) {
+    const char *train_data_path = "./train.csv";
+    ifstream inf(train_data_path);
+    string line;
+    inf >> line;  // ignore the header
+    size_t _N = 0;
+    while (inf >> line) {
+      for (auto &c : line) c = (c == ',') ? ' ' : c;
+      stringstream ss;
+      ss << line;
+      float _data;
+      ss >> _data;
+      label->push_back(_data);
+      while (ss >> _data) data->push_back(_data / 256.0);
+      _N++;
+    }
+    inf.close();
+    return _N;
+  }
+
+  float ValAccuracy(int batch_size, Symbol lenet) {
+    size_t val_num = val_data.GetShape()[0];
+
+    size_t correct_count = 0;
+    size_t all_count = 0;
+
+    size_t start_index = 0;
+    while (start_index < val_num) {
+      if (start_index + batch_size > val_num) {
+        start_index = val_num - batch_size;
+      }
+      args_map["data"] =
+          val_data.Slice(start_index, start_index + batch_size).Copy(ctx_dev);
+      args_map["data_label"] =
+          val_label.Slice(start_index, start_index + batch_size).Copy(ctx_dev);
+      start_index += batch_size;
+      NDArray::WaitAll();
+
+      Executor *exe = lenet.SimpleBind(ctx_dev, args_map);
+      exe->Forward(false);
+
+      const auto &out = exe->outputs;
+      NDArray out_cpu = out[0].Copy(ctx_cpu);
+      NDArray label_cpu =
+          val_label.Slice(start_index - batch_size, start_index).Copy(ctx_cpu);
+
+      NDArray::WaitAll();
+
+      const mx_float *dptr_out = out_cpu.GetData();
+      const mx_float *dptr_label = label_cpu.GetData();
+      for (int i = 0; i < batch_size; ++i) {
+        float label = dptr_label[i];
+        int cat_num = out_cpu.GetShape()[1];
+        float p_label = 0, max_p = dptr_out[i * cat_num];
+        for (int j = 0; j < cat_num; ++j) {
+          float p = dptr_out[i * cat_num + j];
+          if (max_p < p) {
+            p_label = j;
+            max_p = p;
+          }
+        }
+        if (label == p_label) correct_count++;
+      }
+      all_count += batch_size;
+
+      delete exe;
+    }
+    return correct_count * 1.0 / all_count;
+  }
+};
+
+int main(int argc, char const *argv[]) {
+  Lenet lenet;
+  lenet.Run();
+  MXNotifyShutdown();
+  return 0;
+}
diff --git a/cpp-package/example/lenet_with_mxdataiter.cpp b/cpp-package/example/lenet_with_mxdataiter.cpp
new file mode 100644
index 000000000000..6a54fa44a67b
--- /dev/null
+++ b/cpp-package/example/lenet_with_mxdataiter.cpp
@@ -0,0 +1,119 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ */
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/MxNetCpp.h"
+using namespace std;
+using namespace mxnet::cpp;
+
+Symbol LenetSymbol() {
+  /*
+   * LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner.
+   * "Gradient-based learning applied to document recognition."
+   * Proceedings of the IEEE (1998)
+   * */
+
+  /*define the symbolic net*/
+  Symbol data = Symbol::Variable("data");
+  Symbol data_label = Symbol::Variable("data_label");
+  Symbol conv1_w("conv1_w"), conv1_b("conv1_b");
+  Symbol conv2_w("conv2_w"), conv2_b("conv2_b");
+  Symbol conv3_w("conv3_w"), conv3_b("conv3_b");
+  Symbol fc1_w("fc1_w"), fc1_b("fc1_b");
+  Symbol fc2_w("fc2_w"), fc2_b("fc2_b");
+
+  Symbol conv1 = Convolution("conv1", data, conv1_w, conv1_b, Shape(5, 5), 20);
+  Symbol tanh1 = Activation("tanh1", conv1, ActivationActType::tanh);
+  Symbol pool1 = Pooling("pool1", tanh1, Shape(2, 2), PoolingPoolType::max,
+                         false, PoolingPoolingConvention::valid, Shape(2, 2));
+
+  Symbol conv2 = Convolution("conv2", pool1, conv2_w, conv2_b, Shape(5, 5), 50);
+  Symbol tanh2 = Activation("tanh2", conv2, ActivationActType::tanh);
+  Symbol pool2 = Pooling("pool2", tanh2, Shape(2, 2), PoolingPoolType::max,
+                         false, PoolingPoolingConvention::valid, Shape(2, 2));
+
+  Symbol flatten = Flatten("flatten", pool2);
+  Symbol fc1 = FullyConnected("fc1", flatten, fc1_w, fc1_b, 500);
+  Symbol tanh3 = Activation("tanh3", fc1, ActivationActType::tanh);
+  Symbol fc2 = FullyConnected("fc2", tanh3, fc2_w, fc2_b, 10);
+
+  Symbol lenet = SoftmaxOutput("softmax", fc2, data_label);
+
+  return lenet;
+}
+
+int main(int argc, char const *argv[]) {
+  /*setup basic configs*/
+  int W = 28;
+  int H = 28;
+  int batch_size = 128;
+  int max_epoch = 100;
+  float learning_rate = 1e-4;
+  float weight_decay = 1e-4;
+
+  auto lenet = LenetSymbol();
+  std::map<std::string, NDArray> args_map;
+
+  args_map["data"] = NDArray(Shape(batch_size, 1, W, H), Context::gpu());
+  args_map["data_label"] = NDArray(Shape(batch_size), Context::gpu());
+  lenet.InferArgsMap(Context::gpu(), &args_map, args_map);
+
+  args_map["fc1_w"] = NDArray(Shape(500, 4 * 4 * 50), Context::gpu());
+  NDArray::SampleGaussian(0, 1, &args_map["fc1_w"]);
+  args_map["fc2_b"] = NDArray(Shape(10), Context::gpu());
+  args_map["fc2_b"] = 0;
+
+  auto train_iter = MXDataIter("MNISTIter")
+      .SetParam("image", "./train-images-idx3-ubyte")
+      .SetParam("label", "./train-labels-idx1-ubyte")
+      .SetParam("batch_size", batch_size)
+      .SetParam("shuffle", 1)
+      .SetParam("flat", 0)
+      .CreateDataIter();
+  auto val_iter = MXDataIter("MNISTIter")
+      .SetParam("image", "./t10k-images-idx3-ubyte")
+      .SetParam("label", "./t10k-labels-idx1-ubyte")
+      .CreateDataIter();
+
+  Optimizer* opt = OptimizerRegistry::Find("ccsgd");
+  opt->SetParam("momentum", 0.9)
+      ->SetParam("rescale_grad", 1.0)
+      ->SetParam("clip_gradient", 10);
+
+  for (int iter = 0; iter < max_epoch; ++iter) {
+    LG << "Epoch: " << iter;
+    train_iter.Reset();
+    while (train_iter.Next()) {
+      auto data_batch = train_iter.GetDataBatch();
+      args_map["data"] = data_batch.data.Copy(Context::gpu());
+      args_map["data_label"] = data_batch.label.Copy(Context::gpu());
+      NDArray::WaitAll();
+      auto *exec = lenet.SimpleBind(Context::gpu(), args_map);
+      exec->Forward(true);
+      exec->Backward();
+      exec->UpdateAll(opt, learning_rate, weight_decay);
+      delete exec;
+    }
+
+    Accuracy acu;
+    val_iter.Reset();
+    while (val_iter.Next()) {
+      auto data_batch = val_iter.GetDataBatch();
+      args_map["data"] = data_batch.data.Copy(Context::gpu());
+      args_map["data_label"] = data_batch.label.Copy(Context::gpu());
+      NDArray::WaitAll();
+      auto *exec = lenet.SimpleBind(Context::gpu(), args_map);
+      exec->Forward(false);
+      NDArray::WaitAll();
+      acu.Update(data_batch.label, exec->outputs[0]);
+      delete exec;
+    }
+    LG << "Accuracy: " << acu.Get();
+  }
+  MXNotifyShutdown();
+  return 0;
+}
diff --git a/cpp-package/example/mlp.cpp b/cpp-package/example/mlp.cpp
new file mode 100644
index 000000000000..2a19f6ea1e87
--- /dev/null
+++ b/cpp-package/example/mlp.cpp
@@ -0,0 +1,162 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ */
+
+#include <iostream>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/MxNetCpp.h"
+
+using namespace std;
+using namespace mxnet::cpp;
+
+/*
+ * In this example,
+ * we hand-craft some data in 10 classes with a simple pattern
+ * and try to use an MLP to recognize the pattern.
+ */
+
+void OutputAccuracy(mx_float* pred, mx_float* target) {
+  int right = 0;
+  for (int i = 0; i < 128; ++i) {
+    float mx_p = pred[i * 10 + 0];
+    float p_y = 0;
+    for (int j = 0; j < 10; ++j) {
+      if (pred[i * 10 + j] > mx_p) {
+        mx_p = pred[i * 10 + j];
+        p_y = j;
+      }
+    }
+    if (p_y == target[i]) right++;
+  }
+  cout << "Accuracy: " << right / 128.0 << endl;
+}
+
+void MLP() {
+  auto sym_x = Symbol::Variable("X");
+  auto sym_label = Symbol::Variable("label");
+
+  const int nLayers = 2;
+  vector<int> layerSizes({512, 10});
+  vector<Symbol> weights(nLayers);
+  vector<Symbol> biases(nLayers);
+  vector<Symbol> outputs(nLayers);
+
+  for (int i = 0; i < nLayers; i++) {
+    string istr = to_string(i);
+    weights[i] = Symbol::Variable(string("w") + istr);
+    biases[i] = Symbol::Variable(string("b") + istr);
+    Symbol fc = FullyConnected(string("fc") + istr,
+                               i == 0 ? sym_x : outputs[i-1],
+                               weights[i], biases[i], layerSizes[i]);
+    outputs[i] = LeakyReLU(string("act") + istr, fc, LeakyReLUActType::leaky);
+  }
+  auto sym_out = SoftmaxOutput("softmax", outputs[nLayers - 1], sym_label);
+
+  Context ctx_dev(DeviceType::kCPU, 0);
+
+  NDArray array_x(Shape(128, 28), ctx_dev, false);
+  NDArray array_y(Shape(128), ctx_dev, false);
+
+  mx_float* aptr_x = new mx_float[128 * 28];
+  mx_float* aptr_y = new mx_float[128];
+
+  // we make the data by hand, in 10 classes, with some pattern
+  for (int i = 0; i < 128; i++) {
+    for (int j = 0; j < 28; j++) {
+      aptr_x[i * 28 + j] = i % 10 * 1.0f;
+    }
+    aptr_y[i] = i % 10;
+  }
+  array_x.SyncCopyFromCPU(aptr_x, 128 * 28);
+  array_x.WaitToRead();
+  array_y.SyncCopyFromCPU(aptr_y, 128);
+  array_y.WaitToRead();
+
+  // init the parameters
+  NDArray array_w_1(Shape(512, 28), ctx_dev, false);
+  NDArray array_b_1(Shape(512), ctx_dev, false);
+  NDArray array_w_2(Shape(10, 512), ctx_dev, false);
+  NDArray array_b_2(Shape(10), ctx_dev, false);
+
+  // the parameters should be initialized from some kind of distribution
+  // so that learning starts fast,
+  // but here we just assign a constant value by hand
+  array_w_1 = 0.5f;
+  array_b_1 = 0.0f;
+  array_w_2 = 0.5f;
+  array_b_2 = 0.0f;
+
+  // the grads
+  NDArray array_w_1_g(Shape(512, 28), ctx_dev, false);
+  NDArray array_b_1_g(Shape(512), ctx_dev, false);
+  NDArray array_w_2_g(Shape(10, 512), ctx_dev, false);
+  NDArray array_b_2_g(Shape(10), ctx_dev, false);
+
+  // Bind the symbolic network with the ndarrays
+  // all the input args
+  std::vector<NDArray> in_args;
+  in_args.push_back(array_x);
+  in_args.push_back(array_w_1);
+  in_args.push_back(array_b_1);
+  in_args.push_back(array_w_2);
+  in_args.push_back(array_b_2);
+  in_args.push_back(array_y);
+  // all the grads
+  std::vector<NDArray> arg_grad_store;
+  arg_grad_store.push_back(NDArray());  // we don't need the grad of the input
+  arg_grad_store.push_back(array_w_1_g);
+  arg_grad_store.push_back(array_b_1_g);
+  arg_grad_store.push_back(array_w_2_g);
+  arg_grad_store.push_back(array_b_2_g);
+  arg_grad_store.push_back(
+      NDArray());  // neither do we need the grad of the label
+  // how to handle the grads
+  std::vector<OpReqType> grad_req_type;
+  grad_req_type.push_back(kNullOp);
+  grad_req_type.push_back(kWriteTo);
+  grad_req_type.push_back(kWriteTo);
+  grad_req_type.push_back(kWriteTo);
+  grad_req_type.push_back(kWriteTo);
+  grad_req_type.push_back(kNullOp);
+  std::vector<NDArray> aux_states;
+
+  cout << "make the Executor" << endl;
+  Executor* exe = new Executor(sym_out, ctx_dev, in_args, arg_grad_store,
+                               grad_req_type, aux_states);
+
+  cout << "Training" << endl;
+  int max_iters = 20000;
+  mx_float learning_rate = 0.0001;
+  for (int iter = 0; iter < max_iters; ++iter) {
+    exe->Forward(true);
+
+    if (iter % 100 == 0) {
+      cout << "iter " << iter << endl;
+      std::vector<NDArray>& out = exe->outputs;
+      float* cptr = new float[128 * 10];
+      out[0].SyncCopyToCPU(cptr, 128 * 10);
+      NDArray::WaitAll();
+      OutputAccuracy(cptr, aptr_y);
+      delete[] cptr;
+    }
+
+    // update the parameters with plain SGD: w -= lr * grad
+    exe->Backward();
+    for (int i = 1; i < 5; ++i) {
+      in_args[i] -= arg_grad_store[i] * learning_rate;
+    }
+    NDArray::WaitAll();
+  }
+
+  delete exe;
+  delete[] aptr_x;
+  delete[] aptr_y;
+}
+
+int main(int argc, char** argv) {
+  MLP();
+  MXNotifyShutdown();
+  return 0;
+}
+
diff --git a/cpp-package/example/resnet.cpp b/cpp-package/example/resnet.cpp
new file mode 100644
index 000000000000..4477c87224f2
--- /dev/null
+++ b/cpp-package/example/resnet.cpp
@@ -0,0 +1,191 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ */
+#include <cmath>
+#include <iostream>
+#include <map>
+#include <string>
+#include "mxnet-cpp/MxNetCpp.h"
+
+using namespace mxnet::cpp;
+
+Symbol ConvolutionNoBias(const std::string& symbol_name,
+                         Symbol data,
+                         Symbol weight,
+                         Shape kernel,
+                         int num_filter,
+                         Shape stride = Shape(1, 1),
+                         Shape dilate = Shape(1, 1),
+                         Shape pad = Shape(0, 0),
+                         int num_group = 1,
+                         int64_t workspace = 512) {
+  return Operator("Convolution")
+      .SetParam("kernel", kernel)
+      .SetParam("num_filter", num_filter)
+      .SetParam("stride", stride)
+      .SetParam("dilate", dilate)
+      .SetParam("pad", pad)
+      .SetParam("num_group", num_group)
+      .SetParam("workspace", workspace)
+      .SetParam("no_bias", true)
+      .SetInput("data", data)
+      .SetInput("weight", weight)
+      .CreateSymbol(symbol_name);
+}
+
+Symbol getConv(const std::string & name, Symbol data,
+               int num_filter,
+               Shape kernel, Shape stride, Shape pad,
+               bool with_relu,
+               mx_float bn_momentum) {
+  Symbol conv_w(name + "_w");
+  Symbol conv = ConvolutionNoBias(name, data, conv_w,
+                                  kernel, num_filter, stride, Shape(1, 1),
+                                  pad, 1, 512);
+
+  Symbol bn = BatchNorm(name + "_bn", conv, 2e-5, bn_momentum, false);
+
+  if (with_relu) {
+    return Activation(name + "_relu", bn, "relu");
+  } else {
+    return bn;
+  }
+}
+
+Symbol makeBlock(const std::string & name, Symbol data, int num_filter,
+                 bool dim_match, mx_float bn_momentum) {
+  Shape stride;
+  if (dim_match) {
+    stride = Shape(1, 1);
+  } else {
+    stride = Shape(2, 2);
+  }
+
+  Symbol conv1 = getConv(name + "_conv1", data, num_filter,
+                         Shape(3, 3), stride, Shape(1, 1),
+                         true, bn_momentum);
+
+  Symbol conv2 = getConv(name + "_conv2", conv1, num_filter,
+                         Shape(3, 3), Shape(1, 1), Shape(1, 1),
+                         false, bn_momentum);
+
+  Symbol shortcut;
+
+  if (dim_match) {
+    shortcut = data;
+  } else {
+    Symbol shortcut_w(name + "_proj_w");
+    shortcut = ConvolutionNoBias(name + "_proj", data, shortcut_w,
+                                 Shape(2, 2), num_filter,
+                                 Shape(2, 2), Shape(1, 1), Shape(0, 0),
+                                 1, 512);
+  }
+
+  Symbol fused = shortcut + conv2;
+  return Activation(name + "_relu", fused, "relu");
+}
+
+Symbol getBody(Symbol data, int num_level, int num_block, int num_filter, mx_float bn_momentum) {
+  for (int level = 0; level < num_level; level++) {
+    for (int block = 0; block < num_block; block++) {
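+      // dim_match is (level == 0 || block > 0): the first block of every
+      // level after the first runs at stride 2 with a strided projection
+      // convolution on the shortcut, halving the spatial size while the
+      // filter count doubles; every other block keeps the shape and uses
+      // an identity shortcut.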
+      data = makeBlock("level" + std::to_string(level + 1) + "_block" + std::to_string(block + 1),
+                       data, num_filter * (std::pow(2, level)),
+                       (level == 0 || block > 0), bn_momentum);
+    }
+  }
+  return data;
+}
+
+Symbol ResNetSymbol(int num_class, int num_level = 3, int num_block = 9,
+                    int num_filter = 16, mx_float bn_momentum = 0.9,
+                    mxnet::cpp::Shape pool_kernel = mxnet::cpp::Shape(8, 8)) {
+  // data and label
+  Symbol data = Symbol::Variable("data");
+  Symbol data_label = Symbol::Variable("data_label");
+
+  Symbol zscore = BatchNorm("zscore", data, 0.001, bn_momentum);
+
+  Symbol conv = getConv("conv0", zscore, num_filter,
+                        Shape(3, 3), Shape(1, 1), Shape(1, 1),
+                        true, bn_momentum);
+
+  Symbol body = getBody(conv, num_level, num_block, num_filter, bn_momentum);
+
+  Symbol pool = Pooling("pool", body, pool_kernel, PoolingPoolType::avg);
+
+  Symbol flat = Flatten("flatten", pool);
+
+  Symbol fc_w("fc_w"), fc_b("fc_b");
+  Symbol fc = FullyConnected("fc", flat, fc_w, fc_b, num_class);
+
+  return SoftmaxOutput("softmax", fc, data_label);
+}
+
+int main(int argc, char const *argv[]) {
+  int batch_size = 50;
+  int max_epoch = 100;
+  float learning_rate = 1e-4;
+  float weight_decay = 1e-4;
+
+  auto resnet = ResNetSymbol(10);
+  std::map<std::string, NDArray> args_map;
+  std::map<std::string, NDArray> aux_map;
+
+  args_map["data"] = NDArray(Shape(batch_size, 3, 256, 256), Context::gpu());
+  args_map["data_label"] = NDArray(Shape(batch_size), Context::gpu());
+  resnet.InferArgsMap(Context::gpu(), &args_map, args_map);
+
+  auto train_iter = MXDataIter("ImageRecordIter")
+      .SetParam("path_imglist", "./sf1_train.lst")
+      .SetParam("path_imgrec", "./sf1_train.rec")
+      .SetParam("data_shape", Shape(3, 256, 256))
+      .SetParam("batch_size", batch_size)
+      .SetParam("shuffle", 1)
+      .CreateDataIter();
+
+  auto val_iter = MXDataIter("ImageRecordIter")
+      .SetParam("path_imglist", "./sf1_val.lst")
+      .SetParam("path_imgrec", "./sf1_val.rec")
+      .SetParam("data_shape", Shape(3, 256, 256))
+      .SetParam("batch_size", batch_size)
+      .CreateDataIter();
+
+  Optimizer* opt = OptimizerRegistry::Find("ccsgd");
+  opt->SetParam("momentum", 0.9)
+      ->SetParam("rescale_grad", 1.0 / batch_size)
+      ->SetParam("clip_gradient", 10);
+
+  auto *exec = resnet.SimpleBind(Context::gpu(), args_map);
+
+  for (int iter = 0; iter < max_epoch; ++iter) {
+    LG << "Epoch: " << iter;
+    train_iter.Reset();
+    while (train_iter.Next()) {
+      auto data_batch = train_iter.GetDataBatch();
+      data_batch.data.CopyTo(&args_map["data"]);
+      data_batch.label.CopyTo(&args_map["data_label"]);
+      NDArray::WaitAll();
+
+      exec->Forward(true);
+      exec->Backward();
+      exec->UpdateAll(opt, learning_rate, weight_decay);
+      NDArray::WaitAll();
+    }
+
+    Accuracy acu;
+    val_iter.Reset();
+    while (val_iter.Next()) {
+      auto data_batch = val_iter.GetDataBatch();
+      data_batch.data.CopyTo(&args_map["data"]);
+      data_batch.label.CopyTo(&args_map["data_label"]);
+      NDArray::WaitAll();
+      exec->Forward(false);
+      NDArray::WaitAll();
+      acu.Update(data_batch.label, exec->outputs[0]);
+    }
+    LG << "Accuracy: " << acu.Get();
+  }
+  delete exec;
+  MXNotifyShutdown();
+  return 0;
+}
diff --git a/cpp-package/example/run_lenet_with_mxdataiter.sh b/cpp-package/example/run_lenet_with_mxdataiter.sh
new file mode 100755
index 000000000000..fffc355865bc
--- /dev/null
+++ b/cpp-package/example/run_lenet_with_mxdataiter.sh
@@ -0,0 +1,6 @@
+if [ ! -f "./mnist.zip" ]; then
+  wget http://webdocs.cs.ualberta.ca/~bx3/data/mnist.zip
+  unzip -u mnist.zip
+fi
+make lenet_with_mxdataiter
+LD_LIBRARY_PATH=../lib/linux ./lenet_with_mxdataiter
diff --git a/cpp-package/include/mxnet-cpp/CPPLINT.cfg b/cpp-package/include/mxnet-cpp/CPPLINT.cfg
new file mode 100644
index 000000000000..2f2b772b465b
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/CPPLINT.cfg
@@ -0,0 +1,2 @@
+filter=-runtime/references
+exclude_files=op.h
diff --git a/cpp-package/include/mxnet-cpp/MxNetCpp.h b/cpp-package/include/mxnet-cpp/MxNetCpp.h
new file mode 100644
index 000000000000..c4bf94a4e48c
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/MxNetCpp.h
@@ -0,0 +1,23 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file MxNetCpp.h
+ * \brief meta include file for mxnet.cpp
+ * \author Chuntao Hong, Zhang Chen
+ */
+
+#ifndef MXNETCPP_H_
+#define MXNETCPP_H_
+
+#include "mxnet-cpp/executor.hpp"
+#include "mxnet-cpp/symbol.hpp"
+#include "mxnet-cpp/ndarray.hpp"
+#include "mxnet-cpp/operator.hpp"
+#include "mxnet-cpp/optimizer.hpp"
+#include "mxnet-cpp/kvstore.hpp"
+#include "mxnet-cpp/op.h"
+#include "mxnet-cpp/op_suppl.h"
+#include "mxnet-cpp/io.hpp"
+#include "mxnet-cpp/metric.h"
+#include "mxnet-cpp/initializer.h"
+
+#endif  // MXNETCPP_H_
diff --git a/cpp-package/include/mxnet-cpp/base.h b/cpp-package/include/mxnet-cpp/base.h
new file mode 100644
index 000000000000..f17f898b01cc
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/base.h
@@ -0,0 +1,38 @@
+/*!
+* Copyright (c) 2016 by Contributors
+* \file base.h
+* \brief base definitions for mxnetcpp
+* \author Chuntao Hong, Zhang Chen
+*/
+
+#ifndef MXNETCPP_BASE_H
+#define MXNETCPP_BASE_H
+
+#include <cstdlib>
+#include "mxnet/c_api.h"
+#include "nnvm/c_api.h"
+
+namespace mxnet {
+namespace cpp {
+
+typedef unsigned index_t;
+
+enum OpReqType {
+  /*! \brief no operation, do not write anything */
+  kNullOp,
+  /*! \brief write gradient to provided space */
+  kWriteTo,
+  /*!
+   * \brief perform an inplace write:
+   * the target shares memory with one of the input arguments
+   */
+  kWriteInplace,
+  /*! \brief add to the provided space */
+  kAddTo
+};
+
+}  // namespace cpp
+}  // namespace mxnet
+
+#endif  // MXNETCPP_BASE_H
diff --git a/cpp-package/include/mxnet-cpp/executor.h b/cpp-package/include/mxnet-cpp/executor.h
new file mode 100644
index 000000000000..acf8c9e39d4b
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/executor.h
@@ -0,0 +1,137 @@
+/*!
+* Copyright (c) 2016 by Contributors
+* \file executor.h
+* \brief executor definition
+* \author Chuntao Hong, Zhang Chen
+*/
+
+#ifndef MXNETCPP_EXECUTOR_H
+#define MXNETCPP_EXECUTOR_H
+
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/base.h"
+#include "mxnet-cpp/symbol.h"
+
+namespace mxnet {
+namespace cpp {
+
+class Optimizer;
+
+/*!
+* \brief Executor interface
+*/
+class Executor {
+ public:
+  Executor(const Symbol &symbol, Context context,
+           const std::vector<NDArray> &arg_arrays,
+           const std::vector<NDArray> &grad_arrays,
+           const std::vector<OpReqType> &grad_reqs,
+           const std::vector<NDArray> &aux_arrays,
+           const std::map<std::string, Context> &group_to_ctx =
+               std::map<std::string, Context>(),
+           Executor *shared_exec = nullptr);
+  explicit Executor(const ExecutorHandle &h) { handle_ = h; }
+  /*!
+   * \brief Perform a Forward operation of Operator
+   * After this operation, user can get the result by using function head.
+   */
+  void Forward(bool is_train) {
+    MXExecutorForward(handle_, is_train ? 1 : 0);
+    mx_uint out_size;
+    NDArrayHandle *out_array;
+    CHECK_EQ(MXExecutorOutputs(handle_, &out_size, &out_array), 0);
+    outputs.resize(out_size);  // the handle constructor leaves outputs empty
+    for (mx_uint i = 0; i < out_size; ++i) {
+      outputs[i] = NDArray(out_array[i]);
+    }
+  }
+  /*!
+   * \brief Perform a Backward operation of the Operator.
+   * This must be called after Forward.
+   * After this operation, NDArrays specified by grad_in_args_store will be
+   * updated accordingly.
+   * User is allowed to pass in an empty Array if the head node is
+   * a loss function and the head gradient is not needed.
+   *
+   * \param head_grads the gradient of head nodes to be backproped.
+   */
+  void Backward(const std::vector<NDArray> &head_grads =
+                    std::vector<NDArray>()) {
+    std::vector<NDArrayHandle> head_grads_;
+    for (auto d : head_grads) {
+      head_grads_.push_back(d.GetHandle());
+    }
+    if (head_grads_.size() > 0) {
+      MXExecutorBackward(handle_, head_grads_.size(), head_grads_.data());
+    } else {
+      MXExecutorBackward(handle_, 0, nullptr);
+    }
+  }
+  // TODO(zhangchen-qinyinghua)
+  // To implement reshape function
+  void Reshape();
+  /*!
+   * \brief get the debug string of the executor
+   * \return the debug string
+   */
+  std::string DebugStr();
+  /*!
+   * \brief update the arguments with given learning rate and optimizer
+   * \param opt the pointer to the optimizer
+   * \param lr learning rate
+   * \param wd weight decay
+   * \param arg_update_begin begin index of the arguments to be updated, it
+   * starts after the input data by default
+   * \param arg_update_end end index of the arguments to be updated, it ends
+   * before the label data by default
+   */
+  void UpdateAll(Optimizer *opt, float lr, float wd, int arg_update_begin = 1,
+                 int arg_update_end = -1);
+  /*!
+   * \brief destructor, free the handle
+   */
+  ~Executor() { MXExecutorFree(handle_); }
+  std::vector<NDArray> arg_arrays;
+  std::vector<NDArray> grad_arrays;
+  std::vector<NDArray> aux_arrays;
+  /*!
+   * \brief arrays store the outputs of forward
+   */
+  std::vector<NDArray> outputs;
+  std::map<std::string, NDArray> arg_dict() {
+    return GetDict(symbol_.ListArguments(), arg_arrays);
+  }
+  std::map<std::string, NDArray> grad_dict() {
+    return GetDict(symbol_.ListArguments(), grad_arrays);
+  }
+  std::map<std::string, NDArray> aux_dict() {
+    return GetDict(symbol_.ListAuxiliaryStates(), aux_arrays);
+  }
+
+ private:
+  Executor(const Executor &e);
+  Executor &operator=(const Executor &e);
+  ExecutorHandle handle_;
+  Symbol symbol_;
+  std::map<std::string, NDArray> GetDict(const std::vector<std::string> &names,
+                                         const std::vector<NDArray> &arrays) {
+    std::map<std::string, NDArray> ret;
+    std::set<std::string> name_set;
+    for (const auto &s : names) {
+      CHECK(name_set.find(s) == name_set.end()) << "Duplicate names detected, "
+                                                << s;
+      name_set.insert(s);
+    }
+    CHECK_EQ(name_set.size(), arrays.size())
+        << "names size not equal to arrays size";
+    for (size_t i = 0; i < names.size(); ++i) {
+      ret[names[i]] = arrays[i];
+    }
+    return ret;
+  }
+};
+}  // namespace cpp
+}  // namespace mxnet
+#endif  // MXNETCPP_EXECUTOR_H
diff --git a/cpp-package/include/mxnet-cpp/executor.hpp b/cpp-package/include/mxnet-cpp/executor.hpp
new file mode 100644
index 000000000000..c642a96268dd
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/executor.hpp
@@ -0,0 +1,92 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file executor.hpp
+ * \brief implementation of the executor
+ * \author Zhang Chen, Chuntao Hong
+ */
+
+#ifndef MXNETCPP_EXECUTOR_HPP
+#define MXNETCPP_EXECUTOR_HPP
+
+#include <map>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/executor.h"
+#include "mxnet-cpp/optimizer.h"
+
+namespace mxnet {
+namespace cpp {
+Executor::Executor(const Symbol &symbol, Context context,
+                   const std::vector<NDArray> &arg_arrays,
+                   const std::vector<NDArray> &grad_arrays,
+                   const std::vector<OpReqType> &grad_reqs,
+                   const std::vector<NDArray> &aux_arrays,
+                   const std::map<std::string, Context> &group_to_ctx,
+                   Executor *shared_exec) {
+  this->arg_arrays = arg_arrays;
+  this->grad_arrays = grad_arrays;
+  this->aux_arrays = aux_arrays;
+  this->symbol_ = symbol;
+
+  std::vector<NDArrayHandle> arg_handles;
+  std::vector<NDArrayHandle> grad_handles;
+  std::vector<NDArrayHandle> aux_handles;
+
+  for (const auto &array : arg_arrays) {
+    arg_handles.push_back(array.GetHandle());
+  }
+  for (const auto &array : grad_arrays) {
+    grad_handles.push_back(array.GetHandle());
+  }
+  for (const auto &array : aux_arrays) {
+    aux_handles.push_back(array.GetHandle());
+  }
+
+  std::vector<mx_uint> grad_reqs_uint;
+  for (auto s : grad_reqs) grad_reqs_uint.push_back(s);
+
+  std::vector<const char*> map_keys;
+  std::vector<int> dev_types, dev_ids;
+  for (const auto &s : group_to_ctx) {
+    map_keys.push_back(s.first.c_str());
+    dev_types.push_back(s.second.GetDeviceType());
+    dev_ids.push_back(s.second.GetDeviceId());
+  }
+
+  ExecutorHandle *shared_exec_handle =
+      shared_exec == nullptr ? nullptr : &shared_exec->handle_;
+
+  CHECK_EQ(MXExecutorBindEX(symbol.GetHandle(), context.GetDeviceType(),
+                            context.GetDeviceId(), group_to_ctx.size(),
+                            map_keys.data(), dev_types.data(), dev_ids.data(),
+                            arg_handles.size(), arg_handles.data(),
+                            grad_handles.data(), grad_reqs_uint.data(),
+                            aux_handles.size(), aux_handles.data(),
+                            shared_exec_handle, &handle_),
+           0);
+
+  mx_uint out_size;
+  NDArrayHandle *out_array;
+  CHECK_EQ(MXExecutorOutputs(handle_, &out_size, &out_array), 0);
+  for (mx_uint i = 0; i < out_size; ++i) {
+    outputs.push_back(NDArray(out_array[i]));
+  }
+}
+
+std::string Executor::DebugStr() {
+  const char *output;
+  MXExecutorPrint(handle_, &output);
+  return std::string(output);
+}
+
+void Executor::UpdateAll(Optimizer *opt, float lr, float wd,
+                         int arg_update_begin, int arg_update_end) {
+  arg_update_end = arg_update_end < 0 ? arg_arrays.size() - 1 : arg_update_end;
+  for (int i = arg_update_begin; i < arg_update_end; ++i) {
+    opt->Update(i, arg_arrays[i], grad_arrays[i], lr, wd);
+  }
+}
+}  // namespace cpp
+}  // namespace mxnet
+
+#endif  // MXNETCPP_EXECUTOR_HPP
diff --git a/cpp-package/include/mxnet-cpp/initializer.h b/cpp-package/include/mxnet-cpp/initializer.h
new file mode 100644
index 000000000000..2fe43ff6e9f8
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/initializer.h
@@ -0,0 +1,130 @@
+/*!
+ * Copyright (c) 2016 by Contributors + * \file initializer.h + * \brief random initializer + * \author Zhang Chen + */ + +#ifndef MXNETCPP_INITIALIZER_H +#define MXNETCPP_INITIALIZER_H + +#include +#include +#include +#include "mxnet-cpp/ndarray.h" + +namespace mxnet { +namespace cpp { + +class Initializer { + public: + static bool StringStartWith(const std::string& name, + const std::string& check_str) { + return (name.size() >= check_str.size() && + name.substr(0, check_str.size()) == check_str); + } + static bool StringEndWith(const std::string& name, + const std::string& check_str) { + return (name.size() >= check_str.size() && + name.substr(name.size() - check_str.size(), check_str.size()) == + check_str); + } + virtual void operator()(const std::string& name, NDArray* arr) { + if (StringStartWith(name, "upsampling")) { + InitBilinear(arr); + } else if (StringEndWith(name, "bias")) { + InitBias(arr); + } else if (StringEndWith(name, "gamma")) { + InitGamma(arr); + } else if (StringEndWith(name, "beta")) { + InitBeta(arr); + } else if (StringEndWith(name, "weight")) { + InitWeight(arr); + } else if (StringEndWith(name, "moving_mean")) { + InitZero(arr); + } else if (StringEndWith(name, "moving_var")) { + InitOne(arr); + } else if (StringEndWith(name, "moving_inv_var")) { + InitZero(arr); + } else if (StringEndWith(name, "moving_avg")) { + InitZero(arr); + } else { + InitDefault(arr); + } + } + + protected: + virtual void InitBilinear(NDArray* arr) { + Shape shape(arr->GetShape()); + std::vector weight(shape.Size(), 0); + int f = std::ceil(shape[3] / 2.0); + float c = (2 * f - 1 - f % 2) / (2. * f); + for (size_t i = 0; i < shape.Size(); ++i) { + int x = i % shape[3]; + int y = (i / shape[3]) % shape[2]; + weight[i] = (1 - std::abs(x / f - c)) * (1 - std::abs(y / f - c)); + } + (*arr).SyncCopyFromCPU(weight); + } + virtual void InitZero(NDArray* arr) { (*arr) = 0.0f; } + virtual void InitOne(NDArray* arr) { (*arr) = 1.0f; } + virtual void InitBias(NDArray* arr) { (*arr) = 0.0f; } + virtual void InitGamma(NDArray* arr) { (*arr) = 1.0f; } + virtual void InitBeta(NDArray* arr) { (*arr) = 0.0f; } + virtual void InitWeight(NDArray* arr) {} + virtual void InitDefault(NDArray* arr) {} +}; + +class Xavier : public Initializer { + public: + enum RandType { + gaussian, + uniform + } rand_type; + enum FactorType { + avg, + in, + out + } factor_type; + float magnitude; + Xavier(RandType rand_type = gaussian, FactorType factor_type = avg, + float magnitude = 3) + : rand_type(rand_type), factor_type(factor_type), magnitude(magnitude) {} + + protected: + virtual void InitWeight(NDArray* arr) { + Shape shape(arr->GetShape()); + float hw_scale = 1.0f; + if (shape.ndim() > 2) { + for (size_t i = 2; i < shape.ndim(); ++i) { + hw_scale *= shape[i]; + } + } + float fan_in = shape[1] * hw_scale, fan_out = shape[0] * hw_scale; + float factor = 1.0f; + switch (factor_type) { + case avg: + factor = (fan_in + fan_out) / 2.0; + break; + case in: + factor = fan_in; + break; + case out: + factor = fan_out; + } + float scale = std::sqrt(magnitude / factor); + switch (rand_type) { + case uniform: + NDArray::SampleUniform(-scale, scale, arr); + break; + case gaussian: + NDArray::SampleGaussian(0, scale, arr); + break; + } + } +}; + +} // namespace cpp +} // namespace mxnet + +#endif /* end of include guard: MXNETCPP_INITIALIZER_H */ diff --git a/cpp-package/include/mxnet-cpp/io.h b/cpp-package/include/mxnet-cpp/io.h new file mode 100644 index 000000000000..41c02f249614 --- /dev/null +++ 
b/cpp-package/include/mxnet-cpp/io.h @@ -0,0 +1,128 @@ +/*! +* Copyright (c) 2016 by Contributors +* \file operator.h +* \brief definition of io, such as DataIter +* \author Zhang Chen +*/ +#ifndef MXNETCPP_IO_H +#define MXNETCPP_IO_H + +#include +#include +#include +#include +#include "mxnet-cpp/base.h" +#include "mxnet-cpp/ndarray.h" +#include "dmlc/logging.h" + +namespace mxnet { +namespace cpp { +/*! +* \brief Default object for holding a mini-batch of data and related +* information. +*/ +class DataBatch { + public: + NDArray data; + NDArray label; + int pad_num; + std::vector index; +}; +class DataIter { + public: + virtual void BeforeFirst(void) = 0; + virtual bool Next(void) = 0; + virtual NDArray GetData(void) = 0; + virtual NDArray GetLabel(void) = 0; + virtual int GetPadNum(void) = 0; + virtual std::vector GetIndex(void) = 0; + + DataBatch GetDataBatch() { + return DataBatch{GetData(), GetLabel(), GetPadNum(), GetIndex()}; + } + void Reset() { BeforeFirst(); } +}; + +class MXDataIterMap { + public: + inline MXDataIterMap() { + mx_uint num_data_iter_creators = 0; + DataIterCreator *data_iter_creators = nullptr; + int r = MXListDataIters(&num_data_iter_creators, &data_iter_creators); + CHECK_EQ(r, 0); + for (mx_uint i = 0; i < num_data_iter_creators; i++) { + const char *name; + const char *description; + mx_uint num_args; + const char **arg_names; + const char **arg_type_infos; + const char **arg_descriptions; + r = MXDataIterGetIterInfo(data_iter_creators[i], &name, &description, + &num_args, &arg_names, &arg_type_infos, + &arg_descriptions); + CHECK_EQ(r, 0); + mxdataiter_creators_[name] = data_iter_creators[i]; + } + } + inline DataIterCreator GetMXDataIterCreator(const std::string &name) { + return mxdataiter_creators_[name]; + } + + private: + std::map mxdataiter_creators_; +}; + +struct MXDataIterBlob { + public: + MXDataIterBlob() : handle_(nullptr) {} + explicit MXDataIterBlob(DataIterHandle handle) : handle_(handle) {} + ~MXDataIterBlob() { MXDataIterFree(handle_); } + DataIterHandle handle_; + + private: + MXDataIterBlob &operator=(const MXDataIterBlob &); +}; + +class MXDataIter : public DataIter { + public: + explicit MXDataIter(const std::string &mxdataiter_type); + MXDataIter(const MXDataIter &other) { + creator_ = other.creator_; + params_ = other.params_; + blob_ptr_ = other.blob_ptr_; + } + void BeforeFirst(); + bool Next(); + NDArray GetData(); + NDArray GetLabel(); + int GetPadNum(); + std::vector GetIndex(); + MXDataIter CreateDataIter(); + /*! + * \brief set config parameters + * \param name name of the config parameter + * \param value value of the config parameter + * \return reference of self + */ + template + MXDataIter &SetParam(const std::string &name, const T &value) { + std::string value_str; + std::stringstream ss; + ss << value; + ss >> value_str; + + params_[name] = value_str; + return *this; + } + + private: + DataIterCreator creator_; + std::map params_; + std::shared_ptr blob_ptr_; + static MXDataIterMap *mxdataiter_map_; +}; +} // namespace cpp +} // namespace mxnet + +#endif /* end of include guard: MXNETCPP_IO_H */ + diff --git a/cpp-package/include/mxnet-cpp/io.hpp b/cpp-package/include/mxnet-cpp/io.hpp new file mode 100644 index 000000000000..853a6bafb488 --- /dev/null +++ b/cpp-package/include/mxnet-cpp/io.hpp @@ -0,0 +1,87 @@ +/*! 
+* Copyright (c) 2016 by Contributors +* \file operator.hpp +* \brief implementation of data iter +* \author Zhang Chen +*/ +#ifndef MXNETCPP_IO_HPP +#define MXNETCPP_IO_HPP + +#include +#include +#include "mxnet-cpp/io.h" + +namespace mxnet { +namespace cpp { + +MXDataIterMap *MXDataIter::mxdataiter_map_ = new MXDataIterMap; + +MXDataIter::MXDataIter(const std::string &mxdataiter_type) { + creator_ = mxdataiter_map_->GetMXDataIterCreator(mxdataiter_type); + blob_ptr_ = std::make_shared(nullptr); +} + +void MXDataIter::BeforeFirst() { + int r = MXDataIterBeforeFirst(blob_ptr_->handle_); + CHECK_EQ(r, 0); +} + +bool MXDataIter::Next() { + int out; + int r = MXDataIterNext(blob_ptr_->handle_, &out); + CHECK_EQ(r, 0); + return out; +} + +NDArray MXDataIter::GetData() { + NDArrayHandle handle; + int r = MXDataIterGetData(blob_ptr_->handle_, &handle); + CHECK_EQ(r, 0); + return NDArray(handle); +} + +NDArray MXDataIter::GetLabel() { + NDArrayHandle handle; + int r = MXDataIterGetLabel(blob_ptr_->handle_, &handle); + CHECK_EQ(r, 0); + return NDArray(handle); +} + +int MXDataIter::GetPadNum() { + int out; + int r = MXDataIterGetPadNum(blob_ptr_->handle_, &out); + CHECK_EQ(r, 0); + return out; +} +std::vector MXDataIter::GetIndex() { + uint64_t *out_index, out_size; + int r = MXDataIterGetIndex(blob_ptr_->handle_, &out_index, &out_size); + CHECK_EQ(r, 0); + std::vector ret; + for (uint64_t i = 0; i < out_size; ++i) { + ret.push_back(out_index[i]); + } + return ret; +} + +MXDataIter MXDataIter::CreateDataIter() { + std::vector param_keys; + std::vector param_values; + + for (auto &data : params_) { + param_keys.push_back(data.first.c_str()); + param_values.push_back(data.second.c_str()); + } + + MXDataIterCreateIter(creator_, param_keys.size(), param_keys.data(), + param_values.data(), &blob_ptr_->handle_); + return *this; +} + +// MXDataIter MNIst + +} // namespace cpp +} // namespace mxnet + +#endif /* end of include guard: MXNETCPP_IO_HPP */ + diff --git a/cpp-package/include/mxnet-cpp/kvstore.h b/cpp-package/include/mxnet-cpp/kvstore.h new file mode 100644 index 000000000000..a7f8404bed8e --- /dev/null +++ b/cpp-package/include/mxnet-cpp/kvstore.h @@ -0,0 +1,49 @@ +/*! +* Copyright (c) 2016 by Contributors +* \file kvstore.h +* \brief definition of kvstore +* \author Chuntao Hong +*/ + +#ifndef MXNETCPP_KVSTORE_H +#define MXNETCPP_KVSTORE_H + +#include +#include +#include "mxnet-cpp/ndarray.h" + +namespace mxnet { +namespace cpp { + +class KVStore { + public: + explicit inline KVStore(const std::string& name = "local"); + KVStore(const KVStore &) = delete; + // VS 2013 doesn't support default move constructor. + KVStore(KVStore &&); + inline void RunServer(); + inline void Init(int key, const NDArray& val); + inline void Init(const std::vector& keys, const std::vector& vals); + inline void Push(int key, const NDArray& val, int priority = 0); + inline void Push(const std::vector& keys, + const std::vector& vals, int priority = 0); + inline void Pull(int key, NDArray* out, int priority = 0); + inline void Pull(const std::vector& keys, std::vector* outs, int priority = 0); + // TODO(lx): put lr in optimizer or not? 
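+  /*!
+  * \brief set the optimizer used for weight updates
+  * \param optimizer the optimizer to use
+  * \param local if true, the optimizer runs as a local updater on this
+  *        worker; otherwise it is serialized and sent to the server
+  *        processes (see the SetOptimizer implementation in kvstore.hpp)
+  */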
+ inline void SetOptimizer(std::unique_ptr optimizer, bool local = false); + inline std::string GetType() const; + inline int GetRank() const; + inline int GetNumWorkers() const; + inline void Barrier() const; + inline std::string GetRole() const; + ~KVStore() { MXKVStoreFree(handle_); } + + private: + KVStoreHandle handle_; + std::unique_ptr optimizer_; +}; + +} // namespace cpp +} // namespace mxnet + +#endif // MXNETCPP_KVSTORE_H diff --git a/cpp-package/include/mxnet-cpp/kvstore.hpp b/cpp-package/include/mxnet-cpp/kvstore.hpp new file mode 100644 index 000000000000..f4dd765d2f8b --- /dev/null +++ b/cpp-package/include/mxnet-cpp/kvstore.hpp @@ -0,0 +1,178 @@ +/*! + * Copyright (c) 2016 by Contributors + * \file kvstore.hpp + * \brief implementation of kvstore + * \author Xin Li + */ + +#include +#include +#include +#include +#include + +#include "mxnet-cpp/kvstore.h" +#include "mxnet-cpp/optimizer.h" + +#ifndef KVSTORE_HPP +#define KVSTORE_HPP + +namespace mxnet { +namespace cpp { + +namespace private_ { + KVStore *kvstore = nullptr; + + extern "C" + void controller(int head, const char* body, void * controller_handle) { + if (kvstore == nullptr) { + return; + } + if (head == 0) { + std::map params; + std::istringstream sin(body); + std::string line; + while (getline(sin, line)) { + size_t n = line.find('='); + params.emplace(line.substr(0, n), line.substr(n+1)); + } + std::unique_ptr opt(OptimizerRegistry::Find(params.at("opt_type"))); + params.erase("opt_type"); + for (const auto& pair : params) { + opt->SetParam(pair.first, pair.second); + } + kvstore->SetOptimizer(std::move(opt), true); + } + } +} // namespace private_ + +KVStore::KVStore(const std::string& name) { + CHECK_EQ(MXKVStoreCreate(name.c_str(), &handle_), 0); +} + +KVStore::KVStore(KVStore &&kv) { + optimizer_ = std::move(kv.optimizer_); + handle_ = kv.handle_; + kv.handle_ = nullptr; +} + +void KVStore::RunServer() { + CHECK_NE(GetRole(), "worker"); + private_::kvstore = this; + CHECK_EQ(MXKVStoreRunServer(handle_, &private_::controller, 0), 0); +} + +void KVStore::Init(int key, const NDArray& val) { + NDArrayHandle val_handle = val.GetHandle(); + CHECK_EQ(MXKVStoreInit(handle_, 1, &key, &val_handle), 0); +} + +void KVStore::Init(const std::vector& keys, const std::vector& vals) { + CHECK_EQ(keys.size(), vals.size()); + std::vector val_handles(vals.size()); + std::transform(vals.cbegin(), vals.cend(), val_handles.begin(), + [](const NDArray& val) { + return val.GetHandle(); + }); + + CHECK_EQ(MXKVStoreInit(handle_, keys.size(), keys.data(), + val_handles.data()), 0); +} + +void KVStore::Push(int key, const NDArray& val, int priority) { + NDArrayHandle val_handle = val.GetHandle(); + CHECK_EQ(MXKVStorePush(handle_, 1, &key, &val_handle, priority), 0); +} + +void KVStore::Push(const std::vector& keys, + const std::vector& vals, + int priority) { + CHECK_EQ(keys.size(), vals.size()); + std::vector val_handles(vals.size()); + std::transform(vals.cbegin(), vals.cend(), val_handles.begin(), + [](const NDArray& val) { + return val.GetHandle(); + }); + + CHECK_EQ(MXKVStorePush(handle_, keys.size(), keys.data(), + val_handles.data(), priority), 0); +} + +void KVStore::Pull(int key, NDArray* out, int priority) { + NDArrayHandle out_handle = out->GetHandle(); + CHECK_EQ(MXKVStorePull(handle_, 1, &key, &out_handle, priority), 0); +} + +void KVStore::Pull(const std::vector& keys, std::vector* outs, int priority) { + CHECK_EQ(keys.size(), outs->size()); + + std::vector out_handles(keys.size()); + std::transform(outs->cbegin(), 
outs->cend(), out_handles.begin(), + [](const NDArray& val) { + return val.GetHandle(); + }); + + CHECK_EQ(MXKVStorePull(handle_, keys.size(), keys.data(), + out_handles.data(), priority), 0); +} + +namespace private_ { + extern "C" + void updater(int key, NDArrayHandle recv, NDArrayHandle local, + void* handle_) { + Optimizer *opt = static_cast(handle_); + opt->Update(key, NDArray(local), NDArray(recv)); + } +} + +void KVStore::SetOptimizer(std::unique_ptr optimizer, bool local) { + if (local) { + optimizer_ = std::move(optimizer); + CHECK_EQ(MXKVStoreSetUpdater(handle_, &private_::updater, optimizer_.get()), 0); + } else { + CHECK_EQ(MXKVStoreSendCommmandToServers(handle_, 0, (*optimizer).Serialize().c_str()), 0); + } +} + +std::string KVStore::GetType() const { + const char *type; + CHECK_EQ(MXKVStoreGetType(handle_, &type), 0); + // type is managed by handle_, no need to free its memory. + return type; +} + +int KVStore::GetRank() const { + int rank; + CHECK_EQ(MXKVStoreGetRank(handle_, &rank), 0); + return rank; +} + +int KVStore::GetNumWorkers() const { + int num_workers; + CHECK_EQ(MXKVStoreGetGroupSize(handle_, &num_workers), 0); + return num_workers; +} + +void KVStore::Barrier() const { + CHECK_EQ(MXKVStoreBarrier(handle_), 0); +} + +std::string KVStore::GetRole() const { + int ret; + CHECK_EQ(MXKVStoreIsSchedulerNode(&ret), 0); + if (ret) { + return "scheduler"; + } + CHECK_EQ(MXKVStoreIsServerNode(&ret), 0); + if (ret) { + return "server"; + } + CHECK_EQ(MXKVStoreIsWorkerNode(&ret), 0); + CHECK(ret); + return "worker"; +} + +} // namespace cpp +} // namespace mxnet + +#endif // KVSTORE_HPP diff --git a/cpp-package/include/mxnet-cpp/metric.h b/cpp-package/include/mxnet-cpp/metric.h new file mode 100644 index 000000000000..71f7dc332187 --- /dev/null +++ b/cpp-package/include/mxnet-cpp/metric.h @@ -0,0 +1,91 @@ +/*! +* Copyright (c) 2016 by Contributors +* \file base.h +* \brief metrics defined +* \author Zhang Chen +*/ + +#ifndef MXNETCPP_METRIC_H +#define MXNETCPP_METRIC_H + +#include +#include +#include +#include +#include "mxnet-cpp/ndarray.h" +#include "dmlc/logging.h" + +namespace mxnet { +namespace cpp { + +class EvalMetric { + public: + explicit EvalMetric(const std::string& name, int num = 0) + : name(name), num(num) {} + virtual void Update(NDArray labels, NDArray preds) = 0; + void Reset() { + num_inst = 0; + sum_metric = 0.0f; + } + float Get() { return sum_metric / num_inst; } + void GetNameValue(); + + protected: + std::string name; + int num; + float sum_metric = 0.0f; + int num_inst = 0; + + static bool CheckLabelShapes(NDArray labels, NDArray preds, + Shape shape = Shape(0)) { + // TODO(zhangchen-qinyinghua) + // inplement this + return true; + } +}; + +class Accuracy : public EvalMetric { + public: + Accuracy() : EvalMetric("accuracy") {} + + void Update(NDArray labels, NDArray preds) { + CHECK_EQ(labels.GetShape().size(), 1); + mx_uint len = labels.GetShape()[0]; + std::vector pred_data(len); + std::vector label_data(len); + preds.ArgmaxChannel().SyncCopyToCPU(&pred_data, len); + labels.SyncCopyToCPU(&label_data, len); + NDArray::WaitAll(); + for (mx_uint i = 0; i < len; ++i) { + sum_metric += (pred_data[i] == label_data[i]) ? 
1 : 0; + num_inst += 1; + } + } +}; + +class LogLoss : public EvalMetric { + public: + LogLoss() : EvalMetric("logloss") {} + + void Update(NDArray labels, NDArray preds) { + static const float epsilon = 1e-15; + mx_uint len = labels.GetShape()[0]; + mx_uint m = preds.GetShape()[1]; + std::vector pred_data(len * m); + std::vector label_data(len); + preds.SyncCopyToCPU(&pred_data, pred_data.size()); + labels.SyncCopyToCPU(&label_data, len); + NDArray::WaitAll(); + for (mx_uint i = 0; i < len; ++i) { + sum_metric += + -std::log(std::max(pred_data[i * m + label_data[i]], epsilon)); + num_inst += 1; + } + } +}; + +} // namespace cpp +} // namespace mxnet + +#endif /* end of include guard: MXNETCPP_METRIC_H */ + diff --git a/cpp-package/include/mxnet-cpp/model.h b/cpp-package/include/mxnet-cpp/model.h new file mode 100644 index 000000000000..8444c9d3e7da --- /dev/null +++ b/cpp-package/include/mxnet-cpp/model.h @@ -0,0 +1,58 @@ +/*! +* Copyright (c) 2016 by Contributors +* \file model.h +* \brief MXNET.cpp model module +* \author Zhang Chen +*/ + +#ifndef MXNETCPP_MODEL_H +#define MXNETCPP_MODEL_H + +#include +#include +#include "mxnet-cpp/base.h" +#include "mxnet-cpp/symbol.h" +#include "mxnet-cpp/ndarray.h" + +namespace mxnet { +namespace cpp { + +struct FeedForwardConfig { + Symbol symbol; + std::vector ctx = {Context::cpu()}; + int num_epoch = 0; + int epoch_size = 0; + std::string optimizer = "sgd"; + // TODO(zhangchen-qinyinghua) More implement + // initializer=Uniform(0.01), + // numpy_batch_size=128, + // arg_params=None, aux_params=None, + // allow_extra_params=False, + // begin_epoch=0, + // **kwargs): + FeedForwardConfig(const FeedForwardConfig &other) {} + FeedForwardConfig() {} +}; +class FeedForward { + public: + explicit FeedForward(const FeedForwardConfig &conf) : conf_(conf) {} + void Predict(); + void Score(); + void Fit(); + void Save(); + void Load(); + static FeedForward Create(); + + private: + void InitParams(); + void InitPredictor(); + void InitIter(); + void InitEvalIter(); + FeedForwardConfig conf_; +}; + +} // namespace cpp +} // namespace mxnet + +#endif /* end of include guard: MXNETCPP_MODEL_H */ + diff --git a/cpp-package/include/mxnet-cpp/ndarray.h b/cpp-package/include/mxnet-cpp/ndarray.h new file mode 100644 index 000000000000..88ca80fd1110 --- /dev/null +++ b/cpp-package/include/mxnet-cpp/ndarray.h @@ -0,0 +1,407 @@ +/*! +* Copyright (c) 2016 by Contributors +* \file ndarray.h +* \brief definition of ndarray +* \author Chuntao Hong, Zhang Chen +*/ + +#ifndef MXNETCPP_NDARRAY_H +#define MXNETCPP_NDARRAY_H + +#include +#include +#include +#include +#include "mxnet-cpp/base.h" +#include "mxnet-cpp/shape.h" + +namespace mxnet { +namespace cpp { + +enum DeviceType { + kCPU = 1, + kGPU = 2, + kCPUPinned = 3 +}; + +/*! +* \brief Context interface +*/ +class Context { + public: + /*! + * \brief Context constructor + * \param type type of the device + * \param id id of the device + */ + Context(const DeviceType &type, int id) : type_(type), id_(id) {} + /*! + * \return the type of the device + */ + DeviceType GetDeviceType() const { return type_; } + /*! + * \return the id of the device + */ + int GetDeviceId() const { return id_; } + + /*! + * \brief Return a GPU context + * \param device_id id of the device + * \return the corresponding GPU context + */ + static Context gpu(int device_id = 0) { + return Context(DeviceType::kGPU, device_id); + } + + /*! + * \brief Return a CPU context + * \param device_id id of the device. 
diff --git a/cpp-package/include/mxnet-cpp/model.h b/cpp-package/include/mxnet-cpp/model.h
new file mode 100644
index 000000000000..8444c9d3e7da
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/model.h
@@ -0,0 +1,58 @@
+/*!
+* Copyright (c) 2016 by Contributors
+* \file model.h
+* \brief MXNET.cpp model module
+* \author Zhang Chen
+*/
+
+#ifndef MXNETCPP_MODEL_H
+#define MXNETCPP_MODEL_H
+
+#include <string>
+#include <vector>
+#include "mxnet-cpp/base.h"
+#include "mxnet-cpp/symbol.h"
+#include "mxnet-cpp/ndarray.h"
+
+namespace mxnet {
+namespace cpp {
+
+struct FeedForwardConfig {
+  Symbol symbol;
+  std::vector<Context> ctx = {Context::cpu()};
+  int num_epoch = 0;
+  int epoch_size = 0;
+  std::string optimizer = "sgd";
+  // TODO(zhangchen-qinyinghua) More to implement:
+  // initializer=Uniform(0.01),
+  // numpy_batch_size=128,
+  // arg_params=None, aux_params=None,
+  // allow_extra_params=False,
+  // begin_epoch=0,
+  // **kwargs):
+  FeedForwardConfig(const FeedForwardConfig &other)
+      : symbol(other.symbol), ctx(other.ctx), num_epoch(other.num_epoch),
+        epoch_size(other.epoch_size), optimizer(other.optimizer) {}
+  FeedForwardConfig() {}
+};
+class FeedForward {
+ public:
+  explicit FeedForward(const FeedForwardConfig &conf) : conf_(conf) {}
+  void Predict();
+  void Score();
+  void Fit();
+  void Save();
+  void Load();
+  static FeedForward Create();
+
+ private:
+  void InitParams();
+  void InitPredictor();
+  void InitIter();
+  void InitEvalIter();
+  FeedForwardConfig conf_;
+};
+
+}  // namespace cpp
+}  // namespace mxnet
+
+#endif /* end of include guard: MXNETCPP_MODEL_H */
+
diff --git a/cpp-package/include/mxnet-cpp/ndarray.h b/cpp-package/include/mxnet-cpp/ndarray.h
new file mode 100644
index 000000000000..88ca80fd1110
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/ndarray.h
@@ -0,0 +1,407 @@
+/*!
+* Copyright (c) 2016 by Contributors
+* \file ndarray.h
+* \brief definition of ndarray
+* \author Chuntao Hong, Zhang Chen
+*/
+
+#ifndef MXNETCPP_NDARRAY_H
+#define MXNETCPP_NDARRAY_H
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/base.h"
+#include "mxnet-cpp/shape.h"
+
+namespace mxnet {
+namespace cpp {
+
+enum DeviceType {
+  kCPU = 1,
+  kGPU = 2,
+  kCPUPinned = 3
+};
+
+/*!
+* \brief Context interface
+*/
+class Context {
+ public:
+  /*!
+  * \brief Context constructor
+  * \param type type of the device
+  * \param id id of the device
+  */
+  Context(const DeviceType &type, int id) : type_(type), id_(id) {}
+  /*!
+  * \return the type of the device
+  */
+  DeviceType GetDeviceType() const { return type_; }
+  /*!
+  * \return the id of the device
+  */
+  int GetDeviceId() const { return id_; }
+
+  /*!
+  * \brief Return a GPU context
+  * \param device_id id of the device
+  * \return the corresponding GPU context
+  */
+  static Context gpu(int device_id = 0) {
+    return Context(DeviceType::kGPU, device_id);
+  }
+
+  /*!
+  * \brief Return a CPU context
+  * \param device_id id of the device; not needed for CPU
+  * \return the corresponding CPU context
+  */
+  static Context cpu(int device_id = 0) {
+    return Context(DeviceType::kCPU, device_id);
+  }
+
+ private:
+  DeviceType type_;
+  int id_;
+};
+
+/*!
+* \brief struct to store NDArrayHandle
+*/
+struct NDBlob {
+ public:
+  /*!
+  * \brief default constructor
+  */
+  NDBlob() : handle_(nullptr) {}
+  /*!
+  * \brief construct with a NDArrayHandle
+  * \param handle NDArrayHandle to store
+  */
+  explicit NDBlob(NDArrayHandle handle) : handle_(handle) {}
+  /*!
+  * \brief destructor, free the NDArrayHandle
+  */
+  ~NDBlob() { MXNDArrayFree(handle_); }
+  /*!
+  * \brief the NDArrayHandle
+  */
+  NDArrayHandle handle_;
+
+ private:
+  NDBlob(const NDBlob &);
+  NDBlob &operator=(const NDBlob &);
+};
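NDBlob's disabled copy constructor is what makes the NDArray class below safe to copy: every NDArray holds its blob through a std::shared_ptr, so copies alias one handle. A small sketch of the resulting semantics, assuming the declarations in this header:

    // Sketch: NDArray copies share one NDBlob, so MXNDArrayFree runs
    // exactly once, when the last copy goes out of scope.
    #include "mxnet-cpp/MxNetCpp.h"
    using namespace mxnet::cpp;

    void OwnershipDemo() {
      NDArray a(Shape(2, 2), Context::cpu(), false);  // allocate eagerly
      NDArray b = a;   // shallow copy: same NDBlob, same device memory
      b += 1.0f;       // mutates the shared storage; a observes it too
    }                  // one MXNDArrayFree here, via ~NDBlob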
+
+/*!
+* \brief NDArray interface
+*/
+class NDArray {
+ public:
+  /*!
+  * \brief construct with an empty (None) handle
+  */
+  NDArray();
+  /*!
+  * \brief construct with a NDArrayHandle
+  */
+  explicit NDArray(const NDArrayHandle &handle);
+  /*!
+  * \brief construct a new dynamic NDArray
+  * \param shape the shape of array
+  * \param context context of NDArray
+  * \param delay_alloc whether to delay the allocation
+  */
+  NDArray(const std::vector<mx_uint> &shape, const Context &context,
+          bool delay_alloc = true);
+  /*!
+  * \brief construct a new dynamic NDArray
+  * \param shape the shape of array
+  * \param context context of NDArray
+  * \param delay_alloc whether to delay the allocation
+  */
+  NDArray(const Shape &shape, const Context &context, bool delay_alloc = true);
+  NDArray(const mx_float *data, size_t size);
+  /*!
+  * \brief construct a new dynamic NDArray
+  * \param data the data to create NDArray from
+  * \param shape the shape of array
+  * \param context context of NDArray
+  */
+  NDArray(const mx_float *data, const Shape &shape, const Context &context);
+  /*!
+  * \brief construct a new dynamic NDArray
+  * \param data the data to create NDArray from
+  * \param shape the shape of array
+  * \param context context of NDArray
+  */
+  NDArray(const std::vector<mx_float> &data, const Shape &shape,
+          const Context &context);
+  explicit NDArray(const std::vector<mx_float> &data);
+  NDArray operator+(mx_float scalar);
+  NDArray operator-(mx_float scalar);
+  NDArray operator*(mx_float scalar);
+  NDArray operator/(mx_float scalar);
+  NDArray operator+(const NDArray &);
+  NDArray operator-(const NDArray &);
+  NDArray operator*(const NDArray &);
+  NDArray operator/(const NDArray &);
+  /*!
+  * \brief set all the elements in ndarray to be scalar
+  * \param scalar the scalar to set
+  * \return reference of self
+  */
+  NDArray &operator=(mx_float scalar);
+  /*!
+  * \brief elementwise add to current space
+  *  this mutates the current NDArray
+  * \param scalar the data to add
+  * \return reference of self
+  */
+  NDArray &operator+=(mx_float scalar);
+  /*!
+  * \brief elementwise subtract from current ndarray
+  *  this mutates the current NDArray
+  * \param scalar the data to subtract
+  * \return reference of self
+  */
+  NDArray &operator-=(mx_float scalar);
+  /*!
+  * \brief elementwise multiplication of current ndarray
+  *  this mutates the current NDArray
+  * \param scalar the data to multiply by
+  * \return reference of self
+  */
+  NDArray &operator*=(mx_float scalar);
+  /*!
+  * \brief elementwise division of current ndarray
+  *  this mutates the current NDArray
+  * \param scalar the data to divide by
+  * \return reference of self
+  */
+  NDArray &operator/=(mx_float scalar);
+  /*!
+  * \brief elementwise add to current space
+  *  this mutates the current NDArray
+  * \param src the data to add
+  * \return reference of self
+  */
+  NDArray &operator+=(const NDArray &src);
+  /*!
+  * \brief elementwise subtract from current ndarray
+  *  this mutates the current NDArray
+  * \param src the data to subtract
+  * \return reference of self
+  */
+  NDArray &operator-=(const NDArray &src);
+  /*!
+  * \brief elementwise multiplication of current ndarray
+  *  this mutates the current NDArray
+  * \param src the data to multiply by
+  * \return reference of self
+  */
+  NDArray &operator*=(const NDArray &src);
+  /*!
+  * \brief elementwise division of current ndarray
+  *  this mutates the current NDArray
+  * \param src the data to divide by
+  * \return reference of self
+  */
+  NDArray &operator/=(const NDArray &src);
+  NDArray ArgmaxChannel();
+  /*!
+  * \brief Do a synchronous copy from a contiguous CPU memory region.
+  *
+  *  This function will call WaitToWrite before the copy is performed.
+  *  This is useful to copy data from an existing memory region that is
+  *  not wrapped by an NDArray (thus the dependency is not tracked).
+  *
+  * \param data the data source to copy from.
+  * \param size the memory size we want to copy from.
+  */
+  void SyncCopyFromCPU(const mx_float *data, size_t size);
+  /*!
+  * \brief Do a synchronous copy from a contiguous CPU memory region.
+  *
+  *  This function will call WaitToWrite before the copy is performed.
+  *  This is useful to copy data from an existing memory region that is
+  *  not wrapped by an NDArray (thus the dependency is not tracked).
+  *
+  * \param data the data source to copy from, in the form of an mx_float vector
+  */
+  void SyncCopyFromCPU(const std::vector<mx_float> &data);
+  /*!
+  * \brief Do a synchronous copy to a contiguous CPU memory region.
+  *
+  *  This function will call WaitToRead before the copy is performed.
+  *  This is useful to copy data into an existing memory region that is
+  *  not wrapped by an NDArray (thus the dependency is not tracked).
+  *
+  * \param data the destination to copy into.
+  * \param size the memory size we want to copy. Default value is Size()
+  */
+  void SyncCopyToCPU(mx_float *data, size_t size = 0);
+  /*!
+  * \brief Do a synchronous copy to a contiguous CPU memory region.
+  *
+  *  This function will call WaitToRead before the copy is performed.
+  *  This is useful to copy data into an existing memory region that is
+  *  not wrapped by an NDArray (thus the dependency is not tracked).
+  *
+  * \param data the destination vector to copy into.
+  * \param size the memory size we want to copy. Default value is Size()
+  */
+  void SyncCopyToCPU(std::vector<mx_float> *data, size_t size = 0);
+  /*!
+  * \brief Copy the content of current array to the target NDArray.
+  * \param other the target NDArray
+  * \return the target NDArray
+  */
+  NDArray CopyTo(NDArray * other) const;
+  /*!
+  * \brief return a new copy of this NDArray in the given context
+  * \param ctx the context of the new copy
+  * \return the new copy
+  */
+  NDArray Copy(const Context &) const;
+  /*!
+  * \brief return offset of the element at (h, w)
+  * \param h height position
+  * \param w width position
+  * \return offset into the two-dimensional array
+  */
+  size_t Offset(size_t h = 0, size_t w = 0) const;
+  /*!
+  * \brief return offset into the three-dimensional array
+  * \param c channel position
+  * \param h height position
+  * \param w width position
+  * \return offset into the three-dimensional array
+  */
+  size_t Offset(size_t c, size_t h, size_t w) const;
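The synchronous copy helpers above are the main bridge between host buffers and NDArray storage; a minimal round trip (CPU context used for brevity) looks like this:

    // Sketch: host round trip with SyncCopyFromCPU / SyncCopyToCPU.
    #include <vector>
    #include "mxnet-cpp/MxNetCpp.h"
    using namespace mxnet::cpp;

    void RoundTrip() {
      std::vector<mx_float> src = {1, 2, 3, 4};
      NDArray arr(Shape(2, 2), Context::cpu(), false);
      arr.SyncCopyFromCPU(src);   // blocks until it is safe to write
      std::vector<mx_float> dst;
      arr.SyncCopyToCPU(&dst);    // size 0 means "copy Size() elements"
      NDArray::WaitAll();         // drain any remaining engine work
    }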
+  /*!
+  * \brief return the value of the element at (h, w)
+  * \param h height position
+  * \param w width position
+  * \return value at (h, w) of the two-dimensional array
+  */
+  mx_float At(size_t h, size_t w) const;
+  /*!
+  * \brief return the value of the element at (c, h, w)
+  * \param c channel position
+  * \param h height position
+  * \param w width position
+  * \return value at (c, h, w) of the three-dimensional array
+  */
+  mx_float At(size_t c, size_t h, size_t w) const;
+  /*!
+  * \brief Slice an NDArray
+  * \param begin begin index in first dim
+  * \param end end index in first dim
+  * \return sliced NDArray
+  */
+  NDArray Slice(mx_uint begin, mx_uint end) const;
+  /*!
+  * \brief Return a reshaped NDArray that shares memory with the current one
+  * \param new_shape the new shape
+  * \return reshaped NDarray
+  */
+  NDArray Reshape(const Shape &new_shape) const;
+  /*!
+  * \brief Block until all the pending write operations with respect
+  *  to current NDArray are finished, and read can be performed.
+  */
+  void WaitToRead() const;
+  /*!
+  * \brief Block until all the pending read/write operations with respect
+  *  to current NDArray are finished, and write can be performed.
+  */
+  void WaitToWrite();
+  /*!
+  * \brief Block until all the pending read/write operations with respect
+  *  to current NDArray are finished, and read/write can be performed.
+  */
+  static void WaitAll();
+  /*!
+  * \brief Sample a Gaussian distribution for each element of out.
+  * \param mu mean of the Gaussian distribution.
+  * \param sigma standard deviation of the Gaussian distribution.
+  * \param out output NDArray.
+  */
+  static void SampleGaussian(mx_float mu, mx_float sigma, NDArray *out);
+  /*!
+  * \brief Sample a uniform distribution for each element of out.
+  * \param begin lower bound of the distribution.
+  * \param end upper bound of the distribution.
+  * \param out output NDArray.
+  */
+  static void SampleUniform(mx_float begin, mx_float end, NDArray *out);
+  /*!
+  * \brief Load NDArrays from a binary file.
+  * \param file_name name of the binary file.
+  * \param array_list a list of NDArrays returned; the list is not filled if
+  *  nullptr is given.
+  * \param array_map a map from names to NDArrays returned; the map is not
+  *  filled if nullptr is given or no names are stored in the binary file.
+  */
+  static void Load(const std::string &file_name,
+                   std::vector<NDArray> *array_list = nullptr,
+                   std::map<std::string, NDArray> *array_map = nullptr);
+  /*!
+  * \brief Load a map of NDArrays from a binary file.
+  * \param file_name name of the binary file.
+  * \return a map from names to NDArrays.
+  */
+  static std::map<std::string, NDArray> LoadToMap(const std::string &file_name);
+  /*!
+  * \brief Load a list of NDArrays from a binary file.
+  * \param file_name name of the binary file.
+  * \return a list of NDArrays.
+  */
+  static std::vector<NDArray> LoadToList(const std::string &file_name);
+  /*!
+  * \brief save a map of string->NDArray to a binary file.
+  * \param file_name name of the binary file.
+  * \param array_map a map from names to NDArrays.
+  */
+  static void Save(const std::string &file_name,
+                   const std::map<std::string, NDArray> &array_map);
+  /*!
+  * \brief save a list of NDArrays to a binary file.
+  * \param file_name name of the binary file.
+  * \param array_list a list of NDArrays.
+  */
+  static void Save(const std::string &file_name,
+                   const std::vector<NDArray> &array_list);
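The static Save/Load declarations above give a simple checkpoint mechanism; a hedged sketch (the file name is illustrative):

    // Sketch: persisting named parameters and reading them back.
    #include <map>
    #include <string>
    #include "mxnet-cpp/MxNetCpp.h"
    using namespace mxnet::cpp;

    void Checkpoint(const NDArray &w, const NDArray &b) {
      std::map<std::string, NDArray> params = {{"w", w}, {"b", b}};
      NDArray::Save("params.bin", params);
      auto restored = NDArray::LoadToMap("params.bin");  // keys: "w", "b"
    }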
+  /*!
+  * \return the size of current NDArray, i.e. the product of all shape dims
+  */
+  size_t Size() const;
+  /*!
+  * \return the shape of current NDArray, in the form of an mx_uint vector
+  */
+  std::vector<mx_uint> GetShape() const;
+  /*!
+  * \return the data pointer to the current NDArray
+  */
+  const mx_float *GetData() const;
+
+  /*!
+  * \return the context of NDArray
+  */
+  Context GetContext() const;
+
+  /*!
+  * \return the NDArrayHandle of the current NDArray
+  */
+  NDArrayHandle GetHandle() const { return blob_ptr_->handle_; }
+
+ private:
+  std::shared_ptr<NDBlob> blob_ptr_;
+};
+}  // namespace cpp
+}  // namespace mxnet
+
+#endif  // MXNETCPP_NDARRAY_H
diff --git a/cpp-package/include/mxnet-cpp/ndarray.hpp b/cpp-package/include/mxnet-cpp/ndarray.hpp
new file mode 100644
index 000000000000..f7b5c3233205
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/ndarray.hpp
@@ -0,0 +1,331 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file ndarray.hpp
+ * \brief implementation of the ndarray
+ * \author Zhang Chen, Chuntao Hong
+ */
+
+#ifndef MXNETCPP_NDARRAY_HPP
+#define MXNETCPP_NDARRAY_HPP
+
+#include <algorithm>
+#include <map>
+#include <string>
+#include <vector>
+#include "dmlc/logging.h"
+#include "mxnet-cpp/ndarray.h"
+#include "mxnet-cpp/operator.h"  // for Operator, used by the overloads below
+
+namespace mxnet {
+namespace cpp {
+
+NDArray::NDArray() {
+  NDArrayHandle handle;
+  CHECK_EQ(MXNDArrayCreateNone(&handle), 0);
+  blob_ptr_ = std::make_shared<NDBlob>(handle);
+}
+NDArray::NDArray(const NDArrayHandle &handle) {
+  blob_ptr_ = std::make_shared<NDBlob>(handle);
+}
+NDArray::NDArray(const std::vector<mx_uint> &shape, const Context &context,
+                 bool delay_alloc) {
+  NDArrayHandle handle;
+  CHECK_EQ(MXNDArrayCreate(shape.data(), shape.size(), context.GetDeviceType(),
+                           context.GetDeviceId(), delay_alloc, &handle),
+           0);
+  blob_ptr_ = std::make_shared<NDBlob>(handle);
+}
+NDArray::NDArray(const Shape &shape, const Context &context, bool delay_alloc) {
+  NDArrayHandle handle;
+  CHECK_EQ(MXNDArrayCreate(shape.data(), shape.ndim(), context.GetDeviceType(),
+                           context.GetDeviceId(), delay_alloc, &handle),
+           0);
+  blob_ptr_ = std::make_shared<NDBlob>(handle);
+}
+NDArray::NDArray(const mx_float *data, size_t size) {
+  NDArrayHandle handle;
+  CHECK_EQ(MXNDArrayCreateNone(&handle), 0);
+  MXNDArraySyncCopyFromCPU(handle, data, size);
+  blob_ptr_ = std::make_shared<NDBlob>(handle);
+}
+NDArray::NDArray(const mx_float *data, const Shape &shape,
+                 const Context &context) {
+  NDArrayHandle handle;
+  CHECK_EQ(MXNDArrayCreate(shape.data(), shape.ndim(), context.GetDeviceType(),
+                           context.GetDeviceId(), false, &handle),
+           0);
+  MXNDArraySyncCopyFromCPU(handle, data, shape.Size());
+  blob_ptr_ = std::make_shared<NDBlob>(handle);
+}
+NDArray::NDArray(const std::vector<mx_float> &data, const Shape &shape,
+                 const Context &context) {
+  NDArrayHandle handle;
+  CHECK_EQ(MXNDArrayCreate(shape.data(), shape.ndim(), context.GetDeviceType(),
+                           context.GetDeviceId(), false, &handle),
+           0);
+  MXNDArraySyncCopyFromCPU(handle, data.data(), shape.Size());
+  blob_ptr_ = std::make_shared<NDBlob>(handle);
+}
+NDArray::NDArray(const std::vector<mx_float> &data) {
+  NDArrayHandle handle;
+  CHECK_EQ(MXNDArrayCreateNone(&handle), 0);
+  MXNDArraySyncCopyFromCPU(handle, data.data(), data.size());
+  blob_ptr_ = std::make_shared<NDBlob>(handle);
+}
+
+NDArray NDArray::operator+(mx_float scalar) {
+  NDArray ret;
+  Operator("_plus_scalar")(*this, scalar).Invoke(ret);
+  return ret;
+}
+NDArray NDArray::operator-(mx_float scalar) {
+  NDArray ret;
+  Operator("_minus_scalar")(*this, scalar).Invoke(ret);
+  return ret;
+}
+NDArray NDArray::operator*(mx_float scalar) {
+  NDArray ret;
+  Operator("_mul_scalar")(*this, scalar).Invoke(ret);
+  return ret;
+}
+NDArray NDArray::operator/(mx_float scalar) {
+  NDArray ret;
+  Operator("_div_scalar")(*this, scalar).Invoke(ret);
+  return ret;
+}
+NDArray NDArray::operator+(const NDArray &rhs) {
+  NDArray ret;
+  Operator("_plus")(*this,
rhs).Invoke(ret); + return ret; +} +NDArray NDArray::operator-(const NDArray &rhs) { + NDArray ret; + Operator("_minus")(*this, rhs).Invoke(ret); + return ret; +} +NDArray NDArray::operator*(const NDArray &rhs) { + NDArray ret; + Operator("_mul")(*this, rhs).Invoke(ret); + return ret; +} +NDArray NDArray::operator/(const NDArray &rhs) { + NDArray ret; + Operator("_div")(*this, rhs).Invoke(ret); + return ret; +} +NDArray &NDArray::operator=(mx_float scalar) { + Operator("_set_value")(scalar).Invoke(*this); + return *this; +} +NDArray &NDArray::operator+=(mx_float scalar) { + Operator("_plus_scalar")(*this, scalar).Invoke(*this); + return *this; +} +NDArray &NDArray::operator-=(mx_float scalar) { + Operator("_minus_scalar")(*this, scalar).Invoke(*this); + return *this; +} +NDArray &NDArray::operator*=(mx_float scalar) { + Operator("_mul_scalar")(*this, scalar).Invoke(*this); + return *this; +} +NDArray &NDArray::operator/=(mx_float scalar) { + Operator("_div_scalar")(*this, scalar).Invoke(*this); + return *this; +} +NDArray &NDArray::operator+=(const NDArray &rhs) { + Operator("_plus")(*this, rhs).Invoke(*this); + return *this; +} +NDArray &NDArray::operator-=(const NDArray &rhs) { + Operator("_minus")(*this, rhs).Invoke(*this); + return *this; +} +NDArray &NDArray::operator*=(const NDArray &rhs) { + Operator("_mul")(*this, rhs).Invoke(*this); + return *this; +} +NDArray &NDArray::operator/=(const NDArray &rhs) { + Operator("_div")(*this, rhs).Invoke(*this); + return *this; +} + +NDArray NDArray::ArgmaxChannel() { + NDArray ret; + Operator("argmax_channel")(*this).Invoke(ret); + return ret; +} + +void NDArray::SyncCopyFromCPU(const mx_float *data, size_t size) { + MXNDArraySyncCopyFromCPU(blob_ptr_->handle_, data, size); +} +void NDArray::SyncCopyFromCPU(const std::vector &data) { + MXNDArraySyncCopyFromCPU(blob_ptr_->handle_, data.data(), data.size()); +} +void NDArray::SyncCopyToCPU(mx_float *data, size_t size) { + MXNDArraySyncCopyToCPU(blob_ptr_->handle_, data, size > 0 ? size : Size()); +} +void NDArray::SyncCopyToCPU(std::vector *data, size_t size) { + size = size > 0 ? 
size : Size();
+  data->resize(size);
+  MXNDArraySyncCopyToCPU(blob_ptr_->handle_, data->data(), size);
+}
+NDArray NDArray::Copy(const Context &ctx) const {
+  NDArray ret(GetShape(), ctx);
+  Operator("_copyto")(*this).Invoke(ret);
+  return ret;
+}
+NDArray NDArray::CopyTo(NDArray * other) const {
+  Operator("_copyto")(*this).Invoke(*other);
+  return *other;
+}
+NDArray NDArray::Slice(mx_uint begin, mx_uint end) const {
+  NDArrayHandle handle;
+  CHECK_EQ(MXNDArraySlice(GetHandle(), begin, end, &handle), 0);
+  return NDArray(handle);
+}
+NDArray NDArray::Reshape(const Shape &new_shape) const {
+  NDArrayHandle handle;
+  std::vector<int> dims(new_shape.ndim());
+  for (index_t i = 0; i < new_shape.ndim(); ++i) {
+    dims[i] = new_shape[i];
+  }
+  CHECK_EQ(
+      MXNDArrayReshape(GetHandle(), new_shape.ndim(), dims.data(), &handle), 0);
+  return NDArray(handle);
+}
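Slice and Reshape above hand back new handles onto the same underlying storage, so neither call copies elements. A short sketch of the view semantics:

    // Sketch: zero-copy views through Slice and Reshape.
    #include <vector>
    #include "mxnet-cpp/MxNetCpp.h"
    using namespace mxnet::cpp;

    void Views() {
      std::vector<mx_float> v = {1, 2, 3, 4, 5, 6, 7, 8};
      NDArray x(v, Shape(4, 2), Context::cpu());
      NDArray head = x.Slice(0, 2);        // rows [0, 2) of x
      NDArray flat = x.Reshape(Shape(8));  // rank-1 view of the same 8 values
    }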
+void NDArray::WaitToRead() const {
+  CHECK_EQ(MXNDArrayWaitToRead(blob_ptr_->handle_), 0);
+}
+void NDArray::WaitToWrite() {
+  CHECK_EQ(MXNDArrayWaitToWrite(blob_ptr_->handle_), 0);
+}
+void NDArray::WaitAll() { CHECK_EQ(MXNDArrayWaitAll(), 0); }
+void NDArray::SampleGaussian(mx_float mu, mx_float sigma, NDArray *out) {
+  Operator("_sample_normal")(mu, sigma).Invoke(*out);
+}
+void NDArray::SampleUniform(mx_float begin, mx_float end, NDArray *out) {
+  Operator("_sample_uniform")(begin, end).Invoke(*out);
+}
+void NDArray::Load(const std::string &file_name,
+                   std::vector<NDArray> *array_list,
+                   std::map<std::string, NDArray> *array_map) {
+  mx_uint out_size, out_name_size;
+  NDArrayHandle *out_arr;
+  const char **out_names;
+  CHECK_EQ(MXNDArrayLoad(file_name.c_str(), &out_size, &out_arr, &out_name_size,
+                         &out_names),
+           0);
+  if (array_list != nullptr) {
+    for (mx_uint i = 0; i < out_size; ++i) {
+      array_list->push_back(NDArray(out_arr[i]));
+    }
+  }
+  if (array_map != nullptr && out_name_size > 0) {
+    CHECK_EQ(out_name_size, out_size);
+    for (mx_uint i = 0; i < out_size; ++i) {
+      (*array_map)[out_names[i]] = NDArray(out_arr[i]);
+    }
+  }
+}
+std::map<std::string, NDArray> NDArray::LoadToMap(
+    const std::string &file_name) {
+  std::map<std::string, NDArray> array_map;
+  mx_uint out_size, out_name_size;
+  NDArrayHandle *out_arr;
+  const char **out_names;
+  CHECK_EQ(MXNDArrayLoad(file_name.c_str(), &out_size, &out_arr, &out_name_size,
+                         &out_names),
+           0);
+  if (out_name_size > 0) {
+    CHECK_EQ(out_name_size, out_size);
+    for (mx_uint i = 0; i < out_size; ++i) {
+      array_map[out_names[i]] = NDArray(out_arr[i]);
+    }
+  }
+  return array_map;
+}
+std::vector<NDArray> NDArray::LoadToList(const std::string &file_name) {
+  std::vector<NDArray> array_list;
+  mx_uint out_size, out_name_size;
+  NDArrayHandle *out_arr;
+  const char **out_names;
+  CHECK_EQ(MXNDArrayLoad(file_name.c_str(), &out_size, &out_arr, &out_name_size,
+                         &out_names),
+           0);
+  for (mx_uint i = 0; i < out_size; ++i) {
+    array_list.push_back(NDArray(out_arr[i]));
+  }
+  return array_list;
+}
+void NDArray::Save(const std::string &file_name,
+                   const std::map<std::string, NDArray> &array_map) {
+  std::vector<NDArrayHandle> args;
+  std::vector<const char *> keys;
+  for (const auto &t : array_map) {
+    args.push_back(t.second.GetHandle());
+    keys.push_back(t.first.c_str());
+  }
+  CHECK_EQ(
+      MXNDArraySave(file_name.c_str(), args.size(), args.data(), keys.data()),
+      0);
+}
+void NDArray::Save(const std::string &file_name,
+                   const std::vector<NDArray> &array_list) {
+  std::vector<NDArrayHandle> args;
+  for (const auto &t : array_list) {
+    args.push_back(t.GetHandle());
+  }
+  CHECK_EQ(MXNDArraySave(file_name.c_str(), args.size(), args.data(), nullptr),
+           0);
+}
+
+size_t NDArray::Offset(size_t h, size_t w) const {
+  return (h * GetShape()[1]) + w;
+}
+
+size_t NDArray::Offset(size_t c, size_t h, size_t w) const {
+  auto const shape = GetShape();
+  // Row-major CHW layout: advancing one channel skips an h*w plane,
+  // advancing one row skips w elements.
+  return (c * shape[1] + h) * shape[2] + w;
+}
+
+mx_float NDArray::At(size_t h, size_t w) const {
+  return GetData()[Offset(h, w)];
+}
+
+mx_float NDArray::At(size_t c, size_t h, size_t w) const {
+  return GetData()[Offset(c, h, w)];
+}
+
+size_t NDArray::Size() const {
+  size_t ret = 1;
+  for (auto &i : GetShape()) ret *= i;
+  return ret;
+}
+
+std::vector<mx_uint> NDArray::GetShape() const {
+  const mx_uint *out_pdata;
+  mx_uint out_dim;
+  MXNDArrayGetShape(blob_ptr_->handle_, &out_dim, &out_pdata);
+  std::vector<mx_uint> ret;
+  for (mx_uint i = 0; i < out_dim; ++i) {
+    ret.push_back(out_pdata[i]);
+  }
+  return ret;
+}
+
+const mx_float *NDArray::GetData() const {
+  mx_float *ret;
+  // Raw data access is only valid for host-resident arrays.
+  CHECK_NE(GetContext().GetDeviceType(), DeviceType::kGPU);
+  MXNDArrayGetData(blob_ptr_->handle_, &ret);
+  return ret;
+}
+Context NDArray::GetContext() const {
+  int out_dev_type;
+  int out_dev_id;
+  MXNDArrayGetContext(blob_ptr_->handle_, &out_dev_type, &out_dev_id);
+  return Context((DeviceType)out_dev_type, out_dev_id);
+}
+}  // namespace cpp
+}  // namespace mxnet
+
+#endif  // MXNETCPP_NDARRAY_HPP
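The next file, op.h, is generated from the operator registry; every wrapper simply forwards its arguments to Operator with the registered parameter names. A hedged sketch of how two of the wrappers defined below compose (Symbol::Variable is assumed from symbol.h):

    // Sketch: chaining generated wrappers from op.h.
    #include "mxnet-cpp/MxNetCpp.h"
    using namespace mxnet::cpp;

    Symbol ToyBlock() {
      Symbol data  = Symbol::Variable("data");
      Symbol gamma = Symbol::Variable("gamma");
      Symbol beta  = Symbol::Variable("beta");
      Symbol bn    = BatchNorm("bn", data, gamma, beta);        // defined below
      return LeakyReLU("act", bn, LeakyReLUActType::leaky);     // defined below
    }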
diff --git a/cpp-package/include/mxnet-cpp/op.h b/cpp-package/include/mxnet-cpp/op.h
new file mode 100644
index 000000000000..d873b8034a99
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/op.h
@@ -0,0 +1,7629 @@
+/*!
+* Copyright (c) 2016 by Contributors
+* \file op.h
+* \brief definition of all the operators
+* \author Chuntao Hong, Xin Li
+*/
+
+#ifndef _MXNETOP_H
+#define _MXNETOP_H
+
+#include <string>
+#include <vector>
+#include "mxnet-cpp/base.h"
+#include "mxnet-cpp/shape.h"
+#include "mxnet-cpp/operator.h"
+#include "dmlc/optional.h"
+
+namespace mxnet {
+namespace cpp {
+
+/*!
+ * \brief Batch normalization.
+ *
+ * Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as
+ * well as offset ``beta``.
+ *
+ * Assume the input has more than one dimension and we normalize along axis 1.
+ * We first compute the mean and variance along this axis:
+ *
+ * .. math::
+ *
+ *   data\_mean[i] = mean(data[:,i,:,...]) \\
+ *   data\_var[i] = var(data[:,i,:,...])
+ *
+ * Then compute the normalized output, which has the same shape as input, as
+ *
+ * .. math::
+ *
+ *   out[:,i,:,...] = \frac{data[:,i,:,...] - data\_mean[i]}{\sqrt{data\_var[i] + \epsilon}} * gamma[i] + beta[i]
+ *
+ * Both *mean* and *var* return a scalar by treating the input as a vector.
+ *
+ * Assume the input has size *k* on axis 1, then both ``gamma`` and ``beta``
+ * have shape *(k,)*. If ``output_mean_var`` is set to be true, then outputs both
+ * ``data_mean`` and ``data_var`` as well, which are needed for the backward pass.
+ *
+ * Besides the inputs and the outputs, this operator accepts two auxiliary
+ * states, ``moving_mean`` and ``moving_var``, which are *k*-length
+ * vectors. They are global statistics for the whole dataset, which are updated
+ * by::
+ *
+ *   moving_mean = moving_mean * momentum + data_mean * (1 - momentum)
+ *   moving_var = moving_var * momentum + data_var * (1 - momentum)
+ *
+ * If ``use_global_stats`` is set to be true, then ``moving_mean`` and
+ * ``moving_var`` are used instead of ``data_mean`` and ``data_var`` to compute
+ * the output. It is often used during inference.
+ *
+ * Both ``gamma`` and ``beta`` are learnable parameters. But if ``fix_gamma`` is
+ * true, then set ``gamma`` to 1 and its gradient to 0.
+ *
+ *
+ *
+ *        Defined in /home/xlidc/mxnet/src/operator/batch_norm.cc:L84
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to batch normalization
+ * \param gamma gamma array
+ * \param beta beta array
+ * \param eps Epsilon to prevent div 0
+ * \param momentum Momentum for moving average
+ * \param fix_gamma Fix gamma while training
+ * \param use_global_stats Whether use global moving statistics instead of local
+ * \param output_mean_var Output the mean and variance along with the normalized data
+ * \return new symbol
+ */
+inline Symbol BatchNorm(const std::string& symbol_name,
+                        Symbol data,
+                        Symbol gamma,
+                        Symbol beta,
+                        mx_float eps = 0.001,
+                        mx_float momentum = 0.9,
+                        bool fix_gamma = true,
+                        bool use_global_stats = false,
+                        bool output_mean_var = false) {
+  return Operator("BatchNorm")
+           .SetParam("eps", eps)
+           .SetParam("momentum", momentum)
+           .SetParam("fix_gamma", fix_gamma)
+           .SetParam("use_global_stats", use_global_stats)
+           .SetParam("output_mean_var", output_mean_var)
+           .SetInput("data", data)
+           .SetInput("gamma", gamma)
+           .SetInput("beta", beta)
+           .CreateSymbol(symbol_name);
+}
+
+/*! \brief Activation function to be applied.
+ */
+enum class LeakyReLUActType {
+  elu = 0,
+  leaky = 1,
+  prelu = 2,
+  rrelu = 3
+};
+
+/*!
+ * \brief Leaky ReLU activation
+ *
+ * The following types are supported:
+ *
+ * - *elu*: ``y = x > 0 ? x : slope * (exp(x)-1)``
+ * - *leaky*: ``y = x > 0 ? x : slope * x``
+ * - *prelu*: same as *leaky* but the ``slope`` is learnable.
+ * - *rrelu*: same as *leaky* but the ``slope`` is uniformly randomly chosen from
+ *   *[lower_bound, upper_bound)* for training, while fixed to be
+ *   *(lower_bound+upper_bound)/2* for inference.
+ *
+ *
+ *
+ *        Defined in /home/xlidc/mxnet/src/operator/leaky_relu.cc:L36
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to activation function.
+ * \param act_type Activation function to be applied.
+ * \param slope Init slope for the activation. (For leaky and elu only)
+ * \param lower_bound Lower bound of random slope. (For rrelu only)
+ * \param upper_bound Upper bound of random slope. (For rrelu only)
+ * \return new symbol
+ */
+inline Symbol LeakyReLU(const std::string& symbol_name,
+                        Symbol data,
+                        LeakyReLUActType act_type = LeakyReLUActType::leaky,
+                        mx_float slope = 0.25,
+                        mx_float lower_bound = 0.125,
+                        mx_float upper_bound = 0.334) {
+  static const char *LeakyReLUActTypeValues[] = {
+    "elu",
+    "leaky",
+    "prelu",
+    "rrelu"
+  };
+  return Operator("LeakyReLU")
+           .SetParam("act_type", LeakyReLUActTypeValues[int(act_type)])
+           .SetParam("slope", slope)
+           .SetParam("lower_bound", lower_bound)
+           .SetParam("upper_bound", upper_bound)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Concatenate a list of arrays along a given axis.
+ *
+ * The dimension sizes of the input arrays on the given axis should be the same.
+ *
+ * For example::
+ *
+ *   x = [[1,1],[1,1]]
+ *   y = [[2,2],[2,2]]
+ *   z = [[3,3],[3,3],[3,3]]
+ *
+ *   Concat(x,y,z,dim=0) = [[ 1.,  1.],
+ *                          [ 1.,  1.],
+ *                          [ 2.,  2.],
+ *                          [ 2.,  2.],
+ *                          [ 3.,  3.],
+ *                          [ 3.,  3.],
+ *                          [ 3.,  3.]]
+ *
+ *   Concat(x,y,z,dim=1) = [[ 1.,  1.,  2.,  2.],
+ *                          [ 1.,  1.,  2.,  2.]]
+ *
+ *
+ *
+ *        Defined in /home/xlidc/mxnet/src/operator/concat.cc:L69
+ * \param symbol_name name of the resulting symbol
+ * \param data List of tensors to concatenate
+ * \param num_args Number of inputs to be concatenated.
+ * \param dim the dimension to be concatenated.
+ * \return new symbol + */ +inline Symbol Concat(const std::string& symbol_name, + const std::vector& data, + int num_args, + int dim = 1) { + return Operator("Concat") + .SetParam("num_args", num_args) + .SetParam("dim", dim) +(data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Apply a sparse regularization to the output a sigmoid activation function. + * \param symbol_name name of the resulting symbol + * \param data Input data. + * \param sparseness_target The sparseness target + * \param penalty The tradeoff parameter for the sparseness penalty + * \param momentum The momentum for running average + * \return new symbol + */ +inline Symbol IdentityAttachKLSparseReg(const std::string& symbol_name, + Symbol data, + mx_float sparseness_target = 0.1, + mx_float penalty = 0.001, + mx_float momentum = 0.9) { + return Operator("IdentityAttachKLSparseReg") + .SetParam("sparseness_target", sparseness_target) + .SetParam("penalty", penalty) + .SetParam("momentum", momentum) + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Calculate cross_entropy(data, one_hot(label)) + * + * From:/home/xlidc/mxnet/src/operator/loss_binary_op.cc:12 + * \param symbol_name name of the resulting symbol + * \param data Input data + * \param label Input label + * \return new symbol + */ +inline Symbol softmax_cross_entropy(const std::string& symbol_name, + Symbol data, + Symbol label) { + return Operator("softmax_cross_entropy") + .SetInput("data", data) + .SetInput("label", label) + .CreateSymbol(symbol_name); +} + +/*! \breif Padding type to use. "constant" pads all values with a constant value, the + * value of which can be specified with the constant_value option. "edge" uses the + */ +enum class PadMode { + constant = 0, + edge = 1 +}; + +/*! + * \breif Pad an array. + * + * Only supports 4-D and 5-D input array. + * + * + * + * Defined in /home/xlidc/mxnet/src/operator/pad.cc:L407 + * \param symbol_name name of the resulting symbol + * \param data An n-dimensional input tensor. + * \param mode Padding type to use. "constant" pads all values with a constant value, the + * value of which can be specified with the constant_value option. "edge" uses the + * \param pad_width A tuple of padding widths of length 2*r, where r is the rank of the + * input tensor, specifying number of values padded to the edges of each axis. + * (before_1, after_1, ... , before_N, after_N) unique pad widths for each axis. + * \param constant_value This option is only used when mode is "constant". This value + * \return new symbol + */ +inline Symbol Pad(const std::string& symbol_name, + Symbol data, + PadMode mode, + Shape pad_width, + double constant_value = 0) { + static const char *PadModeValues[] = { + "constant", + "edge" + }; + return Operator("Pad") + .SetParam("mode", PadModeValues[int(mode)]) + .SetParam("pad_width", pad_width) + .SetParam("constant_value", constant_value) + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Updater function for sgd optimizer + * \param symbol_name name of the resulting symbol + * \param lr learning_rate + * \param wd weight decay + * \param rescale_grad rescale gradient as grad = rescale_grad*grad. 
+ * \param clip_gradient If greater than 0, clip gradient to grad = max(min(grad, + * \return new symbol + */ +inline Symbol sgd_update(const std::string& symbol_name, + mx_float lr, + mx_float wd = 0, + mx_float rescale_grad = 1, + mx_float clip_gradient = -1) { + return Operator("sgd_update") + .SetParam("lr", lr) + .SetParam("wd", wd) + .SetParam("rescale_grad", rescale_grad) + .SetParam("clip_gradient", clip_gradient) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Updater function for sgd optimizer + * \param symbol_name name of the resulting symbol + * \param lr learning_rate + * \param momentum momentum + * \param wd weight decay + * \param rescale_grad rescale gradient as grad = rescale_grad*grad. + * \param clip_gradient If greater than 0, clip gradient to grad = max(min(grad, + * \return new symbol + */ +inline Symbol sgd_mom_update(const std::string& symbol_name, + mx_float lr, + mx_float momentum = 0, + mx_float wd = 0, + mx_float rescale_grad = 1, + mx_float clip_gradient = -1) { + return Operator("sgd_mom_update") + .SetParam("lr", lr) + .SetParam("momentum", momentum) + .SetParam("wd", wd) + .SetParam("rescale_grad", rescale_grad) + .SetParam("clip_gradient", clip_gradient) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Updater function for adam optimizer + * \param symbol_name name of the resulting symbol + * \param lr learning_rate + * \param beta1 beta1 + * \param beta2 beta2 + * \param epsilon epsilon + * \param wd weight decay + * \param rescale_grad rescale gradient as grad = rescale_grad*grad. + * \param clip_gradient If greater than 0, clip gradient to grad = max(min(grad, + * \return new symbol + */ +inline Symbol adam_update(const std::string& symbol_name, + mx_float lr, + mx_float beta1 = 0.9, + mx_float beta2 = 0.999, + mx_float epsilon = 1e-08, + mx_float wd = 0, + mx_float rescale_grad = 1, + mx_float clip_gradient = -1) { + return Operator("adam_update") + .SetParam("lr", lr) + .SetParam("beta1", beta1) + .SetParam("beta2", beta2) + .SetParam("epsilon", epsilon) + .SetParam("wd", wd) + .SetParam("rescale_grad", rescale_grad) + .SetParam("clip_gradient", clip_gradient) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Updater function for RMSProp optimizer. The RMSProp code follows the version in + * http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf + * \param symbol_name name of the resulting symbol + * \param lr learning_rate + * \param gamma1 gamma1 + * \param epsilon epsilon + * \param wd weight decay + * \param rescale_grad rescale gradient as grad = rescale_grad*grad. + * \param clip_gradient If greater than 0, clip gradient to grad = max(min(grad, + * \param clip_weights If greater than 0, clip weights to weights = max(min(weights, + * \return new symbol + */ +inline Symbol rmsprop_update(const std::string& symbol_name, + mx_float lr, + mx_float gamma1 = 0.95, + mx_float epsilon = 1e-08, + mx_float wd = 0, + mx_float rescale_grad = 1, + mx_float clip_gradient = -1, + mx_float clip_weights = -1) { + return Operator("rmsprop_update") + .SetParam("lr", lr) + .SetParam("gamma1", gamma1) + .SetParam("epsilon", epsilon) + .SetParam("wd", wd) + .SetParam("rescale_grad", rescale_grad) + .SetParam("clip_gradient", clip_gradient) + .SetParam("clip_weights", clip_weights) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Updater function for RMSPropAlex optimizer. 
The RMSPropAlex code follows the + * version in http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, + * \param symbol_name name of the resulting symbol + * \param lr learning_rate + * \param gamma1 gamma1 + * \param gamma2 gamma2 + * \param epsilon epsilon + * \param wd weight decay + * \param rescale_grad rescale gradient as grad = rescale_grad*grad. + * \param clip_gradient If greater than 0, clip gradient to grad = max(min(grad, + * \param clip_weights If greater than 0, clip weights to weights = max(min(weights, + * \return new symbol + */ +inline Symbol rmspropalex_update(const std::string& symbol_name, + mx_float lr, + mx_float gamma1 = 0.95, + mx_float gamma2 = 0.9, + mx_float epsilon = 1e-08, + mx_float wd = 0, + mx_float rescale_grad = 1, + mx_float clip_gradient = -1, + mx_float clip_weights = -1) { + return Operator("rmspropalex_update") + .SetParam("lr", lr) + .SetParam("gamma1", gamma1) + .SetParam("gamma2", gamma2) + .SetParam("epsilon", epsilon) + .SetParam("wd", wd) + .SetParam("rescale_grad", rescale_grad) + .SetParam("clip_gradient", clip_gradient) + .SetParam("clip_weights", clip_weights) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Interchange two axes of an array. + * + * Examples:: + * + * x = [[1, 2, 3]]) + * swapaxes(x, 0, 1) = [[ 1], + * [ 2], + * [ 3]] + * + * x = [[[ 0, 1], + * [ 2, 3]], + * [[ 4, 5], + * [ 6, 7]]] // (2,2,2) array + * + * swapaxes(x, 0, 2) = [[[ 0, 4], + * [ 2, 6]], + * [[ 1, 5], + * [ 3, 7]]] + * + * + * Defined in /home/xlidc/mxnet/src/operator/swapaxis.cc:L55 + * \param symbol_name name of the resulting symbol + * \param data Input array. + * \param dim1 the first axis to be swapped. + * \param dim2 the second axis to be swapped. + * \return new symbol + */ +inline Symbol SwapAxis(const std::string& symbol_name, + Symbol data, + uint32_t dim1 = 0, + uint32_t dim2 = 0) { + return Operator("SwapAxis") + .SetParam("dim1", dim1) + .SetParam("dim2", dim2) + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Split an array along a particular axis into multiple sub-arrays. + * + * Assume the input array has shape ``(d_0, ..., d_n)`` and we slice it into *m* + * (``num_outputs=m``) subarrays along axis *k*, then we will obtain a list of *m* + * arrays with each of which has shape ``(d_0, ..., d_k/m, ..., d_n)``. + * + * For example:: + * + * x = [[1, 2], + * [3, 4], + * [5, 6], + * [7, 8]] // 4x2 array + * + * y = split(x, axis=0, num_outputs=4) // a list of 4 arrays + * y[0] = [[ 1., 2.]] // 1x2 array + * + * z = split(x, axis=0, num_outputs=2) // a list of 2 arrays + * z[0] = [[ 1., 2.], + * [ 3., 4.]] + * + * When setting optional argument ``squeeze_axis=1``, then the *k*-dimension will + * be removed from the shape if it becomes 1:: + * + * y = split(x, axis=0, num_outputs=4, squeeze_axis=1) + * y[0] = [ 1., 2.] // (2,) vector + * + * + * + * Defined in /home/xlidc/mxnet/src/operator/slice_channel.cc:L50 + * \param symbol_name name of the resulting symbol + * \param num_outputs Number of outputs to be sliced. + * \param axis Dimension along which to slice. + * \param squeeze_axis If true, the dimension will be squeezed. Also, input.shape[axis] + * \return new symbol + */ +inline Symbol SliceChannel(const std::string& symbol_name, + int num_outputs, + int axis = 1, + bool squeeze_axis = false) { + return Operator("SliceChannel") + .SetParam("num_outputs", num_outputs) + .SetParam("axis", axis) + .SetParam("squeeze_axis", squeeze_axis) + .CreateSymbol(symbol_name); +} + +/*! 
\brief upsampling method
+ */
+enum class UpSamplingSampleType {
+  bilinear = 0,
+  nearest = 1
+};
+
+/*! \brief How to handle multiple inputs. concat means concatenate upsampled images along
+ *        the channel dimension. sum means add all images together, only available for
+ */
+enum class UpSamplingMultiInputMode {
+  concat = 0,
+  sum = 1
+};
+
+/*!
+ * \brief Perform nearest neighbor/bilinear upsampling to inputs
+ * \param symbol_name name of the resulting symbol
+ * \param data Array of tensors to upsample
+ * \param scale Up sampling scale
+ * \param sample_type upsampling method
+ * \param num_args Number of inputs to be upsampled. For nearest neighbor upsampling,
+ *        this can be 1-N; the size of output will be (scale*h_0, scale*w_0) and all other
+ *        inputs will be upsampled to the same size. For bilinear upsampling this must be
+ * \param num_filter Input filter. Only used by bilinear sample_type.
+ * \param multi_input_mode How to handle multiple inputs. concat means concatenate
+ *        upsampled images along the channel dimension. sum means add all images
+ * \param workspace Tmp workspace for deconvolution (MB)
+ * \return new symbol
+ */
+inline Symbol UpSampling(const std::string& symbol_name,
+                         const std::vector<Symbol>& data,
+                         uint32_t scale,
+                         UpSamplingSampleType sample_type,
+                         int num_args,
+                         uint32_t num_filter = 0,
+                         UpSamplingMultiInputMode multi_input_mode = UpSamplingMultiInputMode::concat,
+                         uint64_t workspace = 512) {
+  static const char *UpSamplingSampleTypeValues[] = {
+    "bilinear",
+    "nearest"
+  };
+  static const char *UpSamplingMultiInputModeValues[] = {
+    "concat",
+    "sum"
+  };
+  return Operator("UpSampling")
+           .SetParam("scale", scale)
+           .SetParam("sample_type", UpSamplingSampleTypeValues[int(sample_type)])
+           .SetParam("num_args", num_args)
+           .SetParam("num_filter", num_filter)
+           .SetParam("multi_input_mode", UpSamplingMultiInputModeValues[int(multi_input_mode)])
+           .SetParam("workspace", workspace)
+           (data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Add two symbols element-wise.
+ * \param symbol_name name of the resulting symbol
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol elemwise_add(const std::string& symbol_name,
+                           Symbol lhs,
+                           Symbol rhs) {
+  return Operator("elemwise_add")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Calculate Smooth L1 Loss(lhs, scalar)
+ *
+ * From:/home/xlidc/mxnet/src/operator/tensor/elemwise_binary_scalar_op_extended.cc:63
+ * \param symbol_name name of the resulting symbol
+ * \param data source input
+ * \param scalar scalar input
+ * \return new symbol
+ */
+inline Symbol smooth_l1(const std::string& symbol_name,
+                        Symbol data,
+                        mx_float scalar) {
+  return Operator("smooth_l1")
+           .SetParam("scalar", scalar)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*! \brief The return type. "value" means returning the top k values, "indices" means
+ *        returning the indices of the top k values, "mask" means to return a mask array
+ *        containing 0 and 1. 1 means the top k values. "both" means to return both value
+ */
+enum class TopkRetTyp {
+  both = 0,
+  indices = 1,
+  mask = 2,
+  value = 3
+};
+
+/*!
+ * \brief Return the top *k* elements in an array.
+ * + * Examples:: + * + * x = [[ 0.3, 0.2, 0.4], + * [ 0.1, 0.3, 0.2]] + * + * // return the index of the largest element on last axis + * topk(x) = [[ 2.], + * [ 1.]] + * + * // return the value of the top-2 elements on last axis + * topk(x, ret_typ='value', k=2) = [[ 0.4, 0.3], + * [ 0.3, 0.2]] + * + * // flatten and then return both index and value + * topk(x, ret_typ='both', k=2, axis=None) = [ 0.4, 0.3], [ 2., 0.] + * + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/ordering_op.cc:L36 + * \param symbol_name name of the resulting symbol + * \param src Source input + * \param axis Axis along which to choose the top k indices. If not given, the flattened + * \param k Number of top elements to select, should be always smaller than or equal to + * \param ret_typ The return type. "value" means returning the top k values, "indices" + * means returning the indices of the top k values, "mask" means to return a mask + * array containing 0 and 1. 1 means the top k values. "both" means to return both + * \param is_ascend Whether to choose k largest or k smallest. Top K largest elements + * \return new symbol + */ +inline Symbol topk(const std::string& symbol_name, + Symbol src, + dmlc::optional axis = dmlc::optional(-1), + int k = 1, + TopkRetTyp ret_typ = TopkRetTyp::indices, + bool is_ascend = false) { + static const char *TopkRetTypValues[] = { + "both", + "indices", + "mask", + "value" + }; + return Operator("topk") + .SetParam("axis", axis) + .SetParam("k", k) + .SetParam("ret_typ", TopkRetTypValues[int(ret_typ)]) + .SetParam("is_ascend", is_ascend) + .SetInput("src", src) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Return a sorted copy of an array. + * + * Examples:: + * + * x = [[ 1, 4], + * [ 3, 1]] + * + * // sort along the last axis + * sort(x) = [[ 1., 4.], + * [ 1., 3.]] + * + * // flatten and then sort + * sort(x, axis=None) = [ 1., 1., 3., 4.] + * + * // sort long the first axis + * sort(x, axis=0) = [[ 1., 1.], + * [ 3., 4.]] + * + * // in a descend order + * sort(x, is_ascend=0) = [[ 4., 1.], + * [ 3., 1.]] + * + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/ordering_op.cc:L99 + * \param symbol_name name of the resulting symbol + * \param src Source input + * \param axis Axis along which to choose sort the input tensor. If not given, the + * \param is_ascend Whether sort in ascending or descending order. + * \return new symbol + */ +inline Symbol sort(const std::string& symbol_name, + Symbol src, + dmlc::optional axis = dmlc::optional(-1), + bool is_ascend = true) { + return Operator("sort") + .SetParam("axis", axis) + .SetParam("is_ascend", is_ascend) + .SetInput("src", src) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Returns the indices that can sort an array. + * + * Examples:: + * + * x = [[ 0.3, 0.2, 0.4], + * [ 0.1, 0.3, 0.2]] + * + * // sort along axis -1 + * argsort(x) = [[ 1., 0., 2.], + * [ 0., 2., 1.]] + * + * // sort along axis 0 + * argsort(x, axis=0) = [[ 1., 0., 1.] + * [ 0., 1., 0.]] + * + * // flatten and then sort + * argsort(x, axis=None) = [ 3., 1., 5., 0., 4., 2.] + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/ordering_op.cc:L146 + * \param symbol_name name of the resulting symbol + * \param src Source input + * \param axis Axis along which to sort the input tensor. If not given, the flattened + * \param is_ascend Whether sort in ascending or descending order. 
+ * \return new symbol + */ +inline Symbol argsort(const std::string& symbol_name, + Symbol src, + dmlc::optional axis = dmlc::optional(-1), + bool is_ascend = true) { + return Operator("argsort") + .SetParam("axis", axis) + .SetParam("is_ascend", is_ascend) + .SetInput("src", src) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Get output from a symbol and pass 0 gradient back + * + * From:/home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:31 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol BlockGrad(const std::string& symbol_name, + Symbol data) { + return Operator("BlockGrad") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! \breif Output data type. + */ +enum class CastDtype { + float16 = 0, + float32 = 1, + float64 = 2, + int32 = 3, + uint8 = 4 +}; + +/*! + * \breif Cast to a specified type, element-wise. + * + * For example:: + * + * cast([1e20, 11.1], dtype='float16') = [inf, 11.09375] + * cast([300, 11.1, 10.9, -1, -3], dtype='uint8') = [44, 11, 10, 255, 253] + * + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L65 + * \param symbol_name name of the resulting symbol + * \param data Source input + * \param dtype Output data type. + * \return new symbol + */ +inline Symbol Cast(const std::string& symbol_name, + Symbol data, + CastDtype dtype) { + static const char *CastDtypeValues[] = { + "float16", + "float32", + "float64", + "int32", + "uint8" + }; + return Operator("Cast") + .SetParam("dtype", CastDtypeValues[int(dtype)]) + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Negate src + * + * From:/home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:84 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol negative(const std::string& symbol_name, + Symbol data) { + return Operator("negative") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Returns the absolute value of array elements, element-wise. + * + * For example: + * abs([-2, 0, 3]) = [2, 0, 3] + * + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L95 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol abs(const std::string& symbol_name, + Symbol data) { + return Operator("abs") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Returns the indication sign of array elements, element-wise. + * + * For example:: + * sign([-2, 0, 3]) = [-1, 0, 1] + * + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L109 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol sign(const std::string& symbol_name, + Symbol data) { + return Operator("sign") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Round elements of the array to the nearest integer, element-wise. + * + * For example:: + * round([-2.1, -1.9, 1.5, 1.9, 2.1]) = [-2., -2., 2., 2., 2.] + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L122 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol round(const std::string& symbol_name, + Symbol data) { + return Operator("round") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! 
+ * \breif Return the ceiling of the input, element-wise. + * + * For example:: + * ceil([-2.1, -1.9, 1.5, 1.9, 2.1]) = [-2., -1., 2., 2., 3.] + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L132 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol ceil(const std::string& symbol_name, + Symbol data) { + return Operator("ceil") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Return the floor of the input, element-wise. + * + * For example:: + * floor([-2.1, -1.9, 1.5, 1.9, 2.1]) = [-3., -2., 1., 1., 2.] + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L141 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol floor(const std::string& symbol_name, + Symbol data) { + return Operator("floor") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Round elements of the array to the nearest integer, element-wise. + * + * For example:: + * rint([-2.1, -1.9, 1.5, 1.9, 2.1]) = [-2., -2., 1., 2., 2.] + * + * The difference to ``round`` is that ``rint`` returns ``n`` for input ``n.5`` + * while ``round`` returns ``n+1`` for ``n>=0``. + * + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L154 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol rint(const std::string& symbol_name, + Symbol data) { + return Operator("rint") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Round elements of the array to the nearest integer towards + * zero, element-wise. + * + * For example:: + * fix([-2.1, -1.9, 1.9, 2.1]) = [-2., -1., 1., 2.] + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L164 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol fix(const std::string& symbol_name, + Symbol data) { + return Operator("fix") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Calculate the square of an array, element-wise. + * + * For example:: + * square(x) = x^2 + * + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L174 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol square(const std::string& symbol_name, + Symbol data) { + return Operator("square") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Calculate the square-root of an array, element-wise. + * + * For example:: + * sqrt(x) = \sqrt{x} + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L187 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol sqrt(const std::string& symbol_name, + Symbol data) { + return Operator("sqrt") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Calculate the inverse square-root of an array, element-wise. 
+ * + * For example:: + * rsqrt(x) = 1/\sqrt{x} + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L200 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol rsqrt(const std::string& symbol_name, + Symbol data) { + return Operator("rsqrt") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Calculate the exponential of the array, element-wise + * + * For example:: + * exp(x) = e^x \approx 2.718^x + * + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L215 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol exp(const std::string& symbol_name, + Symbol data) { + return Operator("exp") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Natural logarithm, element-wise. + * + * The natural logarithm is logarithm in base *e*, so that ``log(exp(x)) = x`` + * + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L225 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol log(const std::string& symbol_name, + Symbol data) { + return Operator("log") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Calculate the base 10 logarithm of the array, element-wise. + * + * ``10**log10(x) = x`` + * + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L235 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol log10(const std::string& symbol_name, + Symbol data) { + return Operator("log10") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Calculate the base 2 logarithm of the array, element-wise. + * + * ``2**log2(x) = x`` + * + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L245 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol log2(const std::string& symbol_name, + Symbol data) { + return Operator("log2") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Trigonometric sine, element-wise. + * + * Then input is in radians (:math:`2\pi` rad equals 360 degress). + * + * .. math:: + * sin([0, \pi/4, \pi/2]) = [0, 0.707, 1] + * + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L261 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol sin(const std::string& symbol_name, + Symbol data) { + return Operator("sin") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Calculate ``log(1 + x)`` + * + * This function is more accurate than ``log(1 + x)`` for small ``x`` so that + * :math:`1+x\approx 1` + * + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L275 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol log1p(const std::string& symbol_name, + Symbol data) { + return Operator("log1p") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! 
+ * \brief Calculate ``exp(x) - 1``
+ *
+ * This function provides greater precision than ``exp(x) - 1`` for small values of ``x``.
+ *
+ *
+ *
+ *        Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L288
+ * \param symbol_name name of the resulting symbol
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol expm1(const std::string& symbol_name,
+                    Symbol data) {
+  return Operator("expm1")
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Cosine, element-wise.
+ *
+ * The input is in radians (:math:`2\pi` rad equals 360 degrees).
+ *
+ * .. math::
+ *   cos([0, \pi/4, \pi/2]) = [1, 0.707, 0]
+ *
+ *
+ *
+ *        Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L304
+ * \param symbol_name name of the resulting symbol
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol cos(const std::string& symbol_name,
+                  Symbol data) {
+  return Operator("cos")
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Tangent, element-wise.
+ *
+ * The input is in radians (:math:`2\pi` rad equals 360 degrees).
+ *
+ * .. math::
+ *   tan([0, \pi/4, \pi/2]) = [0, 1, -inf]
+ *
+ *
+ *
+ *        Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L320
+ * \param symbol_name name of the resulting symbol
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol tan(const std::string& symbol_name,
+                  Symbol data) {
+  return Operator("tan")
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Inverse sine, element-wise.
+ *
+ * The input should be in range :math:`[-1, 1]`.
+ * The output is in the closed interval :math:`[-\pi/2, \pi/2]`
+ *
+ * .. math::
+ *   arcsin([-1, -.707, 0, .707, 1]) = [-\pi/2, -\pi/4, 0, \pi/4, \pi/2]
+ *
+ *
+ *
+ *        Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L337
+ * \param symbol_name name of the resulting symbol
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol arcsin(const std::string& symbol_name,
+                     Symbol data) {
+  return Operator("arcsin")
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Inverse cosine, element-wise.
+ *
+ * The input should be in range :math:`[-1, 1]`.
+ * The output is in the closed interval :math:`[0, \pi]`
+ *
+ * .. math::
+ *   arccos([-1, -.707, 0, .707, 1]) = [\pi, 3\pi/4, \pi/2, \pi/4, 0]
+ *
+ *
+ *
+ *        Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L354
+ * \param symbol_name name of the resulting symbol
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol arccos(const std::string& symbol_name,
+                     Symbol data) {
+  return Operator("arccos")
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Inverse tangent, element-wise.
+ *
+ * The output is in the closed interval :math:`[-\pi/2, \pi/2]`
+ *
+ * .. math::
+ *   arctan([-1, 0, 1]) = [-\pi/4, 0, \pi/4]
+ *
+ *
+ *
+ *        Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L370
+ * \param symbol_name name of the resulting symbol
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol arctan(const std::string& symbol_name,
+                     Symbol data) {
+  return Operator("arctan")
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Convert angles from radians to degrees.
+ *
+ * ..
math:: + * degrees([0, \pi/2, \pi, 3\pi/2, 2\pi]) = [0, 90, 180, 270, 360] + * + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L384 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol degrees(const std::string& symbol_name, + Symbol data) { + return Operator("degrees") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Convert angles from degrees to radians. + * + * .. math:: + * radians([0, 90, 180, 270, 360]) = [0, \pi/2, \pi, 3\pi/2, 2\pi] + * + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L398 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol radians(const std::string& symbol_name, + Symbol data) { + return Operator("radians") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Hyperbolic sine, element-wise. + * + * For example:: + * sinh(x) = 0.5\times(exp(x) - exp(-x)) + * + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L412 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol sinh(const std::string& symbol_name, + Symbol data) { + return Operator("sinh") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Hyperbolic cosine, element-wise. + * + * For example:: + * cosh(x) = 0.5\times(exp(x) + exp(-x)) + * + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L426 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol cosh(const std::string& symbol_name, + Symbol data) { + return Operator("cosh") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Hyperbolic tangent element-wise. + * + * For example:: + * tanh(x) = sinh(x) / cosh(x) + * + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L440 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol tanh(const std::string& symbol_name, + Symbol data) { + return Operator("tanh") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Inverse hyperbolic sine, element-wise. + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L450 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol arcsinh(const std::string& symbol_name, + Symbol data) { + return Operator("arcsinh") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Inverse hyperbolic cosine, element-wise. + * + * + * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L460 + * \param symbol_name name of the resulting symbol + * \param data The input + * \return new symbol + */ +inline Symbol arccosh(const std::string& symbol_name, + Symbol data) { + return Operator("arccosh") + .SetInput("data", data) + .CreateSymbol(symbol_name); +} + +/*! + * \breif Inverse hyperbolic tangent, element-wise. 
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L470
+ * \param symbol_name name of the resulting symbol
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol arctanh(const std::string& symbol_name,
+                      Symbol data) {
+  return Operator("arctanh")
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief The gamma function (extension of the factorial function), element-wise
+ *
+ * From:/home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:479
+ * \param symbol_name name of the resulting symbol
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol gamma(const std::string& symbol_name,
+                    Symbol data) {
+  return Operator("gamma")
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Log of the absolute value of the gamma function, element-wise
+ *
+ * From:/home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:488
+ * \param symbol_name name of the resulting symbol
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol gammaln(const std::string& symbol_name,
+                      Symbol data) {
+  return Operator("gammaln")
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Map integer indices to vector representations (embeddings). Those embeddings are
+ * learnable parameters. For an input of shape (d1, ..., dK), the output shape is
+ * (d1, ..., dK, output_dim). All the input values should be integers in the range
+ * [0, input_dim).
+ *
+ * From:/home/xlidc/mxnet/src/operator/tensor/indexing_op.cc:19
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to the EmbeddingOp.
+ * \param weight Embedding weight matrix.
+ * \param input_dim vocabulary size of the input indices.
+ * \param output_dim dimension of the embedding vectors.
+ * \return new symbol
+ */
+inline Symbol Embedding(const std::string& symbol_name,
+                        Symbol data,
+                        Symbol weight,
+                        int input_dim,
+                        int output_dim) {
+  return Operator("Embedding")
+           .SetParam("input_dim", input_dim)
+           .SetParam("output_dim", output_dim)
+           .SetInput("data", data)
+           .SetInput("weight", weight)
+           .CreateSymbol(symbol_name);
+}
+
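+// Illustrative sketch (assumes the declarations above; all names are
+// hypothetical): embeds integer token ids from a 1000-word vocabulary into
+// 128-dimensional vectors. The weight is a learnable (input_dim, output_dim)
+// variable whose values are supplied when the symbol is bound.
+inline Symbol ExampleEmbeddingUsage() {
+  Symbol ids = Symbol::Variable("ids");      // integer indices, e.g. shape (batch,)
+  Symbol w   = Symbol::Variable("embed_w");  // learnable (1000, 128) weight
+  return Embedding("embed", ids, w, 1000, 128);
+}
+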
+/*! \brief specify how out-of-bound indices behave.
+ */
+enum class TakeMode {
+  clip = 0,
+  raise = 1,
+  wrap = 2
+};
+
+/*!
+ * \brief Take elements from an array along an axis.
+ *
+ * Slice along a particular axis with the provided indices. E.g., given an input
+ * with shape ``(d0, d1, d2)`` and indices with shape ``(i0, i1)``, then the output
+ * will have shape ``(i0, i1, d1, d2)``, with::
+ *
+ *   output[i,j,:,:] = input[indices[i,j],:,:]
+ *
+ * Examples::
+ *
+ *   x = [[ 1.,  2.],
+ *        [ 3.,  4.],
+ *        [ 5.,  6.]]
+ *
+ *   take(x, [[0,1],[1,2]]) = [[[ 1.,  2.],
+ *                              [ 3.,  4.]],
+ *
+ *                             [[ 3.,  4.],
+ *                              [ 5.,  6.]]]
+ *
+ * .. note::
+ *    Only slicing axis 0 is supported now.
+ *
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/indexing_op.cc:L79
+ * \param symbol_name name of the resulting symbol
+ * \param a The source array.
+ * \param indices The indices of the values to extract.
+ * \param axis the axis of data tensor to be taken.
+ * \param mode specify how out-of-bound indices behave.
+ * \return new symbol
+ */
+inline Symbol take(const std::string& symbol_name,
+                   Symbol a,
+                   Symbol indices,
+                   int axis = 0,
+                   TakeMode mode = TakeMode::raise) {
+  static const char *TakeModeValues[] = {
+    "clip",
+    "raise",
+    "wrap"
+  };
+  return Operator("take")
+           .SetParam("axis", axis)
+           .SetParam("mode", TakeModeValues[int(mode)])
+           .SetInput("a", a)
+           .SetInput("indices", indices)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Take elements from a data batch.
+ *
+ * Given a ``(d0, d1)`` input array and ``(d0,)`` indices, the output will be a
+ * ``(d0,)`` array computed by::
+ *
+ *   output[i] = input[i, indices[i]]
+ *
+ * Examples::
+ *
+ *   x = [[ 1.,  2.],
+ *        [ 3.,  4.],
+ *        [ 5.,  6.]]
+ *
+ *   batch_take(x, [0,1,0]) = [ 1.  4.  5.]
+ *
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/indexing_op.cc:L131
+ * \param symbol_name name of the resulting symbol
+ * \param a Input data array
+ * \param indices index array
+ * \return new symbol
+ */
+inline Symbol batch_take(const std::string& symbol_name,
+                         Symbol a,
+                         Symbol indices) {
+  return Operator("batch_take")
+           .SetInput("a", a)
+           .SetInput("indices", indices)
+           .CreateSymbol(symbol_name);
+}
+
+/*! \brief DType of the output
+ */
+enum class One_hotDtype {
+  float16 = 0,
+  float32 = 1,
+  float64 = 2,
+  int32 = 3,
+  uint8 = 4
+};
+
+/*!
+ * \brief Returns a one-hot array.
+ *
+ * The locations represented by ``indices`` take value ``on_value``, while all
+ * other locations take value ``off_value``.
+ *
+ * Assume ``indices`` has shape ``(i0, i1)``, then the output will have shape
+ * ``(i0, i1, depth)`` and::
+ *
+ *   output[i,j,:] = off_value
+ *   output[i,j,indices[i,j]] = on_value
+ *
+ * Examples::
+ *
+ *   one_hot([1,0,2,0], 3) = [[ 0.  1.  0.]
+ *                            [ 1.  0.  0.]
+ *                            [ 0.  0.  1.]
+ *                            [ 1.  0.  0.]]
+ *
+ *   one_hot([1,0,2,0], 3, on_value=8, off_value=1,
+ *           dtype='int32') = [[1 8 1]
+ *                             [8 1 1]
+ *                             [1 1 8]
+ *                             [8 1 1]]
+ *
+ *   one_hot([[1,0],[1,0],[2,0]], 3) = [[[ 0.  1.  0.]
+ *                                       [ 1.  0.  0.]]
+ *
+ *                                      [[ 0.  1.  0.]
+ *                                       [ 1.  0.  0.]]
+ *
+ *                                      [[ 0.  0.  1.]
+ *                                       [ 1.  0.  0.]]]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/indexing_op.cc:L177
+ * \param symbol_name name of the resulting symbol
+ * \param indices array of locations where to set on_value
+ * \param depth The dimension size at dim = axis.
+ * \param on_value The value assigned to the locations represented by indices.
+ * \param off_value The value assigned to the locations not represented by indices.
+ * \param dtype DType of the output
+ * \return new symbol
+ */
+inline Symbol one_hot(const std::string& symbol_name,
+                      Symbol indices,
+                      int depth,
+                      double on_value = 1,
+                      double off_value = 0,
+                      One_hotDtype dtype = One_hotDtype::float32) {
+  static const char *One_hotDtypeValues[] = {
+    "float16",
+    "float32",
+    "float64",
+    "int32",
+    "uint8"
+  };
+  return Operator("one_hot")
+           .SetParam("depth", depth)
+           .SetParam("on_value", on_value)
+           .SetParam("off_value", off_value)
+           .SetParam("dtype", One_hotDtypeValues[int(dtype)])
+           .SetInput("indices", indices)
+           .CreateSymbol(symbol_name);
+}
+
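+// Illustrative sketch (assumes the declarations above; names are hypothetical):
+// one-hot encodes integer class labels into depth-10 vectors, requesting int32
+// output through the generated dtype enum.
+inline Symbol ExampleOneHotUsage() {
+  Symbol labels = Symbol::Variable("labels");  // integer class ids
+  return one_hot("onehot", labels, 10, 1, 0, One_hotDtype::int32);
+}
+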
+/*!
+ * \brief Reshape array into a new shape.
+ *
+ * The shape is a tuple of int such as (2,3,4). The new shape should not change the
+ * array size. For example::
+ *
+ *   reshape([1,2,3,4], shape=(2,2)) = [[1,2], [3,4]]
+ *
+ * In addition, we can use special codes, which are integers less than
+ * 1, on some shape dimensions. To infer the output shape, we start with an empty
+ * tuple, continuously pop dimensions from the original shape starting from the
+ * beginning, and push the translated results into the output shape.
+ *
+ * Each special code presents a way of translation.
+ *
+ * - ``0`` for copying one. Pop one input dimension and push into the output. For
+ *   example::
+ *
+ *   - input=(2,3,4), shape=(4,0,2), output=(4,3,2)
+ *   - input=(2,3,4), shape=(2,0,0), output=(2,3,4)
+ *
+ * - ``-1`` for inference. Push a placeholder into the output whose value will be
+ *   inferred later::
+ *
+ *   - input=(2,3,4), shape=(6,1,-1), output=(6,1,4)
+ *   - input=(2,3,4), shape=(3,-1,8), output=(3,1,8)
+ *   - input=(2,3,4), shape=(-1,), output=(24,)
+ *
+ * - ``-2`` for copying all. Pop all remaining input dimensions and push them into
+ *   the output::
+ *
+ *   - input=(2,3,4), shape=(-2), output=(2,3,4)
+ *   - input=(2,3,4), shape=(2,-2), output=(2,3,4)
+ *   - input=(2,3,4), shape=(-2,1,1), output=(2,3,4,1,1)
+ *
+ * - ``-3`` for merging two dimensions. Pop two input dimensions, compute the
+ *   product, and push into the output::
+ *
+ *   - input=(2,3,4), shape=(-3,4), output=(6,4)
+ *   - input=(2,3,4), shape=(0,-3), output=(2,12)
+ *   - input=(2,3,4), shape=(-3,-2), output=(6,4)
+ *
+ * - ``-4`` for splitting two dimensions. Pop one input dimension, split it
+ *   according to the next two dimensions (which can contain one ``-1``) specified
+ *   after this code, then push into the output::
+ *
+ *   - input=(2,3,4), shape=(-4,1,2,-2), output=(1,2,3,4)
+ *   - input=(2,3,4), shape=(2,-4,-1,3,-2), output=(2,1,3,4)
+ *
+ * If the argument ``reverse`` is set to true, then the input shape is translated
+ * from right to left. For example, with input shape (10, 5, 4) and target shape
+ * (-1, 0), the output shape will be (50,4) if ``reverse=1``, otherwise it will be
+ * (40,5).
+ *
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:L78
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to reshape.
+ * \param target_shape (Deprecated! Use ``shape`` instead.) Target new shape. One and
+ * \param keep_highest (Deprecated! Use ``shape`` instead.) Whether keep the highest dim
+ * unchanged. If set to true, then the first dim in target_shape is ignored, and
+ * \param shape The target shape
+ * \param reverse If true then translating the input shape from right to left
+ * \return new symbol
+ */
+inline Symbol Reshape(const std::string& symbol_name,
+                      Symbol data,
+                      Shape target_shape = Shape(0,0),
+                      bool keep_highest = false,
+                      Shape shape = Shape(),
+                      bool reverse = false) {
+  return Operator("Reshape")
+           .SetParam("target_shape", target_shape)
+           .SetParam("keep_highest", keep_highest)
+           .SetParam("shape", shape)
+           .SetParam("reverse", reverse)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Flatten input into a 2-D array by collapsing the higher dimensions.
+ *
+ * Assume the input array has shape ``(d1, d2, ..., dk)``, then ``flatten``
+ * reshapes the input array into shape ``(d1, d2*...*dk)``.
+ *
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:L101
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to reshape.
+ * \return new symbol
+ */
+inline Symbol Flatten(const std::string& symbol_name,
+                      Symbol data) {
+  return Operator("Flatten")
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
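+// Illustrative sketch (assumes the declarations above; names are hypothetical):
+// reshapes a symbol to an explicit (6,4) target and then collapses it to 2-D.
+// Note the deprecated target_shape/keep_highest arguments are left at their
+// defaults and the new ``shape`` parameter carries the target.
+inline Symbol ExampleReshapeUsage() {
+  Symbol x = Symbol::Variable("x");   // e.g. an input of shape (2,3,4)
+  Symbol r = Reshape("r", x, Shape(0,0), false, Shape(6, 4));  // 24 elements kept
+  return Flatten("f", r);
+}
+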
+/*!
+ * \brief Permute the dimensions of an array.
+ *
+ * Examples::
+ *
+ *   x = [[ 1, 2],
+ *        [ 3, 4]]
+ *
+ *   transpose(x) = [[ 1.,  3.],
+ *                   [ 2.,  4.]]
+ *
+ *   x = [[[ 1.,  2.],
+ *         [ 3.,  4.]],
+ *
+ *        [[ 5.,  6.],
+ *         [ 7.,  8.]]]
+ *
+ *   transpose(x) = [[[ 1.,  5.],
+ *                    [ 3.,  7.]],
+ *
+ *                   [[ 2.,  6.],
+ *                    [ 4.,  8.]]]
+ *
+ *   transpose(x, axes=(1,0,2)) = [[[ 1.,  2.],
+ *                                  [ 5.,  6.]],
+ *
+ *                                 [[ 3.,  4.],
+ *                                  [ 7.,  8.]]]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:L142
+ * \param symbol_name name of the resulting symbol
+ * \param data Source input
+ * \param axes Target axis order. By default the axes will be inverted.
+ * \return new symbol
+ */
+inline Symbol transpose(const std::string& symbol_name,
+                        Symbol data,
+                        Shape axes = Shape()) {
+  return Operator("transpose")
+           .SetParam("axes", axes)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Insert a new axis with size 1 into the array shape
+ *
+ * For example, given ``x`` with shape ``(2,3,4)``, then ``expand_dims(x, axis=1)``
+ * will return a new array with shape ``(2,1,3,4)``.
+ *
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:L175
+ * \param symbol_name name of the resulting symbol
+ * \param data Source input
+ * \param axis Position (amongst axes) where new axis is to be inserted.
+ * \return new symbol
+ */
+inline Symbol expand_dims(const std::string& symbol_name,
+                          Symbol data,
+                          uint32_t axis) {
+  return Operator("expand_dims")
+           .SetParam("axis", axis)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Crop a contiguous region from the array.
+ *
+ * Assume the input array has *n* dimensions, given ``begin=(b_1, ..., b_n)`` and
+ * ``end=(e_1, ..., e_n)``, then ``crop`` will return a region with shape
+ * ``(e_1-b_1, ..., e_n-b_n)``. The result's *k*-th dimension contains elements
+ * from the *k*-th dimension of the input array within the half-open range ``[b_k, e_k)``.
+ *
+ * For example::
+ *
+ *   x = [[  1.,   2.,   3.,   4.],
+ *        [  5.,   6.,   7.,   8.],
+ *        [  9.,  10.,  11.,  12.]]
+ *
+ *   crop(x, begin=(0,1), end=(2,4)) = [[ 2.,  3.,  4.],
+ *                                      [ 6.,  7.,  8.]]
+ *
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:L207
+ * \param symbol_name name of the resulting symbol
+ * \param data Source input
+ * \param begin starting coordinates
+ * \param end ending coordinates
+ * \return new symbol
+ */
+inline Symbol slice(const std::string& symbol_name,
+                    Symbol data,
+                    Shape begin,
+                    Shape end) {
+  return Operator("slice")
+           .SetParam("begin", begin)
+           .SetParam("end", end)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Slice along a given axis.
+ *
+ * Examples::
+ *
+ *   x = [[  1.,   2.,   3.,   4.],
+ *        [  5.,   6.,   7.,   8.],
+ *        [  9.,  10.,  11.,  12.]]
+ *
+ *   slice_axis(x, axis=0, begin=1, end=3) = [[  5.,   6.,   7.,   8.],
+ *                                            [  9.,  10.,  11.,  12.]]
+ *
+ *   slice_axis(x, axis=1, begin=0, end=2) = [[  1.,   2.],
+ *                                            [  5.,   6.],
+ *                                            [  9.,  10.]]
+ *
+ *   slice_axis(x, axis=1, begin=-3, end=-1) = [[  2.,   3.],
+ *                                              [  6.,   7.],
+ *                                              [ 10.,  11.]]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:L285
+ * \param symbol_name name of the resulting symbol
+ * \param data Source input
+ * \param axis The axis to be sliced. Negative axis means to count from the last to the
+ * \param begin The beginning index to be sliced. Negative values are interpreted as
+ * \param end The end index to be sliced. The end can be None, in which case all the rest
+ * elements are used. Also, negative values are interpreted as counting from the
+ * \return new symbol
+ */
+inline Symbol slice_axis(const std::string& symbol_name,
+                         Symbol data,
+                         int axis,
+                         int begin,
+                         dmlc::optional<int> end) {
+  return Operator("slice_axis")
+           .SetParam("axis", axis)
+           .SetParam("begin", begin)
+           .SetParam("end", end)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Dot product of two arrays.
+ *
+ * ``dot``'s behavior depends on the input array dimensions:
+ *
+ * - 1-D arrays: inner product of vectors
+ * - 2-D arrays: matrix multiplication
+ * - N-D arrays: a sum product over the last axis of the first input and the first
+ *   axis of the second input
+ *
+ *   For example, given 3-D ``x`` with shape `(n,m,k)` and ``y`` with shape `(k,r,s)`,
+ *   the result array will have shape `(n,m,r,s)`. It is computed by::
+ *
+ *     dot(x,y)[i,j,a,b] = sum(x[i,j,:]*y[:,a,b])
+ *
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:L318
+ * \param symbol_name name of the resulting symbol
+ * \param lhs The first input
+ * \param rhs The second input
+ * \param transpose_a If true then transpose the first input before dot.
+ * \param transpose_b If true then transpose the second input before dot.
+ * \return new symbol
+ */
+inline Symbol dot(const std::string& symbol_name,
+                  Symbol lhs,
+                  Symbol rhs,
+                  bool transpose_a = false,
+                  bool transpose_b = false) {
+  return Operator("dot")
+           .SetParam("transpose_a", transpose_a)
+           .SetParam("transpose_b", transpose_b)
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Batchwise dot product.
+ *
+ * ``batch_dot`` is used to compute the dot product of ``x`` and ``y`` when ``x`` and
+ * ``y`` are data in batch, namely 3D arrays in shape of `(batch_size, :, :)`.
+ *
+ * For example, given ``x`` with shape `(batch_size, n, m)` and ``y`` with shape
+ * `(batch_size, m, k)`, the result array will have shape `(batch_size, n, k)`,
+ * which is computed by::
+ *
+ *   batch_dot(x,y)[i,:,:] = dot(x[i,:,:], y[i,:,:])
+ *
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:L354
+ * \param symbol_name name of the resulting symbol
+ * \param lhs The first input
+ * \param rhs The second input
+ * \param transpose_a If true then transpose the first input before dot.
+ * \param transpose_b If true then transpose the second input before dot.
+ * \return new symbol
+ */
+inline Symbol batch_dot(const std::string& symbol_name,
+                        Symbol lhs,
+                        Symbol rhs,
+                        bool transpose_a = false,
+                        bool transpose_b = false) {
+  return Operator("batch_dot")
+           .SetParam("transpose_a", transpose_a)
+           .SetParam("transpose_b", transpose_b)
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Clip (limit) the values in an array, element-wise.
+ *
+ * Given an interval, values outside the interval are clipped to the interval
+ * edges. That is::
+ *
+ *   clip(x) = max(min(x, a_max), a_min)
+ *
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:L393
+ * \param symbol_name name of the resulting symbol
+ * \param data Source input
+ * \param a_min Minimum value
+ * \param a_max Maximum value
+ * \return new symbol
+ */
+inline Symbol clip(const std::string& symbol_name,
+                   Symbol data,
+                   mx_float a_min,
+                   mx_float a_max) {
+  return Operator("clip")
+           .SetParam("a_min", a_min)
+           .SetParam("a_max", a_max)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
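+// Illustrative sketch (assumes the declarations above; names are hypothetical):
+// a matrix product with the second operand transposed, i.e. out = a * b^T,
+// then clipped into [0, 1].
+inline Symbol ExampleDotUsage() {
+  Symbol a = Symbol::Variable("a");        // e.g. shape (n, k)
+  Symbol b = Symbol::Variable("b");        // e.g. shape (m, k)
+  Symbol p = dot("ab_t", a, b, false, true);  // transpose_b=true -> shape (n, m)
+  return clip("p01", p, 0.0f, 1.0f);
+}
+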
+/*!
+ * \brief Repeat elements of an array.
+ *
+ * By default, ``repeat`` flattens the input array into 1-D and then repeats the
+ * elements::
+ *
+ *   x = [[ 1, 2],
+ *        [ 3, 4]]
+ *
+ *   repeat(x, repeats=2) = [ 1.,  1.,  2.,  2.,  3.,  3.,  4.,  4.]
+ *
+ * We can also choose a particular axis to repeat, in which case a negative axis is
+ * interpreted as counting from the back::
+ *
+ *   repeat(x, repeats=2, axis=1) = [[ 1.,  1.,  2.,  2.],
+ *                                   [ 3.,  3.,  4.,  4.]]
+ *
+ *   repeat(x, repeats=2, axis=-2) = [[ 1.,  2.],
+ *                                    [ 1.,  2.],
+ *                                    [ 3.,  4.],
+ *                                    [ 3.,  4.]]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:L432
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data array
+ * \param repeats The number of repetitions for each element.
+ * \param axis The axis along which to repeat values. Negative numbers are
+ * interpreted as counting from the back. By default, use the flattened input
+ * \return new symbol
+ */
+inline Symbol repeat(const std::string& symbol_name,
+                     Symbol data,
+                     int repeats,
+                     dmlc::optional<int> axis = dmlc::optional<int>()) {
+  return Operator("repeat")
+           .SetParam("repeats", repeats)
+           .SetParam("axis", axis)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Repeat the whole array multiple times.
+ *
+ * If ``reps`` has length *d* and the input array has *n* dimensions, then there
+ * are three cases:
+ *
+ * - **n=d**. Repeat the *i*-th dimension of the input by ``reps[i]`` times::
+ *
+ *     x = [[1, 2],
+ *          [3, 4]]
+ *
+ *     tile(x, reps=(2,3)) = [[ 1.,  2.,  1.,  2.,  1.,  2.],
+ *                            [ 3.,  4.,  3.,  4.,  3.,  4.],
+ *                            [ 1.,  2.,  1.,  2.,  1.,  2.],
+ *                            [ 3.,  4.,  3.,  4.,  3.,  4.]]
+ *
+ * - **n>d**. ``reps`` is promoted to length *n* by pre-pending 1's to it. Thus for
+ *   an input shape ``(2,3)``, ``reps=(2,)`` is treated as ``(1,2)``::
+ *
+ *     tile(x, reps=(2,)) = [[ 1.,  2.,  1.,  2.],
+ *                           [ 3.,  4.,  3.,  4.]]
+ *
+ * - **n<d**. The input is promoted to be *d*-dimensional by prepending new axes.
+ *
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data array
+ * \param reps The number of repetitions on each axis. If n < d, ``reps`` is promoted to
+ * \return new symbol
+ */
+inline Symbol tile(const std::string& symbol_name,
+                   Symbol data,
+                   Shape reps) {
+  return Operator("tile")
+           .SetParam("reps", reps)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Reverse elements of an array along the given axis.
+ *
+ * From:/home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:512
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data array
+ * \param axis The axis along which to reverse elements.
+ * \return new symbol
+ */
+inline Symbol reverse(const std::string& symbol_name,
+                      Symbol data,
+                      Shape axis) {
+  return Operator("reverse")
+           .SetParam("axis", axis)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
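+// Illustrative sketch (assumes the declarations above; names are hypothetical):
+// repeats each element twice along axis 0. The optional axis argument travels
+// through dmlc::optional<int>; default-constructed it means "flatten first".
+inline Symbol ExampleRepeatUsage() {
+  Symbol x = Symbol::Variable("x");
+  return repeat("rep", x, 2, dmlc::optional<int>(0));
+}
+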
+/*! \brief DType of the output. If output given, set to type of output. If output not given
+ */
+enum class UniformDtype {
+  None = 0,
+  float16 = 1,
+  float32 = 2,
+  float64 = 3
+};
+
+/*!
+ * \brief Draw samples from a uniform distribution.
+ *
+ * Samples are uniformly distributed over the half-open interval [low, high)
+ * (includes low, but excludes high)::
+ *
+ *   nd.uniform(low=0, high=1, shape=(2,2)) = [[ 0.60276335,  0.85794562],
+ *                                             [ 0.54488319,  0.84725171]]
+ *
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/sample_op.cc:L24
+ * \param symbol_name name of the resulting symbol
+ * \param low The lower bound of distribution
+ * \param high The upper bound of distribution
+ * \param shape The shape of the output
+ * \param ctx Context of output, in format [cpu|gpu|cpu_pinned](n). Only used for
+ * \param dtype DType of the output. If output given, set to type of output. If output not
+ * \return new symbol
+ */
+inline Symbol uniform(const std::string& symbol_name,
+                      mx_float low = 0,
+                      mx_float high = 1,
+                      Shape shape = Shape(),
+                      const std::string& ctx = "",
+                      UniformDtype dtype = UniformDtype::None) {
+  static const char *UniformDtypeValues[] = {
+    "None",
+    "float16",
+    "float32",
+    "float64"
+  };
+  // Note: ``ctx`` is not forwarded here; for symbols the context is chosen
+  // when the symbol is bound to an executor.
+  return Operator("uniform")
+           .SetParam("low", low)
+           .SetParam("high", high)
+           .SetParam("shape", shape)
+           .SetParam("dtype", UniformDtypeValues[int(dtype)])
+           .CreateSymbol(symbol_name);
+}
+
+/*! \brief DType of the output. If output given, set to type of output. If output not given
+ */
+enum class NormalDtype {
+  None = 0,
+  float16 = 1,
+  float32 = 2,
+  float64 = 3
+};
+
+/*!
+ * \brief Draw random samples from a normal (Gaussian) distribution.
+ *
+ * Examples::
+ *
+ *   normal(loc=0, scale=1, shape=(2,2)) = [[ 1.89171135, -1.16881478],
+ *                                          [-1.23474145,  1.55807114]]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/sample_op.cc:L35
+ * \param symbol_name name of the resulting symbol
+ * \param loc Mean of the distribution.
+ * \param scale Standard deviation of the distribution.
+ * \param shape The shape of the output
+ * \param ctx Context of output, in format [cpu|gpu|cpu_pinned](n). Only used for
+ * \param dtype DType of the output. If output given, set to type of output. If output not
+ * \return new symbol
+ */
+inline Symbol normal(const std::string& symbol_name,
+                     mx_float loc = 0,
+                     mx_float scale = 1,
+                     Shape shape = Shape(),
+                     const std::string& ctx = "",
+                     NormalDtype dtype = NormalDtype::None) {
+  static const char *NormalDtypeValues[] = {
+    "None",
+    "float16",
+    "float32",
+    "float64"
+  };
+  return Operator("normal")
+           .SetParam("loc", loc)
+           .SetParam("scale", scale)
+           .SetParam("shape", shape)
+           .SetParam("dtype", NormalDtypeValues[int(dtype)])
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Returns the indices of the maximum values along an axis.
+ *
+ * From:/home/xlidc/mxnet/src/operator/tensor/broadcast_reduce_op_index.cc:11
+ * \param symbol_name name of the resulting symbol
+ * \param data The input
+ * \param axis Empty or unsigned. The axis to perform the reduction. If left empty, a
+ * \param keepdims If true, the axis which is reduced is left in the result as dimension
+ * \return new symbol
+ */
+inline Symbol argmax(const std::string& symbol_name,
+                     Symbol data,
+                     int axis = -1,
+                     bool keepdims = false) {
+  return Operator("argmax")
+           .SetParam("axis", axis)
+           .SetParam("keepdims", keepdims)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Returns the indices of the minimum values along an axis.
+ *
+ * From:/home/xlidc/mxnet/src/operator/tensor/broadcast_reduce_op_index.cc:16
+ * \param symbol_name name of the resulting symbol
+ * \param data The input
+ * \param axis Empty or unsigned. The axis to perform the reduction. If left empty, a
+ * \param keepdims If true, the axis which is reduced is left in the result as dimension
+ * \return new symbol
+ */
+inline Symbol argmin(const std::string& symbol_name,
+                     Symbol data,
+                     int axis = -1,
+                     bool keepdims = false) {
+  return Operator("argmin")
+           .SetParam("axis", axis)
+           .SetParam("keepdims", keepdims)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
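+// Illustrative sketch (assumes the declarations above): a (2,2) symbol sampled
+// from U[0,1). Sampling operators take no data input, only parameters.
+inline Symbol ExampleUniformUsage() {
+  return uniform("u", 0, 1, Shape(2, 2));
+}
+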
+/*!
+ * \brief Take argmax indices of each channel of the input.
+ * \param symbol_name name of the resulting symbol
+ * \param src Source input
+ * \return new symbol
+ */
+inline Symbol argmax_channel(const std::string& symbol_name,
+                             Symbol src) {
+  return Operator("argmax_channel")
+           .SetInput("src", src)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Compute the sum of array elements over given axes.
+ *
+ * The argument ``axis`` specifies the axes to compute over:
+ *
+ * - **()**: compute over all elements into a scalar array with shape ``(1,)``. This is
+ *   the default option.
+ * - **int**: compute along a particular axis. If the input has shape ``(n, m, k)``,
+ *   using ``axis=0`` will result in an array with shape ``(m, k)``.
+ * - **tuple of int**: compute over multiple axes. Again assuming input shape ``(n, m,
+ *   k)``, with ``axis=(0,2)`` we obtain a ``(m,)`` shape array.
+ *
+ * If ``keepdims = 1``, then the result array will have the same number of dimensions
+ * as the input, while the reduced axes will have size 1.
+ *
+ *
+ * Defined in
+ * \param symbol_name name of the resulting symbol
+ * \param data The input
+ * \param axis The axes to perform the reduction.
+ * \param keepdims If true, the axes which are reduced are left in the result as
+ * \return new symbol
+ */
+inline Symbol sum(const std::string& symbol_name,
+                  Symbol data,
+                  Shape axis = Shape(),
+                  bool keepdims = false) {
+  return Operator("sum")
+           .SetParam("axis", axis)
+           .SetParam("keepdims", keepdims)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Compute the mean of array elements over given axes.
+ *
+ * The argument ``axis`` specifies the axes to compute over:
+ *
+ * - **()**: compute over all elements into a scalar array with shape ``(1,)``. This is
+ *   the default option.
+ * - **int**: compute along a particular axis. If the input has shape ``(n, m, k)``,
+ *   using ``axis=0`` will result in an array with shape ``(m, k)``.
+ * - **tuple of int**: compute over multiple axes. Again assuming input shape ``(n, m,
+ *   k)``, with ``axis=(0,2)`` we obtain a ``(m,)`` shape array.
+ *
+ * If ``keepdims = 1``, then the result array will have the same number of dimensions
+ * as the input, while the reduced axes will have size 1.
+ *
+ *
+ * Defined in
+ * \param symbol_name name of the resulting symbol
+ * \param data The input
+ * \param axis The axes to perform the reduction.
+ * \param keepdims If true, the axes which are reduced are left in the result as
+ * \return new symbol
+ */
+inline Symbol mean(const std::string& symbol_name,
+                   Symbol data,
+                   Shape axis = Shape(),
+                   bool keepdims = false) {
+  return Operator("mean")
+           .SetParam("axis", axis)
+           .SetParam("keepdims", keepdims)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Compute the product of array elements over given axes.
+ *
+ * The argument ``axis`` specifies the axes to compute over:
+ *
+ * - **()**: compute over all elements into a scalar array with shape ``(1,)``. This is
+ *   the default option.
+ * - **int**: compute along a particular axis. If the input has shape ``(n, m, k)``,
+ *   using ``axis=0`` will result in an array with shape ``(m, k)``.
+ * - **tuple of int**: compute over multiple axes. Again assuming input shape ``(n, m,
+ *   k)``, with ``axis=(0,2)`` we obtain a ``(m,)`` shape array.
+ *
+ * If ``keepdims = 1``, then the result array will have the same number of dimensions
+ * as the input, while the reduced axes will have size 1.
+ *
+ *
+ * Defined in
+ * \param symbol_name name of the resulting symbol
+ * \param data The input
+ * \param axis The axes to perform the reduction.
+ * \param keepdims If true, the axes which are reduced are left in the result as
+ * \return new symbol
+ */
+inline Symbol prod(const std::string& symbol_name,
+                   Symbol data,
+                   Shape axis = Shape(),
+                   bool keepdims = false) {
+  return Operator("prod")
+           .SetParam("axis", axis)
+           .SetParam("keepdims", keepdims)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Compute the sum of array elements over given axes with ``NaN`` ignored
+ *
+ * Refer to ``sum`` for more details.
+ *
+ *
+ *
+ * Defined in
+ * \param symbol_name name of the resulting symbol
+ * \param data The input
+ * \param axis The axes to perform the reduction.
+ * \param keepdims If true, the axes which are reduced are left in the result as
+ * \return new symbol
+ */
+inline Symbol nansum(const std::string& symbol_name,
+                     Symbol data,
+                     Shape axis = Shape(),
+                     bool keepdims = false) {
+  return Operator("nansum")
+           .SetParam("axis", axis)
+           .SetParam("keepdims", keepdims)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Compute the product of array elements over given axes with ``NaN`` ignored
+ *
+ * Refer to ``prod`` for more details.
+ *
+ *
+ *
+ * Defined in
+ * \param symbol_name name of the resulting symbol
+ * \param data The input
+ * \param axis The axes to perform the reduction.
+ * \param keepdims If true, the axes which are reduced are left in the result as
+ * \return new symbol
+ */
+inline Symbol nanprod(const std::string& symbol_name,
+                      Symbol data,
+                      Shape axis = Shape(),
+                      bool keepdims = false) {
+  return Operator("nanprod")
+           .SetParam("axis", axis)
+           .SetParam("keepdims", keepdims)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Compute the max of array elements over given axes.
+ *
+ * The argument ``axis`` specifies the axes to compute over:
+ *
+ * - **()**: compute over all elements into a scalar array with shape ``(1,)``. This is
+ *   the default option.
+ * - **int**: compute along a particular axis. If the input has shape ``(n, m, k)``,
+ *   using ``axis=0`` will result in an array with shape ``(m, k)``.
+ * - **tuple of int**: compute over multiple axes. Again assuming input shape ``(n, m,
+ *   k)``, with ``axis=(0,2)`` we obtain a ``(m,)`` shape array.
+ *
+ * If ``keepdims = 1``, then the result array will have the same number of dimensions
+ * as the input, while the reduced axes will have size 1.
+ *
+ *
+ * Defined in
+ * \param symbol_name name of the resulting symbol
+ * \param data The input
+ * \param axis The axes to perform the reduction.
+ * \param keepdims If true, the axes which are reduced are left in the result as
+ * \return new symbol
+ */
+inline Symbol max(const std::string& symbol_name,
+                  Symbol data,
+                  Shape axis = Shape(),
+                  bool keepdims = false) {
+  return Operator("max")
+           .SetParam("axis", axis)
+           .SetParam("keepdims", keepdims)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Compute the min of array elements over given axes.
+ *
+ * The argument ``axis`` specifies the axes to compute over:
+ *
+ * - **()**: compute over all elements into a scalar array with shape ``(1,)``. This is
+ *   the default option.
+ * - **int**: compute along a particular axis. If the input has shape ``(n, m, k)``,
+ *   using ``axis=0`` will result in an array with shape ``(m, k)``.
+ * - **tuple of int**: compute over multiple axes. Again assuming input shape ``(n, m,
+ *   k)``, with ``axis=(0,2)`` we obtain a ``(m,)`` shape array.
+ *
+ * If ``keepdims = 1``, then the result array will have the same number of dimensions
+ * as the input, while the reduced axes will have size 1.
+ *
+ *
+ * Defined in
+ * \param symbol_name name of the resulting symbol
+ * \param data The input
+ * \param axis The axes to perform the reduction.
+ * \param keepdims If true, the axes which are reduced are left in the result as
+ * \return new symbol
+ */
+inline Symbol min(const std::string& symbol_name,
+                  Symbol data,
+                  Shape axis = Shape(),
+                  bool keepdims = false) {
+  return Operator("min")
+           .SetParam("axis", axis)
+           .SetParam("keepdims", keepdims)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Broadcast an array over particular axes.
+ *
+ * Broadcasting is allowed on axes with size 1, such as from ``(2,1,3,1)`` to
+ * ``(2,8,3,9)``. Elements will be duplicated on the broadcast axes.
+ *
+ * For example::
+ *
+ *   // given (1,2,1) shape x
+ *   x = [[[ 1.],
+ *         [ 2.]]]
+ *
+ *   // broadcast on axis 2
+ *   broadcast_axis(x, axis=2, size=3) = [[[ 1.,  1.,  1.],
+ *                                         [ 2.,  2.,  2.]]]
+ *   // broadcast on axes 0 and 2
+ *   broadcast_axis(x, axis=(0,2), size=(2,3)) = [[[ 1.,  1.,  1.],
+ *                                                 [ 2.,  2.,  2.]],
+ *                                                [[ 1.,  1.,  1.],
+ *                                                 [ 2.,  2.,  2.]]]
+ *
+ *
+ * Defined in
+ * \param symbol_name name of the resulting symbol
+ * \param data The input
+ * \param axis The axes to perform the broadcasting.
+ * \param size Target sizes of the broadcasting axes.
+ * \return new symbol
+ */
+inline Symbol broadcast_axis(const std::string& symbol_name,
+                             Symbol data,
+                             Shape axis = Shape(),
+                             Shape size = Shape()) {
+  return Operator("broadcast_axis")
+           .SetParam("axis", axis)
+           .SetParam("size", size)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Broadcast an array to a new shape.
+ *
+ * Broadcasting is allowed on axes with size 1, such as from ``(2,1,3,1)`` to
+ * ``(2,8,3,9)``. Elements will be duplicated on the broadcast axes.
+ *
+ * For example::
+ *
+ *   broadcast_to([[1,2,3]], shape=(2,3)) = [[ 1.,  2.,  3.],
+ *                                           [ 1.,  2.,  3.]])
+ *
+ * The dimensions that will not be changed can also use the special code ``0`` that
+ * means copy the original value. So with ``shape=(2,0)`` we will obtain the same
+ * result as in the above example.
+ *
+ *
+ *
+ * Defined in
+ * \param symbol_name name of the resulting symbol
+ * \param data The input
+ * \param shape The shape of the desired array. We can set the dim to zero if it's same
+ * as the original. E.g `A = broadcast_to(B, shape=(10, 0, 0))` has the same
+ * \return new symbol
+ */
+inline Symbol broadcast_to(const std::string& symbol_name,
+                           Symbol data,
+                           Shape shape = Shape()) {
+  return Operator("broadcast_to")
+           .SetParam("shape", shape)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Compute the L2 norm.
+ *
+ * Flatten the input array and then compute the L2 norm.
+ *
+ * Examples::
+ *
+ *   x = [[1, 2],
+ *        [3, 4]]
+ *
+ *   norm(x) = [5.47722578]
+ *
+ *
+ *
+ * Defined in
+ * \param symbol_name name of the resulting symbol
+ * \param src Source input
+ * \return new symbol
+ */
+inline Symbol norm(const std::string& symbol_name,
+                   Symbol src) {
+  return Operator("norm")
+           .SetInput("src", src)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Given three ndarrays, condition, x, and y, return an ndarray with the elements
+ * from x or y, depending on whether the elements from condition are true or false.
+ * x and y must have the same shape.
+ * If condition has the same shape as x, each element
+ * in the output array is from x if the corresponding element in the condition is
+ * true, and from y if false. If condition does not have the same shape as x, it
+ * must be a 1D array whose size is the same as x's first dimension size. Each row
+ * of the output array is from x's row if the corresponding element from condition
+ * is true, and from y's row if it is false.
+ *
+ * From:/home/xlidc/mxnet/src/operator/tensor/control_flow_op.cc:21
+ * \param symbol_name name of the resulting symbol
+ * \param condition condition array
+ * \param x
+ * \param y
+ * \return new symbol
+ */
+inline Symbol where(const std::string& symbol_name,
+                    Symbol condition,
+                    Symbol x,
+                    Symbol y) {
+  return Operator("where")
+           .SetInput("condition", condition)
+           .SetInput("x", x)
+           .SetInput("y", y)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Add arguments, element-wise with broadcasting.
+ *
+ *
+ *
+ * Defined in
+ * \param symbol_name name of the resulting symbol
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_add(const std::string& symbol_name,
+                            Symbol lhs,
+                            Symbol rhs) {
+  return Operator("broadcast_add")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Subtract arguments, element-wise with broadcasting.
+ *
+ *
+ *
+ * Defined in
+ * \param symbol_name name of the resulting symbol
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_sub(const std::string& symbol_name,
+                            Symbol lhs,
+                            Symbol rhs) {
+  return Operator("broadcast_sub")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Multiply arguments, element-wise with broadcasting.
+ *
+ *
+ *
+ * Defined in
+ * \param symbol_name name of the resulting symbol
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_mul(const std::string& symbol_name,
+                            Symbol lhs,
+                            Symbol rhs) {
+  return Operator("broadcast_mul")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Divide arguments, element-wise with broadcasting.
+ *
+ *
+ *
+ * Defined in
+ * \param symbol_name name of the resulting symbol
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_div(const std::string& symbol_name,
+                            Symbol lhs,
+                            Symbol rhs) {
+  return Operator("broadcast_div")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief First array elements raised to powers from second array,
+ * element-wise with broadcasting.
+ *
+ *
+ *
+ * Defined in
+ * /home/xlidc/mxnet/src/operator/tensor/elemwise_binary_broadcast_op_extended.cc:L16
+ * \param symbol_name name of the resulting symbol
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_power(const std::string& symbol_name,
+                              Symbol lhs,
+                              Symbol rhs) {
+  return Operator("broadcast_power")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol(symbol_name);
+}
+
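+// Illustrative sketch (assumes the declarations above; names are hypothetical):
+// selects element-wise between two broadcast expressions, driven by a
+// boolean-like condition array of matching shape.
+inline Symbol ExampleWhereUsage() {
+  Symbol cond = Symbol::Variable("cond");
+  Symbol a = Symbol::Variable("a");
+  Symbol b = Symbol::Variable("b");
+  Symbol s = broadcast_add("s", a, b);   // a + b with shape broadcasting
+  Symbol d = broadcast_sub("d", a, b);   // a - b with shape broadcasting
+  return where("w", cond, s, d);         // pick s where cond is true, else d
+}
+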
+/*!
+ * \brief Element-wise maximum of array elements with broadcasting.
+ *
+ *
+ *
+ * Defined in
+ * /home/xlidc/mxnet/src/operator/tensor/elemwise_binary_broadcast_op_extended.cc:L34
+ * \param symbol_name name of the resulting symbol
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_maximum(const std::string& symbol_name,
+                                Symbol lhs,
+                                Symbol rhs) {
+  return Operator("broadcast_maximum")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Element-wise minimum of array elements with broadcasting.
+ *
+ *
+ *
+ * Defined in
+ * /home/xlidc/mxnet/src/operator/tensor/elemwise_binary_broadcast_op_extended.cc:L52
+ * \param symbol_name name of the resulting symbol
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_minimum(const std::string& symbol_name,
+                                Symbol lhs,
+                                Symbol rhs) {
+  return Operator("broadcast_minimum")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Given the "legs" of a right triangle, return its hypotenuse,
+ * with broadcasting.
+ *
+ *
+ *
+ * Defined in
+ * /home/xlidc/mxnet/src/operator/tensor/elemwise_binary_broadcast_op_extended.cc:L71
+ * \param symbol_name name of the resulting symbol
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_hypot(const std::string& symbol_name,
+                              Symbol lhs,
+                              Symbol rhs) {
+  return Operator("broadcast_hypot")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Return (lhs == rhs), element-wise with broadcasting.
+ *
+ *
+ *
+ * Defined in
+ * \param symbol_name name of the resulting symbol
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_equal(const std::string& symbol_name,
+                              Symbol lhs,
+                              Symbol rhs) {
+  return Operator("broadcast_equal")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Return (lhs != rhs), element-wise with broadcasting.
+ *
+ *
+ *
+ * Defined in
+ * \param symbol_name name of the resulting symbol
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_not_equal(const std::string& symbol_name,
+                                  Symbol lhs,
+                                  Symbol rhs) {
+  return Operator("broadcast_not_equal")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Return (lhs > rhs), element-wise with broadcasting.
+ *
+ *
+ *
+ * Defined in
+ * \param symbol_name name of the resulting symbol
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_greater(const std::string& symbol_name,
+                                Symbol lhs,
+                                Symbol rhs) {
+  return Operator("broadcast_greater")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Return (lhs >= rhs), element-wise with broadcasting.
+ *
+ *
+ *
+ * Defined in
+ * \param symbol_name name of the resulting symbol
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_greater_equal(const std::string& symbol_name,
+                                      Symbol lhs,
+                                      Symbol rhs) {
+  return Operator("broadcast_greater_equal")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Return (lhs < rhs), element-wise with broadcasting.
+ *
+ *
+ *
+ * Defined in
+ * \param symbol_name name of the resulting symbol
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_lesser(const std::string& symbol_name,
+                               Symbol lhs,
+                               Symbol rhs) {
+  return Operator("broadcast_lesser")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Return (lhs <= rhs), element-wise with broadcasting.
+ *
+ *
+ *
+ * Defined in
+ * \param symbol_name name of the resulting symbol
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_lesser_equal(const std::string& symbol_name,
+                                     Symbol lhs,
+                                     Symbol rhs) {
+  return Operator("broadcast_lesser_equal")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Add all input arguments element-wise.
+ *
+ * .. math::
+ *    add\_n(a_1, a_2, ..., a_n) = a_1 + a_2 + ... + a_n
+ *
+ * ``add_n`` is potentially more efficient than calling ``add`` `n` times.
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_sum.cc:L63
+ * \param symbol_name name of the resulting symbol
+ * \param args Positional input arguments
+ * \return new symbol
+ */
+inline Symbol add_n(const std::string& symbol_name,
+                    const std::vector<Symbol>& args) {
+  return Operator("add_n")
+           (args)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Custom operator implemented in frontend.
+ * \param symbol_name name of the resulting symbol
+ * \param op_type Type of custom operator. Must be registered first.
+ * \return new symbol
+ */
+inline Symbol Custom(const std::string& symbol_name,
+                     const std::string& op_type) {
+  return Operator("Custom")
+           .SetParam("op_type", op_type)
+           .CreateSymbol(symbol_name);
+}
+
+/*! \brief Activation function to be applied.
+ */
+enum class ActivationActType {
+  relu = 0,
+  sigmoid = 1,
+  softrelu = 2,
+  tanh = 3
+};
+
+/*!
+ * \brief Element-wise activation function.
+ * The activation operations are applied element-wise to each array element. The
+ * following types are supported:
+ *
+ * - `relu`: Rectified Linear Unit, `y = max(x, 0)`
+ * - `sigmoid`: `y = 1 / (1 + exp(-x))`
+ * - `tanh`: Hyperbolic tangent, `y = (exp(x) - exp(-x)) / (exp(x) + exp(-x))`
+ * - `softrelu`: Soft ReLU, or SoftPlus, `y = log(1 + exp(x))`
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/activation.cc:L76
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to activation function.
+ * \param act_type Activation function to be applied.
+ * \return new symbol
+ */
+inline Symbol Activation(const std::string& symbol_name,
+                         Symbol data,
+                         ActivationActType act_type) {
+  static const char *ActivationActTypeValues[] = {
+    "relu",
+    "sigmoid",
+    "softrelu",
+    "tanh"
+  };
+  return Operator("Activation")
+           .SetParam("act_type", ActivationActTypeValues[int(act_type)])
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
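+// Illustrative sketch (assumes the declarations above; names are hypothetical):
+// fuses the sum of several inputs into one add_n node, then applies a ReLU.
+// add_n takes its inputs as a std::vector<Symbol> through Operator::operator().
+inline Symbol ExampleAddNUsage() {
+  Symbol a = Symbol::Variable("a");
+  Symbol b = Symbol::Variable("b");
+  Symbol c = Symbol::Variable("c");
+  std::vector<Symbol> terms = {a, b, c};
+  Symbol total = add_n("total", terms);
+  return Activation("act", total, ActivationActType::relu);
+}
+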
+/*!
+ * \brief Apply bilinear sampling to input feature map, which is the key of "[NIPS2015]
+ * Spatial Transformer Networks".
+ * output[batch, channel, y_dst, x_dst] = G(data[batch, channel, y_src, x_src])
+ * x_dst, y_dst enumerate all spatial locations in output
+ * x_src = grid[batch, 0, y_dst, x_dst]
+ * y_src = grid[batch, 1, y_dst, x_dst]
+ * G() denotes the bilinear interpolation kernel
+ * The out-boundary points will be padded as zeros. (The boundary is defined to be
+ * The shape of output will be (data.shape[0], data.shape[1], grid.shape[2],
+ * The operator assumes that grid has been normalized. If you want to design a
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to the BilinearsamplerOp.
+ * \param grid Input grid to the BilinearsamplerOp. grid has two channels: x_src, y_src
+ * \return new symbol
+ */
+inline Symbol BilinearSampler(const std::string& symbol_name,
+                              Symbol data,
+                              Symbol grid) {
+  return Operator("BilinearSampler")
+           .SetInput("data", data)
+           .SetInput("grid", grid)
+           .CreateSymbol(symbol_name);
+}
+
+/*! \brief Whether to pick convolution algo by running performance test.
+ */
+enum class ConvolutionCudnnTune {
+  None = 0,
+  fastest = 1,
+  limited_workspace = 2,
+  off = 3
+};
+
+/*! \brief Set layout for input, output and weight. Empty for
+ * default layout: NCHW for 2d and NCDHW for 3d.
+ */
+enum class ConvolutionLayout {
+  None = 0,
+  NCDHW = 1,
+  NCHW = 2,
+  NDHWC = 3,
+  NHWC = 4
+};
+
+/*!
+ * \brief Compute *N*-D convolution on *(N+2)*-D input.
+ *
+ * In the simplest 2-D convolution, given input data with shape *(batch_size,
+ * channel, height, width)*, the output is computed by
+ *
+ * .. math::
+ *
+ *    out[n,i,:,:] = bias[i] + \sum_{j=0}^{channel} data[n,j,:,:] \star
+ *    weight[i,j,:,:]
+ *
+ * where :math:`\star` is the 2-D cross-correlation operator.
+ *
+ * For general 2-D convolution, the shapes are
+ *
+ * - **data**: *(batch_size, channel, height, width)*
+ * - **weight**: *(num_filter, channel, kernel[0], kernel[1])*
+ * - **bias**: *(num_filter,)*
+ * - **out**: *(batch_size, num_filter, out_height, out_width)*.
+ *
+ * Define::
+ *
+ *   f(x,k,p,s,d) = floor((x+2*p-d*(k-1)-1)/s)+1
+ *
+ * then we have::
+ *
+ *   out_height=f(height, kernel[0], pad[0], stride[0], dilate[0])
+ *   out_width=f(width, kernel[1], pad[1], stride[1], dilate[1])
+ *
+ * If ``no_bias`` is set to be true, then the ``bias`` term is ignored.
+ *
+ * The default data ``layout`` is *NCHW*, namely *(batch_size, channel, height,
+ * width)*. We can choose other layouts such as *NHWC*.
+ *
+ * If ``num_group`` is larger than 1, denoted by *g*, then split the input ``data``
+ * evenly into *g* parts along the channel axis, and also evenly split ``weight``
+ * along the first dimension. Next compute the convolution on the *i*-th part of
+ * the data with the *i*-th weight part. The output is obtained by concatenating all
+ * the *g* results.
+ *
+ * To perform 1-D convolution, simply use 2-D convolution but set the last axis
+ * size to be 1 for both data and weight.
+ *
+ * 3-D convolution adds an additional depth dimension besides height and
+ * width. The shapes are
+ *
+ * - **data**: *(batch_size, channel, depth, height, width)*
+ * - **weight**: *(num_filter, channel, kernel[0], kernel[1], kernel[2])*
+ * - **bias**: *(num_filter,)*
+ * - **out**: *(batch_size, num_filter, out_depth, out_height, out_width)*.
+ *
+ * Both ``weight`` and ``bias`` are learnable parameters.
+ *
+ * There are other options to tune the performance.
+ *
+ * - **cudnn_tune**: enabling this option leads to higher startup time but may give
+ *   faster speed. Options are
+ *
+ *   - **off**: no tuning
+ *   - **limited_workspace**: run test and pick the fastest algorithm that doesn't
+ *     exceed workspace limit.
+ *   - **fastest**: pick the fastest algorithm and ignore workspace limit.
+ *   - **None** (default): the behavior is determined by environment variable
+ *     ``MXNET_CUDNN_AUTOTUNE_DEFAULT``. 0 for off, 1 for limited workspace
+ *     (default), 2 for fastest.
+ *
+ * - **workspace**: A large number leads to more (GPU) memory usage but may improve
+ *   the performance.
+ *
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/convolution.cc:L143
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to the ConvolutionOp.
+ * \param weight Weight matrix.
+ * \param bias Bias parameter.
+ * \param kernel convolution kernel size: (h, w) or (d, h, w)
+ * \param num_filter convolution filter(channel) number
+ * \param stride convolution stride: (h, w) or (d, h, w)
+ * \param dilate convolution dilate: (h, w) or (d, h, w)
+ * \param pad pad for convolution: (h, w) or (d, h, w)
+ * \param num_group Number of group partitions.
+ * \param workspace Maximum temporary workspace allowed for convolution (MB).
+ * \param no_bias Whether to disable bias parameter.
+ * \param cudnn_tune Whether to pick convolution algo by running performance test.
+ * \param cudnn_off Turn off cudnn for this layer.
+ * \param layout Set layout for input, output and weight. Empty for
+ * default layout: NCHW for 2d and NCDHW for 3d.
+ * \return new symbol
+ */
+inline Symbol Convolution(const std::string& symbol_name,
+                          Symbol data,
+                          Symbol weight,
+                          Symbol bias,
+                          Shape kernel,
+                          uint32_t num_filter,
+                          Shape stride = Shape(),
+                          Shape dilate = Shape(),
+                          Shape pad = Shape(),
+                          uint32_t num_group = 1,
+                          uint64_t workspace = 1024,
+                          bool no_bias = false,
+                          ConvolutionCudnnTune cudnn_tune = ConvolutionCudnnTune::None,
+                          bool cudnn_off = false,
+                          ConvolutionLayout layout = ConvolutionLayout::None) {
+  static const char *ConvolutionCudnnTuneValues[] = {
+    "None",
+    "fastest",
+    "limited_workspace",
+    "off"
+  };
+  static const char *ConvolutionLayoutValues[] = {
+    "None",
+    "NCDHW",
+    "NCHW",
+    "NDHWC",
+    "NHWC"
+  };
+  return Operator("Convolution")
+           .SetParam("kernel", kernel)
+           .SetParam("num_filter", num_filter)
+           .SetParam("stride", stride)
+           .SetParam("dilate", dilate)
+           .SetParam("pad", pad)
+           .SetParam("num_group", num_group)
+           .SetParam("workspace", workspace)
+           .SetParam("no_bias", no_bias)
+           .SetParam("cudnn_tune", ConvolutionCudnnTuneValues[int(cudnn_tune)])
+           .SetParam("cudnn_off", cudnn_off)
+           .SetParam("layout", ConvolutionLayoutValues[int(layout)])
+           .SetInput("data", data)
+           .SetInput("weight", weight)
+           .SetInput("bias", bias)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Apply correlation to inputs
+ * \param symbol_name name of the resulting symbol
+ * \param data1 Input data1 to the correlation.
+ * \param data2 Input data2 to the correlation.
+ * \param kernel_size kernel size for Correlation must be an odd number
+ * \param max_displacement Max displacement of Correlation
+ * \param stride1 stride1 quantize data1 globally
+ * \param stride2 stride2 quantize data2 within the neighborhood centered around data1
+ * \param pad_size pad for Correlation
+ * \param is_multiply operation type is either multiplication or subtraction
+ * \return new symbol
+ */
+inline Symbol Correlation(const std::string& symbol_name,
+                          Symbol data1,
+                          Symbol data2,
+                          uint32_t kernel_size = 1,
+                          uint32_t max_displacement = 1,
+                          uint32_t stride1 = 1,
+                          uint32_t stride2 = 1,
+                          uint32_t pad_size = 0,
+                          bool is_multiply = true) {
+  return Operator("Correlation")
+           .SetParam("kernel_size", kernel_size)
+           .SetParam("max_displacement", max_displacement)
+           .SetParam("stride1", stride1)
+           .SetParam("stride2", stride2)
+           .SetParam("pad_size", pad_size)
+           .SetParam("is_multiply", is_multiply)
+           .SetInput("data1", data1)
+           .SetInput("data2", data2)
+           .CreateSymbol(symbol_name);
+}
+
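+// Illustrative sketch (assumes the declarations above; names are hypothetical):
+// a 3x3 convolution with 32 filters, stride 1 and pad 1, which keeps the
+// spatial size of an NCHW input. Weight and bias are learnable variables.
+inline Symbol ExampleConvolutionUsage() {
+  Symbol data = Symbol::Variable("data");   // (batch, channel, height, width)
+  Symbol w = Symbol::Variable("conv_w");
+  Symbol b = Symbol::Variable("conv_b");
+  return Convolution("conv", data, w, b, Shape(3, 3), 32,
+                     Shape(1, 1), Shape(), Shape(1, 1));
+}
+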
+/*!
+ * \brief Crop the 2nd and 3rd dim of input data, with the corresponding size of h_w or
+ * with width and height of the second input symbol, i.e., with one input, we need
+ * h_w to specify the crop height and width, otherwise the second input symbol's
+ * \param symbol_name name of the resulting symbol
+ * \param data Tensor or List of Tensors, the second input will be used as crop_like
+ * \param num_args Number of inputs for crop, if equals one, then we will use the h_w for
+ * crop height and width, else if equals two, then we will use the height and width
+ * \param offset crop offset coordinate: (y, x)
+ * \param h_w crop height and width: (h, w)
+ * \param center_crop If set to true, then it will use the center_crop, or it will crop
+ * \return new symbol
+ */
+inline Symbol Crop(const std::string& symbol_name,
+                   const std::vector<Symbol>& data,
+                   int num_args,
+                   Shape offset = Shape(0,0),
+                   Shape h_w = Shape(0,0),
+                   bool center_crop = false) {
+  return Operator("Crop")
+           .SetParam("num_args", num_args)
+           .SetParam("offset", offset)
+           .SetParam("h_w", h_w)
+           .SetParam("center_crop", center_crop)
+           (data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Apply deconvolution to input then add a bias.
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to the DeconvolutionOp.
+ * \param weight Weight matrix.
+ * \param bias Bias parameter.
+ * \param kernel deconvolution kernel size: (y, x)
+ * \param num_filter deconvolution filter(channel) number
+ * \param stride deconvolution stride: (y, x)
+ * \param pad pad for deconvolution: (y, x), a good number is: (kernel-1)/2, if
+ * \param adj adjustment for output shape: (y, x), if target_shape set, adj will be
+ * \param target_shape output shape with target shape: (y, x)
+ * \param num_group number of groups partition
+ * \param workspace Tmp workspace for deconvolution (MB)
+ * \param no_bias Whether to disable bias parameter.
+ * \return new symbol
+ */
+inline Symbol Deconvolution(const std::string& symbol_name,
+                            Symbol data,
+                            Symbol weight,
+                            Symbol bias,
+                            Shape kernel,
+                            uint32_t num_filter,
+                            Shape stride = Shape(1,1),
+                            Shape pad = Shape(0,0),
+                            Shape adj = Shape(0,0),
+                            Shape target_shape = Shape(0,0),
+                            uint32_t num_group = 1,
+                            uint64_t workspace = 512,
+                            bool no_bias = true) {
+  return Operator("Deconvolution")
+           .SetParam("kernel", kernel)
+           .SetParam("num_filter", num_filter)
+           .SetParam("stride", stride)
+           .SetParam("pad", pad)
+           .SetParam("adj", adj)
+           .SetParam("target_shape", target_shape)
+           .SetParam("num_group", num_group)
+           .SetParam("workspace", workspace)
+           .SetParam("no_bias", no_bias)
+           .SetInput("data", data)
+           .SetInput("weight", weight)
+           .SetInput("bias", bias)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Apply dropout to input.
+ * During training, each element of the input is randomly set to zero with
+ * probability p, and then the whole tensor is rescaled by 1/(1-p) to keep the
+ * expectation the same as before applying dropout. At test time, this behaves as
+ * an identity map.
+ *
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to dropout.
+ * \param p Fraction of the input that gets dropped out at training time
+ * \return new symbol
+ */
+inline Symbol Dropout(const std::string& symbol_name,
+                      Symbol data,
+                      mx_float p = 0.5) {
+  return Operator("Dropout")
+           .SetParam("p", p)
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Apply a linear transformation: :math:`Y = XW^T + b`.
+ *
+ * Shapes:
+ *
+ * - **data**: `(batch_size, input_dim)`
+ * - **weight**: `(num_hidden, input_dim)`
+ * - **bias**: `(num_hidden,)`
+ * - **out**: `(batch_size, num_hidden)`
+ *
+ * The learnable parameters include both ``weight`` and ``bias``.
+ *
+ * If ``no_bias`` is set to be true, then the ``bias`` term is ignored.
+ *
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/fully_connected.cc:L94
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data.
+ * \param weight Weight matrix.
+ * \param bias Bias parameter.
+ * \param num_hidden Number of hidden nodes of the output.
+ * \param no_bias Whether to disable bias parameter.
+ * \return new symbol
+ */
+inline Symbol FullyConnected(const std::string& symbol_name,
+                             Symbol data,
+                             Symbol weight,
+                             Symbol bias,
+                             int num_hidden,
+                             bool no_bias = false) {
+  return Operator("FullyConnected")
+           .SetParam("num_hidden", num_hidden)
+           .SetParam("no_bias", no_bias)
+           .SetInput("data", data)
+           .SetInput("weight", weight)
+           .SetInput("bias", bias)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief An operator taking in an n-dimensional input tensor (n > 2), and normalizing the
+ * input by subtracting the mean and variance calculated over the spatial
+ * dimensions. This is an implementation of the operator described in "Instance
+ * Normalization: The Missing Ingredient for Fast Stylization", D. Ulyanov, A.
+ * Vedaldi, V. Lempitsky, 2016 (arXiv:1607.08022v2). This layer is similar to
+ * batch normalization, with two differences: first, the normalization is carried
+ * out per example ('instance'), not over a batch. Second, the same normalization
+ * is applied both at test and train time. This operation is also known as
+ * \param symbol_name name of the resulting symbol
+ * \param data An n-dimensional tensor (n > 2) of the form [batch, channel, spatial_dim1,
+ * \param gamma A vector of length 'channel', which multiplies the normalized input.
+ * \param beta A vector of length 'channel', which is added to the product of the
+ * \param eps Epsilon to prevent division by 0.
+ * \return new symbol
+ */
+inline Symbol InstanceNorm(const std::string& symbol_name,
+                           Symbol data,
+                           Symbol gamma,
+                           Symbol beta,
+                           mx_float eps = 0.001) {
+  return Operator("InstanceNorm")
+           .SetParam("eps", eps)
+           .SetInput("data", data)
+           .SetInput("gamma", gamma)
+           .SetInput("beta", beta)
+           .CreateSymbol(symbol_name);
+}
+
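+// Illustrative sketch (assumes the declarations above; names are hypothetical):
+// one dense layer with 128 hidden units followed by a tanh non-linearity, the
+// usual building block for an MLP. Weight and bias are learnable variables.
+inline Symbol ExampleFullyConnectedUsage() {
+  Symbol x = Symbol::Variable("x");
+  Symbol w = Symbol::Variable("fc_w");
+  Symbol b = Symbol::Variable("fc_b");
+  Symbol fc = FullyConnected("fc", x, w, b, 128);
+  return Activation("fc_act", fc, ActivationActType::tanh);
+}
+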
+
+/*! \brief Normalization Mode. If set to instance, this operator will compute a norm for
+ * each instance in the batch; this is the default mode. If set to channel, this
+ * operator will compute a cross channel norm at each position of each instance.
+ */
+enum class L2NormalizationMode {
+ channel = 0,
+ instance = 1,
+ spatial = 2
+};
+
+/*!
+ * \brief Set the l2 norm of each instance to a constant.
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to the L2NormalizationOp.
+ * \param eps Epsilon to prevent div 0
+ * \param mode Normalization Mode. If set to instance, this operator will compute a norm
+ * for each instance in the batch; this is the default mode. If set to channel,
+ * this operator will compute a cross channel norm at each position of each
+ * \return new symbol
+ */
+inline Symbol L2Normalization(const std::string& symbol_name,
+ Symbol data,
+ mx_float eps = 1e-10,
+ L2NormalizationMode mode = L2NormalizationMode::instance) {
+ static const char *L2NormalizationModeValues[] = {
+ "channel",
+ "instance",
+ "spatial"
+ };
+ return Operator("L2Normalization")
+ .SetParam("eps", eps)
+ .SetParam("mode", L2NormalizationModeValues[int(mode)])
+ .SetInput("data", data)
+ .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Apply local response normalization (LRN) to the input.
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to the LRNOp.
+ * \param nsize normalization window width in elements.
+ * \param alpha value of the alpha variance scaling parameter in the normalization formula
+ * \param beta value of the beta power parameter in the normalization formula
+ * \param knorm value of the k parameter in normalization formula
+ * \return new symbol
+ */
+inline Symbol LRN(const std::string& symbol_name,
+ Symbol data,
+ uint32_t nsize,
+ mx_float alpha = 0.0001,
+ mx_float beta = 0.75,
+ mx_float knorm = 2) {
+ return Operator("LRN")
+ .SetParam("nsize", nsize)
+ .SetParam("alpha", alpha)
+ .SetParam("beta", beta)
+ .SetParam("knorm", knorm)
+ .SetInput("data", data)
+ .CreateSymbol(symbol_name);
+}
+
+/*! \brief If set to null, op will not normalize on output gradient. If set to batch, op
+ * will normalize gradient by dividing by batch size. If set to valid, op will normalize
+ */
+enum class MakeLossNormalization {
+ batch = 0,
+ null = 1,
+ valid = 2
+};
+
+/*!
+ * \brief Get output from a symbol and pass 1 gradient back. This is used as a terminal
+ * loss if unary and binary operators are used to compose a loss with no
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data.
+ * \param grad_scale gradient scale as a supplement to unary and binary operators
+ * \param valid_thresh regard element valid when x > valid_thresh, this is used only in
+ * \param normalization If set to null, op will not normalize on output gradient. If set
+ * to batch, op will normalize gradient by dividing by batch size. If set to valid, op
+ * \return new symbol
+ */
+inline Symbol MakeLoss(const std::string& symbol_name,
+ Symbol data,
+ mx_float grad_scale = 1,
+ mx_float valid_thresh = 0,
+ MakeLossNormalization normalization = MakeLossNormalization::null) {
+ static const char *MakeLossNormalizationValues[] = {
+ "batch",
+ "null",
+ "valid"
+ };
+ return Operator("MakeLoss")
+ .SetParam("grad_scale", grad_scale)
+ .SetParam("valid_thresh", valid_thresh)
+ .SetParam("normalization", MakeLossNormalizationValues[int(normalization)])
+ .SetInput("data", data)
+ .CreateSymbol(symbol_name);
+}
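+
+// Usage sketch (editorial illustration): turning a hand-built expression into
+// a trainable loss; `pred` and `label` are hypothetical symbols, and the
+// unary/binary helpers used here are defined later in this header.
+//
+//   Symbol err = elemwise_add(pred, negative(label));
+//   Symbol loss = MakeLoss("loss", square(err));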
+
+/*! \brief Pooling type to be applied.
+ */
+enum class PoolingPoolType {
+ avg = 0,
+ max = 1,
+ sum = 2
+};
+
+/*! \brief Pooling convention to be applied.
+ */
+enum class PoolingPoolingConvention {
+ full = 0,
+ valid = 1
+};
+
+/*!
+ * \brief Perform pooling on the input.
+ *
+ * The shapes for 2-D pooling are
+ *
+ * - **data**: *(batch_size, channel, height, width)*
+ * - **out**: *(batch_size, num_filter, out_height, out_width)*, with::
+ *
+ * out_height = f(height, kernel[0], pad[0], stride[0])
+ * out_width = f(width, kernel[1], pad[1], stride[1])
+ *
+ * The definition of *f* depends on ``pooling_convention``, which has two options:
+ *
+ * - **valid** (default)::
+ *
+ * f(x, k, p, s) = floor((x+2*p-k)/s)+1
+ *
+ * - **full**, which is compatible with Caffe::
+ *
+ * f(x, k, p, s) = ceil((x+2*p-k)/s)+1
+ *
+ * But if ``global_pool`` is set to be true, then a global pooling is done, namely reset
+ * ``kernel=(height, width)``.
+ *
+ * Three pooling options are supported by ``pool_type``:
+ *
+ * - **avg**: average pooling
+ * - **max**: max pooling
+ * - **sum**: sum pooling
+ *
+ * 1-D pooling is a special case of 2-D pooling with *width=1* and
+ * *kernel[1]=1*.
+ *
+ * For 3-D pooling, an additional *depth* dimension is added before
+ * *height*. Namely the input data will have shape *(batch_size, channel, depth,
+ * height, width)*.
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/pooling.cc:L122
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to the pooling operator.
+ * \param kernel pooling kernel size: (y, x) or (d, y, x)
+ * \param pool_type Pooling type to be applied.
+ * \param global_pool Ignore kernel size, do global pooling based on current input
+ * \param pooling_convention Pooling convention to be applied.
+ * \param stride stride: for pooling (y, x) or (d, y, x)
+ * \param pad pad for pooling: (y, x) or (d, y, x)
+ * \return new symbol
+ */
+inline Symbol Pooling(const std::string& symbol_name,
+ Symbol data,
+ Shape kernel,
+ PoolingPoolType pool_type,
+ bool global_pool = false,
+ PoolingPoolingConvention pooling_convention = PoolingPoolingConvention::valid,
+ Shape stride = Shape(),
+ Shape pad = Shape()) {
+ static const char *PoolingPoolTypeValues[] = {
+ "avg",
+ "max",
+ "sum"
+ };
+ static const char *PoolingPoolingConventionValues[] = {
+ "full",
+ "valid"
+ };
+ return Operator("Pooling")
+ .SetParam("kernel", kernel)
+ .SetParam("pool_type", PoolingPoolTypeValues[int(pool_type)])
+ .SetParam("global_pool", global_pool)
+ .SetParam("pooling_convention", PoolingPoolingConventionValues[int(pooling_convention)])
+ .SetParam("stride", stride)
+ .SetParam("pad", pad)
+ .SetInput("data", data)
+ .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Use linear regression for final output, this is used on final output of a net.
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to function.
+ * \param label Input label to function.
+ * \param grad_scale Scale the gradient by a float factor
+ * \return new symbol
+ */
+inline Symbol LinearRegressionOutput(const std::string& symbol_name,
+ Symbol data,
+ Symbol label,
+ mx_float grad_scale = 1) {
+ return Operator("LinearRegressionOutput")
+ .SetParam("grad_scale", grad_scale)
+ .SetInput("data", data)
+ .SetInput("label", label)
+ .CreateSymbol(symbol_name);
+}
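+
+// Usage sketch (editorial illustration, not generator output): a scalar
+// regression head; `feat` and `label` are hypothetical symbols.
+//
+//   Symbol fc = FullyConnected("fc", feat, Symbol::Variable("fc_w"),
+//                              Symbol::Variable("fc_b"), 1);
+//   Symbol out = LinearRegressionOutput("lro", fc, label);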
+
+/*!
+ * \brief Use mean absolute error regression for final output, this is used on final
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to function.
+ * \param label Input label to function.
+ * \param grad_scale Scale the gradient by a float factor
+ * \return new symbol
+ */
+inline Symbol MAERegressionOutput(const std::string& symbol_name,
+ Symbol data,
+ Symbol label,
+ mx_float grad_scale = 1) {
+ return Operator("MAERegressionOutput")
+ .SetParam("grad_scale", grad_scale)
+ .SetInput("data", data)
+ .SetInput("label", label)
+ .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Use Logistic regression for final output, this is used on final output of a net.
+ * Logistic regression is suitable for binary classification or probability
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to function.
+ * \param label Input label to function.
+ * \param grad_scale Scale the gradient by a float factor
+ * \return new symbol
+ */
+inline Symbol LogisticRegressionOutput(const std::string& symbol_name,
+ Symbol data,
+ Symbol label,
+ mx_float grad_scale = 1) {
+ return Operator("LogisticRegressionOutput")
+ .SetParam("grad_scale", grad_scale)
+ .SetInput("data", data)
+ .SetInput("label", label)
+ .CreateSymbol(symbol_name);
+}
+
+/*! \brief the type of RNN to compute
+ */
+enum class RNNMode {
+ gru = 0,
+ lstm = 1,
+ rnn_relu = 2,
+ rnn_tanh = 3
+};
+
+/*!
+ * \brief Apply a recurrent layer to input.
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to RNN
+ * \param parameters Vector of all RNN trainable parameters concatenated
+ * \param state initial hidden state of the RNN
+ * \param state_cell initial cell state for LSTM networks (only for LSTM)
+ * \param state_size size of the state for each layer
+ * \param num_layers number of stacked layers
+ * \param mode the type of RNN to compute
+ * \param bidirectional whether to use bidirectional recurrent layers
+ * \param p Dropout probability, fraction of the input that gets dropped out at training time
+ * \param state_outputs Whether to have the states as symbol outputs.
+ * \return new symbol
+ */
+inline Symbol RNN(const std::string& symbol_name,
+ Symbol data,
+ Symbol parameters,
+ Symbol state,
+ Symbol state_cell,
+ uint32_t state_size,
+ uint32_t num_layers,
+ RNNMode mode,
+ bool bidirectional = false,
+ mx_float p = 0,
+ bool state_outputs = false) {
+ static const char *RNNModeValues[] = {
+ "gru",
+ "lstm",
+ "rnn_relu",
+ "rnn_tanh"
+ };
+ return Operator("RNN")
+ .SetParam("state_size", state_size)
+ .SetParam("num_layers", num_layers)
+ .SetParam("mode", RNNModeValues[int(mode)])
+ .SetParam("bidirectional", bidirectional)
+ .SetParam("p", p)
+ .SetParam("state_outputs", state_outputs)
+ .SetInput("data", data)
+ .SetInput("parameters", parameters)
+ .SetInput("state", state)
+ .SetInput("state_cell", state_cell)
+ .CreateSymbol(symbol_name);
+}
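+
+// Usage sketch (editorial illustration): a single-layer LSTM over `seq`;
+// all variable names are hypothetical.
+//
+//   Symbol rnn = RNN("lstm", seq,
+//                    Symbol::Variable("rnn_params"),
+//                    Symbol::Variable("rnn_init_state"),
+//                    Symbol::Variable("rnn_init_cell"),
+//                    256 /*state_size*/, 1 /*num_layers*/, RNNMode::lstm);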
+
+/*!
+ * \brief Performs region-of-interest pooling on inputs. Resize bounding box coordinates
+ * by spatial_scale and crop input feature maps accordingly. The cropped feature
+ * maps are pooled by max pooling to a fixed size output indicated by pooled_size.
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to the pooling operator, a 4D Feature maps
+ * \param rois Bounding box coordinates, a 2D array of [[batch_index, x1, y1, x2, y2]].
+ * (x1, y1) and (x2, y2) are top left and down right corners of designated region
+ * of interest. batch_index indicates the index of corresponding image in the
+ * \param pooled_size fixed pooled size: (h, w)
+ * \param spatial_scale Ratio of input feature map height (or w) to raw image height (or
+ * \return new symbol
+ */
+inline Symbol ROIPooling(const std::string& symbol_name,
+ Symbol data,
+ Symbol rois,
+ Shape pooled_size,
+ mx_float spatial_scale) {
+ return Operator("ROIPooling")
+ .SetParam("pooled_size", pooled_size)
+ .SetParam("spatial_scale", spatial_scale)
+ .SetInput("data", data)
+ .SetInput("rois", rois)
+ .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Takes the last element of a sequence. Takes an n-dimensional tensor of the form
+ * [max sequence length, batchsize, other dims] and returns an (n-1)-dimensional
+ * tensor of the form [batchsize, other dims]. This operator takes an optional
+ * input tensor sequence_length of positive ints of dimension [batchsize] when the
+ * sequence_length option is set to true. This allows the operator to handle
+ * variable-length sequences. If sequence_length is false, then each example in
+ * \param symbol_name name of the resulting symbol
+ * \param data n-dimensional input tensor of the form [max sequence length, batchsize,
+ * \param sequence_length vector of sequence lengths of size batchsize
+ * \param use_sequence_length If set to true, this layer takes in extra input
+ * \return new symbol
+ */
+inline Symbol SequenceLast(const std::string& symbol_name,
+ Symbol data,
+ Symbol sequence_length,
+ bool use_sequence_length = false) {
+ return Operator("SequenceLast")
+ .SetParam("use_sequence_length", use_sequence_length)
+ .SetInput("data", data)
+ .SetInput("sequence_length", sequence_length)
+ .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Sets all elements outside the sequence to a constant value. Takes an
+ * n-dimensional tensor of the form [max sequence length, batchsize, other dims]
+ * and returns a tensor of the same shape. This operator takes an optional input
+ * tensor sequence_length of positive ints of dimension [batchsize] when the
+ * sequence_length option is set to true. This allows the operator to handle
+ * variable-length sequences. If sequence_length is false, then each example in
+ * the batch is assumed to have the max sequence length, and this operator becomes
+ * \param symbol_name name of the resulting symbol
+ * \param data n-dimensional input tensor of the form [max sequence length, batchsize,
+ * \param sequence_length vector of sequence lengths of size batchsize
+ * \param use_sequence_length If set to true, this layer takes in extra input
+ * \param value The value to be used as a mask.
+ * \return new symbol
+ */
+inline Symbol SequenceMask(const std::string& symbol_name,
+ Symbol data,
+ Symbol sequence_length,
+ bool use_sequence_length = false,
+ mx_float value = 0) {
+ return Operator("SequenceMask")
+ .SetParam("use_sequence_length", use_sequence_length)
+ .SetParam("value", value)
+ .SetInput("data", data)
+ .SetInput("sequence_length", sequence_length)
+ .CreateSymbol(symbol_name);
+}
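+
+// Usage sketch (editorial illustration): zeroing the padded tail of each
+// sequence in a [max_len, batch, dim] tensor; `seq` and `len` are hypothetical.
+//
+//   Symbol masked = SequenceMask("mask", seq, len,
+//                                true /*use_sequence_length*/, 0.0f);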
+
+/*!
+ * \brief Reverses the elements of each sequence. Takes an n-dimensional tensor of the
+ * form [max sequence length, batchsize, other dims] and returns a tensor of the
+ * same shape. This operator takes an optional input tensor sequence_length of
+ * positive ints of dimension [batchsize] when the sequence_length option is set
+ * to true. This allows the operator to handle variable-length sequences. If
+ * sequence_length is false, then each example in the batch is assumed to have the
+ * \param symbol_name name of the resulting symbol
+ * \param data n-dimensional input tensor of the form [max sequence length, batchsize,
+ * \param sequence_length vector of sequence lengths of size batchsize
+ * \param use_sequence_length If set to true, this layer takes in extra input
+ * \return new symbol
+ */
+inline Symbol SequenceReverse(const std::string& symbol_name,
+ Symbol data,
+ Symbol sequence_length,
+ bool use_sequence_length = false) {
+ return Operator("SequenceReverse")
+ .SetParam("use_sequence_length", use_sequence_length)
+ .SetInput("data", data)
+ .SetInput("sequence_length", sequence_length)
+ .CreateSymbol(symbol_name);
+}
+
+/*! \brief Softmax Mode. If set to instance, this operator will compute a softmax for each
+ * instance in the batch; this is the default mode. If set to channel, this
+ * operator will compute a num_channel-class softmax at each position of each
+ * instance; this can be used for fully convolutional network, image segmentation,
+ */
+enum class SoftmaxActivationMode {
+ channel = 0,
+ instance = 1
+};
+
+/*!
+ * \brief Apply softmax activation to input. This is intended for internal layers. For
+ * output (loss layer) please use SoftmaxOutput. If mode=instance, this operator
+ * will compute a softmax for each instance in the batch; this is the default
+ * mode. If mode=channel, this operator will compute a num_channel-class softmax
+ * at each position of each instance; this can be used for fully convolutional
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to activation function.
+ * \param mode Softmax Mode. If set to instance, this operator will compute a softmax for
+ * each instance in the batch; this is the default mode. If set to channel, this
+ * operator will compute a num_channel-class softmax at each position of each
+ * instance; this can be used for fully convolutional network, image segmentation,
+ * \return new symbol
+ */
+inline Symbol SoftmaxActivation(const std::string& symbol_name,
+ Symbol data,
+ SoftmaxActivationMode mode = SoftmaxActivationMode::instance) {
+ static const char *SoftmaxActivationModeValues[] = {
+ "channel",
+ "instance"
+ };
+ return Operator("SoftmaxActivation")
+ .SetParam("mode", SoftmaxActivationModeValues[int(mode)])
+ .SetInput("data", data)
+ .CreateSymbol(symbol_name);
+}
+
+/*! \brief Normalize the gradient
+ */
+enum class SoftmaxOutputNormalization {
+ batch = 0,
+ null = 1,
+ valid = 2
+};
+
+/*!
+ * \brief Softmax with logit loss.
+ *
+ * In the forward pass, the softmax output is returned. Assume the input data has
+ * shape *(n,k)*, then the output will have the same shape as the input, which is
+ *
+ * .. math::
+ * out[i,:] = softmax(data[i,:])
+ *
+ * for :math:`i=0,...,n-1`, where
+ *
+ * .. math::
+ * softmax(x) = \left[..., \frac{exp(x[j])}{exp(x[0])+...+exp(x[k-1])}, ...\right]
+ *
+ * For a general *N*-D input array with shape :math:`(d_1, ..., d_n)`, denote
+ * :math:`s=d_1d_2...d_n`. The way to compute softmax varies:
+ *
+ * - ``preserve_shape`` is false (default). Reshape input into a 2-D array with
+ * shape :math:`(d_1, s/d_1)` before computing the softmax, and then reshape back to the
+ * original shape.
+ *
+ * - ``preserve_shape`` is true. For all :math:`i_1, ..., i_{n-1}`, compute
+ *
+ * .. math::
+ * out[i_1, ..., i_{n-1}, :] = softmax(data[i_1, ..., i_{n-1},:])
+ *
+ * - ``multi_output`` is true. For all :math:`i_1, ..., i_{n-1}`, compute
+ *
+ * .. math::
+ * out[i_1, :, ..., i_{n-1}] = softmax(data[i_1, :, ..., i_{n-1}])
+ *
+ * In the backward pass, the logit loss, also called cross-entropy loss, is
+ * added. The provided label can be a *(N-1)*-D label index array or a *N*-D label
+ * probability array.
+ *
+ * Examples with a particular label can be ignored during backward by specifying
+ * ``ignore_label`` (also need ``use_ignore`` to be true).
+ *
+ * A scale can be applied to the gradient by ``grad_scale``, which is often used in
+ * multi-loss objective functions in which we can give each loss a different weight. It
+ * also supports various ways to normalize the gradient by ``normalization``:
+ *
+ * - **null**: do nothing
+ * - **batch**: divide by batch size (number of examples)
+ * - **valid**: divide by the number of examples which are not ignored.
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/softmax_output.cc:L77
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data.
+ * \param label Ground truth label.
+ * \param grad_scale Scale the gradient by a float factor
+ * \param ignore_label the labels with value equals to ``ignore_label`` will be ignored
+ * \param multi_output If set to true, softmax will be applied on axis 1
+ * \param use_ignore If set to true, the ignore_label value will not contribute to the
+ * \param preserve_shape If true, softmax will be applied on the last axis
+ * \param normalization Normalize the gradient
+ * \param out_grad Apply weighting from output gradient
+ * \return new symbol
+ */
+inline Symbol SoftmaxOutput(const std::string& symbol_name,
+ Symbol data,
+ Symbol label,
+ mx_float grad_scale = 1,
+ mx_float ignore_label = -1,
+ bool multi_output = false,
+ bool use_ignore = false,
+ bool preserve_shape = false,
+ SoftmaxOutputNormalization normalization = SoftmaxOutputNormalization::null,
+ bool out_grad = false) {
+ static const char *SoftmaxOutputNormalizationValues[] = {
+ "batch",
+ "null",
+ "valid"
+ };
+ return Operator("SoftmaxOutput")
+ .SetParam("grad_scale", grad_scale)
+ .SetParam("ignore_label", ignore_label)
+ .SetParam("multi_output", multi_output)
+ .SetParam("use_ignore", use_ignore)
+ .SetParam("preserve_shape", preserve_shape)
+ .SetParam("normalization", SoftmaxOutputNormalizationValues[int(normalization)])
+ .SetParam("out_grad", out_grad)
+ .SetInput("data", data)
+ .SetInput("label", label)
+ .CreateSymbol(symbol_name);
+}
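+
+// Usage sketch (editorial illustration): a 10-way classification head;
+// `feat` is a hypothetical feature symbol.
+//
+//   Symbol fc = FullyConnected("fc", feat, Symbol::Variable("fc_w"),
+//                              Symbol::Variable("fc_b"), 10);
+//   Symbol out = SoftmaxOutput("softmax", fc, Symbol::Variable("label"));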
+
+/*! \brief Normalize the gradient
+ */
+enum class SoftmaxNormalization {
+ batch = 0,
+ null = 1,
+ valid = 2
+};
+
+/*!
+ * \brief DEPRECATED: Perform a softmax transformation on input. Please use SoftmaxOutput
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to softmax.
+ * \param grad_scale Scale the gradient by a float factor
+ * \param ignore_label the labels with value equals to ``ignore_label`` will be ignored
+ * \param multi_output If set to true, softmax will be applied on axis 1
+ * \param use_ignore If set to true, the ignore_label value will not contribute to the
+ * \param preserve_shape If true, softmax will be applied on the last axis
+ * \param normalization Normalize the gradient
+ * \param out_grad Apply weighting from output gradient
+ * \return new symbol
+ */
+inline Symbol Softmax(const std::string& symbol_name,
+ Symbol data,
+ mx_float grad_scale = 1,
+ mx_float ignore_label = -1,
+ bool multi_output = false,
+ bool use_ignore = false,
+ bool preserve_shape = false,
+ SoftmaxNormalization normalization = SoftmaxNormalization::null,
+ bool out_grad = false) {
+ static const char *SoftmaxNormalizationValues[] = {
+ "batch",
+ "null",
+ "valid"
+ };
+ return Operator("Softmax")
+ .SetParam("grad_scale", grad_scale)
+ .SetParam("ignore_label", ignore_label)
+ .SetParam("multi_output", multi_output)
+ .SetParam("use_ignore", use_ignore)
+ .SetParam("preserve_shape", preserve_shape)
+ .SetParam("normalization", SoftmaxNormalizationValues[int(normalization)])
+ .SetParam("out_grad", out_grad)
+ .SetInput("data", data)
+ .CreateSymbol(symbol_name);
+}
+
+/*! \brief transformation type
+ */
+enum class SpatialTransformerTransformType {
+ affine = 0
+};
+
+/*! \brief sampling type
+ */
+enum class SpatialTransformerSamplerType {
+ bilinear = 0
+};
+
+/*!
+ * \brief Apply spatial transformer to input feature map.
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to the SpatialTransformerOp.
+ * \param loc localisation net, the output dim should be 6 when transform_type is affine.
+ * \param transform_type transformation type
+ * \param sampler_type sampling type
+ * \param target_shape output shape(h, w) of spatial transformer: (y, x)
+ * \return new symbol
+ */
+inline Symbol SpatialTransformer(const std::string& symbol_name,
+ Symbol data,
+ Symbol loc,
+ SpatialTransformerTransformType transform_type,
+ SpatialTransformerSamplerType sampler_type,
+ Shape target_shape = Shape(0,0)) {
+ static const char *SpatialTransformerTransformTypeValues[] = {
+ "affine"
+ };
+ static const char *SpatialTransformerSamplerTypeValues[] = {
+ "bilinear"
+ };
+ return Operator("SpatialTransformer")
+ .SetParam("transform_type", SpatialTransformerTransformTypeValues[int(transform_type)])
+ .SetParam("sampler_type", SpatialTransformerSamplerTypeValues[int(sampler_type)])
+ .SetParam("target_shape", target_shape)
+ .SetInput("data", data)
+ .SetInput("loc", loc)
+ .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Support Vector Machine based transformation on input, backprop L2-SVM
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to svm.
+ * \param label Label data.
+ * \param margin Scale the DType(param_.margin) for activation size
+ * \param regularization_coefficient Scale the coefficient responsible for balancing
+ * \param use_linear If set true, uses L1-SVM objective function. Default uses L2-SVM
+ * \return new symbol
+ */
+inline Symbol SVMOutput(const std::string& symbol_name,
+ Symbol data,
+ Symbol label,
+ mx_float margin = 1,
+ mx_float regularization_coefficient = 1,
+ bool use_linear = false) {
+ return Operator("SVMOutput")
+ .SetParam("margin", margin)
+ .SetParam("regularization_coefficient", regularization_coefficient)
+ .SetParam("use_linear", use_linear)
+ .SetInput("data", data)
+ .SetInput("label", label)
+ .CreateSymbol(symbol_name);
+}
+
+/*! \brief transformation type
+ * if transformation type is affine, data is affine matrix : (batch, 6)
+ * if transformation type is warp, data is optical flow : (batch, 2, h, w)
+ */
+enum class GridGeneratorTransformType {
+ affine = 0,
+ warp = 1
+};
+
+/*!
+ * \brief Generate sampling grid for bilinear sampling.
+ * \param symbol_name name of the resulting symbol
+ * \param data Input data to the GridGeneratorOp.
+ * \param transform_type transformation type
+ * if transformation type is affine, data is affine matrix : (batch, 6)
+ * if transformation type is warp, data is optical flow : (batch, 2, h, w)
+ * \param target_shape if transformation type is affine, the operator needs a target_shape
+ * if transformation type is warp, the operator will ignore target_shape
+ * \return new symbol
+ */
+inline Symbol GridGenerator(const std::string& symbol_name,
+ Symbol data,
+ GridGeneratorTransformType transform_type,
+ Shape target_shape = Shape(0,0)) {
+ static const char *GridGeneratorTransformTypeValues[] = {
+ "affine",
+ "warp"
+ };
+ return Operator("GridGenerator")
+ .SetParam("transform_type", GridGeneratorTransformTypeValues[int(transform_type)])
+ .SetParam("target_shape", target_shape)
+ .SetInput("data", data)
+ .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Choose one element from each line(row for python, column for R/Julia) in lhs
+ * according to index indicated by rhs. This function assumes rhs uses 0-based
+ * \param symbol_name name of the resulting symbol
+ * \param lhs Left operand to the function.
+ * \param rhs Right operand to the function.
+ * \return new symbol
+ */
+inline Symbol choose_element_0index(const std::string& symbol_name,
+ Symbol lhs,
+ Symbol rhs) {
+ return Operator("choose_element_0index")
+ .SetInput("lhs", lhs)
+ .SetInput("rhs", rhs)
+ .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Fill one element of each line(row for python, column for R/Julia) in lhs
+ * according to index indicated by rhs and values indicated by mhs. This function
+ * \param symbol_name name of the resulting symbol
+ * \param lhs Left operand to the function.
+ * \param mhs Middle operand to the function.
+ * \param rhs Right operand to the function.
+ * \return new symbol
+ */
+inline Symbol fill_element_0index(const std::string& symbol_name,
+ Symbol lhs,
+ Symbol mhs,
+ Symbol rhs) {
+ return Operator("fill_element_0index")
+ .SetInput("lhs", lhs)
+ .SetInput("mhs", mhs)
+ .SetInput("rhs", rhs)
+ .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Batch normalization.
+ *
+ * Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as
+ * well as offset ``beta``.
+ *
+ * Assume the input has more than one dimension and we normalize along axis 1.
+ * We first compute the mean and variance along this axis:
+ *
+ * .. math::
+ *
+ * data\_mean[i] = mean(data[:,i,:,...]) \\
+ * data\_var[i] = var(data[:,i,:,...])
+ *
+ * Then compute the normalized output, which has the same shape as input, as
+ *
+ * .. math::
+ *
+ * out[:,i,:,...] = \frac{data[:,i,:,...] - data\_mean[i]}{\sqrt{data\_var[i]+\epsilon}} * gamma[i] + beta[i]
+ *
+ * Both *mean* and *var* return a scalar by treating the input as a vector.
+ *
+ * Assume the input has size *k* on axis 1, then both ``gamma`` and ``beta``
+ * have shape *(k,)*. If ``output_mean_var`` is set to be true, then outputs both
+ * ``data_mean`` and ``data_var`` as well, which are needed for the backward pass.
+ *
+ * Besides the inputs and the outputs, this operator accepts two auxiliary
+ * states, ``moving_mean`` and ``moving_var``, which are *k*-length
+ * vectors. They are global statistics for the whole dataset, which are updated
+ * by::
+ *
+ * moving_mean = moving_mean * momentum + data_mean * (1 - momentum)
+ * moving_var = moving_var * momentum + data_var * (1 - momentum)
+ *
+ * If ``use_global_stats`` is set to be true, then ``moving_mean`` and
+ * ``moving_var`` are used instead of ``data_mean`` and ``data_var`` to compute
+ * the output. It is often used during inference.
+ *
+ * Both ``gamma`` and ``beta`` are learnable parameters. But if ``fix_gamma`` is true,
+ * then set ``gamma`` to 1 and its gradient to 0.
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/batch_norm.cc:L84
+ * \param data Input data to batch normalization
+ * \param gamma gamma array
+ * \param beta beta array
+ * \param eps Epsilon to prevent div 0
+ * \param momentum Momentum for moving average
+ * \param fix_gamma Fix gamma while training
+ * \param use_global_stats Whether use global moving statistics instead of local
+ * \param output_mean_var Output the computed mean and var as extra outputs
+ * \return new symbol
+ */
+inline Symbol BatchNorm(Symbol data,
+ Symbol gamma,
+ Symbol beta,
+ mx_float eps = 0.001,
+ mx_float momentum = 0.9,
+ bool fix_gamma = true,
+ bool use_global_stats = false,
+ bool output_mean_var = false) {
+ return Operator("BatchNorm")
+ .SetParam("eps", eps)
+ .SetParam("momentum", momentum)
+ .SetParam("fix_gamma", fix_gamma)
+ .SetParam("use_global_stats", use_global_stats)
+ .SetParam("output_mean_var", output_mean_var)
+ .SetInput("data", data)
+ .SetInput("gamma", gamma)
+ .SetInput("beta", beta)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Leaky ReLU activation
+ *
+ * The following types are supported:
+ *
+ * - *elu*: ``y = x > 0 ? x : slope * (exp(x)-1)``
+ * - *leaky*: ``y = x > 0 ? x : slope * x``
+ * - *prelu*: same as *leaky* but the ``slope`` is learnable.
+ * - *rrelu*: same as *leaky* but the ``slope`` is uniformly randomly chosen from
+ * *[lower_bound, upper_bound)* for training, while fixed to be
+ * *(lower_bound+upper_bound)/2* for inference.
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/leaky_relu.cc:L36
+ * \param data Input data to activation function.
+ * \param act_type Activation function to be applied.
+ * \param slope Init slope for the activation. (For leaky and elu only)
+ * \param lower_bound Lower bound of random slope. (For rrelu only)
+ * \param upper_bound Upper bound of random slope. (For rrelu only)
+ * \return new symbol
+ */
+inline Symbol LeakyReLU(Symbol data,
+ LeakyReLUActType act_type = LeakyReLUActType::leaky,
+ mx_float slope = 0.25,
+ mx_float lower_bound = 0.125,
+ mx_float upper_bound = 0.334) {
+ static const char *LeakyReLUActTypeValues[] = {
+ "elu",
+ "leaky",
+ "prelu",
+ "rrelu"
+ };
+ return Operator("LeakyReLU")
+ .SetParam("act_type", LeakyReLUActTypeValues[int(act_type)])
+ .SetParam("slope", slope)
+ .SetParam("lower_bound", lower_bound)
+ .SetParam("upper_bound", upper_bound)
+ .SetInput("data", data)
+ .CreateSymbol();
+}
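+
+// Usage sketch (editorial illustration): the unnamed variants above compose
+// the same way as the named ones; `conv_out` is a hypothetical symbol.
+//
+//   Symbol bn = BatchNorm(conv_out,
+//                         Symbol::Variable("bn_gamma"),
+//                         Symbol::Variable("bn_beta"));
+//   Symbol act = LeakyReLU(bn, LeakyReLUActType::leaky, 0.25);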
+
+/*!
+ * \brief Concatenate a list of arrays along a given axis.
+ *
+ * The dimension sizes of the input arrays on the given axis should be the same.
+ *
+ * For example::
+ *
+ * x = [[1,1],[1,1]]
+ * y = [[2,2],[2,2]]
+ * z = [[3,3],[3,3],[3,3]]
+ *
+ * Concat(x,y,z,dim=0) = [[ 1., 1.],
+ * [ 1., 1.],
+ * [ 2., 2.],
+ * [ 2., 2.],
+ * [ 3., 3.],
+ * [ 3., 3.],
+ * [ 3., 3.]]
+ *
+ * Concat(x,y,dim=1) = [[ 1., 1., 2., 2.],
+ * [ 1., 1., 2., 2.]]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/concat.cc:L69
+ * \param data List of tensors to concatenate
+ * \param num_args Number of inputs to be concatenated.
+ * \param dim the dimension to be concatenated along.
+ * \return new symbol
+ */
+inline Symbol Concat(const std::vector<Symbol>& data,
+ int num_args,
+ int dim = 1) {
+ return Operator("Concat")
+ .SetParam("num_args", num_args)
+ .SetParam("dim", dim)
+(data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Apply a sparse regularization to the output of a sigmoid activation function.
+ * \param data Input data.
+ * \param sparseness_target The sparseness target
+ * \param penalty The tradeoff parameter for the sparseness penalty
+ * \param momentum The momentum for running average
+ * \return new symbol
+ */
+inline Symbol IdentityAttachKLSparseReg(Symbol data,
+ mx_float sparseness_target = 0.1,
+ mx_float penalty = 0.001,
+ mx_float momentum = 0.9) {
+ return Operator("IdentityAttachKLSparseReg")
+ .SetParam("sparseness_target", sparseness_target)
+ .SetParam("penalty", penalty)
+ .SetParam("momentum", momentum)
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Calculate cross_entropy(data, one_hot(label))
+ *
+ * From:/home/xlidc/mxnet/src/operator/loss_binary_op.cc:12
+ * \param data Input data
+ * \param label Input label
+ * \return new symbol
+ */
+inline Symbol softmax_cross_entropy(Symbol data,
+ Symbol label) {
+ return Operator("softmax_cross_entropy")
+ .SetInput("data", data)
+ .SetInput("label", label)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Pad an array.
+ *
+ * Only supports 4-D and 5-D input arrays.
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/pad.cc:L407
+ * \param data An n-dimensional input tensor.
+ * \param mode Padding type to use. "constant" pads all values with a constant value, the
+ * value of which can be specified with the constant_value option. "edge" uses the
+ * \param pad_width A tuple of padding widths of length 2*r, where r is the rank of the
+ * input tensor, specifying number of values padded to the edges of each axis.
+ * (before_1, after_1, ... , before_N, after_N) unique pad widths for each axis.
+ * \param constant_value This option is only used when mode is "constant". This value
+ * \return new symbol
+ */
+inline Symbol Pad(Symbol data,
+ PadMode mode,
+ Shape pad_width,
+ double constant_value = 0) {
+ static const char *PadModeValues[] = {
+ "constant",
+ "edge"
+ };
+ return Operator("Pad")
+ .SetParam("mode", PadModeValues[int(mode)])
+ .SetParam("pad_width", pad_width)
+ .SetParam("constant_value", constant_value)
+ .SetInput("data", data)
+ .CreateSymbol();
+}
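+
+// Usage sketch (editorial illustration): the std::vector<Symbol> overloads,
+// such as Concat above, receive the symbol list through Operator's
+// operator(); `branch_a` and `branch_b` are hypothetical symbols.
+//
+//   std::vector<Symbol> branches = {branch_a, branch_b};
+//   Symbol merged = Concat(branches, 2 /*num_args*/, 1 /*dim*/);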
+
+/*!
+ * \brief Updater function for sgd optimizer
+ * \param lr learning_rate
+ * \param wd weight decay
+ * \param rescale_grad rescale gradient as grad = rescale_grad*grad.
+ * \param clip_gradient If greater than 0, clip gradient to grad = max(min(grad,
+ * \return new symbol
+ */
+inline Symbol sgd_update(mx_float lr,
+ mx_float wd = 0,
+ mx_float rescale_grad = 1,
+ mx_float clip_gradient = -1) {
+ return Operator("sgd_update")
+ .SetParam("lr", lr)
+ .SetParam("wd", wd)
+ .SetParam("rescale_grad", rescale_grad)
+ .SetParam("clip_gradient", clip_gradient)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Updater function for sgd optimizer with momentum
+ * \param lr learning_rate
+ * \param momentum momentum
+ * \param wd weight decay
+ * \param rescale_grad rescale gradient as grad = rescale_grad*grad.
+ * \param clip_gradient If greater than 0, clip gradient to grad = max(min(grad,
+ * \return new symbol
+ */
+inline Symbol sgd_mom_update(mx_float lr,
+ mx_float momentum = 0,
+ mx_float wd = 0,
+ mx_float rescale_grad = 1,
+ mx_float clip_gradient = -1) {
+ return Operator("sgd_mom_update")
+ .SetParam("lr", lr)
+ .SetParam("momentum", momentum)
+ .SetParam("wd", wd)
+ .SetParam("rescale_grad", rescale_grad)
+ .SetParam("clip_gradient", clip_gradient)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Updater function for adam optimizer
+ * \param lr learning_rate
+ * \param beta1 beta1
+ * \param beta2 beta2
+ * \param epsilon epsilon
+ * \param wd weight decay
+ * \param rescale_grad rescale gradient as grad = rescale_grad*grad.
+ * \param clip_gradient If greater than 0, clip gradient to grad = max(min(grad,
+ * \return new symbol
+ */
+inline Symbol adam_update(mx_float lr,
+ mx_float beta1 = 0.9,
+ mx_float beta2 = 0.999,
+ mx_float epsilon = 1e-08,
+ mx_float wd = 0,
+ mx_float rescale_grad = 1,
+ mx_float clip_gradient = -1) {
+ return Operator("adam_update")
+ .SetParam("lr", lr)
+ .SetParam("beta1", beta1)
+ .SetParam("beta2", beta2)
+ .SetParam("epsilon", epsilon)
+ .SetParam("wd", wd)
+ .SetParam("rescale_grad", rescale_grad)
+ .SetParam("clip_gradient", clip_gradient)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Updater function for RMSProp optimizer. The RMSProp code follows the version in
+ * http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
+ * \param lr learning_rate
+ * \param gamma1 gamma1
+ * \param epsilon epsilon
+ * \param wd weight decay
+ * \param rescale_grad rescale gradient as grad = rescale_grad*grad.
+ * \param clip_gradient If greater than 0, clip gradient to grad = max(min(grad,
+ * \param clip_weights If greater than 0, clip weights to weights = max(min(weights,
+ * \return new symbol
+ */
+inline Symbol rmsprop_update(mx_float lr,
+ mx_float gamma1 = 0.95,
+ mx_float epsilon = 1e-08,
+ mx_float wd = 0,
+ mx_float rescale_grad = 1,
+ mx_float clip_gradient = -1,
+ mx_float clip_weights = -1) {
+ return Operator("rmsprop_update")
+ .SetParam("lr", lr)
+ .SetParam("gamma1", gamma1)
+ .SetParam("epsilon", epsilon)
+ .SetParam("wd", wd)
+ .SetParam("rescale_grad", rescale_grad)
+ .SetParam("clip_gradient", clip_gradient)
+ .SetParam("clip_weights", clip_weights)
+ .CreateSymbol();
+}
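+
+// Usage sketch (editorial illustration): in this revision the updater
+// wrappers above only carry hyper-parameters (they take no weight/grad
+// inputs yet).
+//
+//   Symbol upd = adam_update(0.001 /*lr*/, 0.9, 0.999);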
+
+/*!
+ * \brief Updater function for RMSPropAlex optimizer. The RMSPropAlex code follows the
+ * version in http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves,
+ * \param lr learning_rate
+ * \param gamma1 gamma1
+ * \param gamma2 gamma2
+ * \param epsilon epsilon
+ * \param wd weight decay
+ * \param rescale_grad rescale gradient as grad = rescale_grad*grad.
+ * \param clip_gradient If greater than 0, clip gradient to grad = max(min(grad,
+ * \param clip_weights If greater than 0, clip weights to weights = max(min(weights,
+ * \return new symbol
+ */
+inline Symbol rmspropalex_update(mx_float lr,
+ mx_float gamma1 = 0.95,
+ mx_float gamma2 = 0.9,
+ mx_float epsilon = 1e-08,
+ mx_float wd = 0,
+ mx_float rescale_grad = 1,
+ mx_float clip_gradient = -1,
+ mx_float clip_weights = -1) {
+ return Operator("rmspropalex_update")
+ .SetParam("lr", lr)
+ .SetParam("gamma1", gamma1)
+ .SetParam("gamma2", gamma2)
+ .SetParam("epsilon", epsilon)
+ .SetParam("wd", wd)
+ .SetParam("rescale_grad", rescale_grad)
+ .SetParam("clip_gradient", clip_gradient)
+ .SetParam("clip_weights", clip_weights)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Interchange two axes of an array.
+ *
+ * Examples::
+ *
+ * x = [[1, 2, 3]]
+ * swapaxes(x, 0, 1) = [[ 1],
+ * [ 2],
+ * [ 3]]
+ *
+ * x = [[[ 0, 1],
+ * [ 2, 3]],
+ * [[ 4, 5],
+ * [ 6, 7]]] // (2,2,2) array
+ *
+ * swapaxes(x, 0, 2) = [[[ 0, 4],
+ * [ 2, 6]],
+ * [[ 1, 5],
+ * [ 3, 7]]]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/swapaxis.cc:L55
+ * \param data Input array.
+ * \param dim1 the first axis to be swapped.
+ * \param dim2 the second axis to be swapped.
+ * \return new symbol
+ */
+inline Symbol SwapAxis(Symbol data,
+ uint32_t dim1 = 0,
+ uint32_t dim2 = 0) {
+ return Operator("SwapAxis")
+ .SetParam("dim1", dim1)
+ .SetParam("dim2", dim2)
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Split an array along a particular axis into multiple sub-arrays.
+ *
+ * Assume the input array has shape ``(d_0, ..., d_n)`` and we slice it into *m*
+ * (``num_outputs=m``) subarrays along axis *k*, then we will obtain a list of *m*
+ * arrays, each of which has shape ``(d_0, ..., d_k/m, ..., d_n)``.
+ *
+ * For example::
+ *
+ * x = [[1, 2],
+ * [3, 4],
+ * [5, 6],
+ * [7, 8]] // 4x2 array
+ *
+ * y = split(x, axis=0, num_outputs=4) // a list of 4 arrays
+ * y[0] = [[ 1., 2.]] // 1x2 array
+ *
+ * z = split(x, axis=0, num_outputs=2) // a list of 2 arrays
+ * z[0] = [[ 1., 2.],
+ * [ 3., 4.]]
+ *
+ * When setting optional argument ``squeeze_axis=1``, then the *k*-dimension will
+ * be removed from the shape if it becomes 1::
+ *
+ * y = split(x, axis=0, num_outputs=4, squeeze_axis=1)
+ * y[0] = [ 1., 2.] // (2,) vector
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/slice_channel.cc:L50
+ * \param num_outputs Number of outputs to be sliced.
+ * \param axis Dimension along which to slice.
+ * \param squeeze_axis If true, the dimension will be squeezed. Also, input.shape[axis]
+ * \return new symbol
+ */
+inline Symbol SliceChannel(int num_outputs,
+ int axis = 1,
+ bool squeeze_axis = false) {
+ return Operator("SliceChannel")
+ .SetParam("num_outputs", num_outputs)
+ .SetParam("axis", axis)
+ .SetParam("squeeze_axis", squeeze_axis)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Perform nearest neighbor/bilinear up sampling to inputs
+ * \param data Array of tensors to upsample
+ * \param scale Up sampling scale
+ * \param sample_type upsampling method
+ * \param num_args Number of inputs to be upsampled. For nearest neighbor upsampling,
+ * this can be 1-N; the size of output will be (scale*h_0,scale*w_0) and all other
+ * inputs will be upsampled to the same size. For bilinear upsampling this must be
+ * \param num_filter Input filter. Only used by bilinear sample_type.
+ * \param multi_input_mode How to handle multiple input. concat means concatenate
+ * upsampled images along the channel dimension. sum means add all images
+ * \param workspace Tmp workspace for deconvolution (MB)
+ * \return new symbol
+ */
+inline Symbol UpSampling(const std::vector<Symbol>& data,
+ uint32_t scale,
+ UpSamplingSampleType sample_type,
+ int num_args,
+ uint32_t num_filter = 0,
+ UpSamplingMultiInputMode multi_input_mode = UpSamplingMultiInputMode::concat,
+ uint64_t workspace = 512) {
+ static const char *UpSamplingSampleTypeValues[] = {
+ "bilinear",
+ "nearest"
+ };
+ static const char *UpSamplingMultiInputModeValues[] = {
+ "concat",
+ "sum"
+ };
+ return Operator("UpSampling")
+ .SetParam("scale", scale)
+ .SetParam("sample_type", UpSamplingSampleTypeValues[int(sample_type)])
+ .SetParam("num_args", num_args)
+ .SetParam("num_filter", num_filter)
+ .SetParam("multi_input_mode", UpSamplingMultiInputModeValues[int(multi_input_mode)])
+ .SetParam("workspace", workspace)
+(data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Add lhs and rhs element-wise.
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol elemwise_add(Symbol lhs,
+ Symbol rhs) {
+ return Operator("elemwise_add")
+ .SetInput("lhs", lhs)
+ .SetInput("rhs", rhs)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Calculate Smooth L1 Loss(lhs, scalar)
+ *
+ * From:/home/xlidc/mxnet/src/operator/tensor/elemwise_binary_scalar_op_extended.cc:63
+ * \param data source input
+ * \param scalar scalar input
+ * \return new symbol
+ */
+inline Symbol smooth_l1(Symbol data,
+ mx_float scalar) {
+ return Operator("smooth_l1")
+ .SetParam("scalar", scalar)
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Return the top *k* elements in an array.
+ *
+ * Examples::
+ *
+ * x = [[ 0.3, 0.2, 0.4],
+ * [ 0.1, 0.3, 0.2]]
+ *
+ * // return the index of the largest element on last axis
+ * topk(x) = [[ 2.],
+ * [ 1.]]
+ *
+ * // return the value of the top-2 elements on last axis
+ * topk(x, ret_typ='value', k=2) = [[ 0.4, 0.3],
+ * [ 0.3, 0.2]]
+ *
+ * // flatten and then return both index and value
+ * topk(x, ret_typ='both', k=2, axis=None) = [ 0.4, 0.3], [ 2., 0.]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/ordering_op.cc:L36
+ * \param src Source input
+ * \param axis Axis along which to choose the top k indices. If not given, the flattened
+ * \param k Number of top elements to select, should be always smaller than or equal to
+ * \param ret_typ The return type. "value" means returning the top k values, "indices"
+ * means returning the indices of the top k values, "mask" means to return a mask
+ * array containing 0 and 1. 1 means the top k values. "both" means to return both
+ * \param is_ascend Whether to choose k largest or k smallest. Top K largest elements
+ * \return new symbol
+ */
+inline Symbol topk(Symbol src,
+ dmlc::optional<int> axis = dmlc::optional<int>(-1),
+ int k = 1,
+ TopkRetTyp ret_typ = TopkRetTyp::indices,
+ bool is_ascend = false) {
+ static const char *TopkRetTypValues[] = {
+ "both",
+ "indices",
+ "mask",
+ "value"
+ };
+ return Operator("topk")
+ .SetParam("axis", axis)
+ .SetParam("k", k)
+ .SetParam("ret_typ", TopkRetTypValues[int(ret_typ)])
+ .SetParam("is_ascend", is_ascend)
+ .SetInput("src", src)
+ .CreateSymbol();
+}
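+
+// Usage sketch (editorial illustration): indices of the two largest entries
+// along the last axis; `scores` is a hypothetical symbol.
+//
+//   Symbol top2 = topk(scores, dmlc::optional<int>(-1), 2,
+//                      TopkRetTyp::indices);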
+
+/*!
+ * \brief Return a sorted copy of an array.
+ *
+ * Examples::
+ *
+ * x = [[ 1, 4],
+ * [ 3, 1]]
+ *
+ * // sort along the last axis
+ * sort(x) = [[ 1., 4.],
+ * [ 1., 3.]]
+ *
+ * // flatten and then sort
+ * sort(x, axis=None) = [ 1., 1., 3., 4.]
+ *
+ * // sort along the first axis
+ * sort(x, axis=0) = [[ 1., 1.],
+ * [ 3., 4.]]
+ *
+ * // in descending order
+ * sort(x, is_ascend=0) = [[ 4., 1.],
+ * [ 3., 1.]]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/ordering_op.cc:L99
+ * \param src Source input
+ * \param axis Axis along which to choose sort the input tensor. If not given, the
+ * \param is_ascend Whether sort in ascending or descending order.
+ * \return new symbol
+ */
+inline Symbol sort(Symbol src,
+ dmlc::optional<int> axis = dmlc::optional<int>(-1),
+ bool is_ascend = true) {
+ return Operator("sort")
+ .SetParam("axis", axis)
+ .SetParam("is_ascend", is_ascend)
+ .SetInput("src", src)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Returns the indices that can sort an array.
+ *
+ * Examples::
+ *
+ * x = [[ 0.3, 0.2, 0.4],
+ * [ 0.1, 0.3, 0.2]]
+ *
+ * // sort along axis -1
+ * argsort(x) = [[ 1., 0., 2.],
+ * [ 0., 2., 1.]]
+ *
+ * // sort along axis 0
+ * argsort(x, axis=0) = [[ 1., 0., 1.]
+ * [ 0., 1., 0.]]
+ *
+ * // flatten and then sort
+ * argsort(x, axis=None) = [ 3., 1., 5., 0., 4., 2.]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/ordering_op.cc:L146
+ * \param src Source input
+ * \param axis Axis along which to sort the input tensor. If not given, the flattened
+ * \param is_ascend Whether sort in ascending or descending order.
+ * \return new symbol
+ */
+inline Symbol argsort(Symbol src,
+ dmlc::optional<int> axis = dmlc::optional<int>(-1),
+ bool is_ascend = true) {
+ return Operator("argsort")
+ .SetParam("axis", axis)
+ .SetParam("is_ascend", is_ascend)
+ .SetInput("src", src)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Get output from a symbol and pass 0 gradient back
+ *
+ * From:/home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:31
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol BlockGrad(Symbol data) {
+ return Operator("BlockGrad")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Cast to a specified type, element-wise.
+ *
+ * For example::
+ *
+ * cast([1e20, 11.1], dtype='float16') = [inf, 11.09375]
+ * cast([300, 11.1, 10.9, -1, -3], dtype='uint8') = [44, 11, 10, 255, 253]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L65
+ * \param data Source input
+ * \param dtype Output data type.
+ * \return new symbol
+ */
+inline Symbol Cast(Symbol data,
+ CastDtype dtype) {
+ static const char *CastDtypeValues[] = {
+ "float16",
+ "float32",
+ "float64",
+ "int32",
+ "uint8"
+ };
+ return Operator("Cast")
+ .SetParam("dtype", CastDtypeValues[int(dtype)])
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Negate src
+ *
+ * From:/home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:84
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol negative(Symbol data) {
+ return Operator("negative")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Returns the absolute value of array elements, element-wise.
+ *
+ * For example::
+ * abs([-2, 0, 3]) = [2, 0, 3]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L95
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol abs(Symbol data) {
+ return Operator("abs")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Returns the indication sign of array elements, element-wise.
+ *
+ * For example::
+ * sign([-2, 0, 3]) = [-1, 0, 1]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L109
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol sign(Symbol data) {
+ return Operator("sign")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Round elements of the array to the nearest integer, element-wise.
+ *
+ * For example::
+ * round([-2.1, -1.9, 1.5, 1.9, 2.1]) = [-2., -2., 2., 2., 2.]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L122
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol round(Symbol data) {
+ return Operator("round")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Return the ceiling of the input, element-wise.
+ *
+ * For example::
+ * ceil([-2.1, -1.9, 1.5, 1.9, 2.1]) = [-2., -1., 2., 2., 3.]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L132
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol ceil(Symbol data) {
+ return Operator("ceil")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Return the floor of the input, element-wise.
+ *
+ * For example::
+ * floor([-2.1, -1.9, 1.5, 1.9, 2.1]) = [-3., -2., 1., 1., 2.]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L141
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol floor(Symbol data) {
+ return Operator("floor")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Round elements of the array to the nearest integer, element-wise.
+ *
+ * For example::
+ * rint([-2.1, -1.9, 1.5, 1.9, 2.1]) = [-2., -2., 1., 2., 2.]
+ *
+ * The difference from ``round`` is that ``rint`` returns ``n`` for input ``n.5``
+ * while ``round`` returns ``n+1`` for ``n>=0``.
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L154
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol rint(Symbol data) {
+ return Operator("rint")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Round elements of the array to the nearest integer towards
+ * zero, element-wise.
+ *
+ * For example::
+ * fix([-2.1, -1.9, 1.9, 2.1]) = [-2., -1., 1., 2.]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L164
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol fix(Symbol data) {
+ return Operator("fix")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Calculate the square of an array, element-wise.
+ *
+ * For example::
+ * square(x) = x^2
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L174
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol square(Symbol data) {
+ return Operator("square")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Calculate the square-root of an array, element-wise.
+ *
+ * For example::
+ * sqrt(x) = \sqrt{x}
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L187
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol sqrt(Symbol data) {
+ return Operator("sqrt")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Calculate the inverse square-root of an array, element-wise.
+ *
+ * For example::
+ * rsqrt(x) = 1/\sqrt{x}
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L200
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol rsqrt(Symbol data) {
+ return Operator("rsqrt")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Calculate the exponential of the array, element-wise
+ *
+ * For example::
+ * exp(x) = e^x \approx 2.718^x
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L215
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol exp(Symbol data) {
+ return Operator("exp")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Natural logarithm, element-wise.
+ *
+ * The natural logarithm is logarithm in base *e*, so that ``log(exp(x)) = x``
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L225
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol log(Symbol data) {
+ return Operator("log")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Calculate the base 10 logarithm of the array, element-wise.
+ *
+ * ``10**log10(x) = x``
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L235
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol log10(Symbol data) {
+ return Operator("log10")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Calculate the base 2 logarithm of the array, element-wise.
+ *
+ * ``2**log2(x) = x``
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L245
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol log2(Symbol data) {
+ return Operator("log2")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Trigonometric sine, element-wise.
+ *
+ * The input is in radians (:math:`2\pi` rad equals 360 degrees).
+ *
+ * .. math::
+ * sin([0, \pi/4, \pi/2]) = [0, 0.707, 1]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L261
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol sin(Symbol data) {
+ return Operator("sin")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Calculate ``log(1 + x)``
+ *
+ * This function is more accurate than ``log(1 + x)`` for small ``x`` so that
+ * :math:`1+x\approx 1`
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L275
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol log1p(Symbol data) {
+ return Operator("log1p")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Calculate ``exp(x) - 1``
+ *
+ * This function provides greater precision than ``exp(x) - 1`` for small values of ``x``.
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L288
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol expm1(Symbol data) {
+ return Operator("expm1")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Cosine, element-wise.
+ *
+ * The input is in radians (:math:`2\pi` rad equals 360 degrees).
+ *
+ * .. math::
+ * cos([0, \pi/4, \pi/2]) = [1, 0.707, 0]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L304
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol cos(Symbol data) {
+ return Operator("cos")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Tangent, element-wise.
+ *
+ * The input is in radians (:math:`2\pi` rad equals 360 degrees).
+ *
+ * .. math::
+ * tan([0, \pi/4, \pi/2]) = [0, 1, -inf]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L320
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol tan(Symbol data) {
+ return Operator("tan")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Inverse sine, element-wise.
+ *
+ * The input should be in range :math:`[-1, 1]`.
+ * The output is in the closed interval :math:`[-\pi/2, \pi/2]`
+ *
+ * .. math::
+ * arcsin([-1, -.707, 0, .707, 1]) = [-\pi/2, -\pi/4, 0, \pi/4, \pi/2]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L337
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol arcsin(Symbol data) {
+ return Operator("arcsin")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Inverse cosine, element-wise.
+ *
+ * The input should be in range :math:`[-1, 1]`.
+ * The output is in the closed interval :math:`[0, \pi]`
+ *
+ * .. math::
+ * arccos([-1, -.707, 0, .707, 1]) = [\pi, 3\pi/4, \pi/2, \pi/4, 0]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L354
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol arccos(Symbol data) {
+ return Operator("arccos")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Inverse tangent, element-wise.
+ *
+ * The output is in the closed interval :math:`[-\pi/2, \pi/2]`
+ *
+ * .. math::
+ * arctan([-1, 0, 1]) = [-\pi/4, 0, \pi/4]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L370
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol arctan(Symbol data) {
+ return Operator("arctan")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Convert angles from radians to degrees.
+ *
+ * .. math::
+ * degrees([0, \pi/2, \pi, 3\pi/2, 2\pi]) = [0, 90, 180, 270, 360]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L384
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol degrees(Symbol data) {
+ return Operator("degrees")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Convert angles from degrees to radians.
+ *
+ * .. math::
+ * radians([0, 90, 180, 270, 360]) = [0, \pi/2, \pi, 3\pi/2, 2\pi]
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L398
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol radians(Symbol data) {
+ return Operator("radians")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Hyperbolic sine, element-wise.
+ *
+ * For example::
+ * sinh(x) = 0.5\times(exp(x) - exp(-x))
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L412
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol sinh(Symbol data) {
+ return Operator("sinh")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Hyperbolic cosine, element-wise.
+ *
+ * For example::
+ * cosh(x) = 0.5\times(exp(x) + exp(-x))
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L426
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol cosh(Symbol data) {
+ return Operator("cosh")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Hyperbolic tangent, element-wise.
+ *
+ * For example::
+ * tanh(x) = sinh(x) / cosh(x)
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L440
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol tanh(Symbol data) {
+ return Operator("tanh")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Inverse hyperbolic sine, element-wise.
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L450
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol arcsinh(Symbol data) {
+ return Operator("arcsinh")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Inverse hyperbolic cosine, element-wise.
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L460
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol arccosh(Symbol data) {
+ return Operator("arccosh")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Inverse hyperbolic tangent, element-wise.
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:L470
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol arctanh(Symbol data) {
+ return Operator("arctanh")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief The gamma function (extension of the factorial function), element-wise
+ *
+ * From:/home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:479
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol gamma(Symbol data) {
+ return Operator("gamma")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Log of the absolute value of the gamma function, element-wise
+ *
+ * From:/home/xlidc/mxnet/src/operator/tensor/elemwise_unary_op.cc:488
+ * \param data The input
+ * \return new symbol
+ */
+inline Symbol gammaln(Symbol data) {
+ return Operator("gammaln")
+ .SetInput("data", data)
+ .CreateSymbol();
+}
+
+/*!
+ * \brief Map integer index to vector representations (embeddings). Those embeddings are
+ * learnable parameters. For an input of shape (d1, ..., dK), the output shape is
+ * (d1, ..., dK, output_dim). All the input values should be integers in the range
+ *
+ * From:/home/xlidc/mxnet/src/operator/tensor/indexing_op.cc:19
+ * \param data Input data to the EmbeddingOp.
+ * \param weight Embedding weight matrix.
+ * \param input_dim vocabulary size of the input indices.
+ * \param output_dim dimension of the embedding vectors.
+ * \return new symbol
+ */
+inline Symbol Embedding(Symbol data,
+ Symbol weight,
+ int input_dim,
+ int output_dim) {
+ return Operator("Embedding")
+ .SetParam("input_dim", input_dim)
+ .SetParam("output_dim", output_dim)
+ .SetInput("data", data)
+ .SetInput("weight", weight)
+ .CreateSymbol();
+}
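+
+// Usage sketch (editorial illustration): embedding integer token ids into
+// 128-dimensional vectors; `token_ids` is a hypothetical symbol.
+//
+//   Symbol emb = Embedding(token_ids, Symbol::Variable("embed_w"),
+//                          10000 /*input_dim*/, 128 /*output_dim*/);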
+
+/*!
+ * \brief Take elements from an array along an axis.
+ *
+ * Slice along a particular axis with the provided indices. E.g., given an input
+ * with shape ``(d0, d1, d2)`` and indices with shape ``(i0, i1)``, then the output
+ * will have shape ``(i0, i1, d1, d2)``, with::
+ *
+ *   output[i,j,:,:] = input[indices[i,j],:,:]
+ *
+ * Examples::
+ *
+ *   x = [[ 1.,  2.],
+ *        [ 3.,  4.],
+ *        [ 5.,  6.]]
+ *
+ *   take(x, [[0,1],[1,2]]) = [[[ 1.,  2.],
+ *                              [ 3.,  4.]],
+ *
+ *                             [[ 3.,  4.],
+ *                              [ 5.,  6.]]]
+ *
+ * .. note::
+ *   Only slicing axis 0 is supported now.
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/indexing_op.cc:L79
+ * \param a The source array.
+ * \param indices The indices of the values to extract.
+ * \param axis the axis of data tensor to be taken.
+ * \param mode specify how out-of-bound indices behave.
+ * \return new symbol
+ */
+inline Symbol take(Symbol a,
+                   Symbol indices,
+                   int axis = 0,
+                   TakeMode mode = TakeMode::raise) {
+  static const char *TakeModeValues[] = {
+    "clip",
+    "raise",
+    "wrap"
+  };
+  return Operator("take")
+           .SetParam("axis", axis)
+           .SetParam("mode", TakeModeValues[int(mode)])
+           .SetInput("a", a)
+           .SetInput("indices", indices)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Take elements from a data batch.
+ *
+ * Given a ``(d0, d1)`` input array, and ``(d0,)`` indices, the output will be a
+ * ``(d0,)`` array computed by::
+ *
+ *   output[i] = input[i, indices[i]]
+ *
+ * Examples::
+ *
+ *   x = [[ 1.,  2.],
+ *        [ 3.,  4.],
+ *        [ 5.,  6.]]
+ *
+ *   batch_take(x, [0,1,0]) = [ 1.  4.  5.]
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/indexing_op.cc:L131
+ * \param a Input data array
+ * \param indices index array
+ * \return new symbol
+ */
+inline Symbol batch_take(Symbol a,
+                         Symbol indices) {
+  return Operator("batch_take")
+           .SetInput("a", a)
+           .SetInput("indices", indices)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Returns a one-hot array.
+ *
+ * The locations represented by ``indices`` take value ``on_value``, while all
+ * other locations take value ``off_value``.
+ *
+ * Assume ``indices`` has shape ``(i0, i1)``, then the output will have shape
+ * ``(i0, i1, depth)`` and::
+ *
+ *   output[i,j,:] = off_value
+ *   output[i,j,indices[i,j]] = on_value
+ *
+ * Examples::
+ *
+ *   one_hot([1,0,2,0], 3) = [[ 0.  1.  0.]
+ *                            [ 1.  0.  0.]
+ *                            [ 0.  0.  1.]
+ *                            [ 1.  0.  0.]]
+ *
+ *   one_hot([1,0,2,0], 3, on_value=8, off_value=1,
+ *           dtype='int32') = [[1 8 1]
+ *                             [8 1 1]
+ *                             [1 1 8]
+ *                             [8 1 1]]
+ *
+ *   one_hot([[1,0],[1,0],[2,0]], 3) = [[[ 0.  1.  0.]
+ *                                       [ 1.  0.  0.]]
+ *
+ *                                      [[ 0.  1.  0.]
+ *                                       [ 1.  0.  0.]]
+ *
+ *                                      [[ 0.  0.  1.]
+ *                                       [ 1.  0.  0.]]]
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/indexing_op.cc:L177
+ * \param indices array of locations where to set on_value
+ * \param depth The dimension size at dim = axis.
+ * \param on_value The value assigned to the locations represented by indices.
+ * \param off_value The value assigned to the locations not represented by indices.
+ * \param dtype DType of the output
+ * \return new symbol
+ */
+inline Symbol one_hot(Symbol indices,
+                      int depth,
+                      double on_value = 1,
+                      double off_value = 0,
+                      One_hotDtype dtype = One_hotDtype::float32) {
+  static const char *One_hotDtypeValues[] = {
+    "float16",
+    "float32",
+    "float64",
+    "int32",
+    "uint8"
+  };
+  return Operator("one_hot")
+           .SetParam("depth", depth)
+           .SetParam("on_value", on_value)
+           .SetParam("off_value", off_value)
+           .SetParam("dtype", One_hotDtypeValues[int(dtype)])
+           .SetInput("indices", indices)
+           .CreateSymbol();
+}
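+
+/*
+ * Editor's note: an illustrative sketch of the indexing wrappers above
+ * (hypothetical names; shapes follow the doc comments)::
+ *
+ *   Symbol idx = Symbol::Variable("idx");  // integer class ids, e.g. [1,0,2,0]
+ *   Symbol oh  = one_hot(idx, 3);          // -> 4x3 float32, rows like [0,1,0]
+ *   Symbol oh8 = one_hot(idx, 3, 8, 1, One_hotDtype::int32);  // on=8, off=1
+ */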
+
+/*!
+ * \brief Reshape array into a new shape.
+ *
+ * The shape is a tuple of int such as (2,3,4). The new shape should not change the
+ * array size. For example::
+ *
+ *   reshape([1,2,3,4], shape=(2,2)) = [[1,2], [3,4]]
+ *
+ * In addition, we can use special codes, which are integers less than
+ * 1, on some shape dimensions. To infer the output shape, we start with an empty
+ * output tuple, continuously pop dimensions from the original shape starting from
+ * the beginning, and push the translated results into the output shape.
+ *
+ * Each special code presents a way of translation.
+ *
+ * - ``0`` for copying one dimension. Pop one input dimension and push it into the
+ *   output. For example::
+ *
+ *   - input=(2,3,4), shape=(4,0,2), output=(4,3,2)
+ *   - input=(2,3,4), shape=(2,0,0), output=(2,3,4)
+ *
+ * - ``-1`` for inference. Push a placeholder into the output whose value will be
+ *   inferred later::
+ *
+ *   - input=(2,3,4), shape=(6,1,-1), output=(6,1,4)
+ *   - input=(2,3,4), shape=(3,-1,8), output=(3,1,8)
+ *   - input=(2,3,4), shape=(-1,), output=(24,)
+ *
+ * - ``-2`` for copying all. Pop all remaining input dimensions and push them into
+ *   the output::
+ *
+ *   - input=(2,3,4), shape=(-2), output=(2,3,4)
+ *   - input=(2,3,4), shape=(2,-2), output=(2,3,4)
+ *   - input=(2,3,4), shape=(-2,1,1), output=(2,3,4,1,1)
+ *
+ * - ``-3`` for merging two dimensions. Pop two input dimensions, compute their
+ *   product, and push it into the output::
+ *
+ *   - input=(2,3,4), shape=(-3,4), output=(6,4)
+ *   - input=(2,3,4), shape=(0,-3), output=(2,12)
+ *   - input=(2,3,4), shape=(-3,-2), output=(6,4)
+ *
+ * - ``-4`` for splitting two dimensions. Pop one input dimension, split it
+ *   according to the next two dimensions (which can contain one ``-1``) specified
+ *   after this code, then push into the output::
+ *
+ *   - input=(2,3,4), shape=(-4,1,2,-2), output=(1,2,3,4)
+ *   - input=(2,3,4), shape=(2,-4,-1,3,-2), output=(2,1,3,4)
+ *
+ * If the argument ``reverse`` is set to be true, then the input shape is
+ * translated from right to left. For example, with input shape (10, 5, 4) and
+ * target shape (-1, 0), the output shape will be (50,4) if ``reverse=1``,
+ * otherwise it will be (40,5).
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:L78
+ * \param data Input data to reshape.
+ * \param target_shape (Deprecated! Use ``shape`` instead.) Target new shape. One and
+ * \param keep_highest (Deprecated! Use ``shape`` instead.) Whether keep the highest dim
+ *        unchanged. If set to true, then the first dim in target_shape is ignored, and
+ * \param shape The target shape
+ * \param reverse If true then translating the input shape from right to left
+ * \return new symbol
+ */
+inline Symbol Reshape(Symbol data,
+                      Shape target_shape = Shape(0,0),
+                      bool keep_highest = false,
+                      Shape shape = Shape(),
+                      bool reverse = false) {
+  return Operator("Reshape")
+           .SetParam("target_shape", target_shape)
+           .SetParam("keep_highest", keep_highest)
+           .SetParam("shape", shape)
+           .SetParam("reverse", reverse)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Flatten input into a 2-D array by collapsing the higher dimensions.
+ *
+ * Assume the input array has shape ``(d1, d2, ..., dk)``, then ``flatten``
+ * reshapes the input array into shape ``(d1, d2*...*dk)``.
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:L101
+ * \param data Input data to reshape.
+ * \return new symbol
+ */
+inline Symbol Flatten(Symbol data) {
+  return Operator("Flatten")
+           .SetInput("data", data)
+           .CreateSymbol();
+}
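+
+/*
+ * Editor's note: an illustrative reshape sketch (hypothetical names; mxnet-cpp's
+ * ``Shape`` stores unsigned dims, so plain positive target shapes are the safe
+ * case from C++; the special negative codes above are backend shape codes)::
+ *
+ *   Symbol x    = Symbol::Variable("x");                       // say shape (2,3,4)
+ *   Symbol flat = Flatten(x);                                  // -> (2, 12)
+ *   Symbol r    = Reshape(x, Shape(0,0), false, Shape(4, 6));  // -> (4, 6)
+ */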
+
+/*!
+ * \brief Permute the dimensions of an array.
+ *
+ * Examples::
+ *
+ *   x = [[ 1, 2],
+ *        [ 3, 4]]
+ *
+ *   transpose(x) = [[ 1.,  3.],
+ *                   [ 2.,  4.]]
+ *
+ *   x = [[[ 1.,  2.],
+ *         [ 3.,  4.]],
+ *
+ *        [[ 5.,  6.],
+ *         [ 7.,  8.]]]
+ *
+ *   transpose(x) = [[[ 1.,  5.],
+ *                    [ 3.,  7.]],
+ *
+ *                   [[ 2.,  6.],
+ *                    [ 4.,  8.]]]
+ *
+ *   transpose(x, axes=(1,0,2)) = [[[ 1.,  2.],
+ *                                  [ 5.,  6.]],
+ *
+ *                                 [[ 3.,  4.],
+ *                                  [ 7.,  8.]]]
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:L142
+ * \param data Source input
+ * \param axes Target axis order. By default the axes will be inverted.
+ * \return new symbol
+ */
+inline Symbol transpose(Symbol data,
+                        Shape axes = Shape()) {
+  return Operator("transpose")
+           .SetParam("axes", axes)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Insert a new axis with size 1 into the array shape
+ *
+ * For example, given ``x`` with shape ``(2,3,4)``, then ``expand_dims(x, axis=1)``
+ * will return a new array with shape ``(2,1,3,4)``.
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:L175
+ * \param data Source input
+ * \param axis Position (amongst axes) where new axis is to be inserted.
+ * \return new symbol
+ */
+inline Symbol expand_dims(Symbol data,
+                          uint32_t axis) {
+  return Operator("expand_dims")
+           .SetParam("axis", axis)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Crop a continuous region from the array.
+ *
+ * Assume the input array has *n* dimensions, given ``begin=(b_1, ..., b_n)`` and
+ * ``end=(e_1, ..., e_n)``, then ``crop`` will return a region with shape
+ * ``(e_1-b_1, ..., e_n-b_n)``. The result's *k*-th dimension contains elements
+ * from the *k*-th dimension of the input array within the half-open interval
+ * ``[b_k, e_k)``.
+ *
+ * For example::
+ *
+ *   x = [[  1.,   2.,   3.,   4.],
+ *        [  5.,   6.,   7.,   8.],
+ *        [  9.,  10.,  11.,  12.]]
+ *
+ *   crop(x, begin=(0,1), end=(2,4)) = [[ 2.,  3.,  4.],
+ *                                      [ 6.,  7.,  8.]]
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:L207
+ * \param data Source input
+ * \param begin starting coordinates
+ * \param end ending coordinates
+ * \return new symbol
+ */
+inline Symbol slice(Symbol data,
+                    Shape begin,
+                    Shape end) {
+  return Operator("slice")
+           .SetParam("begin", begin)
+           .SetParam("end", end)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Slice along a given axis.
+ *
+ * Examples::
+ *
+ *   x = [[  1.,   2.,   3.,   4.],
+ *        [  5.,   6.,   7.,   8.],
+ *        [  9.,  10.,  11.,  12.]]
+ *
+ *   slice_axis(x, axis=0, begin=1, end=3) = [[  5.,   6.,   7.,   8.],
+ *                                            [  9.,  10.,  11.,  12.]]
+ *
+ *   slice_axis(x, axis=1, begin=0, end=2) = [[  1.,   2.],
+ *                                            [  5.,   6.],
+ *                                            [  9.,  10.]]
+ *
+ *   slice_axis(x, axis=1, begin=-3, end=-1) = [[  2.,   3.],
+ *                                              [  6.,   7.],
+ *                                              [ 10.,  11.]]
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:L285
+ * \param data Source input
+ * \param axis The axis to be sliced. Negative axis means to count from the last to the
+ * \param begin The beginning index to be sliced. Negative values are interpreted as
+ * \param end The end index to be sliced. The end can be None, in which case all the rest
+ *        elements are used. Also, negative values are interpreted as counting from the
+ * \return new symbol
+ */
+inline Symbol slice_axis(Symbol data,
+                         int axis,
+                         int begin,
+                         dmlc::optional<int> end) {
+  return Operator("slice_axis")
+           .SetParam("axis", axis)
+           .SetParam("begin", begin)
+           .SetParam("end", end)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
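+
+/*
+ * Editor's note: an illustrative slicing sketch (hypothetical names; shapes
+ * follow the doc comments above)::
+ *
+ *   Symbol x  = Symbol::Variable("x");                        // say shape (3,4)
+ *   Symbol s  = slice(x, Shape(0,1), Shape(2,4));             // rows [0,2), cols [1,4)
+ *   Symbol s1 = slice_axis(x, 1, 0, dmlc::optional<int>(2));  // cols [0,2)
+ *   Symbol e  = expand_dims(x, 0);                            // -> shape (1,3,4)
+ */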
+
+/*!
+ * \brief Dot product of two arrays.
+ *
+ * ``dot``'s behavior depends on the input array dimensions:
+ *
+ * - 1-D arrays: inner product of vectors
+ * - 2-D arrays: matrix multiplication
+ * - N-D arrays: a sum product over the last axis of the first input and the first
+ *   axis of the second input
+ *
+ * For example, given 3-D ``x`` with shape `(n,m,k)` and ``y`` with shape `(k,r,s)`,
+ * the result array will have shape `(n,m,r,s)`. It is computed by::
+ *
+ *   dot(x,y)[i,j,a,b] = sum(x[i,j,:]*y[:,a,b])
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:L318
+ * \param lhs The first input
+ * \param rhs The second input
+ * \param transpose_a If true then transpose the first input before dot.
+ * \param transpose_b If true then transpose the second input before dot.
+ * \return new symbol
+ */
+inline Symbol dot(Symbol lhs,
+                  Symbol rhs,
+                  bool transpose_a = false,
+                  bool transpose_b = false) {
+  return Operator("dot")
+           .SetParam("transpose_a", transpose_a)
+           .SetParam("transpose_b", transpose_b)
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Batchwise dot product.
+ *
+ * ``batch_dot`` is used to compute the dot product of ``x`` and ``y`` when ``x`` and
+ * ``y`` are data in batch, namely 3D arrays in shape of `(batch_size, :, :)`.
+ *
+ * For example, given ``x`` with shape `(batch_size, n, m)` and ``y`` with shape
+ * `(batch_size, m, k)`, the result array will have shape `(batch_size, n, k)`,
+ * which is computed by::
+ *
+ *   batch_dot(x,y)[i,:,:] = dot(x[i,:,:], y[i,:,:])
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:L354
+ * \param lhs The first input
+ * \param rhs The second input
+ * \param transpose_a If true then transpose the first input before dot.
+ * \param transpose_b If true then transpose the second input before dot.
+ * \return new symbol
+ */
+inline Symbol batch_dot(Symbol lhs,
+                        Symbol rhs,
+                        bool transpose_a = false,
+                        bool transpose_b = false) {
+  return Operator("batch_dot")
+           .SetParam("transpose_a", transpose_a)
+           .SetParam("transpose_b", transpose_b)
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Clip (limit) the values in an array, element-wise
+ *
+ * Given an interval, values outside the interval are clipped to the interval
+ * edges. That is::
+ *
+ *   clip(x) = max(min(x, a_max), a_min)
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:L393
+ * \param data Source input
+ * \param a_min Minimum value
+ * \param a_max Maximum value
+ * \return new symbol
+ */
+inline Symbol clip(Symbol data,
+                   mx_float a_min,
+                   mx_float a_max) {
+  return Operator("clip")
+           .SetParam("a_min", a_min)
+           .SetParam("a_max", a_max)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Repeat elements of an array.
+ *
+ * By default, ``repeat`` flattens the input array into 1-D and then repeats the
+ * elements::
+ *
+ *   x = [[ 1, 2],
+ *        [ 3, 4]]
+ *
+ *   repeat(x, repeats=2) = [ 1.,  1.,  2.,  2.,  3.,  3.,  4.,  4.]
+ *
+ * We can also choose a particular axis to repeat, in which case a negative axis
+ * is interpreted as counting from the last axis backward::
+ *
+ *   repeat(x, repeats=2, axis=1) = [[ 1.,  1.,  2.,  2.],
+ *                                   [ 3.,  3.,  4.,  4.]]
+ *
+ *   repeat(x, repeats=2, axis=-1) = [[ 1.,  2.],
+ *                                    [ 1.,  2.],
+ *                                    [ 3.,  4.],
+ *                                    [ 3.,  4.]]
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:L432
+ * \param data Input data array
+ * \param repeats The number of repetitions for each element.
+ * \param axis The axis along which to repeat values. Negative numbers are
+ *        interpreted counting from the last axis backward. By default, use the flattened input
+ * \return new symbol
+ */
+inline Symbol repeat(Symbol data,
+                     int repeats,
+                     dmlc::optional<int> axis = dmlc::optional<int>()) {
+  return Operator("repeat")
+           .SetParam("repeats", repeats)
+           .SetParam("axis", axis)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
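+
+/*
+ * Editor's note: an illustrative sketch of ``dot`` and ``clip`` (hypothetical
+ * names and shapes)::
+ *
+ *   Symbol a = Symbol::Variable("a");   // (n, k)
+ *   Symbol b = Symbol::Variable("b");   // (k, m)
+ *   Symbol c = dot(a, b);               // (n, m) matrix product
+ *   Symbol d = clip(c, 0.0f, 1.0f);     // limit values to [0, 1]
+ */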
+
+/*!
+ * \brief Repeat the whole array multiple times.
+ *
+ * If ``reps`` has length *d*, and the input array has dimension *n*, there are
+ * three cases:
+ *
+ * - **n=d**. Repeat the *i*-th dimension of the input by ``reps[i]`` times::
+ *
+ *     x = [[1, 2],
+ *          [3, 4]]
+ *
+ *     tile(x, reps=(2,3)) = [[ 1.,  2.,  1.,  2.,  1.,  2.],
+ *                            [ 3.,  4.,  3.,  4.,  3.,  4.],
+ *                            [ 1.,  2.,  1.,  2.,  1.,  2.],
+ *                            [ 3.,  4.,  3.,  4.,  3.,  4.]]
+ *
+ * - **n>d**. ``reps`` is promoted to length *n* by pre-pending 1's to it. Thus for
+ *   an input shape ``(2,3)``, ``reps=(2,)`` is treated as ``(1,2)``::
+ *
+ *     tile(x, reps=(2,)) = [[ 1.,  2.,  1.,  2.],
+ *                           [ 3.,  4.,  3.,  4.]]
+ *
+ * - **n<d**. The input is promoted to be *d*-dimensional by prepending new axes.
+ *
+ * \param data Input data array
+ * \param reps The number of times to repeat the input along each axis. If ``n > d``,
+ *        reps is promoted to
+ * \return new symbol
+ */
+inline Symbol tile(Symbol data,
+                   Shape reps) {
+  return Operator("tile")
+           .SetParam("reps", reps)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Reverse the elements of an array along the given axis.
+ *
+ * From:/home/xlidc/mxnet/src/operator/tensor/matrix_op.cc:512
+ * \param data Input data array
+ * \param axis The axis along which to reverse elements.
+ * \return new symbol
+ */
+inline Symbol reverse(Symbol data,
+                      Shape axis) {
+  return Operator("reverse")
+           .SetParam("axis", axis)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Draw samples from a uniform distribution.
+ *
+ * Samples are uniformly distributed over the half-open interval [low, high)
+ * (includes low, but excludes high)::
+ *
+ *   nd.uniform(low=0, high=1, shape=(2,2)) = [[ 0.60276335,  0.85794562],
+ *                                             [ 0.54488319,  0.84725171]]
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/sample_op.cc:L24
+ * \param low The lower bound of distribution
+ * \param high The upper bound of distribution
+ * \param shape The shape of the output
+ * \param ctx Context of output, in format [cpu|gpu|cpu_pinned](n). Only used for
+ * \param dtype DType of the output. If output given, set to type of output. If output not
+ * \return new symbol
+ */
+inline Symbol uniform(mx_float low = 0,
+                      mx_float high = 1,
+                      Shape shape = Shape(),
+                      const std::string& ctx = "",
+                      UniformDtype dtype = UniformDtype::None) {
+  static const char *UniformDtypeValues[] = {
+    "None",
+    "float16",
+    "float32",
+    "float64"
+  };
+  return Operator("uniform")
+           .SetParam("low", low)
+           .SetParam("high", high)
+           .SetParam("shape", shape)
+           .SetParam("dtype", UniformDtypeValues[int(dtype)])
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Draw random samples from a normal (Gaussian) distribution.
+ *
+ * Examples::
+ *
+ *   normal(loc=0, scale=1, shape=(2,2)) = [[ 1.89171135, -1.16881478],
+ *                                          [-1.23474145,  1.55807114]]
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/sample_op.cc:L35
+ * \param loc Mean of the distribution.
+ * \param scale Standard deviation of the distribution.
+ * \param shape The shape of the output
+ * \param ctx Context of output, in format [cpu|gpu|cpu_pinned](n). Only used for
+ * \param dtype DType of the output. If output given, set to type of output. If output not
+ * \return new symbol
+ */
+inline Symbol normal(mx_float loc = 0,
+                     mx_float scale = 1,
+                     Shape shape = Shape(),
+                     const std::string& ctx = "",
+                     NormalDtype dtype = NormalDtype::None) {
+  static const char *NormalDtypeValues[] = {
+    "None",
+    "float16",
+    "float32",
+    "float64"
+  };
+  return Operator("normal")
+           .SetParam("loc", loc)
+           .SetParam("scale", scale)
+           .SetParam("shape", shape)
+           .SetParam("dtype", NormalDtypeValues[int(dtype)])
+           .CreateSymbol();
+}
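+
+/*
+ * Editor's note: an illustrative sampling sketch (the ``ctx`` argument is
+ * accepted but not forwarded by these symbolic wrappers; defaults shown)::
+ *
+ *   Symbol u = uniform(0, 1, Shape(2, 2));   // samples in [0, 1)
+ *   Symbol g = normal(0, 1, Shape(2, 2));    // mean 0, std-dev 1
+ */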
+
+/*!
+ * \brief Returns the indices of the maximum values along an axis.
+ *
+ * From:/home/xlidc/mxnet/src/operator/tensor/broadcast_reduce_op_index.cc:11
+ * \param data The input
+ * \param axis Empty or unsigned. The axis to perform the reduction. If left empty, a
+ * \param keepdims If true, the axis which is reduced is left in the result as dimension
+ * \return new symbol
+ */
+inline Symbol argmax(Symbol data,
+                     int axis = -1,
+                     bool keepdims = false) {
+  return Operator("argmax")
+           .SetParam("axis", axis)
+           .SetParam("keepdims", keepdims)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Returns the indices of the minimum values along an axis.
+ *
+ * From:/home/xlidc/mxnet/src/operator/tensor/broadcast_reduce_op_index.cc:16
+ * \param data The input
+ * \param axis Empty or unsigned. The axis to perform the reduction. If left empty, a
+ * \param keepdims If true, the axis which is reduced is left in the result as dimension
+ * \return new symbol
+ */
+inline Symbol argmin(Symbol data,
+                     int axis = -1,
+                     bool keepdims = false) {
+  return Operator("argmin")
+           .SetParam("axis", axis)
+           .SetParam("keepdims", keepdims)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief
+ * \param src Source input
+ * \return new symbol
+ */
+inline Symbol argmax_channel(Symbol src) {
+  return Operator("argmax_channel")
+           .SetInput("src", src)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Compute the sum of array elements over given axes.
+ *
+ * The argument ``axis`` specifies the axes to compute over:
+ *
+ * - **()**: compute over all elements into a scalar array with shape ``(1,)``.
+ *   This is the default option.
+ * - **int**: compute along a particular axis. If input has shape ``(n, m, k)``,
+ *   using ``axis=0`` will result in an array with shape ``(m, k)``.
+ * - **tuple of int**: compute over multiple axes. Again assuming input shape ``(n, m,
+ *   k)``, with ``axis=(0,2)`` we obtain a ``(m,)`` shape array.
+ *
+ * If ``keepdims = 1``, then the result array will have the same number of dimensions
+ * as the input, while the reduced axes will have size 1.
+ *
+ * \param data The input
+ * \param axis The axes to perform the reduction.
+ * \param keepdims If true, the axes which are reduced are left in the result as
+ * \return new symbol
+ */
+inline Symbol sum(Symbol data,
+                  Shape axis = Shape(),
+                  bool keepdims = false) {
+  return Operator("sum")
+           .SetParam("axis", axis)
+           .SetParam("keepdims", keepdims)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Compute the mean of array elements over given axes.
+ *
+ * The argument ``axis`` specifies the axes to compute over:
+ *
+ * - **()**: compute over all elements into a scalar array with shape ``(1,)``.
+ *   This is the default option.
+ * - **int**: compute along a particular axis. If input has shape ``(n, m, k)``,
+ *   using ``axis=0`` will result in an array with shape ``(m, k)``.
+ * - **tuple of int**: compute over multiple axes. Again assuming input shape ``(n, m,
+ *   k)``, with ``axis=(0,2)`` we obtain a ``(m,)`` shape array.
+ *
+ * If ``keepdims = 1``, then the result array will have the same number of dimensions
+ * as the input, while the reduced axes will have size 1.
+ *
+ * \param data The input
+ * \param axis The axes to perform the reduction.
+ * \param keepdims If true, the axes which are reduced are left in the result as
+ * \return new symbol
+ */
+inline Symbol mean(Symbol data,
+                   Shape axis = Shape(),
+                   bool keepdims = false) {
+  return Operator("mean")
+           .SetParam("axis", axis)
+           .SetParam("keepdims", keepdims)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
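+
+/*
+ * Editor's note: an illustrative reduction sketch (hypothetical names; shapes
+ * follow the doc comments above)::
+ *
+ *   Symbol x = Symbol::Variable("x");   // say shape (2,3,4)
+ *   Symbol s = sum(x, Shape(0, 2));     // reduce axes 0 and 2 -> (3,)
+ *   Symbol m = mean(x, Shape(1), true); // keepdims -> (2,1,4)
+ *   Symbol i = argmax(x, 1);            // indices of maxima along axis 1
+ */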
+
+/*!
+ * \brief Compute the product of array elements over given axes.
+ *
+ * The argument ``axis`` specifies the axes to compute over:
+ *
+ * - **()**: compute over all elements into a scalar array with shape ``(1,)``.
+ *   This is the default option.
+ * - **int**: compute along a particular axis. If input has shape ``(n, m, k)``,
+ *   using ``axis=0`` will result in an array with shape ``(m, k)``.
+ * - **tuple of int**: compute over multiple axes. Again assuming input shape ``(n, m,
+ *   k)``, with ``axis=(0,2)`` we obtain a ``(m,)`` shape array.
+ *
+ * If ``keepdims = 1``, then the result array will have the same number of dimensions
+ * as the input, while the reduced axes will have size 1.
+ *
+ * \param data The input
+ * \param axis The axes to perform the reduction.
+ * \param keepdims If true, the axes which are reduced are left in the result as
+ * \return new symbol
+ */
+inline Symbol prod(Symbol data,
+                   Shape axis = Shape(),
+                   bool keepdims = false) {
+  return Operator("prod")
+           .SetParam("axis", axis)
+           .SetParam("keepdims", keepdims)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Compute the sum of array elements over given axes with ``NaN`` ignored
+ *
+ * Refer to ``sum`` for more details.
+ *
+ * \param data The input
+ * \param axis The axes to perform the reduction.
+ * \param keepdims If true, the axes which are reduced are left in the result as
+ * \return new symbol
+ */
+inline Symbol nansum(Symbol data,
+                     Shape axis = Shape(),
+                     bool keepdims = false) {
+  return Operator("nansum")
+           .SetParam("axis", axis)
+           .SetParam("keepdims", keepdims)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Compute the product of array elements over given axes with ``NaN`` ignored
+ *
+ * Refer to ``prod`` for more details.
+ *
+ * \param data The input
+ * \param axis The axes to perform the reduction.
+ * \param keepdims If true, the axes which are reduced are left in the result as
+ * \return new symbol
+ */
+inline Symbol nanprod(Symbol data,
+                      Shape axis = Shape(),
+                      bool keepdims = false) {
+  return Operator("nanprod")
+           .SetParam("axis", axis)
+           .SetParam("keepdims", keepdims)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Compute the max of array elements over given axes.
+ *
+ * The argument ``axis`` specifies the axes to compute over:
+ *
+ * - **()**: compute over all elements into a scalar array with shape ``(1,)``.
+ *   This is the default option.
+ * - **int**: compute along a particular axis. If input has shape ``(n, m, k)``,
+ *   using ``axis=0`` will result in an array with shape ``(m, k)``.
+ * - **tuple of int**: compute over multiple axes. Again assuming input shape ``(n, m,
+ *   k)``, with ``axis=(0,2)`` we obtain a ``(m,)`` shape array.
+ *
+ * If ``keepdims = 1``, then the result array will have the same number of dimensions
+ * as the input, while the reduced axes will have size 1.
+ *
+ * \param data The input
+ * \param axis The axes to perform the reduction.
+ * \param keepdims If true, the axes which are reduced are left in the result as
+ * \return new symbol
+ */
+inline Symbol max(Symbol data,
+                  Shape axis = Shape(),
+                  bool keepdims = false) {
+  return Operator("max")
+           .SetParam("axis", axis)
+           .SetParam("keepdims", keepdims)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Compute the min of array elements over given axes.
+ *
+ * The argument ``axis`` specifies the axes to compute over:
+ *
+ * - **()**: compute over all elements into a scalar array with shape ``(1,)``.
+ *   This is the default option.
+ * - **int**: compute along a particular axis. If input has shape ``(n, m, k)``,
+ *   using ``axis=0`` will result in an array with shape ``(m, k)``.
+ * - **tuple of int**: compute over multiple axes. Again assuming input shape ``(n, m,
+ *   k)``, with ``axis=(0,2)`` we obtain a ``(m,)`` shape array.
+ *
+ * If ``keepdims = 1``, then the result array will have the same number of dimensions
+ * as the input, while the reduced axes will have size 1.
+ *
+ * \param data The input
+ * \param axis The axes to perform the reduction.
+ * \param keepdims If true, the axes which are reduced are left in the result as
+ * \return new symbol
+ */
+inline Symbol min(Symbol data,
+                  Shape axis = Shape(),
+                  bool keepdims = false) {
+  return Operator("min")
+           .SetParam("axis", axis)
+           .SetParam("keepdims", keepdims)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Broadcast an array over particular axes.
+ *
+ * Broadcasting is allowed on axes with size 1, such as from ``(2,1,3,1)`` to
+ * ``(2,8,3,9)``. Elements will be duplicated on the broadcasted axes.
+ *
+ * For example::
+ *
+ *   // given (1,2,1) shape x
+ *   x = [[[ 1.],
+ *         [ 2.]]]
+ *
+ *   // broadcast on axis 2
+ *   broadcast_axis(x, axis=2, size=3) = [[[ 1.,  1.,  1.],
+ *                                         [ 2.,  2.,  2.]]]
+ *   // broadcast on axes 0 and 2
+ *   broadcast_axis(x, axis=(0,2), size=(2,3)) = [[[ 1.,  1.,  1.],
+ *                                                 [ 2.,  2.,  2.]],
+ *                                                [[ 1.,  1.,  1.],
+ *                                                 [ 2.,  2.,  2.]]]
+ *
+ * \param data The input
+ * \param axis The axes to perform the broadcasting.
+ * \param size Target sizes of the broadcasting axes.
+ * \return new symbol
+ */
+inline Symbol broadcast_axis(Symbol data,
+                             Shape axis = Shape(),
+                             Shape size = Shape()) {
+  return Operator("broadcast_axis")
+           .SetParam("axis", axis)
+           .SetParam("size", size)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Broadcast an array to a new shape.
+ *
+ * Broadcasting is allowed on axes with size 1, such as from ``(2,1,3,1)`` to
+ * ``(2,8,3,9)``. Elements will be duplicated on the broadcasted axes.
+ *
+ * For example::
+ *
+ *   broadcast_to([[1,2,3]], shape=(2,3)) = [[ 1.,  2.,  3.],
+ *                                           [ 1.,  2.,  3.]]
+ *
+ * The dimensions that will not be changed can also use the special code ``0`` that
+ * means copy the original value. So with ``shape=(2,0)`` we will obtain the same
+ * result as in the above example.
+ *
+ * \param data The input
+ * \param shape The shape of the desired array. We can set the dim to zero if it's same
+ *        as the original. E.g `A = broadcast_to(B, shape=(10, 0, 0))` has the same
+ * \return new symbol
+ */
+inline Symbol broadcast_to(Symbol data,
+                           Shape shape = Shape()) {
+  return Operator("broadcast_to")
+           .SetParam("shape", shape)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Compute the L2 norm.
+ *
+ * Flatten the input array and then compute the L2 norm.
+ *
+ * Examples::
+ *
+ *   x = [[1, 2],
+ *        [3, 4]]
+ *
+ *   norm(x) = [5.47722578]
+ *
+ * \param src Source input
+ * \return new symbol
+ */
+inline Symbol norm(Symbol src) {
+  return Operator("norm")
+           .SetInput("src", src)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Given three ndarrays, condition, x, and y, return an ndarray with the elements
+ *        from x or y, depending on whether the corresponding elements of condition are
+ *        true or false. x and y must have the same shape. If condition has the same
+ *        shape as x, each element in the output array is from x if the corresponding
+ *        element in the condition is true, and from y if false. If condition does not
+ *        have the same shape as x, it must be a 1D array whose size is the same as x's
+ *        first dimension size. Each row of the output array is from x's row if the
+ *        corresponding element from condition
+ *
+ * From:/home/xlidc/mxnet/src/operator/tensor/control_flow_op.cc:21
+ * \param condition condition array
+ * \param x
+ * \param y
+ * \return new symbol
+ */
+inline Symbol where(Symbol condition,
+                    Symbol x,
+                    Symbol y) {
+  return Operator("where")
+           .SetInput("condition", condition)
+           .SetInput("x", x)
+           .SetInput("y", y)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Add arguments, element-wise with broadcasting.
+ *
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_add(Symbol lhs,
+                            Symbol rhs) {
+  return Operator("broadcast_add")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Subtract arguments, element-wise with broadcasting.
+ *
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_sub(Symbol lhs,
+                            Symbol rhs) {
+  return Operator("broadcast_sub")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Multiply arguments, element-wise with broadcasting.
+ *
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_mul(Symbol lhs,
+                            Symbol rhs) {
+  return Operator("broadcast_mul")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Divide arguments, element-wise with broadcasting.
+ *
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_div(Symbol lhs,
+                            Symbol rhs) {
+  return Operator("broadcast_div")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief First array elements raised to powers from second array,
+ *        element-wise with broadcasting.
+ *
+ * Defined in
+ * /home/xlidc/mxnet/src/operator/tensor/elemwise_binary_broadcast_op_extended.cc:L16
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_power(Symbol lhs,
+                              Symbol rhs) {
+  return Operator("broadcast_power")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Element-wise maximum of array elements with broadcasting.
+ *
+ * Defined in
+ * /home/xlidc/mxnet/src/operator/tensor/elemwise_binary_broadcast_op_extended.cc:L34
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_maximum(Symbol lhs,
+                                Symbol rhs) {
+  return Operator("broadcast_maximum")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Element-wise minimum of array elements with broadcasting.
+ *
+ * Defined in
+ * /home/xlidc/mxnet/src/operator/tensor/elemwise_binary_broadcast_op_extended.cc:L52
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_minimum(Symbol lhs,
+                                Symbol rhs) {
+  return Operator("broadcast_minimum")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol();
+}
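+
+/*
+ * Editor's note: an illustrative broadcasting sketch (hypothetical names;
+ * size-1 axes expand to match the other operand)::
+ *
+ *   Symbol a = Symbol::Variable("a");   // say shape (2,1,3)
+ *   Symbol b = Symbol::Variable("b");   // say shape (1,4,3)
+ *   Symbol s = broadcast_add(a, b);     // -> (2,4,3)
+ *   Symbol p = broadcast_power(a, b);   // a ** b, same broadcasting rule
+ */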
+
+/*!
+ * \brief Given the "legs" of a right triangle, return its hypotenuse,
+ *        element-wise with broadcasting.
+ *
+ * Defined in
+ * /home/xlidc/mxnet/src/operator/tensor/elemwise_binary_broadcast_op_extended.cc:L71
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_hypot(Symbol lhs,
+                              Symbol rhs) {
+  return Operator("broadcast_hypot")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Return (lhs == rhs), element-wise with broadcasting.
+ *
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_equal(Symbol lhs,
+                              Symbol rhs) {
+  return Operator("broadcast_equal")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Return (lhs != rhs), element-wise with broadcasting.
+ *
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_not_equal(Symbol lhs,
+                                  Symbol rhs) {
+  return Operator("broadcast_not_equal")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Return (lhs > rhs), element-wise with broadcasting.
+ *
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_greater(Symbol lhs,
+                                Symbol rhs) {
+  return Operator("broadcast_greater")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Return (lhs >= rhs), element-wise with broadcasting.
+ *
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_greater_equal(Symbol lhs,
+                                      Symbol rhs) {
+  return Operator("broadcast_greater_equal")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Return (lhs < rhs), element-wise with broadcasting.
+ *
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_lesser(Symbol lhs,
+                               Symbol rhs) {
+  return Operator("broadcast_lesser")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Return (lhs <= rhs), element-wise with broadcasting.
+ *
+ * \param lhs first input
+ * \param rhs second input
+ * \return new symbol
+ */
+inline Symbol broadcast_lesser_equal(Symbol lhs,
+                                     Symbol rhs) {
+  return Operator("broadcast_lesser_equal")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Add all input arguments element-wise.
+ *
+ * .. math::
+ *    add\_n(a_1, a_2, ..., a_n) = a_1 + a_2 + ... + a_n
+ *
+ * ``add_n`` is potentially more efficient than calling ``add`` `n` times.
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/tensor/elemwise_sum.cc:L63
+ * \param args Positional input arguments
+ * \return new symbol
+ */
+inline Symbol add_n(const std::vector<Symbol>& args) {
+  return Operator("add_n")
+(args)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Custom operator implemented in frontend.
+ * \param op_type Type of custom operator. Must be registered first.
+ * \return new symbol
+ */
+inline Symbol Custom(const std::string& op_type) {
+  return Operator("Custom")
+           .SetParam("op_type", op_type)
+           .CreateSymbol();
+}
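+
+/*
+ * Editor's note: the bare ``(args)`` line in ``add_n`` above invokes
+ * ``Operator::operator()``, which appends the whole input vector as positional
+ * inputs. An illustrative call (``x1``..``x3`` are hypothetical same-shape
+ * symbols)::
+ *
+ *   std::vector<Symbol> terms = {x1, x2, x3};
+ *   Symbol total = add_n(terms);   // one fused sum instead of chained adds
+ */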
+
+/*!
+ * \brief Element-wise activation function.
+ *
+ * The activation operations are applied element-wise to each element of the
+ * input array. The following types are supported:
+ *
+ * - `relu`: Rectified Linear Unit, `y = max(x, 0)`
+ * - `sigmoid`: `y = 1 / (1 + exp(-x))`
+ * - `tanh`: Hyperbolic tangent, `y = (exp(x) - exp(-x)) / (exp(x) + exp(-x))`
+ * - `softrelu`: Soft ReLU, or SoftPlus, `y = log(1 + exp(x))`
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/activation.cc:L76
+ * \param data Input data to activation function.
+ * \param act_type Activation function to be applied.
+ * \return new symbol
+ */
+inline Symbol Activation(Symbol data,
+                         ActivationActType act_type) {
+  static const char *ActivationActTypeValues[] = {
+    "relu",
+    "sigmoid",
+    "softrelu",
+    "tanh"
+  };
+  return Operator("Activation")
+           .SetParam("act_type", ActivationActTypeValues[int(act_type)])
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Apply bilinear sampling to input feature map, which is the key of "[NIPS2015]
+ *        output[batch, channel, y_dst, x_dst] = G(data[batch, channel, y_src, x_src])
+ *        x_dst, y_dst enumerate all spatial locations in output
+ *        x_src = grid[batch, 0, y_dst, x_dst]
+ *        y_src = grid[batch, 1, y_dst, x_dst]
+ *        G() denotes the bilinear interpolation kernel
+ *        The out-boundary points will be padded as zeros. (The boundary is defined to be
+ *        The shape of output will be (data.shape[0], data.shape[1], grid.shape[2],
+ *        The operator assumes that grid has been normalized. If you want to design a
+ * \param data Input data to the BilinearsamplerOp.
+ * \param grid Input grid to the BilinearsamplerOp. grid has two channels: x_src, y_src
+ * \return new symbol
+ */
+inline Symbol BilinearSampler(Symbol data,
+                              Symbol grid) {
+  return Operator("BilinearSampler")
+           .SetInput("data", data)
+           .SetInput("grid", grid)
+           .CreateSymbol();
+}
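+
+/*
+ * Editor's note: an illustrative activation sketch (hypothetical names)::
+ *
+ *   Symbol x  = Symbol::Variable("x");
+ *   Symbol h  = Activation(x, ActivationActType::relu);      // max(x, 0)
+ *   Symbol h2 = Activation(h, ActivationActType::softrelu);  // log(1 + exp(x))
+ */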
+
+/*!
+ * \brief Compute *N*-D convolution on *(N+2)*-D input.
+ *
+ * In the simplest 2-D convolution, given input data with shape *(batch_size,
+ * channel, height, width)*, the output is computed by
+ *
+ * .. math::
+ *
+ *    out[n,i,:,:] = bias[i] + \sum_{j=0}^{channel-1} data[n,j,:,:] \star
+ *    weight[i,j,:,:]
+ *
+ * where :math:`\star` is the 2-D cross-correlation operator.
+ *
+ * For general 2-D convolution, the shapes are
+ *
+ * - **data**: *(batch_size, channel, height, width)*
+ * - **weight**: *(num_filter, channel, kernel[0], kernel[1])*
+ * - **bias**: *(num_filter,)*
+ * - **out**: *(batch_size, num_filter, out_height, out_width)*.
+ *
+ * Define::
+ *
+ *   f(x,k,p,s,d) = floor((x+2*p-d*(k-1)-1)/s)+1
+ *
+ * then we have::
+ *
+ *   out_height=f(height, kernel[0], pad[0], stride[0], dilate[0])
+ *   out_width=f(width, kernel[1], pad[1], stride[1], dilate[1])
+ *
+ * If ``no_bias`` is set to be true, then the ``bias`` term is ignored.
+ *
+ * The default data ``layout`` is *NCHW*, namely *(batch_size, channel, height,
+ * width)*. We can choose other layouts such as *NHWC*.
+ *
+ * If ``num_group`` is larger than 1, denoted by *g*, then split the input ``data``
+ * evenly into *g* parts along the channel axis, and also evenly split ``weight``
+ * along the first dimension. Next compute the convolution on the *i*-th part of
+ * the data with the *i*-th weight part. The output is obtained by concatenating
+ * all the *g* results.
+ *
+ * To perform 1-D convolution, simply use 2-D convolution but set the last axis
+ * size to be 1 for both data and weight.
+ *
+ * 3-D convolution adds an additional depth dimension besides height and
+ * width. The shapes are
+ *
+ * - **data**: *(batch_size, channel, depth, height, width)*
+ * - **weight**: *(num_filter, channel, kernel[0], kernel[1], kernel[2])*
+ * - **bias**: *(num_filter,)*
+ * - **out**: *(batch_size, num_filter, out_depth, out_height, out_width)*.
+ *
+ * Both ``weight`` and ``bias`` are learnable parameters.
+ *
+ * There are other options to tune the performance.
+ *
+ * - **cudnn_tune**: enabling this option leads to higher startup time but may give
+ *   faster speed. Options are
+ *
+ *   - **off**: no tuning
+ *   - **limited_workspace**: run test and pick the fastest algorithm that doesn't
+ *     exceed workspace limit.
+ *   - **fastest**: pick the fastest algorithm and ignore workspace limit.
+ *   - **None** (default): the behavior is determined by environment variable
+ *     ``MXNET_CUDNN_AUTOTUNE_DEFAULT``. 0 for off, 1 for limited workspace
+ *     (default), 2 for fastest.
+ *
+ * - **workspace**: A large number leads to more (GPU) memory usage but may improve
+ *   the performance.
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/convolution.cc:L143
+ * \param data Input data to the ConvolutionOp.
+ * \param weight Weight matrix.
+ * \param bias Bias parameter.
+ * \param kernel convolution kernel size: (h, w) or (d, h, w)
+ * \param num_filter convolution filter(channel) number
+ * \param stride convolution stride: (h, w) or (d, h, w)
+ * \param dilate convolution dilate: (h, w) or (d, h, w)
+ * \param pad pad for convolution: (h, w) or (d, h, w)
+ * \param num_group Number of group partitions.
+ * \param workspace Maximum temporary workspace allowed for convolution (MB).
+ * \param no_bias Whether to disable bias parameter.
+ * \param cudnn_tune Whether to pick convolution algo by running performance test.
+ * \param cudnn_off Turn off cudnn for this layer.
+ * \param layout Set layout for input, output and weight. Empty for
+ *        default layout: NCHW for 2d and NCDHW for 3d.
+ * \return new symbol
+ */
+inline Symbol Convolution(Symbol data,
+                          Symbol weight,
+                          Symbol bias,
+                          Shape kernel,
+                          uint32_t num_filter,
+                          Shape stride = Shape(),
+                          Shape dilate = Shape(),
+                          Shape pad = Shape(),
+                          uint32_t num_group = 1,
+                          uint64_t workspace = 1024,
+                          bool no_bias = false,
+                          ConvolutionCudnnTune cudnn_tune = ConvolutionCudnnTune::None,
+                          bool cudnn_off = false,
+                          ConvolutionLayout layout = ConvolutionLayout::None) {
+  static const char *ConvolutionCudnnTuneValues[] = {
+    "None",
+    "fastest",
+    "limited_workspace",
+    "off"
+  };
+  static const char *ConvolutionLayoutValues[] = {
+    "None",
+    "NCDHW",
+    "NCHW",
+    "NDHWC",
+    "NHWC"
+  };
+  return Operator("Convolution")
+           .SetParam("kernel", kernel)
+           .SetParam("num_filter", num_filter)
+           .SetParam("stride", stride)
+           .SetParam("dilate", dilate)
+           .SetParam("pad", pad)
+           .SetParam("num_group", num_group)
+           .SetParam("workspace", workspace)
+           .SetParam("no_bias", no_bias)
+           .SetParam("cudnn_tune", ConvolutionCudnnTuneValues[int(cudnn_tune)])
+           .SetParam("cudnn_off", cudnn_off)
+           .SetParam("layout", ConvolutionLayoutValues[int(layout)])
+           .SetInput("data", data)
+           .SetInput("weight", weight)
+           .SetInput("bias", bias)
+           .CreateSymbol();
+}
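+
+/*
+ * Editor's note: an illustrative convolution sketch (hypothetical names and
+ * sizes; trailing parameters keep their defaults)::
+ *
+ *   Symbol data = Symbol::Variable("data");      // (batch, 3, 224, 224)
+ *   Symbol w    = Symbol::Variable("conv1_w");
+ *   Symbol b    = Symbol::Variable("conv1_b");
+ *   Symbol c1   = Convolution(data, w, b,
+ *                             Shape(3, 3), 64,   // 3x3 kernel, 64 filters
+ *                             Shape(1, 1),       // stride 1
+ *                             Shape(),           // default dilate
+ *                             Shape(1, 1));      // pad 1 keeps 224x224
+ */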
+
+/*!
+ * \brief Apply correlation to inputs
+ * \param data1 Input data1 to the correlation.
+ * \param data2 Input data2 to the correlation.
+ * \param kernel_size kernel size for Correlation; must be an odd number
+ * \param max_displacement Max displacement of Correlation
+ * \param stride1 stride1 quantize data1 globally
+ * \param stride2 stride2 quantize data2 within the neighborhood centered around data1
+ * \param pad_size pad for Correlation
+ * \param is_multiply operation type is either multiplication or subtraction
+ * \return new symbol
+ */
+inline Symbol Correlation(Symbol data1,
+                          Symbol data2,
+                          uint32_t kernel_size = 1,
+                          uint32_t max_displacement = 1,
+                          uint32_t stride1 = 1,
+                          uint32_t stride2 = 1,
+                          uint32_t pad_size = 0,
+                          bool is_multiply = true) {
+  return Operator("Correlation")
+           .SetParam("kernel_size", kernel_size)
+           .SetParam("max_displacement", max_displacement)
+           .SetParam("stride1", stride1)
+           .SetParam("stride2", stride2)
+           .SetParam("pad_size", pad_size)
+           .SetParam("is_multiply", is_multiply)
+           .SetInput("data1", data1)
+           .SetInput("data2", data2)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Crop the 2nd and 3rd dim of input data, with the corresponding size of h_w or
+ *        with the width and height of the second input symbol, i.e., with one input, we
+ *        need h_w to specify the crop height and width, otherwise the second input
+ *        symbol's
+ * \param data Tensor or List of Tensors, the second input will be used as crop_like
+ * \param num_args Number of inputs for crop; if it equals one, then we will use the h_w
+ *        for crop height and width, else if it equals two, then we will use the height
+ *        and width
+ * \param offset crop offset coordinate: (y, x)
+ * \param h_w crop height and width: (h, w)
+ * \param center_crop If set to true, then it will use the center_crop, or it will crop
+ * \return new symbol
+ */
+inline Symbol Crop(const std::vector<Symbol>& data,
+                   int num_args,
+                   Shape offset = Shape(0,0),
+                   Shape h_w = Shape(0,0),
+                   bool center_crop = false) {
+  return Operator("Crop")
+           .SetParam("num_args", num_args)
+           .SetParam("offset", offset)
+           .SetParam("h_w", h_w)
+           .SetParam("center_crop", center_crop)
+(data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Apply deconvolution to input then add a bias.
+ * \param data Input data to the DeconvolutionOp.
+ * \param weight Weight matrix.
+ * \param bias Bias parameter.
+ * \param kernel deconvolution kernel size: (y, x)
+ * \param num_filter deconvolution filter(channel) number
+ * \param stride deconvolution stride: (y, x)
+ * \param pad pad for deconvolution: (y, x), a good number is : (kernel-1)/2, if
+ * \param adj adjustment for output shape: (y, x), if target_shape set, adj will be
+ * \param target_shape output shape with target shape : (y, x)
+ * \param num_group number of group partitions
+ * \param workspace Tmp workspace for deconvolution (MB)
+ * \param no_bias Whether to disable bias parameter.
+ * \return new symbol
+ */
+inline Symbol Deconvolution(Symbol data,
+                            Symbol weight,
+                            Symbol bias,
+                            Shape kernel,
+                            uint32_t num_filter,
+                            Shape stride = Shape(1,1),
+                            Shape pad = Shape(0,0),
+                            Shape adj = Shape(0,0),
+                            Shape target_shape = Shape(0,0),
+                            uint32_t num_group = 1,
+                            uint64_t workspace = 512,
+                            bool no_bias = true) {
+  return Operator("Deconvolution")
+           .SetParam("kernel", kernel)
+           .SetParam("num_filter", num_filter)
+           .SetParam("stride", stride)
+           .SetParam("pad", pad)
+           .SetParam("adj", adj)
+           .SetParam("target_shape", target_shape)
+           .SetParam("num_group", num_group)
+           .SetParam("workspace", workspace)
+           .SetParam("no_bias", no_bias)
+           .SetInput("data", data)
+           .SetInput("weight", weight)
+           .SetInput("bias", bias)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Apply dropout to input.
+ *        During training, each element of the input is randomly set to zero with
+ *        probability ``p``, and then the whole tensor is rescaled by 1/(1-p) to keep
+ *        the expectation the same as before applying dropout. At test time, this
+ *        behaves as an identity map.
+ *
+ * \param data Input data to dropout.
+ * \param p Fraction of the input that gets dropped out at training time
+ * \return new symbol
+ */
+inline Symbol Dropout(Symbol data,
+                      mx_float p = 0.5) {
+  return Operator("Dropout")
+           .SetParam("p", p)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Apply a linear transformation: :math:`Y = XW^T + b`.
+ *
+ * Shapes:
+ *
+ * - **data**: `(batch_size, input_dim)`
+ * - **weight**: `(num_hidden, input_dim)`
+ * - **bias**: `(num_hidden,)`
+ * - **out**: `(batch_size, num_hidden)`
+ *
+ * The learnable parameters include both ``weight`` and ``bias``.
+ *
+ * If ``no_bias`` is set to be true, then the ``bias`` term is ignored.
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/fully_connected.cc:L94
+ * \param data Input data.
+ * \param weight Weight matrix.
+ * \param bias Bias parameter.
+ * \param num_hidden Number of hidden nodes of the output.
+ * \param no_bias Whether to disable bias parameter.
+ * \return new symbol
+ */
+inline Symbol FullyConnected(Symbol data,
+                             Symbol weight,
+                             Symbol bias,
+                             int num_hidden,
+                             bool no_bias = false) {
+  return Operator("FullyConnected")
+           .SetParam("num_hidden", num_hidden)
+           .SetParam("no_bias", no_bias)
+           .SetInput("data", data)
+           .SetInput("weight", weight)
+           .SetInput("bias", bias)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief An operator taking an n-dimensional input tensor (n > 2), and normalizing the
+ *        input by subtracting the mean and dividing by the standard deviation calculated
+ *        over the spatial dimensions. This is an implementation of the operator
+ *        described in "Instance Normalization: The Missing Ingredient for Fast
+ *        Stylization", D. Ulyanov, A. Vedaldi, V. Lempitsky, 2016 (arXiv:1607.08022v2).
+ *        This layer is similar to batch normalization, with two differences: first, the
+ *        normalization is carried out per example ('instance'), not over a batch.
+ *        Second, the same normalization is applied both at test and train time. This
+ *        operation is also known as
+ * \param data An n-dimensional tensor (n > 2) of the form [batch, channel, spatial_dim1,
+ * \param gamma A vector of length 'channel', which multiplies the normalized input.
+ * \param beta A vector of length 'channel', which is added to the product of the
+ * \param eps Epsilon to prevent division by 0.
+ * \return new symbol
+ */
+inline Symbol InstanceNorm(Symbol data,
+                           Symbol gamma,
+                           Symbol beta,
+                           mx_float eps = 0.001) {
+  return Operator("InstanceNorm")
+           .SetParam("eps", eps)
+           .SetInput("data", data)
+           .SetInput("gamma", gamma)
+           .SetInput("beta", beta)
+           .CreateSymbol();
+}
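+
+/*
+ * Editor's note: an illustrative fully-connected-plus-dropout sketch
+ * (hypothetical names; weights are learned parameters)::
+ *
+ *   Symbol x  = Symbol::Variable("data");
+ *   Symbol w  = Symbol::Variable("fc1_w");
+ *   Symbol b  = Symbol::Variable("fc1_b");
+ *   Symbol fc = FullyConnected(x, w, b, 128);   // Y = X W^T + b, 128 hidden
+ *   Symbol dp = Dropout(fc, 0.5);               // training-time dropout, p = 0.5
+ */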
+
+/*!
+ * \brief Set the L2 norm of each instance to a constant.
+ * \param data Input data to the L2NormalizationOp.
+ * \param eps Epsilon to prevent division by 0
+ * \param mode Normalization Mode. If set to instance, this operator will compute a norm
+ *        for each instance in the batch; this is the default mode. If set to channel,
+ *        this operator will compute a cross-channel norm at each position of each
+ * \return new symbol
+ */
+inline Symbol L2Normalization(Symbol data,
+                              mx_float eps = 1e-10,
+                              L2NormalizationMode mode = L2NormalizationMode::instance) {
+  static const char *L2NormalizationModeValues[] = {
+    "channel",
+    "instance",
+    "spatial"
+  };
+  return Operator("L2Normalization")
+           .SetParam("eps", eps)
+           .SetParam("mode", L2NormalizationModeValues[int(mode)])
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Apply local response normalization to the input.
+ * \param data Input data to the LRNOp.
+ * \param nsize normalization window width in elements.
+ * \param alpha value of the alpha variance scaling parameter in the normalization formula
+ * \param beta value of the beta power parameter in the normalization formula
+ * \param knorm value of the k parameter in normalization formula
+ * \return new symbol
+ */
+inline Symbol LRN(Symbol data,
+                  uint32_t nsize,
+                  mx_float alpha = 0.0001,
+                  mx_float beta = 0.75,
+                  mx_float knorm = 2) {
+  return Operator("LRN")
+           .SetParam("nsize", nsize)
+           .SetParam("alpha", alpha)
+           .SetParam("beta", beta)
+           .SetParam("knorm", knorm)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Get output from a symbol and pass 1 gradient back. This is used as a terminal
+ *        loss if unary and binary operators are used to composite a loss with no
+ * \param data Input data.
+ * \param grad_scale gradient scale as a supplement to unary and binary operators
+ * \param valid_thresh regard element valid when x > valid_thresh, this is used only in
+ * \param normalization If set to null, op will not normalize on output gradient. If set
+ *        to batch, op will normalize gradient by dividing by batch size. If set to
+ *        valid, op
+ * \return new symbol
+ */
+inline Symbol MakeLoss(Symbol data,
+                       mx_float grad_scale = 1,
+                       mx_float valid_thresh = 0,
+                       MakeLossNormalization normalization = MakeLossNormalization::null) {
+  static const char *MakeLossNormalizationValues[] = {
+    "batch",
+    "null",
+    "valid"
+  };
+  return Operator("MakeLoss")
+           .SetParam("grad_scale", grad_scale)
+           .SetParam("valid_thresh", valid_thresh)
+           .SetParam("normalization", MakeLossNormalizationValues[int(normalization)])
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Perform pooling on the input.
+ *
+ * The shapes for 2-D pooling are
+ *
+ * - **data**: *(batch_size, channel, height, width)*
+ * - **out**: *(batch_size, num_filter, out_height, out_width)*, with::
+ *
+ *     out_height = f(height, kernel[0], pad[0], stride[0])
+ *     out_width = f(width, kernel[1], pad[1], stride[1])
+ *
+ * The definition of *f* depends on ``pooling_convention``, which has two options:
+ *
+ * - **valid** (default)::
+ *
+ *     f(x, k, p, s) = floor((x+2*p-k)/s)+1
+ *
+ * - **full**, which is compatible with Caffe::
+ *
+ *     f(x, k, p, s) = ceil((x+2*p-k)/s)+1
+ *
+ * If ``global_pool`` is set to be true, then do a global pooling, namely reset
+ * ``kernel=(height, width)``.
+ *
+ * Three pooling options are supported by ``pool_type``:
+ *
+ * - **avg**: average pooling
+ * - **max**: max pooling
+ * - **sum**: sum pooling
+ *
+ * 1-D pooling is a special case of 2-D pooling with *width=1* and
+ * *kernel[1]=1*.
+ *
+ * For 3-D pooling, an additional *depth* dimension is added before
+ * *height*. Namely the input data will have shape *(batch_size, channel, depth,
+ * height, width)*.
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/pooling.cc:L122
+ * \param data Input data to the pooling operator.
+ * \param kernel pooling kernel size: (y, x) or (d, y, x)
+ * \param pool_type Pooling type to be applied.
+ * \param global_pool Ignore kernel size, do global pooling based on current input
+ * \param pooling_convention Pooling convention to be applied.
+ * \param stride stride: for pooling (y, x) or (d, y, x)
+ * \param pad pad for pooling: (y, x) or (d, y, x)
+ * \return new symbol
+ */
+inline Symbol Pooling(Symbol data,
+                      Shape kernel,
+                      PoolingPoolType pool_type,
+                      bool global_pool = false,
+                      PoolingPoolingConvention pooling_convention = PoolingPoolingConvention::valid,
+                      Shape stride = Shape(),
+                      Shape pad = Shape()) {
+  static const char *PoolingPoolTypeValues[] = {
+    "avg",
+    "max",
+    "sum"
+  };
+  static const char *PoolingPoolingConventionValues[] = {
+    "full",
+    "valid"
+  };
+  return Operator("Pooling")
+           .SetParam("kernel", kernel)
+           .SetParam("pool_type", PoolingPoolTypeValues[int(pool_type)])
+           .SetParam("global_pool", global_pool)
+           .SetParam("pooling_convention", PoolingPoolingConventionValues[int(pooling_convention)])
+           .SetParam("stride", stride)
+           .SetParam("pad", pad)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Use linear regression for the final output; this is used on the final output
+ *        of a net.
+ * \param data Input data to function.
+ * \param label Input label to function.
+ * \param grad_scale Scale the gradient by a float factor
+ * \return new symbol
+ */
+inline Symbol LinearRegressionOutput(Symbol data,
+                                     Symbol label,
+                                     mx_float grad_scale = 1) {
+  return Operator("LinearRegressionOutput")
+           .SetParam("grad_scale", grad_scale)
+           .SetInput("data", data)
+           .SetInput("label", label)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Use mean absolute error regression for the final output; this is used on the
+ *        final
+ * \param data Input data to function.
+ * \param label Input label to function.
+ * \param grad_scale Scale the gradient by a float factor
+ * \return new symbol
+ */
+inline Symbol MAERegressionOutput(Symbol data,
+                                  Symbol label,
+                                  mx_float grad_scale = 1) {
+  return Operator("MAERegressionOutput")
+           .SetParam("grad_scale", grad_scale)
+           .SetInput("data", data)
+           .SetInput("label", label)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Use logistic regression for the final output; this is used on the final output
+ *        of a net. Logistic regression is suitable for binary classification or
+ *        probability
+ * \param data Input data to function.
+ * \param label Input label to function.
+ * \param grad_scale Scale the gradient by a float factor
+ * \return new symbol
+ */
+inline Symbol LogisticRegressionOutput(Symbol data,
+                                       Symbol label,
+                                       mx_float grad_scale = 1) {
+  return Operator("LogisticRegressionOutput")
+           .SetParam("grad_scale", grad_scale)
+           .SetInput("data", data)
+           .SetInput("label", label)
+           .CreateSymbol();
+}
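+
+/*
+ * Editor's note: an illustrative pooling sketch (``c1`` is a hypothetical
+ * upstream symbol, e.g. the convolution output from the earlier sketch)::
+ *
+ *   Symbol pool = Pooling(c1, Shape(2, 2), PoolingPoolType::max,
+ *                         false,                             // no global pool
+ *                         PoolingPoolingConvention::valid,
+ *                         Shape(2, 2));                      // stride 2 halves H and W
+ */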
+
+/*!
+ * \brief Apply a recurrent layer to input.
+ * \param data Input data to RNN
+ * \param parameters Vector of all RNN trainable parameters concatenated
+ * \param state initial hidden state of the RNN
+ * \param state_cell initial cell state for LSTM networks (only for LSTM)
+ * \param state_size size of the state for each layer
+ * \param num_layers number of stacked layers
+ * \param mode the type of RNN to compute
+ * \param bidirectional whether to use bidirectional recurrent layers
+ * \param p Dropout probability, fraction of the input that gets dropped out at training
+ * \param state_outputs Whether to have the states as symbol outputs.
+ * \return new symbol
+ */
+inline Symbol RNN(Symbol data,
+                  Symbol parameters,
+                  Symbol state,
+                  Symbol state_cell,
+                  uint32_t state_size,
+                  uint32_t num_layers,
+                  RNNMode mode,
+                  bool bidirectional = false,
+                  mx_float p = 0,
+                  bool state_outputs = false) {
+  static const char *RNNModeValues[] = {
+    "gru",
+    "lstm",
+    "rnn_relu",
+    "rnn_tanh"
+  };
+  return Operator("RNN")
+           .SetParam("state_size", state_size)
+           .SetParam("num_layers", num_layers)
+           .SetParam("mode", RNNModeValues[int(mode)])
+           .SetParam("bidirectional", bidirectional)
+           .SetParam("p", p)
+           .SetParam("state_outputs", state_outputs)
+           .SetInput("data", data)
+           .SetInput("parameters", parameters)
+           .SetInput("state", state)
+           .SetInput("state_cell", state_cell)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Performs region-of-interest pooling on inputs. Resize bounding box coordinates
+ *        by spatial_scale and crop input feature maps accordingly. The cropped feature
+ *        maps are pooled by max pooling to a fixed size output indicated by pooled_size.
+ * \param data Input data to the pooling operator, a 4D Feature maps
+ * \param rois Bounding box coordinates, a 2D array of [[batch_index, x1, y1, x2, y2]].
+ *        (x1, y1) and (x2, y2) are the top-left and bottom-right corners of the
+ *        designated region of interest. batch_index indicates the index of the
+ *        corresponding image in the
+ * \param pooled_size fix pooled size: (h, w)
+ * \param spatial_scale Ratio of input feature map height (or w) to raw image height (or
+ * \return new symbol
+ */
+inline Symbol ROIPooling(Symbol data,
+                         Symbol rois,
+                         Shape pooled_size,
+                         mx_float spatial_scale) {
+  return Operator("ROIPooling")
+           .SetParam("pooled_size", pooled_size)
+           .SetParam("spatial_scale", spatial_scale)
+           .SetInput("data", data)
+           .SetInput("rois", rois)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Takes the last element of a sequence. Takes an n-dimensional tensor of the form
+ *        [max sequence length, batchsize, other dims] and returns a (n-1)-dimensional
+ *        tensor of the form [batchsize, other dims]. This operator takes an optional
+ *        input tensor sequence_length of positive ints of dimension [batchsize] when the
+ *        ``use_sequence_length`` option is set to true. This allows the operator to
+ *        handle variable-length sequences. If ``use_sequence_length`` is false, then
+ *        each example in
+ * \param data n-dimensional input tensor of the form [max sequence length, batchsize,
+ * \param sequence_length vector of sequence lengths of size batchsize
+ * \param use_sequence_length If set to true, this layer takes in extra input
+ * \return new symbol
+ */
+inline Symbol SequenceLast(Symbol data,
+                           Symbol sequence_length,
+                           bool use_sequence_length = false) {
+  return Operator("SequenceLast")
+           .SetParam("use_sequence_length", use_sequence_length)
+           .SetInput("data", data)
+           .SetInput("sequence_length", sequence_length)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Sets all elements outside the sequence to a constant value. Takes an
+ * n-dimensional tensor of the form [max sequence length, batchsize, other dims]
+ * and returns a tensor of the same shape. This operator takes an optional input
+ * tensor sequence_length of positive ints of dimension [batchsize] when the
+ * sequence_length option is set to true. This allows the operator to handle
+ * variable-length sequences. If use_sequence_length is false, then each example in
+ * the batch is assumed to have the max sequence length, and this operator becomes
+ * \param data n-dimensional input tensor of the form [max sequence length, batchsize,
+ * \param sequence_length vector of sequence lengths of size batchsize
+ * \param use_sequence_length If set to true, this layer takes in extra input
+ * \param value The value to be used as a mask.
+ * \return new symbol
+ */
+inline Symbol SequenceMask(Symbol data,
+                           Symbol sequence_length,
+                           bool use_sequence_length = false,
+                           mx_float value = 0) {
+  return Operator("SequenceMask")
+           .SetParam("use_sequence_length", use_sequence_length)
+           .SetParam("value", value)
+           .SetInput("data", data)
+           .SetInput("sequence_length", sequence_length)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Reverses the elements of each sequence. Takes an n-dimensional tensor of the
+ * form [max sequence length, batchsize, other dims] and returns a tensor of the
+ * same shape. This operator takes an optional input tensor sequence_length of
+ * positive ints of dimension [batchsize] when the sequence_length option is set
+ * to true. This allows the operator to handle variable-length sequences. If
+ * use_sequence_length is false, then each example in the batch is assumed to have the
+ * \param data n-dimensional input tensor of the form [max sequence length, batchsize,
+ * \param sequence_length vector of sequence lengths of size batchsize
+ * \param use_sequence_length If set to true, this layer takes in extra input
+ * \return new symbol
+ */
+inline Symbol SequenceReverse(Symbol data,
+                              Symbol sequence_length,
+                              bool use_sequence_length = false) {
+  return Operator("SequenceReverse")
+           .SetParam("use_sequence_length", use_sequence_length)
+           .SetInput("data", data)
+           .SetInput("sequence_length", sequence_length)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Apply softmax activation to input. This is intended for internal layers. For
+ * output (loss layer) please use SoftmaxOutput. If mode=instance, this operator
+ * will compute a softmax for each instance in the batch; this is the default
+ * mode. If mode=channel, this operator will compute a num_channel-class softmax
+ * at each position of each instance; this can be used for fully convolutional
+ * \param data Input data to activation function.
+ * \param mode Softmax Mode. If set to instance, this operator will compute a softmax for
+ * each instance in the batch; this is the default mode. If set to channel, this
+ * operator will compute a num_channel-class softmax at each position of each
+ * instance; this can be used for fully convolutional network, image segmentation,
+ * \return new symbol
+ */
+inline Symbol SoftmaxActivation(Symbol data,
+                                SoftmaxActivationMode mode = SoftmaxActivationMode::instance) {
+  static const char *SoftmaxActivationModeValues[] = {
+    "channel",
+    "instance"
+  };
+  return Operator("SoftmaxActivation")
+           .SetParam("mode", SoftmaxActivationModeValues[int(mode)])
+           .SetInput("data", data)
+           .CreateSymbol();
+}
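+
+// --- Usage sketch (editorial illustration, not generated code) ---
+// Channel-mode softmax for a fully convolutional net: one distribution over
+// channels at every spatial position.
+//
+//   Symbol feat  = Symbol::Variable("feature_map");   // (batch, C, H, W)
+//   Symbol probs = SoftmaxActivation(feat, SoftmaxActivationMode::channel);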
+
+/*!
+ * \brief Softmax with logit loss.
+ *
+ * In the forward pass, the softmax output is returned. Assume the input data has
+ * shape *(n,k)*, then the output will have the same shape as the input, which is
+ *
+ * .. math::
+ *   out[i,:] = softmax(data[i,:])
+ *
+ * for :math:`i=0,...,n-1`, where
+ *
+ * .. math::
+ *   softmax(x) = \left[..., \frac{exp(x[j])}{exp(x[0])+...+exp(x[k-1])}, ...\right]
+ *
+ * For a general *N*-D input array with shape :math:`(d_1, ..., d_n)`, denote
+ * :math:`s=d_1d_2...d_n`. The way the softmax is computed varies:
+ *
+ * - ``preserve_shape`` is false (default). Reshape the input into a 2-D array with
+ *   shape :math:`(d_1, s/d_1)` before computing the softmax, then reshape back to the
+ *   original shape.
+ *
+ * - ``preserve_shape`` is true. For all :math:`i_1, ..., i_{n-1}`, compute
+ *
+ * .. math::
+ *   out[i_1, ..., i_{n-1}, :] = softmax(data[i_1, ..., i_{n-1},:])
+ *
+ * - ``multi_output`` is true. For all :math:`i_1, ..., i_{n-1}`, compute
+ *
+ * .. math::
+ *   out[i_1, :, ..., i_{n-1}] = softmax(data[i_1, :, ..., i_{n-1}])
+ *
+ * In the backward pass, the logit loss, also called cross-entropy loss, is
+ * added. The provided label can be an *(N-1)*-D label index array or an *N*-D label
+ * probability array.
+ *
+ * Examples with a particular label can be ignored during backward by specifying
+ * ``ignore_label`` (``use_ignore`` must also be set to true).
+ *
+ * A scale can be applied to the gradient by ``grad_scale``, which is often used in
+ * multi-loss objective functions to give each loss a different weight. The operator
+ * also supports several ways to normalize the gradient via ``normalization``:
+ *
+ * - **null**: do nothing
+ * - **batch**: divide by batch size (number of examples)
+ * - **valid**: divide by the number of examples which are not ignored.
+ *
+ *
+ * Defined in /home/xlidc/mxnet/src/operator/softmax_output.cc:L77
+ * \param data Input data.
+ * \param label Ground truth label.
+ * \param grad_scale Scale the gradient by a float factor
+ * \param ignore_label the labels with value equal to ``ignore_label`` will be ignored
+ * \param multi_output If set to true, softmax will be applied on axis 1
+ * \param use_ignore If set to true, the ignore_label value will not contribute to the
+ * \param preserve_shape If true, softmax will be applied on the last axis
+ * \param normalization Normalize the gradient
+ * \param out_grad Apply weighting from output gradient
+ * \return new symbol
+ */
+inline Symbol SoftmaxOutput(Symbol data,
+                            Symbol label,
+                            mx_float grad_scale = 1,
+                            mx_float ignore_label = -1,
+                            bool multi_output = false,
+                            bool use_ignore = false,
+                            bool preserve_shape = false,
+                            SoftmaxOutputNormalization normalization = SoftmaxOutputNormalization::null,
+                            bool out_grad = false) {
+  static const char *SoftmaxOutputNormalizationValues[] = {
+    "batch",
+    "null",
+    "valid"
+  };
+  return Operator("SoftmaxOutput")
+           .SetParam("grad_scale", grad_scale)
+           .SetParam("ignore_label", ignore_label)
+           .SetParam("multi_output", multi_output)
+           .SetParam("use_ignore", use_ignore)
+           .SetParam("preserve_shape", preserve_shape)
+           .SetParam("normalization", SoftmaxOutputNormalizationValues[int(normalization)])
+           .SetParam("out_grad", out_grad)
+           .SetInput("data", data)
+           .SetInput("label", label)
+           .CreateSymbol();
+}
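+
+// --- Usage sketch (editorial illustration, not generated code) ---
+// The standard classification head; "scores" stands for any symbol producing
+// one score per class.
+//
+//   Symbol scores = Symbol::Variable("scores");
+//   Symbol label  = Symbol::Variable("label");
+//   Symbol out    = SoftmaxOutput(scores, label);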
+
+/*!
+ * \brief DEPRECATED: Perform a softmax transformation on input. Please use SoftmaxOutput
+ * \param data Input data to softmax.
+ * \param grad_scale Scale the gradient by a float factor
+ * \param ignore_label the labels with value equal to ``ignore_label`` will be ignored
+ * \param multi_output If set to true, softmax will be applied on axis 1
+ * \param use_ignore If set to true, the ignore_label value will not contribute to the
+ * \param preserve_shape If true, softmax will be applied on the last axis
+ * \param normalization Normalize the gradient
+ * \param out_grad Apply weighting from output gradient
+ * \return new symbol
+ */
+inline Symbol Softmax(Symbol data,
+                      mx_float grad_scale = 1,
+                      mx_float ignore_label = -1,
+                      bool multi_output = false,
+                      bool use_ignore = false,
+                      bool preserve_shape = false,
+                      SoftmaxNormalization normalization = SoftmaxNormalization::null,
+                      bool out_grad = false) {
+  static const char *SoftmaxNormalizationValues[] = {
+    "batch",
+    "null",
+    "valid"
+  };
+  return Operator("Softmax")
+           .SetParam("grad_scale", grad_scale)
+           .SetParam("ignore_label", ignore_label)
+           .SetParam("multi_output", multi_output)
+           .SetParam("use_ignore", use_ignore)
+           .SetParam("preserve_shape", preserve_shape)
+           .SetParam("normalization", SoftmaxNormalizationValues[int(normalization)])
+           .SetParam("out_grad", out_grad)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Apply a spatial transformer to the input feature map.
+ * \param data Input data to the SpatialTransformerOp.
+ * \param loc localisation net, the output dim should be 6 when transform_type is affine.
+ * \param transform_type transformation type
+ * \param sampler_type sampling type
+ * \param target_shape output shape(h, w) of spatial transformer: (y, x)
+ * \return new symbol
+ */
+inline Symbol SpatialTransformer(Symbol data,
+                                 Symbol loc,
+                                 SpatialTransformerTransformType transform_type,
+                                 SpatialTransformerSamplerType sampler_type,
+                                 Shape target_shape = Shape(0,0)) {
+  static const char *SpatialTransformerTransformTypeValues[] = {
+    "affine"
+  };
+  static const char *SpatialTransformerSamplerTypeValues[] = {
+    "bilinear"
+  };
+  return Operator("SpatialTransformer")
+           .SetParam("transform_type", SpatialTransformerTransformTypeValues[int(transform_type)])
+           .SetParam("sampler_type", SpatialTransformerSamplerTypeValues[int(sampler_type)])
+           .SetParam("target_shape", target_shape)
+           .SetInput("data", data)
+           .SetInput("loc", loc)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Support Vector Machine based transformation on input, backprop L2-SVM
+ * \param data Input data to svm.
+ * \param label Label data.
+ * \param margin Scale the DType(param_.margin) for activation size
+ * \param regularization_coefficient Scale the coefficient responsible for balancing
+ * \param use_linear If set to true, uses L1-SVM objective function. Default uses L2-SVM
+ * \return new symbol
+ */
+inline Symbol SVMOutput(Symbol data,
+                        Symbol label,
+                        mx_float margin = 1,
+                        mx_float regularization_coefficient = 1,
+                        bool use_linear = false) {
+  return Operator("SVMOutput")
+           .SetParam("margin", margin)
+           .SetParam("regularization_coefficient", regularization_coefficient)
+           .SetParam("use_linear", use_linear)
+           .SetInput("data", data)
+           .SetInput("label", label)
+           .CreateSymbol();
+}
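+
+// --- Usage sketch (editorial illustration, not generated code) ---
+// An L2-SVM output layer as an alternative to a softmax head.
+//
+//   Symbol scores = Symbol::Variable("scores");
+//   Symbol label  = Symbol::Variable("label");
+//   Symbol svm    = SVMOutput(scores, label, 1.0f, 1.0f, false);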
+
+/*!
+ * \brief Generate a sampling grid for bilinear sampling.
+ * \param data Input data to the GridGeneratorOp.
+ * \param transform_type transformation type
+ * if transformation type is affine, data is affine matrix : (batch, 6)
+ * if transformation type is warp, data is optical flow : (batch, 2, h, w)
+ * \param target_shape if transformation type is affine, the operator needs a target_shape
+ * if transformation type is warp, the operator will ignore target_shape
+ * \return new symbol
+ */
+inline Symbol GridGenerator(Symbol data,
+                            GridGeneratorTransformType transform_type,
+                            Shape target_shape = Shape(0,0)) {
+  static const char *GridGeneratorTransformTypeValues[] = {
+    "affine",
+    "warp"
+  };
+  return Operator("GridGenerator")
+           .SetParam("transform_type", GridGeneratorTransformTypeValues[int(transform_type)])
+           .SetParam("target_shape", target_shape)
+           .SetInput("data", data)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Choose one element from each line (row for Python, column for R/Julia) in lhs
+ * according to the index indicated by rhs. This function assumes rhs uses 0-based
+ * \param lhs Left operand to the function.
+ * \param rhs Right operand to the function.
+ * \return new symbol
+ */
+inline Symbol choose_element_0index(Symbol lhs,
+                                    Symbol rhs) {
+  return Operator("choose_element_0index")
+           .SetInput("lhs", lhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol();
+}
+
+/*!
+ * \brief Fill one element of each line (row for Python, column for R/Julia) in lhs
+ * according to the index indicated by rhs and the values indicated by mhs. This function
+ * \param lhs Left operand to the function.
+ * \param mhs Middle operand to the function.
+ * \param rhs Right operand to the function.
+ * \return new symbol
+ */
+inline Symbol fill_element_0index(Symbol lhs,
+                                  Symbol mhs,
+                                  Symbol rhs) {
+  return Operator("fill_element_0index")
+           .SetInput("lhs", lhs)
+           .SetInput("mhs", mhs)
+           .SetInput("rhs", rhs)
+           .CreateSymbol();
+}
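+
+// --- Usage sketch (editorial illustration, not generated code) ---
+// Pick one element per row of lhs with 0-based indices, then write values
+// back at those positions.
+//
+//   Symbol lhs = Symbol::Variable("lhs");   // (batch, k)
+//   Symbol idx = Symbol::Variable("idx");   // (batch,) 0-based indices
+//   Symbol val = Symbol::Variable("val");   // (batch,) values to write
+//   Symbol picked = choose_element_0index(lhs, idx);
+//   Symbol filled = fill_element_0index(lhs, val, idx);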
+
+} //namespace cpp
+} //namespace mxnet
+#endif //ifndef _MXNETOP_H
diff --git a/cpp-package/include/mxnet-cpp/op_map.h b/cpp-package/include/mxnet-cpp/op_map.h
new file mode 100644
index 000000000000..4f8c58664df7
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/op_map.h
@@ -0,0 +1,92 @@
+/*!
+* Copyright (c) 2016 by Contributors
+* \file op_map.h
+* \brief definition of OpMap
+* \author Chuntao Hong
+*/
+
+#ifndef MXNETCPP_MXNET_H
+#define MXNETCPP_MXNET_H
+
+#include <map>
+#include <string>
+#include "mxnet-cpp/base.h"
+#include "dmlc/logging.h"
+
+namespace mxnet {
+namespace cpp {
+
+/*!
+* \brief OpMap instance holds a map of all the symbol creators so we can
+* get symbol creators by name.
+* This is used internally by Symbol and Operator.
+*/
+class OpMap {
+ public:
+  /*!
+   * \brief Create an OpMap instance
+   */
+  inline OpMap() {
+    mx_uint num_symbol_creators = 0;
+    AtomicSymbolCreator *symbol_creators = nullptr;
+    int r =
+        MXSymbolListAtomicSymbolCreators(&num_symbol_creators, &symbol_creators);
+    CHECK_EQ(r, 0);
+    for (mx_uint i = 0; i < num_symbol_creators; i++) {
+      const char *name;
+      const char *description;
+      mx_uint num_args;
+      const char **arg_names;
+      const char **arg_type_infos;
+      const char **arg_descriptions;
+      const char *key_var_num_args;
+      r = MXSymbolGetAtomicSymbolInfo(symbol_creators[i], &name, &description,
+                                      &num_args, &arg_names, &arg_type_infos,
+                                      &arg_descriptions, &key_var_num_args);
+      CHECK_EQ(r, 0);
+      symbol_creators_[name] = symbol_creators[i];
+    }
+
+    nn_uint num_ops;
+    const char **op_names;
+    r = NNListAllOpNames(&num_ops, &op_names);
+    CHECK_EQ(r, 0);
+    for (nn_uint i = 0; i < num_ops; i++) {
+      OpHandle handle;
+      r = NNGetOpHandle(op_names[i], &handle);
+      CHECK_EQ(r, 0);
+      op_handles_[op_names[i]] = handle;
+    }
+  }
+
+  /*!
+   * \brief Get a symbol creator with its name.
+   *
+   * \param name name of the symbol creator
+   * \return handle to the symbol creator
+   */
+  inline AtomicSymbolCreator GetSymbolCreator(const std::string &name) {
+    if (symbol_creators_.count(name) == 0)
+      return GetOpHandle(name);
+    return symbol_creators_[name];
+  }
+
+  /*!
+   * \brief Get an op handle with its name.
+   *
+   * \param name name of the op
+   * \return handle to the op
+   */
+  inline OpHandle GetOpHandle(const std::string &name) {
+    return op_handles_[name];
+  }
+
+ private:
+  std::map<std::string, AtomicSymbolCreator> symbol_creators_;
+  std::map<std::string, OpHandle> op_handles_;
+};
+
+} // namespace cpp
+} // namespace mxnet
+
+#endif // MXNETCPP_MXNET_H
diff --git a/cpp-package/include/mxnet-cpp/op_suppl.h b/cpp-package/include/mxnet-cpp/op_suppl.h
new file mode 100644
index 000000000000..8c4405286b60
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/op_suppl.h
@@ -0,0 +1,188 @@
+/*!
+* Copyright (c) 2016 by Contributors
+* \file op_suppl.h
+* \brief A supplement and amendment of the operators from op.h
+* \author Zhang Chen, zhubuntu, Xin Li
+*/
+
+#ifndef OP_SUPPL_H
+#define OP_SUPPL_H
+
+#include <cassert>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/base.h"
+#include "mxnet-cpp/shape.h"
+#include "mxnet-cpp/operator.h"
+#include "mxnet-cpp/MxNetCpp.h"
+
+namespace mxnet {
+namespace cpp {
+
+inline Symbol _Plus(Symbol lhs, Symbol rhs) {
+  return Operator("_Plus")(lhs, rhs)
+           .CreateSymbol();
+}
+inline Symbol _Mul(Symbol lhs, Symbol rhs) {
+  return Operator("_Mul")(lhs, rhs)
+           .CreateSymbol();
+}
+inline Symbol _Minus(Symbol lhs, Symbol rhs) {
+  return Operator("_Minus")(lhs, rhs)
+           .CreateSymbol();
+}
+inline Symbol _Div(Symbol lhs, Symbol rhs) {
+  return Operator("_Div")(lhs, rhs)
+           .CreateSymbol();
+}
+inline Symbol _Power(Symbol lhs, Symbol rhs) {
+  return Operator("_Power")(lhs, rhs)
+           .CreateSymbol();
+}
+inline Symbol _Maximum(Symbol lhs, Symbol rhs) {
+  return Operator("_Maximum")(lhs, rhs)
+           .CreateSymbol();
+}
+inline Symbol _Minimum(Symbol lhs, Symbol rhs) {
+  return Operator("_Minimum")(lhs, rhs)
+           .CreateSymbol();
+}
+inline Symbol _PlusScalar(Symbol lhs, mx_float scalar) {
+  return Operator("_PlusScalar")(lhs)
+           .SetParam("scalar", scalar)
+           .CreateSymbol();
+}
+inline Symbol _MinusScalar(Symbol lhs, mx_float scalar) {
+  return Operator("_MinusScalar")(lhs)
+           .SetParam("scalar", scalar)
+           .CreateSymbol();
+}
+inline Symbol _RMinusScalar(mx_float scalar, Symbol rhs) {
+  return Operator("_RMinusScalar")(rhs)
+           .SetParam("scalar", scalar)
+           .CreateSymbol();
+}
+inline Symbol _MulScalar(Symbol lhs, mx_float scalar) {
+  return Operator("_MulScalar")(lhs)
+           .SetParam("scalar", scalar)
+           .CreateSymbol();
+}
+inline Symbol _DivScalar(Symbol lhs, mx_float scalar) {
+  return Operator("_DivScalar")(lhs)
+           .SetParam("scalar", scalar)
+           .CreateSymbol();
+}
+inline Symbol _RDivScalar(mx_float scalar, Symbol rhs) {
+  return Operator("_RDivScalar")(rhs)
+           .SetParam("scalar", scalar)
+           .CreateSymbol();
+}
+inline Symbol _PowerScalar(Symbol lhs, mx_float scalar) {
+  return Operator("_PowerScalar")(lhs)
+           .SetParam("scalar", scalar)
+           .CreateSymbol();
+}
+inline Symbol _RPowerScalar(mx_float scalar, Symbol rhs) {
+  return Operator("_RPowerScalar")(rhs)
+           .SetParam("scalar", scalar)
+           .CreateSymbol();
+}
+inline Symbol _MaximumScalar(Symbol lhs, mx_float scalar) {
+  return Operator("_MaximumScalar")(lhs)
+           .SetParam("scalar", scalar)
+           .CreateSymbol();
+}
+inline Symbol _MinimumScalar(Symbol lhs, mx_float scalar) {
+  return Operator("_MinimumScalar")(lhs)
+           .SetParam("scalar", scalar)
+           .CreateSymbol();
+}
+// TODO(zhangcheng-qinyinghua)
+// make crop function run in op.h
+// This function is due to [zhubuntu](https://github.com/zhubuntu)
+inline Symbol Crop(const std::string& symbol_name,
+                   int num_args,
+                   Symbol data,
+                   Symbol crop_like,
+                   Shape offset = Shape(0, 0),
+                   Shape h_w = Shape(0, 0),
+                   bool center_crop = false) {
+  return Operator("Crop")
+           .SetParam("num_args", num_args)
+           .SetParam("offset", offset)
+           .SetParam("h_w", h_w)
+           .SetParam("center_crop", center_crop)
+           .SetInput("arg0", data)
+           .SetInput("arg1", crop_like)
+           .CreateSymbol(symbol_name);
+}
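+
+// --- Usage sketch (editorial illustration, not part of the original header) ---
+// The _*Scalar helpers back Symbol's arithmetic operators and can also be
+// called directly:
+//
+//   Symbol x = Symbol::Variable("x");
+//   Symbol y = _PlusScalar(_MulScalar(x, 2.0f), 1.0f);   // y = 2*x + 1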
+
+
+/*!
+ * \brief Slice input equally along specified axis.
+ * \param data input symbol.
+ * \param num_outputs Number of outputs to be sliced.
+ * \param axis Dimension along which to slice.
+ * \param squeeze_axis If true AND the sliced dimension becomes 1, squeeze that dimension.
+ * \return new symbol
+ */
+inline Symbol SliceChannel(Symbol data,
+                           int num_outputs,
+                           int axis = 1,
+                           bool squeeze_axis = false) {
+  return Operator("SliceChannel")
+           .SetParam("num_outputs", num_outputs)
+           .SetParam("axis", axis)
+           .SetParam("squeeze_axis", squeeze_axis) (data)
+           .CreateSymbol();
+}
+
+
+/*!
+ * \brief Slice input equally along specified axis.
+ * \param symbol_name name of the resulting symbol.
+ * \param data input symbol.
+ * \param num_outputs Number of outputs to be sliced.
+ * \param axis Dimension along which to slice.
+ * \param squeeze_axis If true AND the sliced dimension becomes 1, squeeze that dimension.
+ * \return new symbol
+ */
+inline Symbol SliceChannel(const std::string& symbol_name,
+                           Symbol data,
+                           int num_outputs,
+                           int axis = 1,
+                           bool squeeze_axis = false) {
+  return Operator("SliceChannel")
+           .SetParam("num_outputs", num_outputs)
+           .SetParam("axis", axis)
+           .SetParam("squeeze_axis", squeeze_axis) (data)
+           .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Apply activation function to input.
+ * Softmax Activation is only available with CUDNN on GPU and will be
+ * computed at each location across channel if input is 4D.
+ * \param symbol_name name of the resulting symbol.
+ * \param data Input data to activation function.
+ * \param act_type Activation function to be applied.
+ * \return new symbol
+ */
+inline Symbol Activation(const std::string& symbol_name,
+                         Symbol data,
+                         const std::string& act_type) {
+  assert(act_type == "relu" ||
+         act_type == "sigmoid" ||
+         act_type == "softrelu" ||
+         act_type == "tanh");
+  return Operator("Activation")
+           .SetParam("act_type", act_type.c_str())
+           .SetInput("data", data)
+           .CreateSymbol(symbol_name);
+}
+
+} // namespace cpp
+} // namespace mxnet
+
+#endif /* end of include guard: OP_SUPPL_H */
+
diff --git a/cpp-package/include/mxnet-cpp/operator.h b/cpp-package/include/mxnet-cpp/operator.h
new file mode 100644
index 000000000000..66aec7fa0eda
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/operator.h
@@ -0,0 +1,188 @@
+/*!
+* Copyright (c) 2016 by Contributors
+* \file operator.h
+* \brief definition of operator
+* \author Chuntao Hong, Zhang Chen
+*/
+
+#ifndef MXNETCPP_OPERATOR_H
+#define MXNETCPP_OPERATOR_H
+
+#include <map>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/base.h"
+#include "mxnet-cpp/op_map.h"
+#include "mxnet-cpp/symbol.h"
+
+namespace mxnet {
+namespace cpp {
+class Mxnet;
+/*!
+* \brief Operator interface
+*/
+class Operator {
+ public:
+  /*!
+   * \brief Operator constructor
+   * \param operator_name type of the operator
+   */
+  explicit Operator(const std::string &operator_name);
+  Operator &operator=(const Operator &rhs);
+  /*!
+   * \brief set config parameters
+   * \param name name of the config parameter
+   * \param value value of the config parameter
+   * \return reference of self
+   */
+  template <typename T>
+  Operator &SetParam(const std::string &name, const T &value) {
+    std::string value_str;
+    std::stringstream ss;
+    ss << value;
+    ss >> value_str;
+
+    params_[name] = value_str;
+    return *this;
+  }
+  /*!
+   * \brief set config parameters from positional inputs
+   * \param pos the position of parameter
+   * \param value value of the config parameter
+   * \return reference of self
+   */
+  template <typename T>
+  Operator &SetParam(int pos, const T &value) {
+    std::string value_str;
+    std::stringstream ss;
+    ss << value;
+    ss >> value_str;
+
+    params_[arg_names_[pos]] = value_str;
+    return *this;
+  }
+  /*!
+ * \brief add an input symbol + * \param name name of the input symbol + * \param symbol the input symbol + * \return reference of self + */ + Operator &SetInput(const std::string &name, Symbol symbol); + /*! + * \brief add an input symbol + * \param symbol the input symbol + */ + template + void PushInput(const Symbol &symbol) { + input_symbols.push_back(symbol.GetHandle()); + } + /*! + * \brief add input symbols + * \return reference of self + */ + Operator &operator()() { return *this; } + /*! + * \brief add input symbols + * \param symbol the input symbol + * \return reference of self + */ + Operator &operator()(const Symbol &symbol) { + input_symbols.push_back(symbol.GetHandle()); + return *this; + } + /*! + * \brief add a list of input symbols + * \param symbols the vector of the input symbols + * \return reference of self + */ + Operator &operator()(const std::vector &symbols) { + for (auto &s : symbols) { + input_symbols.push_back(s.GetHandle()); + } + return *this; + } + /*! + * \brief create a Symbol from the current operator + * \param name the name of the operator + * \return the operator Symbol + */ + Symbol CreateSymbol(const std::string &name = ""); + + /*! + * \brief add an input ndarray + * \param name name of the input ndarray + * \param ndarray the input ndarray + * \return reference of self + */ + Operator &SetInput(const std::string &name, NDArray ndarray); + /*! + * \brief add an input ndarray + * \param ndarray the input ndarray + */ + template + void PushInput(const NDArray &ndarray) { + input_ndarrays.push_back(ndarray.GetHandle()); + } + /*! + * \brief add positional inputs + */ + template + void PushInput(const T &t, Args... args) { + SetParam(N, t); + PushInput(args...); + } + /*! + * \brief add the last positional input + */ + template + void PushInput(const T &t) { + SetParam(N, t); + } + /*! + * \brief add input ndarrays + * \param ndarray the input ndarray + * \return reference of self + */ + Operator &operator()(const NDArray &ndarray) { + input_ndarrays.push_back(ndarray.GetHandle()); + return *this; + } + /*! + * \brief add a list of input ndarrays + * \param ndarrays the vector of the input ndarrays + * \return reference of self + */ + Operator &operator()(const std::vector &ndarrays) { + for (auto &s : ndarrays) { + input_ndarrays.push_back(s.GetHandle()); + } + return *this; + } + /*! + * \brief add input ndarrays + * \return reference of self + */ + template + Operator &operator()(Args... args) { + PushInput(args...); + return *this; + } + std::vector Invoke(); + void Invoke(NDArray &output); + void Invoke(std::vector &outputs); + + private: + std::map params_desc_; + bool variable_params_ = false; + std::map params_; + std::vector input_symbols; + std::vector input_ndarrays; + std::vector input_keys; + std::vector arg_names_; + AtomicSymbolCreator handle_; + static OpMap *op_map_; +}; +} // namespace cpp +} // namespace mxnet + +#endif // MXNETCPP_OPERATOR_H diff --git a/cpp-package/include/mxnet-cpp/operator.hpp b/cpp-package/include/mxnet-cpp/operator.hpp new file mode 100644 index 000000000000..3c8d1afe9b5f --- /dev/null +++ b/cpp-package/include/mxnet-cpp/operator.hpp @@ -0,0 +1,155 @@ +/*! 
+* Copyright (c) 2016 by Contributors +* \file operator.hpp +* \brief implementation of operator +* \author Chuntao Hong, Zhang Chen +*/ + +#ifndef MXNETCPP_OPERATOR_HPP +#define MXNETCPP_OPERATOR_HPP + +#include +#include +#include +#include +#include "mxnet-cpp/base.h" +#include "mxnet-cpp/op_map.h" +#include "mxnet-cpp/operator.h" + +namespace mxnet { +namespace cpp { + +/* + * Pushing NDArray or Symbol as inputs here to avoid partial specialization + * like PushInput, which is not allowed in C++ + */ +template <> +Operator& Operator::SetParam(int pos, const NDArray &value) { + input_ndarrays.push_back(value.GetHandle()); + return *this; +} +template <> +Operator& Operator::SetParam(int pos, const Symbol &value) { + input_symbols.push_back(value.GetHandle()); + return *this; +} + +OpMap *Operator::op_map_ = new OpMap(); + +Operator::Operator(const std::string &operator_name) { + handle_ = op_map_->GetSymbolCreator(operator_name); + const char *name; + const char *description; + mx_uint num_args; + const char **arg_names; + const char **arg_type_infos; + const char **arg_descriptions; + const char *key_var_num_args; + MXSymbolGetAtomicSymbolInfo(handle_, + &name, + &description, + &num_args, + &arg_names, + &arg_type_infos, + &arg_descriptions, + &key_var_num_args); + for (mx_uint i = 0; i < num_args; ++i) { + arg_names_.push_back(arg_names[i]); + } +} + +Symbol Operator::CreateSymbol(const std::string &name) { + if (input_keys.size() > 0) { + CHECK_EQ(input_keys.size(), input_symbols.size()); + } + const char *pname = name == "" ? nullptr : name.c_str(); + + SymbolHandle symbol_handle; + std::vector input_keys; + std::vector param_keys; + std::vector param_values; + + for (auto &data : params_) { + param_keys.push_back(data.first.c_str()); + param_values.push_back(data.second.c_str()); + } + for (auto &data : this->input_keys) { + input_keys.push_back(data.c_str()); + } + const char **input_keys_p = + (input_keys.size() > 0) ? 
input_keys.data() : nullptr; + + MXSymbolCreateAtomicSymbol(handle_, param_keys.size(), param_keys.data(), + param_values.data(), &symbol_handle); + MXSymbolCompose(symbol_handle, pname, input_symbols.size(), input_keys_p, + input_symbols.data()); + return Symbol(symbol_handle); +} + +void Operator::Invoke(std::vector &outputs) { + if (input_keys.size() > 0) { + CHECK_EQ(input_keys.size(), input_ndarrays.size()); + } + + std::vector input_keys; + std::vector param_keys; + std::vector param_values; + + for (auto &data : params_) { + param_keys.push_back(data.first.c_str()); + param_values.push_back(data.second.c_str()); + } + + int num_inputs = input_ndarrays.size(); + int num_outputs = outputs.size(); + std::vector output_handles; + std::transform(outputs.begin(), outputs.end(), + std::back_inserter(output_handles), [](NDArray& a) { + return a.GetHandle(); + }); + + NDArrayHandle *outputs_receiver = nullptr; + if (num_outputs > 0) { + outputs_receiver = output_handles.data(); + } + + MXImperativeInvoke(handle_, num_inputs, input_ndarrays.data(), + &num_outputs, &outputs_receiver, + param_keys.size(), param_keys.data(), param_values.data()); + + if (outputs.size() > 0) + return; + + std::transform(outputs_receiver, outputs_receiver+num_outputs, + std::back_inserter(outputs), [](const NDArrayHandle& handle) { + return NDArray(handle); + }); +} + +std::vector Operator::Invoke() { + std::vector outputs; + Invoke(outputs); + return outputs; +} + +void Operator::Invoke(NDArray &output) { + std::vector outputs{output}; + Invoke(outputs); +} + +Operator &Operator::SetInput(const std::string &name, Symbol symbol) { + input_keys.push_back(name.c_str()); + input_symbols.push_back(symbol.GetHandle()); + return *this; +} + +Operator &Operator::SetInput(const std::string &name, NDArray ndarray) { + input_keys.push_back(name.c_str()); + input_ndarrays.push_back(ndarray.GetHandle()); + return *this; +} + +} // namespace cpp +} // namespace mxnet + +#endif // MXNETCPP_OPERATOR_HPP diff --git a/cpp-package/include/mxnet-cpp/optimizer.h b/cpp-package/include/mxnet-cpp/optimizer.h new file mode 100644 index 000000000000..03c0d90c7b97 --- /dev/null +++ b/cpp-package/include/mxnet-cpp/optimizer.h @@ -0,0 +1,122 @@ +/*! +* Copyright (c) 2016 by Contributors +* \file optimizer.h +* \brief definition of optimizer +* \author Chuntao Hong, Zhang Chen +*/ + +#ifndef MXNETCPP_OPTIMIZER_H +#define MXNETCPP_OPTIMIZER_H + +#include +#include +#include +#include +#include +#include "mxnet-cpp/base.h" +#include "dmlc/logging.h" +#include "mxnet-cpp/ndarray.h" +#include "mxnet-cpp/op_map.h" + +namespace mxnet { +namespace cpp { + +/*! +* \brief Optimizer interface +*/ +class Optimizer { + public: + /*! + * \brief get optimizer type + * \return string of optimizer type + */ + virtual std::string GetType() const = 0; + /*! + * \brief destructor + */ + virtual ~Optimizer(); + /*! + * \brief set config parameters + * \param name name of the config parameter + * \param value value of the config parameter + * \return reference of self + */ + template + Optimizer *SetParam(const std::string &name, const T &value) { + std::string value_str; + std::stringstream ss; + ss << value; + ss >> value_str; + + params_[name] = value_str; + return this; + } + /*! + * \brief Update a weight with gradient. + * \param index the unique index for the weight. + * \param weight the weight to update. + * \param grad gradient for the weight. + * \param lr learning rate. + * \param wd weight decay. 
+ */ + void Update(int index, NDArray weight, NDArray grad, mx_float lr, + mx_float wd); + /*! + * \brief Update a weight with gradient. + * \param index the unique index for the weight. + * \param weight the weight to update. + * \param grad gradient for the weight. + */ + virtual void Update(int index, NDArray weight, NDArray grad) = 0; + // TODO(zhangcheng-qinyinghua) + // implement Update a list of arrays, maybe in the form of map + // void Update(int index, std::vector weights, std::vector + // grad, mx_float lr); + + /*! + * \brief Serialize the optimizer parameters to a string. + * \return serialization + */ + std::string Serialize() const; + + protected: + std::map params_; + static OpMap *op_map_; + const std::vector GetParamKeys_() const; + const std::vector GetParamValues_() const; +}; + +typedef std::function OptimizerCreator; + +class OptimizerRegistry { + public: + static Optimizer* Find(const std::string& name); + static int __REGISTER__(const std::string& name, OptimizerCreator creator); + private: + static std::map cmap_; + OptimizerRegistry() = delete; + ~OptimizerRegistry() = delete; +}; + +#define MXNETCPP_REGISTER_OPTIMIZER(Name, OptimizerType) \ + static int __make_ ## OptimizerType ## _ ## Name ## __ = \ + OptimizerRegistry::__REGISTER__(#Name, [](){return new OptimizerType();}) + +class SGDOptimizer : public Optimizer { + public: + SGDOptimizer(); + virtual std::string GetType() const; + virtual void Update(int index, NDArray weight, NDArray grad); + private: + virtual ~SGDOptimizer(); + virtual void CreateState_(int index, NDArray weight); + std::map states_; + AtomicSymbolCreator update_handle_; + AtomicSymbolCreator mom_update_handle_; +}; + + +} // namespace cpp +} // namespace mxnet + +#endif // MXNETCPP_OPTIMIZER_H diff --git a/cpp-package/include/mxnet-cpp/optimizer.hpp b/cpp-package/include/mxnet-cpp/optimizer.hpp new file mode 100644 index 000000000000..94af0ec759a2 --- /dev/null +++ b/cpp-package/include/mxnet-cpp/optimizer.hpp @@ -0,0 +1,134 @@ +/*! 
+* Copyright (c) 2016 by Contributors +* \file optimizer.hpp +* \brief implementation of optimizer +* \author Chuntao Hong, Zhang Chen +*/ + +#ifndef MXNETCPP_OPTIMIZER_HPP +#define MXNETCPP_OPTIMIZER_HPP + +#include +#include +#include +#include +#include +#include +#include "mxnet-cpp/optimizer.h" +#include "mxnet-cpp/op.h" +#include "mxnet-cpp/op_map.h" + +namespace mxnet { +namespace cpp { + +OpMap* Optimizer::op_map_ = new OpMap(); + +std::map OptimizerRegistry::cmap_; + +MXNETCPP_REGISTER_OPTIMIZER(sgd, SGDOptimizer); +MXNETCPP_REGISTER_OPTIMIZER(ccsgd, SGDOptimizer); // For backward compatibility + +Optimizer::~Optimizer() {} + +void Optimizer::Update(int index, NDArray weight, NDArray grad, mx_float lr, + mx_float wd) { + params_["lr"] = std::to_string(lr); + params_["wd"] = std::to_string(wd); + Update(index, weight, grad); +} + +std::string Optimizer::Serialize() const { + using ValueType = std::map::value_type; + auto params = params_; + params.emplace("opt_type", GetType()); + return std::accumulate(params.cbegin(), params.cend(), std::string(""), + [](const std::string& sum, const ValueType& i) { + return sum + '\n' + i.first + '=' + i.second; + }).substr(1); +} + +const std::vector Optimizer::GetParamKeys_() const { + std::vector keys; + for (auto& iter : params_) + keys.push_back(iter.first.c_str()); + return keys; +} + +const std::vector Optimizer::GetParamValues_() const { + std::vector values; + for (auto& iter : params_) + values.push_back(iter.second.c_str()); + return values; +} + +Optimizer* OptimizerRegistry::Find(const std::string& name) { + auto it = cmap_.find(name); + if (it == cmap_.end()) + return nullptr; + return it->second(); +} + +int OptimizerRegistry::__REGISTER__(const std::string& name, OptimizerCreator creator) { + CHECK_EQ(cmap_.count(name), 0) << name << " already registered"; + cmap_.emplace(name, std::move(creator)); + return 0; +} + +std::string SGDOptimizer::GetType() const { + return "sgd"; +} + +SGDOptimizer::SGDOptimizer() { + update_handle_ = op_map_->GetSymbolCreator("sgd_update"); + mom_update_handle_ = op_map_->GetSymbolCreator("sgd_mom_update"); +} + +SGDOptimizer::~SGDOptimizer() { + for (auto &it : states_) { + delete it.second; + } +} + +void SGDOptimizer::Update(int index, NDArray weight, NDArray grad) { + if (states_.count(index) == 0) { + CreateState_(index, weight); + } + + auto keys = GetParamKeys_(); + auto values = GetParamValues_(); + CHECK_EQ(keys.size(), values.size()); + + NDArrayHandle inputs[3]; + inputs[0] = weight.GetHandle(); + inputs[1] = grad.GetHandle(); + + int num_outputs = 1; + NDArrayHandle output = weight.GetHandle(); + NDArrayHandle *outputs = &output; + + if (states_[index] == nullptr) { + MXImperativeInvoke(update_handle_, 2, inputs, + &num_outputs, &outputs, + keys.size(), keys.data(), values.data()); + } else { + inputs[2] = states_[index]->GetHandle(); + MXImperativeInvoke(mom_update_handle_, 3, inputs, + &num_outputs, &outputs, + keys.size(), keys.data(), values.data()); + } +} + +void SGDOptimizer::CreateState_(int index, NDArray weight) { + if (params_.count("momentum") == 0) { + states_[index] = nullptr; + } else { + states_[index] = new NDArray(weight.GetShape(), weight.GetContext()); + *states_[index] = 0; + } +} + + +} // namespace cpp +} // namespace mxnet + +#endif // MXNETCPP_OPTIMIZER_HPP diff --git a/cpp-package/include/mxnet-cpp/shape.h b/cpp-package/include/mxnet-cpp/shape.h new file mode 100644 index 000000000000..c321c9717869 --- /dev/null +++ b/cpp-package/include/mxnet-cpp/shape.h @@ -0,0 
+1,389 @@
+/*!
+* Copyright (c) 2016 by Contributors
+* \file shape.h
+* \brief definition of shape
+* \author Chuntao Hong, Zhang Chen
+*/
+
+#ifndef MXNET_CPP_Shape_H
+#define MXNET_CPP_Shape_H
+
+#include <istream>
+#include <ostream>
+#include <algorithm>
+#include <vector>
+#include "mxnet-cpp/base.h"
+
+namespace mxnet {
+namespace cpp {
+
+/*!
+* \brief dynamic shape class that can hold shape
+* of arbitrary dimension
+*/
+struct Shape {
+ public:
+  /*! \brief constructor */
+  Shape()
+    : ndim_(0),
+      num_heap_allocated_(0),
+      data_heap_(NULL) {}
+  /*!
+   * \brief constructor from a vector of index_t
+   * \param v the vector
+   */
+  explicit Shape(const std::vector<index_t> &v)
+    : ndim_(v.size()) {
+    if (ndim_ <= kStackCache) {
+      data_heap_ = NULL;
+      num_heap_allocated_ = 0;
+      std::copy(v.begin(), v.end(), data_stack_);
+    } else {
+      data_heap_ = new index_t[ndim_];
+      num_heap_allocated_ = ndim_;
+      std::copy(v.begin(), v.end(), data_heap_);
+    }
+  }
+  /*!
+   * \brief constructor for a one-dimension shape
+   * \param s1 size of the first dimension
+   */
+  explicit Shape(index_t s1)
+    : ndim_(1) {
+    if (ndim_ <= kStackCache) {
+      data_heap_ = NULL;
+      num_heap_allocated_ = 0;
+      data_stack_[0] = s1;
+    } else {
+      data_heap_ = new index_t[ndim_];
+      num_heap_allocated_ = ndim_;
+      data_heap_[0] = s1;
+    }
+  }
+  /*!
+   * \brief constructor for a two-dimension shape
+   * \param s1 size of the first dimension
+   * \param s2 size of the second dimension
+   */
+  Shape(index_t s1, index_t s2)
+    : ndim_(2) {
+    if (ndim_ <= kStackCache) {
+      data_heap_ = NULL;
+      num_heap_allocated_ = 0;
+      data_stack_[0] = s1;
+      data_stack_[1] = s2;
+    } else {
+      data_heap_ = new index_t[ndim_];
+      num_heap_allocated_ = ndim_;
+      data_heap_[0] = s1;
+      data_heap_[1] = s2;
+    }
+  }
+  /*!
+   * \brief constructor for a three-dimension shape
+   * \param s1 size of the first dimension
+   * \param s2 size of the second dimension
+   * \param s3 size of the third dimension
+   */
+  Shape(index_t s1, index_t s2, index_t s3)
+    : ndim_(3) {
+    if (ndim_ <= kStackCache) {
+      data_heap_ = NULL;
+      num_heap_allocated_ = 0;
+      data_stack_[0] = s1;
+      data_stack_[1] = s2;
+      data_stack_[2] = s3;
+    } else {
+      data_heap_ = new index_t[ndim_];
+      num_heap_allocated_ = ndim_;
+      data_heap_[0] = s1;
+      data_heap_[1] = s2;
+      data_heap_[2] = s3;
+    }
+  }
+  /*!
+   * \brief constructor for a four-dimension shape
+   * \param s1 size of the first dimension
+   * \param s2 size of the second dimension
+   * \param s3 size of the third dimension
+   * \param s4 size of the fourth dimension
+   */
+  Shape(index_t s1, index_t s2, index_t s3, index_t s4)
+    : ndim_(4) {
+    if (ndim_ <= kStackCache) {
+      data_heap_ = NULL;
+      num_heap_allocated_ = 0;
+      data_stack_[0] = s1;
+      data_stack_[1] = s2;
+      data_stack_[2] = s3;
+      data_stack_[3] = s4;
+    } else {
+      data_heap_ = new index_t[ndim_];
+      num_heap_allocated_ = ndim_;
+      data_heap_[0] = s1;
+      data_heap_[1] = s2;
+      data_heap_[2] = s3;
+      data_heap_[3] = s4;
+    }
+  }
+  /*!
+   * \brief constructor for a five-dimension shape
+   * \param s1 size of the first dimension
+   * \param s2 size of the second dimension
+   * \param s3 size of the third dimension
+   * \param s4 size of the fourth dimension
+   * \param s5 size of the fifth dimension
+   */
+  Shape(index_t s1, index_t s2, index_t s3, index_t s4, index_t s5)
+    : ndim_(5) {
+    if (ndim_ <= kStackCache) {
+      data_heap_ = NULL;
+      num_heap_allocated_ = 0;
+      data_stack_[0] = s1;
+      data_stack_[1] = s2;
+      data_stack_[2] = s3;
+      data_stack_[3] = s4;
+      data_stack_[4] = s5;
+    } else {
+      data_heap_ = new index_t[ndim_];
+      num_heap_allocated_ = ndim_;
+      data_heap_[0] = s1;
+      data_heap_[1] = s2;
+      data_heap_[2] = s3;
+      data_heap_[3] = s4;
+      data_heap_[4] = s5;
+    }
+  }
+  /*!
+   * \brief constructor from Shape
+   * \param s the source shape
+   */
+  Shape(const Shape &s)
+    : ndim_(s.ndim_) {
+    if (ndim_ <= kStackCache) {
+      data_heap_ = NULL;
+      num_heap_allocated_ = 0;
+      std::copy(s.data_stack_, s.data_stack_ + ndim_, data_stack_);
+    } else {
+      data_heap_ = new index_t[ndim_];
+      num_heap_allocated_ = ndim_;
+      std::copy(s.data_heap_, s.data_heap_ + ndim_, data_heap_);
+    }
+  }
+#if MSHADOW_IN_CXX11
+  /*!
+   * \brief move constructor from Shape
+   * \param s the source shape
+   */
+  Shape(Shape &&s)
+    : ndim_(s.ndim_),
+      num_heap_allocated_(s.num_heap_allocated_),
+      data_heap_(s.data_heap_) {
+    if (ndim_ <= kStackCache) {
+      std::copy(s.data_stack_, s.data_stack_ + ndim_, data_stack_);
+    }
+    // remove data heap space from s
+    s.data_heap_ = NULL;
+  }
+#endif
+  /*! \brief destructor */
+  ~Shape() {
+    // data_heap_ can be NULL
+    delete[] data_heap_;
+  }
+  /*!
+   * \brief copy shape from content between two iterators
+   * \param begin the beginning of iterator
+   * \param end the end of the iterator
+   * \tparam RandomAccessIterator iterator type
+   */
+  template <typename RandomAccessIterator>
+  inline void CopyFrom(RandomAccessIterator begin,
+                       RandomAccessIterator end) {
+    this->SetDim(end - begin);
+    std::copy(begin, end, data());
+  }
+  /*!
+   * \brief assignment from shape
+   * \param shape source shape
+   * \return reference of self
+   */
+  inline Shape &operator=(const Shape &shape) {
+    this->SetDim(shape.ndim_);
+    const index_t *src = shape.data();
+    std::copy(src, src + ndim_, data());
+    return *this;
+  }
+  /*!
+   * \brief assignment from vector
+   * \param shape source shape
+   * \return reference of self
+   */
+  inline Shape &operator=(const std::vector<index_t> &shape) {
+    this->CopyFrom(shape.begin(), shape.end());
+    return *this;
+  }
+  /*! \return the data content of the shape */
+  inline const index_t *data() const {
+    return ndim_ <= kStackCache ? data_stack_ : data_heap_;
+  }
+  /*! \return the data content of the shape */
+  inline index_t *data() {
+    return ndim_ <= kStackCache ? data_stack_ : data_heap_;
+  }
+  /*! \brief return number of dimension of the tensor inside */
+  inline index_t ndim(void) const {
+    return ndim_;
+  }
+  /*!
+   * \brief get corresponding index
+   * \param i dimension index
+   * \return the corresponding dimension size
+   */
+  inline index_t &operator[](index_t i) {
+    return data()[i];
+  }
+  /*!
+   * \brief get corresponding index
+   * \param i dimension index
+   * \return the corresponding dimension size
+   */
+  inline const index_t &operator[](index_t i) const {
+    return data()[i];
+  }
+  /*! \brief total number of elements in the tensor */
+  inline size_t Size(void) const {
+    size_t size = 1;
+    const index_t *d = this->data();
+    for (index_t i = 0; i < ndim_; ++i) {
+      size *= d[i];
+    }
+    return size;
+  }
+  /*!
+   * \return whether two shape equals
+   * \param s the shape to compare against
+   */
+  inline bool operator==(const Shape &s) const {
+    if (ndim_ != s.ndim_) return false;
+    if (ndim_ <= kStackCache) {
+      for (index_t i = 0; i < ndim_; ++i) {
+        if (data_stack_[i] != s.data_stack_[i]) return false;
+      }
+    } else {
+      for (index_t i = 0; i < ndim_; ++i) {
+        if (data_heap_[i] != s.data_heap_[i]) return false;
+      }
+    }
+    return true;
+  }
+  /*!
+   * \return whether two shape not equals
+   * \param s the shape to compare against
+   */
+  inline bool operator!=(const Shape &s) const {
+    return !(*this == s);
+  }
+
+  friend std::ostream &operator<<(std::ostream &os, const Shape &shape);
+  friend std::istream &operator>>(std::istream &is, Shape &shape);
+
+ private:
+  // the shape will be stored in data_stack_
+  // when dimension is smaller than kStackCache
+  // when it is bigger, it will be stored in data_heap_;
+  /*! \brief size of in stack space */
+  static const index_t kStackCache = 5;
+  /*! \brief number of dimensions of the shape */
+  index_t ndim_;
+  /*! \brief number of cells allocated in data_heap_ */
+  index_t num_heap_allocated_;
+  /*! \brief in stack space used to store shape when it is small */
+  index_t data_stack_[kStackCache];
+  /*! \brief space to store shape when dimension is big */
+  index_t *data_heap_;
+  /*!
+   * \brief internal function to set the dimension
+   * \param dim the dimension of the shape
+   */
+  inline void SetDim(index_t dim) {
+    if (dim > kStackCache &&
+        dim > num_heap_allocated_) {
+      // data_heap_ can be NULL
+      delete[] data_heap_;
+      data_heap_ = new index_t[dim];
+      num_heap_allocated_ = dim;
+    }
+    ndim_ = dim;
+  }
+};
+
+/*!
+* \brief allow string printing of the shape
+* \param os the output stream
+* \param shape the shape
+* \return the ostream
+*/
+inline std::ostream &operator<<(std::ostream &os, const Shape &shape) {
+  os << '(';
+  for (index_t i = 0; i < shape.ndim(); ++i) {
+    if (i != 0) os << ',';
+    os << static_cast<int>(shape[i]);  // Supports negative Shape 'special codes' for inferring
+  }
+  // python style tuple
+  if (shape.ndim() == 1) os << ',';
+  os << ')';
+  return os;
+}
+
+/*!
+* \brief read shape from the istream
+* \param is the input stream
+* \param shape the shape
+* \return the istream
+*/
+inline std::istream &operator>>(std::istream &is, Shape &shape) {
+  // get (
+  while (true) {
+    char ch = is.get();
+    if (ch == '(') break;
+    if (!isspace(ch)) {
+      is.setstate(std::ios::failbit);
+      return is;
+    }
+  }
+  index_t idx;
+  std::vector<index_t> tmp;
+  while (is >> idx) {
+    tmp.push_back(idx);
+    char ch;
+    do {
+      ch = is.get();
+    } while (isspace(ch));
+    if (ch == ',') {
+      while (true) {
+        ch = is.peek();
+        if (isspace(ch)) {
+          is.get(); continue;
+        }
+        if (ch == ')') {
+          is.get(); break;
+        }
+        break;
+      }
+      if (ch == ')') break;
+    } else if (ch == ')') {
+      break;
+    } else {
+      is.setstate(std::ios::failbit);
+      return is;
+    }
+  }
+  shape.CopyFrom(tmp.begin(), tmp.end());
+  return is;
+}
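+
+// --- Usage sketch (editorial illustration, not part of the original header) ---
+// Shape prints in Python tuple style and reports its element count via Size().
+//
+//   Shape s(2, 3, 4);
+//   std::cout << s;        // prints (2,3,4)
+//   size_t n = s.Size();   // 24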
+
+} // namespace cpp
+} // namespace mxnet
+
+#endif // MXNET_CPP_Shape_H
diff --git a/cpp-package/include/mxnet-cpp/symbol.h b/cpp-package/include/mxnet-cpp/symbol.h
new file mode 100644
index 000000000000..63ef9b1a03e3
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/symbol.h
@@ -0,0 +1,257 @@
+/*!
+* Copyright (c) 2016 by Contributors
+* \file symbol.h
+* \brief definition of symbol
+* \author Chuntao Hong, Zhang Chen
+*/
+
+#ifndef MXNETCPP_SYMBOL_H
+#define MXNETCPP_SYMBOL_H
+
+#include <map>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/base.h"
+#include "mxnet-cpp/ndarray.h"
+#include "mxnet-cpp/op_map.h"
+
+namespace mxnet {
+namespace cpp {
+
+class Executor;
+
+/*!
+* \brief struct to store SymbolHandle
+*/
+struct SymBlob {
+ public:
+  /*!
+   * \brief default constructor
+   */
+  SymBlob() : handle_(nullptr) {}
+  /*!
+   * \brief construct with SymbolHandle to store
+   */
+  explicit SymBlob(SymbolHandle handle) : handle_(handle) {}
+  /*!
+   * \brief destructor, free the SymbolHandle
+   */
+  ~SymBlob() { MXSymbolFree(handle_); }
+  /*!
+   * \brief the SymbolHandle to store
+   */
+  SymbolHandle handle_;
+
+ private:
+  SymBlob(const SymBlob &);
+  SymBlob &operator=(const SymBlob &);
+};
+
+/*!
+* \brief Symbol interface
+*/
+class Symbol {
+ public:
+  Symbol() {}
+  /*!
+   * \brief construct a Symbol with SymbolHandle
+   * \param handle the given SymbolHandle
+   */
+  explicit Symbol(SymbolHandle handle);
+  /*!
+   * \brief construct a variable Symbol
+   * \param name the name of the variable
+   */
+  explicit Symbol(const char *name);
+  /*!
+   * \brief construct a variable Symbol
+   * \param name the name of the variable
+   */
+  explicit Symbol(const std::string &name);
+  Symbol operator+(const Symbol &rhs) const;
+  Symbol operator-(const Symbol &rhs) const;
+  Symbol operator*(const Symbol &rhs) const;
+  Symbol operator/(const Symbol &rhs) const;
+
+  Symbol operator+(mx_float scalar) const;
+  Symbol operator-(mx_float scalar) const;
+  Symbol operator*(mx_float scalar) const;
+  Symbol operator/(mx_float scalar) const;
+  Symbol Copy() const;
+  /*!
+   * \brief construct a variable Symbol
+   * \param name the name of the variable
+   */
+  static Symbol Variable(const std::string &name = "");
+  Symbol operator[](int index);
+  Symbol operator[](const std::string &index);
+  /*!
+   * \brief Create a symbol that groups symbols together
+   * \param symbols List of symbols to be grouped
+   */
+  static Symbol Group(const std::vector<Symbol> &symbols);
+  /*!
+   * \brief load Symbol from a JSON file
+   * \param file_name the name of the file
+   */
+  static Symbol Load(const std::string &file_name);
+  /*!
+   * \brief load Symbol from a JSON string
+   * \param json_str the JSON string
+   */
+  static Symbol LoadJSON(const std::string &json_str);
+  /*!
+   * \brief save Symbol to a file
+   * \param file_name the name of the file
+   */
+  void Save(const std::string &file_name) const;
+  /*!
+   * \brief save Symbol into a JSON string
+   */
+  std::string ToJSON() const;
+  /*!
+   * \brief get the internals of the symbol
+   * \return the symbol whose outputs are all the internals.
+   */
+  Symbol GetInternals() const;
+  /*!
+   * \return the SymbolHandle
+   */
+  SymbolHandle GetHandle() const { return blob_ptr_->handle_; }
+  /*!
+   * \brief construct an operator Symbol, with given input Symbol and config
+   * \param name the name of the Symbol
+   * \param input_keys the vector of keys of the input
+   * \param input_values the vector of the input Symbols
+   * \param config_keys the vector of keys of the config
+   * \param config_values the vector of values of the config
+   */
+  Symbol(const std::string &operator_name, const std::string &name,
+         std::vector<const char *> input_keys,
+         std::vector<SymbolHandle> input_values,
+         std::vector<const char *> config_keys,
+         std::vector<const char *> config_values);
+  /*!
+   * \brief infer the shapes by providing shapes of known argument shapes.
+   * \param arg_shapes map of argument name to shape of arguments with known
+   * shapes.
+   * \param in_shape used to store the inferred shapes of input arguments.
+   * \param aux_shape used to store the inferred shapes of auxiliary states.
+   * \param out_shape used to store the inferred shapes of outputs.
+   */
+  void InferShape(
+      const std::map<std::string, std::vector<mx_uint> > &arg_shapes,
+      std::vector<std::vector<mx_uint> > *in_shape,
+      std::vector<std::vector<mx_uint> > *aux_shape,
+      std::vector<std::vector<mx_uint> > *out_shape) const;
+  /*!
+   * \brief List the argument names.
+   *
+   * The position in the returned list also corresponds to the calling
+   * position in operator().
+   * \return the arguments list of this symbol; they can be either named or
+   * unnamed (empty string).
+   */
+  std::vector<std::string> ListArguments() const;
+  /*! \return get the descriptions of outputs for this symbol */
+  std::vector<std::string> ListOutputs() const;
+  /*! \return get the descriptions of auxiliary data for this symbol */
+  std::vector<std::string> ListAuxiliaryStates() const;
+  /*!
+   * \brief infer and construct all the arrays to bind to executor by providing
+   * some known arrays.
+   * \param context the context of all the inferred arrays
+   * \param arg_arrays inferred input argument arrays.
+   * \param grad_arrays inferred arrays to store the gradient output of the input
+   * arguments.
+   * \param aux_arrays inferred arrays used as internal states in op.
+   * \param args_map map of some given argument arrays.
+   * \param arg_grad_store map of some given gradient store arrays.
+   * \param grad_req_type map of some given types of gradient saving. Can only be
+   * in {kNullOp, kAddTo, kWriteTo}.
+   * \param aux_map NDArray that stores the internal state in op
+   */
+  void InferExecutorArrays(
+      const Context &context, std::vector<NDArray> *arg_arrays,
+      std::vector<NDArray> *grad_arrays, std::vector<OpReqType> *grad_reqs,
+      std::vector<NDArray> *aux_arrays,
+      const std::map<std::string, NDArray> &args_map,
+      const std::map<std::string, NDArray> &arg_grad_store =
+          std::map<std::string, NDArray>(),
+      const std::map<std::string, OpReqType> &grad_req_type =
+          std::map<std::string, OpReqType>(),
+      const std::map<std::string, NDArray> &aux_map =
+          std::map<std::string, NDArray>()) const;
+  /*!
+   * \brief infer and construct all the input argument arrays to bind to
+   * executor by providing some known argument arrays.
+   * \param context the context of all the inferred arrays.
+   * \param args_map map of all the inferred input argument arrays.
+   * \param known_args map of some given argument arrays.
+   */
+  void InferArgsMap(const Context &context,
+                    std::map<std::string, NDArray> *args_map,
+                    const std::map<std::string, NDArray> &known_args) const;
+  /*!
+   * \brief Create an executor by binding the symbol with context and arguments.
+   *  If the user does not want to compute the gradients of the i-th argument,
+   *  grad_req_type[i] can be kNullOp.
+   *  The input arrays in the given maps should have the same names as the
+   *  arguments of the symbol.
+   *  Only some of the necessary arrays are needed; the others can be inferred
+   *  automatically.
+   *
+   * \param context the context of binding.
+   * \param args_map the NDArray that stores the input arguments to the symbol.
+   * \param arg_grad_store NDArray that is used to store the gradient output of
+   *  the input arguments.
+   * \param grad_req_type requirement type of gradient saving. Can only be in
+   *  {kNullOp, kAddTo, kWriteTo}.
+   * \param aux_map NDArray that stores the internal state in op
+   * \return a new executor, which needs to be freed manually.
+   */
+  Executor *SimpleBind(const Context &context,
+                       const std::map<std::string, NDArray> &args_map,
+                       const std::map<std::string, NDArray> &arg_grad_store =
+                           std::map<std::string, NDArray>(),
+                       const std::map<std::string, OpReqType> &grad_req_type =
+                           std::map<std::string, OpReqType>(),
+                       const std::map<std::string, NDArray> &aux_map =
+                           std::map<std::string, NDArray>());
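+
+  // --- Usage sketch (editorial illustration, not part of the original header) ---
+  // Provide only the data array; SimpleBind infers shapes and creates the
+  // remaining arguments. The Context constructor shown is an assumption based
+  // on this package's examples; "net" and the shapes are placeholders. The
+  // returned executor must be freed by the caller.
+  //
+  //   Symbol net = /* some network symbol */ Symbol::Variable("net");
+  //   Context ctx(DeviceType::kCPU, 0);
+  //   std::map<std::string, NDArray> args;
+  //   args["data"] = NDArray(Shape(32, 784), ctx, false);
+  //   Executor *exec = net.SimpleBind(ctx, args);
+  //   // ... use exec ...
+  //   delete exec;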
+  /*!
+   * \brief Create an executor by binding the symbol with context and arguments.
+   *  If the user does not want to compute the gradients of the i-th argument,
+   *  grad_req_type[i] can be kNullOp.
+   *
+   * \param context the context of binding.
+   * \param arg_arrays the NDArray that stores the input arguments to the symbol.
+   * \param grad_arrays NDArray that is used to store the gradient output of the
+   *  input arguments.
+   * \param grad_reqs requirement type of gradient saving. Can only be in
+   *  {kNullOp, kAddTo, kWriteTo}.
+   * \param aux_arrays NDArray that is used as internal state in op
+   * \param group_to_ctx dict of string to mx.Context
+   * \param shared_exec Executor to share memory with. This is intended for
+   *  runtime reshaping, variable length sequences, etc. The returned executor
+   *  shares state with shared_exec, and should not be used in parallel with it.
+   * \return a new executor, which needs to be freed manually.
+   */
+  Executor *Bind(const Context &context, const std::vector<NDArray> &arg_arrays,
+                 const std::vector<NDArray> &grad_arrays,
+                 const std::vector<OpReqType> &grad_reqs,
+                 const std::vector<NDArray> &aux_arrays,
+                 const std::map<std::string, Context> &group_to_ctx =
+                     std::map<std::string, Context>(),
+                 Executor *shared_exec = nullptr);
+
+ private:
+  std::shared_ptr<SymBlob> blob_ptr_;
+  static OpMap *op_map_;
+};
+Symbol operator+(mx_float lhs, const Symbol &rhs);
+Symbol operator-(mx_float lhs, const Symbol &rhs);
+Symbol operator*(mx_float lhs, const Symbol &rhs);
+Symbol operator/(mx_float lhs, const Symbol &rhs);
+} // namespace cpp
+} // namespace mxnet
+#endif // MXNETCPP_SYMBOL_H
diff --git a/cpp-package/include/mxnet-cpp/symbol.hpp b/cpp-package/include/mxnet-cpp/symbol.hpp
new file mode 100644
index 000000000000..f79e96a59fc4
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/symbol.hpp
@@ -0,0 +1,339 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file symbol.hpp
+ * \brief implementation of the symbol
+ * \author Zhang Chen, Chuntao Hong
+ */
+
+#ifndef MXNETCPP_SYMBOL_HPP
+#define MXNETCPP_SYMBOL_HPP
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "dmlc/logging.h"
+#include "mxnet-cpp/symbol.h"
+
+#include "mxnet-cpp/op_suppl.h"
+
+namespace mxnet {
+namespace cpp {
+OpMap *Symbol::op_map_ = new OpMap();
+Symbol::Symbol(SymbolHandle handle) {
+  blob_ptr_ = std::make_shared<SymBlob>(handle);
+}
+Symbol::Symbol(const char *name) {
+  SymbolHandle handle;
+  CHECK_EQ(MXSymbolCreateVariable(name, &(handle)), 0);
+  blob_ptr_ = std::make_shared<SymBlob>(handle);
+}
+Symbol::Symbol(const std::string &name) : Symbol(name.c_str()) {}
+Symbol Symbol::Variable(const std::string &name) { return Symbol(name); }
+Symbol Symbol::operator+(const Symbol &rhs) const { return _Plus(*this, rhs); }
+Symbol Symbol::operator-(const Symbol &rhs) const { return _Minus(*this, rhs); }
+Symbol Symbol::operator*(const Symbol &rhs) const { return _Mul(*this, rhs); }
+Symbol Symbol::operator/(const Symbol &rhs) const { return _Div(*this, rhs); }
+Symbol Symbol::operator+(mx_float scalar) const {
+  return _PlusScalar(*this, scalar);
+}
+Symbol Symbol::operator-(mx_float scalar) const {
+  return _MinusScalar(*this, scalar);
+}
+Symbol Symbol::operator*(mx_float scalar) const {
+  return _MulScalar(*this, scalar);
+}
+Symbol Symbol::operator/(mx_float scalar) const {
+  return _DivScalar(*this, scalar);
+}
+Symbol Symbol::operator[](int index) {
+  SymbolHandle out;
+  MXSymbolGetOutput(GetHandle(), index, &out);
+  return Symbol(out);
+}
+Symbol Symbol::operator[](const std::string &index) {
+  auto outputs = ListOutputs();
+  for (mx_uint i = 0; i < outputs.size(); ++i) {
+    if (outputs[i] == index) {
+      return (*this)[i];
+    }
+  }
+  LOG(FATAL) << "Cannot find output that matches name "
<< index; + return (*this)[0]; +} +Symbol Symbol::Group(const std::vector &symbols) { + SymbolHandle out; + std::vector handle_list; + for (const auto &t : symbols) { + handle_list.push_back(t.GetHandle()); + } + MXSymbolCreateGroup(handle_list.size(), handle_list.data(), &out); + return Symbol(out); +} +Symbol Symbol::Load(const std::string &file_name) { + SymbolHandle handle; + CHECK_EQ(MXSymbolCreateFromFile(file_name.c_str(), &(handle)), 0); + return Symbol(handle); +} +Symbol Symbol::LoadJSON(const std::string &json_str) { + SymbolHandle handle; + CHECK_EQ(MXSymbolCreateFromJSON(json_str.c_str(), &(handle)), 0); + return Symbol(handle); +} +void Symbol::Save(const std::string &file_name) const { + CHECK_EQ(MXSymbolSaveToFile(GetHandle(), file_name.c_str()), 0); +} +std::string Symbol::ToJSON() const { + const char *out_json; + CHECK_EQ(MXSymbolSaveToJSON(GetHandle(), &out_json), 0); + return std::string(out_json); +} +Symbol Symbol::GetInternals() const { + SymbolHandle handle; + CHECK_EQ(MXSymbolGetInternals(GetHandle(), &handle), 0); + return Symbol(handle); +} +Symbol::Symbol(const std::string &operator_name, const std::string &name, + std::vector input_keys, + std::vector input_values, + std::vector config_keys, + std::vector config_values) { + SymbolHandle handle; + AtomicSymbolCreator creator = op_map_->GetSymbolCreator(operator_name); + MXSymbolCreateAtomicSymbol(creator, config_keys.size(), config_keys.data(), + config_values.data(), &handle); + MXSymbolCompose(handle, operator_name.c_str(), input_keys.size(), + input_keys.data(), input_values.data()); + blob_ptr_ = std::make_shared(handle); +} + +Symbol Symbol::Copy() const { + SymbolHandle handle; + CHECK_EQ(MXSymbolCopy(GetHandle(), &handle), 0); + return Symbol(handle); +} + +std::vector Symbol::ListArguments() const { + std::vector ret; + mx_uint size; + const char **sarr; + MXSymbolListArguments(GetHandle(), &size, &sarr); + for (mx_uint i = 0; i < size; ++i) { + ret.push_back(std::string(sarr[i])); + } + return ret; +} +std::vector Symbol::ListOutputs() const { + std::vector ret; + mx_uint size; + const char **sarr; + MXSymbolListOutputs(GetHandle(), &size, &sarr); + for (mx_uint i = 0; i < size; ++i) { + ret.push_back(std::string(sarr[i])); + } + return ret; +} +std::vector Symbol::ListAuxiliaryStates() const { + std::vector ret; + mx_uint size; + const char **sarr; + MXSymbolListAuxiliaryStates(GetHandle(), &size, &sarr); + for (mx_uint i = 0; i < size; ++i) { + ret.push_back(std::string(sarr[i])); + } + return ret; +} + +void Symbol::InferShape( + const std::map > &arg_shapes, + std::vector > *in_shape, + std::vector > *aux_shape, + std::vector > *out_shape) const { + + std::vector keys; + std::vector arg_ind_ptr; + std::vector arg_shape_data; + + for (const auto &arg : arg_shapes) { + keys.push_back(arg.first.c_str()); + arg_ind_ptr.push_back(arg_shape_data.size()); + for (auto i : arg.second) { + arg_shape_data.push_back(i); + } + } + arg_ind_ptr.push_back(arg_shape_data.size()); + + mx_uint in_shape_size; + const mx_uint *in_shape_ndim; + const mx_uint **in_shape_data; + mx_uint out_shape_size; + const mx_uint *out_shape_ndim; + const mx_uint **out_shape_data; + mx_uint aux_shape_size; + const mx_uint *aux_shape_ndim; + const mx_uint **aux_shape_data; + int complete; + + CHECK_EQ(MXSymbolInferShape(GetHandle(), keys.size(), keys.data(), + arg_ind_ptr.data(), arg_shape_data.data(), + &in_shape_size, &in_shape_ndim, &in_shape_data, + &out_shape_size, &out_shape_ndim, &out_shape_data, + &aux_shape_size, 
+Symbol Symbol::GetInternals() const {
+  SymbolHandle handle;
+  CHECK_EQ(MXSymbolGetInternals(GetHandle(), &handle), 0);
+  return Symbol(handle);
+}
+Symbol::Symbol(const std::string &operator_name, const std::string &name,
+               std::vector<const char *> input_keys,
+               std::vector<SymbolHandle> input_values,
+               std::vector<const char *> config_keys,
+               std::vector<const char *> config_values) {
+  SymbolHandle handle;
+  AtomicSymbolCreator creator = op_map_->GetSymbolCreator(operator_name);
+  MXSymbolCreateAtomicSymbol(creator, config_keys.size(), config_keys.data(),
+                             config_values.data(), &handle);
+  MXSymbolCompose(handle, operator_name.c_str(), input_keys.size(),
+                  input_keys.data(), input_values.data());
+  blob_ptr_ = std::make_shared<SymBlob>(handle);
+}
+
+Symbol Symbol::Copy() const {
+  SymbolHandle handle;
+  CHECK_EQ(MXSymbolCopy(GetHandle(), &handle), 0);
+  return Symbol(handle);
+}
+
+std::vector<std::string> Symbol::ListArguments() const {
+  std::vector<std::string> ret;
+  mx_uint size;
+  const char **sarr;
+  MXSymbolListArguments(GetHandle(), &size, &sarr);
+  for (mx_uint i = 0; i < size; ++i) {
+    ret.push_back(std::string(sarr[i]));
+  }
+  return ret;
+}
+std::vector<std::string> Symbol::ListOutputs() const {
+  std::vector<std::string> ret;
+  mx_uint size;
+  const char **sarr;
+  MXSymbolListOutputs(GetHandle(), &size, &sarr);
+  for (mx_uint i = 0; i < size; ++i) {
+    ret.push_back(std::string(sarr[i]));
+  }
+  return ret;
+}
+std::vector<std::string> Symbol::ListAuxiliaryStates() const {
+  std::vector<std::string> ret;
+  mx_uint size;
+  const char **sarr;
+  MXSymbolListAuxiliaryStates(GetHandle(), &size, &sarr);
+  for (mx_uint i = 0; i < size; ++i) {
+    ret.push_back(std::string(sarr[i]));
+  }
+  return ret;
+}
+
+void Symbol::InferShape(
+    const std::map<std::string, std::vector<mx_uint> > &arg_shapes,
+    std::vector<std::vector<mx_uint> > *in_shape,
+    std::vector<std::vector<mx_uint> > *aux_shape,
+    std::vector<std::vector<mx_uint> > *out_shape) const {
+
+  std::vector<const char *> keys;
+  std::vector<mx_uint> arg_ind_ptr;
+  std::vector<mx_uint> arg_shape_data;
+
+  for (const auto &arg : arg_shapes) {
+    keys.push_back(arg.first.c_str());
+    arg_ind_ptr.push_back(arg_shape_data.size());
+    for (auto i : arg.second) {
+      arg_shape_data.push_back(i);
+    }
+  }
+  arg_ind_ptr.push_back(arg_shape_data.size());
+
+  mx_uint in_shape_size;
+  const mx_uint *in_shape_ndim;
+  const mx_uint **in_shape_data;
+  mx_uint out_shape_size;
+  const mx_uint *out_shape_ndim;
+  const mx_uint **out_shape_data;
+  mx_uint aux_shape_size;
+  const mx_uint *aux_shape_ndim;
+  const mx_uint **aux_shape_data;
+  int complete;
+
+  CHECK_EQ(MXSymbolInferShape(GetHandle(), keys.size(), keys.data(),
+                              arg_ind_ptr.data(), arg_shape_data.data(),
+                              &in_shape_size, &in_shape_ndim, &in_shape_data,
+                              &out_shape_size, &out_shape_ndim, &out_shape_data,
+                              &aux_shape_size, &aux_shape_ndim, &aux_shape_data,
+                              &complete),
+           0);
+
+  if (complete) {
+    for (mx_uint i = 0; i < in_shape_size; ++i) {
+      in_shape->push_back(std::vector<mx_uint>());
+      for (mx_uint j = 0; j < in_shape_ndim[i]; ++j) {
+        (*in_shape)[i].push_back(in_shape_data[i][j]);
+      }
+    }
+    for (mx_uint i = 0; i < aux_shape_size; ++i) {
+      aux_shape->push_back(std::vector<mx_uint>());
+      for (mx_uint j = 0; j < aux_shape_ndim[i]; ++j) {
+        (*aux_shape)[i].push_back(aux_shape_data[i][j]);
+      }
+    }
+    for (mx_uint i = 0; i < out_shape_size; ++i) {
+      out_shape->push_back(std::vector<mx_uint>());
+      for (mx_uint j = 0; j < out_shape_ndim[i]; ++j) {
+        (*out_shape)[i].push_back(out_shape_data[i][j]);
+      }
+    }
+  }
+}
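To illustrate the call just implemented, a sketch that seeds one known shape and prints whatever InferShape can deduce. The "data" name and 4-D layout are assumptions about the network, and the output vectors stay empty when inference is incomplete (see the `complete` flag above):

    #include <iostream>
    #include "mxnet-cpp/MxNetCpp.h"
    using namespace mxnet::cpp;

    void ShowOutputShapes(const Symbol &net) {
      std::map<std::string, std::vector<mx_uint> > arg_shapes;
      arg_shapes["data"] = std::vector<mx_uint>{32, 3, 28, 28};  // batch, c, h, w
      std::vector<std::vector<mx_uint> > in_shapes, aux_shapes, out_shapes;
      net.InferShape(arg_shapes, &in_shapes, &aux_shapes, &out_shapes);
      for (const auto &shape : out_shapes) {
        for (mx_uint dim : shape) std::cout << dim << ' ';
        std::cout << '\n';
      }
    }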
+void Symbol::InferExecutorArrays(
+    const Context &context, std::vector<NDArray> *arg_arrays,
+    std::vector<NDArray> *grad_arrays, std::vector<OpReqType> *grad_reqs,
+    std::vector<NDArray> *aux_arrays,
+    const std::map<std::string, NDArray> &args_map,
+    const std::map<std::string, NDArray> &arg_grad_store,
+    const std::map<std::string, OpReqType> &grad_req_type,
+    const std::map<std::string, NDArray> &aux_map) const {
+
+  const auto arg_name_list = ListArguments();
+  std::vector<std::vector<mx_uint> > in_shapes, aux_shapes, out_shapes;
+  std::map<std::string, std::vector<mx_uint> > arg_shapes;
+
+  for (const auto &arg_name : arg_name_list) {
+    auto iter = args_map.find(arg_name);
+    if (iter != args_map.end()) {
+      arg_shapes[arg_name] = iter->second.GetShape();
+    }
+  }
+
+  InferShape(arg_shapes, &in_shapes, &aux_shapes, &out_shapes);
+
+  for (size_t i = 0; i < in_shapes.size(); ++i) {
+    const auto &shape = in_shapes[i];
+    const auto &arg_name = arg_name_list[i];
+    auto iter_arg = args_map.find(arg_name);
+    if (iter_arg != args_map.end()) {
+      arg_arrays->push_back(iter_arg->second);
+    } else {
+      arg_arrays->push_back(NDArray(shape, context, false));
+      NDArray::SampleGaussian(0, 1, &arg_arrays->back());
+    }
+    auto iter_grad = arg_grad_store.find(arg_name);
+    if (iter_grad != arg_grad_store.end()) {
+      grad_arrays->push_back(iter_grad->second);
+    } else {
+      grad_arrays->push_back(NDArray(shape, context, false));
+    }
+    auto iter_req = grad_req_type.find(arg_name);
+    if (iter_req != grad_req_type.end()) {
+      grad_reqs->push_back(iter_req->second);
+    } else if (arg_name.rfind("data") == arg_name.length() - 4
+               || arg_name.rfind("label") == arg_name.length() - 5) {
+      grad_reqs->push_back(OpReqType::kNullOp);
+    } else {
+      grad_reqs->push_back(OpReqType::kWriteTo);
+    }
+  }
+
+  const auto aux_name_list = ListAuxiliaryStates();
+  for (size_t i = 0; i < aux_shapes.size(); ++i) {
+    const auto &shape = aux_shapes[i];
+    const auto &aux_name = aux_name_list[i];
+    auto iter_aux = aux_map.find(aux_name);
+    if (iter_aux != aux_map.end()) {
+      aux_arrays->push_back(iter_aux->second);
+    } else {
+      aux_arrays->push_back(NDArray(shape, context, false));
+      NDArray::SampleGaussian(0, 1, &aux_arrays->back());
+    }
+  }
+}
+void Symbol::InferArgsMap(
+    const Context &context, std::map<std::string, NDArray> *args_map,
+    const std::map<std::string, NDArray> &known_args) const {
+
+  const auto arg_name_list = ListArguments();
+  std::vector<std::vector<mx_uint> > in_shapes, aux_shapes, out_shapes;
+  std::map<std::string, std::vector<mx_uint> > arg_shapes;
+
+  for (const auto &arg_name : arg_name_list) {
+    auto iter = known_args.find(arg_name);
+    if (iter != known_args.end()) {
+      arg_shapes[arg_name] = iter->second.GetShape();
+    }
+  }
+
+  InferShape(arg_shapes, &in_shapes, &aux_shapes, &out_shapes);
+
+  for (size_t i = 0; i < in_shapes.size(); ++i) {
+    const auto &shape = in_shapes[i];
+    const auto &arg_name = arg_name_list[i];
+    auto iter_arg = known_args.find(arg_name);
+    if (iter_arg != known_args.end()) {
+      (*args_map)[arg_name] = iter_arg->second;
+    } else {
+      (*args_map)[arg_name] = NDArray(shape, context, false);
+      NDArray::SampleGaussian(0, 1, &(*args_map)[arg_name]);
+    }
+  }
+}
+
+Executor *Symbol::SimpleBind(
+    const Context &context, const std::map<std::string, NDArray> &args_map,
+    const std::map<std::string, NDArray> &arg_grad_store,
+    const std::map<std::string, OpReqType> &grad_req_type,
+    const std::map<std::string, NDArray> &aux_map) {
+  std::vector<NDArray> arg_arrays;
+  std::vector<NDArray> grad_arrays;
+  std::vector<OpReqType> grad_reqs;
+  std::vector<NDArray> aux_arrays;
+
+  InferExecutorArrays(context, &arg_arrays, &grad_arrays, &grad_reqs,
+                      &aux_arrays, args_map, arg_grad_store, grad_req_type,
+                      aux_map);
+
+  return new Executor(*this, context, arg_arrays, grad_arrays, grad_reqs,
+                      aux_arrays);
+}
+
+Executor *Symbol::Bind(const Context &context,
+                       const std::vector<NDArray> &arg_arrays,
+                       const std::vector<NDArray> &grad_arrays,
+                       const std::vector<OpReqType> &grad_reqs,
+                       const std::vector<NDArray> &aux_arrays,
+                       const std::map<std::string, Context> &group_to_ctx,
+                       Executor *shared_exec) {
+  return new Executor(*this, context, arg_arrays, grad_arrays, grad_reqs,
+                      aux_arrays, group_to_ctx, shared_exec);
+}
+Symbol operator+(mx_float lhs, const Symbol &rhs) { return rhs + lhs; }
+Symbol operator-(mx_float lhs, const Symbol &rhs) {
+  return mxnet::cpp::_RMinusScalar(lhs, rhs);
+}
+Symbol operator*(mx_float lhs, const Symbol &rhs) { return rhs * lhs; }
+Symbol operator/(mx_float lhs, const Symbol &rhs) {
+  return mxnet::cpp::_RDivScalar(lhs, rhs);
+}
+}  // namespace cpp
+}  // namespace mxnet
+
+#endif  // MXNETCPP_SYMBOL_HPP
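The free operators at the end of the file make mixed scalar/symbol arithmetic symmetric. A small sketch of what they enable:

    #include "mxnet-cpp/MxNetCpp.h"
    using namespace mxnet::cpp;

    Symbol Complement(Symbol x) {
      // 1.0f - x cannot commute, so the free operator- above lowers it to
      // _RMinusScalar; 0.5f * (...) commutes and forwards to rhs * lhs.
      return 0.5f * (1.0f - x);
    }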
diff --git a/cpp-package/scripts/lint.py b/cpp-package/scripts/lint.py
new file mode 100644
index 000000000000..89492eda4d82
--- /dev/null
+++ b/cpp-package/scripts/lint.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+# pylint: disable=protected-access, unused-variable, locally-disabled, redefined-variable-type
+"""Lint helper to generate lint summary of source.
+Copyright by Contributors
+"""
+import codecs
+import sys
+import re
+import os
+import cpplint
+from cpplint import _cpplint_state
+from pylint import epylint
+
+CXX_SUFFIX = set(['cc', 'c', 'cpp', 'h', 'cu', 'hpp'])
+PYTHON_SUFFIX = set(['py'])
+
+class LintHelper(object):
+    """Class to help run the lint and record the summary"""
+
+    @staticmethod
+    def _print_summary_map(strm, result_map, ftype):
+        """Print summary of certain result map."""
+        if len(result_map) == 0:
+            return 0
+        npass = len([x for k, x in result_map.iteritems() if len(x) == 0])
+        strm.write('=====%d/%d %s files passed check=====\n' % (npass, len(result_map), ftype))
+        for fname, emap in result_map.iteritems():
+            if len(emap) == 0:
+                continue
+            strm.write('%s: %d Errors of %d Categories map=%s\n' % (
+                fname, sum(emap.values()), len(emap), str(emap)))
+        return len(result_map) - npass
+
+    def __init__(self):
+        self.project_name = None
+        self.cpp_header_map = {}
+        self.cpp_src_map = {}
+        self.python_map = {}
+        pylint_disable = ['superfluous-parens',
+                          'too-many-instance-attributes',
+                          'too-few-public-methods']
+        # setup pylint
+        self.pylint_opts = ['--extension-pkg-whitelist=numpy',
+                            '--disable=' + ','.join(pylint_disable)]
+
+        self.pylint_cats = set(['error', 'warning', 'convention', 'refactor'])
+        # setup cpp lint
+        cpplint_args = ['.', '--extensions=' + (','.join(CXX_SUFFIX))]
+        _ = cpplint.ParseArguments(cpplint_args)
+        cpplint._SetFilters(','.join(['-build/c++11',
+                                      '-build/namespaces',
+                                      '-build/include',
+                                      '-build/header_guard',
+                                      '+build/include_what_you_use',
+                                      '+build/include_order']))
+        cpplint._SetCountingStyle('toplevel')
+        cpplint._line_length = 100
+
+    def process_cpp(self, path, suffix):
+        """Process a cpp file."""
+        _cpplint_state.ResetErrorCounts()
+        cpplint.ProcessFile(str(path), _cpplint_state.verbose_level)
+        _cpplint_state.PrintErrorCounts()
+        errors = _cpplint_state.errors_by_category.copy()
+
+        if suffix == 'h':
+            self.cpp_header_map[str(path)] = errors
+        else:
+            self.cpp_src_map[str(path)] = errors
+
+    def process_python(self, path):
+        """Process a python file."""
+        (pylint_stdout, pylint_stderr) = epylint.py_run(
+            ' '.join([str(path)] + self.pylint_opts), return_std=True)
+        emap = {}
+        print pylint_stderr.read()
+        for line in pylint_stdout:
+            sys.stderr.write(line)
+            key = line.split(':')[-1].split('(')[0].strip()
+            if key not in self.pylint_cats:
+                continue
+            if key not in emap:
+                emap[key] = 1
+            else:
+                emap[key] += 1
+        sys.stderr.write('\n')
+        self.python_map[str(path)] = emap
+
+    def print_summary(self, strm):
+        """Print summary of lint."""
+        nerr = 0
+        nerr += LintHelper._print_summary_map(strm, self.cpp_header_map, 'cpp-header')
+        nerr += LintHelper._print_summary_map(strm, self.cpp_src_map, 'cpp-source')
+        nerr += LintHelper._print_summary_map(strm, self.python_map, 'python')
+        if nerr == 0:
+            strm.write('All passed!\n')
+        else:
+            strm.write('%d files failed lint\n' % nerr)
+        return nerr
+
+# singleton helper for lint check
+_HELPER = LintHelper()
+
+def get_header_guard_dmlc(filename):
+    """Get Header Guard Convention for DMLC Projects.
+    For headers in include, directly use the path
+    For headers in src, use project name plus path
+    Examples: with project-name = dmlc
+        include/dmlc/timer.h -> DMLC_TIMER_H_
+        src/io/libsvm_parser.h -> DMLC_IO_LIBSVM_PARSER_H_
+    """
+    fileinfo = cpplint.FileInfo(filename)
+    file_path_from_root = fileinfo.RepositoryName()
+    inc_list = ['include', 'api', 'wrapper']
+
+    if file_path_from_root.find('src/') != -1 and _HELPER.project_name is not None:
+        idx = file_path_from_root.find('src/')
+        file_path_from_root = _HELPER.project_name + file_path_from_root[idx + 3:]
+    else:
+        for spath in inc_list:
+            prefix = spath + os.sep
+            if file_path_from_root.startswith(prefix):
+                file_path_from_root = re.sub('^' + prefix, '', file_path_from_root)
+                break
+    return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_'
+
+cpplint.GetHeaderGuardCPPVariable = get_header_guard_dmlc
+
+def process(fname, allow_type):
+    """Process a file."""
+    fname = str(fname)
+    # HACK: ignore op.h which is automatically generated
+    if fname.endswith('op.h'):
+        return
+    arr = fname.rsplit('.', 1)
+    if fname.find('#') != -1 or arr[-1] not in allow_type:
+        return
+    if arr[-1] in CXX_SUFFIX:
+        _HELPER.process_cpp(fname, arr[-1])
+    if arr[-1] in PYTHON_SUFFIX:
+        _HELPER.process_python(fname)
+
+def main():
+    """Main entry function."""
+    if len(sys.argv) < 3:
+        print('Usage: <project-name> <filetype> <list-of-path to traverse>')
+        print('\tfiletype can be python/cpp/all')
+        exit(-1)
+    _HELPER.project_name = sys.argv[1]
+    file_type = sys.argv[2]
+    allow_type = []
+    if file_type == 'python' or file_type == 'all':
+        allow_type += [x for x in PYTHON_SUFFIX]
+    if file_type == 'cpp' or file_type == 'all':
+        allow_type += [x for x in CXX_SUFFIX]
+    allow_type = set(allow_type)
+    if os.name != 'nt':
+        sys.stderr = codecs.StreamReaderWriter(sys.stderr,
+                                               codecs.getreader('utf8'),
+                                               codecs.getwriter('utf8'),
+                                               'replace')
+    for path in sys.argv[3:]:
+        if os.path.isfile(path):
+            process(path, allow_type)
+        else:
+            for root, dirs, files in os.walk(path):
+                for name in files:
+                    process(os.path.join(root, name), allow_type)
+
+    nerr = _HELPER.print_summary(sys.stderr)
+    sys.exit(nerr > 0)
+
+if __name__ == '__main__':
+    main()
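For anyone running the linter by hand: main() expects a project name, a file type, and one or more paths, so an invocation like `python cpp-package/scripts/lint.py mxnet-cpp all include/mxnet-cpp scripts` should reproduce the summary output; the project name and paths here are illustrative.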
diff --git a/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.py b/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.py
new file mode 100755
index 000000000000..19f4b6e36e93
--- /dev/null
+++ b/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.py
@@ -0,0 +1,367 @@
+# -*- coding: utf-8 -*-
+from ctypes import *
+from ctypes.util import find_library
+import logging
+import platform
+import re
+import sys
+
+class EnumType:
+    name = ''
+    enumValues = []
+    def __init__(self, typeName = 'ElementWiseOpType', \
+                 typeString = "{'avg', 'max', 'sum'}"):
+        self.name = typeName
+        if (typeString[0] == '{'):  # is an enum type
+            isEnum = True
+            # parse enum
+            self.enumValues = typeString[typeString.find('{') + 1:typeString.find('}')].split(',')
+            for i in range(0, len(self.enumValues)):
+                self.enumValues[i] = self.enumValues[i].strip().strip("'")
+        else:
+            logging.warn("trying to parse non-enum type as enum: %s" % typeString)
+    def GetDefinitionString(self, indent = 0):
+        indentStr = ' ' * indent
+        ret = indentStr + 'enum class %s {\n' % self.name
+        for i in range(0, len(self.enumValues)):
+            ret = ret + indentStr + '  %s = %d' % (self.enumValues[i], i)
+            if (i != len(self.enumValues) - 1):
+                ret = ret + ","
+            ret = ret + "\n"
+        ret = ret + "};\n"
+        return ret
+    def GetDefaultValueString(self, value = ''):
+        return self.name + "::" + value
+    def GetEnumStringArray(self, indent = 0):
+        indentStr = ' ' * indent
+        ret = indentStr + 'static const char *%sValues[] = {\n' % self.name
+        for i in range(0, len(self.enumValues)):
+            ret = ret + indentStr + '  "%s"' % self.enumValues[i]
+            if (i != len(self.enumValues) - 1):
+                ret = ret + ","
+            ret = ret + "\n"
+        ret = ret + indentStr + "};\n"
+        return ret
+    def GetConvertEnumVariableToString(self, variable=''):
+        return "%sValues[int(%s)]" % (self.name, variable)
+
+class Arg:
+    typeDict = {'boolean':'bool',\
+        'Shape(tuple)':'Shape',\
+        'Symbol':'Symbol',\
+        'NDArray':'Symbol',\
+        'ndarray-or-symbol':'Symbol',\
+        'Symbol[]':'const std::vector<Symbol>&',\
+        'Symbol or Symbol[]':'const std::vector<Symbol>&',\
+        'NDArray[]':'const std::vector<Symbol>&',\
+        'ndarray-or-symbol[]':'const std::vector<Symbol>&',\
+        'float':'mx_float',\
+        'real_t':'mx_float',\
+        'int':'int',\
+        'int (non-negative)': 'uint32_t',\
+        'long (non-negative)': 'uint64_t',\
+        'int or None':'dmlc::optional<int>',\
+        'long':'int64_t',\
+        'double':'double',\
+        'string':'const std::string&'}
+    name = ''
+    type = ''
+    description = ''
+    isEnum = False
+    enum = None
+    hasDefault = False
+    defaultString = ''
+    def __init__(self, opName = '', argName = '', typeString = '', descString = ''):
+        self.name = argName
+        self.description = descString
+        if (typeString[0] == '{'):  # is enum type
+            self.isEnum = True
+            self.enum = EnumType(self.ConstructEnumTypeName(opName, argName), typeString)
+            self.type = self.enum.name
+        else:
+            try:
+                self.type = self.typeDict[typeString.split(',')[0]]
+            except:
+                print 'argument "%s" of operator "%s" has unknown type "%s"' % (argName, opName, typeString)
+                pass
+        if typeString.find('default=') != -1:
+            self.hasDefault = True
+            self.defaultString = typeString.split('default=')[1].strip().strip("'")
+            if typeString.startswith('string'):
+                self.defaultString = '"' + self.defaultString + '"'
+            elif self.isEnum:
+                self.defaultString = self.enum.GetDefaultValueString(self.defaultString)
+            elif self.defaultString == 'None':
+                self.defaultString = self.type + '()'
+            elif self.defaultString == 'False':
+                self.defaultString = 'false'
+            elif self.defaultString == 'True':
+                self.defaultString = 'true'
+            elif self.defaultString[0] == '(':
+                self.defaultString = 'Shape' + self.defaultString
+            elif self.type == 'dmlc::optional<int>':
+                self.defaultString = self.type + '(' + self.defaultString + ')'
+
+    def ConstructEnumTypeName(self, opName = '', argName = ''):
+        a = opName[0].upper()
+        # format ArgName so instead of act_type it returns ActType
+        argNameWords = argName.split('_')
+        argName = ''
+        for an in argNameWords:
+            argName = argName + an[0].upper() + an[1:]
+        typeName = a + opName[1:] + argName
+        return typeName
+
+class Op:
+    name = ''
+    description = ''
+    args = []
+
+    def __init__(self, name = '', description = '', args = []):
+        self.name = name
+        self.description = description
+        # add a 'name' argument
+        nameArg = Arg(self.name, \
+                      'symbol_name', \
+                      'string', \
+                      'name of the resulting symbol')
+        args.insert(0, nameArg)
+        # reorder arguments, put those with default value to the end
+        orderedArgs = []
+        for arg in args:
+            if not arg.hasDefault:
+                orderedArgs.append(arg)
+        for arg in args:
+            if arg.hasDefault:
+                orderedArgs.append(arg)
+        self.args = orderedArgs
+    def WrapDescription(self, desc = ''):
+        ret = []
+        sentences = desc.split('.')
+        lines = desc.split('\n')
+        for line in lines:
+            line = line.strip()
+            if len(line) <= 80:
+                ret.append(line.strip())
+            else:
+                while len(line) > 80:
+                    pos = line.rfind(' ', 0, 80) + 1
+                    if pos <= 0:
+                        pos = line.find(' ')
+                    if pos < 0:
+                        pos = len(line)
+                    ret.append(line[:pos].strip())
+                    line = line[pos:]
+        return ret
+    def GenDescription(self, desc = '', \
+                       firstLineHead = ' * \\brief ', \
+                       otherLineHead = ' * '):
+        ret = ''
+        descs = self.WrapDescription(desc)
+        ret = ret + firstLineHead
+        if len(descs) == 0:
+            return ret.rstrip()
+        ret = (ret + descs[0]).rstrip() + '\n'
+        for i in range(1, len(descs)):
+            ret = ret + (otherLineHead + descs[i]).rstrip() + '\n'
+        return ret
+    def GetOpDefinitionString(self, use_name, indent=0):
+        ret = ''
+        indentStr = ' ' * indent
+        # define enums if any
+        for arg in self.args:
+            if arg.isEnum and use_name:
+                # comments
+                ret = ret + self.GenDescription(arg.description, \
+                                                '/*! \\brief ', \
+                                                ' * ')
+                ret = ret + " */\n"
+                # definition
+                ret = ret + arg.enum.GetDefinitionString(indent) + '\n'
+        # create function comments
+        ret = ret + self.GenDescription(self.description, \
+                                        '/*!\n * \\brief ', \
+                                        ' * ')
+        for arg in self.args:
+            if arg.name != 'symbol_name' or use_name:
+                ret = ret + self.GenDescription(arg.name + ' ' + arg.description, \
+                                                ' * \\param ', \
+                                                ' * ')
+        ret = ret + " * \\return new symbol\n"
+        ret = ret + " */\n"
+        # create function header
+        declFirstLine = indentStr + 'inline Symbol %s(' % self.name
+        ret = ret + declFirstLine
+        argIndentStr = ' ' * len(declFirstLine)
+        arg_start = 0 if use_name else 1
+        if len(self.args) > arg_start:
+            ret = ret + self.GetArgString(self.args[arg_start])
+        for i in range(arg_start + 1, len(self.args)):
+            ret = ret + ',\n'
+            ret = ret + argIndentStr + self.GetArgString(self.args[i])
+        ret = ret + ') {\n'
+        # create function body
+        # if there is enum, generate static enum<->string mapping
+        for arg in self.args:
+            if arg.isEnum:
+                ret = ret + arg.enum.GetEnumStringArray(indent + 2)
+        # now generate code
+        ret = ret + indentStr + '  return Operator(\"%s\")\n' % self.name
+        for arg in self.args:   # set params
+            if arg.type == 'Symbol' or \
+               arg.type == 'const std::string&' or \
+               arg.type == 'const std::vector<Symbol>&':
+                continue
+            v = arg.name
+            if arg.isEnum:
+                v = arg.enum.GetConvertEnumVariableToString(v)
+            ret = ret + indentStr + ' ' * 11 + \
+                  '.SetParam(\"%s\", %s)\n' % (arg.name, v)
+        #ret = ret[:-1]  # get rid of the last \n
+        symbols = ''
+        inputAlreadySet = False
+        for arg in self.args:   # set inputs
+            if arg.type != 'Symbol':
+                continue
+            inputAlreadySet = True
+            #if symbols != '':
+            #    symbols = symbols + ', '
+            #symbols = symbols + arg.name
+            ret = ret + indentStr + ' ' * 11 + \
+                  '.SetInput(\"%s\", %s)\n' % (arg.name, arg.name)
+        for arg in self.args:   # set input arrays vector<Symbol>
+            if arg.type != 'const std::vector<Symbol>&':
+                continue
+            if (inputAlreadySet):
+                logging.error("op %s has both Symbol[] and Symbol inputs!" % self.name)
+            inputAlreadySet = True
+            symbols = arg.name
+            ret = ret + '(%s)\n' % symbols
+        ret = ret + indentStr + ' ' * 11
+        if use_name:
+            ret = ret + '.CreateSymbol(symbol_name);\n'
+        else:
+            ret = ret + '.CreateSymbol();\n'
+        ret = ret + indentStr + '}\n'
+        return ret
+    def GetArgString(self, arg):
+        ret = '%s %s' % (arg.type, arg.name)
+        if arg.hasDefault:
+            ret = ret + ' = ' + arg.defaultString
+        return ret
+
+
+def ParseAllOps():
+    """
+    MXNET_DLL int MXSymbolListAtomicSymbolCreators(mx_uint *out_size,
+                                                   AtomicSymbolCreator **out_array);
+
+    MXNET_DLL int MXSymbolGetAtomicSymbolInfo(AtomicSymbolCreator creator,
+                                              const char **name,
+                                              const char **description,
+                                              mx_uint *num_args,
+                                              const char ***arg_names,
+                                              const char ***arg_type_infos,
+                                              const char ***arg_descriptions,
+                                              const char **key_var_num_args);
+    """
+    cdll.libmxnet = cdll.LoadLibrary(sys.argv[1])
+    ListOP = cdll.libmxnet.MXSymbolListAtomicSymbolCreators
+    GetOpInfo = cdll.libmxnet.MXSymbolGetAtomicSymbolInfo
+    ListOP.argtypes=[POINTER(c_int), POINTER(POINTER(c_void_p))]
+    GetOpInfo.argtypes=[c_void_p, \
+        POINTER(c_char_p), \
+        POINTER(c_char_p), \
+        POINTER(c_int), \
+        POINTER(POINTER(c_char_p)), \
+        POINTER(POINTER(c_char_p)), \
+        POINTER(POINTER(c_char_p)), \
+        POINTER(c_char_p), \
+        POINTER(c_char_p)
+        ]
+
+    nOps = c_int()
+    opHandlers = POINTER(c_void_p)()
+    r = ListOP(byref(nOps), byref(opHandlers))
+    ret = ''
+    ret2 = ''
+    for i in range(0, nOps.value):
+        handler = opHandlers[i]
+        name = c_char_p()
+        description = c_char_p()
+        nArgs = c_int()
+        argNames = POINTER(c_char_p)()
+        argTypes = POINTER(c_char_p)()
+        argDescs = POINTER(c_char_p)()
+        varArgName = c_char_p()
+        return_type = c_char_p()
+
+        GetOpInfo(handler, byref(name), byref(description), \
+            byref(nArgs), byref(argNames), byref(argTypes), \
+            byref(argDescs), byref(varArgName), byref(return_type))
+
+        if name.value[0] == '_':   # get rid of functions like __init__
+            continue
+
+        args = []
+
+        for i in range(0, nArgs.value):
+            arg = Arg(name.value,
+                      argNames[i],
+                      argTypes[i],
+                      argDescs[i])
+            args.append(arg)
+
+        op = Op(name.value, description.value, args)
+
+        ret = ret + op.GetOpDefinitionString(True) + "\n"
+        ret2 = ret2 + op.GetOpDefinitionString(False) + "\n"
+    return ret + ret2
+
+if __name__ == "__main__":
+    #et = EnumType(typeName = 'MyET')
+    reload(sys)
+    sys.setdefaultencoding('UTF8')
+    #print(et.GetDefinitionString())
+    #print(et.GetEnumStringArray())
+    #arg = Arg()
+    #print(arg.ConstructEnumTypeName('SoftmaxActivation', 'act_type'))
+    #arg = Arg(opName = 'FullConnected', argName='act_type', \
+    #    typeString="{'elu', 'leaky', 'prelu', 'rrelu'},optional, default='leaky'", \
+    #    descString='Activation function to be applied.')
+    #print(arg.isEnum)
+    #print(arg.defaultString)
+    #arg = Arg("fc", "alpha", "float, optional, default=0.0001", "alpha")
+    #decl = "%s %s" % (arg.type, arg.name)
+    #if arg.hasDefault:
+    #    decl = decl + "=" + arg.defaultString
+    #print(decl)
+
+    # generate file header
+    patternStr = ("/*!\n"
+                  "* Copyright (c) 2016 by Contributors\n"
+                  "* \\file op.h\n"
+                  "* \\brief definition of all the operators\n"
+                  "* \\author Chuntao Hong, Xin Li\n"
+                  "*/\n"
+                  "\n"
+                  "#ifndef _MXNETOP_H\n"
+                  "#define _MXNETOP_H\n"
+                  "\n"
+                  "#include <string>\n"
+                  "#include <vector>\n"
+                  "#include \"mxnet-cpp/base.h\"\n"
+                  "#include \"mxnet-cpp/shape.h\"\n"
+                  "#include \"mxnet-cpp/operator.h\"\n"
+                  "#include \"dmlc/optional.h\"\n"
+                  "\n"
+                  "namespace mxnet {\n"
+                  "namespace cpp {\n"
+                  "\n"
+                  "%s"
+                  "} //namespace cpp\n"
+                  "} //namespace mxnet\n"
+                  "#endif //ifndef _MXNETOP_H\n")
+    with open('../../include/mxnet-cpp/op.h', 'w') as f:
+        f.write(patternStr % ParseAllOps())
+    pass
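To make the generator's output concrete, here is a hand-written approximation of one wrapper it emits into op.h. The operator and parameter names are illustrative; the real text comes from GetOpDefinitionString above and depends on the loaded libmxnet:

    // Approximation of generated output, not copied from op.h.
    inline Symbol FullyConnected(const std::string &symbol_name,
                                 Symbol data,
                                 Symbol weight,
                                 Symbol bias,
                                 int num_hidden,
                                 bool no_bias = false) {
      return Operator("FullyConnected")
                 .SetParam("num_hidden", num_hidden)
                 .SetParam("no_bias", no_bias)
                 .SetInput("data", data)
                 .SetInput("weight", weight)
                 .SetInput("bias", bias)
                 .CreateSymbol(symbol_name);
    }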
diff --git a/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.pyproj b/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.pyproj
new file mode 100755
index 000000000000..b2d8448b830d
--- /dev/null
+++ b/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.pyproj
@@ -0,0 +1,28 @@
+
+
+
+    Debug
+    2.0
+    {027054bd-8dd3-4d2e-8032-22e339846ed1}
+
+    OpWrapperGenerator.py
+
+    .
+    .
+    {888888a0-9f3d-457c-b088-3a5042f75d52}
+    Standard Python launcher
+
+
+
+
+
+
+    10.0
+    $(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.sln b/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.sln
new file mode 100755
index 000000000000..71dc32749769
--- /dev/null
+++ b/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.sln
@@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2013
+VisualStudioVersion = 12.0.40629.0
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "OpWrapperGenerator", "OpWrapperGenerator.pyproj", "{027054BD-8DD3-4D2E-8032-22E339846ED1}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Any CPU = Debug|Any CPU
+		Release|Any CPU = Release|Any CPU
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{027054BD-8DD3-4D2E-8032-22E339846ED1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{027054BD-8DD3-4D2E-8032-22E339846ED1}.Release|Any CPU.ActiveCfg = Release|Any CPU
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
diff --git a/cpp-package/src/OpWrapperGenerator/README.md b/cpp-package/src/OpWrapperGenerator/README.md
new file mode 100644
index 000000000000..8fb45ec661f2
--- /dev/null
+++ b/cpp-package/src/OpWrapperGenerator/README.md
@@ -0,0 +1 @@
+## This is a python script that generates operator wrappers such as FullyConnected, based on the current libmxnet.dll. This script is written so that we don't need to write new operator wrappers when new operators are added to the library.
diff --git a/cpp-package/tests/travis/run_test.sh b/cpp-package/tests/travis/run_test.sh
new file mode 100755
index 000000000000..36c8d3356ba9
--- /dev/null
+++ b/cpp-package/tests/travis/run_test.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+if [ ${TASK} == "lint" ]; then
+    make lint || exit -1
+    echo "Check documentations of c++ code..."
+    make doc 2>log.txt
+    (cat log.txt | grep -v ENABLE_PREPROCESSING | grep -v "unsupported tag") > logclean.txt
+    echo "---------Error Log----------"
+    cat logclean.txt
+    echo "----------------------------"
+    (cat logclean.txt | grep warning) && exit -1
+    (cat logclean.txt | grep error) && exit -1
+    exit 0
+fi
+
+if [ ${TRAVIS_OS_NAME} == "linux" ]; then
+    # use g++-4.8 in linux
+    export CXX=g++-4.8
+fi
+
+if [ ${TASK} == "build" ]; then
+    make example
+    exit $?
+fi
diff --git a/cpp-package/tests/travis/setup.sh b/cpp-package/tests/travis/setup.sh
new file mode 100755
index 000000000000..4238c7654fe4
--- /dev/null
+++ b/cpp-package/tests/travis/setup.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+if [ ${TASK} == "lint" ]; then
+    pip install cpplint 'pylint==1.4.4' 'astroid==1.3.6' --user
+fi

From f5a283d95d27871fb32da455419ccb2e3c739a9a Mon Sep 17 00:00:00 2001
From: Xin Li
Date: Sat, 11 Mar 2017 03:30:11 +0800
Subject: [PATCH 2/5] Add missing type name declaration for
 nnvm::Tuple<dmlc::optional<int>>

---
 include/mxnet/tensor_blob.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/mxnet/tensor_blob.h b/include/mxnet/tensor_blob.h
index 10939c9b6318..cfb2511bf1b3 100755
--- a/include/mxnet/tensor_blob.h
+++ b/include/mxnet/tensor_blob.h
@@ -302,6 +302,7 @@ namespace dmlc {
 // Add a few patches to support TShape in dmlc/parameter.
 DMLC_DECLARE_TYPE_NAME(mxnet::TShape, "Shape(tuple)");
 DMLC_DECLARE_TYPE_NAME(nnvm::Tuple<int>, "Shape(tuple)");
+DMLC_DECLARE_TYPE_NAME(nnvm::Tuple<dmlc::optional<int>>, "Shape(tuple)");
 
 namespace parameter {

From 8ac7074988624c56dc369a927c40b4f65f0ff1a0 Mon Sep 17 00:00:00 2001
From: Xin Li
Date: Sat, 11 Mar 2017 03:30:48 +0800
Subject: [PATCH 3/5] Add trigger to update operation wrapper in cpp package
 when building mxnet

---
 CMakeLists.txt | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fe5cf32ae68a..d324c74a51d6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -354,4 +354,12 @@ set(LINT_DIRS include src scripts python)
 add_custom_target(mxnet_lint COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DPYTHON_EXECUTABLE=${PYTHON_EXECUTABLE} -DLINT_DIRS=${LINT_DIRS} -DPROJECT_SOURCE_DIR=${PROJECT_SOURCE_DIR} -DPROJECT_NAME=mxnet -P ${PROJECT_SOURCE_DIR}/dmlc-core/cmake/lint.cmake)
 add_subdirectory(tests/cpp)
-add_subdirectory(example/image-classification/predict-cpp)
\ No newline at end of file
+add_subdirectory(example/image-classification/predict-cpp)
+
+add_custom_command(TARGET mxnet
+  POST_BUILD
+  COMMAND cp $<TARGET_FILE:mxnet> .
+  COMMAND python OpWrapperGenerator.py $<TARGET_FILE_NAME:mxnet>
+  COMMAND rm $<TARGET_FILE_NAME:mxnet>
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/cpp-package/src/OpWrapperGenerator/
+)
From 82b54a827b4be70fc464ce3bf80d3278fcf5f86b Mon Sep 17 00:00:00 2001
From: Xin Li
Date: Sat, 18 Mar 2017 16:31:35 +0800
Subject: [PATCH 4/5] Add test to travis scripts

---
 cpp-package/example/Makefile | 4 +---
 tests/travis/run_test.sh     | 6 ++++++
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/cpp-package/example/Makefile b/cpp-package/example/Makefile
index 45f0c896aab9..3b146a1b8d58 100644
--- a/cpp-package/example/Makefile
+++ b/cpp-package/example/Makefile
@@ -14,7 +14,7 @@ ifneq ($(OS), Darwin)
 	LDFLAGS += -lgomp
 endif
 
-all: mlp lenet lenet_with_mxdataiter alexnet googlenet inception_bn resnet
+all: mlp lenet lenet_with_mxdataiter alexnet googlenet resnet
 
 lenet_with_mxdataiter: ./lenet_with_mxdataiter.cpp
 	$(CXX) -c -std=c++11 $(CFLAGS) $^
@@ -58,7 +58,6 @@ travis:
 	$(CXX) -c -std=c++11 $(CFLAGS) ./lenet_with_mxdataiter.cpp && rm -f lenet_with_mxdataiter.o
 	$(CXX) -c -std=c++11 $(CFLAGS) ./alexnet.cpp && rm -f alexnet.o
 	$(CXX) -c -std=c++11 $(CFLAGS) ./googlenet.cpp && rm -f googlenet.o
-	$(CXX) -c -std=c++11 $(CFLAGS) ./inception_bn.cpp && rm -f inception_bn.o
 	$(CXX) -c -std=c++11 $(CFLAGS) ./resnet.cpp && rm -f resnet.o
 
@@ -68,5 +67,4 @@ clean:
 	-rm -f lenet_with_mxdataiter
 	-rm -f alexnet
 	-rm -f googlenet
-	-rm -f inception_bn
 	-rm -f resnet
diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh
index bad2679e0928..34fa80ba6122 100755
--- a/tests/travis/run_test.sh
+++ b/tests/travis/run_test.sh
@@ -179,3 +179,9 @@ if [ ${TASK} == "perl_test" ]; then
     make test || exit -1
     exit 0
 fi
+
+if [ ${TASK} == "cpp_package_test" ]; then
+    MXNET_HOME=${PWD}
+    make travis -C ${MXNET_HOME}/cpp-package/example
+    exit 0
+fi

From 2673e9a19ceff67353b6965b519de7bfda37002e Mon Sep 17 00:00:00 2001
From: Xin Li
Date: Sun, 19 Mar 2017 20:54:07 +0800
Subject: [PATCH 5/5] Disable op generation on windows

---
 CMakeLists.txt | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 43efa5169d27..08f25de10da6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -435,10 +435,12 @@ add_custom_target(mxnet_lint COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DPYTHON_EXECUTABLE=${PYTHON_EXECUTABLE} -DLINT_DIRS=${LINT_DIRS} -DPROJECT_SOURCE_DIR=${PROJECT_SOURCE_DIR} -DPROJECT_NAME=mxnet -P ${PROJECT_SOURCE_DIR}/dmlc-core/cmake/lint.cmake)
 
 add_subdirectory(example/image-classification/predict-cpp)
 
-add_custom_command(TARGET mxnet
-  POST_BUILD
-  COMMAND cp $<TARGET_FILE:mxnet> .
-  COMMAND python OpWrapperGenerator.py $<TARGET_FILE_NAME:mxnet>
-  COMMAND rm $<TARGET_FILE_NAME:mxnet>
-  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/cpp-package/src/OpWrapperGenerator/
-)
+if(NOT MSVC)
+  add_custom_command(TARGET mxnet
+    POST_BUILD
+    COMMAND cp $<TARGET_FILE:mxnet> .
+    COMMAND python OpWrapperGenerator.py $<TARGET_FILE_NAME:mxnet>
+    COMMAND rm $<TARGET_FILE_NAME:mxnet>
+    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/cpp-package/src/OpWrapperGenerator/
+  )
+endif()
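With the post-build hook in place (on non-MSVC builds), op.h is regenerated against the freshly built library, and client code can use the wrappers roughly like this. The operator signatures shown are assumptions, since the real ones depend on the generated header:

    #include "mxnet-cpp/MxNetCpp.h"
    using namespace mxnet::cpp;

    Symbol TinyNet() {
      Symbol data = Symbol::Variable("data");
      Symbol w = Symbol::Variable("fc1_w");
      Symbol b = Symbol::Variable("fc1_b");
      // FullyConnected/SoftmaxOutput come from the regenerated op.h.
      Symbol fc = FullyConnected("fc1", data, w, b, 64);
      return SoftmaxOutput("softmax", fc, Symbol::Variable("label"));
    }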