diff --git a/tfjs-core/src/backends/cpu/backend_cpu.ts b/tfjs-core/src/backends/cpu/backend_cpu.ts index f45855309a8..851f118fd55 100644 --- a/tfjs-core/src/backends/cpu/backend_cpu.ts +++ b/tfjs-core/src/backends/cpu/backend_cpu.ts @@ -377,17 +377,6 @@ export class MathBackendCPU extends KernelBackend { return result.toTensor() as T; } - softmax(logits: T, dim: number): T { - const axes = util.parseAxisParam([dim], logits.shape); - const maxLogit = this.max(logits, axes); - const expandedShape = axis_util.expandShapeToKeepDim(maxLogit.shape, axes); - const a = this.subtract(logits, maxLogit.reshape(expandedShape)); - const b = this.exp(a); - const sumExp = this.sum(b, axes).reshape(expandedShape); - - return this.realDivide(b, sumExp) as T; - } - subtract(a: Tensor, b: Tensor): Tensor { if (a.dtype === 'complex64' || b.dtype === 'complex64') { return this.broadcastedBinaryComplexOp( @@ -493,13 +482,13 @@ export class MathBackendCPU extends KernelBackend { (aValue, bValue) => aValue * bValue); } - realDivide(a: Tensor, b: Tensor): Tensor { - assertNotComplex([a, b], 'realDivide'); + // realDivide(a: Tensor, b: Tensor): Tensor { + // assertNotComplex([a, b], 'realDivide'); - const op = (a: number, b: number) => a / b; - const outputDtype = 'float32'; - return this.broadcastedBinaryOp(a, b, outputDtype, op); - } + // const op = (a: number, b: number) => a / b; + // const outputDtype = 'float32'; + // return this.broadcastedBinaryOp(a, b, outputDtype, op); + // } floorDiv(a: Tensor, b: Tensor): Tensor { assertNotComplex([a, b], 'floorDiv'); diff --git a/tfjs-core/src/backends/cpu/kernels/Div.ts b/tfjs-core/src/backends/cpu/kernels/Div.ts new file mode 100644 index 00000000000..5483f92f857 --- /dev/null +++ b/tfjs-core/src/backends/cpu/kernels/Div.ts @@ -0,0 +1,23 @@ +/** + * @license + * Copyright 2020 Google LLC. All Rights Reserved. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ============================================================================= + */ + +import {Div} from '../../../kernel_names'; +import {createBinaryKernelConfig} from '../utils/kernel_utils'; +import {createBinaryOp} from '../utils/kernel_utils'; + +export const div = createBinaryOp((a: number, b: number) => a / b); +export const divConfig = createBinaryKernelConfig(Div, div); diff --git a/tfjs-core/src/backends/cpu/kernels/Exp.ts b/tfjs-core/src/backends/cpu/kernels/Exp.ts new file mode 100644 index 00000000000..a6d40fdd1c4 --- /dev/null +++ b/tfjs-core/src/backends/cpu/kernels/Exp.ts @@ -0,0 +1,48 @@ +/** + * @license + * Copyright 2020 Google LLC. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * ============================================================================= + */ + +import {Exp, ExpInputs} from '../../../kernel_names'; +import {KernelConfig} from '../../../kernel_registry'; +import {TypedArray} from '../../../types'; +import * as util from '../../../util'; +import {MathBackendCPU} from '../backend_cpu'; + +export const exp = (x: TypedArray): TypedArray => { + const outValues = util.getTypedArrayFromDType('float32', x.length); + + for (let i = 0; i < x.length; ++i) { + outValues[i] = Math.exp(x[i]); + } + + return outValues; +}; + +export const expConfig: KernelConfig = { + kernelName: Exp, + backendName: 'cpu', + kernelFunc: ({inputs, backend}) => { + const {x} = inputs as ExpInputs; + const cpuBackend = backend as MathBackendCPU; + + const xVals = cpuBackend.data.get(x.dataId).values as Float32Array; + + const result = exp(xVals); + + const dataId = cpuBackend.write(result, x.shape, x.dtype); + return {dataId, shape: x.shape, dtype: x.dtype}; + } +}; diff --git a/tfjs-core/src/backends/cpu/kernels/Max.ts b/tfjs-core/src/backends/cpu/kernels/Max.ts new file mode 100644 index 00000000000..23b018ae9f6 --- /dev/null +++ b/tfjs-core/src/backends/cpu/kernels/Max.ts @@ -0,0 +1,69 @@ +/** + * @license + * Copyright 2019 Google LLC. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * ============================================================================= + */ + +import {Max, MaxAttrs, MaxInputs} from '../../../kernel_names'; +import {KernelConfig} from '../../../kernel_registry'; +import * as axis_util from '../../../ops/axis_util'; +import {DataType, NumericDataType, TypedArray} from '../../../types'; +import * as util from '../../../util'; +import {sizeFromShape} from '../../../util'; +import {MathBackendCPU} from '../backend_cpu'; +import {assertNotComplex} from '../cpu_util'; + +export const max = + (x: TypedArray, reduceSize: number, outShape: number[], dtype: DataType): + TypedArray => { + const outValues = util.getTypedArrayFromDType( + dtype as NumericDataType, util.sizeFromShape(outShape)); + + for (let i = 0; i < x.length; ++i) { + const offset = i * reduceSize; + let max = x[offset]; + for (let j = 0; j < reduceSize; ++j) { + const value = x[offset + j]; + if (value > max) { + max = value; + } + } + outValues[i] = max; + } + + return outValues; + }; + +export const maxConfig: KernelConfig = { + kernelName: Max, + backendName: 'cpu', + kernelFunc: ({inputs, attrs, backend}) => { + const {x} = inputs as MaxInputs; + const {axes} = attrs as {} as MaxAttrs; + const cpuBackend = backend as MathBackendCPU; + + assertNotComplex(x, 'max'); + + axis_util.assertAxesAreInnerMostDims('max', axes, x.shape.length); + + const [outShape, reduceShape] = + axis_util.computeOutAndReduceShapes(x.shape, axes); + + const xVals = cpuBackend.data.get(x.dataId).values as Float32Array; + const result = max(xVals, sizeFromShape(reduceShape), outShape, x.dtype); + + const dataId = cpuBackend.write(result, outShape, x.dtype); + return {dataId, shape: outShape, dtype: x.dtype}; + } +}; diff --git a/tfjs-core/src/backends/cpu/kernels/Softmax.ts b/tfjs-core/src/backends/cpu/kernels/Softmax.ts new file mode 100644 index 00000000000..06731f62f2a --- /dev/null +++ b/tfjs-core/src/backends/cpu/kernels/Softmax.ts @@ -0,0 +1,64 @@ +/** + * @license + * 
Copyright 2019 Google LLC. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ============================================================================= + */ + +import {Softmax, SoftmaxAttrs, SoftmaxInputs} from '../../../kernel_names'; +import {KernelConfig} from '../../../kernel_registry'; +import * as axis_util from '../../../ops/axis_util'; +import {parseAxisParam, sizeFromShape} from '../../../util'; +import {MathBackendCPU} from '../backend_cpu'; +import {assertNotComplex} from '../cpu_util'; + +import {div} from './Div'; +import {exp} from './Exp'; +import {max} from './Max'; +import {sub} from './Sub'; +import {sum} from './Sum'; + +export const softmaxConfig: KernelConfig = { + kernelName: Softmax, + backendName: 'cpu', + kernelFunc: ({inputs, attrs, backend}) => { + const {logits} = inputs as SoftmaxInputs; + const {dim} = attrs as {} as SoftmaxAttrs; + const cpuBackend = backend as MathBackendCPU; + assertNotComplex(logits, 'softmax'); + + const axes = parseAxisParam([dim], logits.shape); + + const [reduceOutShape, reduceShape] = + axis_util.computeOutAndReduceShapes(logits.shape, axes); + const logitsValues = + cpuBackend.data.get(logits.dataId).values as Float32Array; + const maxLogit = max( + logitsValues, sizeFromShape(reduceShape), reduceOutShape, logits.dtype); + + const expandedShape = axis_util.expandShapeToKeepDim(reduceOutShape, axes); + + const [aValues,] = + sub(logits.shape, expandedShape, logitsValues, maxLogit, 
logits.dtype); + + const b = exp(aValues); + + const sumExp = + sum(b, sizeFromShape(reduceShape), reduceOutShape, logits.dtype); + + const [resultData, resultShape] = + div(logits.shape, reduceShape, b, sumExp, logits.dtype); + const dataId = cpuBackend.write(resultData, resultShape, logits.dtype); + return {dataId, shape: resultShape, dtype: logits.dtype}; + } +}; diff --git a/tfjs-core/src/backends/cpu/kernels/SquaredDifference.ts b/tfjs-core/src/backends/cpu/kernels/SquaredDifference.ts index a57bf1a156a..d89a0e71181 100644 --- a/tfjs-core/src/backends/cpu/kernels/SquaredDifference.ts +++ b/tfjs-core/src/backends/cpu/kernels/SquaredDifference.ts @@ -15,31 +15,14 @@ * ============================================================================= */ -import {SquaredDifference, SquaredDifferenceInputs} from '../../../kernel_names'; -import {KernelConfig} from '../../../kernel_registry'; -import {TypedArray} from '../../../types'; -import {MathBackendCPU} from '../backend_cpu'; -import {assertNotComplex} from '../cpu_util'; -import {broadcastedBinaryOp} from '../utils/kernel_utils'; +import {SquaredDifference} from '../../../kernel_names'; +import {createBinaryOp} from '../utils/kernel_utils'; +import {createBinaryKernelConfig} from '../utils/kernel_utils'; -export const squaredDifferenceConfig: KernelConfig = { - kernelName: SquaredDifference, - backendName: 'cpu', - kernelFunc: ({inputs, backend}) => { - const {a, b} = inputs as SquaredDifferenceInputs; - const cpuBackend = backend as MathBackendCPU; - assertNotComplex([a, b], SquaredDifference); +const squaredDifferenceImpl = createBinaryOp((aVal, bVal) => { + const diff = aVal - bVal; + return diff * diff; +}); - const aVals = cpuBackend.data.get(a.dataId).values as TypedArray; - const bVals = cpuBackend.data.get(b.dataId).values as TypedArray; - - const [resultData, resultShape] = broadcastedBinaryOp( - a.shape, b.shape, aVals, bVals, a.dtype, (aVal, bVal) => { - const diff = aVal - bVal; - return diff * diff; 
- }); - - const dataId = cpuBackend.write(resultData, resultShape, a.dtype); - return {dataId, shape: resultShape, dtype: a.dtype}; - } -}; +export const squaredDifferenceConfig = + createBinaryKernelConfig(SquaredDifference, squaredDifferenceImpl); diff --git a/tfjs-core/src/backends/cpu/kernels/Sub.ts b/tfjs-core/src/backends/cpu/kernels/Sub.ts new file mode 100644 index 00000000000..f37ebde371f --- /dev/null +++ b/tfjs-core/src/backends/cpu/kernels/Sub.ts @@ -0,0 +1,24 @@ +/** + * @license + * Copyright 2020 Google LLC. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ============================================================================= + */ + +import {Sub} from '../../../kernel_names'; +import {createBinaryKernelConfig} from '../utils/kernel_utils'; +import {createBinaryOp} from '../utils/kernel_utils'; + +export const sub = createBinaryOp((a: number, b: number) => a - b); + +export const subConfig = createBinaryKernelConfig(Sub, sub); diff --git a/tfjs-core/src/backends/cpu/kernels/Sum.ts b/tfjs-core/src/backends/cpu/kernels/Sum.ts new file mode 100644 index 00000000000..8779e9efb1d --- /dev/null +++ b/tfjs-core/src/backends/cpu/kernels/Sum.ts @@ -0,0 +1,69 @@ +/** + * @license + * Copyright 2019 Google LLC. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ============================================================================= + */ + +import {Sum, SumAttrs, SumInputs} from '../../../kernel_names'; +import {KernelConfig} from '../../../kernel_registry'; +import * as axis_util from '../../../ops/axis_util'; +import {upcastType} from '../../../types'; +import {DataType, NumericDataType, TypedArray} from '../../../types'; +import * as util from '../../../util'; +import {sizeFromShape} from '../../../util'; +import {MathBackendCPU} from '../backend_cpu'; +import {assertNotComplex} from '../cpu_util'; + +export const sum = + (x: TypedArray, reduceSize: number, outShape: number[], dtype: DataType): + TypedArray => { + const outValues = util.getTypedArrayFromDType( + dtype as NumericDataType, util.sizeFromShape(outShape)); + + for (let i = 0; i < x.length; ++i) { + const offset = i * reduceSize; + let sum = 0; + for (let j = 0; j < reduceSize; ++j) { + const value = x[offset + j]; + sum += value; + } + outValues[i] = sum; + } + + return outValues; + }; + +export const sumConfig: KernelConfig = { + kernelName: Sum, + backendName: 'cpu', + kernelFunc: ({inputs, attrs, backend}) => { + const {x} = inputs as SumInputs; + const {axes} = attrs as {} as SumAttrs; + const cpuBackend = backend as MathBackendCPU; + + assertNotComplex(x, 'sum'); + + axis_util.assertAxesAreInnerMostDims('sum', axes, x.shape.length); + + const [outShape, reduceShape] = + axis_util.computeOutAndReduceShapes(x.shape, axes); + const resultDtype = upcastType(x.dtype, 'int32'); + + const xVals = cpuBackend.data.get(x.dataId).values 
as Float32Array; + const result = sum(xVals, sizeFromShape(reduceShape), outShape, x.dtype); + + const dataId = cpuBackend.write(result, outShape, resultDtype); + return {dataId, shape: outShape, dtype: resultDtype}; + } +}; diff --git a/tfjs-core/src/backends/cpu/register_all_kernels.ts b/tfjs-core/src/backends/cpu/register_all_kernels.ts index 9516f9af311..ef9282a38a2 100644 --- a/tfjs-core/src/backends/cpu/register_all_kernels.ts +++ b/tfjs-core/src/backends/cpu/register_all_kernels.ts @@ -19,15 +19,16 @@ // the contents of this file and import only the kernels that are needed. import {KernelConfig, registerKernel} from '../../kernel_registry'; +import {divConfig} from './kernels/Div'; import {nonMaxSuppressionV5Config} from './kernels/NonMaxSuppressionV5'; +import {softmaxConfig} from './kernels/Softmax'; import {squareConfig} from './kernels/Square'; import {squaredDifferenceConfig} from './kernels/SquaredDifference'; // List all kernel configs here const kernelConfigs: KernelConfig[] = [ - nonMaxSuppressionV5Config, - squareConfig, - squaredDifferenceConfig, + nonMaxSuppressionV5Config, squareConfig, squaredDifferenceConfig, divConfig, + softmaxConfig ]; for (const kernelConfig of kernelConfigs) { diff --git a/tfjs-core/src/backends/cpu/utils/kernel_utils.ts b/tfjs-core/src/backends/cpu/utils/kernel_utils.ts index ce21517dba4..f9fa95e9903 100644 --- a/tfjs-core/src/backends/cpu/utils/kernel_utils.ts +++ b/tfjs-core/src/backends/cpu/utils/kernel_utils.ts @@ -16,50 +16,77 @@ */ import * as backend_util from '../../../backends/backend_util'; +import {BinaryInputs} from '../../../kernel_names'; +import {KernelConfig} from '../../../kernel_registry'; import {DataType, NumericDataType, TypedArray} from '../../../types'; import * as util from '../../../util'; +import {MathBackendCPU} from '../backend_cpu'; +import {assertNotComplex} from '../cpu_util'; -export function broadcastedBinaryOp( - aShape: number[], bShape: number[], aVals: TypedArray, bVals: TypedArray, - 
dtype: DataType, - op: (a: number, b: number) => number): [TypedArray, number[]] { - const newShape = backend_util.assertAndGetBroadcastShape(aShape, bShape); +export const createBinaryKernelConfig = + (name: string, + op: ( + aShape: number[], bShape: number[], aVals: TypedArray, + bVals: TypedArray, dtype: DataType) => [TypedArray, number[]]): + KernelConfig => ({ + kernelName: name, + backendName: 'cpu', + kernelFunc: ({inputs, backend}) => { + const {a, b} = inputs as BinaryInputs; + const cpuBackend = backend as MathBackendCPU; + assertNotComplex([a, b], name); - const resultRank = newShape.length; - const resultStrides = util.computeStrides(newShape); - const resultSize = util.sizeFromShape(newShape); + const aVals = cpuBackend.data.get(a.dataId).values as TypedArray; + const bVals = cpuBackend.data.get(b.dataId).values as TypedArray; - const result = - util.getTypedArrayFromDType(dtype as NumericDataType, resultSize); + const [resultData, resultShape] = + op(a.shape, b.shape, aVals, bVals, a.dtype); - const aRank = aShape.length; - const bRank = bShape.length; + const dataId = cpuBackend.write(resultData, resultShape, a.dtype); + return {dataId, shape: resultShape, dtype: a.dtype}; + } + }); - const aStrides = util.computeStrides(aShape); - const bStrides = util.computeStrides(bShape); +export const createBinaryOp = (op: (a: number, b: number) => number) => + (aShape: number[], bShape: number[], aVals: TypedArray, bVals: TypedArray, + dtype: DataType): [TypedArray, number[]] => { + const newShape = backend_util.assertAndGetBroadcastShape(aShape, bShape); - const aBroadcastDims = backend_util.getBroadcastDims(aShape, newShape); - const bBroadcastDims = backend_util.getBroadcastDims(bShape, newShape); + const resultRank = newShape.length; + const resultStrides = util.computeStrides(newShape); + const resultSize = util.sizeFromShape(newShape); - if (aBroadcastDims.length + bBroadcastDims.length === 0) { - for (let i = 0; i < result.length; ++i) { - result[i] = 
op(aVals[i % aVals.length], bVals[i % bVals.length]); - } - } else { - for (let i = 0; i < result.length; ++i) { - const loc = util.indexToLoc(i, resultRank, resultStrides); + const result = + util.getTypedArrayFromDType(dtype as NumericDataType, resultSize); - const aLoc = loc.slice(-aRank); - aBroadcastDims.forEach(d => aLoc[d] = 0); - const aIndex = util.locToIndex(aLoc, aRank, aStrides); + const aRank = aShape.length; + const bRank = bShape.length; - const bLoc = loc.slice(-bRank); - bBroadcastDims.forEach(d => bLoc[d] = 0); - const bIndex = util.locToIndex(bLoc, bRank, bStrides); + const aStrides = util.computeStrides(aShape); + const bStrides = util.computeStrides(bShape); - result[i] = op(aVals[aIndex], bVals[bIndex]); - } - } + const aBroadcastDims = backend_util.getBroadcastDims(aShape, newShape); + const bBroadcastDims = backend_util.getBroadcastDims(bShape, newShape); - return [result, newShape]; -} + if (aBroadcastDims.length + bBroadcastDims.length === 0) { + for (let i = 0; i < result.length; ++i) { + result[i] = op(aVals[i % aVals.length], bVals[i % bVals.length]); + } + } else { + for (let i = 0; i < result.length; ++i) { + const loc = util.indexToLoc(i, resultRank, resultStrides); + + const aLoc = loc.slice(-aRank); + aBroadcastDims.forEach(d => aLoc[d] = 0); + const aIndex = util.locToIndex(aLoc, aRank, aStrides); + + const bLoc = loc.slice(-bRank); + bBroadcastDims.forEach(d => bLoc[d] = 0); + const bIndex = util.locToIndex(bLoc, bRank, bStrides); + + result[i] = op(aVals[aIndex], bVals[bIndex]); + } + } + + return [result, newShape]; + }; diff --git a/tfjs-core/src/backends/webgl/backend_webgl.ts b/tfjs-core/src/backends/webgl/backend_webgl.ts index f513265c7aa..321e693a55a 100644 --- a/tfjs-core/src/backends/webgl/backend_webgl.ts +++ b/tfjs-core/src/backends/webgl/backend_webgl.ts @@ -1330,18 +1330,18 @@ export class MathBackendWebGL extends KernelBackend { return this.compileAndRun(program, [a, b]); } - max(x: Tensor, axes: number[]): Tensor 
{ - if (this.shouldExecuteOnCPU([x])) { - return this.cpuBackend.max(x, axes); - } - - axis_util.assertAxesAreInnerMostDims('max', axes, x.rank); - const [outShape, reduceShape] = - axis_util.computeOutAndReduceShapes(x.shape, axes); - const inSize = util.sizeFromShape(reduceShape); - const a2D = x.as2D(-1, inSize); - return this.reduce(a2D, 'max', a2D.dtype).reshape(outShape); - } + // max(x: Tensor, axes: number[]): Tensor { + // if (this.shouldExecuteOnCPU([x])) { + // return this.cpuBackend.max(x, axes); + // } + + // axis_util.assertAxesAreInnerMostDims('max', axes, x.rank); + // const [outShape, reduceShape] = + // axis_util.computeOutAndReduceShapes(x.shape, axes); + // const inSize = util.sizeFromShape(reduceShape); + // const a2D = x.as2D(-1, inSize); + // return this.reduce(a2D, 'max', a2D.dtype).reshape(outShape); + // } maximum(a: Tensor, b: Tensor): Tensor { if (this.shouldExecuteOnCPU([a, b])) { @@ -1372,17 +1372,17 @@ export class MathBackendWebGL extends KernelBackend { return this.reduce(a2D, 'any', a2D.dtype).reshape(outShape); } - realDivide(a: Tensor, b: Tensor): Tensor { - const op = binaryop_gpu.DIV; - const outputDtype = 'float32'; - if (env().getBool('WEBGL_PACK_BINARY_OPERATIONS')) { - const checkOutOfBounds = true; - return this.packedBinaryOp( - a, b, binaryop_packed_gpu.DIV, outputDtype, checkOutOfBounds); - } - const program = new BinaryOpProgram(op, a.shape, b.shape); - return this.compileAndRun(program, [a, b], outputDtype); - } + // realDivide(a: Tensor, b: Tensor): Tensor { + // const op = binaryop_gpu.DIV; + // const outputDtype = 'float32'; + // if (env().getBool('WEBGL_PACK_BINARY_OPERATIONS')) { + // const checkOutOfBounds = true; + // return this.packedBinaryOp( + // a, b, binaryop_packed_gpu.DIV, outputDtype, checkOutOfBounds); + // } + // const program = new BinaryOpProgram(op, a.shape, b.shape); + // return this.compileAndRun(program, [a, b], outputDtype); + // } floorDiv(a: Tensor, b: Tensor): Tensor { const op = 
binaryop_gpu.INT_DIV; @@ -1589,16 +1589,16 @@ export class MathBackendWebGL extends KernelBackend { return this.compileAndRun(program, [x]); } - softmax(logits: T, dim: number): T { - const axes = util.parseAxisParam([dim], logits.shape); - const maxLogit = this.max(logits, axes); - const expandedShape = axis_util.expandShapeToKeepDim(maxLogit.shape, axes); - const a = this.subtract(logits, maxLogit.reshape(expandedShape)); - const b = this.exp(a); - const sumExp = this.sum(b, axes).reshape(expandedShape); + // softmax(logits: T, dim: number): T { + // const axes = util.parseAxisParam([dim], logits.shape); + // const maxLogit = this.max(logits, axes); + // const expandedShape = axis_util.expandShapeToKeepDim(maxLogit.shape, + // axes); const a = this.subtract(logits, maxLogit.reshape(expandedShape)); + // const b = this.exp(a); + // const sumExp = this.sum(b, axes).reshape(expandedShape); - return this.realDivide(b, sumExp) as T; - } + // return this.realDivide(b, sumExp) as T; + // } log(x: T): T { if (this.shouldExecuteOnCPU([x])) { @@ -2440,8 +2440,8 @@ export class MathBackendWebGL extends KernelBackend { const program = new PackProgram(input.shape); const preventEagerUnpackingOutput = true; return this.runWebGLProgram( - program, [input], input.dtype, null /* customSetup */, - preventEagerUnpackingOutput); + program, [input], input.dtype, null /* out info */, + null /* customSetup */, preventEagerUnpackingOutput); } private packedReshape(input: TensorInfo, afterShape: number[]): TensorInfo { @@ -2461,8 +2461,8 @@ export class MathBackendWebGL extends KernelBackend { const program = new ReshapePackedProgram(afterShapeAs3D, input3DShape); const preventEagerUnpackingOfOutput = true; const output = this.runWebGLProgram( - program, [input3D], input.dtype, null /* customSetup */, - preventEagerUnpackingOfOutput); + program, [input3D], input.dtype, null /* out info */, + null /* customSetup */, preventEagerUnpackingOfOutput); return {dataId: output.dataId, shape: 
afterShape, dtype: output.dtype}; } @@ -2480,15 +2480,20 @@ export class MathBackendWebGL extends KernelBackend { const preventEagerUnpackingOfOutput = true; const out = this.runWebGLProgram( program, [{shape: shapeAs3D, dtype, dataId}], dtype, - null /* customSetup */, preventEagerUnpackingOfOutput); + null /* out info */, null /* customSetup */, + preventEagerUnpackingOfOutput); return {dtype, shape, dataId: out.dataId}; } runWebGLProgram( program: GPGPUProgram, inputs: TensorInfo[], outputDtype: DataType, + output?: TensorInfo, customSetup?: (gpgpu: GPGPUContext, webGLProgram: WebGLProgram) => void, preventEagerUnpackingOfOutput = false): TensorInfo { - const output = this.makeTensorInfo(program.outputShape, outputDtype); + if (output == null) { + output = this.makeTensorInfo(program.outputShape, outputDtype); + } + const outData = this.texData.get(output.dataId); if (program.packedOutput) { outData.isPacked = true; @@ -2615,7 +2620,7 @@ export class MathBackendWebGL extends KernelBackend { preventEagerUnpackingOfOutput = false): K { outputDtype = outputDtype || inputs[0].dtype; const outInfo = this.runWebGLProgram( - program, inputs, outputDtype, customSetup, + program, inputs, outputDtype, null /* out info */, customSetup, preventEagerUnpackingOfOutput); return ENGINE.makeTensorFromDataId( outInfo.dataId, outInfo.shape, outInfo.dtype) as {} as K; @@ -2741,7 +2746,8 @@ export class MathBackendWebGL extends KernelBackend { // WEBGL_PACK. const preventEagerUnpacking = true; const encodedOutputTarget = this.runWebGLProgram( - program, [tempDenseInputHandle], dtype, null, preventEagerUnpacking); + program, [tempDenseInputHandle], dtype, null /* out info */, null, + preventEagerUnpacking); // Have the original texture assume the identity of the encoded output. 
const outputTexData = this.texData.get(encodedOutputTarget.dataId); @@ -2777,7 +2783,7 @@ export class MathBackendWebGL extends KernelBackend { return texData.values as TypedArray; } - private acquireTexture( + acquireTexture( texShape: [number, number], texType: TextureUsage, dtype: DataType, isPacked: boolean): WebGLTexture { this.numBytesInGPU += this.computeBytes(texShape, dtype); diff --git a/tfjs-core/src/backends/webgl/kernels/Div.ts b/tfjs-core/src/backends/webgl/kernels/Div.ts new file mode 100644 index 00000000000..ea6eb839735 --- /dev/null +++ b/tfjs-core/src/backends/webgl/kernels/Div.ts @@ -0,0 +1,59 @@ +/** + * @license + * Copyright 2020 Google LLC. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * ============================================================================= + */ + +import {backend_util} from '../../..'; +import {env} from '../../../environment'; +import {NamedTensorInfoMap, registerKernel, TensorInfo} from '../../../kernel_registry'; +import {MathBackendWebGL} from '../backend_webgl'; +import * as binaryop_gpu from '../binaryop_gpu'; +import {BinaryOpProgram} from '../binaryop_gpu'; +import * as binaryop_packed_gpu from '../binaryop_packed_gpu'; +import {BinaryOpPackedProgram} from '../binaryop_packed_gpu'; + +interface DivInputs extends NamedTensorInfoMap { + a: TensorInfo; + b: TensorInfo; +} + +export const divImpl = + (a: TensorInfo, b: TensorInfo, outTensorInfo: TensorInfo, + backend: MathBackendWebGL): TensorInfo => { + let program = new BinaryOpProgram(binaryop_gpu.DIV, a.shape, b.shape); + if (env().getBool('WEBGL_PACK_BINARY_OPERATIONS')) { + program = new BinaryOpPackedProgram( + binaryop_packed_gpu.DIV, a.shape, b.shape, true); + } + const output = + backend.runWebGLProgram(program, [a, b], 'float32', outTensorInfo); + return output; + }; + +registerKernel({ + kernelName: 'Div', + backendName: 'webgl', + kernelFunc: ({inputs, backend}) => { + const {a, b} = inputs as DivInputs; + const webglBackend = backend as MathBackendWebGL; + + const outShape = backend_util.assertAndGetBroadcastShape(a.shape, b.shape); + const outTensorInfo = webglBackend.makeTensorInfo(outShape, a.dtype); + + const out = divImpl(a, b, outTensorInfo, webglBackend); + + return {dataId: out.dataId, shape: out.shape, dtype: out.dtype}; + } +}); diff --git a/tfjs-core/src/backends/webgl/kernels/Exp.ts b/tfjs-core/src/backends/webgl/kernels/Exp.ts new file mode 100644 index 00000000000..de8fd339994 --- /dev/null +++ b/tfjs-core/src/backends/webgl/kernels/Exp.ts @@ -0,0 +1,49 @@ +/** + * @license + * Copyright 2020 Google LLC. All Rights Reserved. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ============================================================================= + */ + +import {env} from '../../../environment'; +import {NamedTensorInfoMap, registerKernel, TensorInfo} from '../../../kernel_registry'; +import {MathBackendWebGL} from '../backend_webgl'; +import * as unaryop_gpu from '../unaryop_gpu'; +import {UnaryOpProgram} from '../unaryop_gpu'; +import {UnaryOpPackedProgram} from '../unaryop_packed_gpu'; + +interface ExpInputs extends NamedTensorInfoMap { + x: TensorInfo; +} + +export const expImpl = + (x: TensorInfo, backend: MathBackendWebGL): TensorInfo => { + let program = new UnaryOpProgram(x.shape, unaryop_gpu.EXP); + if (env().getBool('WEBGL_PACK_UNARY_OPERATIONS')) { + program = new UnaryOpPackedProgram(x.shape, unaryop_gpu.EXP); + } + const output = backend.runWebGLProgram(program, [x], x.dtype); + return output; + }; + +registerKernel({ + kernelName: 'Exp', + backendName: 'webgl', + kernelFunc: ({inputs, backend}) => { + const {x} = inputs as ExpInputs; + const webglBackend = backend as MathBackendWebGL; + const out = expImpl(x, webglBackend); + + return {dataId: out.dataId, shape: out.shape, dtype: out.dtype}; + } +}); diff --git a/tfjs-core/src/backends/webgl/kernels/Max.ts b/tfjs-core/src/backends/webgl/kernels/Max.ts new file mode 100644 index 00000000000..51afc8bc633 --- /dev/null +++ b/tfjs-core/src/backends/webgl/kernels/Max.ts @@ -0,0 +1,67 @@ +/** + * @license + * 
Copyright 2020 Google LLC. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ============================================================================= + */ + +import {NamedAttrMap, NamedTensorInfoMap, registerKernel, TensorInfo} from '../../../kernel_registry'; +import * as axis_util from '../../../ops/axis_util'; +import {reduceOutShapeFromInShape} from '../../../ops/reduce_util'; +import {sizeFromShape} from '../../../util'; +import {MathBackendWebGL} from '../backend_webgl'; +import {reduce} from '../reduce'; +import {reshape} from '../reshape'; + +interface MaxInputs extends NamedTensorInfoMap { + x: TensorInfo; +} + +interface MaxAttrs extends NamedAttrMap { + axes: number[]; +} + +export const maxImpl = + (x: TensorInfo, reduceShape: number[], outShape: number[], + outInfo: TensorInfo, backend: MathBackendWebGL): TensorInfo => { + const inSize = sizeFromShape(reduceShape); + const xSize = sizeFromShape(x.shape); + const batchSize = xSize / inSize; + + x = reshape(x, [batchSize, inSize], backend); + + return reshape( + reduce(x, reduceShape, x.dtype, outInfo, backend), outShape, backend); + }; + +registerKernel({ + kernelName: 'Max', + backendName: 'webgl', + kernelFunc: ({inputs, attrs, backend}) => { + const {x} = inputs as MaxInputs; + const {axes} = attrs as MaxAttrs; + const webglBackend = backend as MathBackendWebGL; + + axis_util.assertAxesAreInnerMostDims('max', axes, x.shape.length); + + const [outShape, reduceShape] = + 
axis_util.computeOutAndReduceShapes(x.shape, axes); + + const outTensorInfo = webglBackend.makeTensorInfo( + reduceOutShapeFromInShape(x.shape, reduceShape), x.dtype); + + const out = maxImpl(x, reduceShape, outShape, outTensorInfo, webglBackend); + + return {dataId: out.dataId, shape: outShape, dtype: x.dtype}; + } +}); diff --git a/tfjs-core/src/backends/webgl/kernels/Softmax.ts b/tfjs-core/src/backends/webgl/kernels/Softmax.ts new file mode 100644 index 00000000000..abe1fbe697f --- /dev/null +++ b/tfjs-core/src/backends/webgl/kernels/Softmax.ts @@ -0,0 +1,79 @@ +/** + * @license + * Copyright 2020 Google LLC. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * ============================================================================= + */ + +import {Softmax, SoftmaxAttrs, SoftmaxInputs} from '../../../kernel_names'; +// import {backend_util} from '../../..'; +import {KernelConfig} from '../../../kernel_registry'; +import * as axis_util from '../../../ops/axis_util'; +import {reduceOutShapeFromInShape} from '../../../ops/reduce_util'; +// import {parseAxisParam, sizeFromShape} from '../../../util'; +import {parseAxisParam} from '../../../util'; +import {MathBackendWebGL} from '../backend_webgl'; + +// import {divImpl} from './Div'; +// import {expImpl} from './Exp'; +import {maxImpl} from './Max'; + +// import {subImpl} from './Sub'; +// import {sumImpl} from './Sum'; + +export const softmaxConfig: KernelConfig = { + kernelName: Softmax, + backendName: 'webgl', + kernelFunc: ({inputs, attrs, backend}) => { + const {logits} = inputs as SoftmaxInputs; + const {dim} = attrs as {} as SoftmaxAttrs; + const webglBackend = backend as MathBackendWebGL; + + const axes = parseAxisParam([dim], logits.shape); + + const [outShape, reduceShape] = + axis_util.computeOutAndReduceShapes(logits.shape, axes); + + const maxOutShape = reduceOutShapeFromInShape(logits.shape, reduceShape); + + const maxOutTensorInfo = + webglBackend.makeTensorInfo(maxOutShape, logits.dtype); + const max = + maxImpl(logits, reduceShape, outShape, maxOutTensorInfo, webglBackend); + + // const subtracted = subImpl(logits, max, webglBackend); + // const exponentiated = expImpl(subtracted, webglBackend); + + // const sumOutTensorInfo = + // webglBackend.makeTensorInfo(outShape, logits.dtype); + // const summed = sumImpl( + // exponentiated, reduceShape, outShape, sumOutTensorInfo, + // webglBackend); + + // const divOutTensorInfo = webglBackend.makeTensorInfo( + // backend_util.assertAndGetBroadcastShape( + // exponentiated.shape, summed.shape), + // exponentiated.dtype); + + // const out = divImpl(exponentiated, summed, divOutTensorInfo, + // webglBackend); 
+ + // webglBackend.disposeData(max.dataId); + // webglBackend.disposeData(subtracted.dataId); + // webglBackend.disposeData(exponentiated.dataId); + // webglBackend.disposeData(summed.dataId); + + // return {dataId: out.dataId, shape: out.shape, dtype: out.dtype}; + return {dataId: max.dataId, shape: max.shape, dtype: max.dtype}; + } +}; diff --git a/tfjs-core/src/backends/webgl/kernels/SquaredDifference.ts b/tfjs-core/src/backends/webgl/kernels/SquaredDifference.ts index 41463b2c1cb..9ff08538a74 100644 --- a/tfjs-core/src/backends/webgl/kernels/SquaredDifference.ts +++ b/tfjs-core/src/backends/webgl/kernels/SquaredDifference.ts @@ -16,7 +16,7 @@ */ import {env} from '../../../environment'; -import {SquaredDifference, SquaredDifferenceInputs} from '../../../kernel_names'; +import {BinaryInputs, SquaredDifference} from '../../../kernel_names'; import {KernelConfig} from '../../../kernel_registry'; import {MathBackendWebGL} from '../backend_webgl'; import {BinaryOpProgram} from '../binaryop_gpu'; @@ -26,7 +26,7 @@ export const squaredDifferenceConfig: KernelConfig = { kernelName: SquaredDifference, backendName: 'webgl', kernelFunc: ({inputs, backend}) => { - const {a, b} = inputs as SquaredDifferenceInputs; + const {a, b} = inputs as BinaryInputs; const SQUARED_DIFFERENCE = 'return (a - b) * (a - b);'; const webGLBackend = backend as MathBackendWebGL; diff --git a/tfjs-core/src/backends/webgl/kernels/Sub.ts b/tfjs-core/src/backends/webgl/kernels/Sub.ts new file mode 100644 index 00000000000..5f017a353a0 --- /dev/null +++ b/tfjs-core/src/backends/webgl/kernels/Sub.ts @@ -0,0 +1,52 @@ +/** + * @license + * Copyright 2020 Google LLC. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ============================================================================= + */ + +import {env} from '../../../environment'; +import {NamedTensorInfoMap, registerKernel, TensorInfo} from '../../../kernel_registry'; +import {upcastType} from '../../../types'; +import {MathBackendWebGL} from '../backend_webgl'; +import * as binaryop_gpu from '../binaryop_gpu'; +import {BinaryOpProgram} from '../binaryop_gpu'; +import {BinaryOpPackedProgram} from '../binaryop_packed_gpu'; + +interface SubInputs extends NamedTensorInfoMap { + a: TensorInfo; + b: TensorInfo; +} + +export const subImpl = + (a: TensorInfo, b: TensorInfo, backend: MathBackendWebGL): TensorInfo => { + const dtype = upcastType(a.dtype, b.dtype); + let program = new BinaryOpProgram(binaryop_gpu.SUB, a.shape, b.shape); + if (env().getBool('WEBGL_PACK_BINARY_OPERATIONS')) { + program = new BinaryOpPackedProgram(binaryop_gpu.SUB, a.shape, b.shape); + } + const output = backend.runWebGLProgram(program, [a, b], dtype); + return output; + }; + +registerKernel({ + kernelName: 'Sub', + backendName: 'webgl', + kernelFunc: ({inputs, backend}) => { + const {a, b} = inputs as SubInputs; + const webglBackend = backend as MathBackendWebGL; + const out = subImpl(a, b, webglBackend); + + return {dataId: out.dataId, shape: out.shape, dtype: out.dtype}; + } +}); diff --git a/tfjs-core/src/backends/webgl/kernels/Sum.ts b/tfjs-core/src/backends/webgl/kernels/Sum.ts new file mode 100644 index 00000000000..998d63fc521 --- /dev/null +++ b/tfjs-core/src/backends/webgl/kernels/Sum.ts @@ -0,0 +1,69 @@ +/** + * 
@license + * Copyright 2020 Google LLC. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ============================================================================= + */ + +import {NamedAttrMap, NamedTensorInfoMap, registerKernel, TensorInfo} from '../../../kernel_registry'; +import * as axis_util from '../../../ops/axis_util'; +import {sumOutType} from '../../../types'; + +import {sizeFromShape} from '../../../util'; +import {MathBackendWebGL} from '../backend_webgl'; +import {reduce} from '../reduce'; +import {reshape} from '../reshape'; + +interface SumInputs extends NamedTensorInfoMap { + x: TensorInfo; +} + +interface SumAttrs extends NamedAttrMap { + axes: number[]; +} + +export const sumImpl = + (x: TensorInfo, reduceShape: number[], outShape: number[], + outInfo: TensorInfo, backend: MathBackendWebGL): TensorInfo => { + const inSize = sizeFromShape(reduceShape); + const xSize = sizeFromShape(x.shape); + const batchSize = xSize / inSize; + + x = reshape(x, [batchSize, inSize], backend); + + return reshape( + reduce(x, reduceShape, sumOutType(x.dtype), outInfo, backend), + outShape, backend); + }; + +registerKernel({ + kernelName: 'Sum', + backendName: 'webgl', + kernelFunc: ({inputs, attrs, backend}) => { + const {x} = inputs as SumInputs; + const {axes} = attrs as SumAttrs; + const webglBackend = backend as MathBackendWebGL; + + axis_util.assertAxesAreInnerMostDims('sum', axes, x.shape.length); + + const [outShape, reduceShape] 
= + axis_util.computeOutAndReduceShapes(x.shape, axes); + + const outTensorInfo = webglBackend.makeTensorInfo(outShape, x.dtype); + const out = sumImpl(x, reduceShape, outShape, outTensorInfo, webglBackend); + + webglBackend.disposeData(x); + + return {dataId: out.dataId, shape: outShape, dtype: x.dtype}; + } +}); diff --git a/tfjs-core/src/backends/webgl/reduce.ts b/tfjs-core/src/backends/webgl/reduce.ts new file mode 100644 index 00000000000..97fa91a3ee5 --- /dev/null +++ b/tfjs-core/src/backends/webgl/reduce.ts @@ -0,0 +1,47 @@ +/** + * @license + * Copyright 2020 Google LLC. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * ============================================================================= + */ + +import {TensorInfo} from '../../kernel_registry'; +import {computeOptimalWindowSize} from '../../ops/reduce_util'; +import {DataType} from '../../types'; + +import {MathBackendWebGL} from './backend_webgl'; +import {ReduceProgram} from './reduce_gpu'; + +export const reduce = + (x: TensorInfo, reduceShape: number[], dtype: DataType, outInfo: TensorInfo, + backend: MathBackendWebGL): TensorInfo => { + const [batchSize, inSize] = x.shape; + const windowSize = computeOptimalWindowSize(inSize); + const reduceInfo = {windowSize, inSize, batchSize}; + const program = new ReduceProgram(reduceInfo, 'sum'); + const output = backend.runWebGLProgram(program, [x], dtype, outInfo); + + backend.disposeData(x.dataId); + + if (output.shape[1] === 1) { + return output; + } + + const [newBatchSize, newInSize] = output.shape; + const newWindowSize = computeOptimalWindowSize(newInSize); + const newOutShape = [newBatchSize, Math.ceil(newInSize / newWindowSize)]; + + const newOutInfo = backend.makeTensorInfo(newOutShape, output.dtype); + + return reduce(output, reduceShape, dtype, newOutInfo, backend); + }; diff --git a/tfjs-core/src/backends/webgl/register_all_kernels.ts b/tfjs-core/src/backends/webgl/register_all_kernels.ts index f7913ac6b21..8521e55d446 100644 --- a/tfjs-core/src/backends/webgl/register_all_kernels.ts +++ b/tfjs-core/src/backends/webgl/register_all_kernels.ts @@ -18,6 +18,7 @@ import {KernelConfig, registerKernel} from '../../kernel_registry'; import {fromPixelsConfig} from './kernels/FromPixels'; import {nonMaxSuppressionV5Config} from './kernels/NonMaxSuppressionV5'; +import {softmaxConfig} from './kernels/Softmax'; import {squareConfig} from './kernels/Square'; import {squaredDifferenceConfig} from './kernels/SquaredDifference'; @@ -27,6 +28,7 @@ const kernelConfigs: KernelConfig[] = [ nonMaxSuppressionV5Config, squareConfig, squaredDifferenceConfig, + softmaxConfig, ]; for 
(const kernelConfig of kernelConfigs) { diff --git a/tfjs-core/src/backends/webgl/reshape.ts b/tfjs-core/src/backends/webgl/reshape.ts new file mode 100644 index 00000000000..cd69d7e8427 --- /dev/null +++ b/tfjs-core/src/backends/webgl/reshape.ts @@ -0,0 +1,62 @@ +/** + * @license + * Copyright 2020 Google LLC. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ============================================================================= + */ + +import {TensorInfo} from '../../kernel_registry'; +import {webgl_util} from '../../webgl'; + +import {MathBackendWebGL} from './backend_webgl'; +import {ReshapePackedProgram} from './reshape_packed_gpu'; + +const packedReshape = + (input: TensorInfo, afterShape: number[], + backend: MathBackendWebGL): TensorInfo => { + console.log('PACKED RESHAPE'); + const input3DShape = [ + webgl_util.getBatchDim(input.shape), + ...webgl_util.getRowsCols(input.shape) + ] as [number, number, number]; + const input3D: TensorInfo = { + dtype: input.dtype, + shape: input3DShape, + dataId: input.dataId + }; + const afterShapeAs3D = [ + webgl_util.getBatchDim(afterShape), + ...webgl_util.getRowsCols(afterShape) + ] as [number, number, number]; + + const program = new ReshapePackedProgram(afterShapeAs3D, input3DShape); + const preventEagerUnpackingOfOutput = true; + const output = backend.runWebGLProgram( + program, [input3D], input.dtype, null /* out info */, + null /* customSetup */, preventEagerUnpackingOfOutput); + 
return {dataId: output.dataId, shape: afterShape, dtype: output.dtype}; + }; + +export const reshape = + (x: TensorInfo, afterShape: number[], + backend: MathBackendWebGL): TensorInfo => { + console.log('RESHAPE'); + const xTexData = backend.texData.get(x.dataId); + if (xTexData.isPacked && !webgl_util.isReshapeFree(x.shape, afterShape) && + !(xTexData.texture !== null && + webgl_util.isReshapeFree(xTexData.shape, afterShape))) { + return packedReshape(x, afterShape, backend); + } + x.shape = afterShape; + return x; + }; diff --git a/tfjs-core/src/engine.ts b/tfjs-core/src/engine.ts index 7f491b89bc3..2f7c17c4e18 100644 --- a/tfjs-core/src/engine.ts +++ b/tfjs-core/src/engine.ts @@ -521,6 +521,7 @@ export class Engine implements TensorTracker, DataMover { this.state.numDataMovesStack[this.state.numDataMovesStack.length - 1]; const dataIdsLeaked = numDataIdsAfter - numDataIdsBefore - numOutputDataIds - numMoves; + if (dataIdsLeaked > 0) { throw new Error( `Backend '${this.backendName}' has an internal memory leak ` + diff --git a/tfjs-core/src/kernel_names.ts b/tfjs-core/src/kernel_names.ts index 2f027e57b1f..cd764cd0508 100644 --- a/tfjs-core/src/kernel_names.ts +++ b/tfjs-core/src/kernel_names.ts @@ -21,12 +21,38 @@ import {NamedTensorInfoMap} from './kernel_registry'; import {PixelData} from './types'; +export type BinaryInputs = Pick<NamedTensorInfoMap, 'a'|'b'>; + +export const Div = 'Div'; + +export const Exp = 'Exp'; +export type ExpInputs = Pick<NamedTensorInfoMap, 'x'>; + +export const Max = 'Max'; +export type MaxInputs = Pick<NamedTensorInfoMap, 'x'>; +export interface MaxAttrs { + axes: number[]; +} + +export const Softmax = 'Softmax'; +export type SoftmaxInputs = Pick<NamedTensorInfoMap, 'logits'>; +export interface SoftmaxAttrs { + dim: number; +} + export const SquaredDifference = 'SquaredDifference'; -export type SquaredDifferenceInputs = Pick<NamedTensorInfoMap, 'a'|'b'>; export const Square = 'Square'; export type SquareInputs = Pick<NamedTensorInfoMap, 'x'>; +export const Sub = 'Sub'; + +export const Sum = 'Sum'; +export type SumInputs = Pick<NamedTensorInfoMap, 'x'>; +export interface SumAttrs { + axes: number[]; +} + export const 
NonMaxSuppressionV5 = 'NonMaxSuppressionV5'; export type NonMaxSuppressionV5Inputs = Pick; diff --git a/tfjs-core/src/ops/reduce_util.ts b/tfjs-core/src/ops/reduce_util.ts index cb9f27d158a..a772aa03372 100644 --- a/tfjs-core/src/ops/reduce_util.ts +++ b/tfjs-core/src/ops/reduce_util.ts @@ -19,7 +19,7 @@ * Inputs of size above this threshold will be parallelized by calling multiple * shader programs. */ -import {nearestDivisor} from '../util'; +import {nearestDivisor, sizeFromShape} from '../util'; export const PARALLELIZE_THRESHOLD = 30; @@ -35,3 +35,13 @@ export function computeOptimalWindowSize(inSize: number): number { } return nearestDivisor(inSize, Math.floor(Math.sqrt(inSize))); } + +export function reduceOutShapeFromInShape( + xShape: number[], reduceShape: number[]): number[] { + const xSize = sizeFromShape(xShape); + const inSize = sizeFromShape(reduceShape); + const batchSize = xSize / inSize; + const windowSize = computeOptimalWindowSize(inSize); + const outSize = Math.ceil(inSize / windowSize); + return [batchSize, outSize]; +} diff --git a/tfjs-core/src/ops/softmax_test.ts b/tfjs-core/src/ops/softmax_test.ts index e18397bb0a0..6cb529425d8 100644 --- a/tfjs-core/src/ops/softmax_test.ts +++ b/tfjs-core/src/ops/softmax_test.ts @@ -15,18 +15,35 @@ * ============================================================================= */ +import {WEBGL_ENVS} from '../backends/webgl/backend_webgl_test_registry'; +// import {CPU_ENVS} from '../backends/cpu/backend_cpu_test_registry'; import * as tf from '../index'; import {ALL_ENVS, describeWithFlags} from '../jasmine_util'; import {expectArraysClose} from '../test_util'; -describeWithFlags('softmax', ALL_ENVS, () => { - it('regular test', async () => { +describeWithFlags('softmax', WEBGL_ENVS, () => { + fit('regular test', async () => { + console.log('TESTINGGGG'); const y = tf.softmax(tf.tensor1d([2, 1, 3])); - expectArraysClose(await y.data(), [0.24472847, 0.09003057, 0.66524095]); - expectArraysClose(await 
y.sum().data(), 1); + const data = await y.data(); + console.log(Array.from(data)); + + // expectArraysClose(await y.data(), [0.24472847, 0.09003057, 0.66524095]); + // expectArraysClose(await y.sum().data(), 1); + // const x = tf.tensor1d([1, 2, 3]); + // x.softmax().print(); }); + fit('div', async () => { + console.log('TEST DIV'); + const c = tf.div(tf.tensor1d([2, 1, 3]), tf.tensor1d([0.5, 0.5, 0.5])); + const data = await c.data(); + console.log(data); + }); +}); + +describeWithFlags('softmax', ALL_ENVS, () => { it('overflow', async () => { const y = tf.softmax(tf.tensor1d([100, 100])); diff --git a/tfjs-core/src/ops/squared_difference.ts b/tfjs-core/src/ops/squared_difference.ts index 0653191326c..9a5ecef3641 100644 --- a/tfjs-core/src/ops/squared_difference.ts +++ b/tfjs-core/src/ops/squared_difference.ts @@ -16,7 +16,7 @@ */ import {ENGINE, ForwardFunc} from '../engine'; -import {SquaredDifference, SquaredDifferenceInputs} from '../kernel_names'; +import {BinaryInputs, SquaredDifference} from '../kernel_names'; import {Tensor} from '../tensor'; import {NamedTensorMap} from '../tensor_types'; import {makeTypesMatch} from '../tensor_util'; @@ -74,7 +74,7 @@ function squaredDifference_( return res; }; - const inputs: SquaredDifferenceInputs = {a: $a, b: $b}; + const inputs: BinaryInputs = {a: $a, b: $b}; const attrs = {}; const inputsToSave = [$a, $b];