
added more layers

subDesTagesMitExtraKaese · 3 years ago
commit 71217544b7

+ 2 - 2
c++/include/conv2D.hpp

@@ -1,5 +1,5 @@
-#ifndef CONV2D_FPGA_H
-#define CONV2D_FPGA_H
+#ifndef CONV2D_1_FPGA_H
+#define CONV2D_1_FPGA_H
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/function.h"

+ 0 - 0
c++/include/conv2D_2.hpp


+ 0 - 0
c++/include/conv2D_3.hpp


+ 0 - 0
c++/include/conv2D_maxpool.hpp


+ 0 - 0
c++/include/conv2D_maxpool_multi.hpp


+ 5 - 1
c++/include/entrypoint.hpp

@@ -11,7 +11,11 @@
 
 #include "tensorflow/core/lib/math/math_util.h"
 
-#include "conv2D.hpp"
+#include "conv2D_1.hpp"
+#include "conv2D_2.hpp"
+#include "conv2D_3.hpp"
+#include "conv2D_maxpool.hpp"
+#include "conv2D_maxpool_multi.hpp"
 #include "dummyOp.hpp"
 #include "dummyBigOp.hpp"
 #include "../lib/mlfpga/include/connectionManager.hpp"

+ 5 - 6
c++/src/conv2D.cpp

@@ -1,5 +1,5 @@
 
-#include "conv2D.hpp"
+#include "conv2D_1.hpp"
 
 namespace tf_lib {
 
@@ -149,8 +149,7 @@ namespace tf_lib {
             auto job = jobs->getJob(sample * outputChannels * channels + outputChannel * channels + channel);
             for(int x=0; x<outputSize; x++) {
               for(int y=0; y<outputSize; y++) {
-                uint32_t val = job->getResponsePayload((y+border*2)*sizeWithBorder + (x+border*2));
-                output_tensor(sample, y, x, outputChannel) += *((float*)&val);
+                memcpy(&output_tensor(sample, y, x, outputChannel), &job->getResponseAddr()[(y+border*2)*sizeWithBorder + (x+border*2)], 4);
               }
             }
           }
@@ -227,13 +226,13 @@ namespace tf_lib {
                             true, "dz", true, "x", true);
   }
 
-  REGISTER_OP("MyConv2D")
+  REGISTER_OP("MyConv2D_1")
       .Input("input: float")
       .Input("filter: float")
       .Output("output: float")
       .SetShapeFn(conv2d_shape_fn);
 
-  REGISTER_KERNEL_BUILDER(Name("MyConv2D").Device(DEVICE_CPU), Conv2DOp);
-  REGISTER_OP_GRADIENT("MyConv2D", MatMulGrad);
+  REGISTER_KERNEL_BUILDER(Name("MyConv2D_1").Device(DEVICE_CPU), Conv2DOp);
+  REGISTER_OP_GRADIENT("MyConv2D_1", MatMulGrad);
 
 }
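
The hunk above now copies each 32-bit response word byte-for-byte into the float output instead of dereferencing a casted pointer. A minimal Python sketch of the same reinterpretation, assuming the response payload arrives as little-endian 32-bit words (word order and endianness are assumptions, not taken from this commit):

```python
import struct

def response_words_to_floats(words):
    """Reinterpret raw 32-bit words as IEEE-754 float32 values.

    Mirrors the memcpy in conv2D.cpp: the bit pattern is copied unchanged,
    only its interpretation changes from integer to float.
    """
    return [struct.unpack("<f", struct.pack("<I", w))[0] for w in words]

# The bit pattern 0x3f800000 is the float 1.0.
assert response_words_to_floats([0x3F800000]) == [1.0]
```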

+ 0 - 0
c++/src/conv2D_2.cpp


+ 0 - 0
c++/src/conv2D_3.cpp


+ 0 - 0
c++/src/conv2D_maxpool.cpp


+ 0 - 0
c++/src/conv2D_maxpool_multi.cpp


+ 0 - 31
doku/layer/conv2d.md

@@ -1,31 +0,0 @@
-# Two-dimensional convolution
-
-## Tensor sizes
-
-Input:
-- in TF: `[batchSize, imageY, imageX, channels]`
-- to FPGA: `[imageY, imageX]`
-
-Kernel:
-- in TF: `[kernelY, kernelX, channels, outputChannels]`
-- to FPGA: `[kernelY, kernelX]`
-
-Output:
-- from FPGA: `[imageY2, imageX2]`
-- to TF: `[batchSize, imageY2, imageX2, outputChannels]`
-
-## Parallelization
-
-1.  **Without FPGA-side memory**
-
-    The FPGA compute units are used `(batchSize * channels * outputChannels)` times, with the work distributed across them.
-
-    ```python
-    for sample in range(batchSize):
-      for outputChannel in range(outputChannels):
-        for channel in range(channels):
-          output[sample][outputChannel] += f(
-            input[sample][channel],
-            kernel[channel][outputChannel]
-          )
-    ```

+ 65 - 0
doku/layer/conv2d_1.md

@@ -0,0 +1,65 @@
+# 2D convolution of one channel
+
+## TF equivalent
+
+```python
+layers.Conv2D(
+  filters,
+  kernel_size=5,
+  strides=(1, 1),
+  padding='valid',
+  data_format='channels_last',
+  dilation_rate=(1, 1),
+  activation=None,
+  use_bias=False,
+  trainable=True
+)
+```
+
+## Tensor sizes
+
+Input:
+- from TF: `[batchSize, imageY, imageX, channels]`
+- to FPGA: `[imageY, imageX]`
+
+Filter:
+- from TF: `[kernelY, kernelX, channels, outputChannels]`
+- to FPGA: `[kernelY, kernelX]`
+
+Output:
+- from FPGA: `[imageY2, imageX2]`
+- to TF: `[batchSize, imageY2, imageX2, outputChannels]`
+
+## Parallelization
+
+- `(batchSize * channels * outputChannels)` jobs will be created.
+- layer can be called in parallel (functional model)
+
+```python
+for sample in range(batchSize):
+  for outputChannel in range(outputChannels):
+    for channel in range(channels):
+      output[sample][outputChannel] += job(
+        input[sample][channel], 
+        kernel[channel][outputChannel]
+      )
+```
+
+## Constraints
+
+```python
+  imageX  = imageY  = 228
+  kernelX = kernelY = 5
+  imageX2 = imageY2 = 224
+```
+
+## Job lengths
+
+- to FPGA:
+  `5 * 5 + 228 * 228 = 52009`
+- from FPGA:
+  `224 * 224 = 50176`
+
+## Gradient
+  Same as `layers.Conv2D`: `conv2d_backprop` in
+  `tensorflow/tensorflow/python/ops/nn_ops.py:2290`
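
The parallelization scheme in `conv2d_1.md` above splits a multi-channel convolution into single-channel jobs whose results are summed per output channel. A small NumPy sketch of that decomposition, mirroring the pseudocode in the document (the `valid_conv2d` helper stands in for one FPGA job; the reduced sizes are illustrative):

```python
import numpy as np

def valid_conv2d(image, kernel):
    """'valid' 2D cross-correlation of a single channel, i.e. one job."""
    ky, kx = kernel.shape
    oy, ox = image.shape[0] - ky + 1, image.shape[1] - kx + 1
    out = np.zeros((oy, ox), dtype=np.float32)
    for y in range(oy):
        for x in range(ox):
            out[y, x] = np.sum(image[y:y + ky, x:x + kx] * kernel)
    return out

batchSize, channels, outputChannels = 2, 3, 4
imageY = imageX = 12      # stands in for 228
kernelY = kernelX = 5

inp  = np.random.rand(batchSize, imageY, imageX, channels).astype(np.float32)
kern = np.random.rand(kernelY, kernelX, channels, outputChannels).astype(np.float32)

# One job per (sample, outputChannel, channel); partial results accumulate host-side.
out = np.zeros((batchSize, imageY - kernelY + 1, imageX - kernelX + 1, outputChannels),
               dtype=np.float32)
for sample in range(batchSize):
    for outputChannel in range(outputChannels):
        for channel in range(channels):
            out[sample, :, :, outputChannel] += valid_conv2d(
                inp[sample, :, :, channel], kern[:, :, channel, outputChannel])

# batchSize * channels * outputChannels = 24 jobs in this toy example.
```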

+ 65 - 0
doku/layer/conv2d_2.md

@@ -0,0 +1,65 @@
+# 2D convolution with activation
+
+## TF equivalent
+
+```python
+layers.Conv2D(
+  filters,
+  kernel_size=5,
+  strides=(1, 1),
+  padding='valid',
+  data_format='channels_last',
+  dilation_rate=(1, 1),
+  activation='relu',
+  use_bias=False,
+  trainable=True
+)
+```
+
+## Tensor sizes
+
+Input:
+- from TF: `[batchSize, imageY, imageX, channels]`
+- to FPGA: `[imageY, imageX, channels]`
+
+Filter:
+- from TF: `[kernelY, kernelX, channels, outputChannels]`
+- to FPGA: `[kernelY, kernelX, channels]`
+
+Output:
+- from FPGA: `[imageY2, imageX2]`
+- to TF: `[batchSize, imageY2, imageX2, outputChannels]`
+
+## Parallelization
+
+- `(batchSize * outputChannels)` jobs will be created.
+- layer can be called in parallel (functional model)
+
+```python
+for sample in range(batchSize):
+  for outputChannel in range(outputChannels):
+    output[sample][outputChannel] = job(
+      input[sample], 
+      kernel[outputChannel]
+    )
+```
+
+## Constraints
+
+```python
+  imageX  = imageY  = 228
+  kernelX = kernelY = 5
+  imageX2 = imageY2 = 224
+  channels = 3
+```
+
+## Job lengths
+
+- to FPGA:
+  `5 * 5 * 3 + 228 * 228 * 3 = 156027`
+- from FPGA:
+  `224 * 224 = 50176`
+
+## Gradient
+  Same as `layers.Conv2D`: `conv2d_backprop` in
+  `tensorflow/tensorflow/python/ops/nn_ops.py:2290`
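
`conv2d_2` above moves all input channels into a single job, so only `(batchSize * outputChannels)` jobs remain and each payload carries one output channel's multi-channel kernel slice plus the multi-channel image. A short sketch of the length arithmetic, assuming the kernel words precede the image words (the actual wire ordering is not specified by this commit):

```python
import numpy as np

imageY = imageX = 228
kernelY = kernelX = 5
channels = 3

kernel = np.zeros((kernelY, kernelX, channels), dtype=np.float32)  # one output channel's slice
image  = np.zeros((imageY, imageX, channels), dtype=np.float32)

# Hypothetical payload layout: kernel first, then image, both flattened.
payload = np.concatenate([kernel.ravel(), image.ravel()])
assert payload.size == 5 * 5 * 3 + 228 * 228 * 3 == 156027   # job length to the FPGA
assert 224 * 224 == 50176                                     # job length from the FPGA
```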

+ 65 - 0
doku/layer/conv2d_3.md

@@ -0,0 +1,65 @@
+# 2D convolution with activation and fixed output channels
+
+## TF equivalent
+
+```python
+layers.Conv2D(
+  filters=32,
+  kernel_size=5,
+  strides=(1, 1),
+  padding='valid',
+  data_format='channels_last',
+  dilation_rate=(1, 1),
+  activation='relu',
+  use_bias=False,
+  trainable=True
+)
+```
+
+## Tensor sizes
+
+Input:
+- from TF: `[batchSize, imageY, imageX, channels]`
+- to FPGA: `[imageY, imageX, channels]`
+
+Filter:
+- from TF: `[kernelY, kernelX, channels, outputChannels]`
+- to FPGA: `[kernelY, kernelX, channels, outputChannels]`
+
+Output:
+- from FPGA: `[imageY2, imageX2, outputChannels]`
+- to TF: `[batchSize, imageY2, imageX2, outputChannels]`
+
+## Parallelization
+
+- `(batchSize)` jobs will be created.
+- layer can be called in parallel (functional model)
+
+```python
+for sample in range(batchSize):
+    output[sample] = job(
+      input[sample], 
+      kernel
+    )
+```
+
+## Constraints
+
+```python
+  imageX  = imageY  = 228
+  kernelX = kernelY = 5
+  imageX2 = imageY2 = 224
+  channels = 3
+  outputChannels = 32
+```
+
+## Job lengths
+
+- to FPGA:
+  `5 * 5 * 3 * 32 + 228 * 228 * 3 = 158352`
+- from FPGA:
+  `224 * 224 * 32 = 1605632`
+
+## Gradient
+  Same as `layers.Conv2D`: `conv2d_backprop` in
+  `tensorflow/tensorflow/python/ops/nn_ops.py:2290`
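
`conv2d_3` fixes the number of output channels at 32, so a single job per sample covers the whole layer. The constraints pin the shapes end to end: a 228×228×3 input through a 5×5 'valid' convolution with 32 filters yields 224×224×32. A short Keras check of the stated TF equivalent (shapes only, no FPGA op involved):

```python
import tensorflow as tf

layer = tf.keras.layers.Conv2D(
    filters=32, kernel_size=5, strides=(1, 1), padding='valid',
    data_format='channels_last', activation='relu', use_bias=False)

y = layer(tf.zeros([1, 228, 228, 3]))
assert y.shape == (1, 224, 224, 32)       # imageX2 = imageY2 = 224
assert 224 * 224 * 32 == 1605632          # words per job returned by the FPGA
```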

+ 71 - 0
doku/layer/conv2d_maxpool.md

@@ -0,0 +1,71 @@
+# 2D convolution and MaxPooling
+
+## TF equivalent
+
+```python
+layers.Conv2D(
+  filters=32,
+  kernel_size=5,
+  strides=(1, 1),
+  padding='valid',
+  data_format='channels_last',
+  dilation_rate=(1, 1),
+  activation='relu',
+  use_bias=False,
+  trainable=True
+)
+
+layers.MaxPooling2D(
+  pool_size=(2, 2),
+  strides=(2, 2),
+  padding='valid',
+  data_format='channels_last'
+)
+```
+
+## Tensor sizes
+
+Input:
+- from TF: `[batchSize, imageY, imageX, channels]`
+- to FPGA: `[imageY, imageX, channels]`
+
+Filter:
+- from TF: `[kernelY, kernelX, channels, outputChannels]`
+- to FPGA: `[kernelY, kernelX, channels, outputChannels]`
+
+Output:
+- from FPGA: `[imageY2, imageX2, outputChannels]`
+- to TF: `[batchSize, imageY2, imageX2, outputChannels]`
+
+## Parallelization
+
+- `(batchSize)` jobs will be created.
+- layer can be called in parallel (functional model)
+
+```python
+for sample in range(batchSize):
+    output[sample] = job(
+      input[sample], 
+      kernel
+    )
+```
+
+## Constraints
+
+```python
+  imageX  = imageY  = 228
+  kernelX = kernelY = 5
+  imageX2 = imageY2 = 112
+  channels = 3
+  outputChannels = 32
+```
+
+## Job lengths
+
+- to FPGA:
+  `5 * 5 * 3 * 32 + 228 * 228 * 3 = 158352`
+- from FPGA:
+  `112 * 112 * 32 = 401408`
+
+## Gradient
+  tbd
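
For the fused conv + max-pool variant, the 224×224 convolution result is halved by a stride-2 pooling step, which is where the 112×112×32 output above comes from. A Keras sketch of that shape chain, using `pool_size=(2, 2)` to match the stated constraints (shapes only):

```python
import tensorflow as tf

conv = tf.keras.layers.Conv2D(32, 5, padding='valid', activation='relu', use_bias=False)
pool = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='valid')

y = pool(conv(tf.zeros([1, 228, 228, 3])))
assert y.shape == (1, 112, 112, 32)
assert 112 * 112 * 32 == 401408   # words per job returned by the FPGA
```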

+ 109 - 0
doku/layer/conv2d_maxpool_multi.md

@@ -0,0 +1,109 @@
+# Multiple 2D convolutions with MaxPooling
+
+## TF equivalent
+
+```python
+layers.Conv2D(
+  filters=32,
+  kernel_size=5,
+  strides=(1, 1),
+  padding='valid',
+  data_format='channels_last',
+  dilation_rate=(1, 1),
+  activation='relu',
+  use_bias=False,
+  trainable=True
+)
+
+layers.MaxPooling2D(
+  pool_size=(2, 2),
+  strides=(2, 2),
+  padding='valid',
+  data_format='channels_last'
+)
+
+layers.Conv2D(
+  filters=32,
+  kernel_size=5,
+  strides=(1, 1),
+  padding='valid',
+  data_format='channels_last',
+  dilation_rate=(1, 1),
+  activation='relu',
+  use_bias=False,
+  trainable=True
+)
+
+layers.MaxPooling2D(
+  pool_size=(2, 2),
+  strides=(2, 2),
+  padding='valid',
+  data_format='channels_last'
+)
+
+layers.Conv2D(
+  filters=32,
+  kernel_size=5,
+  strides=(1, 1),
+  padding='valid',
+  data_format='channels_last',
+  dilation_rate=(1, 1),
+  activation='relu',
+  use_bias=False,
+  trainable=True
+)
+
+layers.MaxPooling2D(
+  pool_size=(2, 2),
+  strides=(2, 2),
+  padding='valid',
+  data_format='channels_last'
+)
+```
+
+## Tensor sizes
+
+Input:
+- from TF: `[batchSize, imageY, imageX, channels]`
+- to FPGA: `[imageY, imageX, channels]`
+
+Filter:
+- from TF: `[kernelY, kernelX, channels, outputChannels]`
+- to FPGA: `[kernelY, kernelX, channels, outputChannels]`
+
+Output:
+- from FPGA: `[imageY2, imageX2, outputChannels]`
+- to TF: `[batchSize, imageY2, imageX2, outputChannels]`
+
+## Parallelization
+
+- `(batchSize)` jobs will be created.
+- layer can be called in parallel (functional model)
+
+```python
+for sample in range(batchSize):
+    output[sample] = job(
+      input[sample], 
+      kernel
+    )
+```
+
+## Constraints
+
+```python
+  imageX  = imageY  = 228
+  kernelX = kernelY = 5
+  imageX2 = imageY2 = 25
+  channels = 3
+  outputChannels = 32
+```
+
+## Job lengths
+
+- to FPGA:
+  `5 * 5 * (3 * 32 + 32 * 32 * 2) + 228 * 228 * 3 = 209552`
+- from FPGA:
+  `25 * 25 * 32 = 20000`
+
+## Gradient
+  tbd
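
The multi-stage variant chains three conv + pool blocks inside one job, so the to-FPGA length counts all three kernel tensors plus the input image. A sketch that replays the shape chain and the length arithmetic, again with `pool_size=(2, 2)` per the constraints (shapes only):

```python
import tensorflow as tf

def block(filters):
    return [tf.keras.layers.Conv2D(filters, 5, padding='valid', activation='relu', use_bias=False),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='valid')]

model = tf.keras.Sequential([tf.keras.Input(shape=(228, 228, 3))]
                            + block(32) + block(32) + block(32))
# 228 -> 224 -> 112 -> 108 -> 54 -> 50 -> 25
assert model.output_shape == (None, 25, 25, 32)

kernel_words = 5 * 5 * (3 * 32 + 32 * 32 * 2)        # three kernel tensors
assert kernel_words + 228 * 228 * 3 == 209552         # words per job to the FPGA
assert 25 * 25 * 32 == 20000                          # words per job from the FPGA
```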

+ 1 - 1
hostLib/layers/__init__.py

@@ -1,2 +1,2 @@
 
-__all__ = ["conv2d"]
+__all__ = ["conv2d", "conv2d_maxpool", "conv2d_maxpool_multi"]

+ 13 - 7
hostLib/layers/conv2d.py

@@ -9,18 +9,18 @@ from .. import load_op
 
 class Conv2D(layers.Layer):
   def __init__(self,
-    filters = 1,
-    kernel_initializer = 'glorot_uniform',
+               filters = 1,
+               kernel_initializer = 'glorot_uniform',
                kernel_regularizer=None,
                kernel_constraint=None,
-    ):
+               implementation = 1):
     super(Conv2D, self).__init__()
     #int, dim of output space
     self.filters = filters
     self.kernel_initializer = initializers.get(kernel_initializer)
     self.kernel_regularizer = regularizers.get(kernel_regularizer)
     self.kernel_constraint = constraints.get(kernel_constraint)
-
+    self.implementation = implementation
 
   def build(self, input_shape):
     input_shape = tf.TensorShape(input_shape)
@@ -37,10 +37,16 @@ class Conv2D(layers.Layer):
         dtype=self.dtype)
 
   def call(self, inputs):
+    if self.implementation == 1:
+      return load_op.op_lib.MyConv2D_1(input=inputs, filter=self.kernel)
+    if self.implementation == 2:
+      return load_op.op_lib.MyConv2D_2(input=inputs, filter=self.kernel)
+    if self.implementation == 3:
+      return load_op.op_lib.MyConv2D_3(input=inputs, filter=self.kernel)
 
-    return load_op.op_lib.MyConv2D(input=inputs, filter=self.kernel)
-
-@ops.RegisterGradient("MyConv2D")
+@ops.RegisterGradient("MyConv2D_1")
+@ops.RegisterGradient("MyConv2D_2")
+@ops.RegisterGradient("MyConv2D_3")
 def _my_conv_2d_grad(op, grad):
   shape_0, shape_1 = array_ops.shape_n([op.inputs[0], op.inputs[1]])
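
With the new `implementation` argument, the host-side `Conv2D` layer chooses which registered FPGA op to dispatch to. A hypothetical usage sketch, assuming the package layout of this repository (it only runs with the op library and FPGA backend available):

```python
import tensorflow as tf
from hostLib.layers.conv2d import Conv2D   # the layer changed in this commit

# implementation=1: one single-channel convolution per job   (conv2d_1)
# implementation=2: all input channels in one job            (conv2d_2)
# implementation=3: input and output channels in one job     (conv2d_3)
layer = Conv2D(filters=32, implementation=3)

features = layer(tf.zeros([1, 228, 228, 3]))   # dispatches to the MyConv2D_3 op
```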
 

+ 0 - 0
hostLib/layers/conv2d_maxpool.py


+ 0 - 0
hostLib/layers/conv2d_maxpool_multi.py