3 years ago · a49eba986a
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@
 
				 __pycache__
			
 
				 /c++/build/
			
 
				 /model.png
			
 
				+*.log
			
--- a/c++/tests/benchmark.sh
+++ b/c++/tests/benchmark.sh
@@ -0,0 +1,28 @@
 
				+#!/bin/bash
			
 
				+# Start a fresh bandwidth_perf.log: '>' truncates any previous log and writes a dated header.
			
 
				+printf "benchmark.sh on $(date)\n\n\n" > bandwidth_perf.log
			
 
				+# run A B C D: run `build/test A B C D` under /usr/bin/time; the first 110 stdout lines are shown and appended to bandwidth_perf.log, stderr (including the time report) is appended to the same log.
			
 
				+run() {
			
 
				+  echo "$(date) build/test $1 $2 $3 $4"  # progress line on stdout only, not written to the log
			
 
				+  { /usr/bin/time -f "%U user %S system %E(%e s) elapsed  %P CPU\n(%Xtext+%Ddata %Mmax)k\n%I inputs + %O outputs (%F major + %R minor)pagefaults %W swaps" build/test $1 $2 $3 $4 ; } 2>> bandwidth_perf.log | \
			
 
				+  head -n 110 | tee -a bandwidth_perf.log
			
 
				+  printf "\n\n" >> bandwidth_perf.log  # blank lines separate consecutive runs in the log
			
 
				+}
			
 
				+# Argument order, assuming build/test is built from tests/main.cpp (confirm): numFPGA workNum workerCount jobsPerWorker. Sweep 1: vary the 1st argument over 1..3.
			
 
				+run 1 100 4 100
			
 
				+run 2 100 4 100
			
 
				+run 3 100 4 100
			
 
				+# Sweep 2: vary the 3rd argument over 1, 2, 4, 8, 16, 32 with the others fixed at 3 100 _ 100.
			
 
				+run 3 100 1 100
			
 
				+run 3 100 2 100
			
 
				+run 3 100 4 100
			
 
				+run 3 100 8 100
			
 
				+run 3 100 16 100
			
 
				+run 3 100 32 100
			
 
				+# Sweep 3: trade the 2nd argument against the 4th with their product fixed at 10000; the last run also drops the 3rd argument to 1.
			
 
				+run 3 10000 4 1
			
 
				+run 3 1000 4 10
			
 
				+run 3 100 4 100
			
 
				+run 3 10 4 1000
			
 
				+run 3 1 1 10000
			
 
				+
			
--- a/c++/tests/main.cpp
+++ b/c++/tests/main.cpp
@@ -3,13 +3,14 @@
 
				 
			
 
				 ConnectionManager connectionManager;
			
 
				 
			
 
				-Module mod = Module::dummyBigModule;
			
 
				+Module mod = Module::conv2D_5x5_Module;
			
 
				+unsigned int jobsPerWorker = 100;
			
 
				 
			
 
				 size_t s=0, f=0, r=0;
			
 
				 std::mutex statsLk;
			
 
				 
			
 
				 void work() {
			
 
				-    auto worker = connectionManager.createWorker(mod, 1000);
			
 
				+    auto worker = connectionManager.createWorker(mod, jobsPerWorker);
			
 
				 
			
 
				     worker->setJobTimeout(milliseconds(1000));
			
 
				     worker->setRetryCount(10);
			
@@ -45,30 +46,49 @@ void work() {
 
				     worker->startAsync();
			
 
				 }
			
 
				 
			
 
				-int main(void)
			
 
				+int main(int argc, char *argv[])
			
 
				 {
			
 
				     puts("This is a shared library test...");
			
 
				 
			
 
				+    unsigned int numFPGA = 3;
			
 
				+    unsigned int workNum = 100;
			
 
				+    unsigned int workerCount = 1;
			
 
				+    // Optional positional arguments override the defaults: argv[1]=numFPGA, argv[2]=workNum, argv[3]=workerCount, argv[4]=jobsPerWorker.
			
 
				+    if(argc > 1)
			
 
				+        numFPGA = atoi(argv[1]);
			
 
				     
			
 
				-    connectionManager.addFPGA("192.168.1.33", 1234);
			
 
				-    connectionManager.addFPGA("192.168.1.34", 1234);
			
 
				+    if(numFPGA >= 1)
			
 
				+        connectionManager.addFPGA("192.168.1.33", 1234);
			
 
				+    if(numFPGA >= 2)
			
 
				+        connectionManager.addFPGA("192.168.1.34", 1234);
			
 
				+    if(numFPGA >= 3)
			
 
				+        connectionManager.addFPGA("192.168.1.35", 1234);
			
 
				 
			
 
				     connectionManager.setSendDelay(microseconds(50));
			
 
				-
			
 
				     connectionManager.start();
			
 
				 
			
 
				-    int workNum = 10000;
			
 
				-    int n=1;
			
 
				+    if(argc > 2)
			
 
				+        workNum = atoi(argv[2]);
			
 
				+    // NOTE(review): atoi() results are stored in unsigned ints and atoi reports no errors, so a negative argument wraps to a large value and non-numeric text yields 0.
			
 
				+    if(argc > 3)
			
 
				+        workerCount = atoi(argv[3]);
			
 
				+
			
 
				+    if(argc > 4)
			
 
				+        jobsPerWorker = atoi(argv[4]);
			
 
				     
			
 
				+    printf("arguments: <numFPGA = %u> <workNum = %u> <workerCount = %u> <jobsPerWorker = %u>\n", numFPGA, workNum, workerCount, jobsPerWorker);
			
 
				+    // Poll every 1000 us: drop finished workers and call work() while fewer than workerCount are registered, until workNum reaches 0 and no workers remain.
			
 
				     while(workNum > 0 || connectionManager.getWorkerCount() > 0) {
			
 
				-        std::this_thread::sleep_for(milliseconds(300));
			
 
				+        std::this_thread::sleep_for(microseconds(1000));
			
 
				         connectionManager.removeFinishedWorkers();
			
 
				-        while(workNum > 0 && connectionManager.getWorkerCount() < 8) {
			
 
				+        while(workNum > 0 && connectionManager.getWorkerCount() < workerCount) {
			
 
				             workNum--;
			
 
				             work();
			
 
				+            std::unique_lock<std::mutex> lk(statsLk);
			
 
				+            printf("work: %2u   worker: %2lu failed: %12lu, successful: %12lu, retries: %12lu\n", workNum, connectionManager.getWorkerCount(), f, s, r);
			
 
				         }
			
 
				-        std::unique_lock<std::mutex> lk(statsLk);
			
 
				-        printf("work: %2d   worker: %2lu failed: %12lu, successful: %12lu, retries: %12lu  %8.3f MBit/s\n", workNum, connectionManager.getWorkerCount(), f, s, r, (float)s*(moduleSendPayloadLength[mod]+4)*4*10*8/1024/1024/3/(n++));
			
 
				     }
			
 
				+    std::unique_lock<std::mutex> lk(statsLk);  // statsLk is held while f, s and r are read for the final summary
			
 
				+    printf("work: %2u   worker: %2lu failed: %12lu, successful: %12lu, retries: %12lu\n", workNum, connectionManager.getWorkerCount(), f, s, r);
			
 
				     return 0;
			
 
				 }
			
--- a/tests/bandwidth.py
+++ b/tests/bandwidth.py
@@ -0,0 +1,54 @@
 
				+import tensorflow as tf
			
 
				+from tensorflow import nn
			
 
				+import numpy as np
			
 
				+import time
			
 
				+
			
 
				+import sys
			
 
				+sys.path.append('../hostLib/')
			
 
				+from hostLib.layers.conv2d import Conv2D as Conv2DFPGA
			
 
				+from hostLib import load_op
			
 
				+
			
 
				+def run(inputShape, filterShape, n):
			
 
				+  """Time n calls of nn.convolution and n calls of the MyConv2D_1 custom op on random tensors of the given shapes; print one line per op."""
			
 
				+  inputs = tf.random.uniform(shape=inputShape)
			
 
				+  kernel = tf.random.uniform(shape=filterShape)
			
 
				+  # perf_counter() is a monotonic, high-resolution clock; wall-clock time.time() can jump during a measurement.
			
 
				+  start = time.perf_counter()
			
 
				+  for _ in range(n):
			
 
				+    nn.convolution(inputs, kernel)
			
 
				+  elapsed_time = time.perf_counter() - start
			
 
				+  print("shapes: {:22s} {:22s}, count: {:6d},  CPU Conv2D OP time: {:.6f} s".format(str(inputShape), str(filterShape), n, elapsed_time))
			
 
				+  # Same measurement for the custom op exposed by load_op.op_lib, on the same tensors.
			
 
				+  start = time.perf_counter()
			
 
				+  for _ in range(n):
			
 
				+    load_op.op_lib.MyConv2D_1(input=inputs, filter=kernel)
			
 
				+  elapsed_time = time.perf_counter() - start
			
 
				+  print("shapes: {:22s} {:22s}, count: {:6d}, FPGA Conv2D OP time: {:.6f} s".format(str(inputShape), str(filterShape), n, elapsed_time))
			
 
				+# One untimed call of each op before benchmarking (presumably a warm-up so one-time setup cost is not measured; confirm).
			
 
				+input = tf.random.uniform(shape=[1,228,228,1])
			
 
				+filter = tf.random.uniform(shape=[5,5,1,1])
			
 
				+nn.convolution(input, filter)
			
 
				+load_op.op_lib.MyConv2D_1(input=input, filter=filter)
			
 
				+
			
 
				+# Sweep the first input dimension (1, 10, 100, 1000) while n shrinks (10000 .. 10), so dimension * n stays 10000.
			
 
				+run((1,228,228,1), (5,5,1,1), 10000)
			
 
				+run((10,228,228,1), (5,5,1,1), 1000)
			
 
				+run((100,228,228,1), (5,5,1,1), 100)
			
 
				+run((1000,228,228,1), (5,5,1,1), 10)
			
 
				+
			
 
				+# Sweep the last input dimension together with the matching third filter dimension; n shrinks the same way.
			
 
				+run((1,228,228,1), (5,5,1,1), 10000)
			
 
				+run((1,228,228,10), (5,5,10,1), 1000)
			
 
				+run((1,228,228,100), (5,5,100,1), 100)
			
 
				+run((1,228,228,1000), (5,5,1000,1), 10)
			
 
				+
			
 
				+# Sweep the last filter dimension (1 .. 1000) with a fixed (1,228,228,1) input; n shrinks the same way.
			
 
				+run((1,228,228,1), (5,5,1,1), 10000)
			
 
				+run((1,228,228,1), (5,5,1,10), 1000)
			
 
				+run((1,228,228,1), (5,5,1,100), 100)
			
 
				+run((1,228,228,1), (5,5,1,1000), 10)
			
 
				+# Fixed n = 10: vary the last two filter dimensions (1x1000, 10x100, 100x10, 1000x1), with the input's last dimension matching the filter's third.
			
 
				+run((1,228,228,1), (5,5,1,1000), 10)
			
 
				+run((1,228,228,10), (5,5,10,100), 10)
			
 
				+run((1,228,228,100), (5,5,100,10), 10)
			
 
				+run((1,228,228,1000), (5,5,1000,1), 10)
			
--- a/tests/benchmark.sh
+++ b/tests/benchmark.sh
@@ -0,0 +1,6 @@
 
				+#!/bin/bash
			
 
				+# Start a fresh bandwidth_perf.log: '>' truncates any previous log and writes a dated header.
			
 
				+printf "benchmark.sh on $(date)\n\n\n" > bandwidth_perf.log
			
 
				+# From here on, this script's stdout and stderr both go through tee, which prints them and appends them to the log.
			
 
				+exec &> >(tee  -a bandwidth_perf.log)
			
 
				+python3 tests/bandwidth.py