Browse Source

added bandwidth benchmark

subDesTagesMitExtraKaese 3 years ago
parent
commit
a49eba986a
5 changed files with 121 additions and 12 deletions
  1. 1 0
      .gitignore
  2. 28 0
      c++/tests/benchmark.sh
  3. 32 12
      c++/tests/main.cpp
  4. 54 0
      tests/bandwidth.py
  5. 6 0
      tests/benchmark.sh

+ 1 - 0
.gitignore

@@ -5,3 +5,4 @@
 __pycache__
 /c++/build/
 /model.png
+*.log

+ 28 - 0
c++/tests/benchmark.sh

@@ -0,0 +1,28 @@
+#!/bin/bash
+# Start a fresh log; the date is passed as an argument so it is never parsed as a printf format.
+printf "benchmark.sh on %s\n\n\n" "$(date)" > bandwidth_perf.log
+
+run() {  # usage: run <numFPGA> <workNum> <workerCount> <jobsPerWorker>  (forwarded unquoted to build/test)
+  echo "$(date) build/test $1 $2 $3 $4"
+  { /usr/bin/time -f "%U user %S system %E(%e s) elapsed  %P CPU\n(%Xtext+%Ddata %Mmax)k\n%I inputs + %O outputs (%F major + %R minor)pagefaults %W swaps" build/test $1 $2 $3 $4 ; } 2>> bandwidth_perf.log | \
+  head -n 110 | tee -a bandwidth_perf.log  # only the first 110 lines of program output reach console and log
+  printf "\n\n" >> bandwidth_perf.log
+}
+
+run 1 100 4 100
+run 2 100 4 100
+run 3 100 4 100
+
+run 3 100 1 100
+run 3 100 2 100
+run 3 100 4 100
+run 3 100 8 100
+run 3 100 16 100
+run 3 100 32 100
+
+run 3 10000 4 1
+run 3 1000 4 10
+run 3 100 4 100
+run 3 10 4 1000
+run 3 1 1 10000
+

+ 32 - 12
c++/tests/main.cpp

@@ -3,13 +3,14 @@
 
 ConnectionManager connectionManager;
 
-Module mod = Module::dummyBigModule;
+Module mod = Module::conv2D_5x5_Module;
+unsigned int jobsPerWorker = 100;
 
 size_t s=0, f=0, r=0;
 std::mutex statsLk;
 
 void work() {
-    auto worker = connectionManager.createWorker(mod, 1000);
+    auto worker = connectionManager.createWorker(mod, jobsPerWorker);
 
     worker->setJobTimeout(milliseconds(1000));
     worker->setRetryCount(10);
@@ -45,30 +46,49 @@ void work() {
     worker->startAsync();
 }
 
-int main(void)
+int main(int argc, char *argv[])
 {
     puts("This is a shared library test...");
 
+    unsigned int numFPGA = 3;      // argv[1]: how many of the FPGA addresses below get registered
+    unsigned int workNum = 100;    // argv[2]: number of work() calls still to be issued
+    unsigned int workerCount = 1;  // argv[3]: upper bound on concurrently existing workers
+
+    if(argc > 1)
+        numFPGA = atoi(argv[1]);
     
-    connectionManager.addFPGA("192.168.1.33", 1234);
-    connectionManager.addFPGA("192.168.1.34", 1234);
+    if(numFPGA >= 1)
+        connectionManager.addFPGA("192.168.1.33", 1234);
+    if(numFPGA >= 2)
+        connectionManager.addFPGA("192.168.1.34", 1234);
+    if(numFPGA >= 3)
+        connectionManager.addFPGA("192.168.1.35", 1234);
 
     connectionManager.setSendDelay(microseconds(50));
-
     connectionManager.start();
 
-    int workNum = 10000;
-    int n=1;
+    if(argc > 2)
+        workNum = atoi(argv[2]);
+
+    if(argc > 3)
+        workerCount = atoi(argv[3]);
+
+    if(argc > 4)
+        jobsPerWorker = atoi(argv[4]);  // argv[4]: global read by work() when it creates a worker
     
+    printf("arguments: <numFPGA = %u> <workNum = %u> <workerCount = %u> <jobsPerWorker = %u>\n", numFPGA, workNum, workerCount, jobsPerWorker);
+
     while(workNum > 0 || connectionManager.getWorkerCount() > 0) {
-        std::this_thread::sleep_for(milliseconds(300));
+        std::this_thread::sleep_for(microseconds(1000));
         connectionManager.removeFinishedWorkers();
-        while(workNum > 0 && connectionManager.getWorkerCount() < 8) {
+        while(workNum > 0 && connectionManager.getWorkerCount() < workerCount) {
             workNum--;
             work();
+            std::unique_lock<std::mutex> lk(statsLk);  // s, f, r are guarded by statsLk
+            printf("work: %2u   worker: %2lu failed: %12zu, successful: %12zu, retries: %12zu\n", workNum, connectionManager.getWorkerCount(), f, s, r);
         }
-        std::unique_lock<std::mutex> lk(statsLk);
-        printf("work: %2d   worker: %2lu failed: %12lu, successful: %12lu, retries: %12lu  %8.3f MBit/s\n", workNum, connectionManager.getWorkerCount(), f, s, r, (float)s*(moduleSendPayloadLength[mod]+4)*4*10*8/1024/1024/3/(n++));
     }
+    std::unique_lock<std::mutex> lk(statsLk);  // final totals, printed once after the loop has ended
+    printf("work: %2u   worker: %2lu failed: %12zu, successful: %12zu, retries: %12zu\n", workNum, connectionManager.getWorkerCount(), f, s, r);
     return 0;
 }

+ 54 - 0
tests/bandwidth.py

@@ -0,0 +1,54 @@
+import tensorflow as tf
+from tensorflow import nn
+import numpy as np
+import time
+
+import sys
+sys.path.append('../hostLib/')
+from hostLib.layers.conv2d import Conv2D as Conv2DFPGA
+from hostLib import load_op
+
+def run(inputShape, filterShape, n):
+
+  input = tf.random.uniform(shape=inputShape)
+  filter = tf.random.uniform(shape=filterShape)
+
+  start = time.time()
+  for i in range(n):
+    nn.convolution(input, filter)
+  elapsed_time = time.time() - start
+  print("shapes: {:22s} {:22s}, count: {:6d},  CPU Conv2D OP time: {:.6f} s".format(str(inputShape), str(filterShape), n, elapsed_time))
+
+  start = time.time()
+  for i in range(n):
+    load_op.op_lib.MyConv2D_1(input=input, filter=filter)
+  elapsed_time = time.time() - start
+  print("shapes: {:22s} {:22s}, count: {:6d}, FPGA Conv2D OP time: {:.6f} s".format(str(inputShape), str(filterShape), n, elapsed_time))
+
+input = tf.random.uniform(shape=[1,228,228,1])
+filter = tf.random.uniform(shape=[5,5,1,1])
+nn.convolution(input, filter)  # NOTE(review): untimed call before the runs, presumably a warm-up; confirm
+load_op.op_lib.MyConv2D_1(input=input, filter=filter)  # same untimed call for the custom op
+
+# Sweep 1: grow the first input dimension 1 -> 1000 while the repeat count shrinks 10000 -> 10.
+run((1,228,228,1), (5,5,1,1), 10000)
+run((10,228,228,1), (5,5,1,1), 1000)
+run((100,228,228,1), (5,5,1,1), 100)
+run((1000,228,228,1), (5,5,1,1), 10)
+
+# Sweep 2: grow the last input dimension together with the third filter dimension.
+run((1,228,228,1), (5,5,1,1), 10000)
+run((1,228,228,10), (5,5,10,1), 1000)
+run((1,228,228,100), (5,5,100,1), 100)
+run((1,228,228,1000), (5,5,1000,1), 10)
+
+# Sweep 3: grow only the last filter dimension.
+run((1,228,228,1), (5,5,1,1), 10000)
+run((1,228,228,1), (5,5,1,10), 1000)
+run((1,228,228,1), (5,5,1,100), 100)
+run((1,228,228,1), (5,5,1,1000), 10)
+# Sweep 4: fixed count of 10; the last two filter dimensions always multiply to 1000 (first and last runs repeat earlier ones).
+run((1,228,228,1), (5,5,1,1000), 10)
+run((1,228,228,10), (5,5,10,100), 10)
+run((1,228,228,100), (5,5,100,10), 10)
+run((1,228,228,1000), (5,5,1000,1), 10)

+ 6 - 0
tests/benchmark.sh

@@ -0,0 +1,6 @@
+#!/bin/bash
+
+printf "benchmark.sh on %s\n\n\n" "$(date)" > bandwidth_perf.log
+# From here on, mirror stdout and stderr to the console and append both to the log.
+exec &> >(tee  -a bandwidth_perf.log)
+python3 tests/bandwidth.py