/*
COPYRIGHT (2011-2012) by:
Kevin Marco Erler (author), http://www.kevinerler.de
AIU-FSU Jena (co-owner), http://www.astro.uni-jena.de
SBSZ Jena-Göschwitz (co-owner), http://www.sbsz-jena.de
BSZ-Hermsdorf (co-owner), http://www.bszh.de
Advanced Licensing (dual license: COPYRIGHT and following licenses):
License (international): CC-BY v3.0-unported or later - link: http://creativecommons.org/licenses/by/3.0/deed.en
License (Germany):       CC-BY v3.0-DE       or later - link: http://creativecommons.org/licenses/by/3.0/de/
------------------
Compilation requirements:
Packages (x86-64):
  GCC >v4.2, compat. libstdc++ and GOMP v3.0
  CUDA->v4.0-supported GPU-driver, compat. CUDA SDK >v4.0 (e.g. for CUPrintf), compat. CUDA Toolkit (nvcc-Compiler and other CUDA-Tools)
NOTES: optimized for NVIDIA-GPU-architecture: FERMI!
       two compile-steps!
  1.) <src.cu>  ==> <src.cpp>
  2.) <src.cpp> ==> <dest>
Normal-Compile with nvcc- (for CUDA-GPU-Code) and g++-Compiler (for Host-C/C++-Code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -cuda <src.cu> -o <src.cpp> -v
  2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math <src.cpp> -o <dest> -v
Release-Compile with nvcc- (for CUDA-GPU-Code) and g++-Compiler (for Host-C/C++-Code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra -O3" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -O3 -cuda <src.cu> -o <src.cpp> -v
  2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra -O3" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -O3 <src.cpp> -o <dest> -v
Debug-Compile with nvcc- (for CUDA-GPU-Code) and g++-Compiler (for Host-C/C++-Code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -g -G3 -cuda <src.cu> -o <src.cpp> -v
  2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -g -G3 <src.cpp> -o <dest> -v
*/

// HOST: Includes of C/C++-Librarys for INTs, REAL/FLOATs, STRINGS, Math-Calc and I/O
#include <climits>
#include <stdint.h>
#include <inttypes.h>
#include <cfloat>
#include <cwchar>
#include <string>  //std:string
#include <string.h>
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <sstream>
#include <iomanip>
#include <cmath>

// HOST: Conditional compilation (conditional include) of the OpenMP-Mainlib for OpenMP-Support
#ifdef _OPENMP
#include <omp.h>
#endif

// HOST: Include of CUDA-Mainlib, CUDA-Runtimelib and CUDAPrintf-Lib for CUDA-Support
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <math_functions.h>
#include "./cuPrintf/cuPrintf.cu"

using namespace std;

#define free(x) free(x); *x=NULL

//NVdimN = NumValues of dimN
const uint64_t NVsize = 1920ULL;
const uint64_t NVdim1 = NVsize,    //dim1 = x (col)
               NVdim2 = NVsize;    //dim2 = y (row)

// DEVICE: Prototype of the CUDA-kernel declaration (CUDA-kernel = CUDA-C/C++-function)
__global__ void cu2DMatrixMul64ui(uint64_t *D_MA, uint64_t *D_MB, uint64_t *D_MC);

__host__ int main(int argc, char *argv[])
{
  // Runtime manipulation of OpenMP-state variables
  //omp_set_num_threads(4);
  omp_set_dynamic(0);
  omp_set_nested(3);  // important for nested parallelism

  // data declarations and implementations
  double starttime = 0.00, sdelay = 0.00, pdelay1 = 0.00, pdelay2 = 0.00;
  uint64_t MA_s[NVdim1][NVdim2] = {{0ULL}}, 
           MB_s[NVdim1][NVdim2] = {{0ULL}}, 
           MC_s[NVdim1][NVdim2] = {{0ULL}}, 
           MA_p[NVdim1][NVdim2] = {{0ULL}}, 
           MB_p[NVdim1][NVdim2] = {{0ULL}}, 
           MC_p[NVdim1][NVdim2] = {{0ULL}}, 
           h_MA_p[NVdim1*NVdim2]={0ULL}, 
           h_MB_p[NVdim1*NVdim2]={0ULL}, 
           h_MC_p[NVdim1*NVdim2]={0ULL}, 
           *d_MA_p=NULL, 
           *d_MB_p=NULL, 
           *d_MC_p=NULL, 
           x = 0ULL, y = 0ULL, i = 0ULL; // x: col, y: row, i: Initialisierung
  bool ResultsAreCorrect = true;

  std::cout << "Matrix-Multiplikation                                      (64-Bit)\n"
            << "===================================================================\n"
            << "Initialisierung:";

  //--------------------------Begin: Initialization of data------------------------------------------

  x = y = i = 0ULL;
  //spaltenweise vektorielle Initialisierung
  for(x=0ULL;x<NVdim1;++x)   //col
  {
    for(y=0ULL;y<NVdim2;++y) //row
    {
      MA_s[x][y] = MB_s[x][y] = MA_p[x][y] = MB_p[x][y] = h_MA_p[(x+y*NVsize)] = h_MB_p[(x+y*NVsize)] = (i+1ULL);
      ++i;
    }
  }

  //--------------------------End: Initialization of data--------------------------------------------

  std::cout << "                                               done\n"
            << "CPU-SERIELLE AUSFÜHRUNG:";

  //--------------------------Begin: CPU-serial execution of algorithm-------------------------------

  x = y = i = 0ULL;
  starttime = omp_get_wtime();
  //CPU-serial algorithm:
  for(x=0;x<NVdim1;++x)     //col
  {
    for(y=0;y<NVdim2;++y)   //row
    {
      for(i=0;i<NVsize;++i)
      {
        MC_s[x][y]+=(MA_s[x][i] * MB_s[i][y]);
      }
    }
  }
  sdelay = omp_get_wtime()-starttime;
  std::cout << "                                       done\n"; //serial

  //--------------------------End: CPU-serial execution of algorithm---------------------------------

  //--------------------------Begin: CPU-parallel OpenMP-execution of algorithm----------------------

  std::cout << "CPU-PARALLELE AUSFÜHRUNG mit ";

  x = y = i = 0ULL;
  // create parallel region:
  #pragma omp parallel default(none) private(x, y, i) shared(std::cout, starttime, pdelay1, MA_p, MB_p, MC_p)
  {
    #pragma omp master
    {
      std::cout << omp_get_num_threads() << " Threads:";
      starttime = omp_get_wtime();
    }

    /* OpenMP-CPU-parallel algorithm with nested parallelism:
       Nested parallelism with collapse(k)-clause on OpenMP-for-directive (k = nesting deep of loops). */
    #pragma omp flush
    #pragma omp for collapse(3) schedule(static)
    for(x=0;x<NVdim1;++x)     //col
    {
      for(y=0;y<NVdim2;++y)   //row
      {
        for(i=0;i<NVsize;++i)
        {
          MC_p[x][y]+=(MA_p[x][i] * MB_p[i][y]);
        }
      }
    }

    #pragma omp master
    {
      pdelay1 = omp_get_wtime()-starttime;
      if(omp_get_num_threads() >= 10)
      {
        std::cout << "                       done\n";  //parallel
      }
      else
      {
        std::cout << "                        done\n"; //parallel
      }
    }
  }
  x = y = 0ULL;

  //--------------------------End: CPU-parallel OpenMP-execution of algorithm------------------------

  //++++++++++++++++++++++++++Begin: GPU-parallel CUDA-execution of algorithm++++++++++++++++++++++++

  std::cout << "GPU-PARALLELE AUSFÜHRUNG mit 1 GPU u. 14 SM:";
  cudaSetDevice(0); // choose GPU

  // allocate data on GPU memory
  cudaMalloc((void**)&d_MA_p,((NVdim1*NVdim2)*sizeof(uint64_t)));
  cudaMalloc((void**)&d_MB_p,((NVdim1*NVdim2)*sizeof(uint64_t)));
  cudaMalloc((void**)&d_MC_p,((NVdim1*NVdim2)*sizeof(uint64_t)));
  // copy Host-Data into GPU-Data (memory)
  cudaMemcpy(d_MA_p,(void*)h_MA_p,((NVdim1*NVdim2)*sizeof(uint64_t)),cudaMemcpyHostToDevice);
  cudaMemcpy(d_MB_p,(void*)h_MB_p,((NVdim1*NVdim2)*sizeof(uint64_t)),cudaMemcpyHostToDevice);
  // begin of CUDAPrintf-section
  cudaPrintfInit();
  starttime = omp_get_wtime();
  /* CUDA-kernel call with
     112 Blöcke (8 per 1 SM) & 192 Threads per Block = 21504 Threads = 672 Warps (=14 SM = 1 Fermi-GPC/-GPU) */
  cu2DMatrixMul64ui<<<112,192>>>(d_MA_p,d_MB_p,d_MC_p);
  //std::cout << "Fehler?: " << cudaGetErrorString(cudaGetLastError()) << '\n';
  // CPU external GPU-Threads synchronisation
  cudaThreadSynchronize();
  pdelay2=(omp_get_wtime()-starttime);
  // CUDAPrintf output of CUDA-kernel call
  cudaPrintfDisplay(stdout,true);
  // end of CUDAPrintf-section
  cudaPrintfEnd();
  // copy calculated device-data from device memory back into CPU-data (RAM)
  cudaMemcpy((void*)h_MC_p,d_MC_p,((NVdim1*NVdim2)*sizeof(uint64_t)),cudaMemcpyDeviceToHost);
  /*
  // Control output
  for(uint64_t i=0ULL;i<NVdim1;++i)
  {
    for(uint64_t j=0ULL;j<NVdim2;++j)
    {
      printf("%d\n",h_MC_p[i+j*NVsize]);
    }
    printf("\n");
  }
  */
  /*
  // Control output
  std::cout << h_MC_p[0] << std::endl
            << h_MC_p[(NVdim1-1)+(NVdim2-1)*NVsize] << std::endl;
  */
  std::cout << "                   done\n"; //parallel (GPU)

  //++++++++++++++++++++++++++End: GPU-parallel CUDA-execution of algorithm++++++++++++++++++++++++++

  //--------------------------Analysis of results----------------------------------------------------

  std::cout << "Überprüfe Ergebnisse:"; //spaltenweise vektorielle Überprüfung
  for(x=0ULL;x<NVdim1;++x)   //col
  {
    for(y=0ULL;y<NVdim2;++y) //row
    {
      if((MC_p[x][y]!=MC_s[x][y])&&(h_MC_p[(x+y*NVsize)]!=MC_s[x][y]))
      {
        ResultsAreCorrect = false;
        break;
      }
    }
  }
  std::cout << "                                          done\n";

  std::cout << "\nAuswertung:\n"
            << "*******************************************************************\n"
            << "Anzahl Komponenten der 2D-Eingangsmatrix  A: " << (NVdim1*NVdim2) << '\n'
            << "Anzahl Komponenten der 2D-Eingangsmatrix  B: " << (NVdim1*NVdim2) << '\n'
            << "Anzahl Komponenten der 2D-Ergebnismatrix  C: " << (NVdim1*NVdim2) << '\n'
            << "Seriell & parallel richtig gerechnet?:                          " << ((ResultsAreCorrect==true)?"yes\n":" no\n")
            << "Dauer - SERIELL  (CPU): " << sdelay << " sec\n"
            << "Dauer - PARALLEL (CPU): " << pdelay1 << " sec\n"
            << "Dauer - PARALLEL (GPU): " << pdelay2 << " sec\n"
            << "__________________\n"
            << "==>Summenmatrix (CPU-seriell berechnet):\n"
            << " MA(" << MA_s[0][0] << ';' << MA_s[1][0] << ';' << MA_s[2][0] << ';' << MA_s[3][0] << ";...;"
                     << MA_s[NVdim1-2][NVdim2-1] << ';' << MA_s[NVdim1-1][NVdim2-1] << ")\n+"
            << "MB(" << MB_s[0][0] << ';' << MB_s[1][0] << ';' << MB_s[2][0] << ';' << MB_s[3][0] << ";...;"
                     << MB_s[NVdim1-2][NVdim2-1] << ';' << MB_s[NVdim1-1][NVdim2-1] << ")\n="
            << "MC(" << MC_s[0][0] << ';' << MC_s[1][0] << ';' << MC_s[2][0] << ';' << MC_s[3][0] << ";...;"
                     << MC_s[NVdim1-2][NVdim2-1] << ';' << MC_s[NVdim1-1][NVdim2-1] << ")\n"
            << "==>Summenmatrix (CPU-parallel berechnet):\n"
            << " MA(" << MA_p[0][0] << ';' << MA_p[1][0] << ';' << MA_p[2][0] << ';' << MA_p[3][0] << ";...;"
                     << MA_p[NVdim1-2][NVdim2-1] << ';' << MA_p[NVdim1-1][NVdim2-1] << ")\n+"
            << "MB(" << MB_p[0][0] << ';' << MB_p[1][0] << ';' << MB_p[2][0] << ';' << MB_p[3][0] << ";...;"
                     << MB_p[NVdim1-2][NVdim2-1] << ';' << MB_p[NVdim1-1][NVdim2-1] << ")\n="
            << "MC(" << MC_p[0][0] << ';' << MC_p[1][0] << ';' << MC_p[2][0] << ';' << MC_p[3][0] << ";...;"
                     << MC_p[NVdim1-2][NVdim2-1] << ';' << MC_p[NVdim1-1][NVdim2-1] << ")\n"
            << "==>Summenmatrix (GPU-parallel berechnet):\n"
            << " MA(" << h_MA_p[(0+0*NVsize)] << ';' << h_MA_p[(1+0*NVsize)] << ';' << h_MA_p[(2+0*NVsize)] << ';' << h_MA_p[(3+0*NVsize)] << ";...;"
                     << h_MA_p[((NVdim1-2)+(NVdim2-1)*NVsize)] << ';' << h_MA_p[((NVdim1-1)+(NVdim2-1)*NVsize)] << ")\n+"
            << "MB(" << h_MB_p[(0+0*NVsize)] << ';' << h_MB_p[(1+0*NVsize)] << ';' << h_MB_p[(2+0*NVsize)] << ';' << h_MB_p[(3+0*NVsize)] << ";...;"
                     << h_MB_p[((NVdim1-2)+(NVdim2-1)*NVsize)] << ';' << h_MB_p[((NVdim1-1)+(NVdim2-1)*NVsize)] << ")\n="
            << "MC(" << h_MC_p[(0+0*NVsize)] << ';' << h_MC_p[(1+0*NVsize)] << ';' << h_MC_p[(2+0*NVsize)] << ';' << h_MC_p[(3+0*NVsize)] << ";...;"
                     << h_MC_p[((NVdim1-2)+(NVdim2-1)*NVsize)] << ';' << h_MC_p[((NVdim1-1)+(NVdim2-1)*NVsize)] << ")\n";
  /*
  // Detailed output
  std::cout << "__________________\n"
            << "Ergebnisliste:\n";
  x = y = i = 0ULL;
  uint64_t j = 0ULL;
  for(x=0ULL;x<NVdim1;++x)   //col
  {
    for(y=0ULL;y<NVdim2;++y) //row
    {
      for(i=0;i<NVsize;++i)
      {
        std::cout << "Seriell:    MA" << (j+1ULL) << '[' << x << "][" << i << "](" << MA_s[x][i] << ") + "
                  <<             "MB" << (j+1ULL) << '[' << i << "][" << y << "](" << MB_s[i][y] << ") = "
                  <<             "MC" << (j+1ULL) << '[' << x << "][" << y << "](" << MC_s[x][y] << ")\n"
                  << "Parallel:   MA" << (j+1ULL) << '[' << x << "][" << i << "](" << MA_p[x][i] << ") + "
                  <<             "MB" << (j+1ULL) << '[' << i << "][" << y << "](" << MB_p[i][y] << ") = "
                  <<             "MC" << (j+1ULL) << '[' << x << "][" << y << "](" << MC_p[x][y] << ")\n";
        ++j;
      }
    }
  }
  */
  // free allocated data on GPU memory
  cudaFree(d_MA_p);
  cudaFree(d_MB_p);
  cudaFree(d_MC_p);

  getchar();
  return 0;
}

// DEVICE: Implementation of CUDA-kernel "cu2DMatrixMul64ui()"
__global__ void cu2DMatrixMul64ui(uint64_t *D_MA, uint64_t *D_MB, uint64_t *D_MC)
{
  // Get GPU-Thread-ID (dim.x and dim.y) by linearization of the Thread-ID in the GPU-(Grid-and-Block)-construct (Grid- & Block locale)
  unsigned int tx = threadIdx.x + blockDim.x*blockIdx.x;
  unsigned int ty = threadIdx.y + blockDim.y*blockIdx.y;

  // algorithm
  uint64_t rC = 0;
  for(uint64_t z=0;z<NVsize;++z)
  {
    rC+=(D_MA[ty*NVsize+z]*D_MB[z*NVsize+tx]);
    // CUDAPrintf call
    //cuPrintf("%d\n",D_MA[ty*NVsize+z]);
  }
  D_MC[ty*NVsize+tx]=rC;
  // block locale GPU-Threads synchronization
  //__syncthreads();
}