/*
COPYRIGHT (2011-2012) by:
Kevin Marco Erler (author), http://www.kevinerler.de
AIU-FSU Jena (co-owner), http://www.astro.uni-jena.de
SBSZ Jena-Göschwitz (co-owner), http://www.sbsz-jena.de
BSZ-Hermsdorf (co-owner), http://www.bszh.de
Advanced Licensing (dual license: COPYRIGHT and following licenses):
License (international): CC-BY v3.0-unported or later - link: http://creativecommons.org/licenses/by/3.0/deed.en
License (Germany):       CC-BY v3.0-DE       or later - link: http://creativecommons.org/licenses/by/3.0/de/
------------------
Compilation requirements:
Packages (x86-64):
  GCC >v4.2, compat. libstdc++ and GOMP v3.0
  CUDA->v4.0-supported GPU-driver, compat. CUDA SDK >v4.0 (e.g. for CUPrintf), compat. CUDA Toolkit (nvcc-Compiler and other CUDA-Tools)
NOTES: optimized for NVIDIA-GPU-architecture: FERMI!
       two compile-steps!
  1.) <src.cu>  ==> <src.cpp>
  2.) <src.cpp> ==> <dest>
Normal-Compile with nvcc- (for CUDA-GPU-Code) and g++-Compiler (for Host-C/C++-Code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -cuda <src.cu> -o <src.cpp> -v
  2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math <src.cpp> -o <dest> -v
Release-Compile with nvcc- (for CUDA-GPU-Code) and g++-Compiler (for Host-C/C++-Code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra -O3" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -O3 -cuda <src.cu> -o <src.cpp> -v
  2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra -O3" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -O3 <src.cpp> -o <dest> -v
Debug-Compile with nvcc- (for CUDA-GPU-Code) and g++-Compiler (for Host-C/C++-Code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -g -G3 -cuda <src.cu> -o <src.cpp> -v
  2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -g -G3 <src.cpp> -o <dest> -v
*/

// HOST: Includes of C/C++-Librarys for INTs, REAL/FLOATs, STRINGS, Math-Calc and I/O
#include <cxxabi.h>
#include <climits>
#include <stdint.h>
#include <inttypes.h>
#include <cfloat>
#include <cwchar>
#include <string>  //std:string
#include <string.h>
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <sstream>
#include <cmath>

// HOST: Conditional compilation (conditional include) of the OpenMP-Mainlib for OpenMP-Support
#ifdef _OPENMP
#include <omp.h>
#endif

// HOST: Include of CUDA-Mainlib, CUDA-Runtimelib and CUDAPrintf-Lib for CUDA-Support
#include <cuda.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "./cuPrintf/cuPrintf.cu"

using namespace std;

#define free(x) free(x); *x=NULL
const uint64_t NumValues = 400000000ULL,
               DataPartitionSize = (NumValues/2ULL);

// DEVICE: Prototype of the CUDA-kernel declaration (CUDA-kernel = CUDA-C/C++-function)
__global__ void cu1DVecAdd64(uint64_t *D_A, uint64_t *D_B, uint64_t *D_C);

__host__ int main(int argc, char *argv[])
{
  // Runtime manipulation of OpenMP-state variables
  //omp_set_num_threads(4);
  omp_set_dynamic(0);

  // data declarations and implementations
  double starttime = 0.00, sdelay = 0.00, pdelay1 = 0.00, pdelay2 = 0.00;
  uint64_t A_s[NumValues] = {0ULL}, \
           B_s[NumValues] = {0ULL}, \
           C_s[NumValues] = {0ULL}, \
           A_p[NumValues] = {0ULL}, \
           B_p[NumValues] = {0ULL}, \
           C_p[NumValues] = {0ULL}, \
           h_A_p[NumValues] = {0ULL}, \
           h_B_p[NumValues] = {0ULL}, \
           h_C_p[NumValues] = {0ULL}, \
           *d_A_p=NULL, *d_B_p=NULL, *d_C_p=NULL;
  bool ResultsAreCorrect = false;

  std::cout << "Vektoraddition (1D)                                        (64-Bit)\n"
            << "===================================================================\n"
            << "Initialisierung:";

  //--------------------------Begin: Initialization of data------------------------------------------

  for(uint64_t i=0ULL;i<NumValues;++i)
  {
    A_s[i] = A_p[i] = h_A_p[i] = B_s[i] = B_p[i] = h_B_p[i] = (i+1ULL);
  }

  //--------------------------End: Initialization of data--------------------------------------------

  std::cout << "                                               done\n"
            << "CPU-SERIELLE AUSFÜHRUNG:";

  //--------------------------Begin: CPU-serial execution of algorithm-------------------------------

  starttime = omp_get_wtime();
  //CPU-serial algorithm:
  for(uint64_t j=0ULL;j<NumValues;++j)
  {
    C_s[j] = A_s[j] + B_s[j];
  }
  sdelay = omp_get_wtime()-starttime;
  std::cout << "                                       done\n"; //serial

  //--------------------------End: CPU-serial execution of algorithm---------------------------------

  //--------------------------Begin: CPU-parallel OpenMP-execution of algorithm----------------------

  std::cout << "CPU-PARALLELE AUSFÜHRUNG mit ";
  // create parallel region:
  #pragma omp parallel default(none) shared(std::cout, starttime, pdelay1, A_p, B_p, C_p)
  {
    #pragma omp master
    {
      std::cout << omp_get_num_threads() << " Threads:";
      starttime = omp_get_wtime();
    }

    //OpenMP-CPU-parallel algorithm
    #pragma omp flush
    #pragma omp for schedule(static)
    for(uint64_t k=0ULL;k<NumValues;++k)
    {
      C_p[k] = A_p[k] + B_p[k];
    }

    #pragma omp master
    {
      pdelay1 = omp_get_wtime()-starttime;
      if(omp_get_num_threads() >= 10)
      {
        std::cout << "                       done\n";  //parallel
      }
      else
      {
        std::cout << "                        done\n"; //parallel
      }
    }
  }

  //--------------------------End: CPU-parallel OpenMP-execution of algorithm------------------------

  //++++++++++++++++++++++++++Begin: GPU-parallel CUDA-execution of algorithm++++++++++++++++++++++++

  std::cout << "GPU-PARALLELE AUSFÜHRUNG mit 2 GPU u. 14 SM:";
  // choose GPU
  cudaSetDevice(0);
  // allocate data on GPU memory
  cudaMalloc((void**)&d_A_p,(DataPartitionSize*sizeof(uint64_t)));
  cudaMalloc((void**)&d_B_p,(DataPartitionSize*sizeof(uint64_t)));
  cudaMalloc((void**)&d_C_p,(DataPartitionSize*sizeof(uint64_t)));
  // partitioning of data (CPU memory = big, but GPU memory (relatively) = small)
  for(uint64_t partition=0;partition<(NumValues/DataPartitionSize);++partition) //(NumValues/DataPartitionSize) = NumPartitions
  {
    // copy Host-Data into GPU-Data (memory)
    cudaMemcpy(d_A_p,(void*)(h_A_p+(partition*DataPartitionSize)),(DataPartitionSize*sizeof(uint64_t)),cudaMemcpyHostToDevice);
    cudaMemcpy(d_B_p,(void*)(h_B_p+(partition*DataPartitionSize)),(DataPartitionSize*sizeof(uint64_t)),cudaMemcpyHostToDevice);
    starttime = omp_get_wtime();
    /* CUDA-kernel call with
       112 Blöcke (8 per 1 SM) & 192 Threads per Block = 21504 Threads = 672 Warps (=14 SM = 1 Fermi-GPC/-GPU) */
    cu1DVecAdd64<<<112,192>>>(d_A_p,d_B_p,d_C_p);
    //std::cout << "Fehler?: " << cudaGetErrorString(cudaGetLastError()) << '\n';
    // CPU external GPU-Threads synchronisation
    cudaDeviceSynchronize();
    pdelay2+=(omp_get_wtime()-starttime);
    // copy calculated device-data from device memory back into CPU-data (RAM)
    cudaMemcpy((void*)(h_C_p+(partition*DataPartitionSize)),d_C_p,(DataPartitionSize*sizeof(uint64_t)),cudaMemcpyDeviceToHost);
  }
  std::cout << "                   done\n"; //parallel (GPU)

  //++++++++++++++++++++++++++End: GPU-parallel CUDA-execution of algorithm++++++++++++++++++++++++++

  //--------------------------Analysis of results----------------------------------------------------

  std::cout << "Überprüfe Ergebnisse:";
  for(uint64_t l=0ULL;l<NumValues;++l)
  {
    if((C_p[l]==C_s[l])&&(h_C_p[l]==C_s[l]))
    {
      ResultsAreCorrect = true;
    }
    else
    {
      ResultsAreCorrect = false;
      break;
    }
  }
  std::cout << "                                          done\n";

  std::cout << "\nAuswertung:\n"
            << "*******************************************************************\n"
            << "Anzahl 1D-Eingangsvektoren  A: " << NumValues << '\n'
            << "Anzahl 1D-Eingangsvektoren  B: " << NumValues << '\n'
            << "Anzahl 1D-Ergebnis-Vektoren C: " << NumValues << '\n'
            << "Seriell & parallel richtig gerechnet?:                          " << ((ResultsAreCorrect==true)?"yes\n":" no\n")
            << "Dauer - SERIELL:           " << sdelay << " sec\n"
            << "Dauer - PARALLEL (CPU):    " << pdelay1 << " sec\n"
            << "Dauer - PARALLEL (GPU):    " << pdelay2 << " sec\n"
            << "__________________\n"
            << "Beispiele:\n"
            << "==> 1.(1D)-Vektoraddition:\n"
            << "C1 = A1(" << A_p[0] << ") + B1(" << B_p[0] << ")\n"
            << "C1 = " << C_p[0] << "\n"
            << "==> " << NumValues << ".(1D)-Vektoraddition:\n"
            << "C" << NumValues << " = A" << NumValues << "(" << A_p[NumValues-1] << ") + B" << NumValues << "(" << B_p[NumValues-1] << ")\n"
            << "C" << NumValues << " = " << C_p[NumValues-1] << "\n";
  /*
  // Detailed output
  std::cout << "__________________\n"
            << "Ergebnisliste:\n";
  for(uint64_t m=0ULL;m<NumValues;++m)
  {
    std::cout << "Seriell:    A" << (m+1) << " = " << A_s[m] << "    B" << (m+1) << " = " << B_s[m] << "    C" << (m+1) << " = " << C_s[m] << '\n';
    std::cout << "Parallel (CPU):   A" << (m+1) << " = " << A_p[m] << "    B" << (m+1) << " = " << B_p[m] << "    C" << (m+1) << " = " << C_p[m] << '\n';
    std::cout << "Parallel (GPU):   A" << (m+1) << " = " << h_A_p[m] << "    B" << (m+1) << " = " << h_B_p[m] << "    C" << (m+1) << " = " << h_C_p[m] << '\n';
  }
  */
  // free allocated data on GPU memory
  cudaFree(d_A_p);
  cudaFree(d_B_p);
  cudaFree(d_C_p);

  getchar();
  return 0;
}

// DEVICE: Implementation of CUDA-kernel "cu1DVecAdd64()"
__global__ void cu1DVecAdd64(uint64_t *D_A, uint64_t *D_B, uint64_t *D_C)
{
  // for work with faster on-Chip GPU-shared memory
  __shared__ uint64_t sD_A;
  __shared__ uint64_t sD_B;
  __shared__ uint64_t sD_C;
  // Get GPU-Thread-ID by linearization of the Thread-ID in the GPU-(Grid-and-Block)-construct (Grid- & Block locale)
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  // algorithm
  while(tid < DataPartitionSize) // iterate through all the data in GPU-memory - PART 1
  {
    sD_A = D_A[tid];
    sD_B = D_B[tid];
    sD_C = sD_A + sD_B;
    D_C[tid] = sD_C;
    // block locale GPU-Threads synchronization
    __syncthreads();
    tid+=(blockDim.x*gridDim.x); // iterate through all the data in GPU-memory - PART 2
  }
}