/*
COPYRIGHT (2011-2012) by:
Kevin Marco Erler (author), http://www.kevinerler.de
AIU-FSU Jena (co-owner), http://www.astro.uni-jena.de
SBSZ Jena-Göschwitz (co-owner), http://www.sbsz-jena.de
BSZ-Hermsdorf (co-owner), http://www.bszh.de
Advanced Licensing (dual license: COPYRIGHT and following licenses):
License (international): CC-BY v3.0-unported or later - link: http://creativecommons.org/licenses/by/3.0/deed.en
License (Germany):       CC-BY v3.0-DE       or later - link: http://creativecommons.org/licenses/by/3.0/de/
------------------
Compilation requirements:
Packages (x86-64):
  GCC >v4.2, compat. libstdc++ and GOMP v3.0
  CUDA->v4.0-supported GPU-driver, compat. CUDA SDK >v4.0 (e.g. for CUPrintf), compat. CUDA Toolkit (nvcc-Compiler and other CUDA-Tools)
NOTES: optimized for NVIDIA-GPU-architecture: FERMI!
       two compile-steps!
  1.) <src.cu>  ==> <src.cpp>
  2.) <src.cpp> ==> <dest>
Normal-Compile with nvcc- (for CUDA-GPU-Code) and g++-Compiler (for Host-C/C++-Code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -cuda <src.cu> -o <src.cpp> -v
  2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math <src.cpp> -o <dest> -v
Release-Compile with nvcc- (for CUDA-GPU-Code) and g++-Compiler (for Host-C/C++-Code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra -O3" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -O3 -cuda <src.cu> -o <src.cpp> -v
  2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra -O3" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -O3 <src.cpp> -o <dest> -v
Debug-Compile with nvcc- (for CUDA-GPU-Code) and g++-Compiler (for Host-C/C++-Code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -g -G3 -cuda <src.cu> -o <src.cpp> -v
  2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -g -G3 <src.cpp> -o <dest> -v
*/

// HOST: Includes of C/C++-Librarys for INTs, REAL/FLOATs, STRINGS, Math-Calc and I/O
#include <climits>
#include <stdint.h>
#include <inttypes.h>
#include <cfloat>
#include <cwchar>
#include <string>  //std:string
#include <string.h>
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <sstream>
#include <cmath>

// HOST: Conditional compilation (conditional include) of the OpenMP-Mainlib for OpenMP-Support
#ifdef _OPENMP
#include <omp.h>
#endif

// HOST: Include of CUDA-Mainlib, CUDA-Runtimelib and CUDAPrintf-Lib for CUDA-Support
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <math_functions.h>
#include "./cuPrintf/cuPrintf.cu"

using namespace std;

#define free(x) free(x); *x=NULL

//const uint64_t UINT64_MIN     = 0;

const uint64_t NF = 100000000ULL; // Anzahl Brüche (num fractions): 1000000000ULL

// DEVICE: Prototype of the CUDA-kernel declaration (CUDA-kernel = CUDA-C/C++-function)
__global__ void PiCuCalc(double *D_PiCu);

__host__ int main(int argc, char *argv[])
{
  // Runtime manipulation of OpenMP-state variables
  //omp_set_num_threads(4);
  omp_set_dynamic(0);

  // data declarations and implementations
  double starttime = 0.00, sdelay = 0.00, pdelay1 = 0.00, pdelay2 = 0.00;
  long double PiSingle = 0.000000000000000000, PiParallel = 0.000000000000000000;
  double h_PiCu[NF] = {0.000000000000000000}, *d_PiCu=NULL;

  std::cout.precision(20);

  std::cout << "PiCalc (Leibniz-Reihe)                                               (64-Bit)\n"
            << "=============================================================================\n"
            << "CPU-SERIELLE AUSFÜHRUNG:";

  //--------------------------Begin: CPU-serial execution of algorithm-------------------------------

  starttime = omp_get_wtime();
  //CPU-serial algorithm:
  for(uint64_t i = 0ULL;i<NF;++i)
  {
    PiSingle+=(pow(-1.000000000000000000,i)/((2.000000000000000000*i)+1.000000000000000000));
  }
  PiSingle*=4.000000000000000000;
  sdelay = omp_get_wtime()-starttime;
  std::cout << "                                                 done\n"; //serial (CPU)

  //--------------------------End: CPU-serial execution of algorithm---------------------------------

  //--------------------------Begin: CPU-parallel OpenMP-execution of algorithm----------------------

  std::cout << "CPU-PARALLELE AUSFÜHRUNG mit ";

  //long double *Threads = new long double[omp_get_max_threads()];
  // create parallel region: shared(Threads,...) if using "Threads"-variable
  #pragma omp parallel default(none) shared(std::cout, starttime, pdelay1, PiParallel)
  {
    #pragma omp master
    {
      std::cout << omp_get_num_threads() << " Threads:";
      starttime = omp_get_wtime();
    }

    //OpenMP-CPU-parallel algorithm with the Reduction-clause:
    #pragma omp flush
    #pragma omp for schedule(static) reduction(+: PiParallel)
    for(uint64_t k = 0ULL;k<NF;++k)
    {
      PiParallel+=(pow(-1.000000000000000000,k)/((2.000000000000000000*k)+1.000000000000000000));
      //Threads[omp_get_thread_num()]=PiParallel; //Partial results
    }

    #pragma omp master
    {
      PiParallel*=4.000000000000000000; //endresult
      pdelay1 = omp_get_wtime()-starttime;
      if(omp_get_num_threads() >= 10)
      {
        std::cout << "                                 done\n";  //parallel (CPU)
      }
      else
      {
        std::cout << "                                  done\n"; //parallel (CPU)
      }
      /*
      //Partial results of each CPU-Thread
      for(int j=0;j<omp_get_num_threads();++j)
      {
        if(j<9)
        {
          if((Threads[j]*4.000000000000000000)>=0)
          {
            std::cout << "Teilergebnis von Thread  " << (j+1) << ": " << (Threads[j]*4.000000000000000000) << '\n';
          }
          else
          {
            std::cout << "Teilergebnis von Thread  " << (j+1) << ":" << (Threads[j]*4.000000000000000000) << '\n';
          }
        }
        else
        {
          if((Threads[j]*4.000000000000000000)>=0)
          {
            std::cout << "Teilergebnis von Thread " << (j+1) << ": " << (Threads[j]*4.000000000000000000) << '\n';
          }
          else
          {
            std::cout << "Teilergebnis von Thread " << (j+1) << ":" << (Threads[j]*4.000000000000000000) << '\n';
          }
        }
      }
      */
    }
  }

  //--------------------------End: CPU-parallel OpenMP-execution of algorithm------------------------

  //++++++++++++++++++++++++++Begin: GPU-parallel CUDA-execution of algorithm++++++++++++++++++++++++

  std::cout << "GPU-PARALLELE AUSFÜHRUNG mit 1 GPU u. 14 SM:";
  // begin of CUDAPrintf-section
  //cudaPrintfInit();
  // allocate data on GPU memory
  cudaMalloc((void**)&d_PiCu,NF*sizeof(double));
  // copy Host-Data into GPU-Data (memory)
  cudaMemcpy(d_PiCu,h_PiCu,NF*sizeof(double),cudaMemcpyHostToDevice);
  starttime = omp_get_wtime();
  /* CUDA-kernel call with
     112 Blöcke (8 per 1 SM) & 192 Threads per Block = 21504 Threads = 672 Warps (=14 SM = 1 Fermi-GPC/-GPU) */
  PiCuCalc<<<112,192>>>(d_PiCu);  // Part 1 of algorithm
  // CPU external GPU-Threads synchronisation
  cudaDeviceSynchronize();
  pdelay2 = omp_get_wtime()-starttime;
  // CUDAPrintf output of CUDA-kernel call
  //cudaPrintfDisplay(stdout,true);
  // end of CUDAPrintf-section
  //cudaPrintfEnd();
  // copy calculated device-data from device memory back into CPU-data (RAM)
  cudaMemcpy(h_PiCu,d_PiCu,NF*sizeof(double),cudaMemcpyDeviceToHost);
  // reduce (accumulate) partial data (results of algorithm) to one result: h_PiCu[0]
  for(uint64_t c=1ULL;c<NF;++c)
  {
    h_PiCu[0]+=h_PiCu[c];
  }
  h_PiCu[0]*=4.000000000000000000; // endresult (PI)
  std::cout << "                             done\n"; //parallel (GPU)

  //++++++++++++++++++++++++++End: GPU-parallel CUDA-execution of algorithm++++++++++++++++++++++++++

  //--------------------------Analysis of results----------------------------------------------------

  std::cout << "\nAuswertung:\n"
            << "*****************************************************************************\n"
            << "Anzahl Brüche:          " << NF << '\n'
            << "PI - SERIELL  (CPU):    " << PiSingle << '\n'
            << "PI - PARALLEL (CPU):    " << PiParallel << '\n'
            << "PI - PARALLEL (GPU):    " << h_PiCu[0] << '\n'
            << "Referenzwert:           3.141592653589793238462643383279502884197169399375105\n"
            << "Dauer - SERIELL:        " << sdelay << " sec\n"
            << "Dauer - PARALLEL (CPU)  " << pdelay1 << " sec\n"
            << "Dauer - PARALLEL (GPU): " << pdelay2 << " sec\n";

  //delete []Threads;
  // free allocated data on GPU memory
  cudaFree(d_PiCu);

  getchar();
  return 0;
}

// DEVICE: Implementation of CUDA-kernel "PiCuCalc()"
__global__ void PiCuCalc(double *D_PiCu)
{
  // Get GPU-Thread-ID by linearization of the Thread-ID in the GPU-(Grid-and-Block)-construct (Grid- & Block locale)
  int tid = threadIdx.x + blockIdx.x*blockDim.x;
  /* algorithm (Part 2)
     NOTE: normal algorithm ist SUM(powf(-1,k)/2*k+1)) ; k=0 to k=NF, but:
           powf(>-1<,k)-(CUDA)-function of the current CUDA-Version credited incorrectly with negative value "-1",
           therefore, manual determination with tmp-variable: */
  __shared__ int64_t  tmp;
  while(tid<NF) // iterate through all the data in GPU-memory - PART 1
  {
    if(tid%2 == 0)
    {
      tmp = (1.000000000000000000);
    }
    else
    {
      tmp = (-1.000000000000000000);
    }
    D_PiCu[tid]+=(tmp/((2.000000000000000000*((double)tid))+1.000000000000000000));
    //D_PiCu[tid]+=(powf((double)(-1.000000000000000000),(double)tid)/((2.000000000000000000*((double)tid))+1.000000000000000000));
    // CUDAPrintf call
    //cuPrintf("%f\n",D_PiCu[tid]);
    // block locale GPU-Threads synchronization
    __syncthreads();
    // iterate through all the data in GPU-memory - PART 2
    tid+=(blockDim.x*gridDim.x);
  }
}