/*
COPYRIGHT (2011-2012) by:
Kevin Marco Erler (author), http://www.kevinerler.de
AIU-FSU Jena (co-owner), http://www.astro.uni-jena.de
SBSZ Jena-Göschwitz (co-owner), http://www.sbsz-jena.de
BSZ-Hermsdorf (co-owner), http://www.bszh.de
Advanced Licensing (dual license: COPYRIGHT and following licenses):
License (international): CC-BY v3.0-unported or later - link: http://creativecommons.org/licenses/by/3.0/deed.en
License (Germany):       CC-BY v3.0-DE       or later - link: http://creativecommons.org/licenses/by/3.0/de/
------------------
Compilation requirements:
Packages (x86-64):
  GCC >v4.2, compat. libstdc++ and GOMP v3.0
  GPU driver supporting CUDA >v4.0, compatible CUDA SDK >v4.0 (e.g. for cuPrintf), compatible CUDA Toolkit (nvcc compiler and other CUDA tools)
NOTES: optimized for the NVIDIA GPU architecture FERMI!
       compilation takes two steps:
  1.) <src.cu>  ==> <src.cpp>
  2.) <src.cpp> ==> <dest>
Normal compile with the nvcc compiler (for CUDA GPU code) and the g++ compiler (for host C/C++ code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -cuda <src.cu> -o <src.cpp> -v
  2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math <src.cpp> -o <dest> -v
Release compile with the nvcc compiler (for CUDA GPU code) and the g++ compiler (for host C/C++ code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra -O3" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -O3 -cuda <src.cu> -o <src.cpp> -v
  2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra -O3" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -O3 <src.cpp> -o <dest> -v
Debug compile with the nvcc compiler (for CUDA GPU code) and the g++ compiler (for host C/C++ code) (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  1.) nvcc -ccbin /usr/bin/g++ -Xcompiler "-m64 -fopenmp -lstdc++ -lm -lgomp -lcuda -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcuda -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -g -G3 -cuda <src.cu> -o <src.cpp> -v
  2.) nvcc -x c++ -ccbin /usr/bin/g++ -Xcompiler "-std=c++0x -m64 -fopenmp -lstdc++ -lm -lgomp -lcudart -Wall -Wextra" -m64 -gencode=arch=compute_10,code=sm_10 -gencode=arch=compute_10,code=compute_10 -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_20,code=compute_20 -lstdc++ -lm -lgomp -lcudart -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/include/ -I/usr/local/cuda/lib -I/usr/local/cuda/lib64 -I /usr/local/cuda/include/ -use_fast_math -g -G3 <src.cpp> -o <dest> -v
*/

// HOST: includes of C/C++ library headers for integers, floating-point types, strings, math and I/O
#include <cxxabi.h>
#include <climits>
#include <stdint.h>
#include <inttypes.h>
#include <cfloat>
#include <cwchar>
#include <string>  // std::string
#include <string.h>
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <sstream>
#include <cmath>

// HOST: conditional include of the OpenMP main header (only compiled in when OpenMP support is enabled)
#ifdef _OPENMP
#include <omp.h>
#endif
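// (_OPENMP is predefined by the compiler whenever OpenMP is enabled, e.g. by g++ -fopenmp)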

// HOST: include of the CUDA main header, the CUDA runtime headers and the cuPrintf library for CUDA support
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "./cuPrintf/cuPrintf.cu"

using namespace std;

// "Safe free": release the memory and reset the pointer itself to NULL afterwards
// (inside the replacement list the name free is not re-expanded, so the real free() is called)
#define free(x) do { free(x); (x) = NULL; } while(0)
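// Hypothetical usage example (the macro is not used in this file):
//   int *p = (int*)malloc(sizeof(int));
//   free(p);  // frees p and resets it to NULL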

// DEVICE: prototype declaration of the CUDA kernel (a CUDA kernel is a __global__ C/C++ function)
__global__ void GPUthread(void);

__host__ int main(int argc, char *argv[])
{
  // Adjust the OpenMP runtime state
  //omp_set_num_threads(4);
  omp_set_dynamic(false);  // disable dynamic adjustment of the number of threads
  omp_set_nested(true);    // allow nested parallel regions

  // Get the number of CUDA-capable GPUs
  int NumGPUs = 0;
  cudaGetDeviceCount(&NumGPUs);
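  // Hedged sketch: cudaGetDeviceCount() returns a cudaError_t, so a more defensive variant could be:
  //   if(cudaGetDeviceCount(&NumGPUs) != cudaSuccess) { NumGPUs = 0; }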

  std::cout << "Hello World                                                                           (64-Bit)\n"
            << "==============================================================================================\n";

  /* Create an OpenMP parallel region to use multiple CPUs and multiple GPUs.
     For CUDA: only one CPU thread can use a given GPU session at a time. */
  #pragma omp parallel default(none) shared(std::cout, stdout, NumGPUs)
  {
    // "Hello World" from (Multi-)CPU and get CPU information
    #pragma omp master
    {
      std::cout << "==>sagt die CPU (HOST, SERIELL):                                                          done\n";
      if(omp_get_num_procs() > 1)
      {
        std::cout << "Multiprozessor:                        yes (" << omp_get_num_procs() << " CPU-Kerne)\n"
                  << "Max. Anzahl möglicher Threads:         " << omp_get_max_threads() << '\n'
                  << "Max. Anzahl Threads im aktuellen Team: " << omp_get_num_threads() <<'\n';
      }
      else
      {
        std::cout << "Multiprozessor: no (nur " << omp_get_num_procs() << "CPU-Kern)\n";
      }
    }
    #pragma omp barrier

    // All CPU threads say "Hello World"
    if(omp_get_num_procs() > 1)
    {
      #pragma omp master
      {
        std::cout << "\n==>sagen alle CPU´s (" << omp_get_num_threads() << " HOST-Threads, PARALLEL mit OpenMP):                               done\n";
      }
      #pragma omp barrier

      #pragma omp critical
      {
        if(omp_get_thread_num()==0)
        {
          std::cout << "   CPU[-Thread]: " << (omp_get_thread_num()+1) << " (Master-Thread)\n";
        }
        else
        {
          std::cout << "   CPU[-Thread]: " << (omp_get_thread_num()+1) << '\n';
        }
      }
    }
    #pragma omp barrier

    // Get information about the GPU(s)
    #pragma omp master
    {
      bool isFermi = false;  // state variable for the GPU architecture type (Fermi = true, not Fermi = false; default: false)

      if(NumGPUs!=0)
      {
        if(NumGPUs == 1)
        {
          std::cout << "\n==> sagt die CUDA-fähige GPU (Device):                                                    done\n";
        }
        else
        {
          std::cout << "\n==> sagen alle " << NumGPUs << " CUDA-fähigen GPU´s (GPC, Devices):                                       done\n";
        }

        cudaDeviceProp dprop;      // CUDA struct that receives the device properties of each GPU
        for(int p=0;p<NumGPUs;++p)
        {
          std::cout << "    GPU " << p << ":\n";
          cudaGetDeviceProperties(&dprop,p);

          // Fermi (GF100) reports compute capability 2.0 (32 SPs per SM)
          isFermi = ((dprop.major==2)&&(dprop.minor==0));

          std::cout << "    Name:                                                 NVIDIA® " << dprop.name << ((isFermi==true)?" (Fermi)\n":"\n")
                    << "    Typ:                                                  " << ((dprop.integrated==1)?"integrated (Onboard,MB)\n":"discrete (card)\n")
                    << "    Compute capability (revision numbers):                " << dprop.major << '.' << dprop.minor << " (major.minor)\n"
                    << "    Clock frequency:                                      " << dprop.clockRate << " kHz (~" << ((double)dprop.clockRate/1000.00) << " MHz / ~" << ((double)dprop.clockRate/1000000.00) << " GHz)\n"
                    << "    Total global memory (available):                      " << dprop.totalGlobalMem << " Bytes (~" << ((double)dprop.totalGlobalMem/pow(1024.00,3)) << " Gb)\n"
                    << "    Total constant memory (available):                    " << dprop.totalConstMem << " Bytes (~" << ((double)dprop.totalConstMem/1024.00) << " Kb)\n"
                    << "    Max memory pitch:                                     " << dprop.memPitch << " Bytes (~" << ((double)dprop.memPitch/pow(1024.00,3)) << " Gb)\n"
                    << "    Texture Alignment:                                    " << dprop.textureAlignment << " Bytes\n"
                    << "    Device copy overlap:                                  " << ((dprop.deviceOverlap==1)?"Enabled\n":"Disabled\n")
                    << "    Host memory-mapping (hostm. map to devicem.):         " << ((dprop.canMapHostMemory==1)?"Enabled\n":"Disabled\n")
                    << "    Kernel execution timeout:                             " << ((dprop.kernelExecTimeoutEnabled==1)?"Enabled\n":"Disabled\n")
                    << "    Compute Mode:                                         " << ((dprop.computeMode==0)?"Default mode\n":\
                                                                                        (dprop.computeMode==1)?"Compute-exclusive mode\n":\
                                                                                        (dprop.computeMode==2)?"Compute-prohibited mode\n":\
                                                                                        "unknown\n")
                    << "    Streaming Multiprocessors (SM) count:                 " << dprop.multiProcessorCount << '\n';
                    if(isFermi==true)
                    {
                      std::cout << "    Streamprocessors (SP, CUDA-Cores) count:              " << (dprop.multiProcessorCount*32) << '\n'
                                << "    SP´s per SM:                                          32\n"
                                << "    Threads pro SM:                                       1536\n"
                                << "    Warps per SM:                                         " << (1536/dprop.warpSize) << '\n';
                    }
                    else
                    {
                      std::cout << "    Streamprocessors (SP, CUDA-Cores) count:              unknown\n"
                                << "    SP´s per SM:                                          unknown\n"
                                << "    Threads pro SM:                                       unknown\n";
                    }
          std::cout << "    Shared mem per SM (available & shared by all blocks): " << dprop.sharedMemPerBlock << " Bytes (~" << ((double)dprop.sharedMemPerBlock/1024.00) << " Kb)\n"
                    << "    Registers per SM (available & shared by all blocks):  " << dprop.regsPerBlock << " (32-bit width)\n"
                    << "    Warp size:                                            " << dprop.warpSize << " Threads/Warp\n"
                    << "    Max threads per block:                                " << dprop.maxThreadsPerBlock << '\n'
                    << "    Max block dimensions (threads per blockdim):          " << dprop.maxThreadsDim[0] << 'x' << dprop.maxThreadsDim[1] << 'x' << dprop.maxThreadsDim[2] << " (x*y*z)\n"
                    << "    Max grid dimensions (blocks per griddim):             " << dprop.maxGridSize[0] << 'x' << dprop.maxGridSize[1] << 'x' << dprop.maxGridSize[2] << " (x*y*z)\n"
                    << '\n';
        }
      }
      else
      {
        std::cout << "\n==> No CUDA-capable GPU (device) found!\n";
      }
    }

    /* All CUDA-capable GPUs say "Hello World" via the CUDA-kernel call.
       The OpenMP loop uses as many CPU threads as there are GPUs (one CPU thread per GPU).
       CUDAPrintf requires a CUDAPrintf section with the following function calls in this order: cudaSetDevice(<GPU-ID>);
                                                                                                 cudaPrintfInit();
                                                                                                 <CUDA-kernel call>;
                                                                                                 cudaDeviceSynchronize();
                                                                                                 cudaPrintfDisplay(stdout,true);
                                                                                                 cudaPrintfEnd(); */
    #pragma omp for schedule(static)
    for(int GPU=0;GPU<NumGPUs;++GPU)
    {
      #pragma omp critical
      {
        cudaSetDevice(GPU);             // choose GPU
        cudaPrintfInit();               // begin of CUDAPrintf-section
        printf("GPU %i:\n",GPU);        // normal C-printf()
        GPUthread<<<1,1>>>();           // CUDA-kernel call with 1 Thread-Block and 1 GPU-Thread per block = 1 GPU-Thread total
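        // (hypothetical alternative launch: GPUthread<<<4,64>>>() would start 4 blocks with 64 GPU-Threads each = 256 GPU-Threads total)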
        cudaDeviceSynchronize();        // host waits until all GPU threads of the kernel have finished
        cudaPrintfDisplay(stdout,true); // flush the cuPrintf output of the CUDA-kernel call to stdout
        cudaPrintfEnd();                // end of CUDAPrintf-section
      }
    }
  }

  getchar();
  return 0;
}

// DEVICE: Implementation of CUDA-kernel "GPUthread()"
__global__ void GPUthread(void)
{
  // Compute the global GPU-thread ID by linearizing the thread index within the grid-and-block hierarchy
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
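  // For the <<<1,1>>> launch above, idx is always 0; a multi-dimensional launch would also have to fold in the y/z components of blockIdx/threadIdx.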
  // CUDAPrintf call
  cuPrintf("Thread %i: Hello, nice to meet you!\n",(idx+1));
  // block-local synchronization of the GPU threads
  __syncthreads();
}