/*
COPYRIGHT (2011-2012) by:
Kevin Marco Erler (author), http://www.kevinerler.de
AIU-FSU Jena (co-owner), http://www.astro.uni-jena.de
SBSZ Jena-Göschwitz (co-owner), http://www.sbsz-jena.de
BSZ-Hermsdorf (co-owner), http://www.bszh.de
Advanced Licensing (dual license: COPYRIGHT and following licenses):
License (international): CC-BY v3.0-unported or later - link: http://creativecommons.org/licenses/by/3.0/deed.en
License (Germany):       CC-BY v3.0-DE       or later - link: http://creativecommons.org/licenses/by/3.0/de/
------------------
Compilation requirements:
Packages (x86-64):
  GCC >v4.2, compat. libstdc++ and GOMP v3.0
Normal-Compile with g++-Compiler (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  g++ -std=c++0x -m64 -fopenmp -Wall -Wextra -pedantic -pedantic-errors -lgomp -lm -s <source.cpp> -o <dest>
Release-Compile with g++-Compiler (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  g++ -std=c++0x -m64 -fopenmp -Wall -Wextra -pedantic -pedantic-errors -lgomp -lm -O3 -s <source.cpp> -o <dest>
Debug-Compile with g++-Compiler (Red Hat GCC 4.4.5-6 x86-64 tested) + OpenMP v3.0 ([lib]GOMP v3.0 x86-64 tested)
  g++ -std=c++0x -m64 -fopenmp -Wall -Wextra -pedantic -pedantic-errors -lgomp -lm -g -ggdb3 <source.cpp> -o <dest>
*/

// Includes of C/C++-Librarys for INTs, REAL/FLOATs, STRINGS, Math-Calc and I/O
#include <climits>
#include <cstdint>
#include <cinttypes>
#include <cfloat>
#include <cwchar>
#include <string>  //std:string
#include <string.h>
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <sstream>
#include <iomanip>
#include <cmath>
#include <ctime>

// Conditional compilation (conditional include) of the OpenMP-Mainlib for OpenMP-Support
#ifdef _OPENMP
#include <omp.h>
#endif

using namespace std;

#define free(x) free(x); *x=NULL
#define PRId128 "s"
#define PRIi128 "s"
#define PRIu128 "s"

const uint64_t UINT64_MIN     = 0;
const __int128_t INT128_MIN   = (__int128_t)((-170141183460469231731.687303715884105728) * pow(10,18));
const __int128_t INT128_MAX   = (__int128_t)(( 170141183460469231731.687303715884105727) * pow(10,18));
const __uint128_t UINT128_MAX = (__uint128_t)((340282366920938463463.374607431768211455) * pow(10,18));
const __uint128_t UINT128_MIN = 0/* * pow(10,18)*/;

std::ostream &operator<<(std::ostream &out, __uint128_t x)
{
  if(x >= 10)
  {
    out << x / 10;
  }
  return out << static_cast<unsigned>(x % 10);
}

std::ostream &operator<<(std::ostream &out, __int128_t x)
{
  if(x < 0)
  {
    out << '-';
    x = -x;
  }
  return out << static_cast<__uint128_t>(x);
}

string INT128ToSTR(__int128_t x)
{
  std::stringstream sstr;
  sstr<<x;
  return sstr.str();
}
#define INT128ToCSTR(x) (INT128ToSTR(x)).c_str()

string UINT128ToSTR(__uint128_t x)
{
  std::stringstream sstr;
  sstr<<x;
  return sstr.str();
}
#define UINT128ToCSTR(x) (UINT128ToSTR(x)).c_str()

const uint64_t datarsize = 100000ULL;

int main(int argc, char *argv[])
{
  // Runtime manipulation of OpenMP-state variables
  //omp_set_num_threads(4);
  omp_set_dynamic(1);
  omp_set_nested(1);

  /* data declarations and implementations
     DB-arrays requires a FILE-Input-database (file based) of datarsize (N) elements (mtype uint64_t) */
  FILE *FileVar;
  double starttime = 0.00, sdelay1 = 0.00, sdelay2 = 0ULL, pdelay = 0.00;
  uint64_t db1[datarsize] = {0ULL},  //db1 for BubbleSort: Single-Execution
           db3[datarsize] = {0ULL},  //db3 for Odd-Even-Transposition-Sort (OetSort: parallelversion of BubbleSort): Single-Execution
           db2[datarsize] = {0ULL},  //db2 for Odd-Even-Transposition-Sort (OetSort: parallelversion of BubbleSort): Parallel-Execution
           tmp=0ULL;
  bool CorrectSorted = true;

  std::cout << "BubbleSort                                                 (64-Bit)\n"
            << "===================================================================\n"
            << "DB einlesen:";

  //--------------------------Begin: Initialization of data------------------------------------------

  if((FileVar = fopen("db1.bin","rb"))!=NULL)
  {
    fread(&db1,sizeof(db1),1,FileVar);
    fseek(FileVar,0,SEEK_SET);
    fread(&db2,sizeof(db2),1,FileVar);
    fseek(FileVar,0,SEEK_SET);
    fread(&db3,sizeof(db3),1,FileVar);
    fclose(FileVar);
  }
  else
  {
    std::cout << "Fehler beim Öffnen/Einlesen der DB-Datei!\n";
    return 0;
  }
  std::cout << "                                                   done\n";
  /*
  std::cout << "Vorsortierte DB erzeugen (für Testzwecke):";
  tmp = 0ULL;
  for(uint64_t q=0ULL;q<(datarsize-1ULL);++q)
  {
    for(uint64_t r=0ULL;r<(datarsize-q-1ULL);++r)
    {
      if(db1[r] > db1[r+1ULL])
      {
        tmp = db1[r];
        db1[r] = db1[r+1ULL];
        db1[r+1ULL] = tmp;
      }
    }
  }
  for(uint64_t v=0ULL;v<datarsize;++v)
  {
    db3[v] = db2[v] = db1[v];
  }
  std::cout << "                     done\n";
  */

  //--------------------------End: Initialization of data--------------------------------------------

  //--------------------------Begin: CPU-serial execution of algorithm (BubbleSort)------------------

  std::cout << "SERIELLE AUSFÜHRUNG (BubbleSort):";
  tmp = 0ULL;
  starttime = omp_get_wtime();
  //CPU-serial algorithm (BubbleSort):
  for(uint64_t i=0ULL;i<(datarsize-1ULL);++i)
  {
    for(uint64_t j=0ULL;j<(datarsize-i-1ULL);++j)
    {
      if(db1[j] > db1[j+1ULL])
      {
        tmp = db1[j];
        db1[j] = db1[j+1ULL];
        db1[j+1ULL] = tmp;
      }
    }
  }
  sdelay1 = omp_get_wtime()-starttime;
  std::cout << "                              done\n"; //serial (BubbleSort)

  //--------------------------End: CPU-serial execution of algorithm (BubbleSort)--------------------

  //--------------------------Begin: CPU-serial execution of algorithm (OetSort)---------------------

  std::cout << "SERIELLE AUSFÜHRUNG (OetSort):";
  tmp = 0ULL;
  starttime = omp_get_wtime();
  //CPU-serial algorithm (OetSort):
  for(uint64_t n3=1ULL;n3<datarsize;++n3)
  {
    if((n3%2ULL)==0ULL) //conditional execution of EVEN- or ODD-for-loop
    {
      for(uint64_t e3=0ULL;e3<(datarsize-1ULL);e3+=2ULL) //even-step
      {
        if(db3[e3] > db3[(e3+1ULL)])
        {
          tmp = db3[e3];
          db3[e3] = db3[(e3+1ULL)];
          db3[(e3+1ULL)] = tmp;
        }
      }
    }
    else
    {
      for(uint64_t o3=1ULL;o3<(datarsize-1ULL);o3+=2ULL) //odd-step
      {
        if(db3[o3] > db3[(o3+1ULL)])
        {
          tmp = db3[o3];
          db3[o3] = db3[(o3+1ULL)];
          db3[(o3+1ULL)] = tmp;
        }
      }
    }
  }
  sdelay2 = omp_get_wtime()-starttime;
  std::cout << "                                 done\n"; //serial (OetSort)

  //--------------------------End: CPU-serial execution of algorithm (OetSort)-----------------------

  //--------------------------Begin: CPU-parallel OpenMP-execution of algorithm (OetSort)------------
  //
  //For a better reliability and stability: min. 2 Threads, max. (MaxThreads-1)
  std::cout << "PARALLELE AUSFÜHRUNG (OetSort) mit " << ((omp_get_max_threads()>2)?(omp_get_max_threads()-1):(omp_get_max_threads()==2)?2:1) 
            << " Threads:";
  /* OpenMP-CPU-parallel algorithm
     a normal single-for-loop (C/C++) contains two OpenMP-loops (conditional execution) */
  tmp = 0ULL;
  starttime = omp_get_wtime();
  for(uint64_t n2=1ULL;n2<datarsize;++n2)
  {
    #pragma omp flush (db2)
    if((n2%2ULL)==0ULL) //conditional execution of EVEN- or ODD-OpenMP-for-loop
    {
      //even-step
      //create parallel region and OpenMP-for-loop:
      #pragma omp parallel for num_threads((omp_get_max_threads()>2)?(omp_get_max_threads()-1):(omp_get_max_threads()==2)?2:1) \
                               schedule(static) \
                               default(none) private(tmp) shared(db2)
      for(uint64_t e2=0ULL;e2<(datarsize-1ULL);e2+=2ULL)
      {
        if(db2[e2] > db2[(e2+1ULL)])
        {
          tmp = db2[e2];
          db2[e2] = db2[(e2+1ULL)];
          db2[(e2+1ULL)] = tmp;
        }
      }
    }
    else
    {
      //odd-step
      //create parallel region and OpenMP-for-loop:
      #pragma omp parallel for num_threads((omp_get_max_threads()>2)?(omp_get_max_threads()-1):(omp_get_max_threads()==2)?2:1) \
                               schedule(static) \
                               default(none) private(tmp) shared(db2)
      for(uint64_t o2=1ULL;o2<(datarsize-1ULL);o2+=2ULL)
      {
        if(db2[o2] > db2[(o2+1ULL)])
        {
          tmp = db2[o2];
          db2[o2] = db2[(o2+1ULL)];
          db2[(o2+1ULL)] = tmp;
        }
      }
    }
  }
  pdelay = omp_get_wtime()-starttime;
  if((omp_get_max_threads()-1) >= 10)
  {
    std::cout << "                 done\n";  //parallel
  }
  else
  {
    std::cout << "                  done\n"; //parallel
  }

  //--------------------------End: CPU-parallel OpenMP-execution of algorithm (OetSort)--------------

  //--------------------------Analysis of results----------------------------------------------------

  std::cout << "Überprüfe Ergebnisse:";
  for(uint64_t t=0ULL;t<datarsize;++t)
  {
    if(((db1[t]==db2[t]) &&
        (db1[t]==db3[t])) &&
        (db1[((t<(datarsize-1ULL))?(t+1ULL):(datarsize-1ULL))]==db2[((t<(datarsize-1ULL))?(t+1ULL):(datarsize-1ULL))]) &&
        (db1[((t<(datarsize-1ULL))?(t+1ULL):(datarsize-1ULL))]==db3[((t<(datarsize-1ULL))?(t+1ULL):(datarsize-1ULL))]))
    {
      if((db1[t]>db1[((t<(datarsize-1ULL))?(t+1ULL):(datarsize-1ULL))]) &&
         (db2[t]>db2[((t<(datarsize-1ULL))?(t+1ULL):(datarsize-1ULL))]) &&
         (db3[t]>db3[((t<(datarsize-1ULL))?(t+1ULL):(datarsize-1ULL))]))
      {
        CorrectSorted = false;
        break;
      }
    }
    else
    {
      CorrectSorted = false;
      break;
    }
  }
  std::cout << "                                          done\n";

  std::cout << "\nAuswertung:\n"
            << "*******************************************************************\n"
            << "DB-Größe:            " << sizeof(db1) << " Bytes\n"
            << "Anzahl Elemente:     " << datarsize << " zu je " << sizeof(uint64_t) << " Bytes\n"
            << "Seriell & parallel richtig sortiert?:                           " << ((CorrectSorted==true)?"yes\n":" no\n")
            << "Dauer - SERIELL (BubbleSort):     " << sdelay1 << " sec\n"
            << "Dauer - SERIELL (OetSort):        " << sdelay2 << " sec\n"
            << "Dauer - PARALLEL (OetSort):       " << pdelay  << " sec\n"
            << "__________________\n"
            << "==>Via BubbleSort seriell|parallel sortierte Daten:\n";
  for(uint64_t m1=0ULL;m1<50ULL;++m1)
  {
    std::cout << "S1-Element " << (m1+1ULL) << ": " << db1[m1] << " | S2-Element " << (m1+1ULL) << ": " << db3[m1] 
              << " | P-Element " << (m1+1ULL) << ": " << db2[m1] << '\n';
  }
  std::cout << "...\n";
  for(uint64_t m2=(datarsize-50ULL);m2<datarsize;++m2)
  {
    std::cout << "S1-Element " << (m2+1ULL) << ": " << db1[m2] << " | S2-Element " << (m2+1ULL) << ": " << db3[m2] 
              << " | P-Element " << (m2+1ULL) << ": " << db2[m2] << '\n';
  }
  std::cout << "__________________"
            << "\n64-Bit-Werte:\n"
            << "INT64_MIN:                                    "  << INT64_MIN << '\n'
            << "INT64_MAX:                                     " << INT64_MAX << '\n'
            << "UINT64_MIN:                                    " << UINT64_MIN << '\n'
            << "UINT64_MAX:                                    " << UINT64_MAX << '\n';
  /*
  // Detailed output
  std::cout << "Sortierte Daten:\n"
            << "==========================\n";
  for(uint64_t k=0ULL;k<datarsize;++k)
  {
    std::cout << "S1-Element " << (k+1ULL) << ": " << db1[k] << " | S2-Element " << (k+1ULL) << ": " << db3[k] 
              << " | P-Element " << (k+1ULL) << ": " << db2[k] << '\n';
  }
  */
  getchar();
  return 0;
}