#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>

#define MAXSIZE 10000000


/* Timing helpers. */
#ifdef _OPENMP
static double t0;
void reset_clock ()
{
    t0 = omp_get_wtime();
}
double read_clock ()
{
    double t1, t;
    t1 = omp_get_wtime();
    t = t1 - t0;
    return t;
}
#else
static int t0;
void reset_clock ()
{
    t0 = clock();
}
double read_clock ()
{
    int t1;
    double t;
    t1 = clock();
    t = (double) (t1 - t0) / CLOCKS_PER_SEC;
    return t;
}
#endif

/* Initialization loop. */
void init_vectors (double **a, double **b, int vsize)
{
    int i;

    *a = (double*) malloc(vsize * sizeof(double));
    *b = (double*) malloc(vsize * sizeof(double));

    #pragma omp parallel for
    for (i = 0; i < vsize; ++i) {
        (*a)[i] = (double) i / vsize;
        (*b)[i] = (double) (vsize - i) / vsize;
    }
}

/* Test loop: euclidean scalar product (2 FLOP in the kernel). */
double test_speuc (double *a, double *b, int vsize, int *flop)
{
    int i;
    double sum;

    sum = 0.0;
    #pragma omp parallel for reduction(+:sum)
    for (i = 0; i < vsize; ++i) {
        sum += a[i] * b[i];
    }

    *flop = 2 * vsize;
    return sum;
}

/* Test loop: a scalar product with many FLOPs in the kernel. */
#define NINNER 16
double test_spmflop (double *a, double *b, int vsize, int *flop)
{
    int i;
    double sum;

    sum = 0.0;
    #pragma omp parallel for reduction(+:sum)
    for (i = 0; i < vsize; ++i) {
        double ab = a[i] * b[i];
        double prod = 1.0;
        int j;
        for (j = 0; j < NINNER; ++j) {
            prod *= ab;
            sum += prod;
        }
    }

    *flop = (2 * NINNER + 1) * vsize;
    return sum;
}


int main (int argc, char **argv)
{
    int i, k, sc;
    int nruns, vsize, sizep, flop, loopi;
    double sumsum1, sumsum2;
    double t, flops;
    double (*test_loop)(double *, double*, int, int *);
    double (*test_loops[])(double *, double*, int, int *) = {
        test_speuc, test_spmflop
    };
    int scoeff[] = {1, 2, 5};
    int ncoeff = sizeof(scoeff) / sizeof(int);

    /* Select test loop. */
    loopi = 0;
    if (argc >= 2) {
        loopi = atoi(argv[1]);
    }

    /* Benchmark the loop for different vector sizes. */
    printf("# Selected loop: %d\n", loopi);
    printf("# %15s %15s %15s\n",
           "VSIZE [*]", "SPEED [MFLOPS]", "SUMSUM [*]");
    test_loop = test_loops[loopi];
    sizep = 100;
    vsize = sizep;
    sc = 0;
    while (vsize <= MAXSIZE) {
        double *a, *b;

        /* Initialize storage. */
        init_vectors(&a, &b, vsize);

        /* Computation and printing of "sumsum*" is needed in order to
           prevent compiler remove whole sections of code as inconsequential.
           Also good for checking that parallelization is fine. */

        /* Compute reasonable number of benchmarking runs for averaging. */
        nruns = 0;
        sumsum1 = 0.0;
        reset_clock();
        do {
            sumsum1 += test_loop(a, b, vsize, &flop);
            ++nruns;
        }
        while (read_clock() < 1.0); /* at least 1 second runtime */
        sumsum1 /= nruns;

        /* Benchmarking runs. */
        reset_clock();
        sumsum2 = 0.0;
        for (k = 0; k < nruns; ++k) {
            sumsum2 += test_loop(a, b, vsize, &flop);
        }
        t = read_clock();
        sumsum2 /= nruns;

        /* Clean up. */
        free(a);
        free(b);

        /* Report performance. */
        flops = flop / (t / nruns);
        printf("  %15d %15.0f %15.1f\n",
               vsize, 1e-6 * flops, 0.5 * (sumsum1 + sumsum2));

        /* Go to next size. */
        ++sc;
        if (sc == ncoeff) {
            sc = 0;
            sizep *= 10;
        }
        vsize = sizep * scoeff[sc];
    }

    return 0;
}