#include #include #include #include #define MAXSIZE 10000000 /* Timing helpers. */ #ifdef _OPENMP static double t0; void reset_clock () { t0 = omp_get_wtime(); } double read_clock () { double t1, t; t1 = omp_get_wtime(); t = t1 - t0; return t; } #else static int t0; void reset_clock () { t0 = clock(); } double read_clock () { int t1; double t; t1 = clock(); t = (double) (t1 - t0) / CLOCKS_PER_SEC; return t; } #endif /* Initialization loop. */ void init_vectors (double **a, double **b, int vsize) { int i; *a = (double*) malloc(vsize * sizeof(double)); *b = (double*) malloc(vsize * sizeof(double)); #pragma omp parallel for for (i = 0; i < vsize; ++i) { (*a)[i] = (double) i / vsize; (*b)[i] = (double) (vsize - i) / vsize; } } /* Test loop: euclidean scalar product (2 FLOP in the kernel). */ double test_speuc (double *a, double *b, int vsize, int *flop) { int i; double sum; sum = 0.0; #pragma omp parallel for reduction(+:sum) for (i = 0; i < vsize; ++i) { sum += a[i] * b[i]; } *flop = 2 * vsize; return sum; } /* Test loop: a scalar product with many FLOPs in the kernel. */ #define NINNER 16 double test_spmflop (double *a, double *b, int vsize, int *flop) { int i; double sum; sum = 0.0; #pragma omp parallel for reduction(+:sum) for (i = 0; i < vsize; ++i) { double ab = a[i] * b[i]; double prod = 1.0; int j; for (j = 0; j < NINNER; ++j) { prod *= ab; sum += prod; } } *flop = (2 * NINNER + 1) * vsize; return sum; } int main (int argc, char **argv) { int i, k, sc; int nruns, vsize, sizep, flop, loopi; double sumsum1, sumsum2; double t, flops; double (*test_loop)(double *, double*, int, int *); double (*test_loops[])(double *, double*, int, int *) = { test_speuc, test_spmflop }; int scoeff[] = {1, 2, 5}; int ncoeff = sizeof(scoeff) / sizeof(int); /* Select test loop. */ loopi = 0; if (argc >= 2) { loopi = atoi(argv[1]); } /* Benchmark the loop for different vector sizes. */ printf("# Selected loop: %d\n", loopi); printf("# %15s %15s %15s\n", "VSIZE [*]", "SPEED [MFLOPS]", "SUMSUM [*]"); test_loop = test_loops[loopi]; sizep = 100; vsize = sizep; sc = 0; while (vsize <= MAXSIZE) { double *a, *b; /* Initialize storage. */ init_vectors(&a, &b, vsize); /* Computation and printing of "sumsum*" is needed in order to prevent compiler remove whole sections of code as inconsequential. Also good for checking that parallelization is fine. */ /* Compute reasonable number of benchmarking runs for averaging. */ nruns = 0; sumsum1 = 0.0; reset_clock(); do { sumsum1 += test_loop(a, b, vsize, &flop); ++nruns; } while (read_clock() < 1.0); /* at least 1 second runtime */ sumsum1 /= nruns; /* Benchmarking runs. */ reset_clock(); sumsum2 = 0.0; for (k = 0; k < nruns; ++k) { sumsum2 += test_loop(a, b, vsize, &flop); } t = read_clock(); sumsum2 /= nruns; /* Clean up. */ free(a); free(b); /* Report performance. */ flops = flop / (t / nruns); printf(" %15d %15.0f %15.1f\n", vsize, 1e-6 * flops, 0.5 * (sumsum1 + sumsum2)); /* Go to next size. */ ++sc; if (sc == ncoeff) { sc = 0; sizep *= 10; } vsize = sizep * scoeff[sc]; } return 0; }