/*-----------------------------------------------------------------------*/ /* Program: STREAM */ /* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */ /* Original code developed by John D. McCalpin */ /* Programmers: John D. McCalpin */ /* Joe R. Zagar */ /* */ /* This program measures memory transfer rates in MB/s for simple */ /* computational kernels coded in C. */ /*-----------------------------------------------------------------------*/ /* Copyright 1991-2013: John D. McCalpin */ /*-----------------------------------------------------------------------*/ /* License: */ /* 1. You are free to use this program and/or to redistribute */ /* this program. */ /* 2. You are free to modify this program for your own use, */ /* including commercial use, subject to the publication */ /* restrictions in item 3. */ /* 3. You are free to publish results obtained from running this */ /* program, or from works that you derive from this program, */ /* with the following limitations: */ /* 3a. In order to be referred to as "STREAM benchmark results", */ /* published results must be in conformance to the STREAM */ /* Run Rules, (briefly reviewed below) published at */ /* http://www.cs.virginia.edu/stream/ref.html */ /* and incorporated herein by reference. */ /* As the copyright holder, John McCalpin retains the */ /* right to determine conformity with the Run Rules. */ /* 3b. Results based on modified source code or on runs not in */ /* accordance with the STREAM Run Rules must be clearly */ /* labelled whenever they are published. Examples of */ /* proper labelling include: */ /* "tuned STREAM benchmark results" */ /* "based on a variant of the STREAM benchmark code" */ /* Other comparable, clear, and reasonable labelling is */ /* acceptable. */ /* 3c. Submission of results to the STREAM benchmark web site */ /* is encouraged, but not required. */ /* 4. Use of this program or creation of derived works based on this */ /* program constitutes acceptance of these licensing restrictions. */ /* 5. Absolutely no warranty is expressed or implied. */ /*-----------------------------------------------------------------------*/ # include # include # include # include # include # include /*----------------------------------------------------------------------- * INSTRUCTIONS: * * 1) STREAM requires different amounts of memory to run on different * systems, depending on both the system cache size(s) and the * granularity of the system timer. * You should adjust the value of 'STREAM_ARRAY_SIZE' (below) * to meet *both* of the following criteria: * (a) Each array must be at least 4 times the size of the * available cache memory. I don't worry about the difference * between 10^6 and 2^20, so in practice the minimum array size * is about 3.8 times the cache size. * Example 1: One Xeon E3 with 8 MB L3 cache * STREAM_ARRAY_SIZE should be >= 4 million, giving * an array size of 30.5 MB and a total memory requirement * of 91.5 MB. * Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP) * STREAM_ARRAY_SIZE should be >= 20 million, giving * an array size of 153 MB and a total memory requirement * of 458 MB. * (b) The size should be large enough so that the 'timing calibration' * output by the program is at least 20 clock-ticks. * Example: most versions of Windows have a 10 millisecond timer * granularity. 20 "ticks" at 10 ms/tic is 200 milliseconds. * If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec. * This means the each array must be at least 1 GB, or 128M elements. * * Version 5.10 increases the default array size from 2 million * elements to 10 million elements in response to the increasing * size of L3 caches. The new default size is large enough for caches * up to 20 MB. * Version 5.10 changes the loop index variables from "register int" * to "ssize_t", which allows array indices >2^32 (4 billion) * on properly configured 64-bit systems. Additional compiler options * (such as "-mcmodel=medium") may be required for large memory runs. * * Array size can be set at compile time without modifying the source * code for the (many) compilers that support preprocessor definitions * on the compile line. E.g., * gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M * will override the default size of 10M with a new size of 100M elements * per array. */ #ifndef STREAM_ARRAY_SIZE # define STREAM_ARRAY_SIZE 10000000 #endif /* 2) STREAM runs each kernel "NTIMES" times and reports the *best* result * for any iteration after the first, therefore the minimum value * for NTIMES is 2. * There are no rules on maximum allowable values for NTIMES, but * values larger than the default are unlikely to noticeably * increase the reported performance. * NTIMES can also be set on the compile line without changing the source * code using, for example, "-DNTIMES=7". */ #ifdef NTIMES #if NTIMES<=1 # define NTIMES 10 #endif #endif #ifndef NTIMES # define NTIMES 10 #endif /* Users are allowed to modify the "OFFSET" variable, which *may* change the * relative alignment of the arrays (though compilers may change the * effective offset by making the arrays non-contiguous on some systems). * Use of non-zero values for OFFSET can be especially helpful if the * STREAM_ARRAY_SIZE is set to a value close to a large power of 2. * OFFSET can also be set on the compile line without changing the source * code using, for example, "-DOFFSET=56". */ #ifndef OFFSET # define OFFSET 0 #endif /* * 3) Compile the code with optimization. Many compilers generate * unreasonably bad code before the optimizer tightens things up. * If the results are unreasonably good, on the other hand, the * optimizer might be too smart for me! * * For a simple single-core version, try compiling with: * cc -O stream.c -o stream * This is known to work on many, many systems.... * * To use multiple cores, you need to tell the compiler to obey the OpenMP * directives in the code. This varies by compiler, but a common example is * gcc -O -fopenmp stream.c -o stream_omp * The environment variable OMP_NUM_THREADS allows runtime control of the * number of threads/cores used when the resulting "stream_omp" program * is executed. * * To run with single-precision variables and arithmetic, simply add * -DSTREAM_TYPE=float * to the compile line. * Note that this changes the minimum array sizes required --- see (1) above. * * The preprocessor directive "TUNED" does not do much -- it simply causes the * code to call separate functions to execute each kernel. Trivial versions * of these functions are provided, but they are *not* tuned -- they just * provide predefined interfaces to be replaced with tuned code. * * * 4) Optional: Mail the results to mccalpin@cs.virginia.edu * Be sure to include info that will help me understand: * a) the computer hardware configuration (e.g., processor model, memory type) * b) the compiler name/version and compilation flags * c) any run-time information (such as OMP_NUM_THREADS) * d) all of the output from the test case. * * Thanks! * *-----------------------------------------------------------------------*/ # define HLINE "-------------------------------------------------------------\n" # ifndef MIN # define MIN(x,y) ((x)<(y)?(x):(y)) # endif # ifndef MAX # define MAX(x,y) ((x)>(y)?(x):(y)) # endif #ifndef STREAM_TYPE #define STREAM_TYPE double #endif static STREAM_TYPE a[STREAM_ARRAY_SIZE+OFFSET], b[STREAM_ARRAY_SIZE+OFFSET], c[STREAM_ARRAY_SIZE+OFFSET]; static double avgtime[4] = {0}, maxtime[4] = {0}, mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; static char *label[4] = {"Copy: ", "Scale: ", "Add: ", "Triad: "}; static double bytes[4] = { 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE }; extern double mysecond(); extern void checkSTREAMresults(); #ifdef TUNED extern void tuned_STREAM_Copy(); extern void tuned_STREAM_Scale(STREAM_TYPE scalar); extern void tuned_STREAM_Add(); extern void tuned_STREAM_Triad(STREAM_TYPE scalar); #endif #ifdef _OPENMP extern int omp_get_num_threads(); #endif int main() { int quantum, checktick(); int BytesPerWord; int k; ssize_t j; STREAM_TYPE scalar; double t, times[4][NTIMES]; /* --- SETUP --- determine precision and check timing --- */ printf(HLINE); printf("STREAM version $Revision: 5.10 $\n"); printf(HLINE); BytesPerWord = sizeof(STREAM_TYPE); printf("This system uses %d bytes per array element.\n", BytesPerWord); printf(HLINE); #ifdef N printf("***** WARNING: ******\n"); printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); printf("***** WARNING: ******\n"); #endif printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); printf("Memory per array = %.1f MiB (= %.1f GiB).\n", BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); printf("Total memory required = %.1f MiB (= %.1f GiB).\n", (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.), (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); printf("Each kernel will be executed %d times.\n", NTIMES); printf(" The *best* time for each kernel (excluding the first iteration)\n"); printf(" will be used to compute the reported bandwidth.\n"); #ifdef _OPENMP printf(HLINE); #pragma omp parallel { #pragma omp master { k = omp_get_num_threads(); printf ("Number of Threads requested = %i\n",k); } } #endif #ifdef _OPENMP k = 0; #pragma omp parallel #pragma omp atomic k++; printf ("Number of Threads counted = %i\n",k); #endif /* Get initial value for system clock. */ #pragma omp parallel for for (j=0; j= 1) printf("Your clock granularity/precision appears to be " "%d microseconds.\n", quantum); else { printf("Your clock granularity appears to be " "less than one microsecond.\n"); quantum = 1; } t = mysecond(); #pragma omp parallel for for (j = 0; j < STREAM_ARRAY_SIZE; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (mysecond() - t); printf("Each test below will take on the order" " of %d microseconds.\n", (int) t ); printf(" (= %d clock ticks)\n", (int) (t/quantum) ); printf("Increase the size of the arrays if this shows that\n"); printf("you are not getting at least 20 clock ticks per test.\n"); printf(HLINE); printf("WARNING -- The above is only a rough guideline.\n"); printf("For best results, please be sure you know the\n"); printf("precision of your system timer.\n"); printf(HLINE); /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k=0; k double mysecond() { struct timeval tp; struct timezone tzp; int i; i = gettimeofday(&tp,&tzp); return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); } #ifndef abs #define abs(a) ((a) >= 0 ? (a) : -(a)) #endif void checkSTREAMresults () { STREAM_TYPE aj,bj,cj,scalar; STREAM_TYPE aSumErr,bSumErr,cSumErr; STREAM_TYPE aAvgErr,bAvgErr,cAvgErr; double epsilon; ssize_t j; int k,ierr,err; /* reproduce initialization */ aj = 1.0; bj = 2.0; cj = 0.0; /* a[] is modified during timing check */ aj = 2.0E0 * aj; /* now execute timing loop */ scalar = 3.0; for (k=0; k epsilon) { err++; printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon); printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj); ierr = 0; for (j=0; j epsilon) { ierr++; #ifdef VERBOSE if (ierr < 10) { printf(" array a: index: %ld, expected: %e, observed: %e, relative error: %e\n", j,aj,a[j],abs((aj-a[j])/aAvgErr)); } #endif } } printf(" For array a[], %d errors were found.\n",ierr); } if (abs(bAvgErr/bj) > epsilon) { err++; printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon); printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj); printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); ierr = 0; for (j=0; j epsilon) { ierr++; #ifdef VERBOSE if (ierr < 10) { printf(" array b: index: %ld, expected: %e, observed: %e, relative error: %e\n", j,bj,b[j],abs((bj-b[j])/bAvgErr)); } #endif } } printf(" For array b[], %d errors were found.\n",ierr); } if (abs(cAvgErr/cj) > epsilon) { err++; printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon); printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj); printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); ierr = 0; for (j=0; j epsilon) { ierr++; #ifdef VERBOSE if (ierr < 10) { printf(" array c: index: %ld, expected: %e, observed: %e, relative error: %e\n", j,cj,c[j],abs((cj-c[j])/cAvgErr)); } #endif } } printf(" For array c[], %d errors were found.\n",ierr); } if (err == 0) { printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon); } #ifdef VERBOSE printf ("Results Validation Verbose Results: \n"); printf (" Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj); printf (" Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]); printf (" Rel Errors on a, b, c: %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj)); #endif } #ifdef TUNED /* stubs for "tuned" versions of the kernels */ void tuned_STREAM_Copy() { ssize_t j; #pragma omp parallel for for (j=0; j