//
// Copyright (c) Microsoft Corporation.  All rights reserved.
//
// pplhelpers.h -- some helpers for PPL library
//

#pragma once

#ifndef __unix__
#include <ppl.h>        // Concurrency::parallel_for
#include <windows.h>    // SYSTEM_INFO, GetSystemInfo
#endif
#include <cassert>
#include <algorithm>    // std::min

namespace msra { namespace parallel {

// ===========================================================================
// helpers related to multiprocessing and NUMA
// ===========================================================================

// determine number of CPU cores on this machine
static inline size_t determine_num_cores()
{
    SYSTEM_INFO sysInfo;
    GetSystemInfo (&sysInfo);
    return sysInfo.dwNumberOfProcessors;
}

extern size_t ppl_cores;    // number of cores to run on, as requested by the user

static inline void set_cores (size_t cores)
{
    ppl_cores = cores;
}

static inline size_t get_cores()    // if this returns 1 then no parallelization will be done
{
    return ppl_cores;
}

#if 0
// execute body() a bunch of times, hopefully once for each core
// This is not precise. Cores will be hit multiple times, and some cores may not be touched.
template <typename FUNCTION>
void for_all_numa_nodes_approximately (const FUNCTION & body)
{
    if (ppl_cores > 1)  // parallel computation (regular)
        parallel_for ((size_t) 0, ppl_cores * 2, (size_t) 1, [&](size_t) { body(); });
    else                // for comparison: single-threaded (this also documents what the above means)
        body();
}
#endif

// wrapper around Concurrency::parallel_for() to allow disabling parallelization altogether
template <typename FUNCTION>
void parallel_for (size_t begin, size_t end, size_t step, const FUNCTION & f)
{
    const size_t cores = ppl_cores;
    if (cores > 1)      // parallel computation (regular)
    {
        //fprintf (stderr, "foreach_index_block: computing %d blocks of %d frames on %d cores\n", nblocks, nfwd, determine_num_cores());
        Concurrency::parallel_for (begin, end, step, f);
    }
    else                // for comparison: single-threaded (this also documents what the above means)
    {
        //fprintf (stderr, "foreach_index_block: computing %d blocks of %d frames on a single thread\n", nblocks, nfwd);
        for (size_t j0 = begin; j0 < end; j0 += step)
            f (j0);
    }
}

// execute a function 'body (j0, j1)' for j = [0..n) in chunks of ~targetstep on 'cores' cores
// Very similar to parallel_for() except that the body function also takes the end index,
// and 'targetstep' gets rounded a little to better map to 'cores.'
// ... TODO: Currently, 'cores' does not limit the number of threads in parallel_for() (not so critical, fix later or never)
template <typename FUNCTION>
void foreach_index_block (size_t n, size_t targetstep, size_t targetalignment, const FUNCTION & body)
{
    const size_t cores = ppl_cores;
    const size_t maxnfwd = 2 * targetstep;
    size_t nblocks = (n + targetstep / 2) / targetstep;
    if (nblocks == 0)
        nblocks = 1;
    // round to a multiple of the number of cores
    if (nblocks < cores)    // fewer than # cores -> round up
        nblocks = (1 + (nblocks - 1) / cores) * cores;
    else                    // more: round down (reduce overhead)
        nblocks = nblocks / cores * cores;
    size_t nfwd = 1 + (n - 1) / nblocks;
    assert (nfwd * nblocks >= n);
    if (nfwd > maxnfwd)     // limit to allocated memory just in case
        nfwd = maxnfwd;
    // ... TODO: does the above actually do anything/significant? nfwd != targetstep?
    // enforce alignment
    nfwd = (1 + (nfwd - 1) / targetalignment) * targetalignment;
    // execute it!
    parallel_for (0, n, nfwd, [&](size_t j0)
    {
        size_t j1 = std::min<size_t> (j0 + nfwd, n);
        body (j0, j1);
    });
}

};};
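
// Illustrative usage sketch (an assumption, not part of the original header): it shows how a
// caller might drive foreach_index_block(). The function name 'example', the data buffer, and
// the chosen chunk size/alignment are hypothetical; a real program must also define the
// 'ppl_cores' variable declared extern above in some translation unit.
#if 0
#include <vector>

static void example (std::vector<float> & data)
{
    msra::parallel::set_cores (msra::parallel::determine_num_cores());  // use all cores; pass 1 to disable parallelization
    const size_t n = data.size();
    // process 'data' in chunks of roughly 1024 elements, chunk sizes aligned to multiples of 4;
    // the body receives the half-open index range [j0, j1)
    msra::parallel::foreach_index_block (n, 1024, 4, [&](size_t j0, size_t j1)
    {
        for (size_t j = j0; j < j1; j++)
            data[j] *= 2.0f;    // some per-element work
    });
}
#endif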