22 #if !defined( BOLT_AMP_TRANSFORM_REDUCE_RANGE_H )
23 #define BOLT_AMP_TRANSFORM_REDUCE_RANGE_H
27 #include <bolt/transform_reduce.h>
30 #ifdef BOLT_POOL_ALLOC
31 #include <bolt/pool_alloc.h>
// BARRIER(W) currently expands to nothing, so every use of it is a no-op.
40 #define BARRIER(W) // FIXME - placeholder for future barrier insertions
// REDUCE_STEP(_IDX, _W): when _IDX < _W, combines tiled_data[_IDX] with
// tiled_data[_IDX+_W] through reduce_op and writes the result to tiled_data[_IDX].
// Both tiled_data and reduce_op must be in scope where the macro is expanded.
// NOTE(review): the last line below ends with a line continuation, so the macro
// body extends onto the next physical line; confirm what follows in the full file.
42 #define REDUCE_STEP(_IDX, _W) \
43 if (_IDX < _W) tiled_data[_IDX] = reduce_op(tiled_data[_IDX], tiled_data[_IDX+_W]); \
// Applies transform_op to every index of a rectangular range and folds the
// transformed values together with reduce_op, launching the work on the
// accelerator view `av`.
//
// Template parameters:
//   outputT        - type of the accumulator, of `init`, and of the return value.
//   Rank           - rank of `origin` and `ext`. Only coordinates 0 and 1 are read
//                    and indices are built from two coordinates.
//                    NOTE(review): presumably only Rank == 2 is supported; confirm.
//   UnaryFunction  - type of transform_op; it is called with a concurrency::index<Rank>.
//   BinaryFunction - type of reduce_op; it is called with two accumulator values.
//
// Parameters:
//   av           - accelerator view the kernel is launched on.
//   origin       - first index of the range.
//   ext          - size of the range along each dimension.
//   transform_op - applied to each index visited.
//   init         - starting value of the reduction.
//   reduce_op    - combines two values into one.
//
// Returns the value accumulated in finalReduction by the host-side loop at the end.
54 template<
typename outputT,
int Rank,
typename UnaryFunction,
typename BinaryFunction>
55 outputT transform_reduce_range(concurrency::accelerator_view av,
56 concurrency::index<Rank> origin, concurrency::extent<Rank> ext,
57 UnaryFunction transform_op,
58 outputT init, BinaryFunction reduce_op)
60 using namespace concurrency;
// p_wgPerComputeUnit and p_computeUnits are not declared in this excerpt.
// resultCnt is the number of partial results requested for results1 below.
62 int wgPerComputeUnit = p_wgPerComputeUnit;
63 int computeUnits = p_computeUnits;
64 int resultCnt = computeUnits * wgPerComputeUnit;
// Launch shape: wgPerComputeUnit tiles of height localH by computeUnits tiles
// of width localW (localH / localW are not declared in this excerpt).
67 int globalH = wgPerComputeUnit * localH;
68 int globalW = computeUnits * localW;
// Never launch more threads per dimension than the range has elements.
// NOTE(review): after clamping, globalH / globalW need not be multiples of
// localH / localW, yet the extent is tiled by exactly those sizes below;
// confirm the range is always tile-aligned or that this is handled elsewhere.
70 globalH = (ext[0] < globalH) ? ext[0] : globalH;
71 globalW = (ext[1] < globalW) ? ext[1] : globalW;
74 extent<2> launchExt(globalH, globalW);
// results1 receives one partial result per tile (see the store near the end of the kernel).
// NOTE(review): two declarations of results1 are visible here; presumably the
// second belongs to an #else branch that is missing from this excerpt, as is
// the declaration of `entry`. Confirm against the full file.
75 #ifdef BOLT_POOL_ALLOC
77 array<outputT,1> &results1 = *(entry._dBuffer);
79 array<outputT,1> results1(resultCnt, av);
// One-past-the-end corner of the range; not referenced by any other line in this excerpt.
81 index<2> bottomRight(origin[0]+ext[0], origin[1]+ext[1]);
// Kernel: everything is captured by copy except results1, which is captured by
// reference. The lambda is `mutable`, so each thread updates its own copy of init.
86 concurrency::parallel_for_each(av, launchExt.tile<localH, localW>(), [=,&results1](concurrency::tiled_index<localH, localW> idx)
mutable restrict(amp)
// Per-tile scratch storage with waveSize elements (waveSize is not declared in this excerpt).
88 tile_static outputT tiled_data[waveSize];
// NOTE(review): the next statement is cut off in this excerpt; its continuation
// and any surrounding conditional compilation are not visible.
91 init = reduce_op(init, transform_op(index<Rank>(origin[0]+idx.global[0], origin[1]+idx.global[1]),
// Each thread starts at its own global position and advances by the launch
// extent (times VW along dimension 1) until it leaves the range, folding every
// transformed index into its private copy of init.
97 for (
int y=origin[0]+idx.global[0]; y<origin[0]+ext[0]; y+=launchExt[0]) {
98 for (
int x=origin[1]+idx.global[1]*VW; x<origin[1]+ext[1]; x+=launchExt[1]*VW) {
99 init = reduce_op(init, transform_op(concurrency::index<Rank>(y,x)));
// Flatten the thread's 2-D position inside the tile and publish its partial result.
106 int lx = localW * idx.local[0] + idx.local[1];
107 tiled_data[lx] = init;
// NOTE(review): the steps that combine tiled_data[] down to element 0 are not
// visible in this excerpt (presumably REDUCE_STEP is used); confirm.
// Store the tile's value in the slot idx.tile[0]*computeUnits + idx.tile[1].
// NOTE(review): when the launch was clamped, fewer than resultCnt tiles run, so
// some slots may never be written by the code visible here; confirm they are
// initialised before the host loop reads them.
120 results1[idx.tile[0]*computeUnits + idx.tile[1]] = tiled_data[0];
// Host-side final pass: start from init and fold in results1.extent[0] partial results.
// NOTE(review): init was also folded into every thread's accumulator inside the
// kernel, so a non-identity init would be combined more than once; confirm intent.
// NOTE(review): the values are read through entry._stagingBuffer; neither the
// copy that fills it nor any preprocessor guard around this use of `entry` is
// visible in this excerpt.
135 outputT finalReduction = init;
136 for (
int i=0; i<results1.extent[0]; i++) {
137 finalReduction = reduce_op(finalReduction, (*entry._stagingBuffer)[i]);
// With pooled allocation, hand the entry back to arrayPool once the result is computed.
140 #ifdef BOLT_POOL_ALLOC
141 arrayPool.free(entry);
144 return finalReduction;
// Convenience overload without an accelerator_view argument: forwards all
// arguments unchanged to the overload above, passing the default view of a
// default-constructed concurrency::accelerator, and returns that call's result.
// Template and function parameters have the same meaning as in that overload.
148 template<
typename outputT,
int Rank,
typename UnaryFunction,
typename BinaryFunction>
149 outputT transform_reduce_range(
150 concurrency::index<Rank> origin, concurrency::extent<Rank> ext,
151 UnaryFunction transform_op,
152 outputT init, BinaryFunction reduce_op)
154 return transform_reduce_range(concurrency::accelerator().default_view, origin, ext, transform_op, init, reduce_op);