Bolt  1.3
C++ template library with support for OpenCL
control.h
Go to the documentation of this file.
1 /***************************************************************************
2 * © 2012,2014 Advanced Micro Devices, Inc. All rights reserved.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 
16 ***************************************************************************/
17 
18 
23 #pragma once
24 #if !defined( BOLT_CL_CONTROL_H )
25 #define BOLT_CL_CONTROL_H
26 
27 
28 #include <bolt/cl/bolt.h>
29 #include <string>
30 #include <map>
31 
32 #include <boost/thread/mutex.hpp>
33 #include <boost/thread/locks.hpp>
34 #include <boost/shared_ptr.hpp>
35 
40 namespace bolt {
41  namespace cl {
42 
102  class control {
103  public:
104  enum e_UseHostMode {NoUseHost, UseHost};
105  enum e_RunMode {Automatic,
106  SerialCpu,
107  MultiCoreCpu,
108  OpenCL };
109 
110  enum e_AutoTuneMode{NoAutoTune=0x0,
111  AutoTuneDevice=0x1,
112  AutoTuneWorkShape=0x2,
113  AutoTuneAll=0x3}; // FIXME, experimental
114  struct debug {
115  static const unsigned None=0;
116  static const unsigned Compile = 0x1;
117  static const unsigned ShowCode = 0x2;
118  static const unsigned SaveCompilerTemps = 0x4;
119  static const unsigned DebugKernelRun = 0x8;
120  static const unsigned AutoTune = 0x10;
121  };
122 
123  enum e_WaitMode {BalancedWait, // Balance of Busy and Nice: tries to use Busy for short-running kernels. \todo: Balanced currently maps to nice.
124  NiceWait, // Use an OS semaphore to detect completion status.
125  BusyWait, // Busy a CPU core continuously monitoring results. Lowest-latency, but requires a dedicated core.
126  ClFinish, // Call clFinish on the queue.
127  };
128 
129  public:
130 
131  // Construct a new control structure, copying from default control for arguments that are not overridden.
132  control(
133  const ::cl::CommandQueue& commandQueue = getDefault().getCommandQueue(),
134  e_UseHostMode useHost=getDefault().getUseHost(),
135  unsigned debug=getDefault().getDebugMode()
136  ) :
137  m_commandQueue(commandQueue),
138  m_useHost(useHost),
139  m_forceRunMode(OpenCL), //Replaced this with automatic because the default is not MultiCoreCPU if no GPU is found
140  m_defaultRunMode(OpenCL),
141  m_debug(debug),
142  m_autoTune(getDefault().m_autoTune),
143  m_wgPerComputeUnit(getDefault().m_wgPerComputeUnit),
144  m_compileOptions(getDefault().m_compileOptions),
145  m_compileForAllDevices(getDefault().m_compileForAllDevices),
146  m_waitMode(getDefault().m_waitMode),
147  m_unroll(getDefault().m_unroll)
148  {};
149 
150 
151  control( const control& ref) :
152  m_commandQueue(ref.m_commandQueue),
153  m_useHost(ref.m_useHost),
154  m_forceRunMode(ref.m_forceRunMode),
155  m_defaultRunMode(ref.m_defaultRunMode),
156  m_debug(ref.m_debug),
157  m_autoTune(ref.m_autoTune),
158  m_wgPerComputeUnit(ref.m_wgPerComputeUnit),
159  m_compileOptions(ref.m_compileOptions),
160  m_compileForAllDevices(ref.m_compileForAllDevices),
161  m_waitMode(ref.m_waitMode),
162  m_unroll(ref.m_unroll)
163  {
164  //printf("control::copy construcor\n");
165  };
166 
167  //setters:
172  void setCommandQueue(::cl::CommandQueue commandQueue) { m_commandQueue = commandQueue; };
173 
177  void setUseHost(e_UseHostMode useHost) { m_useHost = useHost; };
178 
179 
186  void setForceRunMode(e_RunMode forceRunMode) { m_forceRunMode = forceRunMode; };
187 
198  void setDebugMode(unsigned debug) { m_debug = debug; };
199 
204  void setWGPerComputeUnit(int wgPerComputeUnit) { m_wgPerComputeUnit = wgPerComputeUnit; };
205 
207  void setWaitMode(e_WaitMode waitMode) { m_waitMode = waitMode; };
208 
210  void setUnroll(int unroll) { m_unroll = unroll; };
211 
214  void setCompileOptions(std::string &compileOptions) { m_compileOptions = compileOptions; };
215 
216  // getters:
217  ::cl::CommandQueue& getCommandQueue( ) { return m_commandQueue; };
218  const ::cl::CommandQueue& getCommandQueue( ) const { return m_commandQueue; };
219  ::cl::Context getContext() const { return m_commandQueue.getInfo<CL_QUEUE_CONTEXT>();};
220  ::cl::Device getDevice() const { return m_commandQueue.getInfo<CL_QUEUE_DEVICE>();};
221  e_UseHostMode getUseHost() const { return m_useHost; };
222  e_RunMode getForceRunMode() const { return m_forceRunMode; };
223  e_RunMode getDefaultPathToRun() const { return m_defaultRunMode; };
224  unsigned getDebugMode() const { return m_debug;};
225  int const getWGPerComputeUnit() const { return m_wgPerComputeUnit; };
226  const ::std::string getCompileOptions() const { return m_compileOptions; };
227  e_WaitMode getWaitMode() const { return m_waitMode; };
228  int getUnroll() const { return m_unroll; };
229  bool getCompileForAllDevices() const { return m_compileForAllDevices; };
230 
246  static control &getDefault()
247  {
248  // Default control structure; this can be accessed by the bolt::cl::control::getDefault()
249  static control _defaultControl( true );
250  return _defaultControl;
251  };
252 
253  static void printPlatforms( bool printDevices = true, cl_device_type deviceType = CL_DEVICE_TYPE_ALL );
254  static void printPlatformsRange( std::vector< ::cl::Platform >::iterator begin, std::vector< ::cl::Platform >::iterator end,
255  bool printDevices = true, cl_device_type deviceType = CL_DEVICE_TYPE_ALL );
256 
263  static ::cl::CommandQueue getDefaultCommandQueue( );
264 
267  typedef boost::shared_ptr< ::cl::Buffer > buffPointer;
268 
270  size_t totalBufferSize( );
272  buffPointer acquireBuffer( size_t reqSize, cl_mem_flags flags = CL_MEM_READ_WRITE, const void* host_ptr = NULL );
274  void freeBuffers( );
275 
276  private:
277 
278  // This is the private constructor is only used to create the initial default control structure.
279  control(bool createGlobal) :
280  m_commandQueue( getDefaultCommandQueue( ) ),
281  m_useHost(UseHost),
282  m_debug(debug::None),
283  m_autoTune(AutoTuneAll),
284  m_wgPerComputeUnit(8),
285  m_compileForAllDevices(true),
286  m_waitMode(BusyWait),
287  m_unroll(1)
288  {
289  ::cl_device_type dType = CL_DEVICE_TYPE_CPU;
290  if(m_commandQueue() != NULL)
291  {
292  ::cl::Device device = m_commandQueue.getInfo<CL_QUEUE_DEVICE>();
293  dType = device.getInfo<CL_DEVICE_TYPE>();
294  }
295  if(dType == CL_DEVICE_TYPE_CPU || m_commandQueue() == NULL)
296  {
297  //m_commandQueue will be NULL if no platforms are found and
298  //if a non AMD paltform is found but cound not enumerate any CPU device
299 #ifdef ENABLE_TBB
300  m_forceRunMode = MultiCoreCpu;
301  m_defaultRunMode = MultiCoreCpu;
302 #else
303  m_forceRunMode = SerialCpu;
304  m_defaultRunMode = SerialCpu;
305 #endif
306  }
307  else
308  {
309  //If dType = CL_DEVICE_TYPE_GPU
310  m_forceRunMode = OpenCL;
311  m_defaultRunMode = OpenCL;
312  }
313  };
314 
315  ::cl::CommandQueue m_commandQueue;
316  e_UseHostMode m_useHost;
317  e_RunMode m_forceRunMode;
318  e_RunMode m_defaultRunMode;
319  e_AutoTuneMode m_autoTune; /* auto-tune the choice of device CPU/GPU and workgroup shape */
320  unsigned m_debug;
321  int m_wgPerComputeUnit;
322  ::std::string m_compileOptions; // extra options to pass to OpenCL compiler.
323  bool m_compileForAllDevices; // compile for all devices in the context. False means to only compile for specified device.
324  e_WaitMode m_waitMode;
325  int m_unroll;
326 
327  struct descBufferKey
328  {
329  ::cl::Context buffContext;
330  cl_mem_flags memFlags;
331  const void* host_ptr;
332  };
333 
334  struct descBufferValue
335  {
336  size_t buffSize;
337  bool inUse;
338  ::cl::Buffer buffBuff;
339  };
340 
341  struct descBufferComp
342  {
343  bool operator( )( const descBufferKey& lhs, const descBufferKey& rhs ) const
344  {
345  if( lhs.memFlags < rhs.memFlags )
346  {
347  return true;
348  }
349  else if( lhs.memFlags == rhs.memFlags )
350  {
351  if( lhs.buffContext( ) < rhs.buffContext( ) )
352  {
353  return true;
354  }
355  else if( lhs.buffContext( ) == rhs.buffContext( ) )
356  {
357  if( lhs.host_ptr < rhs.host_ptr )
358  {
359  return true;
360  }
361  else
362  {
363  return false;
364  }
365  }
366  else
367  {
368  return false;
369  }
370  }
371  else
372  {
373  return false;
374  }
375  }
376  };
377 
378  typedef std::multimap< descBufferKey, descBufferValue, descBufferComp > mapBufferType;
379 
388  class UnlockBuffer
389  {
390  mapBufferType::iterator m_iter;
391  control& m_control;
392 
393  public:
394  // Basic constructor requires a reference to the container and a positional element
395  UnlockBuffer( control& p_control, mapBufferType::iterator it ): m_iter( it ), m_control( p_control )
396  {}
397 
398  void operator( )( const void* pBuff )
399  {
400  // TODO: I think a general mutex is overkill here; we should try to use an interlocked instruction to modify the
401  // inUse flag
402  boost::lock_guard< boost::mutex > lock( m_control.mapGuard );
403  m_iter->second.inUse = false;
404  }
405  };
406 
407  friend class UnlockBuffer;
408  mapBufferType mapBuffer;
409  boost::mutex mapGuard;
410 
411  }; // end class control
412 
413  };
414 };
415 
416 
417 // Implementor note:
418 // When adding a new field to this structure, don't forget to:
419 // * Add the new field, e.g. "int m_foo".
420 // * Add a setter and a getter function, e.g. "void setFoo(int foo)" and "int getFoo() const { return m_foo; }".
421 // * Add the field to the private constructor. This is used to set the global default "_defaultControl".
422 // * Add the field to the public constructor (copying from the _defaultControl) and to the copy constructor.
423 
424 // Sample usage:
425 // bolt::cl::control c(myCmdQueue);
426 // c.setDebugMode(bolt::cl::control::debug::Compile);
427 // bolt::cl::reduce(c, a.begin(), a.end(), std::plus<int>);
428 //
429 //
430 // reduce (bolt::control(myCmdQueue),
431 
432 #endif