Bolt  1.3
C++ template library with support for OpenCL
device_vector.h
/***************************************************************************
* © 2012,2014 Advanced Micro Devices, Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.

***************************************************************************/


#pragma once
#if !defined( BOLT_CL_DEVICE_VECTOR_H )
#define BOLT_CL_DEVICE_VECTOR_H

#include <iterator>
#include <type_traits>
#include <numeric>
#include <cstring>   // ::memmove, used by the erase() and insert() members below
#include "bolt/cl/bolt.h"
#include <iostream>
#include <boost/iterator/iterator_facade.hpp>
#include <boost/iterator/reverse_iterator.hpp>
#include <boost/shared_array.hpp>

namespace bolt
{
namespace cl
{
    struct device_vector_tag
        : public std::random_access_iterator_tag
    {   // identifying tag for random-access iterators
    };

    template< typename T >
    class device_vector
    {
        template< typename Container >
        class UnMapBufferFunctor
        {
            Container& m_Container;

        public:
            // Basic constructor requires a reference to the container
            UnMapBufferFunctor( Container& rhs ): m_Container( rhs )
            {}

            void operator( )( const void* pBuff )
            {
                ::cl::Event unmapEvent;

                V_OPENCL( m_Container.m_commQueue.enqueueUnmapMemObject( m_Container.m_devMemory, const_cast< void* >( pBuff ), NULL, &unmapEvent ),
                    "shared_ptr failed to unmap host memory back to device memory" );
                V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );
            }
        };
        typedef T* naked_pointer;
        typedef const T* const_naked_pointer;

    public:

        // Useful typedefs specific to this container
        typedef T value_type;
        typedef ptrdiff_t difference_type;
        typedef difference_type distance_type;
        typedef int size_type;

        typedef boost::shared_array< value_type > pointer;
        typedef boost::shared_array< const value_type > const_pointer;

        template< typename Container >
        class reference_base
        {
        public:
            reference_base( Container& rhs, size_type index ): m_Container( rhs ), m_Index( index )
            {}

            // Automatic type conversion operator to turn the reference object into a value_type
            operator value_type( ) const
            {
                cl_int l_Error = CL_SUCCESS;
                naked_pointer result = reinterpret_cast< naked_pointer >( m_Container.m_commQueue.enqueueMapBuffer(
                    m_Container.m_devMemory, true, CL_MAP_READ, m_Index * sizeof( value_type ), sizeof( value_type ), NULL, NULL, &l_Error ) );
                V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for operator[]" );

                value_type valTmp = *result;

                ::cl::Event unmapEvent;
                V_OPENCL( m_Container.m_commQueue.enqueueUnmapMemObject( m_Container.m_devMemory, result, NULL, &unmapEvent ), "device_vector failed to unmap host memory back to device memory" );
                V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );

                return valTmp;
            }

            reference_base< Container >& operator=( const value_type& rhs )
            {
                cl_int l_Error = CL_SUCCESS;
                naked_pointer result = reinterpret_cast< naked_pointer >( m_Container.m_commQueue.enqueueMapBuffer(
                    m_Container.m_devMemory, true, CL_MAP_WRITE_INVALIDATE_REGION, m_Index * sizeof( value_type ), sizeof( value_type ), NULL, NULL, &l_Error ) );
                V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for operator[]" );

                *result = rhs;

                ::cl::Event unmapEvent;
                V_OPENCL( m_Container.m_commQueue.enqueueUnmapMemObject( m_Container.m_devMemory, result, NULL, &unmapEvent ), "device_vector failed to unmap host memory back to device memory" );
                V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );

                return *this;
            }

            //This specialization is needed for Linux only.
            //EPR - 398791
            reference_base< Container >& operator=( reference_base< Container >& rhs )
            {
                cl_int l_Error = CL_SUCCESS;
                value_type value = static_cast< value_type >( rhs );
                naked_pointer result = reinterpret_cast< naked_pointer >( m_Container.m_commQueue.enqueueMapBuffer(
                    m_Container.m_devMemory, true, CL_MAP_WRITE_INVALIDATE_REGION, m_Index * sizeof( value_type ), sizeof( value_type ), NULL, NULL, &l_Error ) );
                V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for operator[]" );

                *result = value;

                ::cl::Event unmapEvent;
                V_OPENCL( m_Container.m_commQueue.enqueueUnmapMemObject( m_Container.m_devMemory, result, NULL, &unmapEvent ), "device_vector failed to unmap host memory back to device memory" );
                V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );

                return *this;
            }

            Container& getContainer( ) const
            {
                return m_Container;
            }

            size_type getIndex( ) const
            {
                return m_Index;
            }

        private:
            Container& m_Container;
            size_type m_Index;
        };

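        // Usage sketch (illustrative, not part of the original header): reference_base is the proxy
        // returned by device_vector::operator[]. Every read maps one element of the ::cl::Buffer with
        // CL_MAP_READ and every write maps it with CL_MAP_WRITE_INVALIDATE_REGION, so element access
        // is correct but expensive; prefer batched access through data( ) for host-side loops.
        //
        //     bolt::cl::device_vector< int > dv( 4, 0 );
        //     dv[ 2 ] = 42;        // proxy operator=: map, write, unmap
        //     int x = dv[ 2 ];     // proxy conversion operator: map, read, unmap
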
        typedef reference_base< device_vector< value_type > > reference;

        typedef const value_type const_reference;

        // Handy for the reference class to get at the wrapped ::cl objects
        //friend class reference;

        template< typename Container >
        class iterator_base: public boost::iterator_facade< iterator_base< Container >, value_type, device_vector_tag,
            typename device_vector::reference, int >
        {
        public:
            typedef typename boost::iterator_facade< iterator_base< Container >, value_type, device_vector_tag,
                typename device_vector::reference, int >::difference_type difference_type;

            //typedef iterator_facade::difference_type difference_type;

            // This class represents the iterator data transferred to the OpenCL device. Transferring pointers is tricky;
            // the only reason we allocate space for a pointer in this payload is that the OpenCL clSetKernelArg() checks
            // the size ( bytes ) of the argument passed in, and the corresponding GPU iterator has a pointer member.
            // The value of the pointer is not relevant on the host side; it is initialized on the device side with the
            // init method. The size of the payload needs to be able to encapsulate both 32-bit and 64-bit devices:
            // sizeof( 32-bit device payload ) = 32-bit index + 32-bit pointer = 8 bytes
            // sizeof( 64-bit device payload ) = 32-bit index + 64-bit aligned pointer = 16 bytes
            struct Payload
            {
                difference_type m_Index;
                difference_type m_Ptr1[ 3 ];  // Represents the device pointer, big enough for 32- or 64-bit
            };


            // Default constructor
            iterator_base( ): m_Container( getContainer( ) ), m_Index( 0 )
            {}

            // Basic constructor requires a reference to the container and a positional element
            iterator_base( Container& rhs, difference_type index ): m_Container( rhs ), m_Index( index )
            {}

            // This copy constructor allows an iterator to convert into a const_iterator, but not vice versa
            template< typename OtherContainer >
            iterator_base( const iterator_base< OtherContainer >& rhs ): m_Container( rhs.m_Container ), m_Index( rhs.m_Index )
            {}

            iterator_base( value_type* ptr ): m_Container( ptr ), m_Index( 0 )
            {}

            // Assignment from an iterator over the same container
            //template< typename Container >
            iterator_base< Container >& operator= ( const iterator_base< Container >& rhs )
            {
                m_Container = rhs.m_Container;
                m_Index = rhs.m_Index;
                return *this;
            }

            iterator_base< Container >& base( )
            {
                return *this;
            }

            const iterator_base< Container >& base( ) const
            {
                return *this;
            }

            iterator_base< Container >& operator+= ( const difference_type& n )
            {
                advance( n );
                return *this;
            }

            iterator_base< Container >& operator= ( const difference_type& n )
            {
                advance( n );
                return *this;
            }

            const iterator_base< Container > operator+ ( const difference_type& n ) const
            {
                iterator_base< Container > result( *this );
                result.advance( n );
                return result;
            }

            Container& getContainer( ) const
            {
                return m_Container;
            }

            int setKernelBuffers( int arg_num, ::cl::Kernel& kernel ) const
            {
                const ::cl::Buffer& buffer = getContainer( ).getBuffer( );
                kernel.setArg( arg_num, buffer );
                arg_num++;
                return arg_num;
            }

            // This method initializes the payload of the iterator for the cl device; the pointer value is 0
            // because it has no relevance on the host
            const Payload gpuPayload( ) const
            {
                Payload payload = { m_Index, { 0, 0, 0 } };
                return payload;
            }

            // Calculates the size of the payload for the cl device. The bitness of the device is independent of the
            // host and must be queried. The bitness of the device determines the size of the pointer contained in the
            // payload; 64-bit pointers must be 8-byte aligned, so 4 bytes of padding follow the 32-bit index.
            const difference_type gpuPayloadSize( ) const
            {
                cl_int l_Error = CL_SUCCESS;
                ::cl::Device which_device;
                l_Error = m_Container.m_commQueue.getInfo( CL_QUEUE_DEVICE, &which_device );

                cl_uint deviceBits = which_device.getInfo< CL_DEVICE_ADDRESS_BITS >( );

                // Size of index and pointer
                difference_type payloadSize = sizeof( difference_type ) + ( deviceBits >> 3 );

                // 64-bit devices need to add padding for an 8-byte aligned pointer
                if( deviceBits == 64 )
                    payloadSize += 4;

                return payloadSize;
            }

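            // Worked example of the arithmetic above (a sketch; deviceBits comes from
            // CL_DEVICE_ADDRESS_BITS and difference_type is the 4-byte int of the facade):
            //     32-bit device: 4 (index) + ( 32 >> 3 ) = 8 bytes
            //     64-bit device: 4 (index) + ( 64 >> 3 ) + 4 (alignment padding) = 16 bytes
            // which matches the sizeof( Payload ) comments above struct Payload.
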
            difference_type m_Index;

            difference_type distance_to( const iterator_base< Container >& rhs ) const
            {
                return static_cast< difference_type >( rhs.m_Index - m_Index );
            }

        private:
            // Payload payload;
            // Implementation detail of boost.iterator
            friend class boost::iterator_core_access;

            // Handy for the device_vector erase methods
            friend class device_vector< value_type >;

            // Used for the templatized copy constructor and the templatized equal operator
            template< typename > friend class iterator_base;

            void advance( difference_type n )
            {
                m_Index += n;
            }

            void increment( )
            {
                advance( 1 );
            }

            void decrement( )
            {
                advance( -1 );
            }

            template< typename OtherContainer >
            bool equal( const iterator_base< OtherContainer >& rhs ) const
            {
                bool sameIndex = rhs.m_Index == m_Index;
                bool sameContainer = ( &m_Container == &rhs.m_Container );

                return ( sameIndex && sameContainer );
            }

            reference dereference( ) const
            {
                return m_Container[ m_Index ];
            }

            Container& m_Container;
        };

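        // Usage sketch (illustrative): iterator_base models a random-access iterator through
        // boost::iterator_facade, so the usual arithmetic works, and dereferencing yields the
        // mapping reference proxy defined earlier.
        //
        //     bolt::cl::device_vector< float > dv( 10, 1.0f );
        //     bolt::cl::device_vector< float >::iterator it = dv.begin( );
        //     it += 3;                   // advance( 3 )
        //     float f = *it;             // dereference( ): proxy map/read/unmap
        //     int d = dv.end( ) - it;    // distance via distance_to( )
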
        template< typename Container >
        class reverse_iterator_base: public boost::iterator_facade< reverse_iterator_base< Container >, value_type,
            std::random_access_iterator_tag, typename device_vector::reference, int >
        {
        public:

            // Basic constructor requires a reference to the container and a positional element
            reverse_iterator_base( Container& lhs, size_type index ): m_Container( lhs ), m_Index( index - 1 )
            {}

            // This copy constructor allows an iterator to convert into a const_iterator, but not vice versa
            template< typename OtherContainer >
            reverse_iterator_base( const reverse_iterator_base< OtherContainer >& lhs ): m_Container( lhs.m_Container ), m_Index( lhs.m_Index - 1 )
            {}

            // Assignment from a reverse iterator over the same container
            //template< typename Container >
            reverse_iterator_base< Container >& operator= ( const reverse_iterator_base< Container >& lhs )
            {
                m_Container = lhs.m_Container;
                m_Index = lhs.m_Index;
                return *this;
            }

            reverse_iterator_base< Container >& operator+= ( const difference_type& n )
            {
                advance( -n );
                return *this;
            }

            const reverse_iterator_base< Container > operator+ ( const difference_type& n ) const
            {
                reverse_iterator_base< Container > result( *this );
                result.advance( -n );
                return result;
            }
#if !defined(_WIN32) && defined(__x86_64__)
            const reverse_iterator_base< Container > operator+ ( const int& n ) const
            {
                reverse_iterator_base< Container > result( *this );
                result.advance( -n );
                return result;
            }
#endif

            int getIndex( ) const
            {
                return m_Index;
            }

            //iterator_base< Container > base( )
            //{
            //    iterator_base< Container >( m_Container, m_Index - 1 );
            //}

            difference_type distance_to( const reverse_iterator_base< Container >& lhs ) const
            {
                return static_cast< difference_type >( m_Index - lhs.m_Index );
            }

        private:
            // Implementation detail of boost.iterator
            friend class boost::iterator_core_access;

            // Handy for the device_vector erase methods
            friend class device_vector< value_type >;

            // Used for the templatized copy constructor and the templatized equal operator
            template< typename > friend class reverse_iterator_base;

            void advance( difference_type n )
            {
                m_Index += n;
            }

            void increment( )
            {
                advance( -1 );
            }

            void decrement( )
            {
                advance( 1 );
            }

            template< typename OtherContainer >
            bool equal( const reverse_iterator_base< OtherContainer >& lhs ) const
            {
                bool sameIndex = lhs.m_Index == m_Index;
                bool sameContainer = ( &m_Container == &lhs.m_Container );

                return ( sameIndex && sameContainer );
            }

            reference dereference( ) const
            {
                return m_Container[ m_Index ];
            }

            Container& m_Container;
            size_type m_Index;
        };

        typedef iterator_base< device_vector< value_type > > iterator;

        typedef iterator_base< const device_vector< value_type > > const_iterator;

        typedef reverse_iterator_base< device_vector< value_type > > reverse_iterator;

        typedef reverse_iterator_base< const device_vector< value_type > > const_reverse_iterator;

        device_vector( /* cl_mem_flags flags = CL_MEM_READ_WRITE, */ const control& ctl = control::getDefault( ) )
            : m_Size( 0 ), m_commQueue( ctl.getCommandQueue( ) ), m_Flags( CL_MEM_READ_WRITE )
        {
            static_assert( !std::is_polymorphic< value_type >::value, "AMD C++ template extensions do not support the virtual keyword yet" );
            m_devMemory = NULL;
        }

        device_vector( size_type newSize, const value_type& value = value_type( ), cl_mem_flags flags = CL_MEM_READ_WRITE,
            bool init = true, const control& ctl = control::getDefault( ) )
            : m_Size( newSize ), m_commQueue( ctl.getCommandQueue( ) ), m_Flags( flags )
        {
            static_assert( !std::is_polymorphic< value_type >::value, "AMD C++ template extensions do not support the virtual keyword yet" );

            // We want to use the context from the passed-in command queue to initialize our buffer
            cl_int l_Error = CL_SUCCESS;
            ::cl::Context l_Context = m_commQueue.getInfo< CL_QUEUE_CONTEXT >( &l_Error );
            V_OPENCL( l_Error, "device_vector failed to query for the context of the ::cl::CommandQueue object" );

            if( m_Size > 0 )
            {
                m_devMemory = ::cl::Buffer( l_Context, m_Flags, m_Size * sizeof( value_type ) );

                if( init )
                {
                    std::vector< ::cl::Event > fillEvent( 1 );

                    // Note: if the size of the value type is not a power of two, we fill serially. Another
                    // approach is to launch a templatized fill kernel, but it leads to complications.
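                    // The test below uses the standard bit trick: a power of two has a single set
                    // bit, so ( n & ( n - 1 ) ) == 0. For example, sizeof( cl_float4 ) == 16 and
                    // 16 & 15 == 0, so it takes the enqueueFillBuffer path; a 12-byte struct gives
                    // 12 & 11 == 8 != 0 and falls through to the serial map-and-fill path.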

                    try
                    {
                        size_t sizeDS = sizeof( value_type );

                        if( !( sizeDS & ( sizeDS - 1 ) ) )  // 2^n data types
                        {
                            V_OPENCL( m_commQueue.enqueueFillBuffer< value_type >( m_devMemory, value, 0,
                                newSize * sizeof( value_type ), NULL, &fillEvent.front( ) ),
                                "device_vector failed to fill the internal buffer with the requested pattern" );
                        }
                        else  // non 2^n data types
                        {
                            // Map the buffer to host
                            ::cl::Event fill_mapEvent;
                            value_type* host_buffer = ( value_type* )ctl.getCommandQueue( ).enqueueMapBuffer(
                                m_devMemory,
                                false,
                                CL_MAP_READ | CL_MAP_WRITE,
                                0,
                                sizeof( value_type ) * newSize,
                                NULL,
                                &fill_mapEvent,
                                &l_Error );

                            V_OPENCL( l_Error, "Error calling map on device_vector buffer. Fill device_vector" );
                            bolt::cl::wait( ctl, fill_mapEvent );

                            // Use serial fill_n to fill the device_vector with value
#if defined(_WIN32)
                            std::fill_n( stdext::make_checked_array_iterator( host_buffer, newSize ),
                                newSize,
                                value );
#else
                            std::fill_n( host_buffer,
                                newSize,
                                value );
#endif

                            // Unmap the buffer
                            l_Error = ctl.getCommandQueue( ).enqueueUnmapMemObject( m_devMemory,
                                host_buffer,
                                NULL,
                                &fillEvent.front( ) );
                            V_OPENCL( l_Error, "Error calling unmap on device_vector buffer. Fill device_vector" );
                        }
                    }
                    catch( std::exception& e )
                    {
                        std::cout << "device_vector enqueueFillBuffer error condition reported:" << std::endl << e.what( ) << std::endl;
                        //return 1;
                    }

                    try
                    {
                        // Not allowed to return until the fill operation is finished
                        V_OPENCL( m_commQueue.enqueueWaitForEvents( fillEvent ), "device_vector failed to wait for an event" );
                    }
                    catch( std::exception& e )
                    {
                        std::cout << "device_vector enqueueFillBuffer enqueueWaitForEvents error condition reported:" << std::endl << e.what( ) << std::endl;
                        //return 1;
                    }
                }
            }
            else
            {
                m_devMemory = NULL;
            }
        }

        template< typename InputIterator >
        device_vector( const InputIterator begin, size_type newSize, cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
            bool init = true, const control& ctl = control::getDefault( ),
            typename std::enable_if< !std::is_integral< InputIterator >::value >::type* = 0 )
            : m_Size( newSize ), m_commQueue( ctl.getCommandQueue( ) ), m_Flags( flags )
        {
            static_assert( std::is_convertible< value_type, typename std::iterator_traits< InputIterator >::value_type >::value,
                "iterator value_type does not convert to device_vector value_type" );
            static_assert( !std::is_polymorphic< value_type >::value, "AMD C++ template extensions do not support the virtual keyword yet" );

            if( m_Size == 0 )
            {
                m_devMemory = NULL;
                return;
            }

            // We want to use the context from the passed-in command queue to initialize our buffer
            cl_int l_Error = CL_SUCCESS;
            ::cl::Context l_Context = m_commQueue.getInfo< CL_QUEUE_CONTEXT >( &l_Error );
            V_OPENCL( l_Error, "device_vector failed to query for the context of the ::cl::CommandQueue object" );

            if( m_Flags & CL_MEM_USE_HOST_PTR )
            {
                m_devMemory = ::cl::Buffer( l_Context, m_Flags, m_Size * sizeof( value_type ),
                    reinterpret_cast< value_type* >( const_cast< value_type* >( &*begin ) ) );
            }
            else
            {
                m_devMemory = ::cl::Buffer( l_Context, m_Flags, m_Size * sizeof( value_type ) );

                if( init )
                {
                    size_t byteSize = m_Size * sizeof( value_type );

                    // Note: the copy API doesn't work because it uses the concept of a 'default' accelerator
                    // ::cl::copy( begin, begin + m_Size, m_devMemory );
                    naked_pointer pointer = static_cast< naked_pointer >( m_commQueue.enqueueMapBuffer(
                        m_devMemory, CL_TRUE, CL_MEM_WRITE_ONLY, 0, byteSize, 0, 0, &l_Error ) );
                    V_OPENCL( l_Error, "enqueueMapBuffer failed in device_vector constructor" );
#if defined(_WIN32)
                    std::copy( begin, begin + m_Size, stdext::checked_array_iterator< naked_pointer >( pointer, m_Size ) );
#else
                    std::copy( begin, begin + m_Size, pointer );
#endif
                    l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory, pointer, 0, 0 );
                    V_OPENCL( l_Error, "enqueueUnmapMemObject failed in device_vector constructor" );
                }
            }
        }

        template< typename InputIterator >
        device_vector( const InputIterator begin, const InputIterator end, cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
            const control& ctl = control::getDefault( ),
            typename std::enable_if< !std::is_integral< InputIterator >::value >::type* = 0 )
            : m_commQueue( ctl.getCommandQueue( ) ), m_Flags( flags )
        {
            static_assert( std::is_convertible< value_type, typename std::iterator_traits< InputIterator >::value_type >::value,
                "iterator value_type does not convert to device_vector value_type" );
            static_assert( !std::is_polymorphic< value_type >::value, "AMD C++ template extensions do not support the virtual keyword yet" );

            // We want to use the context from the passed-in command queue to initialize our buffer
            cl_int l_Error = CL_SUCCESS;
            ::cl::Context l_Context = m_commQueue.getInfo< CL_QUEUE_CONTEXT >( &l_Error );
            V_OPENCL( l_Error, "device_vector failed to query for the context of the ::cl::CommandQueue object" );

            m_Size = static_cast< size_type >( std::distance( begin, end ) );
            if( m_Size == 0 )
            {
                m_devMemory = NULL;
                return;
            }
            size_t byteSize = m_Size * sizeof( value_type );

            if( m_Flags & CL_MEM_USE_HOST_PTR )
            {
                m_devMemory = ::cl::Buffer( l_Context, m_Flags, byteSize,
                    reinterpret_cast< value_type* >( const_cast< value_type* >( std::addressof( *begin ) ) ) );
            }
            else
            {
                m_devMemory = ::cl::Buffer( l_Context, m_Flags, byteSize );

                // Note: the copy API doesn't work because it uses the concept of a 'default' accelerator
                //::cl::copy( begin, end, m_devMemory );
                naked_pointer pointer = static_cast< naked_pointer >( m_commQueue.enqueueMapBuffer(
                    m_devMemory, CL_TRUE, CL_MEM_WRITE_ONLY, 0, byteSize, 0, 0, &l_Error ) );
                V_OPENCL( l_Error, "enqueueMapBuffer failed in device_vector constructor" );
#if defined(_WIN32)
                std::copy( begin, end, stdext::checked_array_iterator< naked_pointer >( pointer, m_Size ) );
#else
                std::copy( begin, end, pointer );
#endif
                l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory, pointer, 0, 0 );
                V_OPENCL( l_Error, "enqueueUnmapMemObject failed in device_vector constructor" );
            }
        }

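        // Constructor usage sketch (illustrative, not part of the original header):
        //
        //     std::vector< int > host( 1024, 7 );
        //
        //     // Fill constructor: device-side allocation initialized to a value
        //     bolt::cl::device_vector< int > a( 1024, 7 );
        //
        //     // Iterator-range constructor: with the default CL_MEM_USE_HOST_PTR flag the
        //     // ::cl::Buffer wraps the host allocation; passing CL_MEM_READ_WRITE instead
        //     // copies the range into device memory via map/std::copy/unmap as shown above.
        //     bolt::cl::device_vector< int > b( host.begin( ), host.end( ) );
        //     bolt::cl::device_vector< int > c( host.begin( ), host.end( ), CL_MEM_READ_WRITE );
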

        device_vector( const ::cl::Buffer& rhs, const control& ctl = control::getDefault( ) )
            : m_devMemory( rhs ), m_commQueue( ctl.getCommandQueue( ) )
        {
            static_assert( !std::is_polymorphic< value_type >::value, "AMD C++ template extensions do not support the virtual keyword yet" );

            m_Size = capacity( );

            cl_int l_Error = CL_SUCCESS;
            m_Flags = m_devMemory.getInfo< CL_MEM_FLAGS >( &l_Error );
            V_OPENCL( l_Error, "device_vector failed to query for the memory flags of the ::cl::Buffer object" );
        }

        // Copying methods
        device_vector( const device_vector& rhs ): m_Flags( rhs.m_Flags ), m_Size( 0 ), m_commQueue( rhs.m_commQueue )
        {
            // This method will set the m_Size member variable upon successful completion
            resize( rhs.m_Size );

            if( m_Size == 0 )
                return;

            size_type l_srcSize = m_Size * sizeof( value_type );
            ::cl::Event copyEvent;

            cl_int l_Error = CL_SUCCESS;
            l_Error = m_commQueue.enqueueCopyBuffer( rhs.m_devMemory, m_devMemory, 0, 0, l_srcSize, NULL, &copyEvent );
            V_OPENCL( l_Error, "device_vector failed to copy data inside of the copy constructor" );
            V_OPENCL( copyEvent.wait( ), "device_vector failed to wait for copy event" );
        }

        device_vector& operator=( const device_vector& rhs )
        {
            if( this == &rhs )
                return *this;

            m_Flags = rhs.m_Flags;
            m_commQueue = rhs.m_commQueue;
            m_Size = capacity( );

            // This method will set the m_Size member variable upon successful completion
            resize( rhs.m_Size );

            if( m_Size == 0 )
                return *this;

            size_type l_srcSize = m_Size * sizeof( value_type );
            ::cl::Event copyEvent;

            cl_int l_Error = CL_SUCCESS;
            l_Error = m_commQueue.enqueueCopyBuffer( rhs.m_devMemory, m_devMemory, 0, 0, l_srcSize, NULL, &copyEvent );
            V_OPENCL( l_Error, "device_vector failed to copy data inside of operator=()" );
            V_OPENCL( copyEvent.wait( ), "device_vector failed to wait for copy event" );

            return *this;
        }

        // Member functions

        void resize( size_type reqSize, const value_type& val = value_type( ) )
        {
            if( ( m_Flags & CL_MEM_USE_HOST_PTR ) != 0 )
            {
                throw ::cl::Error( CL_MEM_OBJECT_ALLOCATION_FAILURE,
                    "A device_vector can not resize() memory not under its direct control" );
            }

            size_type cap = capacity( );

            if( reqSize == cap )
                return;

            if( reqSize > max_size( ) )
                throw ::cl::Error( CL_MEM_OBJECT_ALLOCATION_FAILURE,
                    "The amount of memory requested exceeds what is available" );

            cl_int l_Error = CL_SUCCESS;

            ::cl::Context l_Context = m_commQueue.getInfo< CL_QUEUE_CONTEXT >( &l_Error );
            V_OPENCL( l_Error, "device_vector failed to query for the context of the ::cl::Buffer object" );

            size_type l_reqSize = reqSize * sizeof( value_type );
            ::cl::Buffer l_tmpBuffer( l_Context, m_Flags, l_reqSize, NULL, &l_Error );

            size_type l_srcSize = m_Size * sizeof( value_type );

            if( l_srcSize > 0 )
            {
                // If the new buffer size is greater than the old, the new elements must be initialized to the value
                // specified in the function parameter
                if( l_reqSize > l_srcSize )
                {
                    std::vector< ::cl::Event > copyEvent( 1 );
                    l_Error = m_commQueue.enqueueCopyBuffer( m_devMemory,
                        l_tmpBuffer,
                        0,
                        0,
                        l_srcSize,
                        NULL,
                        &copyEvent.front( ) );
                    V_OPENCL( l_Error, "device_vector failed to copy data to the new ::cl::Buffer object" );
                    ::cl::Event fillEvent;

                    size_t sizeDS = sizeof( value_type );
                    if( !( sizeDS & ( sizeDS - 1 ) ) )  // 2^n data types
                    {
                        l_Error = m_commQueue.enqueueFillBuffer< value_type >( l_tmpBuffer,
                            val,
                            l_srcSize,
                            ( l_reqSize - l_srcSize ),
                            &copyEvent,
                            &fillEvent );
                        V_OPENCL( l_Error, "device_vector failed to fill the new data with the provided pattern" );
                        // Not allowed to return until the copy operation is finished
                    }
                    else  // non 2^n data types
                    {
                        // Map the tail of the new buffer to host
                        ::cl::Event fill_mapEvent;
                        value_type* host_buffer = ( value_type* )m_commQueue.enqueueMapBuffer(
                            l_tmpBuffer,
                            false,
                            CL_MAP_READ | CL_MAP_WRITE,
                            l_srcSize,
                            ( l_reqSize - l_srcSize ),
                            NULL,
                            &fill_mapEvent,
                            &l_Error );

                        V_OPENCL( l_Error, "Error calling map on device_vector buffer. Fill device_vector" );
                        fill_mapEvent.wait( );

                        // Use serial fill_n to fill the mapped tail with val; only (reqSize - m_Size)
                        // elements were mapped above
#if defined(_WIN32)
                        std::fill_n( stdext::make_checked_array_iterator( host_buffer, reqSize - m_Size ),
                            reqSize - m_Size,
                            val );
#else
                        std::fill_n( host_buffer,
                            ( reqSize - m_Size ),
                            val );
#endif

                        // Unmap the buffer
                        l_Error = m_commQueue.enqueueUnmapMemObject( l_tmpBuffer,
                            host_buffer,
                            NULL,
                            &fillEvent );
                        V_OPENCL( l_Error, "Error calling unmap on device_vector buffer. Fill device_vector" );
                    }

                    l_Error = fillEvent.wait( );
                    V_OPENCL( l_Error, "device_vector failed to wait for fill event" );
                }
                else
                {
                    std::vector< ::cl::Event > copyEvent( 1 );
                    l_Error = m_commQueue.enqueueCopyBuffer( m_devMemory, l_tmpBuffer, 0, 0, l_reqSize, NULL, &copyEvent.front( ) );
                    V_OPENCL( l_Error, "device_vector failed to copy data to the new ::cl::Buffer object" );
                    // Not allowed to return until the copy operation is finished
                    l_Error = m_commQueue.enqueueWaitForEvents( copyEvent );
                    V_OPENCL( l_Error, "device_vector failed to wait for copy event" );
                }
            }
            else
            {
                ::cl::Event fillEvent;
                size_t sizeDS = sizeof( value_type );
                if( !( sizeDS & ( sizeDS - 1 ) ) )  // 2^n data types
                {
                    l_Error = m_commQueue.enqueueFillBuffer< value_type >( l_tmpBuffer, val, 0, l_reqSize, NULL, &fillEvent );
                    V_OPENCL( l_Error, "device_vector failed to fill the new data with the provided pattern" );
                }
                else  // non 2^n data types
                {
                    // Map the buffer to host
                    ::cl::Event fill_mapEvent;
                    value_type* host_buffer = ( value_type* )m_commQueue.enqueueMapBuffer(
                        l_tmpBuffer,
                        false,
                        CL_MAP_READ | CL_MAP_WRITE,
                        0,
                        l_reqSize,
                        NULL,
                        &fill_mapEvent,
                        &l_Error );

                    V_OPENCL( l_Error, "Error calling map on device_vector buffer. Fill device_vector" );
                    fill_mapEvent.wait( );

                    // Use serial fill_n to fill the device_vector with value
#if defined(_WIN32)
                    std::fill_n( stdext::make_checked_array_iterator( host_buffer, reqSize ),
                        reqSize,
                        val );
#else
                    std::fill_n( host_buffer,
                        reqSize,
                        val );
#endif

                    // Unmap the buffer
                    l_Error = m_commQueue.enqueueUnmapMemObject( l_tmpBuffer,
                        host_buffer,
                        NULL,
                        &fillEvent );
                    V_OPENCL( l_Error, "Error calling unmap on device_vector buffer. Fill device_vector" );
                }

                // Not allowed to return until the fill operation is finished
                l_Error = fillEvent.wait( );
                V_OPENCL( l_Error, "device_vector failed to wait for fill event" );
            }

            // Remember the new size
            m_Size = reqSize;

            // Operator= should call retain/release appropriately
            m_devMemory = l_tmpBuffer;
        }

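        // resize( ) usage sketch (illustrative): growing preserves the old contents and fills the
        // tail with val; shrinking keeps only the leading reqSize elements. Note that resize( )
        // throws when the buffer wraps user memory ( CL_MEM_USE_HOST_PTR ).
        //
        //     bolt::cl::device_vector< int > dv( 4, 1 );    // { 1, 1, 1, 1 }
        //     dv.resize( 6, 9 );                            // { 1, 1, 1, 1, 9, 9 }
        //     dv.resize( 2 );                               // { 1, 1 }
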
        size_type size( void ) const
        {
            return m_Size;
        }

        size_type max_size( void ) const
        {
            cl_int l_Error = CL_SUCCESS;

            ::cl::Device l_Device = m_commQueue.getInfo< CL_QUEUE_DEVICE >( &l_Error );
            V_OPENCL( l_Error, "device_vector failed to query for the device of the command queue" );

            cl_ulong l_MaxSize = l_Device.getInfo< CL_DEVICE_MAX_MEM_ALLOC_SIZE >( &l_Error );
            V_OPENCL( l_Error, "device_vector failed to query device for the maximum memory size" );

            return static_cast< size_type >( l_MaxSize / sizeof( value_type ) );
        }

        void reserve( size_type reqSize )
        {
            if( reqSize <= capacity( ) )
                return;

            if( reqSize > max_size( ) )
                throw ::cl::Error( CL_MEM_OBJECT_ALLOCATION_FAILURE, "The amount of memory requested exceeds what is available" );

            // We want to use the context from the passed-in command queue to initialize our buffer
            cl_int l_Error = CL_SUCCESS;
            ::cl::Context l_Context = m_commQueue.getInfo< CL_QUEUE_CONTEXT >( &l_Error );
            V_OPENCL( l_Error, "device_vector failed to query for the context of the ::cl::CommandQueue object" );

            if( m_Size == 0 )
            {
                ::cl::Buffer l_tmpBuffer( l_Context, m_Flags, reqSize * sizeof( value_type ) );
                m_devMemory = l_tmpBuffer;
                return;
            }

            size_type l_size = reqSize * sizeof( value_type );
            // Can't use host_ptr here, because l_size is guaranteed to be bigger than the host allocation
            ::cl::Buffer l_tmpBuffer( l_Context, m_Flags, l_size, NULL, &l_Error );
            V_OPENCL( l_Error, "device_vector can not create a temporary internal OpenCL buffer" );

            size_type l_srcSize = static_cast< size_type >( m_devMemory.getInfo< CL_MEM_SIZE >( &l_Error ) );
            V_OPENCL( l_Error, "device_vector failed to request the size of the ::cl::Buffer object" );

            ::cl::Event copyEvent;
            V_OPENCL( m_commQueue.enqueueCopyBuffer( m_devMemory, l_tmpBuffer, 0, 0, l_srcSize, NULL, &copyEvent ),
                "device_vector failed to copy from buffer to buffer " );

            // Not allowed to return until the copy operation is finished
            V_OPENCL( copyEvent.wait( ), "device_vector failed to wait on an event object" );

            // Operator= should call retain/release appropriately
            m_devMemory = l_tmpBuffer;
        }

        size_type capacity( void ) const
        {
            size_t l_memSize = 0;
            cl_int l_Error = CL_SUCCESS;

            // this seems like a bug; what if I popped everything?
            // if( m_Size == 0 )
            //     return m_Size;
            if( m_devMemory( ) == NULL )
                return 0;

            l_memSize = m_devMemory.getInfo< CL_MEM_SIZE >( &l_Error );
            V_OPENCL( l_Error, "device_vector failed to request the size of the ::cl::Buffer object" );
            return static_cast< size_type >( l_memSize / sizeof( value_type ) );
        }

        void shrink_to_fit( )
        {
            if( m_Size > capacity( ) )
                throw ::cl::Error( CL_MEM_OBJECT_ALLOCATION_FAILURE, "device_vector size can not be greater than capacity( )" );

            if( m_Size == capacity( ) )
                return;

            // We want to use the context from the passed-in command queue to initialize our buffer
            cl_int l_Error = CL_SUCCESS;
            ::cl::Context l_Context = m_commQueue.getInfo< CL_QUEUE_CONTEXT >( &l_Error );
            V_OPENCL( l_Error, "device_vector failed to query for the context of the ::cl::CommandQueue object" );

            size_type l_newSize = m_Size * sizeof( value_type );
            ::cl::Buffer l_tmpBuffer( l_Context, m_Flags, l_newSize, NULL, &l_Error );
            V_OPENCL( l_Error, "device_vector can not create a temporary internal OpenCL buffer" );

            //TODO - this is equal to the capacity()
            size_type l_srcSize = static_cast< size_type >( m_devMemory.getInfo< CL_MEM_SIZE >( &l_Error ) );
            V_OPENCL( l_Error, "device_vector failed to request the size of the ::cl::Buffer object" );

            std::vector< ::cl::Event > copyEvent( 1 );
            l_Error = m_commQueue.enqueueCopyBuffer( m_devMemory, l_tmpBuffer, 0, 0, l_newSize, NULL, &copyEvent.front( ) );
            V_OPENCL( l_Error, "device_vector failed to copy data to the new ::cl::Buffer object" );

            // Not allowed to return until the copy operation is finished
            l_Error = m_commQueue.enqueueWaitForEvents( copyEvent );
            V_OPENCL( l_Error, "device_vector failed to wait for copy event" );

            // Operator= should call retain/release appropriately
            m_devMemory = l_tmpBuffer;
        }

        reference operator[]( size_type n )
        {
            return reference( *this, n );
        }

        const_reference operator[]( size_type n ) const
        {
            cl_int l_Error = CL_SUCCESS;

            naked_pointer ptrBuff = reinterpret_cast< naked_pointer >( m_commQueue.enqueueMapBuffer( m_devMemory, true, CL_MAP_READ,
                n * sizeof( value_type ), sizeof( value_type ), NULL, NULL, &l_Error ) );
            V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for operator[]" );

            const_reference tmpRef = *ptrBuff;

            ::cl::Event unmapEvent;
            l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory, ptrBuff, NULL, &unmapEvent );
            V_OPENCL( l_Error, "device_vector failed to unmap host memory back to device memory" );
            V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );

            return tmpRef;
        }

        iterator begin( void )
        {
            return iterator( *this, 0 );
        }

        const_iterator begin( void ) const
        {
            return const_iterator( *this, 0 );
        }

        const_iterator cbegin( void ) const
        {
            return const_iterator( *this, 0 );
        }

        reverse_iterator rbegin( void )
        {
            //static_assert( false, "Reverse iterators are not yet implemented" );
            return reverse_iterator( *this, m_Size );
        }

        const_reverse_iterator rbegin( void ) const
        {
            //static_assert( false, "Reverse iterators are not yet implemented" );
            return const_reverse_iterator( *this, m_Size );
        }

        const_reverse_iterator crbegin( void ) const
        {
            //static_assert( false, "Reverse iterators are not yet implemented" );
            return const_reverse_iterator( *this, m_Size );
        }

        iterator end( void )
        {
            return iterator( *this, static_cast< typename iterator::difference_type >( m_Size ) );
        }

        const_iterator end( void ) const
        {
            return const_iterator( *this, static_cast< typename iterator::difference_type >( m_Size ) );
        }

        const_iterator cend( void ) const
        {
            return const_iterator( *this, static_cast< typename iterator::difference_type >( m_Size ) );
        }

        reverse_iterator rend( void )
        {
            return reverse_iterator( *this, 0 );
        }

        const_reverse_iterator rend( void ) const
        {
            //static_assert( false, "Reverse iterators are not yet implemented" );
            return const_reverse_iterator( *this, 0 );
        }

        const_reverse_iterator crend( void ) const
        {
            return const_reverse_iterator( *this, 0 );
        }

        reference front( void )
        {
            return ( *begin( ) );
        }

        const_reference front( void ) const
        {
            return ( *begin( ) );
        }

        reference back( void )
        {
            return ( *( end( ) - 1 ) );
        }

        const_reference back( void ) const
        {
            return ( *( end( ) - 1 ) );
        }

        pointer data( void )
        {
            if( 0 == size( ) )
            {
                pointer sp;
                return sp;
            }
            cl_int l_Error = CL_SUCCESS;

            naked_pointer ptrBuff = reinterpret_cast< naked_pointer >( m_commQueue.enqueueMapBuffer( m_devMemory, true, CL_MAP_READ | CL_MAP_WRITE,
                0, capacity( ) * sizeof( value_type ), NULL, NULL, &l_Error ) );

            V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for data( )" );

            pointer sp( ptrBuff, UnMapBufferFunctor< device_vector< value_type > >( *this ) );

            return sp;
        }

        const_pointer data( void ) const
        {
            cl_int l_Error = CL_SUCCESS;

            const_naked_pointer ptrBuff = reinterpret_cast< const_naked_pointer >( m_commQueue.enqueueMapBuffer( m_devMemory, true, CL_MAP_READ,
                0, capacity( ) * sizeof( value_type ), NULL, NULL, &l_Error ) );
            V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for data( )" );

            const_pointer sp( ptrBuff, UnMapBufferFunctor< const device_vector< value_type > >( *this ) );
            return sp;
        }

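        // data( ) usage sketch (illustrative): the returned boost::shared_array keeps the whole
        // buffer mapped on the host; UnMapBufferFunctor unmaps it again when the last copy of the
        // shared_array is destroyed, so hold the pointer only as long as host access is needed.
        //
        //     bolt::cl::device_vector< int > dv( 8, 3 );
        //     {
        //         bolt::cl::device_vector< int >::pointer p = dv.data( );
        //         p[ 0 ] = 5;     // direct host access, no per-element map/unmap
        //     }                   // last shared_array reference destroyed: buffer unmapped
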
        void clear( void )
        {
            // The only way to release the Buffer resource would be to explicitly call the destructor:
            // m_devMemory.~Buffer( );

            // Instead, assign a temporary empty buffer allocated on the stack, because of a double-release
            // problem with explicitly calling the wrapper destructor with cl.hpp version 1.2.
            ::cl::Buffer tmp;
            m_devMemory = tmp;

            m_Size = 0;
        }

        bool empty( void ) const
        {
            return m_Size ? false : true;
        }

        void push_back( const value_type& value )
        {
            if( m_Size > capacity( ) )
                throw ::cl::Error( CL_MEM_OBJECT_ALLOCATION_FAILURE, "device_vector size can not be greater than capacity( )" );

            // Need to grow the vector to push the new value.
            // Vectors double their capacity on push_back if the array is not big enough.
            if( m_Size == capacity( ) )
            {
                m_Size ? reserve( m_Size * 2 ) : reserve( 1 );
            }

            cl_int l_Error = CL_SUCCESS;

            naked_pointer result = reinterpret_cast< naked_pointer >( m_commQueue.enqueueMapBuffer( m_devMemory, true, CL_MAP_WRITE_INVALIDATE_REGION,
                m_Size * sizeof( value_type ), sizeof( value_type ), NULL, NULL, &l_Error ) );
            V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for push_back" );
            *result = value;

            ::cl::Event unmapEvent;
            l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory, result, NULL, &unmapEvent );
            V_OPENCL( l_Error, "device_vector failed to unmap host memory back to device memory" );
            V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );

            ++m_Size;
        }

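        // push_back( ) usage sketch (illustrative): capacity doubles when exhausted, so a sequence
        // of push_back calls costs an amortized-constant number of reallocations, mirroring
        // std::vector.
        //
        //     bolt::cl::device_vector< int > dv;    // size 0, no buffer yet
        //     for( int i = 0; i < 100; ++i )
        //         dv.push_back( i );                // reserves 1, 2, 4, 8, ... as needed
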
        void pop_back( void )
        {
            if( m_Size > 0 )
            {
                --m_Size;
            }
        }

        void swap( device_vector& vec )
        {
            if( this == &vec )
                return;

            ::cl::Buffer swapBuffer( m_devMemory );
            m_devMemory = vec.m_devMemory;
            vec.m_devMemory = swapBuffer;

            ::cl::CommandQueue swapQueue( m_commQueue );
            m_commQueue = vec.m_commQueue;
            vec.m_commQueue = swapQueue;

            size_type sizeTmp = m_Size;
            m_Size = vec.m_Size;
            vec.m_Size = sizeTmp;

            cl_mem_flags flagsTmp = m_Flags;
            m_Flags = vec.m_Flags;
            vec.m_Flags = flagsTmp;
        }

        iterator erase( const_iterator index )
        {
            if( &index.m_Container != this )
                throw ::cl::Error( CL_INVALID_ARG_VALUE, "Iterator is not from this container" );

            iterator l_End = end( );
            if( index.m_Index >= l_End.m_Index )
                throw ::cl::Error( CL_INVALID_ARG_INDEX, "Iterator is pointing past the end of this container" );

            size_type sizeRegion = l_End.m_Index - index.m_Index;

            cl_int l_Error = CL_SUCCESS;
            naked_pointer ptrBuff = reinterpret_cast< naked_pointer >( m_commQueue.enqueueMapBuffer( m_devMemory, true, CL_MAP_READ | CL_MAP_WRITE,
                index.m_Index * sizeof( value_type ), sizeRegion * sizeof( value_type ), NULL, NULL, &l_Error ) );
            V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for erase" );

            ::memmove( ptrBuff, ptrBuff + 1, ( sizeRegion - 1 ) * sizeof( value_type ) );

            ::cl::Event unmapEvent;
            l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory, ptrBuff, NULL, &unmapEvent );
            V_OPENCL( l_Error, "device_vector failed to unmap host memory back to device memory" );
            V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );

            --m_Size;

            size_type newIndex = ( m_Size < index.m_Index ) ? m_Size : index.m_Index;
            return iterator( *this, static_cast< typename iterator::difference_type >( newIndex ) );
        }

        iterator erase( const_iterator first, const_iterator last )
        {
            if( ( &first.m_Container != this ) || ( &last.m_Container != this ) )
                throw ::cl::Error( CL_INVALID_ARG_VALUE, "Iterator is not from this container" );

            if( last.m_Index > m_Size )
                throw ::cl::Error( CL_INVALID_ARG_INDEX, "Iterator is pointing past the end of this container" );

            if( ( first == begin( ) ) && ( last == end( ) ) )
            {
                clear( );
                return iterator( *this, static_cast< typename iterator::difference_type >( m_Size ) );
            }

            iterator l_End = end( );
            size_type sizeMap = l_End.m_Index - first.m_Index;

            cl_int l_Error = CL_SUCCESS;
            naked_pointer ptrBuff = reinterpret_cast< naked_pointer >( m_commQueue.enqueueMapBuffer( m_devMemory, true, CL_MAP_READ | CL_MAP_WRITE,
                first.m_Index * sizeof( value_type ), sizeMap * sizeof( value_type ), NULL, NULL, &l_Error ) );
            V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for erase" );

            size_type sizeErase = last.m_Index - first.m_Index;
            ::memmove( ptrBuff, ptrBuff + sizeErase, ( sizeMap - sizeErase ) * sizeof( value_type ) );

            ::cl::Event unmapEvent;
            l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory, ptrBuff, NULL, &unmapEvent );
            V_OPENCL( l_Error, "device_vector failed to unmap host memory back to device memory" );
            V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );

            m_Size -= sizeErase;

            size_type newIndex = ( m_Size < last.m_Index ) ? m_Size : last.m_Index;
            return iterator( *this, static_cast< typename iterator::difference_type >( newIndex ) );
        }

        iterator insert( const_iterator index, const value_type& value )
        {
            if( &index.m_Container != this )
                throw ::cl::Error( CL_INVALID_ARG_VALUE, "Iterator is not from this container" );

            if( index.m_Index > m_Size )
                throw ::cl::Error( CL_INVALID_ARG_INDEX, "Iterator is pointing past the end of this container" );

            if( index.m_Index == m_Size )
            {
                push_back( value );
                return iterator( *this, index.m_Index );
            }

            // Need to grow the vector to insert a new value.
            // TODO: What is an appropriate growth strategy for GPU memory allocation? Exponential growth does not seem
            // right at first blush.
            if( m_Size == capacity( ) )
            {
                reserve( m_Size + 10 );
            }

            size_type sizeMap = ( m_Size - index.m_Index ) + 1;

            cl_int l_Error = CL_SUCCESS;
            naked_pointer ptrBuff = reinterpret_cast< naked_pointer >( m_commQueue.enqueueMapBuffer( m_devMemory, true, CL_MAP_READ | CL_MAP_WRITE,
                index.m_Index * sizeof( value_type ), sizeMap * sizeof( value_type ), NULL, NULL, &l_Error ) );
            V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for insert" );

            // Shuffle the old values 1 element down
            ::memmove( ptrBuff + 1, ptrBuff, ( sizeMap - 1 ) * sizeof( value_type ) );

            // Write the new value in its place
            *ptrBuff = value;

            ::cl::Event unmapEvent;
            l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory, ptrBuff, NULL, &unmapEvent );
            V_OPENCL( l_Error, "device_vector failed to unmap host memory back to device memory" );
            V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );

            ++m_Size;

            return iterator( *this, index.m_Index );
        }

        void insert( const_iterator index, size_type n, const value_type& value )
        {
            if( &index.m_Container != this )
                throw ::cl::Error( CL_INVALID_ARG_VALUE, "Iterator is not from this container" );

            if( index.m_Index > m_Size )
                throw ::cl::Error( CL_INVALID_ARG_INDEX, "Iterator is pointing past the end of this container" );

            // Need to grow the vector to insert new values.
            // TODO: What is an appropriate growth strategy for GPU memory allocation? Exponential growth does not seem
            // right at first blush.
            if( ( m_Size + n ) > capacity( ) )
            {
                reserve( m_Size + n );
            }

            size_type sizeMap = ( m_Size - index.m_Index ) + n;

            cl_int l_Error = CL_SUCCESS;
            naked_pointer ptrBuff = reinterpret_cast< naked_pointer >( m_commQueue.enqueueMapBuffer( m_devMemory, true, CL_MAP_READ | CL_MAP_WRITE,
                index.m_Index * sizeof( value_type ), sizeMap * sizeof( value_type ), NULL, NULL, &l_Error ) );
            V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for insert" );

            // Shuffle the old values n elements down.
            ::memmove( ptrBuff + n, ptrBuff, ( sizeMap - n ) * sizeof( value_type ) );

            // Copy the new value n times into the buffer.
            for( size_type i = 0; i < n; ++i )
            {
                ptrBuff[ i ] = value;
            }

            ::cl::Event unmapEvent;
            l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory, ptrBuff, NULL, &unmapEvent );
            V_OPENCL( l_Error, "device_vector failed to unmap host memory back to device memory" );
            V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );

            m_Size += n;
        }

        template< typename InputIterator >
        void insert( const_iterator index, InputIterator begin, InputIterator end )
        {
            if( &index.m_Container != this )
                throw ::cl::Error( CL_INVALID_ARG_VALUE, "Iterator is not from this container" );

            if( index.m_Index > m_Size )
                throw ::cl::Error( CL_INVALID_ARG_INDEX, "Iterator is pointing past the end of this container" );

            // Need to grow the vector to insert new values.
            // TODO: What is an appropriate growth strategy for GPU memory allocation? Exponential growth does not seem
            // right at first blush.
            size_type n = static_cast< size_type >( std::distance( begin, end ) );
            if( ( m_Size + n ) > capacity( ) )
            {
                reserve( m_Size + n );
            }
            size_type sizeMap = ( m_Size - index.m_Index ) + n;

            cl_int l_Error = CL_SUCCESS;
            naked_pointer ptrBuff = reinterpret_cast< naked_pointer >( m_commQueue.enqueueMapBuffer( m_devMemory, true, CL_MAP_READ | CL_MAP_WRITE,
                index.m_Index * sizeof( value_type ), sizeMap * sizeof( value_type ), NULL, NULL, &l_Error ) );
            V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for iterator insert" );

            // Shuffle the old values n elements down.
            ::memmove( ptrBuff + n, ptrBuff, ( sizeMap - n ) * sizeof( value_type ) );

#if defined(_WIN32)
            std::copy( begin, end, stdext::checked_array_iterator< naked_pointer >( ptrBuff, n ) );
#else
            std::copy( begin, end, ptrBuff );
#endif

            ::cl::Event unmapEvent;
            l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory, ptrBuff, NULL, &unmapEvent );
            V_OPENCL( l_Error, "device_vector failed to unmap host memory back to device memory" );
            V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );

            m_Size += n;
        }

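        // insert( ) usage sketch (illustrative): all three overloads map the tail of the buffer,
        // memmove the existing elements n slots down, and write the new values in place; growth
        // here is linear ( reserve( m_Size + n ) ) rather than exponential, per the TODO above.
        //
        //     bolt::cl::device_vector< int > dv( 3, 0 );    // { 0, 0, 0 }
        //     dv.insert( dv.cbegin( ) + 1, 7 );             // { 0, 7, 0, 0 }
        //     dv.insert( dv.cbegin( ), 2, 5 );              // { 5, 5, 0, 7, 0, 0 }
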
        void assign( size_type newSize, const value_type& value )
        {
            if( newSize > m_Size )
            {
                reserve( newSize );
            }
            m_Size = newSize;

            cl_int l_Error = CL_SUCCESS;

            ::cl::Event fillEvent;
            size_t sizeDS = sizeof( value_type );

            if( !( sizeDS & ( sizeDS - 1 ) ) )  // 2^n data types
            {
                l_Error = m_commQueue.enqueueFillBuffer< value_type >( m_devMemory,
                    value,
                    0,
                    m_Size * sizeof( value_type ),
                    NULL,
                    &fillEvent );
                V_OPENCL( l_Error, "device_vector failed to fill the new data with the provided pattern" );
            }
            else
            {
                // Map the buffer to host
                ::cl::Event fill_mapEvent;
                value_type* host_buffer = ( value_type* )m_commQueue.enqueueMapBuffer( m_devMemory,
                    false,
                    CL_MAP_READ | CL_MAP_WRITE,
                    0,
                    sizeof( value_type ) * newSize,
                    NULL,
                    &fill_mapEvent,
                    &l_Error );

                V_OPENCL( l_Error, "Error calling map on device_vector buffer. Fill device_vector" );
                fill_mapEvent.wait( );

                // Use serial fill_n to fill the device_vector with value
#if defined(_WIN32)
                std::fill_n( stdext::checked_array_iterator< naked_pointer >( host_buffer, newSize ),
                    newSize,
                    value );
#else
                std::fill_n( host_buffer,
                    newSize,
                    value );
#endif

                // Unmap the buffer
                l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory,
                    host_buffer,
                    NULL,
                    &fillEvent );
                V_OPENCL( l_Error, "Error calling unmap on device_vector buffer. Fill device_vector" );
            }

            // Not allowed to return until the fill operation is finished.
            l_Error = fillEvent.wait( );
            V_OPENCL( l_Error, "device_vector failed to wait for fill event" );
        }

        template< typename InputIterator >
        typename std::enable_if< !std::is_integral< InputIterator >::value, void >::type
        assign( InputIterator begin, InputIterator end )
        {
            size_type l_Count = static_cast< size_type >( std::distance( begin, end ) );

            if( l_Count > m_Size )
            {
                reserve( l_Count );
            }
            m_Size = l_Count;

            cl_int l_Error = CL_SUCCESS;

            naked_pointer ptrBuffer = reinterpret_cast< naked_pointer >( m_commQueue.enqueueMapBuffer( m_devMemory, CL_TRUE,
                CL_MAP_WRITE_INVALIDATE_REGION, 0, m_Size * sizeof( value_type ), NULL, NULL, &l_Error ) );
            V_OPENCL( l_Error, "device_vector failed to map device memory to host memory for assign" );

#if defined(_WIN32)
            std::copy( begin, end, stdext::checked_array_iterator< naked_pointer >( ptrBuffer, m_Size ) );
#else
            std::copy( begin, end, ptrBuffer );
#endif
            ::cl::Event unmapEvent;
            l_Error = m_commQueue.enqueueUnmapMemObject( m_devMemory, ptrBuffer, NULL, &unmapEvent );
            V_OPENCL( l_Error, "device_vector failed to unmap host memory back to device memory" );
            V_OPENCL( unmapEvent.wait( ), "failed to wait for unmap event" );
        }


        const ::cl::Buffer& getBuffer( ) const
        {
            return m_devMemory;
        }

        ::cl::Buffer& getBuffer( )
        {
            return m_devMemory;
        }

    private:
        ::cl::Buffer m_devMemory;
        ::cl::CommandQueue m_commQueue;
        size_type m_Size;
        cl_mem_flags m_Flags;
    };

    // This string represents the device-side definition of the device_vector iterator template
    static std::string deviceVectorIteratorTemplate =
        std::string( "#if !defined(BOLT_CL_DEVICE_ITERATOR) \n#define BOLT_CL_DEVICE_ITERATOR \n" ) +
        STRINGIFY_CODE(

        namespace bolt { namespace cl { \n
        template< typename T > \n
        class device_vector \n
        { \n
        public: \n
            class iterator \n
            { \n
            public: \n
                typedef int iterator_category;  // device code does not understand std:: tags \n
                typedef T value_type; \n
                typedef T base_type; \n
                typedef int difference_type; \n
                typedef int size_type; \n
                typedef T* pointer; \n
                typedef T& reference; \n

                iterator( value_type init ): m_StartIndex( init ), m_Ptr( 0 ) \n
                {}; \n

                void init( global value_type* ptr ) \n
                { \n
                    m_Ptr = ptr; \n
                }; \n

                global value_type& operator[]( size_type threadID ) const \n
                { \n
                    return m_Ptr[ m_StartIndex + threadID ]; \n
                } \n

                value_type operator*( ) const \n
                { \n
                    return m_Ptr[ m_StartIndex + threadID ]; \n
                } \n

                size_type m_StartIndex; \n
                global value_type* m_Ptr; \n
            }; \n
        }; \n
        } } \n
        ) +
        std::string( "#endif \n" );
}   // namespace cl
}   // namespace bolt

BOLT_CREATE_CLCODE( bolt::cl::device_vector< cl_int >::iterator, bolt::cl::deviceVectorIteratorTemplate );

/* Now derive each of the OpenCL application data types from the cl_int data type. */
// Visual Studio 2012 is not able to map char to cl_char. Hence this typename is added.


#endif