VexCL
Namespaces | Classes | Typedefs | Functions | Variables
vex Namespace Reference

Vector expression template library for OpenCL. More...

Namespaces

namespace  fft
 Fast Fourier Transform.
 
namespace  Filter
 Device filters.
 
namespace  generator
 Kernel generation interface.
 
namespace  mpi
 MPI wrappers for VexCL types.
 
namespace  random
 Random generators.
 

Classes

class  Context
 VexCL context holder. More...
 
struct  FFT
 Fast Fourier Transform. More...
 
class  multivector
 Container for several vex::vectors. More...
 
class  profiler
 Class for gathering and printing OpenCL and Host profiling info. More...
 
struct  Random
 A random generator. More...
 
struct  RandomNormal
 Returns normally distributed random numbers. More...
 
struct  SUM
 Summation. Should be used as a template parameter for Reductor class. More...
 
struct  MAX
 Maximum element. Should be used as a template parameter for Reductor class. More...
 
struct  MIN
 Minimum element. Should be used as a template parameter for Reductor class. More...
 
class  Reductor
 Parallel reduction of arbitrary expression. More...
 
class  SpMat
 Sparse matrix in hybrid ELL-CSR format. More...
 
class  SpMatCCSR
 Sparse matrix in CCSR format. More...
 
class  stencil
 Stencil. More...
 
class  StencilOperator
 User-defined stencil operator. More...
 
struct  cl_scalar_of
 Get the corresponding scalar type for a CL vector (or scalar) type. More...
 
struct  cl_vector_of
 Get the corresponding vector type for a CL scalar type. More...
 
struct  cl_vector_length
 Get the number of values in a CL vector (or scalar) type. More...
 
struct  is_cl_native
 Declares a type as CL native, allows using it as a literal. More...
 
class  vector
 Device vector. More...
 

Typedefs

typedef multivector_expression
< typename
boost::proto::terminal
< multivector_terminal >::type > 
multivector_terminal_expression
 
typedef std::map< cl_context,
kernel_cache_entry > 
kernel_cache
 
typedef vector_expression
< typename
boost::proto::terminal
< vector_terminal >::type > 
vector_terminal_expression
 

Functions

template<class DevFilter = Filter::AllFilter>
std::vector< cl::Device > device_list (DevFilter filter=Filter::All)
 Select devices by given criteria. More...
 
template<class DevFilter = Filter::AllFilter>
std::pair< std::vector
< cl::Context >, std::vector
< cl::CommandQueue > > 
queue_list (DevFilter filter=Filter::All, cl_command_queue_properties properties=0)
 Create command queues on devices by given criteria. More...
 
const Context & current_context ()
 
template<typename T >
void inclusive_scan (const vex::vector< T > &src, vex::vector< T > &dst)
 
template<typename T >
void sort (vex::vector< T > &x)
 Sort. More...
 
template<class T , size_t N, bool own>
void copy (const multivector< T, N, own > &mv, std::vector< T > &hv)
 Copy multivector to host vector.
 
template<class T , size_t N, bool own>
void copy (const std::vector< T > &hv, multivector< T, N, own > &mv)
 Copy host vector to multivector.
 
template<typename T , class... Tail>
std::enable_if< And
< std::is_same< T, Tail >
...>::value, multivector< T,
sizeof...(Tail)+1, false >
>::type 
tie (vex::vector< T > &head, vex::vector< Tail > &...tail)
 Ties several vex::vectors into a multivector. More...
 
boost::proto::result_of::as_expr
< elem_index, vector_domain >
::type 
element_index (size_t offset=0)
 When used in vector expression, returns current element index plus offset.
 
double device_spmv_perf (const cl::CommandQueue &)
 Weights device with respect to spmv performance. More...
 
template<typename T >
conv< stencil< T >, vector< T > > operator* (const stencil< T > &s, const vector< T > &x)
 
template<typename T >
conv< stencil< T >, vector< T > > operator* (const vector< T > &x, const stencil< T > &s)
 
template<class To , class From >
To cl_convert (const From &val)
 Convert each element of the vector to another type.
 
template<class T >
std::string type_name ()
 Convert typename to string.
 
template<>
std::string type_name< cl_float > ()
 
template<>
std::string type_name< cl_float2 > ()
 
template<>
std::string type_name< cl_float4 > ()
 
template<>
std::string type_name< cl_float8 > ()
 
template<>
std::string type_name< cl_float16 > ()
 
template<>
std::string type_name< cl_double > ()
 
template<>
std::string type_name< cl_double2 > ()
 
template<>
std::string type_name< cl_double4 > ()
 
template<>
std::string type_name< cl_double8 > ()
 
template<>
std::string type_name< cl_double16 > ()
 
template<>
std::string type_name< cl_char > ()
 
template<>
std::string type_name< cl_char2 > ()
 
template<>
std::string type_name< cl_char4 > ()
 
template<>
std::string type_name< cl_char8 > ()
 
template<>
std::string type_name< cl_char16 > ()
 
template<>
std::string type_name< cl_uchar > ()
 
template<>
std::string type_name< cl_uchar2 > ()
 
template<>
std::string type_name< cl_uchar4 > ()
 
template<>
std::string type_name< cl_uchar8 > ()
 
template<>
std::string type_name< cl_uchar16 > ()
 
template<>
std::string type_name< cl_short > ()
 
template<>
std::string type_name< cl_short2 > ()
 
template<>
std::string type_name< cl_short4 > ()
 
template<>
std::string type_name< cl_short8 > ()
 
template<>
std::string type_name< cl_short16 > ()
 
template<>
std::string type_name< cl_ushort > ()
 
template<>
std::string type_name< cl_ushort2 > ()
 
template<>
std::string type_name< cl_ushort4 > ()
 
template<>
std::string type_name< cl_ushort8 > ()
 
template<>
std::string type_name< cl_ushort16 > ()
 
template<>
std::string type_name< cl_int > ()
 
template<>
std::string type_name< cl_int2 > ()
 
template<>
std::string type_name< cl_int4 > ()
 
template<>
std::string type_name< cl_int8 > ()
 
template<>
std::string type_name< cl_int16 > ()
 
template<>
std::string type_name< cl_uint > ()
 
template<>
std::string type_name< cl_uint2 > ()
 
template<>
std::string type_name< cl_uint4 > ()
 
template<>
std::string type_name< cl_uint8 > ()
 
template<>
std::string type_name< cl_uint16 > ()
 
template<>
std::string type_name< cl_long > ()
 
template<>
std::string type_name< cl_long2 > ()
 
template<>
std::string type_name< cl_long4 > ()
 
template<>
std::string type_name< cl_long8 > ()
 
template<>
std::string type_name< cl_long16 > ()
 
template<>
std::string type_name< cl_ulong > ()
 
template<>
std::string type_name< cl_ulong2 > ()
 
template<>
std::string type_name< cl_ulong4 > ()
 
template<>
std::string type_name< cl_ulong8 > ()
 
template<>
std::string type_name< cl_ulong16 > ()
 
size_t nextpow2 (size_t x)
 Return next power of 2.
 
size_t alignup (size_t n, size_t m=16U)
 Align n to the next multiple of m.
 
template<size_t I, class Function , class Tuple >
std::enable_if<(I==std::tuple_size
< Tuple >::value), void >
::type 
for_each (const Tuple &, Function &)
 Iterate over tuple elements.
 
size_t num_workgroups (const cl::Device &device)
 Standard number of workgroups to launch on a device.
 
double device_vector_perf (const cl::CommandQueue &)
 Weights device with respect to vector performance. More...
 
double equal_weights (const cl::CommandQueue &)
 Assigns equal weight to each device. More...
 
void set_partitioning (std::function< double(const cl::CommandQueue &) > f)
 Partitioning scheme for vectors and matrices. More...
 
std::vector< size_t > partition (size_t n, const std::vector< cl::CommandQueue > &queue)
 
template<class T >
void copy (const vex::vector< T > &dv, std::vector< T > &hv, cl_bool blocking=CL_TRUE)
 Copy device vector to host vector.
 
template<class T >
void copy (const vex::vector< T > &dv, T *hv, cl_bool blocking=CL_TRUE)
 Copy device vector to host pointer.
 
template<class T >
void copy (const std::vector< T > &hv, vex::vector< T > &dv, cl_bool blocking=CL_TRUE)
 Copy host vector to device vector.
 
template<class T >
void copy (const T *hv, vex::vector< T > &dv, cl_bool blocking=CL_TRUE)
 Copy host pointer to device vector.
 
template<class InputIterator , class OutputIterator >
std::enable_if< std::is_same
< typename
std::iterator_traits
< InputIterator >::value_type,
typename std::iterator_traits
< OutputIterator >::value_type >
::value &&stored_on_device
< InputIterator >::value
&&!stored_on_device
< OutputIterator >::value,
OutputIterator >::type 
copy (InputIterator first, InputIterator last, OutputIterator result, cl_bool blocking=CL_TRUE)
 Copy range from device vector to host vector.
 
template<class InputIterator , class OutputIterator >
std::enable_if< std::is_same
< typename
std::iterator_traits
< InputIterator >::value_type,
typename std::iterator_traits
< OutputIterator >::value_type >
::value &&!stored_on_device
< InputIterator >::value
&&stored_on_device
< OutputIterator >::value,
OutputIterator >::type 
copy (InputIterator first, InputIterator last, OutputIterator result, cl_bool blocking=CL_TRUE)
 Copy range from host vector to device vector.
 
template<typename T >
void swap (vector< T > &x, vector< T > &y)
 Swap two vectors.
 
template<class T >
std::ostream & operator<< (std::ostream &o, const vex::vector< T > &t)
 Download and print the vector elements.
 

Variables

template<size_t I, class Function , class Tuple >
std::enable_if<(I
< std::tuple_size< Tuple >
::value), void >::typefor_each(const
Tuple &v, Function &f){f(std::get
< I >v));for_each< I+1 >v, f);}inline
cl::Context qctx(const
cl::CommandQueue &q){cl::Context
ctx;q.getInfo(CL_QUEUE_CONTEXT,&ctx);return
ctx;}inline cl::Device qdev(const
cl::CommandQueue &q){cl::Device
dev;q.getInfo(CL_QUEUE_DEVICE,&dev);return
dev;}inline bool is_cpu(const
cl::Device &d){return
d.getInfo< CL_DEVICE_TYPE >
)&CL_DEVICE_TYPE_CPU;}enum
device_options_kind{compile_options,
program_header};template
< device_options_kind kind >
struct device_options{static
const std::string &get(const
cl::Device &dev){if(options[dev()].empty())
options[dev()].push_back("");return
options[dev()].back();}static
void push(const cl::Device
&dev, const std::string &str){options[dev()].push_back(str);}static
void pop(const cl::Device &dev){if(!options[dev()].empty())
options[dev()].pop_back();}private:static
std::map< cl_device_id,
std::vector< std::string >
> options;};template
< device_options_kind kind >
std::map< cl_device_id,
std::vector< std::string >
> device_options< kind >
::options;inline std::string
get_compile_options(const
cl::Device &dev){return
device_options
< compile_options >::get(dev);}inline
std::string get_program_header(const
cl::Device &dev){return
device_options< program_header >
::get(dev);}inline void
push_compile_options(const
cl::Device &dev, const
std::string &str){device_options
< compile_options >::push(dev,
str);}inline void
pop_compile_options(const
cl::Device &dev){device_options
< compile_options >::pop(dev);}inline
void push_program_header(const
cl::Device &dev, const
std::string &str){device_options
< program_header >::push(dev,
str);}inline void
pop_program_header(const
cl::Device &dev){device_options
< program_header >::pop(dev);}inline
void push_compile_options(const
std::vector< cl::CommandQueue >
&queue, const std::string
&str){for(auto q=queue.begin();q!=queue.end();++q)
device_options
< compile_options >::push(qdev(*q),
str);}inline void
pop_compile_options(const
std::vector< cl::CommandQueue >
&queue){for(auto q=queue.begin();q!=queue.end();++q)
device_options
< compile_options >::pop(qdev(*q));}inline
void push_program_header(const
std::vector< cl::CommandQueue >
&queue, const std::string
&str){for(auto q=queue.begin();q!=queue.end();++q)
device_options< program_header >
::push(qdev(*q), str);}inline
void pop_program_header(const
std::vector< cl::CommandQueue >
&queue){for(auto q=queue.begin();q!=queue.end();++q)
device_options< program_header >
::pop(qdev(*q));}inline
std::string
standard_kernel_header(const
cl::Device &dev){return
std::string("#if defined(cl_khr_fp64)\n""#
pragma OPENCL EXTENSION
cl_khr_fp64: enable\n""#elif
defined(cl_amd_fp64)\n""#
pragma OPENCL EXTENSION
cl_amd_fp64: enable\n""#endif\n")+get_program_header(dev);}inline
cl::Program build_sources(const
cl::Context &context, const
std::string &source, const
std::string &options=""){cl::Program
program(context,
cl::Program::Sources(1,
std::make_pair(source.c_str(),
source.size())));auto device=context.getInfo
< CL_CONTEXT_DEVICES >);try{program.build(device,(options+"
"+get_compile_options(device[0])).c_str());}catch(const
cl::Error &){std::cerr
<< source<< std::endl
<< program.getBuildInfo
< CL_PROGRAM_BUILD_LOG >device[0])
<< std::endl;throw;}return
program;}inline uint
kernel_workgroup_size(const
cl::Kernel &kernel, const
cl::Device &device){size_t
wgsz=1024U;uint dev_wgsz=kernel.getWorkGroupInfo< CL_KERNEL_WORK_GROUP_SIZE >device);while(wgsz > dev_wgsz 
wgsz = 2
 Iterate over tuple elements.
 

Detailed Description

Vector expression template library for OpenCL.

Function Documentation

template<class DevFilter = Filter::AllFilter>
std::vector<cl::Device> vex::device_list ( DevFilter  filter = Filter::All)

Select devices by given criteria.

Parameters
filter: Device filter functor. Functors may be combined with logical operators.
Returns
list of devices satisfying the provided filter.

This example selects any GPU which supports double precision arithmetic:

auto devices = device_list(
Filter::Type(CL_DEVICE_TYPE_GPU) && Filter::DoublePrecision
);
double vex::device_spmv_perf ( const cl::CommandQueue &  q)
inline

Weights device with respect to spmv performance.

Returns device weight after spmv test.

Launches the following kernel on each device:

y = A * x;

where x and y are vectors, and A is the matrix for a 3D Poisson problem in a square domain. Each device gets a portion of the vector proportional to the performance of this operation.

References vex::SpMat< real, column_t, idx_t >::mul(), vex::profiler::tic_cl(), and vex::profiler::toc().

double vex::device_vector_perf ( const cl::CommandQueue &  q)
inline

Weights device with respect to vector performance.

Returns device weight after simple bandwidth test.

Launches the following kernel on each device:

a = b + c;

where a, b and c are device vectors. Each device gets a portion of the vector proportional to the performance of this operation.

References vex::profiler::tic_cl(), and vex::profiler::toc().

double vex::equal_weights ( const cl::CommandQueue &  )
inline

Assigns equal weight to each device.

This results in equal partitioning.

template<class DevFilter = Filter::AllFilter>
std::pair<std::vector<cl::Context>, std::vector<cl::CommandQueue> > vex::queue_list ( DevFilter  filter = Filter::All,
cl_command_queue_properties  properties = 0 
)

Create command queues on devices by given criteria.

Parameters
filter: Device filter functor. Functors may be combined with logical operators.
properties: Command queue properties.
Returns
list of queues associated with selected devices.
See Also
device_list

Referenced by vex::Context::Context().

void vex::set_partitioning ( std::function< double(const cl::CommandQueue &) >  f)
inline

Partitioning scheme for vectors and matrices.

Should be set once before any object of vector or matrix type is declared. Otherwise the default partitioning function (partition_by_vector_perf) is selected.

template<typename T >
void vex::sort ( vex::vector< T > &  x)

Sort.

If there is more than one device in the vector's queue list, then all partitions are sorted individually on the GPUs and then merged on the CPU.

References vex::vector< T >::begin(), copy(), vex::vector< T >::end(), vex::vector< T >::part_size(), vex::vector< T >::part_start(), vex::vector< T >::queue_list(), and vex::vector< T >::size().

template<typename T , class... Tail>
std::enable_if< And<std::is_same<T,Tail>...>::value, multivector<T, sizeof...(Tail) + 1, false> >::type vex::tie ( vex::vector< T > &  head,
vex::vector< Tail > &...  tail 
)

Ties several vex::vectors into a multivector.

The following example results in a single kernel:

vex::vector<double> x(ctx, 1024);
vex::vector<double> y(ctx, 1024);
vex::tie(x,y) = std::tie( x + y, y - x );

This is functionally equivalent to

tmp_x = x + y;
tmp_y = y - x;
x = tmp_x;
y = tmp_y;

but does not use temporaries and is more efficient.

Referenced by vex::generator::build_kernel(), vex::Context::Context(), and vex::generator::Kernel< NP >::operator()().