stabilize build system: depends, installer, boost/bdb fixes, cross targets groundwork

This commit is contained in:
2026-02-24 18:38:47 +00:00
parent da8c28aaeb
commit 65cb2619a7
13106 changed files with 2484322 additions and 1804 deletions
@@ -0,0 +1,184 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_ACCUMULATE_HPP
#define BOOST_COMPUTE_ALGORITHM_ACCUMULATE_HPP
#include <iterator>
#include <limits>
#include <boost/preprocessor/seq/for_each.hpp>
#include <boost/compute/system.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/reduce.hpp>
#include <boost/compute/algorithm/detail/serial_accumulate.hpp>
#include <boost/compute/container/array.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
namespace boost {
namespace compute {
namespace detail {
template<class InputIterator, class T, class BinaryFunction>
inline T generic_accumulate(InputIterator first,
InputIterator last,
T init,
BinaryFunction function,
command_queue &queue)
{
const context &context = queue.get_context();
size_t size = iterator_range_size(first, last);
if(size == 0){
return init;
}
// accumulate on device
array<T, 1> device_result(context);
detail::serial_accumulate(
first, last, device_result.begin(), init, function, queue
);
// copy result to host
T result;
::boost::compute::copy_n(device_result.begin(), 1, &result, queue);
return result;
}
// returns true if we can use reduce() instead of accumulate() when
// accumulate() this is true when the function is commutative (such as
// addition of integers) and the initial value is the identity value
// for the operation (zero for addition, one for multiplication).
template<class T, class F>
inline bool can_accumulate_with_reduce(T init, F function)
{
(void) init;
(void) function;
return false;
}
/// \internal_
#define BOOST_COMPUTE_DETAIL_DECLARE_CAN_ACCUMULATE_WITH_REDUCE(r, data, type) \
inline bool can_accumulate_with_reduce(type init, plus<type>) \
{ \
return init == type(0); \
} \
inline bool can_accumulate_with_reduce(type init, multiplies<type>) \
{ \
return init == type(1); \
}
BOOST_PP_SEQ_FOR_EACH(
BOOST_COMPUTE_DETAIL_DECLARE_CAN_ACCUMULATE_WITH_REDUCE,
_,
(char_)(uchar_)(short_)(ushort_)(int_)(uint_)(long_)(ulong_)
)
template<class T>
inline bool can_accumulate_with_reduce(T init, min<T>)
{
return init == (std::numeric_limits<T>::max)();
}
template<class T>
inline bool can_accumulate_with_reduce(T init, max<T>)
{
return init == (std::numeric_limits<T>::min)();
}
#undef BOOST_COMPUTE_DETAIL_DECLARE_CAN_ACCUMULATE_WITH_REDUCE
template<class InputIterator, class T, class BinaryFunction>
inline T dispatch_accumulate(InputIterator first,
InputIterator last,
T init,
BinaryFunction function,
command_queue &queue)
{
size_t size = iterator_range_size(first, last);
if(size == 0){
return init;
}
if(can_accumulate_with_reduce(init, function)){
T result;
reduce(first, last, &result, function, queue);
return result;
}
else {
return generic_accumulate(first, last, init, function, queue);
}
}
} // end detail namespace
/// Returns the result of applying \p function to the elements in the
/// range [\p first, \p last) and \p init.
///
/// If no function is specified, \c plus will be used.
///
/// \param first first element in the input range
/// \param last last element in the input range
/// \param init initial value
/// \param function binary reduction function
/// \param queue command queue to perform the operation
///
/// \return the accumulated result value
///
/// In specific situations the call to \c accumulate() can be automatically
/// optimized to a call to the more efficient \c reduce() algorithm. This
/// occurs when the binary reduction function is recognized as associative
/// (such as the \c plus<int> function).
///
/// Note that because floating-point addition is not associative, calling
/// \c accumulate() with \c plus<float> results in a less efficient serial
/// reduction algorithm being executed. If a slight loss in precision is
/// acceptable, the more efficient parallel \c reduce() algorithm should be
/// used instead.
///
/// For example:
/// \code
/// // with vec = boost::compute::vector<int>
/// accumulate(vec.begin(), vec.end(), 0, plus<int>()); // fast
/// reduce(vec.begin(), vec.end(), &result, plus<int>()); // fast
///
/// // with vec = boost::compute::vector<float>
/// accumulate(vec.begin(), vec.end(), 0, plus<float>()); // slow
/// reduce(vec.begin(), vec.end(), &result, plus<float>()); // fast
/// \endcode
///
/// \see reduce()
template<class InputIterator, class T, class BinaryFunction>
inline T accumulate(InputIterator first,
InputIterator last,
T init,
BinaryFunction function,
command_queue &queue = system::default_queue())
{
return detail::dispatch_accumulate(first, last, init, function, queue);
}
/// \overload
template<class InputIterator, class T>
inline T accumulate(InputIterator first,
InputIterator last,
T init,
command_queue &queue = system::default_queue())
{
typedef typename std::iterator_traits<InputIterator>::value_type IT;
return detail::dispatch_accumulate(first, last, init, plus<IT>(), queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_ACCUMULATE_HPP
@@ -0,0 +1,116 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_ADJACENT_DIFFERENCE_HPP
#define BOOST_COMPUTE_ALGORITHM_ADJACENT_DIFFERENCE_HPP
#include <iterator>
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/functional/operator.hpp>
#include <boost/compute/container/vector.hpp>
namespace boost {
namespace compute {
namespace detail {
template<class InputIterator, class OutputIterator, class BinaryFunction>
inline OutputIterator
dispatch_adjacent_difference(InputIterator first,
InputIterator last,
OutputIterator result,
BinaryFunction op,
command_queue &queue = system::default_queue())
{
size_t count = detail::iterator_range_size(first, last);
detail::meta_kernel k("adjacent_difference");
k << "const uint i = get_global_id(0);\n"
<< "if(i == 0){\n"
<< " " << result[k.var<uint_>("0")] << " = " << first[k.var<uint_>("0")] << ";\n"
<< "}\n"
<< "else {\n"
<< " " << result[k.var<uint_>("i")] << " = "
<< op(first[k.var<uint_>("i")], first[k.var<uint_>("i-1")]) << ";\n"
<< "}\n";
k.exec_1d(queue, 0, count, 1);
return result + count;
}
} // end detail namespace
/// Stores the difference of each pair of consecutive values in the range
/// [\p first, \p last) to the range beginning at \p result. If \p op is not
/// provided, \c minus<T> is used.
///
/// \param first first element in the input range
/// \param last last element in the input range
/// \param result first element in the output range
/// \param op binary difference function
/// \param queue command queue to perform the operation
///
/// \return \c OutputIterator to the end of the result range
///
/// \see adjacent_find()
template<class InputIterator, class OutputIterator, class BinaryFunction>
inline OutputIterator
adjacent_difference(InputIterator first,
InputIterator last,
OutputIterator result,
BinaryFunction op,
command_queue &queue = system::default_queue())
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
if(first == last) {
return result;
}
if (first == result) {
vector<value_type> temp(detail::iterator_range_size(first, last),
queue.get_context());
copy(first, last, temp.begin(), queue);
return ::boost::compute::detail::dispatch_adjacent_difference(
temp.begin(), temp.end(), result, op, queue
);
}
else {
return ::boost::compute::detail::dispatch_adjacent_difference(
first, last, result, op, queue
);
}
}
/// \overload
template<class InputIterator, class OutputIterator>
inline OutputIterator
adjacent_difference(InputIterator first,
InputIterator last,
OutputIterator result,
command_queue &queue = system::default_queue())
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
return ::boost::compute::adjacent_difference(
first, last, result, ::boost::compute::minus<value_type>(), queue
);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_ADJACENT_DIFFERENCE_HPP
@@ -0,0 +1,162 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_ADJACENT_FIND_HPP
#define BOOST_COMPUTE_ALGORITHM_ADJACENT_FIND_HPP
#include <iterator>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/lambda.hpp>
#include <boost/compute/system.hpp>
#include <boost/compute/container/detail/scalar.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/functional/operator.hpp>
#include <boost/compute/type_traits/vector_size.hpp>
namespace boost {
namespace compute {
namespace detail {
template<class InputIterator, class Compare>
inline InputIterator
serial_adjacent_find(InputIterator first,
InputIterator last,
Compare compare,
command_queue &queue)
{
if(first == last){
return last;
}
const context &context = queue.get_context();
detail::scalar<uint_> output(context);
detail::meta_kernel k("serial_adjacent_find");
size_t size_arg = k.add_arg<const uint_>("size");
size_t output_arg = k.add_arg<uint_ *>(memory_object::global_memory, "output");
k << k.decl<uint_>("result") << " = size;\n"
<< "for(uint i = 0; i < size - 1; i++){\n"
<< " if(" << compare(first[k.expr<uint_>("i")],
first[k.expr<uint_>("i+1")]) << "){\n"
<< " result = i;\n"
<< " break;\n"
<< " }\n"
<< "}\n"
<< "*output = result;\n";
k.set_arg<const uint_>(
size_arg, static_cast<uint_>(detail::iterator_range_size(first, last))
);
k.set_arg(output_arg, output.get_buffer());
k.exec_1d(queue, 0, 1, 1);
return first + output.read(queue);
}
template<class InputIterator, class Compare>
inline InputIterator
adjacent_find_with_atomics(InputIterator first,
InputIterator last,
Compare compare,
command_queue &queue)
{
if(first == last){
return last;
}
const context &context = queue.get_context();
size_t count = detail::iterator_range_size(first, last);
// initialize output to the last index
detail::scalar<uint_> output(context);
output.write(static_cast<uint_>(count), queue);
detail::meta_kernel k("adjacent_find_with_atomics");
size_t output_arg = k.add_arg<uint_ *>(memory_object::global_memory, "output");
k << "const uint i = get_global_id(0);\n"
<< "if(" << compare(first[k.expr<uint_>("i")],
first[k.expr<uint_>("i+1")]) << "){\n"
<< " atomic_min(output, i);\n"
<< "}\n";
k.set_arg(output_arg, output.get_buffer());
k.exec_1d(queue, 0, count - 1, 1);
return first + output.read(queue);
}
} // end detail namespace
/// Searches the range [\p first, \p last) for two identical adjacent
/// elements and returns an iterator pointing to the first.
///
/// \param first first element in the range to search
/// \param last last element in the range to search
/// \param compare binary comparison function
/// \param queue command queue to perform the operation
///
/// \return \c InputIterator to the first element which compares equal
/// to the following element. If none are equal, returns \c last.
///
/// \see find(), adjacent_difference()
template<class InputIterator, class Compare>
inline InputIterator
adjacent_find(InputIterator first,
InputIterator last,
Compare compare,
command_queue &queue = system::default_queue())
{
size_t count = detail::iterator_range_size(first, last);
if(count < 32){
return detail::serial_adjacent_find(first, last, compare, queue);
}
else {
return detail::adjacent_find_with_atomics(first, last, compare, queue);
}
}
/// \overload
template<class InputIterator>
inline InputIterator
adjacent_find(InputIterator first,
InputIterator last,
command_queue &queue = system::default_queue())
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
using ::boost::compute::lambda::_1;
using ::boost::compute::lambda::_2;
using ::boost::compute::lambda::all;
if(vector_size<value_type>::value == 1){
return ::boost::compute::adjacent_find(
first, last, _1 == _2, queue
);
}
else {
return ::boost::compute::adjacent_find(
first, last, all(_1 == _2), queue
);
}
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_ADJACENT_FIND_HPP
@@ -0,0 +1,36 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_ALL_OF_HPP
#define BOOST_COMPUTE_ALGORITHM_ALL_OF_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/algorithm/find_if_not.hpp>
namespace boost {
namespace compute {
/// Returns \c true if \p predicate returns \c true for all of the elements in
/// the range [\p first, \p last).
///
/// \see any_of(), none_of()
template<class InputIterator, class UnaryPredicate>
inline bool all_of(InputIterator first,
InputIterator last,
UnaryPredicate predicate,
command_queue &queue = system::default_queue())
{
return ::boost::compute::find_if_not(first, last, predicate, queue) == last;
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_ALL_OF_HPP
@@ -0,0 +1,40 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_ANY_OF_HPP
#define BOOST_COMPUTE_ALGORITHM_ANY_OF_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/algorithm/find_if.hpp>
namespace boost {
namespace compute {
/// Returns \c true if \p predicate returns \c true for any of the elements in
/// the range [\p first, \p last).
///
/// For example, to test if a vector contains any negative values:
///
/// \snippet test/test_any_all_none_of.cpp any_of
///
/// \see all_of(), none_of()
template<class InputIterator, class UnaryPredicate>
inline bool any_of(InputIterator first,
InputIterator last,
UnaryPredicate predicate,
command_queue &queue = system::default_queue())
{
return ::boost::compute::find_if(first, last, predicate, queue) != last;
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_ANY_OF_HPP
@@ -0,0 +1,37 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_BINARY_SEARCH_HPP
#define BOOST_COMPUTE_ALGORITHM_BINARY_SEARCH_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/lower_bound.hpp>
namespace boost {
namespace compute {
/// Returns \c true if \p value is in the sorted range [\p first,
/// \p last).
template<class InputIterator, class T>
inline bool binary_search(InputIterator first,
InputIterator last,
const T &value,
command_queue &queue = system::default_queue())
{
InputIterator position = lower_bound(first, last, value, queue);
return position != last && position.read(queue) == value;
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_BINARY_SEARCH_HPP
@@ -0,0 +1,856 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_COPY_HPP
#define BOOST_COMPUTE_ALGORITHM_COPY_HPP
#include <algorithm>
#include <iterator>
#include <boost/utility/enable_if.hpp>
#include <boost/mpl/and.hpp>
#include <boost/mpl/not.hpp>
#include <boost/mpl/or.hpp>
#include <boost/compute/buffer.hpp>
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/detail/copy_on_device.hpp>
#include <boost/compute/algorithm/detail/copy_to_device.hpp>
#include <boost/compute/algorithm/detail/copy_to_host.hpp>
#include <boost/compute/async/future.hpp>
#include <boost/compute/container/mapped_view.hpp>
#include <boost/compute/detail/device_ptr.hpp>
#include <boost/compute/detail/is_contiguous_iterator.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/parameter_cache.hpp>
#include <boost/compute/iterator/buffer_iterator.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/type_traits/is_device_iterator.hpp>
namespace boost {
namespace compute {
namespace detail {
namespace mpl = boost::mpl;
// meta-function returning true if copy() between InputIterator and
// OutputIterator can be implemented with clEnqueueCopyBuffer().
template<class InputIterator, class OutputIterator>
struct can_copy_with_copy_buffer :
mpl::and_<
mpl::or_<
boost::is_same<
InputIterator,
buffer_iterator<typename InputIterator::value_type>
>,
boost::is_same<
InputIterator,
detail::device_ptr<typename InputIterator::value_type>
>
>,
mpl::or_<
boost::is_same<
OutputIterator,
buffer_iterator<typename OutputIterator::value_type>
>,
boost::is_same<
OutputIterator,
detail::device_ptr<typename OutputIterator::value_type>
>
>,
boost::is_same<
typename InputIterator::value_type,
typename OutputIterator::value_type
>
>::type {};
// meta-function returning true if value_types of HostIterator and
// DeviceIterator are same
template<class HostIterator, class DeviceIterator>
struct is_same_value_type :
boost::is_same<
typename boost::remove_cv<
typename std::iterator_traits<HostIterator>::value_type
>::type,
typename boost::remove_cv<
typename DeviceIterator::value_type
>::type
>::type {};
// meta-function returning true if value_type of HostIterator is bool
template<class HostIterator>
struct is_bool_value_type :
boost::is_same<
typename boost::remove_cv<
typename std::iterator_traits<HostIterator>::value_type
>::type,
bool
>::type {};
// host -> device (async)
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
InputIterator last,
OutputIterator result,
command_queue &queue,
typename boost::enable_if<
mpl::and_<
mpl::not_<
is_device_iterator<InputIterator>
>,
is_device_iterator<OutputIterator>,
is_same_value_type<InputIterator, OutputIterator>
>
>::type* = 0)
{
BOOST_STATIC_ASSERT_MSG(
is_contiguous_iterator<InputIterator>::value,
"copy_async() is only supported for contiguous host iterators"
);
return copy_to_device_async(first, last, result, queue);
}
// host -> device (async)
// Type mismatch between InputIterator and OutputIterator value_types
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
InputIterator last,
OutputIterator result,
command_queue &queue,
typename boost::enable_if<
mpl::and_<
mpl::not_<
is_device_iterator<InputIterator>
>,
is_device_iterator<OutputIterator>,
mpl::not_<
is_same_value_type<InputIterator, OutputIterator>
>
>
>::type* = 0)
{
BOOST_STATIC_ASSERT_MSG(
is_contiguous_iterator<InputIterator>::value,
"copy_async() is only supported for contiguous host iterators"
);
typedef typename std::iterator_traits<InputIterator>::value_type input_type;
const context &context = queue.get_context();
size_t count = iterator_range_size(first, last);
if(count < size_t(1)) {
return future<OutputIterator>();
}
// map [first; last) to device and run copy kernel
// on device for copying & casting
::boost::compute::mapped_view<input_type> mapped_host(
// make sure it's a pointer to constant data
// to force read only mapping
const_cast<const input_type*>(
::boost::addressof(*first)
),
count,
context
);
return copy_on_device_async(
mapped_host.begin(), mapped_host.end(), result, queue
);
}
// host -> device
// InputIterator is a contiguous iterator
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
InputIterator last,
OutputIterator result,
command_queue &queue,
typename boost::enable_if<
mpl::and_<
mpl::not_<
is_device_iterator<InputIterator>
>,
is_device_iterator<OutputIterator>,
is_same_value_type<InputIterator, OutputIterator>,
is_contiguous_iterator<InputIterator>
>
>::type* = 0)
{
return copy_to_device(first, last, result, queue);
}
// host -> device
// Type mismatch between InputIterator and OutputIterator value_types
// InputIterator is a contiguous iterator
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
InputIterator last,
OutputIterator result,
command_queue &queue,
typename boost::enable_if<
mpl::and_<
mpl::not_<
is_device_iterator<InputIterator>
>,
is_device_iterator<OutputIterator>,
mpl::not_<
is_same_value_type<InputIterator, OutputIterator>
>,
is_contiguous_iterator<InputIterator>
>
>::type* = 0)
{
typedef typename OutputIterator::value_type output_type;
typedef typename std::iterator_traits<InputIterator>::value_type input_type;
const device &device = queue.get_device();
// loading parameters
std::string cache_key =
std::string("__boost_compute_copy_to_device_")
+ type_name<input_type>() + "_" + type_name<output_type>();
boost::shared_ptr<parameter_cache> parameters =
detail::parameter_cache::get_global_cache(device);
size_t map_copy_threshold;
size_t direct_copy_threshold;
// calculate default values of thresholds
if (device.type() & device::gpu) {
// GPUs
map_copy_threshold = 524288; // 0.5 MB
direct_copy_threshold = 52428800; // 50 MB
}
else {
// CPUs and other devices
map_copy_threshold = 134217728; // 128 MB
direct_copy_threshold = 0; // it's never efficient for CPUs
}
// load thresholds
map_copy_threshold =
parameters->get(
cache_key, "map_copy_threshold", map_copy_threshold
);
direct_copy_threshold =
parameters->get(
cache_key, "direct_copy_threshold", direct_copy_threshold
);
// select copy method based on thresholds & input_size_bytes
size_t count = iterator_range_size(first, last);
size_t input_size_bytes = count * sizeof(input_type);
// [0; map_copy_threshold) -> copy_to_device_map()
if(input_size_bytes < map_copy_threshold) {
return copy_to_device_map(first, last, result, queue);
}
// [map_copy_threshold; direct_copy_threshold) -> convert [first; last)
// on host and then perform copy_to_device()
else if(input_size_bytes < direct_copy_threshold) {
std::vector<output_type> vector(first, last);
return copy_to_device(vector.begin(), vector.end(), result, queue);
}
// [direct_copy_threshold; inf) -> map [first; last) to device and
// run copy kernel on device for copying & casting
// At this point we are sure that count > 1 (first != last).
// Perform async copy to device, wait for it to be finished and
// return the result.
// At this point we are sure that count > 1 (first != last), so event
// returned by dispatch_copy_async() must be valid.
return dispatch_copy_async(first, last, result, queue).get();
}
// host -> device
// InputIterator is NOT a contiguous iterator
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
InputIterator last,
OutputIterator result,
command_queue &queue,
typename boost::enable_if<
mpl::and_<
mpl::not_<
is_device_iterator<InputIterator>
>,
is_device_iterator<OutputIterator>,
mpl::not_<
is_contiguous_iterator<InputIterator>
>
>
>::type* = 0)
{
typedef typename OutputIterator::value_type output_type;
typedef typename std::iterator_traits<InputIterator>::value_type input_type;
const device &device = queue.get_device();
// loading parameters
std::string cache_key =
std::string("__boost_compute_copy_to_device_")
+ type_name<input_type>() + "_" + type_name<output_type>();
boost::shared_ptr<parameter_cache> parameters =
detail::parameter_cache::get_global_cache(device);
size_t map_copy_threshold;
size_t direct_copy_threshold;
// calculate default values of thresholds
if (device.type() & device::gpu) {
// GPUs
map_copy_threshold = 524288; // 0.5 MB
direct_copy_threshold = 52428800; // 50 MB
}
else {
// CPUs and other devices
map_copy_threshold = 134217728; // 128 MB
direct_copy_threshold = 0; // it's never efficient for CPUs
}
// load thresholds
map_copy_threshold =
parameters->get(
cache_key, "map_copy_threshold", map_copy_threshold
);
direct_copy_threshold =
parameters->get(
cache_key, "direct_copy_threshold", direct_copy_threshold
);
// select copy method based on thresholds & input_size_bytes
size_t input_size = iterator_range_size(first, last);
size_t input_size_bytes = input_size * sizeof(input_type);
// [0; map_copy_threshold) -> copy_to_device_map()
//
// if direct_copy_threshold is less than map_copy_threshold
// copy_to_device_map() is used for every input
if(input_size_bytes < map_copy_threshold
|| direct_copy_threshold <= map_copy_threshold) {
return copy_to_device_map(first, last, result, queue);
}
// [map_copy_threshold; inf) -> convert [first; last)
// on host and then perform copy_to_device()
std::vector<output_type> vector(first, last);
return copy_to_device(vector.begin(), vector.end(), result, queue);
}
// device -> host (async)
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
InputIterator last,
OutputIterator result,
command_queue &queue,
typename boost::enable_if<
mpl::and_<
is_device_iterator<InputIterator>,
mpl::not_<
is_device_iterator<OutputIterator>
>,
is_same_value_type<OutputIterator, InputIterator>
>
>::type* = 0)
{
BOOST_STATIC_ASSERT_MSG(
is_contiguous_iterator<OutputIterator>::value,
"copy_async() is only supported for contiguous host iterators"
);
return copy_to_host_async(first, last, result, queue);
}
// device -> host (async)
// Type mismatch between InputIterator and OutputIterator value_types
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
InputIterator last,
OutputIterator result,
command_queue &queue,
typename boost::enable_if<
mpl::and_<
is_device_iterator<InputIterator>,
mpl::not_<
is_device_iterator<OutputIterator>
>,
mpl::not_<
is_same_value_type<OutputIterator, InputIterator>
>
>
>::type* = 0)
{
BOOST_STATIC_ASSERT_MSG(
is_contiguous_iterator<OutputIterator>::value,
"copy_async() is only supported for contiguous host iterators"
);
typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
const context &context = queue.get_context();
size_t count = iterator_range_size(first, last);
if(count < size_t(1)) {
return future<OutputIterator>();
}
// map host memory to device
buffer mapped_host(
context,
count * sizeof(output_type),
buffer::write_only | buffer::use_host_ptr,
static_cast<void*>(
::boost::addressof(*result)
)
);
// copy async on device
::boost::compute::future<buffer_iterator<output_type> > future =
copy_on_device_async(
first,
last,
make_buffer_iterator<output_type>(mapped_host),
queue
);
// update host memory asynchronously by maping and unmaping memory
event map_event;
void* ptr = queue.enqueue_map_buffer_async(
mapped_host,
CL_MAP_READ,
0,
count * sizeof(output_type),
map_event,
future.get_event()
);
event unmap_event =
queue.enqueue_unmap_buffer(mapped_host, ptr, map_event);
return make_future(result + count, unmap_event);
}
// device -> host
// OutputIterator is a contiguous iterator
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
InputIterator last,
OutputIterator result,
command_queue &queue,
typename boost::enable_if<
mpl::and_<
is_device_iterator<InputIterator>,
mpl::not_<
is_device_iterator<OutputIterator>
>,
is_same_value_type<OutputIterator, InputIterator>,
is_contiguous_iterator<OutputIterator>,
mpl::not_<
is_bool_value_type<OutputIterator>
>
>
>::type* = 0)
{
return copy_to_host(first, last, result, queue);
}
// device -> host
// Type mismatch between InputIterator and OutputIterator value_types
// OutputIterator is NOT a contiguous iterator or value_type of OutputIterator
// is a boolean type.
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
InputIterator last,
OutputIterator result,
command_queue &queue,
typename boost::enable_if<
mpl::and_<
is_device_iterator<InputIterator>,
mpl::not_<
is_device_iterator<OutputIterator>
>,
mpl::or_<
mpl::not_<
is_contiguous_iterator<OutputIterator>
>,
is_bool_value_type<OutputIterator>
>
>
>::type* = 0)
{
typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
typedef typename InputIterator::value_type input_type;
const device &device = queue.get_device();
// loading parameters
std::string cache_key =
std::string("__boost_compute_copy_to_host_")
+ type_name<input_type>() + "_" + type_name<output_type>();
boost::shared_ptr<parameter_cache> parameters =
detail::parameter_cache::get_global_cache(device);
size_t map_copy_threshold;
size_t direct_copy_threshold;
// calculate default values of thresholds
if (device.type() & device::gpu) {
// GPUs
map_copy_threshold = 33554432; // 30 MB
direct_copy_threshold = 0; // it's never efficient for GPUs
}
else {
// CPUs and other devices
map_copy_threshold = 134217728; // 128 MB
direct_copy_threshold = 0; // it's never efficient for CPUs
}
// load thresholds
map_copy_threshold =
parameters->get(
cache_key, "map_copy_threshold", map_copy_threshold
);
direct_copy_threshold =
parameters->get(
cache_key, "direct_copy_threshold", direct_copy_threshold
);
// select copy method based on thresholds & input_size_bytes
size_t count = iterator_range_size(first, last);
size_t input_size_bytes = count * sizeof(input_type);
// [0; map_copy_threshold) -> copy_to_host_map()
//
// if direct_copy_threshold is less than map_copy_threshold
// copy_to_host_map() is used for every input
if(input_size_bytes < map_copy_threshold
|| direct_copy_threshold <= map_copy_threshold) {
return copy_to_host_map(first, last, result, queue);
}
// [map_copy_threshold; inf) -> copy [first;last) to temporary vector
// then copy (and convert) to result using std::copy()
std::vector<input_type> vector(count);
copy_to_host(first, last, vector.begin(), queue);
return std::copy(vector.begin(), vector.end(), result);
}
// device -> host
// Type mismatch between InputIterator and OutputIterator value_types
// OutputIterator is a contiguous iterator
// value_type of OutputIterator is NOT a boolean type
//
// Chooses between three strategies based on the input size in bytes and two
// thresholds read from the per-device parameter cache:
//   [0; map_copy_threshold)                      -> copy_to_host_map()
//   [map_copy_threshold; direct_copy_threshold)  -> temporary host vector
//   [direct_copy_threshold; inf)                 -> dispatch_copy_async()
// Returns the iterator past the last element written to the host range.
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              typename boost::enable_if<
                  mpl::and_<
                      is_device_iterator<InputIterator>,
                      mpl::not_<
                          is_device_iterator<OutputIterator>
                      >,
                      mpl::not_<
                          is_same_value_type<OutputIterator, InputIterator>
                      >,
                      is_contiguous_iterator<OutputIterator>,
                      mpl::not_<
                          is_bool_value_type<OutputIterator>
                      >
                  >
              >::type* = 0)
{
    typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
    typedef typename InputIterator::value_type input_type;

    const device &device = queue.get_device();

    // loading parameters
    std::string cache_key =
        std::string("__boost_compute_copy_to_host_")
            + type_name<input_type>() + "_" + type_name<output_type>();
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    size_t map_copy_threshold;
    size_t direct_copy_threshold;

    // calculate default values of thresholds
    if (device.type() & device::gpu) {
        // GPUs
        map_copy_threshold = 524288;  // 0.5 MB
        direct_copy_threshold = 52428800; // 50 MB
    }
    else {
        // CPUs and other devices
        map_copy_threshold = 134217728; // 128 MB
        direct_copy_threshold = 0; // it's never efficient for CPUs
    }

    // load thresholds
    map_copy_threshold =
        parameters->get(
            cache_key, "map_copy_threshold", map_copy_threshold
        );
    direct_copy_threshold =
        parameters->get(
            cache_key, "direct_copy_threshold", direct_copy_threshold
        );

    // select copy method based on thresholds & input_size_bytes
    size_t count = iterator_range_size(first, last);

    // An empty range has nothing to transfer. Handle it here so that the
    // asynchronous path below never has to wait on a future created for an
    // empty range, which could otherwise happen when both cached thresholds
    // are zero.
    if(count == 0) {
        return result;
    }

    size_t input_size_bytes = count * sizeof(input_type);

    // [0; map_copy_threshold) -> copy_to_host_map()
    if(input_size_bytes < map_copy_threshold) {
        return copy_to_host_map(first, last, result, queue);
    }
    // [map_copy_threshold; direct_copy_threshold) -> copy [first;last) to
    // temporary vector then copy (and convert) to result using std::copy()
    else if(input_size_bytes < direct_copy_threshold) {
        std::vector<input_type> vector(count);
        copy_to_host(first, last, vector.begin(), queue);
        return std::copy(vector.begin(), vector.end(), result);
    }

    // [direct_copy_threshold; inf) -> perform the asynchronous device-to-host
    // copy, wait for it to finish and return the result.
    // The guard above guarantees count >= 1 (first != last) at this point,
    // so the future is created for a non-empty range.
    return dispatch_copy_async(first, last, result, queue).get();
}
// device -> device
// Chosen when both iterators are device iterators and the
// can_copy_with_copy_buffer trait is false for the pair; the work is
// forwarded to copy_on_device().
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              typename boost::enable_if<
                  mpl::and_<
                      is_device_iterator<InputIterator>,
                      is_device_iterator<OutputIterator>,
                      mpl::not_<
                          can_copy_with_copy_buffer<
                              InputIterator, OutputIterator
                          >
                      >
                  >
              >::type* = 0)
{
    OutputIterator result_end = copy_on_device(first, last, result, queue);
    return result_end;
}
// device -> device (specialization for buffer iterators)
// Enqueues a single buffer-to-buffer copy on the queue. Offsets and size are
// expressed in bytes of the input value_type. Returns result advanced by the
// number of copied elements, or result itself for an empty range.
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              typename boost::enable_if<
                  mpl::and_<
                      is_device_iterator<InputIterator>,
                      is_device_iterator<OutputIterator>,
                      can_copy_with_copy_buffer<
                          InputIterator, OutputIterator
                      >
                  >
              >::type* = 0)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    const difference_type length = std::distance(first, last);
    if(length < 1){
        // empty (or reversed) range: nothing is enqueued
        return result;
    }

    const size_t src_offset_bytes = first.get_index() * sizeof(value_type);
    const size_t dst_offset_bytes = result.get_index() * sizeof(value_type);
    const size_t size_bytes = static_cast<size_t>(length) * sizeof(value_type);

    queue.enqueue_copy_buffer(first.get_buffer(),
                              result.get_buffer(),
                              src_offset_bytes,
                              dst_offset_bytes,
                              size_bytes);

    return result + length;
}
// device -> device (async)
// Chosen when both iterators are device iterators and the
// can_copy_with_copy_buffer trait is false for the pair; the work is
// forwarded to copy_on_device_async().
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
                    InputIterator last,
                    OutputIterator result,
                    command_queue &queue,
                    typename boost::enable_if<
                        mpl::and_<
                            is_device_iterator<InputIterator>,
                            is_device_iterator<OutputIterator>,
                            mpl::not_<
                                can_copy_with_copy_buffer<
                                    InputIterator, OutputIterator
                                >
                            >
                        >
                    >::type* = 0)
{
    future<OutputIterator> pending =
        copy_on_device_async(first, last, result, queue);
    return pending;
}
// device -> device (async, specialization for buffer iterators)
// Enqueues a single buffer-to-buffer copy and returns a future carrying the
// end of the output range together with the event of the enqueued copy.
// For an empty range nothing is enqueued and the future holds result with a
// default-constructed event.
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
                    InputIterator last,
                    OutputIterator result,
                    command_queue &queue,
                    typename boost::enable_if<
                        mpl::and_<
                            is_device_iterator<InputIterator>,
                            is_device_iterator<OutputIterator>,
                            can_copy_with_copy_buffer<
                                InputIterator, OutputIterator
                            >
                        >
                    >::type* = 0)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    const difference_type length = std::distance(first, last);
    if(length < 1){
        // empty (or reversed) range: nothing is enqueued
        return make_future(result, event());
    }

    const size_t src_offset_bytes = first.get_index() * sizeof(value_type);
    const size_t dst_offset_bytes = result.get_index() * sizeof(value_type);
    const size_t size_bytes = static_cast<size_t>(length) * sizeof(value_type);

    event copy_done =
        queue.enqueue_copy_buffer(first.get_buffer(),
                                  result.get_buffer(),
                                  src_offset_bytes,
                                  dst_offset_bytes,
                                  size_bytes);

    return make_future(result + length, copy_done);
}
// host -> host
// Neither iterator is a device iterator, so the copy is a plain std::copy()
// and the command queue is not used.
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              typename boost::enable_if_c<
                  !is_device_iterator<InputIterator>::value &&
                  !is_device_iterator<OutputIterator>::value
              >::type* = 0)
{
    static_cast<void>(queue); // intentionally unused

    return std::copy(first, last, result);
}
} // end detail namespace
/// Copies the values in the range [\p first, \p last) to the range
/// beginning at \p result.
///
/// The generic copy() function can be used for a variety of data
/// transfer tasks and provides a standard interface to the following
/// OpenCL functions:
///
/// \li \c clEnqueueReadBuffer()
/// \li \c clEnqueueWriteBuffer()
/// \li \c clEnqueueCopyBuffer()
///
/// Unlike the aforementioned OpenCL functions, copy() will also work
/// with non-contiguous data-structures (e.g. \c std::list<T>) as
/// well as with "fancy" iterators (e.g. transform_iterator).
///
/// \param first first element in the range to copy
/// \param last end of the range to copy (one past the last element)
/// \param result first element in the result range
/// \param queue command queue to perform the operation
///
/// \return \c OutputIterator to the end of the result range
///
/// For example, to copy an array of \c int values on the host to a vector on
/// the device:
/// \code
/// // array on the host
/// int data[] = { 1, 2, 3, 4 };
///
/// // vector on the device
/// boost::compute::vector<int> vec(4, context);
///
/// // copy values to the device vector
/// boost::compute::copy(data, data + 4, vec.begin(), queue);
/// \endcode
///
/// The copy algorithm can also be used with standard containers such as
/// \c std::vector<T>:
/// \code
/// std::vector<int> host_vector = ...
/// boost::compute::vector<int> device_vector = ...
///
/// // copy from the host to the device
/// boost::compute::copy(
/// host_vector.begin(), host_vector.end(), device_vector.begin(), queue
/// );
///
/// // copy from the device to the host
/// boost::compute::copy(
/// device_vector.begin(), device_vector.end(), host_vector.begin(), queue
/// );
/// \endcode
///
/// \see copy_n(), copy_if(), copy_async()
template<class InputIterator, class OutputIterator>
inline OutputIterator copy(InputIterator first,
                           InputIterator last,
                           OutputIterator result,
                           command_queue &queue = system::default_queue())
{
    // Overload resolution on detail::dispatch_copy() selects the transfer
    // strategy from the iterator categories of the two ranges.
    OutputIterator result_end =
        detail::dispatch_copy(first, last, result, queue);
    return result_end;
}
/// Asynchronously copies the values in the range [\p first, \p last) to the
/// range beginning at \p result.
///
/// \return a future holding the \c OutputIterator produced by the copy
///
/// \see copy()
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
copy_async(InputIterator first,
           InputIterator last,
           OutputIterator result,
           command_queue &queue = system::default_queue())
{
    // Overload resolution on detail::dispatch_copy_async() selects the
    // transfer strategy from the iterator categories of the two ranges.
    future<OutputIterator> pending =
        detail::dispatch_copy_async(first, last, result, queue);
    return pending;
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_COPY_HPP
@@ -0,0 +1,58 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_COPY_IF_HPP
#define BOOST_COMPUTE_ALGORITHM_COPY_IF_HPP
#include <boost/compute/algorithm/transform_if.hpp>
#include <boost/compute/functional/identity.hpp>
namespace boost {
namespace compute {
namespace detail {
// like the copy_if() algorithm but writes the indices of the values for which
// predicate returns true.
template<class InputIterator, class OutputIterator, class Predicate>
inline OutputIterator copy_index_if(InputIterator first,
InputIterator last,
OutputIterator result,
Predicate predicate,
command_queue &queue = system::default_queue())
{
typedef typename std::iterator_traits<InputIterator>::value_type T;
return detail::transform_if_impl(
first, last, result, identity<T>(), predicate, true, queue
);
}
} // end detail namespace
/// Copies each element in the range [\p first, \p last) for which
/// \p predicate returns \c true to the range beginning at \p result.
template<class InputIterator, class OutputIterator, class Predicate>
inline OutputIterator copy_if(InputIterator first,
InputIterator last,
OutputIterator result,
Predicate predicate,
command_queue &queue = system::default_queue())
{
typedef typename std::iterator_traits<InputIterator>::value_type T;
return ::boost::compute::transform_if(
first, last, result, identity<T>(), predicate, queue
);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_COPY_IF_HPP
@@ -0,0 +1,51 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_COPY_N_HPP
#define BOOST_COMPUTE_ALGORITHM_COPY_N_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/copy.hpp>
namespace boost {
namespace compute {
/// Copies \p count elements from \p first to \p result.
///
/// For example, to copy four values from the host to the device:
/// \code
/// // values on the host and vector on the device
/// float values[4] = { 1.f, 2.f, 3.f, 4.f };
/// boost::compute::vector<float> vec(4, context);
///
/// // copy from the host to the device
/// boost::compute::copy_n(values, 4, vec.begin(), queue);
/// \endcode
///
/// \see copy()
template<class InputIterator, class Size, class OutputIterator>
inline OutputIterator copy_n(InputIterator first,
                             Size count,
                             OutputIterator result,
                             command_queue &queue = system::default_queue())
{
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    // Build the end of the input range from the element count and defer to
    // copy().
    const difference_type length = static_cast<difference_type>(count);
    InputIterator last = first + length;

    return ::boost::compute::copy(first, last, result, queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_COPY_N_HPP
@@ -0,0 +1,55 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_COUNT_HPP
#define BOOST_COMPUTE_ALGORITHM_COUNT_HPP
#include <boost/compute/lambda.hpp>
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/count_if.hpp>
#include <boost/compute/type_traits/vector_size.hpp>
namespace boost {
namespace compute {
/// Counts how many elements in the range [\p first, \p last) compare equal
/// to \p value.
///
/// For value types whose vector_size is not 1 the comparison is wrapped in
/// \c all(), so the whole comparison result has to hold.
///
/// \see count_if()
template<class InputIterator, class T>
inline size_t count(InputIterator first,
                    InputIterator last,
                    const T &value,
                    command_queue &queue = system::default_queue())
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;

    using ::boost::compute::_1;
    using ::boost::compute::lambda::all;

    const bool is_vector_type = vector_size<value_type>::value != 1;
    if(is_vector_type){
        return ::boost::compute::count_if(
            first, last, all(_1 == value), queue
        );
    }

    return ::boost::compute::count_if(first, last, _1 == value, queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_COUNT_HPP
@@ -0,0 +1,62 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_COUNT_IF_HPP
#define BOOST_COMPUTE_ALGORITHM_COUNT_IF_HPP
#include <boost/compute/device.hpp>
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/detail/count_if_with_ballot.hpp>
#include <boost/compute/algorithm/detail/count_if_with_reduce.hpp>
#include <boost/compute/algorithm/detail/count_if_with_threads.hpp>
#include <boost/compute/algorithm/detail/serial_count_if.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
namespace boost {
namespace compute {
/// Returns the number of elements in the range [\p first, \p last)
/// for which \p predicate returns \c true.
template<class InputIterator, class Predicate>
inline size_t count_if(InputIterator first,
InputIterator last,
Predicate predicate,
command_queue &queue = system::default_queue())
{
const device &device = queue.get_device();
size_t input_size = detail::iterator_range_size(first, last);
if(input_size == 0){
return 0;
}
if(device.type() & device::cpu){
if(input_size < 1024){
return detail::serial_count_if(first, last, predicate, queue);
}
else {
return detail::count_if_with_threads(first, last, predicate, queue);
}
}
else {
if(input_size < 32){
return detail::serial_count_if(first, last, predicate, queue);
}
else {
return detail::count_if_with_reduce(first, last, predicate, queue);
}
}
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_COUNT_IF_HPP
@@ -0,0 +1,162 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_BALANCED_PATH_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_BALANCED_PATH_HPP
#include <iterator>
#include <boost/compute/algorithm/find_if.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/lambda.hpp>
#include <boost/compute/system.hpp>
namespace boost {
namespace compute {
namespace detail {
///
/// \brief Balanced Path kernel class
///
/// Subclass of meta_kernel to break two sets into tiles according
/// to their balanced path.
///
/// Usage as visible in this class: call set_range() once to record the
/// sizes of both input ranges and generate the kernel source, then call
/// exec() to launch it. Work item \c i writes one index into the first range
/// to \c result_a[i] and one index into the second range to \c result_b[i].
///
class balanced_path_kernel : public meta_kernel
{
public:
// Number of elements covered by one tile; work item i targets position
// (i+1)*tile_size of the combined ranges. Defaults to 4 in the constructor.
unsigned int tile_size;
balanced_path_kernel() : meta_kernel("balanced_path")
{
tile_size = 4;
}
// Records the sizes of [first1, last1) and [first2, last2), registers the
// "a_count" and "b_count" kernel arguments and appends the kernel source.
// \p comp is the comparison used in the generated searches.
// NOTE(review): the generated searches presumably require both ranges to be
// sorted with respect to comp - confirm against callers.
template<class InputIterator1, class InputIterator2,
class OutputIterator1, class OutputIterator2,
class Compare>
void set_range(InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
InputIterator2 last2,
OutputIterator1 result_a,
OutputIterator2 result_b,
Compare comp)
{
typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
m_a_count = iterator_range_size(first1, last1);
m_a_count_arg = add_arg<uint_>("a_count");
m_b_count = iterator_range_size(first2, last2);
m_b_count_arg = add_arg<uint_>("b_count");
// First loop: search over a_index in [start, end) with
// b_index = target - a_index - 1, driven by comp(b[b_index], a[a_index]).
*this <<
"uint i = get_global_id(0);\n" <<
"uint target = (i+1)*" << tile_size << ";\n" <<
"uint start = max(convert_int(0),convert_int(target)-convert_int(b_count));\n" <<
"uint end = min(target,a_count);\n" <<
"uint a_index, b_index;\n" <<
"while(start<end)\n" <<
"{\n" <<
" a_index = (start + end)/2;\n" <<
" b_index = target - a_index - 1;\n" <<
" if(!(" << comp(first2[expr<uint_>("b_index")],
first1[expr<uint_>("a_index")]) << "))\n" <<
" start = a_index + 1;\n" <<
" else end = a_index;\n" <<
"}\n" <<
"a_index = start;\n" <<
"b_index = target - start;\n" <<
// Adjustment step, only when b_index is inside the second range: x is the
// element at b[b_index]; the two loops below search each range for the
// first position whose element does not compare less than x.
"if(b_index < b_count)\n" <<
"{\n" <<
" " << decl<const value_type>("x") << " = " <<
first2[expr<uint_>("b_index")] << ";\n" <<
" uint a_start = 0, a_end = a_index, a_mid;\n" <<
" uint b_start = 0, b_end = b_index, b_mid;\n" <<
" while(a_start<a_end)\n" <<
" {\n" <<
" a_mid = (a_start + a_end)/2;\n" <<
" if(" << comp(first1[expr<uint_>("a_mid")], expr<value_type>("x")) << ")\n" <<
" a_start = a_mid+1;\n" <<
" else a_end = a_mid;\n" <<
" }\n" <<
" while(b_start<b_end)\n" <<
" {\n" <<
" b_mid = (b_start + b_end)/2;\n" <<
" if(" << comp(first2[expr<uint_>("b_mid")], expr<value_type>("x")) << ")\n" <<
" b_start = b_mid+1;\n" <<
" else b_end = b_mid;\n" <<
" }\n" <<
" uint a_run = a_index - a_start;\n" <<
" uint b_run = b_index - b_start;\n" <<
" uint x_count = a_run + b_run;\n" <<
" uint b_advance = max(x_count / 2, x_count - a_run);\n" <<
" b_end = min(b_count, b_start + b_advance + 1);\n" <<
// The next literal has no trailing "\n", so the following "while" is
// emitted on the same line of the generated source.
" uint temp_start = b_index, temp_end = b_end, temp_mid;" <<
" while(temp_start < temp_end)\n" <<
" {\n" <<
" temp_mid = (temp_start + temp_end + 1)/2;\n" <<
" if(" << comp(expr<value_type>("x"), first2[expr<uint_>("temp_mid")]) << ")\n" <<
" temp_end = temp_mid-1;\n" <<
" else temp_start = temp_mid;\n" <<
" }\n" <<
" b_run = temp_start - b_start + 1;\n" <<
" b_advance = min(b_advance, b_run);\n" <<
" uint a_advance = x_count - b_advance;\n" <<
// "star" is 1 when a_advance == b_advance + 1 and b_advance < b_run,
// otherwise 0; it is added to b_index below.
" uint star = convert_uint((a_advance == b_advance + 1) " <<
"&& (b_advance < b_run));\n" <<
" a_index = a_start + a_advance;\n" <<
" b_index = target - a_index + star;\n" <<
"}\n" <<
// Each work item stores its pair of indices at position i.
result_a[expr<uint_>("i")] << " = a_index;\n" <<
result_b[expr<uint_>("i")] << " = b_index;\n";
}
// Convenience overload: same as above with
// ::boost::compute::less<value_type> as the comparison, where value_type is
// the value type of InputIterator1.
template<class InputIterator1, class InputIterator2,
class OutputIterator1, class OutputIterator2>
void set_range(InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
InputIterator2 last2,
OutputIterator1 result_a,
OutputIterator2 result_b)
{
typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
::boost::compute::less<value_type> less_than;
set_range(first1, last1, first2, last2, result_a, result_b, less_than);
}
// Launches (m_a_count + m_b_count)/tile_size work items (integer division)
// on \p queue and returns the event of the launch. Returns a
// default-constructed event without launching when that quotient is zero.
// NOTE(review): m_a_count/m_b_count and the argument indices are only
// assigned in set_range(); exec() appears to rely on set_range() having
// been called first.
event exec(command_queue &queue)
{
if((m_a_count + m_b_count)/tile_size == 0) {
return event();
}
set_arg(m_a_count_arg, uint_(m_a_count));
set_arg(m_b_count_arg, uint_(m_b_count));
return exec_1d(queue, 0, (m_a_count + m_b_count)/tile_size);
}
private:
// Sizes of the two input ranges and the indices of the kernel arguments
// that carry them (as returned by add_arg()).
size_t m_a_count;
size_t m_a_count_arg;
size_t m_b_count;
size_t m_b_count_arg;
};
} //end detail namespace
} //end compute namespace
} //end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_BALANCED_PATH_HPP
@@ -0,0 +1,133 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_BINARY_FIND_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_BINARY_FIND_HPP
#include <boost/compute/functional.hpp>
#include <boost/compute/algorithm/find_if.hpp>
#include <boost/compute/algorithm/transform.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/detail/parameter_cache.hpp>
namespace boost {
namespace compute {
namespace detail{
///
/// \brief Binary find kernel class
///
/// Subclass of meta_kernel to perform single step in binary find.
///
/// Each work item inspects the single element at offset
/// \c get_global_id(0)*block from \p first and, when \p predicate holds for
/// it, lowers the value behind the "index" argument to that offset using
/// atomic_min. The \p last parameter is accepted but not referenced by the
/// constructor.
///
template<class InputIterator, class UnaryPredicate>
class binary_find_kernel : public meta_kernel
{
public:
binary_find_kernel(InputIterator first,
InputIterator last,
UnaryPredicate predicate)
: meta_kernel("binary_find")
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
// Arguments are registered in this order: the global "index" pointer,
// then the "block" stride.
m_index_arg = add_arg<uint_ *>(memory_object::global_memory, "index");
m_block_arg = add_arg<uint_>("block");
atomic_min<uint_> atomic_min_uint;
*this <<
"uint i = get_global_id(0) * block;\n" <<
decl<value_type>("value") << "=" << first[var<uint_>("i")] << ";\n" <<
"if(" << predicate(var<value_type>("value")) << ") {\n" <<
atomic_min_uint(var<uint_ *>("index"), var<uint_>("i")) << ";\n" <<
"}\n";
}
// Argument indices returned by add_arg(); public so that the caller can
// set both arguments on the compiled kernel.
size_t m_index_arg;
size_t m_block_arg;
};
///
/// \brief Binary find algorithm
///
/// Finds the end of true values in the partitioned range [first, last).
/// \return Iterator pointing to end of true values
///
/// \param first Iterator pointing to start of range
/// \param last Iterator pointing to end of range
/// \param predicate Predicate according to which the range is partitioned
/// \param queue Queue on which to execute
///
/// While the search window holds more than 128 elements, a kernel samples
/// it at a fixed stride and the window is narrowed around the smallest
/// sampled offset for which \p predicate holds; the remaining window is then
/// handed to find_if().
///
/// NOTE(review): both the kernel and the final find_if() test \p predicate
/// directly, i.e. they look for an element for which it holds - confirm
/// against callers how that maps to "end of true values" above.
///
template<class InputIterator, class UnaryPredicate>
inline InputIterator binary_find(InputIterator first,
InputIterator last,
UnaryPredicate predicate,
command_queue &queue = system::default_queue())
{
const device &device = queue.get_device();
boost::shared_ptr<parameter_cache> parameters =
detail::parameter_cache::get_global_cache(device);
const std::string cache_key = "__boost_binary_find";
// Windows of at most find_if_limit elements are left to find_if().
size_t find_if_limit = 128;
// Number of work items per kernel launch ("tpb" from the parameter cache,
// 128 by default).
// NOTE(review): threads - 1 is used as a divisor below; a cached value of 1
// would divide by zero - confirm the cache cannot hold that.
size_t threads = parameters->get(cache_key, "tpb", 128);
size_t count = iterator_range_size(first, last);
InputIterator search_first = first;
InputIterator search_last = last;
scalar<uint_> index(queue.get_context());
// construct and compile binary_find kernel
// NOTE(review): the kernel is built once, from the initial search_first
// (== first); later loop iterations move search_first but reuse this
// kernel - confirm the sampled offsets are still meant relative to first.
binary_find_kernel<InputIterator, UnaryPredicate>
binary_find_kernel(search_first, search_last, predicate);
::boost::compute::kernel kernel = binary_find_kernel.compile(queue.get_context());
// set buffer for index
kernel.set_arg(binary_find_kernel.m_index_arg, index.get_buffer());
while(count > find_if_limit) {
// Seed the result with count, the value that means "no sample matched".
index.write(static_cast<uint_>(count), queue);
// set block and run binary_find kernel
uint_ block = static_cast<uint_>((count - 1)/(threads - 1));
kernel.set_arg(binary_find_kernel.m_block_arg, block);
queue.enqueue_1d_range_kernel(kernel, 0, threads, 0);
size_t i = index.read(queue);
if(i == count) {
// No sampled element matched: keep only the tail of the window that
// lies past the last sample and stop iterating.
search_first = search_last - ((count - 1)%(threads - 1));
break;
} else {
// Smallest matching sample is at offset i: narrow the window to the
// one stride of elements that ends at that offset.
search_last = search_first + i;
search_first = search_last - ((count - 1)/(threads - 1));
}
// Make sure that first and last stay within the input range
search_last = (std::min)(search_last, last);
search_last = (std::max)(search_last, first);
search_first = (std::max)(search_first, first);
search_first = (std::min)(search_first, last);
count = iterator_range_size(search_first, search_last);
}
return find_if(search_first, search_last, predicate, queue);
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_BINARY_FIND_HPP
@@ -0,0 +1,77 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COMPACT_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_COMPACT_HPP
#include <iterator>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/system.hpp>
namespace boost {
namespace compute {
namespace detail {
///
/// \brief Compact kernel class
///
/// Subclass of meta_kernel to compact the result of set kernels to
/// get actual sets
///
/// Work item \c i copies consecutive elements of \c start, beginning at
/// offset \c i*tile_size, to the output positions
/// [\c counts[i], \c counts[i+1]).
///
class compact_kernel : public meta_kernel
{
public:
// Distance, in elements of the start range, between the regions read by
// consecutive work items. Defaults to 4 in the constructor.
unsigned int tile_size;
compact_kernel() : meta_kernel("compact")
{
tile_size = 4;
}
// Records the number of work items (number of counts minus one) and
// appends the kernel source.
// NOTE(review): an empty counts range would make the subtraction below wrap
// around (m_count is a size_t) - presumably callers always pass at least
// one count; confirm.
template<class InputIterator1, class InputIterator2, class OutputIterator>
void set_range(InputIterator1 start,
InputIterator2 counts_begin,
InputIterator2 counts_end,
OutputIterator result)
{
m_count = iterator_range_size(counts_begin, counts_end) - 1;
*this <<
"uint i = get_global_id(0);\n" <<
"uint count = i*" << tile_size << ";\n" <<
"for(uint j = " << counts_begin[expr<uint_>("i")] << "; j<" <<
counts_begin[expr<uint_>("i+1")] << "; j++, count++)\n" <<
"{\n" <<
result[expr<uint_>("j")] << " = " << start[expr<uint_>("count")]
<< ";\n" <<
"}\n";
}
// Launches m_count work items on \p queue and returns the event of the
// launch, or a default-constructed event without launching when m_count is
// zero.
event exec(command_queue &queue)
{
if(m_count == 0) {
return event();
}
return exec_1d(queue, 0, m_count);
}
private:
// Number of work items to launch; assigned in set_range().
size_t m_count;
};
} //end detail namespace
} //end compute namespace
} //end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COMPACT_HPP
@@ -0,0 +1,190 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
#include <iterator>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/async/future.hpp>
#include <boost/compute/iterator/buffer_iterator.hpp>
#include <boost/compute/iterator/discard_iterator.hpp>
#include <boost/compute/memory/svm_ptr.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/parameter_cache.hpp>
#include <boost/compute/detail/work_size.hpp>
#include <boost/compute/detail/vendor.hpp>
namespace boost {
namespace compute {
namespace detail {
// Copies count elements from first to result with a kernel in which every
// work item copies one contiguous block of elements.
//
// The launch uses one work item per compute unit of the device, or a single
// work item when count <= 1024. Returns the event of the kernel launch.
//
// The block size is ceil(count / global_size), computed in the kernel with
// integer arithmetic. A single-precision float cannot represent every
// integer above 2^24, so a float-based ceil() could round the block size
// down for large counts and leave trailing elements uncopied.
template<class InputIterator, class OutputIterator>
inline event copy_on_device_cpu(InputIterator first,
                                OutputIterator result,
                                size_t count,
                                command_queue &queue)
{
    meta_kernel k("copy");
    const device& device = queue.get_device();

    k <<
        "uint block = (uint)(count / get_global_size(0)) + " <<
            "((count % get_global_size(0)) != 0 ? 1u : 0u);\n" <<
        "uint index = get_global_id(0) * block;\n" <<
        "uint end = min(count, index + block);\n" <<
        "while(index < end){\n" <<
            result[k.var<uint_>("index")] << '=' <<
                first[k.var<uint_>("index")] << ";\n" <<
            "index++;\n" <<
        "}\n";

    k.add_set_arg<const uint_>("count", static_cast<uint_>(count));

    size_t global_work_size = device.compute_units();
    // small inputs are copied by a single work item
    if(count <= 1024) global_work_size = 1;

    return k.exec_1d(queue, 0, global_work_size);
}
// Copies count elements from first to result with a kernel in which each
// work-group of tpb work items handles vpt * tpb consecutive elements; every
// work item copies up to vpt elements that are tpb apart.
//
// vpt and tpb are read from the per-device parameter cache (defaults 4 and
// 128), keyed by the size of the input value type. Returns the event of the
// kernel launch.
template<class InputIterator, class OutputIterator>
inline event copy_on_device_gpu(InputIterator first,
                                OutputIterator result,
                                size_t count,
                                command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;

    const device& target = queue.get_device();
    boost::shared_ptr<parameter_cache> tuning =
        detail::parameter_cache::get_global_cache(target);
    const std::string cache_key =
        "__boost_copy_kernel_" + boost::lexical_cast<std::string>(sizeof(input_type));

    const uint_ vpt = tuning->get(cache_key, "vpt", 4);
    const uint_ tpb = tuning->get(cache_key, "tpb", 128);
    const uint_ values_per_group = vpt * tpb;

    meta_kernel kernel("copy");
    kernel <<
        "uint index = get_local_id(0) + " <<
        "(" << values_per_group << " * get_group_id(0));\n" <<
        "for(uint i = 0; i < " << vpt << "; i++){\n" <<
        " if(index < count){\n" <<
        result[kernel.var<uint_>("index")] << '=' <<
        first[kernel.var<uint_>("index")] << ";\n" <<
        " index += " << tpb << ";\n" <<
        " }\n" <<
        "}\n";

    kernel.add_set_arg<const uint_>("count", static_cast<uint_>(count));

    const size_t global_work_size = calculate_work_size(count, vpt, tpb);
    return kernel.exec_1d(queue, 0, global_work_size, tpb);
}
// Picks the copy kernel for the device behind queue and launches it for the
// range [first, last). Returns the event of the launch, or a
// default-constructed event when the range is empty.
template<class InputIterator, class OutputIterator>
inline event dispatch_copy_on_device(InputIterator first,
                                     InputIterator last,
                                     OutputIterator result,
                                     command_queue &queue)
{
    const size_t length = detail::iterator_range_size(first, last);
    if(length == 0){
        // empty range: no kernel is launched
        return event();
    }

    const device& target = queue.get_device();

    // copy_on_device_cpu() does not work for CPU on Apple platform
    // due to bug in its compiler.
    // See https://github.com/boostorg/compute/pull/626
    const bool use_cpu_kernel =
        (target.type() & device::cpu) && !is_apple_platform_device(target);

    return use_cpu_kernel
        ? copy_on_device_cpu(first, result, length, queue)
        : copy_on_device_gpu(first, result, length, queue);
}
// Copies [first, last) to result on the device and returns the iterator past
// the last written element. The event of the launch is discarded.
template<class InputIterator, class OutputIterator>
inline OutputIterator copy_on_device(InputIterator first,
                                     InputIterator last,
                                     OutputIterator result,
                                     command_queue &queue)
{
    dispatch_copy_on_device(first, last, result, queue);

    OutputIterator result_end = result + std::distance(first, last);
    return result_end;
}
// Overload for discard_iterator output: nothing is enqueued, the queue is
// not used, and only the advanced iterator is returned.
template<class InputIterator>
inline discard_iterator copy_on_device(InputIterator first,
                                       InputIterator last,
                                       discard_iterator result,
                                       command_queue &queue)
{
    static_cast<void>(queue); // intentionally unused

    discard_iterator result_end = result + std::distance(first, last);
    return result_end;
}
// Starts copying [first, last) to result on the device and returns a future
// that carries the iterator past the last written element together with the
// event of the launch.
template<class InputIterator, class OutputIterator>
inline future<OutputIterator> copy_on_device_async(InputIterator first,
                                                   InputIterator last,
                                                   OutputIterator result,
                                                   command_queue &queue)
{
    event copy_done = dispatch_copy_on_device(first, last, result, queue);

    OutputIterator result_end = result + std::distance(first, last);
    return make_future(result_end, copy_done);
}
#ifdef CL_VERSION_2_0
// copy_on_device() specialization for svm_ptr
// Copies the range with a single enqueue_svm_memcpy() call of
// count * sizeof(T) bytes and returns result advanced by count. Nothing is
// enqueued for an empty range.
template<class T>
inline svm_ptr<T> copy_on_device(svm_ptr<T> first,
                                 svm_ptr<T> last,
                                 svm_ptr<T> result,
                                 command_queue &queue)
{
    const size_t length = iterator_range_size(first, last);
    if(length == 0){
        return result;
    }

    const size_t size_bytes = length * sizeof(T);
    queue.enqueue_svm_memcpy(result.get(), first.get(), size_bytes);

    return result + length;
}
// copy_on_device_async() specialization for svm_ptr
// Starts a single enqueue_svm_memcpy_async() of count * sizeof(T) bytes and
// returns a future carrying result advanced by count together with the event
// of the copy.
//
// For an empty range nothing is enqueued and the future carries result with
// a default-constructed event, matching what the buffer-iterator overload of
// dispatch_copy_async() returns for an empty range.
template<class T>
inline future<svm_ptr<T> > copy_on_device_async(svm_ptr<T> first,
                                                svm_ptr<T> last,
                                                svm_ptr<T> result,
                                                command_queue &queue)
{
    size_t count = iterator_range_size(first, last);
    if(count == 0){
        // nothing to copy; still hand the (unchanged) end iterator back
        return make_future(result, event());
    }

    event event_ = queue.enqueue_svm_memcpy_async(
        result.get(), first.get(), count * sizeof(T)
    );

    return make_future(result + count, event_);
}
#endif // CL_VERSION_2_0
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
@@ -0,0 +1,193 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_DEVICE_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_DEVICE_HPP
#include <iterator>
#include <boost/utility/addressof.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/async/future.hpp>
#include <boost/compute/iterator/buffer_iterator.hpp>
#include <boost/compute/memory/svm_ptr.hpp>
namespace boost {
namespace compute {
namespace detail {
// Writes the host range [first, last) into the buffer referenced by result
// using one enqueue_write_buffer() call and returns the end of the
// destination range. The host data is read starting at the address of
// *first; an empty range returns result unchanged.
template<class HostIterator, class DeviceIterator>
inline DeviceIterator copy_to_device(HostIterator first,
                                     HostIterator last,
                                     DeviceIterator result,
                                     command_queue &queue)
{
    typedef typename
        std::iterator_traits<DeviceIterator>::difference_type
        difference_type;
    typedef typename
        std::iterator_traits<DeviceIterator>::value_type
        value_type;

    size_t element_count = iterator_range_size(first, last);
    if(!element_count){
        return result;
    }

    // the buffer API works in bytes, the iterator in elements
    const size_t byte_offset = result.get_index() * sizeof(value_type);
    const size_t byte_count = element_count * sizeof(value_type);

    queue.enqueue_write_buffer(
        result.get_buffer(),
        byte_offset,
        byte_count,
        ::boost::addressof(*first)
    );

    return result + static_cast<difference_type>(element_count);
}
template<class HostIterator, class DeviceIterator>
inline DeviceIterator copy_to_device_map(HostIterator first,
HostIterator last,
DeviceIterator result,
command_queue &queue)
{
typedef typename
std::iterator_traits<DeviceIterator>::value_type
value_type;
typedef typename
std::iterator_traits<DeviceIterator>::difference_type
difference_type;
size_t count = iterator_range_size(first, last);
if(count == 0){
return result;
}
size_t offset = result.get_index();
// map result buffer to host
value_type *pointer = static_cast<value_type*>(
queue.enqueue_map_buffer(
result.get_buffer(),
CL_MAP_WRITE,
offset * sizeof(value_type),
count * sizeof(value_type)
)
);
// copy [first; last) to result buffer
std::copy(first, last, pointer);
// unmap result buffer
boost::compute::event unmap_event = queue.enqueue_unmap_buffer(
result.get_buffer(),
static_cast<void*>(pointer)
);
unmap_event.wait();
return result + static_cast<difference_type>(count);
}
// Asynchronous variant of copy_to_device(): enqueues one
// enqueue_write_buffer_async() for [first, last) and returns a future holding
// the end of the destination range and the write event. For an empty range a
// default-constructed future is returned and nothing is enqueued.
template<class HostIterator, class DeviceIterator>
inline future<DeviceIterator> copy_to_device_async(HostIterator first,
                                                   HostIterator last,
                                                   DeviceIterator result,
                                                   command_queue &queue)
{
    typedef typename
        std::iterator_traits<DeviceIterator>::difference_type
        difference_type;
    typedef typename
        std::iterator_traits<DeviceIterator>::value_type
        value_type;

    size_t element_count = iterator_range_size(first, last);
    if(!element_count){
        return future<DeviceIterator>();
    }

    // the buffer API works in bytes, the iterator in elements
    const size_t byte_offset = result.get_index() * sizeof(value_type);
    const size_t byte_count = element_count * sizeof(value_type);

    const event write_event = queue.enqueue_write_buffer_async(
        result.get_buffer(),
        byte_offset,
        byte_count,
        ::boost::addressof(*first)
    );

    return make_future(
        result + static_cast<difference_type>(element_count), write_event
    );
}
#ifdef CL_VERSION_2_0
// copy_to_device() overload for svm_ptr destinations: copies the host range
// [first, last), read starting at the address of *first, to result with one
// enqueue_svm_memcpy() call and returns the end of the destination range.
template<class HostIterator, class T>
inline svm_ptr<T> copy_to_device(HostIterator first,
                                 HostIterator last,
                                 svm_ptr<T> result,
                                 command_queue &queue)
{
    size_t element_count = iterator_range_size(first, last);

    if(element_count != 0){
        // the memcpy length is expressed in bytes
        const size_t byte_count = element_count * sizeof(T);
        queue.enqueue_svm_memcpy(
            result.get(), ::boost::addressof(*first), byte_count
        );
        return result + element_count;
    }

    return result;
}
// Asynchronous svm_ptr overload of copy_to_device(): enqueues one
// enqueue_svm_memcpy_async() from the address of *first and returns a future
// with the end of the destination range and the copy event. An empty range
// yields a default-constructed future.
template<class HostIterator, class T>
inline future<svm_ptr<T> > copy_to_device_async(HostIterator first,
                                                HostIterator last,
                                                svm_ptr<T> result,
                                                command_queue &queue)
{
    size_t element_count = iterator_range_size(first, last);
    if(!element_count){
        return future<svm_ptr<T> >();
    }

    // the memcpy length is expressed in bytes
    const size_t byte_count = element_count * sizeof(T);
    const event copy_event = queue.enqueue_svm_memcpy_async(
        result.get(), ::boost::addressof(*first), byte_count
    );

    return make_future(result + element_count, copy_event);
}
// svm_ptr overload of copy_to_device_map(): maps the destination SVM region
// for writing, fills it from [first, last) with std::copy(), unmaps it and
// waits for the unmap before returning the end of the destination range.
template<class HostIterator, class T>
inline svm_ptr<T> copy_to_device_map(HostIterator first,
                                     HostIterator last,
                                     svm_ptr<T> result,
                                     command_queue &queue)
{
    size_t element_count = iterator_range_size(first, last);
    if(!element_count){
        return result;
    }

    // map the destination region for host writes
    const size_t byte_count = element_count * sizeof(T);
    queue.enqueue_svm_map(result.get(), byte_count, CL_MAP_WRITE);

    // fill the mapped region from [first; last)
    T *destination = static_cast<T*>(result.get());
    std::copy(first, last, destination);

    // unmap and wait for completion
    queue.enqueue_svm_unmap(result.get()).wait();

    return result + element_count;
}
#endif // CL_VERSION_2_0
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_DEVICE_HPP
@@ -0,0 +1,198 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_HOST_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_HOST_HPP
#include <iterator>
#include <boost/utility/addressof.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/async/future.hpp>
#include <boost/compute/iterator/buffer_iterator.hpp>
#include <boost/compute/memory/svm_ptr.hpp>
#include <boost/compute/detail/iterator_plus_distance.hpp>
namespace boost {
namespace compute {
namespace detail {
// Reads the device range [first, last) into host memory starting at the
// address of *result using one enqueue_read_buffer() call, and returns the
// end of the host range. An empty range returns result unchanged.
template<class DeviceIterator, class HostIterator>
inline HostIterator copy_to_host(DeviceIterator first,
                                 DeviceIterator last,
                                 HostIterator result,
                                 command_queue &queue)
{
    typedef typename
        std::iterator_traits<DeviceIterator>::value_type
        value_type;

    size_t element_count = iterator_range_size(first, last);
    if(!element_count){
        return result;
    }

    // the buffer API works in bytes, the iterator in elements
    const size_t byte_offset = first.get_index() * sizeof(value_type);
    const size_t byte_count = element_count * sizeof(value_type);

    queue.enqueue_read_buffer(
        first.get_buffer(),
        byte_offset,
        byte_count,
        ::boost::addressof(*result)
    );

    return iterator_plus_distance(result, element_count);
}
template<class DeviceIterator, class HostIterator>
inline HostIterator copy_to_host_map(DeviceIterator first,
DeviceIterator last,
HostIterator result,
command_queue &queue)
{
typedef typename
std::iterator_traits<DeviceIterator>::value_type
value_type;
typedef typename
std::iterator_traits<DeviceIterator>::difference_type
difference_type;
size_t count = iterator_range_size(first, last);
if(count == 0){
return result;
}
size_t offset = first.get_index();
// map [first; last) buffer to host
value_type *pointer = static_cast<value_type*>(
queue.enqueue_map_buffer(
first.get_buffer(),
CL_MAP_READ,
offset * sizeof(value_type),
count * sizeof(value_type)
)
);
// copy [first; last) to result buffer
std::copy(
pointer,
pointer + static_cast<difference_type>(count),
result
);
// unmap [first; last)
boost::compute::event unmap_event = queue.enqueue_unmap_buffer(
first.get_buffer(),
static_cast<void*>(pointer)
);
unmap_event.wait();
return iterator_plus_distance(result, count);
}
// Asynchronous variant of copy_to_host(): enqueues one
// enqueue_read_buffer_async() into host memory at the address of *result and
// returns a future holding the end of the host range and the read event.
// An empty range yields a default-constructed future.
template<class DeviceIterator, class HostIterator>
inline future<HostIterator> copy_to_host_async(DeviceIterator first,
                                               DeviceIterator last,
                                               HostIterator result,
                                               command_queue &queue)
{
    typedef typename
        std::iterator_traits<DeviceIterator>::value_type
        value_type;

    size_t element_count = iterator_range_size(first, last);
    if(!element_count){
        return future<HostIterator>();
    }

    // the buffer API works in bytes, the iterator in elements
    const size_t byte_offset = first.get_index() * sizeof(value_type);
    const size_t byte_count = element_count * sizeof(value_type);

    const event read_event = queue.enqueue_read_buffer_async(
        first.get_buffer(),
        byte_offset,
        byte_count,
        ::boost::addressof(*result)
    );

    return make_future(
        iterator_plus_distance(result, element_count), read_event
    );
}
#ifdef CL_VERSION_2_0
// copy_to_host() overload for svm_ptr sources: copies [first, last) into host
// memory starting at the address of *result with one enqueue_svm_memcpy()
// call and returns the end of the host range. An empty range returns result
// unchanged.
template<class T, class HostIterator>
inline HostIterator copy_to_host(svm_ptr<T> first,
                                 svm_ptr<T> last,
                                 HostIterator result,
                                 command_queue &queue)
{
    size_t count = iterator_range_size(first, last);
    if(count == 0){
        return result;
    }

    queue.enqueue_svm_memcpy(
        ::boost::addressof(*result), first.get(), count * sizeof(T)
    );

    // advance the host iterator the same way the other copy_to_host*()
    // overloads do instead of relying on HostIterator + size_t
    return iterator_plus_distance(result, count);
}
// Asynchronous svm_ptr overload of copy_to_host(): enqueues one
// enqueue_svm_memcpy_async() into host memory at the address of *result and
// returns a future with the end of the host range and the copy event. An
// empty range yields a default-constructed future.
template<class T, class HostIterator>
inline future<HostIterator> copy_to_host_async(svm_ptr<T> first,
                                               svm_ptr<T> last,
                                               HostIterator result,
                                               command_queue &queue)
{
    size_t element_count = iterator_range_size(first, last);
    if(!element_count){
        return future<HostIterator>();
    }

    // the memcpy length is expressed in bytes
    const size_t byte_count = element_count * sizeof(T);
    const event copy_event = queue.enqueue_svm_memcpy_async(
        ::boost::addressof(*result), first.get(), byte_count
    );

    return make_future(
        iterator_plus_distance(result, element_count), copy_event
    );
}
// svm_ptr overload of copy_to_host_map(): maps the source SVM region for
// reading, copies [first, last) to result with std::copy(), unmaps the
// region and waits for the unmap before returning the end of the host range.
template<class T, class HostIterator>
inline HostIterator copy_to_host_map(svm_ptr<T> first,
                                     svm_ptr<T> last,
                                     HostIterator result,
                                     command_queue &queue)
{
    size_t element_count = iterator_range_size(first, last);
    if(!element_count){
        return result;
    }

    // map the source region for host reads
    const size_t byte_count = element_count * sizeof(T);
    queue.enqueue_svm_map(first.get(), byte_count, CL_MAP_READ);

    // copy [first; last) to the host destination
    T *source_begin = static_cast<T*>(first.get());
    T *source_end = static_cast<T*>(last.get());
    std::copy(source_begin, source_end, result);

    // unmap and wait for completion
    queue.enqueue_svm_unmap(first.get()).wait();

    return iterator_plus_distance(result, element_count);
}
#endif // CL_VERSION_2_0
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_HOST_HPP
@@ -0,0 +1,78 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_BALLOT_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_BALLOT_HPP
#include <boost/compute/context.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/algorithm/reduce.hpp>
#include <boost/compute/functional/detail/nvidia_ballot.hpp>
#include <boost/compute/functional/detail/nvidia_popcount.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
namespace boost {
namespace compute {
namespace detail {
// Counts the elements of [first, last) that satisfy predicate using the
// nvidia_ballot / nvidia_popcount helpers.
//
// The input is processed in work-groups of 32 work-items. Every work-item
// evaluates the predicate for its element (false when its id is past the end
// of the input), ballot() combines those results into a bit mask, and the
// work-item with local id 0 stores the popcount of that mask in counts[group].
// The per-group counts are then summed with reduce().
//
// Returns 0 for an empty range without launching a kernel.
template<class InputIterator, class Predicate>
inline size_t count_if_with_ballot(InputIterator first,
                                   InputIterator last,
                                   Predicate predicate,
                                   command_queue &queue)
{
    size_t count = iterator_range_size(first, last);

    // an empty range would otherwise launch a kernel with zero work-items
    // and reduce an empty vector into an unset result
    if(count == 0){
        return 0;
    }

    // number of 32-element blocks, rounded up
    size_t block_size = 32;
    size_t block_count = count / block_size;
    if(block_count * block_size != count){
        block_count++;
    }

    const ::boost::compute::context &context = queue.get_context();

    // one partial count per work-group
    ::boost::compute::vector<uint_> counts(block_count, context);

    ::boost::compute::detail::nvidia_popcount<uint_> popc;
    ::boost::compute::detail::nvidia_ballot<uint_> ballot;

    meta_kernel k("count_if_with_ballot");
    k <<
        "const uint gid = get_global_id(0);\n" <<

        "bool value = false;\n" <<
        "if(gid < count)\n" <<
        "    value = " << predicate(first[k.var<const uint_>("gid")]) << ";\n" <<

        "uint bits = " << ballot(k.var<const uint_>("value")) << ";\n" <<

        "if(get_local_id(0) == 0)\n" <<
            counts.begin()[k.var<uint_>("get_group_id(0)") ]
                << " = " << popc(k.var<uint_>("bits")) << ";\n";

    k.add_set_arg<const uint_>("count", count);

    k.exec_1d(queue, 0, block_size * block_count, block_size);

    // sum the per-group counts; initialised so the returned value is always
    // defined
    uint_ result = 0;
    ::boost::compute::reduce(
        counts.begin(),
        counts.end(),
        &result,
        queue
    );
    return result;
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_BALLOT_HPP
@@ -0,0 +1,87 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_REDUCE_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_REDUCE_HPP
#include <boost/compute/algorithm/reduce.hpp>
#include <boost/compute/iterator/transform_iterator.hpp>
#include <boost/compute/types/fundamental.hpp>
namespace boost {
namespace compute {
namespace detail {
// Holds a predicate together with the argument it is to be invoked with, so
// that the invocation can later be streamed into a kernel.
template<class Predicate, class Arg>
struct invoked_countable_predicate
{
    // the wrapped predicate and the argument it will be applied to
    Predicate predicate;
    Arg arg;

    invoked_countable_predicate(Predicate wrapped_predicate, Arg argument)
        : predicate(wrapped_predicate),
          arg(argument)
    {
    }
};
// Streams an invoked_countable_predicate into the kernel source as
// "(<predicate(arg)> ? 1 : 0)" and returns the kernel for chaining.
template<class Predicate, class Arg>
inline meta_kernel& operator<<(meta_kernel &kernel,
                               const invoked_countable_predicate<Predicate, Arg> &expr)
{
    kernel << "(";
    kernel << expr.predicate(expr.arg);
    kernel << " ? 1 : 0)";
    return kernel;
}
// countable_predicate wraps Predicate so that its result can be summed with
// reduce(): result_type is ulong_ rather than bool, and invoking the wrapper
// yields an invoked_countable_predicate that binds the predicate to its
// argument.
template<class Predicate>
struct countable_predicate
{
    typedef ulong_ result_type;

    countable_predicate(Predicate predicate)
        : m_predicate(predicate)
    {
    }

    // binds the stored predicate to arg without evaluating it
    template<class Arg>
    invoked_countable_predicate<Predicate, Arg> operator()(const Arg &arg) const
    {
        typedef invoked_countable_predicate<Predicate, Arg> invoked_type;
        return invoked_type(m_predicate, arg);
    }

    Predicate m_predicate;
};
// counts the number of elements matching predicate using reduce()
template<class InputIterator, class Predicate>
inline size_t count_if_with_reduce(InputIterator first,
InputIterator last,
Predicate predicate,
command_queue &queue)
{
countable_predicate<Predicate> reduce_predicate(predicate);
ulong_ count = 0;
::boost::compute::reduce(
::boost::compute::make_transform_iterator(first, reduce_predicate),
::boost::compute::make_transform_iterator(last, reduce_predicate),
&count,
::boost::compute::plus<ulong_>(),
queue
);
return static_cast<size_t>(count);
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_REDUCE_HPP
@@ -0,0 +1,129 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP
#include <numeric>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/container/vector.hpp>
namespace boost {
namespace compute {
namespace detail {
// Kernel object that counts the elements matching a predicate with one
// work-item per contiguous block of the input.
//
// Usage: call set_args() once to record the input size and generate the
// kernel source, then exec() to run it and obtain the total count.
template<class InputIterator, class Predicate>
class count_if_with_threads_kernel : meta_kernel
{
public:
    typedef typename
        std::iterator_traits<InputIterator>::value_type
        value_type;

    count_if_with_threads_kernel()
        : meta_kernel("count_if_with_threads"),
          m_size(0),
          m_size_arg(0),
          m_counts_arg(0)
    {
    }

    // Records the size of [first, last), declares the "size" and "counts"
    // kernel arguments and emits the kernel source. Each work-item counts
    // the matches in its own block [start, end) and stores the result in
    // counts[gid]; the last work-item also takes the remainder of the input.
    void set_args(InputIterator first,
                  InputIterator last,
                  Predicate predicate)
    {
        typedef typename std::iterator_traits<InputIterator>::value_type T;

        m_size = detail::iterator_range_size(first, last);

        m_size_arg = add_arg<const ulong_>("size");
        m_counts_arg = add_arg<ulong_ *>(memory_object::global_memory, "counts");

        *this <<
            // thread parameters
            "const uint gid = get_global_id(0);\n" <<
            "const uint block_size = size / get_global_size(0);\n" <<
            "const uint start = block_size * gid;\n" <<
            "uint end = 0;\n" <<
            "if(gid == get_global_size(0) - 1)\n" <<
            "    end = size;\n" <<
            "else\n" <<
            "    end = block_size * gid + block_size;\n" <<

            // count values
            "uint count = 0;\n" <<
            "for(uint i = start; i < end; i++){\n" <<
                decl<const T>("value") << "="
                    << first[expr<uint_>("i")] << ";\n" <<
                if_(predicate(var<const T>("value"))) << "{\n" <<
                    "count++;\n" <<
                "}\n" <<
            "}\n" <<

            // write count
            "counts[gid] = count;\n";
    }

    // Runs the kernel on queue and returns the sum of all per-work-item
    // counts. Uses one work-item per compute unit unless that would leave
    // each with fewer than 2048 elements; in that case the number of
    // work-items is ceil(size / 2048), but never less than one.
    size_t exec(command_queue &queue)
    {
        const device &device = queue.get_device();
        const context &context = queue.get_context();

        size_t threads = device.compute_units();

        const size_t minimum_block_size = 2048;
        if(m_size / threads < minimum_block_size){
            // exact integer ceiling; avoids the round trip through float
            threads = (std::max)(
                (m_size + minimum_block_size - 1) / minimum_block_size,
                size_t(1)
            );
        }

        // storage for counts
        ::boost::compute::vector<ulong_> counts(threads, context);

        // exec kernel
        set_arg(m_size_arg, static_cast<ulong_>(m_size));
        set_arg(m_counts_arg, counts.get_buffer());
        exec_1d(queue, 0, threads, 1);

        // copy counts to the host
        std::vector<ulong_> host_counts(threads);
        ::boost::compute::copy(counts.begin(), counts.end(), host_counts.begin(), queue);

        // return sum of counts
        return std::accumulate(host_counts.begin(), host_counts.end(), size_t(0));
    }

private:
    size_t m_size;       // number of input elements, set by set_args()
    size_t m_size_arg;   // index of the "size" kernel argument
    size_t m_counts_arg; // index of the "counts" kernel argument
};
// counts values that match the predicate using one thread per block. this is
// optimized for cpu-type devices with a small number of compute units.
template<class InputIterator, class Predicate>
inline size_t count_if_with_threads(InputIterator first,
InputIterator last,
Predicate predicate,
command_queue &queue)
{
count_if_with_threads_kernel<InputIterator, Predicate> kernel;
kernel.set_args(first, last, predicate);
return kernel.exec(queue);
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP
@@ -0,0 +1,70 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_HPP
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/algorithm/detail/find_extrema_on_cpu.hpp>
#include <boost/compute/algorithm/detail/find_extrema_with_reduce.hpp>
#include <boost/compute/algorithm/detail/find_extrema_with_atomics.hpp>
#include <boost/compute/algorithm/detail/serial_find_extrema.hpp>
namespace boost {
namespace compute {
namespace detail {
// Dispatches to the find-extrema implementation suited to the device and
// input size and returns an iterator to the extremum of [first, last).
//
// find_minimum is forwarded unchanged to the selected implementation.
// Ranges with zero or one element return first directly.
template<class InputIterator, class Compare>
inline InputIterator find_extrema(InputIterator first,
                                  InputIterator last,
                                  Compare compare,
                                  const bool find_minimum,
                                  command_queue &queue)
{
    size_t count = iterator_range_size(first, last);

    // handle trivial cases
    if(count == 0 || count == 1){
        return first;
    }

    const device &device = queue.get_device();

    // CPU
    if(device.type() & device::cpu) {
        return find_extrema_on_cpu(first, last, compare, find_minimum, queue);
    }

    // GPU
    // use serial method for small inputs
    if(count < 512)
    {
        return serial_find_extrema(first, last, compare, find_minimum, queue);
    }
    // find_extrema_with_reduce() is used only if requirements are met
    if(find_extrema_with_reduce_requirements_met(first, last, queue))
    {
        return find_extrema_with_reduce(first, last, compare, find_minimum, queue);
    }

    // use serial method for OpenCL version 1.0 due to
    // problems with atomic_cmpxchg(); exactly one of the two returns is
    // compiled so no unreachable statement is left behind
    #ifndef CL_VERSION_1_1
        return serial_find_extrema(first, last, compare, find_minimum, queue);
    #else
        return find_extrema_with_atomics(first, last, compare, find_minimum, queue);
    #endif
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_HPP
@@ -0,0 +1,138 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_ON_CPU_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_ON_CPU_HPP
#include <algorithm>
#include <boost/compute/algorithm/detail/find_extrema_with_reduce.hpp>
#include <boost/compute/algorithm/detail/find_extrema_with_atomics.hpp>
#include <boost/compute/algorithm/detail/serial_find_extrema.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/iterator/buffer_iterator.hpp>
namespace boost {
namespace compute {
namespace detail {
// Finds the extremum of the device range [first, last) with a kernel that
// uses one work-item per compute unit.
//
// Inputs shorter than the serial threshold (read from the parameter cache,
// default 16384 * sizeof(value), and never below 2 * compute units) are
// handled by serial_find_extrema(). Otherwise every work-item scans one
// contiguous block and writes its winner and that winner's input index to
// temporary buffers; serial_find_extrema() then selects the overall winner
// among those per-work-item results and its stored index is used to build
// the returned iterator.
//
// When find_minimum is false the kernel is compiled with
// -DBOOST_COMPUTE_FIND_MAXIMUM, which swaps the comparison operands.
template<class InputIterator, class Compare>
inline InputIterator find_extrema_on_cpu(InputIterator first,
                                         InputIterator last,
                                         Compare compare,
                                         const bool find_minimum,
                                         command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
    size_t count = iterator_range_size(first, last);

    const device &device = queue.get_device();
    const uint_ compute_units = queue.get_device().compute_units();

    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);
    std::string cache_key =
        "__boost_find_extrema_cpu_"
            + boost::lexical_cast<std::string>(sizeof(input_type));

    // for inputs smaller than serial_find_extrema_threshold
    // serial_find_extrema algorithm is used
    uint_ serial_find_extrema_threshold = parameters->get(
        cache_key,
        "serial_find_extrema_threshold",
        16384 * sizeof(input_type)
    );
    serial_find_extrema_threshold =
        (std::max)(serial_find_extrema_threshold, uint_(2 * compute_units));

    const context &context = queue.get_context();
    if(count < serial_find_extrema_threshold) {
        return serial_find_extrema(first, last, compare, find_minimum, queue);
    }

    meta_kernel k("find_extrema_on_cpu");
    // per-work-item winners and the input index of each winner
    buffer output(context, sizeof(input_type) * compute_units);
    buffer output_idx(
        context, sizeof(uint_) * compute_units,
        buffer::read_write | buffer::alloc_host_ptr
    );

    size_t count_arg = k.add_arg<uint_>("count");
    size_t output_arg =
        k.add_arg<input_type *>(memory_object::global_memory, "output");
    size_t output_idx_arg =
        k.add_arg<uint_ *>(memory_object::global_memory, "output_idx");

    k <<
        // block = ceil(count / global size), computed with integers so that
        // no precision is lost for large counts
        "uint block = " <<
            "(uint)(count / get_global_size(0)) + " <<
            "((count % get_global_size(0)) != 0 ? 1 : 0);\n" <<
        // clamp the start so that work-items whose block would begin past
        // the input re-examine the last element instead of reading out of
        // range
        "uint index = min(count - 1, (uint)get_global_id(0) * block);\n" <<
        "uint end = min(count, index + block);\n" <<

        "uint value_index = index;\n" <<
        k.decl<input_type>("value") << " = " << first[k.var<uint_>("index")] << ";\n" <<

        "index++;\n" <<
        "while(index < end){\n" <<
            k.decl<input_type>("candidate") <<
                " = " << first[k.var<uint_>("index")] << ";\n" <<
        "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
            "bool compare = " << compare(k.var<input_type>("candidate"),
                                         k.var<input_type>("value")) << ";\n" <<
        "#else\n" <<
            "bool compare = " << compare(k.var<input_type>("value"),
                                         k.var<input_type>("candidate")) << ";\n" <<
        "#endif\n" <<
            "value = compare ? candidate : value;\n" <<
            "value_index = compare ? index : value_index;\n" <<
            "index++;\n" <<
        "}\n" <<
        "output[get_global_id(0)] = value;\n" <<
        "output_idx[get_global_id(0)] = value_index;\n";

    size_t global_work_size = compute_units;
    std::string options;
    if(!find_minimum){
        options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
    }
    kernel kernel = k.compile(context, options);

    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(output_arg, output);
    kernel.set_arg(output_idx_arg, output_idx);
    queue.enqueue_1d_range_kernel(kernel, 0, global_work_size, 0);

    // pick the overall winner among the per-work-item results
    buffer_iterator<input_type> result = serial_find_extrema(
        make_buffer_iterator<input_type>(output),
        make_buffer_iterator<input_type>(output, global_work_size),
        compare,
        find_minimum,
        queue
    );

    // look up the input index that belongs to the winning work-item
    uint_* output_idx_host_ptr =
        static_cast<uint_*>(
            queue.enqueue_map_buffer(
                output_idx, command_queue::map_read,
                0, global_work_size * sizeof(uint_)
            )
        );

    const difference_type extremum_idx =
        static_cast<difference_type>(*(output_idx_host_ptr + result.get_index()));

    // release the mapping now that the index has been read
    queue.enqueue_unmap_buffer(
        output_idx, static_cast<void*>(output_idx_host_ptr)
    ).wait();

    return first + extremum_idx;
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_ON_CPU_HPP
@@ -0,0 +1,108 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_ATOMICS_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_ATOMICS_HPP
#include <boost/compute/types.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/detail/scalar.hpp>
#include <boost/compute/functional/atomic.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
namespace boost {
namespace compute {
namespace detail {
// Finds the extremum of [first, last) with one work-item per input element
// and a single index in global memory that is updated with atomic_cmpxchg().
//
// The index starts at 0. Every work-item compares its own element (at gid)
// with the element at the currently stored index and, while its element wins
// (or is equivalent and has the smaller index), tries to swap its gid into
// the stored index. After the kernel has run, the stored index is read back
// and first + index is returned.
//
// When find_minimum is false the kernel is compiled with
// -DBOOST_COMPUTE_FIND_MAXIMUM, which swaps the comparison operands.
template<class InputIterator, class Compare>
inline InputIterator find_extrema_with_atomics(InputIterator first,
InputIterator last,
Compare compare,
const bool find_minimum,
command_queue &queue)
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
const context &context = queue.get_context();
meta_kernel k("find_extrema");
atomic_cmpxchg<uint_> atomic_cmpxchg_uint;
k <<
"const uint gid = get_global_id(0);\n" <<
// snapshot of the currently stored best index and its element
"uint old_index = *index;\n" <<
k.decl<value_type>("old") <<
" = " << first[k.var<uint_>("old_index")] << ";\n" <<
// this work-item's own element
k.decl<value_type>("new") <<
" = " << first[k.var<uint_>("gid")] << ";\n" <<
k.decl<bool>("compare_result") << ";\n" <<
// maximum case: same loop condition as below with the operands of
// compare() swapped
"#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
"while(" <<
"(compare_result = " << compare(k.var<value_type>("old"),
k.var<value_type>("new")) << ")" <<
" || (!(compare_result" <<
" || " << compare(k.var<value_type>("new"),
k.var<value_type>("old")) << ") "
"&& gid < old_index)){\n" <<
"#else\n" <<
// while condition explained for minimum case with less (<)
// as comparison function:
// while(new_value < old_value
// OR (new_value == old_value AND new_index < old_index))
"while(" <<
"(compare_result = " << compare(k.var<value_type>("new"),
k.var<value_type>("old")) << ")" <<
" || (!(compare_result" <<
" || " << compare(k.var<value_type>("old"),
k.var<value_type>("new")) << ") "
"&& gid < old_index)){\n" <<
"#endif\n" <<
// try to publish gid; stop if the stored index was still old_index,
// otherwise reload the stored index and its element and test again
" if(" << atomic_cmpxchg_uint(k.var<uint_ *>("index"),
k.var<uint_>("old_index"),
k.var<uint_>("gid")) << " == old_index)\n" <<
" break;\n" <<
" else\n" <<
" old_index = *index;\n" <<
"old = " << first[k.var<uint_>("old_index")] << ";\n" <<
"}\n";
// the "index" argument is declared after the source that refers to it
size_t index_arg_index = k.add_arg<uint_ *>(memory_object::global_memory, "index");
std::string options;
if(!find_minimum){
options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
}
kernel kernel = k.compile(context, options);
// setup index buffer
scalar<uint_> index(context);
kernel.set_arg(index_arg_index, index.get_buffer());
// initialize index
index.write(0, queue);
// run kernel
size_t count = iterator_range_size(first, last);
queue.enqueue_1d_range_kernel(kernel, 0, count, 0);
// read index and return iterator
return first + static_cast<difference_type>(index.read(queue));
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_ATOMICS_HPP
@@ -0,0 +1,443 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
#include <algorithm>
#include <boost/compute/types.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/allocator/pinned_allocator.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/parameter_cache.hpp>
#include <boost/compute/memory/local_buffer.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/utility/program_cache.hpp>
namespace boost {
namespace compute {
namespace detail {
// Returns true when the device behind queue is suitable for
// find_extrema_with_reduce(): it must report dedicated local memory
// (CL_LOCAL), and four times the local memory needed by one work-group
// (one uint_ index plus one value per work-item) must fit into the device's
// local memory. Only the value type of InputIterator is used; first and
// last themselves are not inspected.
template<class InputIterator>
bool find_extrema_with_reduce_requirements_met(InputIterator first,
                                               InputIterator last,
                                               command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;

    const device &device = queue.get_device();

    // device must have dedicated local memory storage
    // otherwise reduction would be highly inefficient
    const bool has_dedicated_local_memory =
        (device.get_info<CL_DEVICE_LOCAL_MEM_TYPE>() == CL_LOCAL);
    if(!has_dedicated_local_memory)
    {
        return false;
    }

    const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
    // local memory size in bytes (per compute unit)
    const size_t local_mem_size = device.get_info<CL_DEVICE_LOCAL_MEM_SIZE>();

    // load parameters
    std::string cache_key = std::string("__boost_find_extrema_reduce_")
        + type_name<input_type>();
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    // preferred work group size, capped by what the device supports
    size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
    work_group_size = (std::min)(max_work_group_size, work_group_size);

    // local memory needed by one work group: indices plus values
    const size_t indices_size = sizeof(uint_) * work_group_size;
    const size_t values_size = sizeof(input_type) * work_group_size;
    const size_t required_local_mem_size = indices_size + values_size;

    // at least 4 work groups per compute unit otherwise reduction
    // would be highly inefficient
    return (required_local_mem_size * 4) <= local_mem_size;
}
/// \internal_
/// Builds, compiles and enqueues the "find_extrema_reduce" kernel.
///
/// Algorithm finds the first extremum in given range, i.e., with the lowest
/// index.
///
/// Every work-item first walks the input with a stride of
/// get_global_size(0), keeping the best value it has seen together with that
/// value's index. The per-item results are then combined inside each
/// work-group through local memory, and the work-item with local id 0 writes
/// one (value, index) pair per work-group to \p result and \p result_idx.
///
/// If \p use_input_idx is false, it's assumed that input data is ordered by
/// increasing index and \p input_idx is not used in the algorithm.
///
/// \param input           first element of the values to scan
/// \param input_idx       original index of every input value; only read when
///                        \p use_input_idx is true
/// \param count           number of input values (passed to the kernel as uint_)
/// \param result          receives one candidate value per work-group
/// \param result_idx      receives the index belonging to each candidate
/// \param work_groups_no  number of work-groups to launch
/// \param work_group_size local size; also sizes both local-memory blocks and
///                        is baked into the kernel source as the reduction start
/// \param compare         comparison emitted into the kernel source
/// \param find_minimum    when false the kernel is compiled with
///                        -DBOOST_COMPUTE_FIND_MAXIMUM, which swaps the
///                        arguments given to \p compare
/// \param use_input_idx   when true the kernel is compiled with
///                        -DBOOST_COMPUTE_USE_INPUT_IDX
/// \param queue           queue used for compilation context and execution
template<class InputIterator, class ResultIterator, class Compare>
inline void find_extrema_with_reduce(InputIterator input,
vector<uint_>::iterator input_idx,
size_t count,
ResultIterator result,
vector<uint_>::iterator result_idx,
size_t work_groups_no,
size_t work_group_size,
Compare compare,
const bool find_minimum,
const bool use_input_idx,
command_queue &queue)
{
typedef typename std::iterator_traits<InputIterator>::value_type input_type;
const context &context = queue.get_context();
meta_kernel k("find_extrema_reduce");
// kernel arguments: element count plus two local-memory scratch arrays
// (candidate values and their indices)
size_t count_arg = k.add_arg<uint_>("count");
size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");
size_t block_idx_arg = k.add_arg<uint_ *>(memory_object::local_memory, "block_idx");
k <<
// Work item global id
k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
// Index of element that will be read from input buffer
k.decl<uint_>("idx") << " = gid;\n" <<
k.decl<input_type>("acc") << ";\n" <<
k.decl<uint_>("acc_idx") << ";\n" <<
// work-items whose gid is past the end leave acc/acc_idx unset
"if(gid < count) {\n" <<
// Real index of currently best element
"#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
k.var<uint_>("acc_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
"#else\n" <<
k.var<uint_>("acc_idx") << " = idx;\n" <<
"#endif\n" <<
// Init accumulator with first[get_global_id(0)]
"acc = " << input[k.var<uint_>("idx")] << ";\n" <<
"idx += get_global_size(0);\n" <<
"}\n" <<
k.decl<bool>("compare_result") << ";\n" <<
k.decl<bool>("equal") << ";\n\n" <<
// per-work-item scan over the remaining elements, stepping by the
// global size
"while( idx < count ){\n" <<
// Next element
k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" <<
"#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
k.decl<uint_>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
"#endif\n" <<
// Comparison between currently best element (acc) and next element
// compare_result is true when acc should be kept; "equal" means that
// neither element compares before the other
"#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
"compare_result = " << compare(k.var<input_type>("next"),
k.var<input_type>("acc")) << ";\n" <<
"# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
"equal = !compare_result && !" <<
compare(k.var<input_type>("acc"),
k.var<input_type>("next")) << ";\n" <<
"# endif\n" <<
"#else\n" <<
"compare_result = " << compare(k.var<input_type>("acc"),
k.var<input_type>("next")) << ";\n" <<
"# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
"equal = !compare_result && !" <<
compare(k.var<input_type>("next"),
k.var<input_type>("acc")) << ";\n" <<
"# endif\n" <<
"#endif\n" <<
// save the winner
"acc = compare_result ? acc : next;\n" <<
// with explicit indices, equal elements resolve to the smaller index
"#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
"acc_idx = compare_result ? " <<
"acc_idx : " <<
"(equal ? min(acc_idx, next_idx) : next_idx);\n" <<
"#else\n" <<
"acc_idx = compare_result ? acc_idx : idx;\n" <<
"#endif\n" <<
"idx += get_global_size(0);\n" <<
"}\n\n" <<
// Work item local id
k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
"block[lid] = acc;\n" <<
"block_idx[lid] = acc_idx;\n" <<
"barrier(CLK_LOCAL_MEM_FENCE);\n" <<
// number of input elements at or after this group's first global id;
// the reduction below uses it to skip slots of work-items that never
// initialised their accumulator
k.decl<uint_>("group_offset") <<
" = count - (get_local_size(0) * get_group_id(0));\n\n";
// reduction in local memory: the offset starts at work_group_size / 2
// (emitted as a literal) and is halved every iteration
k <<
"#pragma unroll\n"
"for(" << k.decl<uint_>("offset") << " = " << uint_(work_group_size) << " / 2; offset > 0; " <<
"offset = offset / 2) {\n" <<
"if((lid < offset) && ((lid + offset) < group_offset)) { \n" <<
k.decl<input_type>("mine") << " = block[lid];\n" <<
k.decl<input_type>("other") << " = block[lid+offset];\n" <<
"#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
"compare_result = " << compare(k.var<input_type>("other"),
k.var<input_type>("mine")) << ";\n" <<
"equal = !compare_result && !" <<
compare(k.var<input_type>("mine"),
k.var<input_type>("other")) << ";\n" <<
"#else\n" <<
"compare_result = " << compare(k.var<input_type>("mine"),
k.var<input_type>("other")) << ";\n" <<
"equal = !compare_result && !" <<
compare(k.var<input_type>("other"),
k.var<input_type>("mine")) << ";\n" <<
"#endif\n" <<
"block[lid] = compare_result ? mine : other;\n" <<
k.decl<uint_>("mine_idx") << " = block_idx[lid];\n" <<
k.decl<uint_>("other_idx") << " = block_idx[lid+offset];\n" <<
// equal candidates always resolve to the smaller index here
"block_idx[lid] = compare_result ? " <<
"mine_idx : " <<
"(equal ? min(mine_idx, other_idx) : other_idx);\n" <<
"}\n"
"barrier(CLK_LOCAL_MEM_FENCE);\n" <<
"}\n\n" <<
// write block result to global output
"if(lid == 0){\n" <<
result[k.var<uint_>("get_group_id(0)")] << " = block[0];\n" <<
result_idx[k.var<uint_>("get_group_id(0)")] << " = block_idx[0];\n" <<
"}";
// select the kernel variant through preprocessor definitions
std::string options;
if(!find_minimum){
options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
}
if(use_input_idx){
options += " -DBOOST_COMPUTE_USE_INPUT_IDX";
}
kernel kernel = k.compile(context, options);
kernel.set_arg(count_arg, static_cast<uint_>(count));
kernel.set_arg(block_arg, local_buffer<input_type>(work_group_size));
kernel.set_arg(block_idx_arg, local_buffer<uint_>(work_group_size));
// global size is work_groups_no groups of work_group_size work-items
queue.enqueue_1d_range_kernel(kernel,
0,
work_groups_no * work_group_size,
work_group_size);
}
template<class InputIterator, class ResultIterator, class Compare>
inline void find_extrema_with_reduce(InputIterator input,
size_t count,
ResultIterator result,
vector<uint_>::iterator result_idx,
size_t work_groups_no,
size_t work_group_size,
Compare compare,
const bool find_minimum,
command_queue &queue)
{
// dummy will not be used
buffer_iterator<uint_> dummy = result_idx;
return find_extrema_with_reduce(
input, dummy, count, result, result_idx, work_groups_no,
work_group_size, compare, find_minimum, false, queue
);
}
template<class InputIterator, class Compare>
InputIterator find_extrema_with_reduce(InputIterator first,
InputIterator last,
Compare compare,
const bool find_minimum,
command_queue &queue)
{
typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
typedef typename std::iterator_traits<InputIterator>::value_type input_type;
const context &context = queue.get_context();
const device &device = queue.get_device();
// Getting information about used queue and device
const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
const size_t count = detail::iterator_range_size(first, last);
std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
+ type_name<input_type>();
// load parameters
boost::shared_ptr<parameter_cache> parameters =
detail::parameter_cache::get_global_cache(device);
// get preferred work group size and preferred number
// of work groups per compute unit
size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 100);
// calculate work group size and number of work groups
work_group_size = (std::min)(max_work_group_size, work_group_size);
size_t work_groups_no = compute_units_no * work_groups_per_cu;
work_groups_no = (std::min)(
work_groups_no,
static_cast<size_t>(std::ceil(float(count) / work_group_size))
);
// phase I: finding candidates for extremum
// device buffors for extremum candidates and their indices
// each work-group computes its candidate
vector<input_type> candidates(work_groups_no, context);
vector<uint_> candidates_idx(work_groups_no, context);
// finding candidates for first extremum and their indices
find_extrema_with_reduce(
first, count, candidates.begin(), candidates_idx.begin(),
work_groups_no, work_group_size, compare, find_minimum, queue
);
// phase II: finding extremum from among the candidates
// zero-copy buffers for final result (value and index)
vector<input_type, ::boost::compute::pinned_allocator<input_type> >
result(1, context);
vector<uint_, ::boost::compute::pinned_allocator<uint_> >
result_idx(1, context);
// get extremum from among the candidates
find_extrema_with_reduce(
candidates.begin(), candidates_idx.begin(), work_groups_no, result.begin(),
result_idx.begin(), 1, work_group_size, compare, find_minimum, true, queue
);
// mapping extremum index to host
uint_* result_idx_host_ptr =
static_cast<uint_*>(
queue.enqueue_map_buffer(
result_idx.get_buffer(), command_queue::map_read,
0, sizeof(uint_)
)
);
return first + static_cast<difference_type>(*result_idx_host_ptr);
}
template<class InputIterator>
InputIterator find_extrema_with_reduce(InputIterator first,
InputIterator last,
::boost::compute::less<
typename std::iterator_traits<
InputIterator
>::value_type
>
compare,
const bool find_minimum,
command_queue &queue)
{
typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
typedef typename std::iterator_traits<InputIterator>::value_type input_type;
const context &context = queue.get_context();
const device &device = queue.get_device();
// Getting information about used queue and device
const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
const size_t count = detail::iterator_range_size(first, last);
std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
+ type_name<input_type>();
// load parameters
boost::shared_ptr<parameter_cache> parameters =
detail::parameter_cache::get_global_cache(device);
// get preferred work group size and preferred number
// of work groups per compute unit
size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 64);
// calculate work group size and number of work groups
work_group_size = (std::min)(max_work_group_size, work_group_size);
size_t work_groups_no = compute_units_no * work_groups_per_cu;
work_groups_no = (std::min)(
work_groups_no,
static_cast<size_t>(std::ceil(float(count) / work_group_size))
);
// phase I: finding candidates for extremum
// device buffors for extremum candidates and their indices
// each work-group computes its candidate
// zero-copy buffers are used to eliminate copying data back to host
vector<input_type, ::boost::compute::pinned_allocator<input_type> >
candidates(work_groups_no, context);
vector<uint_, ::boost::compute::pinned_allocator <uint_> >
candidates_idx(work_groups_no, context);
// finding candidates for first extremum and their indices
find_extrema_with_reduce(
first, count, candidates.begin(), candidates_idx.begin(),
work_groups_no, work_group_size, compare, find_minimum, queue
);
// phase II: finding extremum from among the candidates
// mapping candidates and their indices to host
input_type* candidates_host_ptr =
static_cast<input_type*>(
queue.enqueue_map_buffer(
candidates.get_buffer(), command_queue::map_read,
0, work_groups_no * sizeof(input_type)
)
);
uint_* candidates_idx_host_ptr =
static_cast<uint_*>(
queue.enqueue_map_buffer(
candidates_idx.get_buffer(), command_queue::map_read,
0, work_groups_no * sizeof(uint_)
)
);
input_type* i = candidates_host_ptr;
uint_* idx = candidates_idx_host_ptr;
uint_* extremum_idx = idx;
input_type extremum = *candidates_host_ptr;
i++; idx++;
// find extremum (serial) from among the candidates on host
if(!find_minimum) {
while(idx != (candidates_idx_host_ptr + work_groups_no)) {
input_type next = *i;
bool compare_result = next > extremum;
bool equal = next == extremum;
extremum = compare_result ? next : extremum;
extremum_idx = compare_result ? idx : extremum_idx;
extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
idx++, i++;
}
}
else {
while(idx != (candidates_idx_host_ptr + work_groups_no)) {
input_type next = *i;
bool compare_result = next < extremum;
bool equal = next == extremum;
extremum = compare_result ? next : extremum;
extremum_idx = compare_result ? idx : extremum_idx;
extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
idx++, i++;
}
}
return first + static_cast<difference_type>(*extremum_idx);
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
@@ -0,0 +1,212 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
#include <iterator>
#include <boost/compute/types.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/detail/scalar.hpp>
#include <boost/compute/iterator/buffer_iterator.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/parameter_cache.hpp>
namespace boost {
namespace compute {
namespace detail {
template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if_with_atomics_one_vpt(InputIterator first,
InputIterator last,
UnaryPredicate predicate,
const size_t count,
command_queue &queue)
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
const context &context = queue.get_context();
detail::meta_kernel k("find_if");
size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");
atomic_min<uint_> atomic_min_uint;
k << k.decl<const uint_>("i") << " = get_global_id(0);\n"
<< k.decl<const value_type>("value") << "="
<< first[k.var<const uint_>("i")] << ";\n"
<< "if(" << predicate(k.var<const value_type>("value")) << "){\n"
<< " " << atomic_min_uint(k.var<uint_ *>("index"), k.var<uint_>("i")) << ";\n"
<< "}\n";
kernel kernel = k.compile(context);
scalar<uint_> index(context);
kernel.set_arg(index_arg, index.get_buffer());
// initialize index to the last iterator's index
index.write(static_cast<uint_>(count), queue);
queue.enqueue_1d_range_kernel(kernel, 0, count, 0);
// read index and return iterator
return first + static_cast<difference_type>(index.read(queue));
}
/// \internal_
/// find_if kernel in which every work-item checks up to \p vpt values and
/// stops once it finds a match or sees that a smaller index has already been
/// recorded. Two kernel variants are generated depending on the device type.
///
/// \param first     first element of the input range
/// \param last      end of the input range (not used by this function)
/// \param predicate unary predicate emitted into the kernel
/// \param count     number of elements; also the initial ("not found") index
/// \param vpt       values checked per work-item; 0 is treated as 1
/// \param queue     command queue used for execution
/// \return first + smallest matching index, or first + count if none matched
template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if_with_atomics_multiple_vpt(InputIterator first,
                                                       InputIterator last,
                                                       UnaryPredicate predicate,
                                                       const size_t count,
                                                       const size_t vpt,
                                                       command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    const context &context = queue.get_context();
    const device &device = queue.get_device();

    detail::meta_kernel k("find_if");
    size_t index_arg = k.add_arg<uint_ *>(memory_object::global_memory, "index");
    size_t count_arg = k.add_arg<const uint_>("count");
    size_t vpt_arg = k.add_arg<const uint_>("vpt");
    atomic_min<uint_> atomic_min_uint;

    // for GPUs reads from global memory are coalesced
    if(device.type() & device::gpu) {
        k <<
            k.decl<const uint_>("lsize") << " = get_local_size(0);\n" <<
            k.decl<uint_>("id") << " = get_local_id(0) + get_group_id(0) * lsize * vpt;\n" <<
            k.decl<const uint_>("end") << " = min(" <<
            "id + (lsize *" << k.var<uint_>("vpt") << ")," <<
            "count" <<
            ");\n" <<

            // checking if the index is already found
            "__local uint local_index;\n" <<
            "if(get_local_id(0) == 0){\n" <<
            " local_index = *index;\n " <<
            "};\n" <<
            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "if(local_index < id){\n" <<
            " return;\n" <<
            "}\n" <<

            "while(id < end){\n" <<
            " " << k.decl<const value_type>("value") << " = " <<
            first[k.var<const uint_>("id")] << ";\n"
            " if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
            " " << atomic_min_uint(k.var<uint_ *>("index"),
                                   k.var<uint_>("id")) << ";\n" <<
            " return;\n"
            " }\n" <<
            " id+=lsize;\n" <<
            "}\n";
    // for CPUs (and other devices) reads are ordered so the big cache is
    // efficiently used.
    } else {
        k <<
            k.decl<uint_>("id") << " = get_global_id(0) * " << k.var<uint_>("vpt") << ";\n" <<
            k.decl<const uint_>("end") << " = min(" <<
            "id + " << k.var<uint_>("vpt") << "," <<
            "count" <<
            ");\n" <<
            "while(id < end && (*index) > id){\n" <<
            " " << k.decl<const value_type>("value") << " = " <<
            first[k.var<const uint_>("id")] << ";\n"
            " if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
            " " << atomic_min_uint(k.var<uint_ *>("index"),
                                   k.var<uint_>("id")) << ";\n" <<
            " return;\n" <<
            " }\n" <<
            " id++;\n" <<
            "}\n";
    }

    kernel kernel = k.compile(context);

    // A work-item has to check at least one value; this also keeps the
    // division below well defined.
    const size_t values_per_thread = (vpt == 0) ? 1 : vpt;

    scalar<uint_> index(context);
    kernel.set_arg(index_arg, index.get_buffer());
    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(vpt_arg, static_cast<uint_>(values_per_thread));

    // initialize index to the last iterator's index
    index.write(static_cast<uint_>(count), queue);

    // Number of work-items = ceil(count / values_per_thread), computed with
    // integers. Going through float loses precision for counts above 2^24
    // and can round the range down, leaving trailing elements unchecked.
    const size_t global_wg_size =
        count / values_per_thread + (count % values_per_thread != 0 ? 1 : 0);
    queue.enqueue_1d_range_kernel(kernel, 0, global_wg_size, 0);

    // read index and return iterator
    return first + static_cast<difference_type>(index.read(queue));
}
template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if_with_atomics(InputIterator first,
InputIterator last,
UnaryPredicate predicate,
command_queue &queue)
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
size_t count = detail::iterator_range_size(first, last);
if(count == 0){
return last;
}
const device &device = queue.get_device();
// load cached parameters
std::string cache_key = std::string("__boost_find_if_with_atomics_")
+ type_name<value_type>();
boost::shared_ptr<parameter_cache> parameters =
detail::parameter_cache::get_global_cache(device);
// for relatively small inputs on GPUs kernel checking one value per thread
// (work-item) is more efficient than its multiple values per thread version
if(device.type() & device::gpu){
const size_t one_vpt_threshold =
parameters->get(cache_key, "one_vpt_threshold", 1048576);
if(count <= one_vpt_threshold){
return find_if_with_atomics_one_vpt(
first, last, predicate, count, queue
);
}
}
// values per thread
size_t vpt;
if(device.type() & device::gpu){
// get vpt parameter
vpt = parameters->get(cache_key, "vpt", 32);
} else {
// for CPUs work is split equally between compute units
const size_t max_compute_units =
device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
vpt = static_cast<size_t>(
std::ceil(float(count) / max_compute_units)
);
}
return find_if_with_atomics_multiple_vpt(
first, last, predicate, count, vpt, queue
);
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
@@ -0,0 +1,136 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP
#include <iterator>
#include <boost/utility/result_of.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/memory/local_buffer.hpp>
namespace boost {
namespace compute {
namespace detail {
/// \internal_
/// Reduces [first, last) with \p function and leaves the result in the first
/// element of the range. Ranges with fewer than two elements are left
/// untouched.
///
/// The reduction runs in passes. In each pass a work-item folds up to
/// values_per_thread consecutive values, a work-group of block_size items
/// combines them through local memory, and one partial result per work-group
/// is written to the output buffer. Input and output buffers are swapped
/// between passes until a single value remains.
///
/// \param first,last range to reduce; its underlying buffer is also used as
///                   scratch storage by alternate passes
/// \param function   binary function emitted into the kernel
/// \param queue      command queue used for execution
template<class Iterator, class BinaryFunction>
inline void inplace_reduce(Iterator first,
                           Iterator last,
                           BinaryFunction function,
                           command_queue &queue)
{
    typedef typename
        std::iterator_traits<Iterator>::value_type
        value_type;

    size_t input_size = iterator_range_size(first, last);
    if(input_size < 2){
        return;
    }

    const context &context = queue.get_context();

    const size_t block_size = 64;
    const size_t values_per_thread = 8;
    // number of input values consumed by one work-group per pass
    const size_t values_per_block = block_size * values_per_thread;

    // ceil(input_size / values_per_block)
    size_t block_count = input_size / values_per_block;
    if(block_count * values_per_block != input_size)
        block_count++;

    vector<value_type> output(block_count, context);

    meta_kernel k("inplace_reduce");
    size_t input_arg = k.add_arg<value_type *>(memory_object::global_memory, "input");
    size_t input_size_arg = k.add_arg<const uint_>("input_size");
    size_t output_arg = k.add_arg<value_type *>(memory_object::global_memory, "output");
    size_t scratch_arg = k.add_arg<value_type *>(memory_object::local_memory, "scratch");

    k <<
        "const uint gid = get_global_id(0);\n" <<
        "const uint lid = get_local_id(0);\n" <<
        "const uint values_per_thread =\n"
            << uint_(values_per_thread) << ";\n" <<

        // thread reduce
        "const uint index = gid * values_per_thread;\n" <<
        "if(index < input_size){\n" <<
            k.decl<value_type>("sum") << " = input[index];\n" <<
            "for(uint i = 1;\n" <<
                "i < values_per_thread && (index + i) < input_size;\n" <<
                "i++){\n" <<
            " sum = " <<
                function(k.var<value_type>("sum"),
                         k.var<value_type>("input[index+i]")) << ";\n" <<
            "}\n" <<
            "scratch[lid] = sum;\n" <<
        "}\n" <<

        // local reduce
        "for(uint i = 1; i < get_local_size(0); i <<= 1){\n" <<
        " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        " uint mask = (i << 1) - 1;\n" <<
        " uint next_index = (gid + i) * values_per_thread;\n"
        " if((lid & mask) == 0 && next_index < input_size){\n" <<
        " scratch[lid] = " <<
            function(k.var<value_type>("scratch[lid]"),
                     k.var<value_type>("scratch[lid+i]")) << ";\n" <<
        " }\n" <<
        "}\n" <<

        // write output for block
        "if(lid == 0){\n" <<
        " output[get_group_id(0)] = scratch[0];\n" <<
        "}\n"
        ;

    const buffer *input_buffer = &first.get_buffer();
    const buffer *output_buffer = &output.get_buffer();

    kernel kernel = k.compile(context);

    while(input_size > 1){
        kernel.set_arg(input_arg, *input_buffer);
        kernel.set_arg(input_size_arg, static_cast<uint_>(input_size));
        kernel.set_arg(output_arg, *output_buffer);
        kernel.set_arg(scratch_arg, local_buffer<value_type>(block_size));

        queue.enqueue_1d_range_kernel(kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);

        // Each work-group wrote exactly one partial result, so the next pass
        // has block_count inputs. Deriving this from a float division
        // instead loses precision for sizes above 2^24 and could disagree
        // with block_count, dropping the last block's partial result.
        input_size = block_count;

        block_count = input_size / values_per_block;
        if(block_count * values_per_block != input_size)
            block_count++;

        // the buffer just written becomes the input of the next pass
        std::swap(input_buffer, output_buffer);
    }

    // after the final swap input_buffer names the buffer holding the result;
    // copy it back when that is the temporary buffer
    if(input_buffer != &first.get_buffer()){
        ::boost::compute::copy(output.begin(),
                               output.begin() + 1,
                               first,
                               queue);
    }
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP
@@ -0,0 +1,165 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_INSERTION_SORT_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_INSERTION_SORT_HPP
#include <boost/compute/kernel.hpp>
#include <boost/compute/program.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/memory/local_buffer.hpp>
namespace boost {
namespace compute {
namespace detail {
template<class Iterator, class Compare>
inline void serial_insertion_sort(Iterator first,
Iterator last,
Compare compare,
command_queue &queue)
{
typedef typename std::iterator_traits<Iterator>::value_type T;
size_t count = iterator_range_size(first, last);
if(count < 2){
return;
}
meta_kernel k("serial_insertion_sort");
size_t local_data_arg = k.add_arg<T *>(memory_object::local_memory, "data");
size_t count_arg = k.add_arg<uint_>("n");
k <<
// copy data to local memory
"for(uint i = 0; i < n; i++){\n" <<
" data[i] = " << first[k.var<uint_>("i")] << ";\n"
"}\n"
// sort data in local memory
"for(uint i = 1; i < n; i++){\n" <<
" " << k.decl<const T>("value") << " = data[i];\n" <<
" uint pos = i;\n" <<
" while(pos > 0 && " <<
compare(k.var<const T>("value"),
k.var<const T>("data[pos-1]")) << "){\n" <<
" data[pos] = data[pos-1];\n" <<
" pos--;\n" <<
" }\n" <<
" data[pos] = value;\n" <<
"}\n" <<
// copy sorted data to output
"for(uint i = 0; i < n; i++){\n" <<
" " << first[k.var<uint_>("i")] << " = data[i];\n"
"}\n";
const context &context = queue.get_context();
::boost::compute::kernel kernel = k.compile(context);
kernel.set_arg(local_data_arg, local_buffer<T>(count));
kernel.set_arg(count_arg, static_cast<uint_>(count));
queue.enqueue_task(kernel);
}
template<class Iterator>
inline void serial_insertion_sort(Iterator first,
Iterator last,
command_queue &queue)
{
typedef typename std::iterator_traits<Iterator>::value_type T;
::boost::compute::less<T> less;
return serial_insertion_sort(first, last, less, queue);
}
template<class KeyIterator, class ValueIterator, class Compare>
inline void serial_insertion_sort_by_key(KeyIterator keys_first,
KeyIterator keys_last,
ValueIterator values_first,
Compare compare,
command_queue &queue)
{
typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
typedef typename std::iterator_traits<ValueIterator>::value_type value_type;
size_t count = iterator_range_size(keys_first, keys_last);
if(count < 2){
return;
}
meta_kernel k("serial_insertion_sort_by_key");
size_t local_keys_arg = k.add_arg<key_type *>(memory_object::local_memory, "keys");
size_t local_data_arg = k.add_arg<value_type *>(memory_object::local_memory, "data");
size_t count_arg = k.add_arg<uint_>("n");
k <<
// copy data to local memory
"for(uint i = 0; i < n; i++){\n" <<
" keys[i] = " << keys_first[k.var<uint_>("i")] << ";\n"
" data[i] = " << values_first[k.var<uint_>("i")] << ";\n"
"}\n"
// sort data in local memory
"for(uint i = 1; i < n; i++){\n" <<
" " << k.decl<const key_type>("key") << " = keys[i];\n" <<
" " << k.decl<const value_type>("value") << " = data[i];\n" <<
" uint pos = i;\n" <<
" while(pos > 0 && " <<
compare(k.var<const key_type>("key"),
k.var<const key_type>("keys[pos-1]")) << "){\n" <<
" keys[pos] = keys[pos-1];\n" <<
" data[pos] = data[pos-1];\n" <<
" pos--;\n" <<
" }\n" <<
" keys[pos] = key;\n" <<
" data[pos] = value;\n" <<
"}\n" <<
// copy sorted data to output
"for(uint i = 0; i < n; i++){\n" <<
" " << keys_first[k.var<uint_>("i")] << " = keys[i];\n"
" " << values_first[k.var<uint_>("i")] << " = data[i];\n"
"}\n";
const context &context = queue.get_context();
::boost::compute::kernel kernel = k.compile(context);
kernel.set_arg(local_keys_arg, static_cast<uint_>(count * sizeof(key_type)), 0);
kernel.set_arg(local_data_arg, static_cast<uint_>(count * sizeof(value_type)), 0);
kernel.set_arg(count_arg, static_cast<uint_>(count));
queue.enqueue_task(kernel);
}
template<class KeyIterator, class ValueIterator>
inline void serial_insertion_sort_by_key(KeyIterator keys_first,
KeyIterator keys_last,
ValueIterator values_first,
command_queue &queue)
{
typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
serial_insertion_sort_by_key(
keys_first,
keys_last,
values_first,
boost::compute::less<key_type>(),
queue
);
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_INSERTION_SORT_HPP
@@ -0,0 +1,116 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_PATH_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_PATH_HPP
#include <iterator>
#include <boost/compute/algorithm/find_if.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/lambda.hpp>
#include <boost/compute/system.hpp>
namespace boost {
namespace compute {
namespace detail {
///
/// \brief Merge Path kernel class
///
/// Subclass of meta_kernel to break two sets into tiles according
/// to their merge path
///
/// Usage: call set_range() once to generate the kernel source, then exec()
/// to run it. Calling exec() before set_range() returns an empty event.
///
class merge_path_kernel : public meta_kernel
{
public:
    /// Number of merged elements covered by one tile (one work-item each).
    unsigned int tile_size;

    /// Creates the kernel with a tile size of 4 and empty input ranges.
    /// All bookkeeping members are initialised here so that exec() never
    /// reads indeterminate values.
    merge_path_kernel()
        : meta_kernel("merge_path"),
          tile_size(4),
          m_a_count(0),
          m_a_count_arg(0),
          m_b_count(0),
          m_b_count_arg(0)
    {
    }

    /// Generates the kernel source for the ranges [first1, last1) and
    /// [first2, last2). For tile i, work-item i runs a binary search for the
    /// split of the first (i+1)*tile_size merged elements and writes how
    /// many of them come from the first range to \p result_a[i] and how many
    /// from the second range to \p result_b[i].
    ///
    /// \param comp comparison emitted into the kernel
    template<class InputIterator1, class InputIterator2,
             class OutputIterator1, class OutputIterator2,
             class Compare>
    void set_range(InputIterator1 first1,
                   InputIterator1 last1,
                   InputIterator2 first2,
                   InputIterator2 last2,
                   OutputIterator1 result_a,
                   OutputIterator2 result_b,
                   Compare comp)
    {
        m_a_count = iterator_range_size(first1, last1);
        m_a_count_arg = add_arg<uint_>("a_count");

        m_b_count = iterator_range_size(first2, last2);
        m_b_count_arg = add_arg<uint_>("b_count");

        *this <<
            "uint i = get_global_id(0);\n" <<
            "uint target = (i+1)*" << tile_size << ";\n" <<
            "uint start = max(convert_int(0),convert_int(target)-convert_int(b_count));\n" <<
            "uint end = min(target,a_count);\n" <<
            "uint a_index, b_index;\n" <<
            "while(start<end)\n" <<
            "{\n" <<
            " a_index = (start + end)/2;\n" <<
            " b_index = target - a_index - 1;\n" <<
            " if(!(" << comp(first2[expr<uint_>("b_index")],
                             first1[expr<uint_>("a_index")]) << "))\n" <<
            " start = a_index + 1;\n" <<
            " else end = a_index;\n" <<
            "}\n" <<
            result_a[expr<uint_>("i")] << " = start;\n" <<
            result_b[expr<uint_>("i")] << " = target - start;\n";
    }

    /// Same as the overload above, comparing with ::boost::compute::less on
    /// the value type of the first range.
    template<class InputIterator1, class InputIterator2,
             class OutputIterator1, class OutputIterator2>
    void set_range(InputIterator1 first1,
                   InputIterator1 last1,
                   InputIterator2 first2,
                   InputIterator2 last2,
                   OutputIterator1 result_a,
                   OutputIterator2 result_b)
    {
        typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
        ::boost::compute::less<value_type> less_than;
        set_range(first1, last1, first2, last2, result_a, result_b, less_than);
    }

    /// Runs the kernel with one work-item per complete tile, i.e.
    /// (a_count + b_count) / tile_size work-items. Returns an empty event
    /// without enqueuing anything when there is no complete tile.
    event exec(command_queue &queue)
    {
        const size_t tile_count = (m_a_count + m_b_count)/tile_size;
        if(tile_count == 0) {
            return event();
        }

        set_arg(m_a_count_arg, uint_(m_a_count));
        set_arg(m_b_count_arg, uint_(m_b_count));

        return exec_1d(queue, 0, tile_count);
    }

private:
    size_t m_a_count;      // number of elements in the first range
    size_t m_a_count_arg;  // kernel argument slot of "a_count"
    size_t m_b_count;      // number of elements in the second range
    size_t m_b_count_arg;  // kernel argument slot of "b_count"
};
} //end detail namespace
} //end compute namespace
} //end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_PATH_HPP
@@ -0,0 +1,366 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
#include <boost/compute/kernel.hpp>
#include <boost/compute/program.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/detail/merge_with_merge_path.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
namespace boost {
namespace compute {
namespace detail {
/// \internal_
/// Merges pairs of adjacent sorted blocks of \p block_size elements from
/// \p keys_first into \p keys_result. Every work-item merges one pair
/// (blocks 2*gid and 2*gid+1); when the second block is empty the first one
/// is simply copied. If \p sort_by_key is true the values are moved together
/// with their keys, otherwise the value iterators are not used.
///
/// \param keys_first    first key of the input
/// \param values_first  first value of the input (used if \p sort_by_key)
/// \param keys_result   first key of the output
/// \param values_result first value of the output (used if \p sort_by_key)
/// \param compare       key comparison emitted into the kernel
/// \param count         total number of elements
/// \param block_size    length of one sorted input block
/// \param sort_by_key   whether values are merged along with the keys
/// \param queue         command queue used for execution
template<class KeyIterator, class ValueIterator, class Compare>
inline void merge_blocks(KeyIterator keys_first,
                         ValueIterator values_first,
                         KeyIterator keys_result,
                         ValueIterator values_result,
                         Compare compare,
                         size_t count,
                         const size_t block_size,
                         const bool sort_by_key,
                         command_queue &queue)
{
    (void) values_result;
    (void) values_first;

    // nothing to merge; also keeps the division below well defined
    if(count == 0 || block_size == 0){
        return;
    }

    meta_kernel k("merge_sort_on_cpu_merge_blocks");
    size_t count_arg = k.add_arg<const uint_>("count");
    size_t block_size_arg = k.add_arg<uint_>("block_size");

    k <<
        k.decl<uint_>("b1_start") << " = get_global_id(0) * block_size * 2;\n" <<
        k.decl<uint_>("b1_end") << " = min(count, b1_start + block_size);\n" <<
        k.decl<uint_>("b2_start") << " = min(count, b1_start + block_size);\n" <<
        k.decl<uint_>("b2_end") << " = min(count, b2_start + block_size);\n" <<
        k.decl<uint_>("result_idx") << " = b1_start;\n" <<

        // merging block 1 and block 2 (stable)
        "while(b1_start < b1_end && b2_start < b2_end){\n" <<
        " if( " << compare(keys_first[k.var<uint_>("b2_start")],
                           keys_first[k.var<uint_>("b1_start")]) << "){\n" <<
        " " << keys_result[k.var<uint_>("result_idx")] << " = " <<
            keys_first[k.var<uint_>("b2_start")] << ";\n";
    if(sort_by_key){
        k <<
        " " << values_result[k.var<uint_>("result_idx")] << " = " <<
            values_first[k.var<uint_>("b2_start")] << ";\n";
    }
    k <<
        " b2_start++;\n" <<
        " }\n" <<
        " else {\n" <<
        " " << keys_result[k.var<uint_>("result_idx")] << " = " <<
            keys_first[k.var<uint_>("b1_start")] << ";\n";
    if(sort_by_key){
        k <<
        " " << values_result[k.var<uint_>("result_idx")] << " = " <<
            values_first[k.var<uint_>("b1_start")] << ";\n";
    }
    k <<
        " b1_start++;\n" <<
        " }\n" <<
        " result_idx++;\n" <<
        "}\n" <<
        // copy whatever is left of block 1
        "while(b1_start < b1_end){\n" <<
        " " << keys_result[k.var<uint_>("result_idx")] << " = " <<
            keys_first[k.var<uint_>("b1_start")] << ";\n";
    if(sort_by_key){
        k <<
        " " << values_result[k.var<uint_>("result_idx")] << " = " <<
            values_first[k.var<uint_>("b1_start")] << ";\n";
    }
    k <<
        " b1_start++;\n" <<
        " result_idx++;\n" <<
        "}\n" <<
        // copy whatever is left of block 2
        "while(b2_start < b2_end){\n" <<
        " " << keys_result[k.var<uint_>("result_idx")] << " = " <<
            keys_first[k.var<uint_>("b2_start")] << ";\n";
    if(sort_by_key){
        k <<
        " " << values_result[k.var<uint_>("result_idx")] << " = " <<
            values_first[k.var<uint_>("b2_start")] << ";\n";
    }
    k <<
        " b2_start++;\n" <<
        " result_idx++;\n" <<
        "}\n";

    const context &context = queue.get_context();
    ::boost::compute::kernel kernel = k.compile(context);
    kernel.set_arg(count_arg, static_cast<const uint_>(count));
    kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));

    // One work-item per pair of blocks: ceil(count / (2 * block_size)),
    // computed with integers. Going through float loses precision for counts
    // above 2^24 and can round the range down, which would leave the last
    // blocks unmerged.
    const size_t pair_span = 2 * block_size;
    const size_t global_size =
        count / pair_span + (count % pair_span != 0 ? 1 : 0);
    queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
}
/// Keys-only overload of merge_blocks().
///
/// Forwards to the key/value overload, passing a default-constructed
/// iterator for both value ranges and \c false for its sort-by-key flag.
///
/// \param first first element of the input range
/// \param result first element of the output range
/// \param compare comparison function
/// \param count number of elements in the input range
/// \param block_size size of the blocks that are merged pairwise
/// \param sort_by_key ignored; this overload never handles values
/// \param queue command queue to perform the operation
template<class Iterator, class Compare>
inline void merge_blocks(Iterator first,
                         Iterator result,
                         Compare compare,
                         size_t count,
                         const size_t block_size,
                         const bool sort_by_key,
                         command_queue &queue)
{
    // the flag is kept for interface compatibility only; mark it as used
    // so that including this header does not produce unused-parameter warnings
    (void) sort_by_key;

    // dummy iterator as it's not sort by key
    Iterator dummy;
    merge_blocks(first, dummy, result, dummy, compare, count, block_size, false, queue);
}
/// Merges neighbouring sorted blocks of \p block_size elements from
/// [\p first, \p first + \p count) into the range starting at \p result,
/// choosing between merge_with_merge_path() and merge_blocks().
///
/// \param first first element of the input range
/// \param result first element of the output range
/// \param compare comparison function
/// \param count number of elements in the input range
/// \param block_size size of the blocks that are merged pairwise
/// \param input_size_threshold minimal \p count for which
///        merge_with_merge_path() is used
/// \param blocks_no_threshold maximal number of blocks for which
///        merge_with_merge_path() is used
/// \param queue command queue to perform the operation
template<class Iterator, class Compare>
inline void dispatch_merge_blocks(Iterator first,
                                  Iterator result,
                                  Compare compare,
                                  size_t count,
                                  const size_t block_size,
                                  const size_t input_size_threshold,
                                  const size_t blocks_no_threshold,
                                  command_queue &queue)
{
    // ceil(count / block_size) in integer arithmetic; converting count to
    // float is inexact for counts above 2^24
    const size_t blocks_no =
        count / block_size + ((count % block_size != 0) ? 1 : 0);

    // merge with merge path should be used only for large arrays and at the
    // end of the merging part, when there are only a few big blocks left to
    // be merged
    if(blocks_no <= blocks_no_threshold && count >= input_size_threshold){
        Iterator last = first + count;
        for(size_t i = 0; i < count; i += 2*block_size)
        {
            // [first1, last1) and [first2, last2) are the two neighbouring
            // blocks, both clamped to the end of the input
            Iterator first1 = (std::min)(first + i, last);
            Iterator last1 = (std::min)(first1 + block_size, last);
            Iterator first2 = last1;
            Iterator last2 = (std::min)(first2 + block_size, last);
            Iterator block_result = (std::min)(result + i, result + count);
            merge_with_merge_path(first1, last1, first2, last2,
                                  block_result, compare, queue);
        }
    }
    else {
        merge_blocks(first, result, compare, count, block_size, false, queue);
    }
}
/// Sorts every block of \p block_size consecutive elements with an
/// insertion sort; one work-item handles one block.
///
/// \param keys_first first key element in the range to sort
/// \param values_first first value element; only used when \p sort_by_key
///        is true
/// \param compare comparison function for keys
/// \param count number of elements in the range
/// \param block_size number of elements in each block
/// \param sort_by_key when true values are moved together with their keys
/// \param queue command queue to perform the operation
template<class KeyIterator, class ValueIterator, class Compare>
inline void block_insertion_sort(KeyIterator keys_first,
                                 ValueIterator values_first,
                                 Compare compare,
                                 const size_t count,
                                 const size_t block_size,
                                 const bool sort_by_key,
                                 command_queue &queue)
{
    (void) values_first;

    typedef typename std::iterator_traits<KeyIterator>::value_type K;
    typedef typename std::iterator_traits<ValueIterator>::value_type T;

    meta_kernel k("merge_sort_on_cpu_block_insertion_sort");
    size_t count_arg = k.add_arg<uint_>("count");
    size_t block_size_arg = k.add_arg<uint_>("block_size");

    k <<
        k.decl<uint_>("start") << " = get_global_id(0) * block_size;\n" <<
        k.decl<uint_>("end") << " = min(count, start + block_size);\n" <<

        // block insertion sort (stable)
        "for(uint i = start+1; i < end; i++){\n" <<
        "    " << k.decl<const K>("key") << " = " <<
                  keys_first[k.var<uint_>("i")] << ";\n";
    if(sort_by_key){
        k <<
        "    " << k.decl<const T>("value") << " = " <<
                  values_first[k.var<uint_>("i")] << ";\n";
    }
    k <<
        "    uint pos = i;\n" <<
        "    while(pos > start && " <<
                   compare(k.var<const K>("key"),
                           keys_first[k.var<uint_>("pos-1")]) << "){\n" <<
        "        " << keys_first[k.var<uint_>("pos")] << " = " <<
                      keys_first[k.var<uint_>("pos-1")] << ";\n";
    if(sort_by_key){
        k <<
        "        " << values_first[k.var<uint_>("pos")] << " = " <<
                      values_first[k.var<uint_>("pos-1")] << ";\n";
    }
    k <<
        "        pos--;\n" <<
        "    }\n" <<
        "    " << keys_first[k.var<uint_>("pos")] << " = key;\n";
    if(sort_by_key) {
        k <<
        "    " << values_first[k.var<uint_>("pos")] << " = value;\n";
    }
    k <<
        "}\n"; // block insertion sort

    const context &context = queue.get_context();
    ::boost::compute::kernel kernel = k.compile(context);
    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));

    // one work-item per block: ceil(count / block_size), computed with
    // integers because float(count) is inexact above 2^24 and could leave
    // the trailing block without a work-item
    const size_t global_size =
        count / block_size + ((count % block_size != 0) ? 1 : 0);
    queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
}
/// Keys-only overload of block_insertion_sort(): forwards to the key/value
/// overload with sorting by key switched off.
template<class Iterator, class Compare>
inline void block_insertion_sort(Iterator first,
                                 Compare compare,
                                 const size_t count,
                                 const size_t block_size,
                                 command_queue &queue)
{
    // there are no values to move, so a default-constructed iterator
    // fills the values slot
    const bool with_values = false;
    Iterator unused_values;
    block_insertion_sort(
        first, unused_values, compare, count, block_size, with_values, queue
    );
}
// This sort is stable.
template<class Iterator, class Compare>
inline void merge_sort_on_cpu(Iterator first,
Iterator last,
Compare compare,
command_queue &queue)
{
typedef typename std::iterator_traits<Iterator>::value_type value_type;
size_t count = iterator_range_size(first, last);
if(count < 2){
return;
}
// for small input size only insertion sort is performed
else if(count <= 512){
block_insertion_sort(first, compare, count, count, queue);
return;
}
const context &context = queue.get_context();
const device &device = queue.get_device();
// loading parameters
std::string cache_key =
std::string("__boost_merge_sort_on_cpu_") + type_name<value_type>();
boost::shared_ptr<parameter_cache> parameters =
detail::parameter_cache::get_global_cache(device);
// When there is merge_with_path_blocks_no_threshold or less blocks left to
// merge AND input size is merge_with_merge_path_input_size_threshold or more
// merge_with_merge_path() algorithm is used to merge sorted blocks;
// otherwise merge_blocks() is used.
const size_t merge_with_path_blocks_no_threshold =
parameters->get(cache_key, "merge_with_merge_path_blocks_no_threshold", 8);
const size_t merge_with_path_input_size_threshold =
parameters->get(cache_key, "merge_with_merge_path_input_size_threshold", 2097152);
const size_t block_size =
parameters->get(cache_key, "insertion_sort_block_size", 64);
block_insertion_sort(first, compare, count, block_size, queue);
// temporary buffer for merge result
vector<value_type> temp(count, context);
bool result_in_temporary_buffer = false;
for(size_t i = block_size; i < count; i *= 2){
result_in_temporary_buffer = !result_in_temporary_buffer;
if(result_in_temporary_buffer) {
dispatch_merge_blocks(first, temp.begin(), compare, count, i,
merge_with_path_input_size_threshold,
merge_with_path_blocks_no_threshold,
queue);
} else {
dispatch_merge_blocks(temp.begin(), first, compare, count, i,
merge_with_path_input_size_threshold,
merge_with_path_blocks_no_threshold,
queue);
}
}
if(result_in_temporary_buffer) {
copy(temp.begin(), temp.end(), first, queue);
}
}
// This sort is stable.
template<class KeyIterator, class ValueIterator, class Compare>
inline void merge_sort_by_key_on_cpu(KeyIterator keys_first,
KeyIterator keys_last,
ValueIterator values_first,
Compare compare,
command_queue &queue)
{
typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
typedef typename std::iterator_traits<ValueIterator>::value_type value_type;
size_t count = iterator_range_size(keys_first, keys_last);
if(count < 2){
return;
}
// for small input size only insertion sort is performed
else if(count <= 512){
block_insertion_sort(keys_first, values_first, compare,
count, count, true, queue);
return;
}
const context &context = queue.get_context();
const device &device = queue.get_device();
// loading parameters
std::string cache_key =
std::string("__boost_merge_sort_by_key_on_cpu_") + type_name<value_type>()
+ "_with_" + type_name<key_type>();
boost::shared_ptr<parameter_cache> parameters =
detail::parameter_cache::get_global_cache(device);
const size_t block_size =
parameters->get(cache_key, "insertion_sort_by_key_block_size", 64);
block_insertion_sort(keys_first, values_first, compare,
count, block_size, true, queue);
// temporary buffer for merge results
vector<value_type> values_temp(count, context);
vector<key_type> keys_temp(count, context);
bool result_in_temporary_buffer = false;
for(size_t i = block_size; i < count; i *= 2){
result_in_temporary_buffer = !result_in_temporary_buffer;
if(result_in_temporary_buffer) {
merge_blocks(keys_first, values_first,
keys_temp.begin(), values_temp.begin(),
compare, count, i, true, queue);
} else {
merge_blocks(keys_temp.begin(), values_temp.begin(),
keys_first, values_first,
compare, count, i, true, queue);
}
}
if(result_in_temporary_buffer) {
copy(keys_temp.begin(), keys_temp.end(), keys_first, queue);
copy(values_temp.begin(), values_temp.end(), values_first, queue);
}
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
@@ -0,0 +1,590 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_
#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_
#include <algorithm>
#include <boost/compute/kernel.hpp>
#include <boost/compute/program.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/memory/local_buffer.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
namespace boost {
namespace compute {
namespace detail {
/// Picks the block size (number of elements per work-group) used by
/// bitonic_block_sort().
///
/// Starting from \p proposed_wg the size is halved until the local memory
/// needed by a block fits into \p lmem_size. The local memory needed per
/// element is sizeof(KeyType), plus sizeof(ValueType) when \p sort_by_key
/// is true. The result is rounded down to a power of two, is never smaller
/// than 1 and never larger than 256.
///
/// \param proposed_wg proposed (maximal) block size
/// \param lmem_size available local memory in bytes
/// \param sort_by_key true if local memory is also needed for values
template<class KeyType, class ValueType>
inline size_t pick_bitonic_block_sort_block_size(size_t proposed_wg,
                                                 size_t lmem_size,
                                                 bool sort_by_key)
{
    // the value part has to be counted on every iteration below, otherwise
    // the block picked for sort-by-key may not fit into local memory
    const size_t bytes_per_element =
        sizeof(KeyType) + (sort_by_key ? sizeof(ValueType) : 0);

    size_t n = proposed_wg;

    // try to force at least 4 work-groups of >64 elements
    // for better occupancy
    while(lmem_size < (n * bytes_per_element * 4) && (n > 64)) {
        n /= 2;
    }

    // shrink further until a single block fits into local memory
    while(lmem_size < (n * bytes_per_element) && (n > 1)) {
        n /= 2;
    }

    // largest power of two that is <= n, limited to the range [1, 256]
    size_t block_size = 1;
    while(block_size < 256 && block_size * 2 <= n) {
        block_size *= 2;
    }
    return block_size;
}
/// Performs bitonic block sort according to \p compare.
///
/// The input is split into blocks, one block per work-group, and every
/// block is sorted independently in local memory. The block size is picked
/// with pick_bitonic_block_sort_block_size(), so it is a power of two not
/// larger than 256 (local indices of values are stored as uchar).
///
/// Since bitonic sort can only be performed when the number of sorted
/// elements is equal to 2^n, we would have to require \p count to be an
/// exact multiple of the block size. That would not be great.
/// Instead, the bitonic sort kernel is merged with an odd-even sort, so if
/// the size of the last block is not equal to 2^n (where n is some natural
/// number) the odd-even sort is performed for that block. That way
/// bitonic_block_sort() works for input of any size.
///
/// This is NOT stable.
///
/// \param keys_first first key element in the range to sort
/// \param values_first first value element in the range to sort; only used
///        when \p sort_by_key is true
/// \param compare comparison function for keys
/// \param count number of elements in the range; count > 0
/// \param sort_by_key when true values are reordered together with keys
/// \param queue command queue to perform the operation
///
/// \return size of the sorted blocks (the work-group size that was used)
template<class KeyIterator, class ValueIterator, class Compare>
inline size_t bitonic_block_sort(KeyIterator keys_first,
                                 ValueIterator values_first,
                                 Compare compare,
                                 const size_t count,
                                 const bool sort_by_key,
                                 command_queue &queue)
{
    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;

    meta_kernel k("bitonic_block_sort");
    size_t count_arg = k.add_arg<const uint_>("count");

    size_t local_keys_arg = k.add_arg<key_type *>(memory_object::local_memory, "lkeys");
    size_t local_vals_arg = 0;
    if(sort_by_key) {
        local_vals_arg = k.add_arg<uchar_ *>(memory_object::local_memory, "lidx");
    }

    k <<
        // Work item global and local ids
        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n";

    // declare my_key and my_value
    k <<
        k.decl<key_type>("my_key") << ";\n";
    // Instead of copying values (my_value) in local memory with keys
    // we save local index (uchar) and copy my_value at the end at
    // final index. This saves local memory.
    if(sort_by_key)
    {
        k <<
            k.decl<uchar_>("my_index") << " = (uchar)(lid);\n";
    }

    // load key
    k <<
        "if(gid < count) {\n" <<
            k.var<key_type>("my_key") << " = " <<
                keys_first[k.var<const uint_>("gid")] << ";\n" <<
        "}\n";

    // load key and index to local memory
    k <<
        "lkeys[lid] = my_key;\n";
    if(sort_by_key)
    {
        k <<
            "lidx[lid] = my_index;\n";
    }
    k <<
        k.decl<const uint_>("offset") << " = get_group_id(0) * get_local_size(0);\n" <<
        k.decl<const uint_>("n") << " = min((uint)(get_local_size(0)),(count - offset));\n";

    // When work group size is a power of 2 bitonic sorter can be used;
    // otherwise, slower odd-even sort is used.

    k <<
        // check if n is power of 2
        "if(((n != 0) && ((n & (~n + 1)) == n))) {\n";

    // bitonic sort, not stable
    k <<
        // wait for keys and vals to be stored in local memory
        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<

        "#pragma unroll\n" <<
        "for(" <<
            k.decl<uint_>("length") << " = 1; " <<
            "length < n; " <<
            "length <<= 1" <<
        ") {\n" <<
            // direction of sort: false -> asc, true -> desc
            k.decl<bool>("direction") << "= ((lid & (length<<1)) != 0);\n" <<
            "for(" <<
                k.decl<uint_>("k") << " = length; " <<
                "k > 0; " <<
                "k >>= 1" <<
            ") {\n" <<

            // sibling to compare with my key
            k.decl<uint_>("sibling_idx") << " = lid ^ k;\n" <<
            k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" <<
            k.decl<bool>("compare") << " = " <<
                compare(k.var<key_type>("sibling_key"),
                        k.var<key_type>("my_key")) << ";\n" <<
            k.decl<bool>("swap") <<
                " = compare ^ (sibling_idx < lid) ^ direction;\n" <<
            "my_key = swap ? sibling_key : my_key;\n";
    if(sort_by_key)
    {
        k <<
            "my_index = swap ? lidx[sibling_idx] : my_index;\n";
    }
    k <<
            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "lkeys[lid] = my_key;\n";
    if(sort_by_key)
    {
        k <<
            "lidx[lid] = my_index;\n";
    }
    k <<
            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "}\n" <<
        "}\n";

    // end of bitonic sort

    // odd-even sort, not stable
    k <<
        "}\n" <<
        "else { \n";

    k <<
        k.decl<bool>("lid_is_even") << " = (lid%2) == 0;\n" <<
        k.decl<uint_>("oddsibling_idx") << " = " <<
            "(lid_is_even) ? max(lid,(uint)(1)) - 1 : min(lid+1,n-1);\n" <<
        k.decl<uint_>("evensibling_idx") << " = " <<
            "(lid_is_even) ? min(lid+1,n-1) : max(lid,(uint)(1)) - 1;\n" <<

        // wait for keys and vals to be stored in local memory
        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<

        "#pragma unroll\n" <<
        "for(" <<
            k.decl<uint_>("i") << " = 0; " <<
            "i < n; " <<
            "i++" <<
        ") {\n" <<
            k.decl<uint_>("sibling_idx") <<
                " = i%2 == 0 ? evensibling_idx : oddsibling_idx;\n" <<
            k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" <<
            k.decl<bool>("compare") << " = " <<
                compare(k.var<key_type>("sibling_key"),
                        k.var<key_type>("my_key")) << ";\n" <<
            k.decl<bool>("swap") <<
                " = compare ^ (sibling_idx < lid);\n" <<
            "my_key = swap ? sibling_key : my_key;\n";
    if(sort_by_key)
    {
        k <<
            "my_index = swap ? lidx[sibling_idx] : my_index;\n";
    }
    k <<
            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "lkeys[lid] = my_key;\n";
    if(sort_by_key)
    {
        k <<
            "lidx[lid] = my_index;\n";
    }
    k <<
            "barrier(CLK_LOCAL_MEM_FENCE);\n"
        "}\n" << // for
    "}\n"; // else

    // end of odd-even sort

    // Values are permuted in place: the work item at position lid needs the
    // value stored at offset + my_index, a slot that another work item of
    // the same work-group is going to overwrite. Every work item therefore
    // first reads its value into a private variable, then the whole
    // work-group synchronizes, and only after that the values are written.
    // The barrier is emitted outside of the "gid < count" condition so that
    // all work items of the work-group reach it.
    if(sort_by_key)
    {
        k <<
            k.decl<value_type>("my_value") << ";\n" <<
            "if(gid < count) {\n" <<
                k.var<value_type>("my_value") << " = " <<
                    values_first[k.var<const uint_>("offset + my_index")] << ";\n" <<
            "}\n" <<
            "barrier(CLK_GLOBAL_MEM_FENCE);\n";
    }

    // save key and value
    k <<
        "if(gid < count) {\n" <<
        keys_first[k.var<const uint_>("gid")] << " = " <<
            k.var<key_type>("my_key") << ";\n";
    if(sort_by_key)
    {
        k << values_first[k.var<const uint_>("gid")] << " = " <<
                 k.var<value_type>("my_value") << ";\n";
    }
    k <<
        // end if
        "}\n";

    const context &context = queue.get_context();
    const device &device = queue.get_device();
    ::boost::compute::kernel kernel = k.compile(context);

    const size_t work_group_size =
        pick_bitonic_block_sort_block_size<key_type, uchar_>(
            kernel.get_work_group_info<size_t>(
                device, CL_KERNEL_WORK_GROUP_SIZE
            ),
            device.get_info<size_t>(CL_DEVICE_LOCAL_MEM_SIZE),
            sort_by_key
        );

    // count rounded up to a whole number of work-groups; computed with
    // integers because float(count) is inexact above 2^24
    const size_t groups_no =
        count / work_group_size + ((count % work_group_size != 0) ? 1 : 0);
    const size_t global_size = work_group_size * groups_no;

    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(local_keys_arg, local_buffer<key_type>(work_group_size));
    if(sort_by_key) {
        kernel.set_arg(local_vals_arg, local_buffer<uchar_>(work_group_size));
    }

    queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size);
    // return size of the block
    return work_group_size;
}
/// Sorts the input in blocks and returns the size of the sorted blocks.
///
/// When \p stable is false the work is done by bitonic_block_sort().
/// A stable block sort is not implemented yet: for \p stable equal to true
/// nothing is sorted and a block size of 1 is returned.
template<class KeyIterator, class ValueIterator, class Compare>
inline size_t block_sort(KeyIterator keys_first,
                         ValueIterator values_first,
                         Compare compare,
                         const size_t count,
                         const bool sort_by_key,
                         const bool stable,
                         command_queue &queue)
{
    if(!stable) {
        const size_t sorted_block_size = bitonic_block_sort(
            keys_first, values_first, compare, count, sort_by_key, queue
        );
        return sorted_block_size;
    }

    // TODO: Implement stable block sort (stable odd-even merge sort)
    return size_t(1);
}
/// Merges pairs of neighbouring sorted blocks of \p block_size elements.
///
/// One work item handles one input element: it searches the neighbouring
/// block for the position of its key and writes the key (and the value if
/// \p sort_by_key is true) at the resulting offset of the output range.
///
/// space: O(n + m); n - number of keys, m - number of values
///
/// \param keys_first first key element of the input
/// \param values_first first value element of the input; only used when
///        \p sort_by_key is true
/// \param out_keys_first first key element of the output
/// \param out_values_first first value element of the output; only used
///        when \p sort_by_key is true
/// \param compare comparison function for keys
/// \param count number of elements
/// \param block_size size of the sorted input blocks
/// \param sort_by_key when true values are moved together with keys
/// \param queue command queue to perform the operation
template<class KeyIterator, class ValueIterator, class Compare>
inline void merge_blocks_on_gpu(KeyIterator keys_first,
                                ValueIterator values_first,
                                KeyIterator out_keys_first,
                                ValueIterator out_values_first,
                                Compare compare,
                                const size_t count,
                                const size_t block_size,
                                const bool sort_by_key,
                                command_queue &queue)
{
    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;

    meta_kernel k("merge_blocks");
    size_t count_arg = k.add_arg<const uint_>("count");
    size_t block_size_arg = k.add_arg<const uint_>("block_size");

    k <<
        // get global id
        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
        "if(gid >= count) {\n" <<
            "return;\n" <<
        "}\n" <<

        k.decl<const key_type>("my_key") << " = " <<
            keys_first[k.var<const uint_>("gid")] << ";\n";

    if(sort_by_key) {
        k <<
            k.decl<const value_type>("my_value") << " = " <<
                values_first[k.var<const uint_>("gid")] << ";\n";
    }

    k <<
        // get my block idx
        k.decl<const uint_>("my_block_idx") << " = gid / block_size;\n" <<
        k.decl<const bool>("my_block_idx_is_odd") << " = " <<
            "my_block_idx & 0x1;\n" <<

        k.decl<const uint_>("other_block_idx") << " = " <<
            // if(my_block_idx is odd) {} else {}
            "my_block_idx_is_odd ? my_block_idx - 1 : my_block_idx + 1;\n" <<

        // get ranges of my block and the other block
        // [my_block_start; my_block_end)
        // [other_block_start; other_block_end)
        k.decl<const uint_>("my_block_start") << " = " <<
            "min(my_block_idx * block_size, count);\n" << // including
        k.decl<const uint_>("my_block_end") << " = " <<
            "min((my_block_idx + 1) * block_size, count);\n" << // excluding

        k.decl<const uint_>("other_block_start") << " = " <<
            "min(other_block_idx * block_size, count);\n" << // including
        k.decl<const uint_>("other_block_end") << " = " <<
            "min((other_block_idx + 1) * block_size, count);\n" << // excluding

        // other block is empty, nothing to merge here
        "if(other_block_start == count){\n" <<
            out_keys_first[k.var<uint_>("gid")] << " = my_key;\n";
    if(sort_by_key) {
        k <<
            out_values_first[k.var<uint_>("gid")] << " = my_value;\n";
    }

    k <<
            "return;\n" <<
        "}\n" <<

        // lower bound
        // left_idx - lower bound
        k.decl<uint_>("left_idx") << " = other_block_start;\n" <<
        k.decl<uint_>("right_idx") << " = other_block_end;\n" <<
        "while(left_idx < right_idx) {\n" <<
            k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" <<
            k.decl<key_type>("mid_key") << " = " <<
                    keys_first[k.var<const uint_>("mid_idx")] << ";\n" <<
            k.decl<bool>("smaller") << " = " <<
                compare(k.var<key_type>("mid_key"),
                        k.var<key_type>("my_key")) << ";\n" <<
            "left_idx = smaller ? mid_idx + 1 : left_idx;\n" <<
            "right_idx = smaller ? right_idx : mid_idx;\n" <<
        "}\n" <<
        // left_idx is found position in other block

        // if my_block is odd we need to get the upper bound
        "right_idx = other_block_end;\n" <<
        "if(my_block_idx_is_odd && left_idx != right_idx) {\n" <<
            k.decl<key_type>("upper_key") << " = " <<
                keys_first[k.var<const uint_>("left_idx")] << ";\n" <<
            "while(" <<
                "!(" << compare(k.var<key_type>("upper_key"),
                                k.var<key_type>("my_key")) <<
                ") && " <<
                "!(" << compare(k.var<key_type>("my_key"),
                                k.var<key_type>("upper_key")) <<
                ") && " <<
                     "left_idx < right_idx" <<
                ")" <<
            "{\n" <<
                k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" <<
                k.decl<key_type>("mid_key") << " = " <<
                    keys_first[k.var<const uint_>("mid_idx")] << ";\n" <<
                k.decl<bool>("equal") << " = " <<
                    "!(" << compare(k.var<key_type>("mid_key"),
                                    k.var<key_type>("my_key")) <<
                    ") && " <<
                    "!(" << compare(k.var<key_type>("my_key"),
                                    k.var<key_type>("mid_key")) <<
                    ");\n" <<
                "left_idx = equal ? mid_idx + 1 : left_idx + 1;\n" <<
                "right_idx = equal ? right_idx : mid_idx;\n" <<
                "upper_key = equal ? upper_key : " <<
                    keys_first[k.var<const uint_>("left_idx")] << ";\n" <<
            "}\n" <<
        "}\n" <<

        // position in the merged pair of blocks: own rank in my block plus
        // the number of elements of the other block that go before my key
        k.decl<uint_>("offset") << " = 0;\n" <<
        "offset += gid - my_block_start;\n" <<
        "offset += left_idx - other_block_start;\n" <<
        "offset += min(my_block_start, other_block_start);\n" <<
        out_keys_first[k.var<uint_>("offset")] << " = my_key;\n";
    if(sort_by_key) {
        k <<
            out_values_first[k.var<uint_>("offset")] << " = my_value;\n";
    }

    const context &context = queue.get_context();
    ::boost::compute::kernel kernel = k.compile(context);

    const size_t work_group_size = (std::min)(
        size_t(256),
        kernel.get_work_group_info<size_t>(
            queue.get_device(), CL_KERNEL_WORK_GROUP_SIZE
        )
    );

    // count rounded up to a whole number of work-groups; computed with
    // integers because float(count) is inexact above 2^24 and rounding
    // down would leave the last elements without a work item
    const size_t groups_no =
        count / work_group_size + ((count % work_group_size != 0) ? 1 : 0);
    const size_t global_size = work_group_size * groups_no;

    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
    queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size);
}
template<class KeyIterator, class ValueIterator, class Compare>
inline void merge_sort_by_key_on_gpu(KeyIterator keys_first,
KeyIterator keys_last,
ValueIterator values_first,
Compare compare,
bool stable,
command_queue &queue)
{
typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
typedef typename std::iterator_traits<ValueIterator>::value_type value_type;
size_t count = iterator_range_size(keys_first, keys_last);
if(count < 2){
return;
}
size_t block_size =
block_sort(
keys_first, values_first,
compare, count,
true /* sort_by_key */, stable /* stable */,
queue
);
// for small input size only block sort is performed
if(count <= block_size) {
return;
}
const context &context = queue.get_context();
bool result_in_temporary_buffer = false;
::boost::compute::vector<key_type> temp_keys(count, context);
::boost::compute::vector<value_type> temp_values(count, context);
for(; block_size < count; block_size *= 2) {
result_in_temporary_buffer = !result_in_temporary_buffer;
if(result_in_temporary_buffer) {
merge_blocks_on_gpu(keys_first, values_first,
temp_keys.begin(), temp_values.begin(),
compare, count, block_size,
true /* sort_by_key */, queue);
} else {
merge_blocks_on_gpu(temp_keys.begin(), temp_values.begin(),
keys_first, values_first,
compare, count, block_size,
true /* sort_by_key */, queue);
}
}
if(result_in_temporary_buffer) {
copy_async(temp_keys.begin(), temp_keys.end(), keys_first, queue);
copy_async(temp_values.begin(), temp_values.end(), values_first, queue);
}
}
template<class Iterator, class Compare>
inline void merge_sort_on_gpu(Iterator first,
Iterator last,
Compare compare,
bool stable,
command_queue &queue)
{
typedef typename std::iterator_traits<Iterator>::value_type key_type;
size_t count = iterator_range_size(first, last);
if(count < 2){
return;
}
Iterator dummy;
size_t block_size =
block_sort(
first, dummy,
compare, count,
false /* sort_by_key */, stable /* stable */,
queue
);
// for small input size only block sort is performed
if(count <= block_size) {
return;
}
const context &context = queue.get_context();
bool result_in_temporary_buffer = false;
::boost::compute::vector<key_type> temp_keys(count, context);
for(; block_size < count; block_size *= 2) {
result_in_temporary_buffer = !result_in_temporary_buffer;
if(result_in_temporary_buffer) {
merge_blocks_on_gpu(first, dummy, temp_keys.begin(), dummy,
compare, count, block_size,
false /* sort_by_key */, queue);
} else {
merge_blocks_on_gpu(temp_keys.begin(), dummy, first, dummy,
compare, count, block_size,
false /* sort_by_key */, queue);
}
}
if(result_in_temporary_buffer) {
copy_async(temp_keys.begin(), temp_keys.end(), first, queue);
}
}
/// Overload of merge_sort_by_key_on_gpu() without the \c stable flag;
/// it requests the non-stable variant.
template<class KeyIterator, class ValueIterator, class Compare>
inline void merge_sort_by_key_on_gpu(KeyIterator keys_first,
                                     KeyIterator keys_last,
                                     ValueIterator values_first,
                                     Compare compare,
                                     command_queue &queue)
{
    const bool stable = false; // not stable
    merge_sort_by_key_on_gpu(
        keys_first, keys_last, values_first, compare, stable, queue
    );
}
/// Overload of merge_sort_on_gpu() without the \c stable flag;
/// it requests the non-stable variant.
template<class Iterator, class Compare>
inline void merge_sort_on_gpu(Iterator first,
                              Iterator last,
                              Compare compare,
                              command_queue &queue)
{
    const bool stable = false; // not stable
    merge_sort_on_gpu(first, last, compare, stable, queue);
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif /* BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_ */
@@ -0,0 +1,203 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_WIH_MERGE_PATH_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_WIH_MERGE_PATH_HPP
#include <iterator>
#include <boost/compute/algorithm/detail/merge_path.hpp>
#include <boost/compute/algorithm/fill_n.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/system.hpp>
namespace boost {
namespace compute {
namespace detail {
///
/// \brief Serial merge kernel class
///
/// Subclass of meta_kernel to perform serial merge after tiling.
/// Every work-item merges one tile: the sub-ranges of both inputs that lie
/// between two consecutive tile boundaries.
///
class serial_merge_kernel : meta_kernel
{
public:
    /// Number of output elements per tile (distance between the output
    /// positions of two consecutive work-items).
    unsigned int tile_size;

    /// Creates the kernel with the default tile size of 4 and no work
    /// scheduled; exec() is a no-op until set_range() has been called.
    serial_merge_kernel() : meta_kernel("merge"), tile_size(4), m_count(0)
    {
    }

    /// Generates the kernel source.
    ///
    /// \param first1 start of the first input range
    /// \param first2 start of the second input range
    /// \param tile_first1 start of the tile boundaries in the first range
    /// \param tile_last1 end of the tile boundaries in the first range
    /// \param tile_first2 start of the tile boundaries in the second range
    /// \param result start of the output range
    /// \param comp comparison function
    template<class InputIterator1, class InputIterator2,
             class InputIterator3, class InputIterator4,
             class OutputIterator, class Compare>
    void set_range(InputIterator1 first1,
                   InputIterator2 first2,
                   InputIterator3 tile_first1,
                   InputIterator3 tile_last1,
                   InputIterator4 tile_first2,
                   OutputIterator result,
                   Compare comp)
    {
        // work-item i reads boundaries i and i+1, so there is one
        // work-item fewer than there are boundaries
        m_count = iterator_range_size(tile_first1, tile_last1) - 1;

        *this <<
        "uint i = get_global_id(0);\n" <<
        "uint start1 = " << tile_first1[expr<uint_>("i")] << ";\n" <<
        "uint end1 = " << tile_first1[expr<uint_>("i+1")] << ";\n" <<
        "uint start2 = " << tile_first2[expr<uint_>("i")] << ";\n" <<
        "uint end2 = " << tile_first2[expr<uint_>("i+1")] << ";\n" <<
        "uint index = i*" << tile_size << ";\n" <<
        "while(start1<end1 && start2<end2)\n" <<
        "{\n" <<
        "   if(!(" << comp(first2[expr<uint_>("start2")],
                           first1[expr<uint_>("start1")]) << "))\n" <<
        "   {\n" <<
                result[expr<uint_>("index")] <<
                    " = " << first1[expr<uint_>("start1")] << ";\n" <<
        "       index++;\n" <<
        "       start1++;\n" <<
        "   }\n" <<
        "   else\n" <<
        "   {\n" <<
                result[expr<uint_>("index")] <<
                    " = " << first2[expr<uint_>("start2")] << ";\n" <<
        "       index++;\n" <<
        "       start2++;\n" <<
        "   }\n" <<
        "}\n" <<
        "while(start1<end1)\n" <<
        "{\n" <<
            result[expr<uint_>("index")] <<
                " = " << first1[expr<uint_>("start1")] << ";\n" <<
        "   index++;\n" <<
        "   start1++;\n" <<
        "}\n" <<
        "while(start2<end2)\n" <<
        "{\n" <<
            result[expr<uint_>("index")] <<
                " = " << first2[expr<uint_>("start2")] << ";\n" <<
        "   index++;\n" <<
        "   start2++;\n" <<
        "}\n";
    }

    /// \overload
    ///
    /// Uses ::boost::compute::less for the value type of \p first1 as the
    /// comparison function.
    template<class InputIterator1, class InputIterator2,
             class InputIterator3, class InputIterator4,
             class OutputIterator>
    void set_range(InputIterator1 first1,
                   InputIterator2 first2,
                   InputIterator3 tile_first1,
                   InputIterator3 tile_last1,
                   InputIterator4 tile_first2,
                   OutputIterator result)
    {
        typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
        ::boost::compute::less<value_type> less_than;
        set_range(first1, first2, tile_first1, tile_last1, tile_first2, result, less_than);
    }

    /// Runs the kernel with one work-item per tile; returns a
    /// default-constructed event when there is nothing to merge.
    event exec(command_queue &queue)
    {
        if(m_count == 0) {
            return event();
        }

        return exec_1d(queue, 0, m_count);
    }

private:
    // number of work-items (tiles) to run; set by set_range()
    size_t m_count;
};
///
/// \brief Merge algorithm with merge path
///
/// Merges the sorted values in the range [\p first1, \p last1) with
/// the sorted values in the range [\p first2, last2) and stores the
/// result in the range beginning at \p result
///
/// \param first1 Iterator pointing to start of first set
/// \param last1 Iterator pointing to end of first set
/// \param first2 Iterator pointing to start of second set
/// \param last2 Iterator pointing to end of second set
/// \param result Iterator pointing to start of range in which the result
/// will be stored
/// \param comp Comparator which performs less than function
/// \param queue Queue on which to execute
///
template<class InputIterator1, class InputIterator2, class OutputIterator, class Compare>
inline OutputIterator
merge_with_merge_path(InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
InputIterator2 last2,
OutputIterator result,
Compare comp,
command_queue &queue = system::default_queue())
{
typedef typename
std::iterator_traits<OutputIterator>::difference_type result_difference_type;
size_t tile_size = 1024;
size_t count1 = iterator_range_size(first1, last1);
size_t count2 = iterator_range_size(first2, last2);
vector<uint_> tile_a((count1+count2+tile_size-1)/tile_size+1, queue.get_context());
vector<uint_> tile_b((count1+count2+tile_size-1)/tile_size+1, queue.get_context());
// Tile the sets
merge_path_kernel tiling_kernel;
tiling_kernel.tile_size = static_cast<unsigned int>(tile_size);
tiling_kernel.set_range(first1, last1, first2, last2,
tile_a.begin()+1, tile_b.begin()+1, comp);
fill_n(tile_a.begin(), 1, uint_(0), queue);
fill_n(tile_b.begin(), 1, uint_(0), queue);
tiling_kernel.exec(queue);
fill_n(tile_a.end()-1, 1, static_cast<uint_>(count1), queue);
fill_n(tile_b.end()-1, 1, static_cast<uint_>(count2), queue);
// Merge
serial_merge_kernel merge_kernel;
merge_kernel.tile_size = static_cast<unsigned int>(tile_size);
merge_kernel.set_range(first1, first2, tile_a.begin(), tile_a.end(),
tile_b.begin(), result, comp);
merge_kernel.exec(queue);
return result + static_cast<result_difference_type>(count1 + count2);
}
/// \overload
template<class InputIterator1, class InputIterator2, class OutputIterator>
inline OutputIterator
merge_with_merge_path(InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
InputIterator2 last2,
OutputIterator result,
command_queue &queue = system::default_queue())
{
typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
::boost::compute::less<value_type> less_than;
return merge_with_merge_path(first1, last1, first2, last2, result, less_than, queue);
}
} //end detail namespace
} //end compute namespace
} //end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_WIH_MERGE_PATH_HPP
@@ -0,0 +1,461 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_RADIX_SORT_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_RADIX_SORT_HPP
#include <iterator>
#include <boost/assert.hpp>
#include <boost/type_traits/is_signed.hpp>
#include <boost/type_traits/is_floating_point.hpp>
#include <boost/compute/kernel.hpp>
#include <boost/compute/program.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/exclusive_scan.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/parameter_cache.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/type_traits/is_fundamental.hpp>
#include <boost/compute/type_traits/is_vector_type.hpp>
#include <boost/compute/utility/program_cache.hpp>
namespace boost {
namespace compute {
namespace detail {
// meta-function returning true if type T is radix-sortable
//
// As written, the result is the conjunction (boost::mpl::and_) of two
// conditions: ::boost::compute::is_fundamental<T> holds, and
// is_vector_type<T> does not hold (boost::mpl::not_).
template<class T>
struct is_radix_sortable :
boost::mpl::and_<
typename ::boost::compute::is_fundamental<T>::type,
typename boost::mpl::not_<typename is_vector_type<T>::type>::type
>
{
};
// Maps a key size in bytes (N) to the unsigned integer type that
// radix_sort_impl() uses as its sort_type (it instantiates this with
// sizeof(T)).
//
// The primary template defines no nested `type`, so only the sizes that have
// an explicit specialization below (1, 2, 4 and 8) can be used.
template<size_t N>
struct radix_sort_value_type
{
};
// 1-byte keys are handled as uchar_
template<>
struct radix_sort_value_type<1>
{
typedef uchar_ type;
};
// 2-byte keys are handled as ushort_
template<>
struct radix_sort_value_type<2>
{
typedef ushort_ type;
};
// 4-byte keys are handled as uint_
template<>
struct radix_sort_value_type<4>
{
typedef uint_ type;
};
// 8-byte keys are handled as ulong_
template<>
struct radix_sort_value_type<8>
{
typedef ulong_ type;
};
// Returns the compiler-option fragment that defines the T2_double macro
// consumed by the radix sort program source ("#if T2_double").
//
// Generic case: the macro is defined as 0.
template<typename T>
inline const char* enable_double()
{
    const char *option = " -DT2_double=0";
    return option;
}

// Specialization for double: the macro is defined as 1.
template<>
inline const char* enable_double<double>()
{
    const char *option = " -DT2_double=1";
    return option;
}
// OpenCL C source of the radix sort program. It defines one helper,
// radix(), and three kernels: count, scan and scatter.
//
// The source relies on macros supplied through the compiler options
// (radix_sort_impl() passes them as -D flags):
//   K_BITS            - number of key bits handled per pass
//   T                 - type the keys are read as
//   BLOCK_SIZE        - size of the local_input array in scatter
//   IS_FLOATING_POINT - optional, selects the floating-point radix() branch
//   IS_SIGNED         - optional, selects the signed-integer radix() branch
//   ASC               - optional, selects the ascending variant of radix()
//   SORT_BY_KEY, T2   - optional, make scatter also move values of type T2
//   T2_double         - non-zero enables the cl_khr_fp64 extension pragma
const char radix_sort_source[] =
"#if T2_double\n"
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"#endif\n"
"#define K2_BITS (1 << K_BITS)\n"
"#define RADIX_MASK ((((T)(1)) << K_BITS) - 1)\n"
"#define SIGN_BIT ((sizeof(T) * CHAR_BIT) - 1)\n"
// radix() returns the K_BITS-wide digit of x that starts at bit low_bit,
// after a transformation of x that depends on the key kind and sort order
"#if defined(ASC)\n" // asc order
"inline uint radix(const T x, const uint low_bit)\n"
"{\n"
"#if defined(IS_FLOATING_POINT)\n"
" const T mask = -(x >> SIGN_BIT) | (((T)(1)) << SIGN_BIT);\n"
" return ((x ^ mask) >> low_bit) & RADIX_MASK;\n"
"#elif defined(IS_SIGNED)\n"
" return ((x ^ (((T)(1)) << SIGN_BIT)) >> low_bit) & RADIX_MASK;\n"
"#else\n"
" return (x >> low_bit) & RADIX_MASK;\n"
"#endif\n"
"}\n"
"#else\n" // desc order
// For signed types we just negate the x and for unsigned types we
// subtract the x from max value of its type ((T)(-1) is a max value
// of type T when T is an unsigned type).
"inline uint radix(const T x, const uint low_bit)\n"
"{\n"
"#if defined(IS_FLOATING_POINT)\n"
" const T mask = -(x >> SIGN_BIT) | (((T)(1)) << SIGN_BIT);\n"
" return (((-x) ^ mask) >> low_bit) & RADIX_MASK;\n"
"#elif defined(IS_SIGNED)\n"
" return (((-x) ^ (((T)(1)) << SIGN_BIT)) >> low_bit) & RADIX_MASK;\n"
"#else\n"
" return (((T)(-1) - x) >> low_bit) & RADIX_MASK;\n"
"#endif\n"
"}\n"
"#endif\n" // #if defined(ASC)
// count kernel: builds a per-work-group histogram of the current digit in
// local memory and writes it to global_counts; the last work-group also
// writes its histogram to global_offsets
"__kernel void count(__global const T *input,\n"
" const uint input_offset,\n"
" const uint input_size,\n"
" __global uint *global_counts,\n"
" __global uint *global_offsets,\n"
" __local uint *local_counts,\n"
" const uint low_bit)\n"
"{\n"
// work-item parameters
" const uint gid = get_global_id(0);\n"
" const uint lid = get_local_id(0);\n"
// zero local counts
" if(lid < K2_BITS){\n"
" local_counts[lid] = 0;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
// reduce local counts
" if(gid < input_size){\n"
" T value = input[input_offset+gid];\n"
" uint bucket = radix(value, low_bit);\n"
" atomic_inc(local_counts + bucket);\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
// write block-relative offsets
" if(lid < K2_BITS){\n"
" global_counts[K2_BITS*get_group_id(0) + lid] = local_counts[lid];\n"
// write global offsets
" if(get_group_id(0) == (get_num_groups(0) - 1)){\n"
" global_offsets[lid] = local_counts[lid];\n"
" }\n"
" }\n"
"}\n"
// scan kernel: a single sequential loop over the K2_BITS buckets that turns
// global_offsets into an exclusive running sum of
// (global_offsets[i] + last block's entry in block_offsets)
"__kernel void scan(__global const uint *block_offsets,\n"
" __global uint *global_offsets,\n"
" const uint block_count)\n"
"{\n"
" __global const uint *last_block_offsets =\n"
" block_offsets + K2_BITS * (block_count - 1);\n"
// calculate and scan global_offsets
" uint sum = 0;\n"
" for(uint i = 0; i < K2_BITS; i++){\n"
" uint x = global_offsets[i] + last_block_offsets[i];\n"
" global_offsets[i] = sum;\n"
" sum += x;\n"
" }\n"
"}\n"
// scatter kernel: writes every element to
// global_offsets[bucket] + counts[group][bucket] + (number of earlier
// work-items in the same work-group that fall into the same bucket)
"__kernel void scatter(__global const T *input,\n"
" const uint input_offset,\n"
" const uint input_size,\n"
" const uint low_bit,\n"
" __global const uint *counts,\n"
" __global const uint *global_offsets,\n"
"#ifndef SORT_BY_KEY\n"
" __global T *output,\n"
" const uint output_offset)\n"
"#else\n"
" __global T *keys_output,\n"
" const uint keys_output_offset,\n"
" __global T2 *values_input,\n"
" const uint values_input_offset,\n"
" __global T2 *values_output,\n"
" const uint values_output_offset)\n"
"#endif\n"
"{\n"
// work-item parameters
" const uint gid = get_global_id(0);\n"
" const uint lid = get_local_id(0);\n"
// copy input to local memory
" T value;\n"
" uint bucket;\n"
" __local uint local_input[BLOCK_SIZE];\n"
" if(gid < input_size){\n"
" value = input[input_offset+gid];\n"
" bucket = radix(value, low_bit);\n"
" local_input[lid] = bucket;\n"
" }\n"
// copy block counts to local memory
" __local uint local_counts[(1 << K_BITS)];\n"
" if(lid < K2_BITS){\n"
" local_counts[lid] = counts[get_group_id(0) * K2_BITS + lid];\n"
" }\n"
// wait until local memory is ready
" barrier(CLK_LOCAL_MEM_FENCE);\n"
// out-of-range work-items leave only after the barrier above
" if(gid >= input_size){\n"
" return;\n"
" }\n"
// get global offset
" uint offset = global_offsets[bucket] + local_counts[bucket];\n"
// calculate local offset
" uint local_offset = 0;\n"
" for(uint i = 0; i < lid; i++){\n"
" if(local_input[i] == bucket)\n"
" local_offset++;\n"
" }\n"
"#ifndef SORT_BY_KEY\n"
// write value to output
" output[output_offset + offset + local_offset] = value;\n"
"#else\n"
// write key and value if doing sort_by_key
" keys_output[keys_output_offset+offset + local_offset] = value;\n"
" values_output[values_output_offset+offset + local_offset] =\n"
" values_input[values_input_offset+gid];\n"
"#endif\n"
"}\n";
// Radix sort of the keys in [first, last), optionally moving a parallel
// range of values along with them.
//
// first, last   - key range
// values_first  - start of the values range; when its buffer is null the
//                 call is a keys-only sort, otherwise a sort by key
// ascending     - true adds -DASC to the program options, false selects the
//                 descending variant of radix()
// queue         - command queue the kernels are enqueued on
//
// Every pass processes k key bits with three steps (count, scan, scatter)
// and then swaps the input and output buffers, so the data ping-pongs
// between the caller's buffers and the temporaries allocated here.
// NOTE(review): the data ends up back in the caller's buffers only when the
// number of passes is even, which holds for the k values accepted below
// (1, 2, 4) with 1/2/4/8-byte keys and CHAR_BIT == 8 - confirm if other
// parameters are ever cached.
//
// An empty range returns before anything is enqueued.
template<class T, class T2>
inline void radix_sort_impl(const buffer_iterator<T> first,
                            const buffer_iterator<T> last,
                            const buffer_iterator<T2> values_first,
                            const bool ascending,
                            command_queue &queue)
{
    typedef T value_type;
    typedef typename radix_sort_value_type<sizeof(T)>::type sort_type;

    const device &device = queue.get_device();
    const context &context = queue.get_context();

    // if we have a valid values iterator then we are doing a
    // sort by key and have to set up the values buffer
    bool sort_by_key = (values_first.get_buffer().get() != 0);

    // load (or create) radix sort program
    std::string cache_key =
        std::string("__boost_radix_sort_") + type_name<value_type>();

    if(sort_by_key){
        cache_key += std::string("_with_") + type_name<T2>();
    }

    boost::shared_ptr<program_cache> cache =
        program_cache::get_global_cache(context);
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    // sort parameters
    const uint_ k = parameters->get(cache_key, "k", 4);
    const uint_ k2 = 1 << k;
    const uint_ block_size = parameters->get(cache_key, "tpb", 128);

    // sort program compiler options
    std::stringstream options;
    options << "-DK_BITS=" << k;
    options << " -DT=" << type_name<sort_type>();
    options << " -DBLOCK_SIZE=" << block_size;

    if(boost::is_floating_point<value_type>::value){
        options << " -DIS_FLOATING_POINT";
    }

    if(boost::is_signed<value_type>::value){
        options << " -DIS_SIGNED";
    }

    if(sort_by_key){
        options << " -DSORT_BY_KEY";
        options << " -DT2=" << type_name<T2>();
        options << enable_double<T2>();
    }

    if(ascending){
        options << " -DASC";
    }

    // load radix sort program
    program radix_sort_program = cache->get_or_build(
        cache_key, options.str(), radix_sort_source, context
    );

    kernel count_kernel(radix_sort_program, "count");
    kernel scan_kernel(radix_sort_program, "scan");
    kernel scatter_kernel(radix_sort_program, "scatter");

    size_t count = detail::iterator_range_size(first, last);

    // nothing to sort: return before enqueueing kernels with a zero-sized
    // global range (block_count * block_size would be 0 below)
    if(count == 0){
        return;
    }

    // number of blocks, rounded up so that a partial last block is covered
    uint_ block_count = static_cast<uint_>(count / block_size);
    if(block_count * block_size != count){
        block_count++;
    }

    // setup temporary buffers
    vector<value_type> output(count, context);
    vector<T2> values_output(sort_by_key ? count : 0, context);
    vector<uint_> offsets(k2, context);
    vector<uint_> counts(block_count * k2, context);

    // input/output are tracked through pointers so they can be swapped
    // after every pass
    const buffer *input_buffer = &first.get_buffer();
    uint_ input_offset = static_cast<uint_>(first.get_index());
    const buffer *output_buffer = &output.get_buffer();
    uint_ output_offset = 0;
    const buffer *values_input_buffer = &values_first.get_buffer();
    uint_ values_input_offset = static_cast<uint_>(values_first.get_index());
    const buffer *values_output_buffer = &values_output.get_buffer();
    uint_ values_output_offset = 0;

    // one pass per group of k key bits
    for(uint_ i = 0; i < sizeof(sort_type) * CHAR_BIT / k; i++){
        // write counts
        count_kernel.set_arg(0, *input_buffer);
        count_kernel.set_arg(1, input_offset);
        count_kernel.set_arg(2, static_cast<uint_>(count));
        count_kernel.set_arg(3, counts);
        count_kernel.set_arg(4, offsets);
        count_kernel.set_arg(5, block_size * sizeof(uint_), 0);
        count_kernel.set_arg(6, i * k);
        queue.enqueue_1d_range_kernel(count_kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);

        // scan counts
        // the counts buffer is reinterpreted as vectors of 2^k uints, so a
        // single exclusive_scan covers all buckets at once
        if(k == 1){
            typedef uint2_ counter_type;
            ::boost::compute::exclusive_scan(
                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
                make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 2),
                make_buffer_iterator<counter_type>(counts.get_buffer()),
                queue
            );
        }
        else if(k == 2){
            typedef uint4_ counter_type;
            ::boost::compute::exclusive_scan(
                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
                make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 4),
                make_buffer_iterator<counter_type>(counts.get_buffer()),
                queue
            );
        }
        else if(k == 4){
            typedef uint16_ counter_type;
            ::boost::compute::exclusive_scan(
                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
                make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 16),
                make_buffer_iterator<counter_type>(counts.get_buffer()),
                queue
            );
        }
        else {
            BOOST_ASSERT(false && "unknown k");
            break;
        }

        // scan global offsets
        scan_kernel.set_arg(0, counts);
        scan_kernel.set_arg(1, offsets);
        scan_kernel.set_arg(2, block_count);
        queue.enqueue_task(scan_kernel);

        // scatter values
        scatter_kernel.set_arg(0, *input_buffer);
        scatter_kernel.set_arg(1, input_offset);
        scatter_kernel.set_arg(2, static_cast<uint_>(count));
        scatter_kernel.set_arg(3, i * k);
        scatter_kernel.set_arg(4, counts);
        scatter_kernel.set_arg(5, offsets);
        scatter_kernel.set_arg(6, *output_buffer);
        scatter_kernel.set_arg(7, output_offset);
        if(sort_by_key){
            scatter_kernel.set_arg(8, *values_input_buffer);
            scatter_kernel.set_arg(9, values_input_offset);
            scatter_kernel.set_arg(10, *values_output_buffer);
            scatter_kernel.set_arg(11, values_output_offset);
        }
        queue.enqueue_1d_range_kernel(scatter_kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);

        // swap buffers
        std::swap(input_buffer, output_buffer);
        std::swap(values_input_buffer, values_output_buffer);
        std::swap(input_offset, output_offset);
        std::swap(values_input_offset, values_output_offset);
    }
}
template<class Iterator>
inline void radix_sort(Iterator first,
Iterator last,
command_queue &queue)
{
radix_sort_impl(first, last, buffer_iterator<int>(), true, queue);
}
// Radix sort by key in ascending order: sorts the keys in
// [keys_first, keys_last) and passes values_first on as the values range.
//
// Forwards to radix_sort_impl() with the ascending flag set.
template<class KeyIterator, class ValueIterator>
inline void radix_sort_by_key(KeyIterator keys_first,
                              KeyIterator keys_last,
                              ValueIterator values_first,
                              command_queue &queue)
{
    const bool ascending_order = true;

    radix_sort_impl(
        keys_first, keys_last, values_first, ascending_order, queue
    );
}
template<class Iterator>
inline void radix_sort(Iterator first,
Iterator last,
const bool ascending,
command_queue &queue)
{
radix_sort_impl(first, last, buffer_iterator<int>(), ascending, queue);
}
// Radix sort by key with a caller-selected order.
//
// Pure forwarding wrapper: all five arguments are handed to
// radix_sort_impl() unchanged. `ascending` selects ascending (true) or
// descending (false) order there.
template<class KeyIterator, class ValueIterator>
inline void radix_sort_by_key(KeyIterator keys_first,
KeyIterator keys_last,
ValueIterator values_first,
const bool ascending,
command_queue &queue)
{
radix_sort_impl(keys_first, keys_last, values_first, ascending, queue);
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_RADIX_SORT_HPP
@@ -0,0 +1,57 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_RANDOM_FILL_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_RANDOM_FILL_HPP
#include <iterator>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/random/default_random_engine.hpp>
#include <boost/compute/random/uniform_real_distribution.hpp>
namespace boost {
namespace compute {
namespace detail {
// Fills the range [first, last) using a caller-supplied generator.
//
// This overload adds no logic of its own: it calls the generator's
// fill(first, last, queue) member. The generator is taken by non-const
// reference.
template<class OutputIterator, class Generator>
inline void random_fill(OutputIterator first,
OutputIterator last,
Generator &g,
command_queue &queue)
{
g.fill(first, last, queue);
}
template<class OutputIterator>
inline void
random_fill(OutputIterator first,
OutputIterator last,
typename std::iterator_traits<OutputIterator>::value_type lo,
typename std::iterator_traits<OutputIterator>::value_type hi,
command_queue &queue)
{
typedef typename
std::iterator_traits<OutputIterator>::value_type value_type;
typedef typename
boost::compute::default_random_engine engine_type;
typedef typename
boost::compute::uniform_real_distribution<value_type> distribution_type;
engine_type engine(queue);
distribution_type generator(lo, hi);
generator.fill(first, last, engine, queue);
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_RANDOM_FILL_HPP
@@ -0,0 +1,119 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_HPP
#include <algorithm>
#include <iterator>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/algorithm/detail/serial_reduce_by_key.hpp>
#include <boost/compute/algorithm/detail/reduce_by_key_with_scan.hpp>
#include <boost/compute/type_traits.hpp>
namespace boost {
namespace compute {
namespace detail {
// GPU path of reduce_by_key: delegates to the scan-based implementation,
// detail::reduce_by_key_with_scan(), and returns the size_t it reports.
template<class InputKeyIterator, class InputValueIterator,
         class OutputKeyIterator, class OutputValueIterator,
         class BinaryFunction, class BinaryPredicate>
size_t reduce_by_key_on_gpu(InputKeyIterator keys_first,
                            InputKeyIterator keys_last,
                            InputValueIterator values_first,
                            OutputKeyIterator keys_result,
                            OutputValueIterator values_result,
                            BinaryFunction function,
                            BinaryPredicate predicate,
                            command_queue &queue)
{
    const size_t reduced_count = detail::reduce_by_key_with_scan(
        keys_first, keys_last, values_first,
        keys_result, values_result,
        function, predicate, queue
    );

    return reduced_count;
}
// Decides whether the GPU (scan-based) reduce_by_key path may be used.
//
// Returns true only when all of the following hold:
//  - more than 256 elements are to be processed,
//  - the queue's device is not a CPU device,
//  - reduce_by_key_with_scan_requirements_met() accepts the same arguments.
template<class InputKeyIterator, class InputValueIterator,
         class OutputKeyIterator, class OutputValueIterator>
bool reduce_by_key_on_gpu_requirements_met(InputKeyIterator keys_first,
                                           InputValueIterator values_first,
                                           OutputKeyIterator keys_result,
                                           OutputValueIterator values_result,
                                           const size_t count,
                                           command_queue &queue)
{
    const device &device = queue.get_device();
    // the checks short-circuit, so the scan-specific requirements are only
    // evaluated for large inputs on non-CPU devices
    return (count > 256)
               && !(device.type() & device::cpu)
               && reduce_by_key_with_scan_requirements_met(keys_first, values_first,
                                                           keys_result,values_result,
                                                           count, queue);
}
// Chooses the reduce_by_key implementation for the given input and returns
// the pair of iterators one past the last written key and value.
//
//  - fewer than two keys: keys and values are copied to the outputs as-is
//  - GPU requirements met: detail::reduce_by_key_on_gpu()
//  - otherwise: detail::serial_reduce_by_key()
template<class InputKeyIterator, class InputValueIterator,
         class OutputKeyIterator, class OutputValueIterator,
         class BinaryFunction, class BinaryPredicate>
inline std::pair<OutputKeyIterator, OutputValueIterator>
dispatch_reduce_by_key(InputKeyIterator keys_first,
                       InputKeyIterator keys_last,
                       InputValueIterator values_first,
                       OutputKeyIterator keys_result,
                       OutputValueIterator values_result,
                       BinaryFunction function,
                       BinaryPredicate predicate,
                       command_queue &queue)
{
    typedef typename
        std::iterator_traits<OutputKeyIterator>::difference_type key_offset_type;
    typedef typename
        std::iterator_traits<OutputValueIterator>::difference_type value_offset_type;
    typedef std::pair<OutputKeyIterator, OutputValueIterator> result_type;

    const size_t count = detail::iterator_range_size(keys_first, keys_last);

    // number of key/value pairs written to the output ranges
    size_t result_size = 0;

    if(count < 2){
        // zero or one element: there is nothing to reduce, just copy
        boost::compute::copy_n(keys_first, count, keys_result, queue);
        boost::compute::copy_n(values_first, count, values_result, queue);
        result_size = count;
    }
    else if(reduce_by_key_on_gpu_requirements_met(keys_first, values_first,
                                                  keys_result, values_result,
                                                  count, queue)){
        result_size =
            detail::reduce_by_key_on_gpu(keys_first, keys_last, values_first,
                                         keys_result, values_result, function,
                                         predicate, queue);
    }
    else {
        result_size =
            detail::serial_reduce_by_key(keys_first, keys_last, values_first,
                                         keys_result, values_result, function,
                                         predicate, queue);
    }

    return result_type(
        keys_result + static_cast<key_offset_type>(result_size),
        values_result + static_cast<value_offset_type>(result_size)
    );
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_HPP
@@ -0,0 +1,541 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_WITH_SCAN_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_WITH_SCAN_HPP
#include <algorithm>
#include <iterator>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/algorithm/inclusive_scan.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/container/detail/scalar.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/read_write_single_value.hpp>
#include <boost/compute/type_traits.hpp>
#include <boost/compute/utility/program_cache.hpp>
namespace boost {
namespace compute {
namespace detail {
/// \internal_
///
/// Fills \p new_keys_first with unsigned integer keys generated from the
/// vector of original keys \p keys_first. The new keys can be distinguished
/// by a simple equality predicate.
///
/// \param keys_first iterator pointing to the first key
/// \param number_of_keys number of keys
/// \param predicate binary predicate for key comparison
/// \param new_keys_first iterator pointing to the new keys vector
/// \param preferred_work_group_size preferred work group size
/// \param queue command queue to perform the operation
///
/// Binary function \p predicate must take two keys as arguments and
/// return true only if they are considered the same.
///
/// The first new key equals zero and the last equals number of unique keys
/// minus one.
///
/// The kernel first writes a flag per key (1 where the key differs from its
/// predecessor according to \p predicate, 0 otherwise, 0 for the first key);
/// an inclusive scan over the flags then produces the new keys.
///
/// No local memory usage.
template<class InputKeyIterator, class BinaryPredicate>
inline void generate_uint_keys(InputKeyIterator keys_first,
                               size_t number_of_keys,
                               BinaryPredicate predicate,
                               vector<uint_>::iterator new_keys_first,
                               size_t preferred_work_group_size,
                               command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputKeyIterator>::value_type key_type;

    detail::meta_kernel k("reduce_by_key_new_key_flags");
    k.add_set_arg<const uint_>("count", uint_(number_of_keys));

    k <<
        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
        k.decl<uint_>("value") << " = 0;\n" <<
        "if(gid >= count){\n return;\n}\n" <<
        "if(gid > 0){ \n" <<
        k.decl<key_type>("key") << " = " <<
                            keys_first[k.var<const uint_>("gid")] << ";\n" <<
        k.decl<key_type>("previous_key") << " = " <<
                            keys_first[k.var<const uint_>("gid - 1")] << ";\n" <<
        " value = " << predicate(k.var<key_type>("previous_key"),
                                 k.var<key_type>("key")) <<
                       " ? 0 : 1;\n" <<
        "}\n else {\n" <<
        " value = 0;\n" <<
        "}\n" <<
        new_keys_first[k.var<const uint_>("gid")] << " = value;\n";

    const context &context = queue.get_context();
    kernel kernel = k.compile(context);

    size_t work_group_size = preferred_work_group_size;
    // Round up with integer arithmetic. Going through float loses precision
    // for large counts (24-bit mantissa) and could yield too few work groups,
    // leaving the last keys unprocessed.
    size_t work_groups_no = number_of_keys / work_group_size
        + (number_of_keys % work_group_size != 0 ? 1 : 0);

    queue.enqueue_1d_range_kernel(kernel,
                                  0,
                                  work_groups_no * work_group_size,
                                  work_group_size);

    // turn the "new key starts here" flags into consecutive key indices
    inclusive_scan(new_keys_first, new_keys_first + number_of_keys,
                   new_keys_first, queue);
}
/// \internal_
/// Calculate carry-out for each work group.
/// Carry-out is a pair of the last key processed by a work group and sum of all
/// values under this key in this work group.
///
/// \param keys_first iterator pointing to the first (unsigned integer) key
/// \param values_first iterator pointing to the first value
/// \param count number of key-value pairs
/// \param carry_out_keys_first output: one key per work group
/// \param carry_out_values_first output: one reduced value per work group
/// \param function binary function used to combine values
/// \param work_group_size work group size used to launch the kernel; also
///        the number of elements of each of the two local buffers
/// \param queue command queue to perform the operation
template<class InputValueIterator, class OutputValueIterator, class BinaryFunction>
inline void carry_outs(vector<uint_>::iterator keys_first,
                       InputValueIterator values_first,
                       size_t count,
                       vector<uint_>::iterator carry_out_keys_first,
                       OutputValueIterator carry_out_values_first,
                       BinaryFunction function,
                       size_t work_group_size,
                       command_queue &queue)
{
    typedef typename
        std::iterator_traits<OutputValueIterator>::value_type value_out_type;

    detail::meta_kernel k("reduce_by_key_with_scan_carry_outs");
    k.add_set_arg<const uint_>("count", uint_(count));
    size_t local_keys_arg = k.add_arg<uint_ *>(memory_object::local_memory, "lkeys");
    size_t local_vals_arg = k.add_arg<value_out_type *>(memory_object::local_memory, "lvals");

    k <<
        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
        k.decl<const uint_>("wg_size") << " = get_local_size(0);\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
        k.decl<const uint_>("group_id") << " = get_group_id(0);\n" <<

        k.decl<uint_>("key") << ";\n" <<
        k.decl<value_out_type>("value") << ";\n" <<
        "if(gid < count){\n" <<
            k.var<uint_>("key") << " = " <<
                keys_first[k.var<const uint_>("gid")] << ";\n" <<
            k.var<value_out_type>("value") << " = " <<
                values_first[k.var<const uint_>("gid")] << ";\n" <<
            "lkeys[lid] = key;\n" <<
            "lvals[lid] = value;\n" <<
        "}\n" <<

        // Calculate carry out for each work group by performing Hillis/Steele scan
        // where only last element (key-value pair) is saved
        k.decl<value_out_type>("result") << " = value;\n" <<
        k.decl<uint_>("other_key") << ";\n" <<
        k.decl<value_out_type>("other_value") << ";\n" <<

        "for(" << k.decl<uint_>("offset") << " = 1; " <<
                  "offset < wg_size; offset *= 2){\n"
        " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        " if(lid >= offset){\n"
        " other_key = lkeys[lid - offset];\n" <<
        " if(other_key == key){\n" <<
        " other_value = lvals[lid - offset];\n" <<
        " result = " << function(k.var<value_out_type>("result"),
                                 k.var<value_out_type>("other_value")) << ";\n" <<
        " }\n" <<
        " }\n" <<
        " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        " lvals[lid] = result;\n" <<
        "}\n" <<

        // save carry out
        "if(lid == (wg_size - 1)){\n" <<
        carry_out_keys_first[k.var<const uint_>("group_id")] << " = key;\n" <<
        carry_out_values_first[k.var<const uint_>("group_id")] << " = result;\n" <<
        "}\n";

    // Round up with integer arithmetic. Going through float loses precision
    // for large counts (24-bit mantissa) and could yield too few work groups.
    size_t work_groups_no = count / work_group_size
        + (count % work_group_size != 0 ? 1 : 0);

    const context &context = queue.get_context();
    kernel kernel = k.compile(context);
    kernel.set_arg(local_keys_arg, local_buffer<uint_>(work_group_size));
    kernel.set_arg(local_vals_arg, local_buffer<value_out_type>(work_group_size));

    queue.enqueue_1d_range_kernel(kernel,
                                  0,
                                  work_groups_no * work_group_size,
                                  work_group_size);
}
/// \internal_
/// Calculate carry-in by performing inclusive scan by key on carry-outs vector.
///
/// \param carry_out_keys_first iterator pointing to the first carry-out key
/// \param carry_out_values_first iterator pointing to the first carry-out value
/// \param carry_in_values_first output: iterator pointing to the first carry-in
/// \param carry_out_size number of carry-outs
/// \param function binary function used to combine values
/// \param work_group_size size of the single work group that is launched
/// \param queue command queue to perform the operation
///
/// The kernel runs in one work group; every work item handles a contiguous
/// run of ceil(carry_out_size / work_group_size) carry-outs.
template<class OutputValueIterator, class BinaryFunction>
inline void carry_ins(vector<uint_>::iterator carry_out_keys_first,
                      OutputValueIterator carry_out_values_first,
                      OutputValueIterator carry_in_values_first,
                      size_t carry_out_size,
                      BinaryFunction function,
                      size_t work_group_size,
                      command_queue &queue)
{
    typedef typename
        std::iterator_traits<OutputValueIterator>::value_type value_out_type;

    // Round up with integer arithmetic. Going through float loses precision
    // for large sizes (24-bit mantissa) and could leave the last carry-outs
    // without a work item.
    uint_ values_per_work_item = static_cast<uint_>(
        carry_out_size / work_group_size
            + (carry_out_size % work_group_size != 0 ? 1 : 0)
    );

    detail::meta_kernel k("reduce_by_key_with_scan_carry_ins");
    k.add_set_arg<const uint_>("carry_out_size", uint_(carry_out_size));
    k.add_set_arg<const uint_>("values_per_work_item", values_per_work_item);
    size_t local_keys_arg = k.add_arg<uint_ *>(memory_object::local_memory, "lkeys");
    size_t local_vals_arg = k.add_arg<value_out_type *>(memory_object::local_memory, "lvals");

    k <<
        k.decl<uint_>("id") << " = get_global_id(0) * values_per_work_item;\n" <<
        k.decl<uint_>("idx") << " = id;\n" <<
        k.decl<const uint_>("wg_size") << " = get_local_size(0);\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
        k.decl<const uint_>("group_id") << " = get_group_id(0);\n" <<

        k.decl<uint_>("key") << ";\n" <<
        k.decl<value_out_type>("value") << ";\n" <<
        k.decl<uint_>("previous_key") << ";\n" <<
        k.decl<value_out_type>("result") << ";\n" <<

        "if(id < carry_out_size){\n" <<
            k.var<uint_>("previous_key") << " = " <<
                carry_out_keys_first[k.var<const uint_>("id")] << ";\n" <<
            k.var<value_out_type>("result") << " = " <<
                carry_out_values_first[k.var<const uint_>("id")] << ";\n" <<
            carry_in_values_first[k.var<const uint_>("id")] << " = result;\n" <<
        "}\n" <<

        k.decl<const uint_>("end") << " = (id + values_per_work_item) <= carry_out_size" <<
                                      " ? (values_per_work_item + id) : carry_out_size;\n" <<

        "for(idx = idx + 1; idx < end; idx += 1){\n" <<
        " key = " << carry_out_keys_first[k.var<const uint_>("idx")] << ";\n" <<
        " value = " << carry_out_values_first[k.var<const uint_>("idx")] << ";\n" <<
        " if(previous_key == key){\n" <<
        " result = " << function(k.var<value_out_type>("result"),
                                 k.var<value_out_type>("value")) << ";\n" <<
        " }\n else { \n" <<
        " result = value;\n"
        " }\n" <<
        " " << carry_in_values_first[k.var<const uint_>("idx")] << " = result;\n" <<
        " previous_key = key;\n"
        "}\n" <<

        // save the last key and result to local memory
        "lkeys[lid] = previous_key;\n" <<
        "lvals[lid] = result;\n" <<

        // Hillis/Steele scan
        "for(" << k.decl<uint_>("offset") << " = 1; " <<
                  "offset < wg_size; offset *= 2){\n"
        " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        " if(lid >= offset){\n"
        " key = lkeys[lid - offset];\n" <<
        " if(previous_key == key){\n" <<
        " value = lvals[lid - offset];\n" <<
        " result = " << function(k.var<value_out_type>("result"),
                                 k.var<value_out_type>("value")) << ";\n" <<
        " }\n" <<
        " }\n" <<
        " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        " lvals[lid] = result;\n" <<
        "}\n" <<
        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<

        "if(lid > 0){\n" <<
        // load key-value reduced by previous work item
        " previous_key = lkeys[lid - 1];\n" <<
        " result = lvals[lid - 1];\n" <<
        "}\n" <<

        // add key-value reduced by previous work item
        "for(idx = id; idx < id + values_per_work_item; idx += 1){\n" <<
        // make sure all carry-ins are saved in global memory
        " barrier( CLK_GLOBAL_MEM_FENCE );\n" <<
        " if(lid > 0 && idx < carry_out_size) {\n"
        " key = " << carry_out_keys_first[k.var<const uint_>("idx")] << ";\n" <<
        " value = " << carry_in_values_first[k.var<const uint_>("idx")] << ";\n" <<
        " if(previous_key == key){\n" <<
        " value = " << function(k.var<value_out_type>("result"),
                                k.var<value_out_type>("value")) << ";\n" <<
        " }\n" <<
        " " << carry_in_values_first[k.var<const uint_>("idx")] << " = value;\n" <<
        " }\n" <<
        "}\n";

    const context &context = queue.get_context();
    kernel kernel = k.compile(context);
    kernel.set_arg(local_keys_arg, local_buffer<uint_>(work_group_size));
    kernel.set_arg(local_vals_arg, local_buffer<value_out_type>(work_group_size));

    // a single work group: global size equals local size
    queue.enqueue_1d_range_kernel(kernel,
                                  0,
                                  work_group_size,
                                  work_group_size);
}
/// \internal_
///
/// Perform final reduction by key. Each work item:
/// 1. Perform local work-group reduction (Hillis/Steele scan)
/// 2. Add carry-in (if keys are right)
/// 3. Save reduced value if next key is different than processed one
///
/// \param keys_first iterator pointing to the first original key
/// \param values_first iterator pointing to the first value
/// \param keys_result output: iterator pointing to the first reduced key
/// \param values_result output: iterator pointing to the first reduced value
/// \param count number of key-value pairs
/// \param function binary function used to combine values
/// \param new_keys_first iterator pointing to the first generated uint key;
///        the generated key is also the index written to in the outputs
/// \param carry_in_keys_first iterator pointing to the first carry-in key
/// \param carry_in_values_first iterator pointing to the first carry-in value
/// \param carry_in_size number of carry-ins (not used by this function)
/// \param work_group_size work group size used to launch the kernel
/// \param queue command queue to perform the operation
template<class InputKeyIterator, class InputValueIterator,
         class OutputKeyIterator, class OutputValueIterator,
         class BinaryFunction>
inline void final_reduction(InputKeyIterator keys_first,
                            InputValueIterator values_first,
                            OutputKeyIterator keys_result,
                            OutputValueIterator values_result,
                            size_t count,
                            BinaryFunction function,
                            vector<uint_>::iterator new_keys_first,
                            vector<uint_>::iterator carry_in_keys_first,
                            OutputValueIterator carry_in_values_first,
                            size_t carry_in_size,
                            size_t work_group_size,
                            command_queue &queue)
{
    typedef typename
        std::iterator_traits<OutputValueIterator>::value_type value_out_type;

    detail::meta_kernel k("reduce_by_key_with_scan_final_reduction");
    k.add_set_arg<const uint_>("count", uint_(count));
    size_t local_keys_arg = k.add_arg<uint_ *>(memory_object::local_memory, "lkeys");
    size_t local_vals_arg = k.add_arg<value_out_type *>(memory_object::local_memory, "lvals");

    k <<
        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
        k.decl<const uint_>("wg_size") << " = get_local_size(0);\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
        k.decl<const uint_>("group_id") << " = get_group_id(0);\n" <<

        k.decl<uint_>("key") << ";\n" <<
        k.decl<value_out_type>("value") << ";\n"

        "if(gid < count){\n" <<
            k.var<uint_>("key") << " = " <<
                new_keys_first[k.var<const uint_>("gid")] << ";\n" <<
            k.var<value_out_type>("value") << " = " <<
                values_first[k.var<const uint_>("gid")] << ";\n" <<
            "lkeys[lid] = key;\n" <<
            "lvals[lid] = value;\n" <<
        "}\n" <<

        // Hillis/Steele scan
        k.decl<value_out_type>("result") << " = value;\n" <<
        k.decl<uint_>("other_key") << ";\n" <<
        k.decl<value_out_type>("other_value") << ";\n" <<

        "for(" << k.decl<uint_>("offset") << " = 1; " <<
                  "offset < wg_size ; offset *= 2){\n"
        " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        " if(lid >= offset) {\n" <<
        " other_key = lkeys[lid - offset];\n" <<
        " if(other_key == key){\n" <<
        " other_value = lvals[lid - offset];\n" <<
        " result = " << function(k.var<value_out_type>("result"),
                                 k.var<value_out_type>("other_value")) << ";\n" <<
        " }\n" <<
        " }\n" <<
        " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        " lvals[lid] = result;\n" <<
        "}\n" <<

        // out-of-range work items leave only after the scan and its barriers
        "if(gid >= count) {\n return;\n};\n" <<

        k.decl<const bool>("save") << " = (gid < (count - 1)) ?"
            << new_keys_first[k.var<const uint_>("gid + 1")] << " != key" <<
            ": true;\n" <<

        // Add carry in
        k.decl<uint_>("carry_in_key") << ";\n" <<
        "if(group_id > 0 && save) {\n" <<
        " carry_in_key = " << carry_in_keys_first[k.var<const uint_>("group_id - 1")] << ";\n" <<
        " if(key == carry_in_key){\n" <<
        " other_value = " << carry_in_values_first[k.var<const uint_>("group_id - 1")] << ";\n" <<
        " result = " << function(k.var<value_out_type>("result"),
                                 k.var<value_out_type>("other_value")) << ";\n" <<
        " }\n" <<
        "}\n" <<

        // Save result only if the next key is different or it's the last element.
        "if(save){\n" <<
        keys_result[k.var<uint_>("key")] << " = " << keys_first[k.var<const uint_>("gid")] << ";\n" <<
        values_result[k.var<uint_>("key")] << " = result;\n" <<
        "}\n"
        ;

    // Round up with integer arithmetic. Going through float loses precision
    // for large counts (24-bit mantissa) and could yield too few work groups,
    // leaving the last elements unreduced.
    size_t work_groups_no = count / work_group_size
        + (count % work_group_size != 0 ? 1 : 0);

    const context &context = queue.get_context();
    kernel kernel = k.compile(context);
    kernel.set_arg(local_keys_arg, local_buffer<uint_>(work_group_size));
    kernel.set_arg(local_vals_arg, local_buffer<value_out_type>(work_group_size));

    queue.enqueue_1d_range_kernel(kernel,
                                  0,
                                  work_groups_no * work_group_size,
                                  work_group_size);
}
/// \internal_
/// Returns preferred work group size for reduce by key with scan algorithm.
///
/// The value is the cached "wgsize" parameter for the given key/value types
/// (256 when nothing is cached), clamped to the device's
/// CL_DEVICE_MAX_WORK_GROUP_SIZE so that the result is never larger than what
/// the device reports as its maximum.
template<class KeyType, class ValueType>
inline size_t get_work_group_size(const device& device)
{
    std::string cache_key = std::string("__boost_reduce_by_key_with_scan")
        + "k_" + type_name<KeyType>() + "_v_" + type_name<ValueType>();

    // load parameters
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    // (std::min), not (std::max): taking the maximum would ignore the tuned
    // value on most devices and exceed the device limit whenever the tuned
    // value is the larger of the two
    return (std::min)(
        static_cast<size_t>(parameters->get(cache_key, "wgsize", 256)),
        static_cast<size_t>(device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>())
    );
}
/// \internal_
///
/// Reduce by key implemented with scans:
///
/// 1. For each work group carry-out value is calculated (it's done by key-oriented
///    Hillis/Steele scan). Carry-out is a pair of the last key processed by work
///    group and sum of all values under this key in work group.
/// 2. From every carry-out carry-in is calculated by performing inclusive scan
///    by key.
/// 3. Final reduction by key is performed (key-oriented Hillis/Steele scan),
///    carry-in values are added where needed.
///
/// Returns the value of the last generated unsigned key plus one (the
/// generated keys are used as indices into keys_result / values_result), or
/// 0 for an empty input range.
template<class InputKeyIterator, class InputValueIterator,
         class OutputKeyIterator, class OutputValueIterator,
         class BinaryFunction, class BinaryPredicate>
inline size_t reduce_by_key_with_scan(InputKeyIterator keys_first,
                                      InputKeyIterator keys_last,
                                      InputValueIterator values_first,
                                      OutputKeyIterator keys_result,
                                      OutputValueIterator values_result,
                                      BinaryFunction function,
                                      BinaryPredicate predicate,
                                      command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputValueIterator>::value_type value_type;
    typedef typename
        std::iterator_traits<InputKeyIterator>::value_type key_type;
    typedef typename
        std::iterator_traits<OutputValueIterator>::value_type value_out_type;

    const context &context = queue.get_context();
    size_t count = detail::iterator_range_size(keys_first, keys_last);

    if(count == 0){
        return size_t(0);
    }

    const device &device = queue.get_device();
    // Template arguments are <key, value>, the same order that
    // reduce_by_key_with_scan_requirements_met() uses, so both functions look
    // up the same parameter cache entry and agree on the work-group size.
    size_t work_group_size = get_work_group_size<key_type, value_type>(device);

    // Replace original key with unsigned integer keys generated based on given
    // predicate. New key is also an index for keys_result and values_result vectors,
    // which points to place where reduced value should be saved.
    vector<uint_> new_keys(count, context);
    vector<uint_>::iterator new_keys_first = new_keys.begin();
    generate_uint_keys(keys_first, count, predicate, new_keys_first,
                       work_group_size, queue);

    // Calculate carry-out and carry-in vectors size
    // NOTE(review): float(count) is not exact for very large counts; the
    // formula is left unchanged here because it presumably has to match the
    // work-group count computed by the kernel launch helpers - confirm
    // before switching to integer arithmetic.
    const size_t carry_out_size = static_cast<size_t>(
        std::ceil(float(count) / work_group_size)
    );
    vector<uint_> carry_out_keys(carry_out_size, context);
    vector<value_out_type> carry_out_values(carry_out_size, context);
    carry_outs(new_keys_first, values_first, count, carry_out_keys.begin(),
               carry_out_values.begin(), function, work_group_size, queue);

    vector<value_out_type> carry_in_values(carry_out_size, context);
    carry_ins(carry_out_keys.begin(), carry_out_values.begin(),
              carry_in_values.begin(), carry_out_size, function, work_group_size,
              queue);

    final_reduction(keys_first, values_first, keys_result, values_result,
                    count, function, new_keys_first, carry_out_keys.begin(),
                    carry_in_values.begin(), carry_out_size, work_group_size,
                    queue);

    // the key generated for the last input element is the index of the last
    // output slot
    const size_t result = read_single_value<uint_>(new_keys.get_buffer(),
                                                   count - 1, queue);
    return result + 1;
}
/// \internal_
/// Return true if requirements for running reduce by key with scan on given
/// device are met (at least one work group of preferred size can be run).
template<class InputKeyIterator, class InputValueIterator,
class OutputKeyIterator, class OutputValueIterator>
bool reduce_by_key_with_scan_requirements_met(InputKeyIterator keys_first,
InputValueIterator values_first,
OutputKeyIterator keys_result,
OutputValueIterator values_result,
const size_t count,
command_queue &queue)
{
typedef typename
std::iterator_traits<InputValueIterator>::value_type value_type;
typedef typename
std::iterator_traits<InputKeyIterator>::value_type key_type;
typedef typename
std::iterator_traits<OutputValueIterator>::value_type value_out_type;
(void) keys_first;
(void) values_first;
(void) keys_result;
(void) values_result;
const device &device = queue.get_device();
// device must have dedicated local memory storage
if(device.get_info<CL_DEVICE_LOCAL_MEM_TYPE>() != CL_LOCAL)
{
return false;
}
// local memory size in bytes (per compute unit)
const size_t local_mem_size = device.get_info<CL_DEVICE_LOCAL_MEM_SIZE>();
// preferred work group size
size_t work_group_size = get_work_group_size<key_type, value_type>(device);
// local memory size needed to perform parallel reduction
size_t required_local_mem_size = 0;
// keys size
required_local_mem_size += sizeof(uint_) * work_group_size;
// reduced values size
required_local_mem_size += sizeof(value_out_type) * work_group_size;
return (required_local_mem_size <= local_mem_size);
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_WITH_SCAN_HPP
@@ -0,0 +1,110 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_CPU_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_CPU_HPP
#include <algorithm>
#include <boost/compute/buffer.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/parameter_cache.hpp>
#include <boost/compute/iterator/buffer_iterator.hpp>
#include <boost/compute/type_traits/result_of.hpp>
#include <boost/compute/algorithm/detail/serial_reduce.hpp>
namespace boost {
namespace compute {
namespace detail {
/// \internal_
/// Reduces the range [\p first, \p last) with \p function and stores the
/// single resulting value at \p result. Does nothing for an empty range.
///
/// Inputs shorter than the "serial_reduce_threshold" parameter (never less
/// than the number of compute units) are passed to serial_reduce().
/// Otherwise the input is split into contiguous blocks, one work-item
/// reduces each block into a partial result, and the partial results are
/// combined with serial_reduce().
template<class InputIterator, class OutputIterator, class BinaryFunction>
inline void reduce_on_cpu(InputIterator first,
                          InputIterator last,
                          OutputIterator result,
                          BinaryFunction function,
                          command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type T;
    typedef typename
        ::boost::compute::result_of<BinaryFunction(T, T)>::type result_type;

    const device &device = queue.get_device();
    const uint_ compute_units = device.compute_units();

    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    std::string cache_key =
        "__boost_reduce_cpu_" + boost::lexical_cast<std::string>(sizeof(T));

    // for inputs smaller than serial_reduce_threshold
    // serial_reduce algorithm is used
    uint_ serial_reduce_threshold =
        parameters->get(cache_key, "serial_reduce_threshold", 16384 * sizeof(T));
    serial_reduce_threshold =
        (std::max)(serial_reduce_threshold, uint_(compute_units));

    const context &context = queue.get_context();
    size_t count = detail::iterator_range_size(first, last);
    if(count == 0){
        return;
    }
    else if(count < serial_reduce_threshold) {
        return serial_reduce(first, last, result, function, queue);
    }

    // Partition the input on the host with integer arithmetic. Launching
    // ceil(count / block_size) work-items instead of always compute_units
    // guarantees every work-item owns at least one element, so none of them
    // reads past the end of the input. The kernel derives the same block
    // size from get_global_size(0).
    const size_t block_size =
        count / compute_units + (count % compute_units != 0 ? 1 : 0);
    const size_t global_work_size =
        count / block_size + (count % block_size != 0 ? 1 : 0);

    meta_kernel k("reduce_on_cpu");
    buffer output(context, sizeof(result_type) * global_work_size);

    size_t count_arg = k.add_arg<uint_>("count");
    size_t output_arg =
        k.add_arg<result_type *>(memory_object::global_memory, "output");

    k <<
        // integer ceil(count / work_items); a float division would lose
        // precision for counts that are not representable as float
        "const uint work_items = (uint)get_global_size(0);\n" <<
        "uint block = count / work_items + ((count % work_items) != 0 ? 1 : 0);\n" <<
        "uint index = get_global_id(0) * block;\n" <<
        "uint end = min(count, index + block);\n" <<

        // the first element of the block seeds the partial result
        k.decl<result_type>("result") << " = " << first[k.var<uint_>("index")] << ";\n" <<
        "index++;\n" <<
        "while(index < end){\n" <<
             "result = " << function(k.var<T>("result"),
                                     first[k.var<uint_>("index")]) << ";\n" <<
             "index++;\n" <<
        "}\n" <<
        "output[get_global_id(0)] = result;\n";

    kernel kernel = k.compile(context);

    // reduction to global_work_size elements
    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(output_arg, output);
    queue.enqueue_1d_range_kernel(kernel, 0, global_work_size, 0);

    // final reduction of the partial results. serial_reduce() is called
    // directly: recursing into reduce_on_cpu() would never terminate when the
    // threshold equals the number of compute units, because the partial
    // result count would then never drop below the threshold.
    serial_reduce(
        make_buffer_iterator<result_type>(output),
        make_buffer_iterator<result_type>(output, global_work_size),
        result,
        function,
        queue
    );
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_CPU_HPP
@@ -0,0 +1,286 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
#include <iterator>
#include <boost/compute/utility/source.hpp>
#include <boost/compute/program.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/detail/vendor.hpp>
#include <boost/compute/detail/parameter_cache.hpp>
#include <boost/compute/detail/work_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/utility/program_cache.hpp>
namespace boost {
namespace compute {
namespace detail {
/// \internal
/// Produces the OpenCL source of the generic local-memory reduction step:
/// a loop that doubles the stride each round and adds scratch[lid+i] into
/// scratch[lid] for the work-items selected by the stride mask.
template<typename T,bool isNvidiaDevice>
struct ReduceBody
{
    static std::string body()
    {
        // local reduction
        std::string source;
        source += "for(int i = 1; i < TPB; i <<= 1){\n";
        source += " barrier(CLK_LOCAL_MEM_FENCE);\n";
        source += " uint mask = (i << 1) - 1;\n";
        source += " if((lid & mask) == 0){\n";
        source += " scratch[lid] += scratch[lid+i];\n";
        source += " }\n";
        source += "}\n";
        return source;
    }
};
/// \internal
/// Reduction body selected for nvidia devices. The emitted source unrolls
/// the local reduction with compile-time tests on TPB and finishes the last
/// 32 work-items through a volatile local pointer without barriers (the
/// "unsafe" memory optimisation).
template<typename T>
struct ReduceBody<T,true>
{
    static std::string body()
    {
        std::string source;

        // TPB is a compile-time define, so only the stages that apply to
        // the chosen work-group size end up as real instructions.
        // Stages for sizes above the warp size, each followed by a barrier:
        source += "barrier(CLK_LOCAL_MEM_FENCE);\n";
        source += "if(TPB >= 1024){\n";
        source += "if(lid < 512) { sum += scratch[lid + 512]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n";
        source += "if(TPB >= 512){\n";
        source += "if(lid < 256) { sum += scratch[lid + 256]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n";
        source += "if(TPB >= 256){\n";
        source += "if(lid < 128) { sum += scratch[lid + 128]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n";
        source += "if(TPB >= 128){\n";
        source += "if(lid < 64) { sum += scratch[lid + 64]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);} \n";

        // warp reduction
        source += "if(lid < 32){\n";
        // scratch is accessed through a volatile pointer here; the stages
        // below are emitted without barriers
        source += "volatile __local ";
        source += type_name<T>();
        source += " *lmem = scratch;\n";
        source += "if(TPB >= 64) { lmem[lid] = sum = sum + lmem[lid+32];} \n";
        source += "if(TPB >= 32) { lmem[lid] = sum = sum + lmem[lid+16];} \n";
        source += "if(TPB >= 16) { lmem[lid] = sum = sum + lmem[lid+ 8];} \n";
        source += "if(TPB >= 8) { lmem[lid] = sum = sum + lmem[lid+ 4];} \n";
        source += "if(TPB >= 4) { lmem[lid] = sum = sum + lmem[lid+ 2];} \n";
        source += "if(TPB >= 2) { lmem[lid] = sum = sum + lmem[lid+ 1];} \n";
        source += "}\n";

        return source;
    }
};
template<class InputIterator, class Function>
inline void initial_reduce(InputIterator first,
InputIterator last,
buffer result,
const Function &function,
kernel &reduce_kernel,
const uint_ vpt,
const uint_ tpb,
command_queue &queue)
{
(void) function;
(void) reduce_kernel;
typedef typename std::iterator_traits<InputIterator>::value_type Arg;
typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T;
size_t count = std::distance(first, last);
detail::meta_kernel k("initial_reduce");
k.add_set_arg<const uint_>("count", uint_(count));
size_t output_arg = k.add_arg<T *>(memory_object::global_memory, "output");
k <<
k.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n" <<
k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
"__local " << type_name<T>() << " scratch[TPB];\n" <<
// private reduction
k.decl<T>("sum") << " = 0;\n" <<
"for(uint i = 0; i < VPT; i++){\n" <<
" if(offset + lid + i*TPB < count){\n" <<
" sum = sum + " << first[k.var<uint_>("offset+lid+i*TPB")] << ";\n" <<
" }\n" <<
"}\n" <<
"scratch[lid] = sum;\n" <<
// local reduction
ReduceBody<T,false>::body() <<
// write sum to output
"if(lid == 0){\n" <<
" output[get_group_id(0)] = scratch[0];\n" <<
"}\n";
const context &context = queue.get_context();
std::stringstream options;
options << "-DVPT=" << vpt << " -DTPB=" << tpb;
kernel generic_reduce_kernel = k.compile(context, options.str());
generic_reduce_kernel.set_arg(output_arg, result);
size_t work_size = calculate_work_size(count, vpt, tpb);
queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, work_size, tpb);
}
/// \internal
/// First reduction pass for plus<T> over a buffer range: reuses the
/// already built \p reduce_kernel, reading from the buffer behind \p first
/// and writing the per-work-group partial sums to the start of \p result.
template<class T>
inline void initial_reduce(const buffer_iterator<T> &first,
                           const buffer_iterator<T> &last,
                           const buffer &result,
                           const plus<T> &function,
                           kernel &reduce_kernel,
                           const uint_ vpt,
                           const uint_ tpb,
                           command_queue &queue)
{
    (void) function;

    const size_t input_size = std::distance(first, last);
    const size_t global_size = calculate_work_size(input_size, vpt, tpb);

    // kernel arguments: input, offset, count, output, output_offset
    reduce_kernel.set_arg(0, first.get_buffer());
    reduce_kernel.set_arg(1, uint_(first.get_index()));
    reduce_kernel.set_arg(2, uint_(input_size));
    reduce_kernel.set_arg(3, result);
    reduce_kernel.set_arg(4, uint_(0));

    queue.enqueue_1d_range_kernel(reduce_kernel, 0, global_size, tpb);
}
/// \internal
/// Sums the range [\p first, \p last) and writes the value to \p result.
///
/// A "reduce" program is built (or fetched from the program cache) with
/// VPT (values per work-item) and TPB (work-items per work group) taken
/// from the parameter cache. The input is reduced into a "ping" buffer by
/// initial_reduce(), then repeatedly between the ping and pong buffers
/// while more than VPT * TPB partial results remain, and finally by a
/// single work group into \p result.
template<class InputIterator, class T, class Function>
inline void reduce_on_gpu(InputIterator first,
                          InputIterator last,
                          buffer_iterator<T> result,
                          Function function,
                          command_queue &queue)
{
    const device &device = queue.get_device();
    const context &context = queue.get_context();

    detail::meta_kernel k("reduce");
    // the argument indices 0..4 used with set_arg() below follow this order
    k.add_arg<const T*>(memory_object::global_memory, "input");
    k.add_arg<const uint_>("offset");
    k.add_arg<const uint_>("count");
    k.add_arg<T*>(memory_object::global_memory, "output");
    k.add_arg<const uint_>("output_offset");

    k <<
        k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        "__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<

        "__local " << type_name<T>() << " scratch[TPB];\n" <<
        // private reduction
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        " if(block_offset + lid + i*TPB < count){\n" <<
        " sum = sum + block[lid+i*TPB]; \n" <<
        " }\n" <<
        "}\n" <<
        "scratch[lid] = sum;\n";

    // discrimination on vendor name
    if(is_nvidia_device(device))
        k << ReduceBody<T,true>::body();
    else
        k << ReduceBody<T,false>::body();

    k <<
        // write sum to output
         "if(lid == 0){\n" <<
         " output[output_offset + get_group_id(0)] = scratch[0];\n" <<
         "}\n";

    std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>();

    // load parameters
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    uint_ vpt = parameters->get(cache_key, "vpt", 8);
    uint_ tpb = parameters->get(cache_key, "tpb", 128);

    // reduce program compiler flags
    std::stringstream options;
    options << "-DT=" << type_name<T>()
            << " -DVPT=" << vpt
            << " -DTPB=" << tpb;

    // load program
    boost::shared_ptr<program_cache> cache =
        program_cache::get_global_cache(context);

    program reduce_program = cache->get_or_build(
        cache_key, options.str(), k.source(), context
    );

    // create reduce kernel
    kernel reduce_kernel(reduce_program, "reduce");

    size_t count = std::distance(first, last);

    // first pass, reduce from input to ping
    // NOTE(review): the float-based rounding is kept from the original so
    // the partial result count stays in step with calculate_work_size(),
    // which is defined elsewhere - confirm before changing it.
    const size_t first_pass_groups =
        static_cast<size_t>(std::ceil(float(count) / vpt / tpb));
    buffer ping(context, first_pass_groups * sizeof(T));
    initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue);

    // update count after initial reduce
    count = first_pass_groups;

    // middle pass(es), reduce between ping and pong
    const buffer *input_buffer = &ping;

    // The first middle pass launches ceil(count / (vpt * tpb)) work groups
    // and each of them writes one element, so pong has to be sized with a
    // rounded-up division. A truncating division leaves it one element short
    // whenever count is not a multiple of vpt * tpb.
    const size_t values_per_group = size_t(vpt) * tpb;
    const size_t pong_size =
        count / values_per_group + (count % values_per_group != 0 ? 1 : 0);
    buffer pong(context, pong_size * sizeof(T));
    const buffer *output_buffer = &pong;

    while(count > vpt * tpb){
        reduce_kernel.set_arg(0, *input_buffer);
        reduce_kernel.set_arg(1, uint_(0));
        reduce_kernel.set_arg(2, uint_(count));
        reduce_kernel.set_arg(3, *output_buffer);
        reduce_kernel.set_arg(4, uint_(0));

        // one work-item per vpt values, rounded up to whole work groups
        size_t work_size = static_cast<size_t>(std::ceil(float(count) / vpt));
        if(work_size % tpb != 0){
            work_size += tpb - work_size % tpb;
        }
        queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);

        // the output of this pass is the input of the next one
        std::swap(input_buffer, output_buffer);
        count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));
    }

    // final pass, reduce from ping/pong to result
    reduce_kernel.set_arg(0, *input_buffer);
    reduce_kernel.set_arg(1, uint_(0));
    reduce_kernel.set_arg(2, uint_(count));
    reduce_kernel.set_arg(3, result.get_buffer());
    reduce_kernel.set_arg(4, uint_(result.get_index()));

    queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb);
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
@@ -0,0 +1,45 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_HPP
#include <boost/compute/device.hpp>
#include <boost/compute/algorithm/detail/scan_on_cpu.hpp>
#include <boost/compute/algorithm/detail/scan_on_gpu.hpp>
namespace boost {
namespace compute {
namespace detail {
template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
inline OutputIterator scan(InputIterator first,
InputIterator last,
OutputIterator result,
bool exclusive,
T init,
BinaryOperator op,
command_queue &queue)
{
const device &device = queue.get_device();
if(device.type() & device::cpu){
return scan_on_cpu(first, last, result, exclusive, init, op, queue);
}
else {
return scan_on_gpu(first, last, result, exclusive, init, op, queue);
}
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_HPP
@@ -0,0 +1,207 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
#include <iterator>
#include <boost/compute/device.hpp>
#include <boost/compute/kernel.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/detail/serial_scan.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/parameter_cache.hpp>
namespace boost {
namespace compute {
namespace detail {
/// \internal
/// Inclusive or exclusive (\p exclusive) scan of [\p first, \p last) with
/// \p op, written to \p result. Returns the end of the output range.
///
/// Short inputs (below the "serial_scan_threshold" parameter, never less
/// than the number of compute units) and devices with fewer than two
/// compute units use serial_scan(). Otherwise the input is split into
/// work_items + 1 contiguous blocks and processed by two kernels:
///  1. work-item 0 scans block 0 into the output while every work-item g
///     stores the total of block g in block_partial_sums[g];
///  2. work-item g combines block_partial_sums[0..g] and scans block g + 1.
template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
inline OutputIterator scan_on_cpu(InputIterator first,
                                  InputIterator last,
                                  OutputIterator result,
                                  bool exclusive,
                                  T init,
                                  BinaryOperator op,
                                  command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename
        std::iterator_traits<OutputIterator>::value_type output_type;

    const context &context = queue.get_context();
    const device &device = queue.get_device();
    const size_t compute_units = device.compute_units();

    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    std::string cache_key =
        "__boost_scan_cpu_" + boost::lexical_cast<std::string>(sizeof(T));

    // for inputs smaller than serial_scan_threshold
    // serial_scan algorithm is used
    uint_ serial_scan_threshold =
        parameters->get(cache_key, "serial_scan_threshold", 16384 * sizeof(T));
    serial_scan_threshold =
        (std::max)(serial_scan_threshold, uint_(compute_units));

    size_t count = detail::iterator_range_size(first, last);
    if(count == 0){
        return result;
    }
    else if(count < serial_scan_threshold || compute_units < 2) {
        // With a single compute unit the block kernel alone covers only the
        // first of the two blocks, so the whole range is scanned serially
        // instead of returning with half of the output unwritten.
        return serial_scan(first, last, result, exclusive, init, op, queue);
    }

    // Host-side partition with integer arithmetic. From here on
    // count >= compute_units >= 2, so there are at least two blocks.
    // Launching (block_count - 1) work-items guarantees that no work-item of
    // either kernel starts past the end of the input; both kernels derive
    // the same block size from get_global_size(0) + 1.
    const size_t max_blocks = compute_units + 1;
    const size_t block_size =
        count / max_blocks + (count % max_blocks != 0 ? 1 : 0);
    const size_t block_count =
        count / block_size + (count % block_size != 0 ? 1 : 0);
    const size_t global_work_size = block_count - 1;

    buffer block_partial_sums(context, sizeof(output_type) * compute_units );

    // create scan kernel
    meta_kernel k("scan_on_cpu_block_scan");

    // Arguments
    size_t count_arg = k.add_arg<uint_>("count");
    size_t init_arg = k.add_arg<output_type>("initial_value");
    size_t block_partial_sums_arg =
        k.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");

    k <<
        // integer ceil(count / parts); a float division would lose precision
        // for counts that are not representable as float
        "const uint parts = (uint)get_global_size(0) + 1;\n" <<
        "uint block = count / parts + ((count % parts) != 0 ? 1 : 0);\n" <<
        "uint index = get_global_id(0) * block;\n" <<
        "uint end = min(count, index + block);\n";

    if(!exclusive){
        k <<
            k.decl<output_type>("sum") << " = " <<
                first[k.var<uint_>("index")] << ";\n" <<
            result[k.var<uint_>("index")] << " = sum;\n" <<
            "index++;\n";
    }
    else {
        // only the very first element is seeded with the initial value; the
        // other blocks start from their first input element
        k <<
            k.decl<output_type>("sum") << ";\n" <<
            "if(index == 0){\n" <<
                "sum = initial_value;\n" <<
            "}\n" <<
            "else {\n" <<
                "sum = " << first[k.var<uint_>("index")] << ";\n" <<
                "index++;\n" <<
            "}\n";
    }

    k <<
        "while(index < end){\n" <<
            // load next value
            k.decl<const input_type>("value") << " = "
                << first[k.var<uint_>("index")] << ";\n";

    if(exclusive){
        // exclusive: store the running sum before adding the current value
        k <<
            "if(get_global_id(0) == 0){\n" <<
                result[k.var<uint_>("index")] << " = sum;\n" <<
            "}\n";
    }
    k <<
        "sum = " << op(k.var<output_type>("sum"),
                       k.var<output_type>("value")) << ";\n";

    if(!exclusive){
        // inclusive: store the running sum after adding the current value
        k <<
            "if(get_global_id(0) == 0){\n" <<
                result[k.var<uint_>("index")] << " = sum;\n" <<
            "}\n";
    }

    k <<
            "index++;\n" <<
        "}\n" << // end while
        "block_partial_sums[get_global_id(0)] = sum;\n";

    // compile scan kernel
    kernel block_scan_kernel = k.compile(context);

    // setup kernel arguments
    block_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
    block_scan_kernel.set_arg(init_arg, static_cast<output_type>(init));
    block_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);

    // execute the kernel
    queue.enqueue_1d_range_kernel(block_scan_kernel, 0, global_work_size, 0);

    // final scan kernel
    meta_kernel l("scan_on_cpu_final_scan");

    // Arguments
    count_arg = l.add_arg<uint_>("count");
    block_partial_sums_arg =
        l.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");

    l <<
        "const uint parts = (uint)get_global_size(0) + 1;\n" <<
        "uint block = count / parts + ((count % parts) != 0 ? 1 : 0);\n" <<
        // work-item g handles block g + 1
        "uint index = block + get_global_id(0) * block;\n" <<
        "uint end = min(count, index + block);\n" <<

        // start from the combined totals of all preceding blocks
        k.decl<output_type>("sum") << " = block_partial_sums[0];\n" <<
        "for(uint i = 0; i < get_global_id(0); i++) {\n" <<
            "sum = " << op(k.var<output_type>("sum"),
                           k.var<output_type>("block_partial_sums[i + 1]")) << ";\n" <<
        "}\n" <<

        "while(index < end){\n";
    if(exclusive){
        l <<
            l.decl<output_type>("value") << " = "
                << first[k.var<uint_>("index")] << ";\n" <<
            result[k.var<uint_>("index")] << " = sum;\n" <<
            "sum = " << op(k.var<output_type>("sum"),
                           k.var<output_type>("value")) << ";\n";
    }
    else {
        l <<
            "sum = " << op(k.var<output_type>("sum"),
                           first[k.var<uint_>("index")]) << ";\n" <<
            result[k.var<uint_>("index")] << " = sum;\n";
    }
    l <<
            "index++;\n" <<
        "}\n";

    // compile scan kernel
    kernel final_scan_kernel = l.compile(context);

    // setup kernel arguments
    final_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
    final_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);

    // execute the kernel with the same number of work-items as the block
    // kernel so both agree on the block size
    queue.enqueue_1d_range_kernel(final_scan_kernel, 0, global_work_size, 0);

    // return iterator pointing to the end of the result range
    return result + count;
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
@@ -0,0 +1,330 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_GPU_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_GPU_HPP
#include <boost/compute/kernel.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/memory/local_buffer.hpp>
#include <boost/compute/iterator/buffer_iterator.hpp>
namespace boost {
namespace compute {
namespace detail {
/// \internal
/// Meta kernel that scans each work group's slice of the input in local
/// memory, writes the scanned values to the output and stores one total per
/// work group in the block_sums argument. The m_*_arg members hold the
/// argument indices the caller has to set before launching the kernel.
template<class InputIterator, class OutputIterator, class BinaryOperator>
class local_scan_kernel : public meta_kernel
{
public:
    local_scan_kernel(InputIterator first,
                      InputIterator last,
                      OutputIterator result,
                      bool exclusive,
                      BinaryOperator op)
        : meta_kernel("local_scan")
    {
        typedef typename std::iterator_traits<InputIterator>::value_type T;

        (void) last;

        m_block_sums_arg = add_arg<T *>(memory_object::global_memory, "block_sums");
        m_scratch_arg = add_arg<T *>(memory_object::local_memory, "scratch");
        m_block_size_arg = add_arg<const cl_uint>("block_size");
        m_count_arg = add_arg<const cl_uint>("count");
        m_init_value_arg = add_arg<const T>("init");

        // work-item parameters, then the guard against the data size
        *this <<
            "const uint gid = get_global_id(0);\n" <<
            "const uint lid = get_local_id(0);\n" <<
            "if(gid < count){\n";

        // copy values from input to local memory; the exclusive variant
        // shifts the input by one element and seeds the first slot
        if(exclusive){
            *this <<
                decl<const T>("local_init") << "= (gid == 0) ? init : 0;\n" <<
                "if(lid == 0){ scratch[lid] = local_init; }\n" <<
                "else { scratch[lid] = " << first[expr<cl_uint>("gid-1")] << "; }\n";
        }
        else{
            *this <<
                "scratch[lid] = " << first[expr<cl_uint>("gid")] << ";\n";
        }

        // work-items past the end of the data contribute zero, then
        // wait for all threads to read from input
        *this <<
            "}\n"
            "else {\n"
            " scratch[lid] = 0;\n"
            "}\n"
            "barrier(CLK_LOCAL_MEM_FENCE);\n";

        // perform scan
        *this <<
            "for(uint i = 1; i < block_size; i <<= 1){\n" <<
            " " << decl<const T>("x") << " = lid >= i ? scratch[lid-i] : 0;\n" <<
            " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            " if(lid >= i){\n" <<
            " scratch[lid] = " << op(var<T>("scratch[lid]"), var<T>("x")) << ";\n" <<
            " }\n" <<
            " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "}\n";

        // copy results to output
        *this <<
            "if(gid < count){\n" <<
            result[expr<cl_uint>("gid")] << " = scratch[lid];\n" <<
            "}\n";

        // store sum for the block; written by the last work-item of the group
        *this <<
            "if(lid == block_size - 1){\n" <<
            " block_sums[get_group_id(0)] = ";
        if(exclusive){
            // the shifted scan does not include the group's last input value
            *this <<
                op(first[expr<cl_uint>("gid")], var<T>("scratch[lid]"));
        }
        else {
            *this << "scratch[lid]";
        }
        *this <<
            ";\n" <<
            "}\n";
    }

    size_t m_block_sums_arg;
    size_t m_scratch_arg;
    size_t m_block_size_arg;
    size_t m_count_arg;
    size_t m_init_value_arg;
};
/// \internal
/// Meta kernel that combines block_sums[get_group_id(0)] into output[gid]
/// with the binary operator for every gid below count. The m_*_arg members
/// hold the argument indices the caller has to set before launching.
template<class T, class BinaryOperator>
class write_scanned_output_kernel : public meta_kernel
{
public:
    write_scanned_output_kernel(BinaryOperator op)
        : meta_kernel("write_scanned_output")
    {
        m_output_arg = add_arg<T *>(memory_object::global_memory, "output");
        m_block_sums_arg = add_arg<const T *>(memory_object::global_memory, "block_sums");
        m_count_arg = add_arg<const cl_uint>("count");

        // work-item parameters, then the guard against the data size
        *this <<
            "const uint gid = get_global_id(0);\n" <<
            "const uint block_id = get_group_id(0);\n" <<
            "if(gid < count){\n";

        // write output
        *this <<
            "output[gid] = " <<
                op(var<T>("block_sums[block_id]"), var<T>("output[gid] ")) << ";\n" <<
            "}\n";
    }

    size_t m_output_arg;
    size_t m_block_sums_arg;
    size_t m_count_arg;
};
/// \internal
/// Picks the scan block size for the range [\p first, \p last): 0 for an
/// empty range, otherwise the smallest power of two that is not below the
/// element count, limited to 256.
template<class InputIterator>
inline size_t pick_scan_block_size(InputIterator first, InputIterator last)
{
    const size_t element_count = iterator_range_size(first, last);
    if(element_count == 0){
        return 0;
    }

    // double until the block covers the range or the 256 cap is reached
    size_t block_size = 1;
    while(block_size < element_count && block_size < 256){
        block_size <<= 1;
    }

    return block_size;
}
/// \internal
/// Scans [\p first, \p last) into \p result in three steps: a per-block
/// local scan that also records one total per block, a recursive inclusive
/// scan of those totals, and a pass that combines the scanned totals into
/// every block after the first. Returns the end of the output range; an
/// empty input range returns \p result unchanged.
template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
inline OutputIterator scan_impl(InputIterator first,
                                InputIterator last,
                                OutputIterator result,
                                bool exclusive,
                                T init,
                                BinaryOperator op,
                                command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type
        input_type;
    typedef typename
        std::iterator_traits<InputIterator>::difference_type
        difference_type;
    typedef typename
        std::iterator_traits<OutputIterator>::value_type
        output_type;

    const context &context = queue.get_context();
    const size_t count = detail::iterator_range_size(first, last);

    // pick_scan_block_size() returns 0 for an empty range, which would
    // otherwise be used as a divisor below
    if(count == 0){
        return result;
    }

    size_t block_size = pick_scan_block_size(first, last);
    // number of blocks, rounded up to cover a partial last block
    size_t block_count = count / block_size;

    if(block_count * block_size < count){
        block_count++;
    }

    ::boost::compute::vector<input_type> block_sums(block_count, context);

    // zero block sums
    input_type zero;
    std::memset(&zero, 0, sizeof(input_type));
    ::boost::compute::fill(block_sums.begin(), block_sums.end(), zero, queue);

    // local scan
    local_scan_kernel<InputIterator, OutputIterator, BinaryOperator>
        local_scan_kernel(first, last, result, exclusive, op);

    ::boost::compute::kernel kernel = local_scan_kernel.compile(context);
    kernel.set_arg(local_scan_kernel.m_scratch_arg, local_buffer<input_type>(block_size));
    kernel.set_arg(local_scan_kernel.m_block_sums_arg, block_sums);
    kernel.set_arg(local_scan_kernel.m_block_size_arg, static_cast<cl_uint>(block_size));
    kernel.set_arg(local_scan_kernel.m_count_arg, static_cast<cl_uint>(count));
    kernel.set_arg(local_scan_kernel.m_init_value_arg, static_cast<output_type>(init));

    queue.enqueue_1d_range_kernel(kernel,
                                  0,
                                  block_count * block_size,
                                  block_size);

    // inclusive scan block sums
    if(block_count > 1){
        scan_impl(block_sums.begin(),
                  block_sums.end(),
                  block_sums.begin(),
                  false,
                  init,
                  op,
                  queue
        );
    }

    // add block sums to each block
    if(block_count > 1){
        write_scanned_output_kernel<input_type, BinaryOperator>
            write_output_kernel(op);
        kernel = write_output_kernel.compile(context);
        // NOTE(review): the whole result buffer is passed and indexed by the
        // global id, so a result iterator with a non-zero index does not
        // appear to be taken into account in this pass - confirm.
        kernel.set_arg(write_output_kernel.m_output_arg, result.get_buffer());
        kernel.set_arg(write_output_kernel.m_block_sums_arg, block_sums);
        kernel.set_arg(write_output_kernel.m_count_arg, static_cast<cl_uint>(count));

        // the global offset of one block makes the pass start at the
        // second block of the output
        queue.enqueue_1d_range_kernel(kernel,
                                      block_size,
                                      block_count * block_size,
                                      block_size);
    }

    return result + static_cast<difference_type>(count);
}
/// \internal
/// Overload for distinct input and output iterator types: the scan is
/// forwarded to scan_impl() without any intermediate copy.
template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
inline OutputIterator dispatch_scan(InputIterator first,
                                    InputIterator last,
                                    OutputIterator result,
                                    bool exclusive,
                                    T init,
                                    BinaryOperator op,
                                    command_queue &queue)
{
    OutputIterator output_end =
        scan_impl(first, last, result, exclusive, init, op, queue);
    return output_end;
}
template<class InputIterator, class T, class BinaryOperator>
inline InputIterator dispatch_scan(InputIterator first,
InputIterator last,
InputIterator result,
bool exclusive,
T init,
BinaryOperator op,
command_queue &queue)
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
if(first == result){
// scan input in-place
const context &context = queue.get_context();
// make a temporary copy the input
size_t count = iterator_range_size(first, last);
vector<value_type> tmp(count, context);
copy(first, last, tmp.begin(), queue);
// scan from temporary values
return scan_impl(tmp.begin(), tmp.end(), first, exclusive, init, op, queue);
}
else {
// scan input to output
return scan_impl(first, last, result, exclusive, init, op, queue);
}
}
/// \internal
/// Scan entry point for non-CPU devices. An empty input range returns
/// \p result untouched; everything else goes through dispatch_scan().
template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
inline OutputIterator scan_on_gpu(InputIterator first,
                                  InputIterator last,
                                  OutputIterator result,
                                  bool exclusive,
                                  T init,
                                  BinaryOperator op,
                                  command_queue &queue)
{
    const bool nothing_to_scan = (first == last);

    return nothing_to_scan
        ? result
        : dispatch_scan(first, last, result, exclusive, init, op, queue);
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_GPU_HPP
@@ -0,0 +1,86 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SEARCH_ALL_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_SEARCH_ALL_HPP
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/lambda.hpp>
#include <boost/compute/system.hpp>
namespace boost {
namespace compute {
namespace detail {
///
/// \brief Search kernel class
///
/// Subclass of meta_kernel which is capable of performing pattern matching.
///
/// For every candidate start position in the text the generated kernel
/// compares the pattern element by element and writes 1 to the matching
/// slot of the output range on a full match, 0 otherwise.
///
template<class PatternIterator, class TextIterator, class OutputIterator>
class search_kernel : public meta_kernel
{
public:
    // All bookkeeping members start at zero so that exec() is a harmless
    // no-op when it is called before set_range().
    search_kernel()
        : meta_kernel("search"),
          m_p_count(0),
          m_p_count_arg(0),
          m_count(0)
    {}

    /// Records the pattern/text sizes and emits the kernel source.
    ///
    /// \param p_first,p_last pattern range
    /// \param t_first,t_last text range
    /// \param result start of the output range receiving one flag per
    ///        candidate start position
    void set_range(PatternIterator p_first,
                   PatternIterator p_last,
                   TextIterator t_first,
                   TextIterator t_last,
                   OutputIterator result)
    {
        m_p_count = iterator_range_size(p_first, p_last);
        m_p_count_arg = add_arg<uint_>("p_count");

        // number of start positions at which the whole pattern still fits.
        // a pattern longer than the text has none; computing
        // "text + 1 - pattern" directly would wrap around in size_t.
        const size_t text_count = iterator_range_size(t_first, t_last);
        m_count = (m_p_count <= text_count) ? (text_count + 1 - m_p_count) : 0;

        *this <<
            "uint i = get_global_id(0);\n" <<
            "const uint i1 = i;\n" <<
            "uint j;\n" <<
            "for(j = 0; j<p_count; j++,i++)\n" <<
            "{\n" <<
            " if(" << p_first[expr<uint_>("j")] << " != " <<
                t_first[expr<uint_>("i")] << ")\n" <<
            " j = p_count + 1;\n" <<
            "}\n" <<
            "if(j == p_count)\n" <<
            result[expr<uint_>("i1")] << " = 1;\n" <<
            "else\n" <<
            result[expr<uint_>("i1")] << " = 0;\n";
    }

    /// Runs the kernel with one work-item per candidate start position.
    /// Returns a default-constructed event when there is nothing to search.
    event exec(command_queue &queue)
    {
        if(m_count == 0) {
            return event();
        }

        set_arg(m_p_count_arg, uint_(m_p_count));

        return exec_1d(queue, 0, m_count);
    }

private:
    size_t m_p_count;     // number of elements in the pattern
    size_t m_p_count_arg; // index of the "p_count" kernel argument
    size_t m_count;       // number of candidate start positions (work-items)
};
} //end detail namespace
} //end compute namespace
} //end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SEARCH_ALL_HPP
@@ -0,0 +1,56 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_ACCUMULATE_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_ACCUMULATE_HPP
#include <boost/compute/command_queue.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
namespace boost {
namespace compute {
namespace detail {
template<class InputIterator, class OutputIterator, class T, class BinaryFunction>
inline void serial_accumulate(InputIterator first,
InputIterator last,
OutputIterator result,
T init,
BinaryFunction function,
command_queue &queue)
{
const context &context = queue.get_context();
size_t count = detail::iterator_range_size(first, last);
meta_kernel k("serial_accumulate");
size_t init_arg = k.add_arg<T>("init");
size_t count_arg = k.add_arg<cl_uint>("count");
k <<
k.decl<T>("result") << " = init;\n" <<
"for(uint i = 0; i < count; i++)\n" <<
" result = " << function(k.var<T>("result"),
first[k.var<cl_uint>("i")]) << ";\n" <<
result[0] << " = result;\n";
kernel kernel = k.compile(context);
kernel.set_arg(init_arg, init);
kernel.set_arg(count_arg, static_cast<cl_uint>(count));
queue.enqueue_task(kernel);
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_ACCUMULATE_HPP
@@ -0,0 +1,68 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_COUNT_IF_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_COUNT_IF_HPP
#include <iterator>
#include <boost/compute/container/detail/scalar.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
namespace boost {
namespace compute {
namespace detail {
// counts values that match the predicate using a single thread
template<class InputIterator, class Predicate>
inline size_t serial_count_if(InputIterator first,
InputIterator last,
Predicate predicate,
command_queue &queue)
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
const context &context = queue.get_context();
size_t size = iterator_range_size(first, last);
meta_kernel k("serial_count_if");
k.add_set_arg("size", static_cast<uint_>(size));
size_t result_arg = k.add_arg<uint_ *>(memory_object::global_memory, "result");
k <<
"uint count = 0;\n" <<
"for(uint i = 0; i < size; i++){\n" <<
k.decl<const value_type>("value") << "="
<< first[k.var<uint_>("i")] << ";\n" <<
"if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
"count++;\n" <<
"}\n"
"}\n"
"*result = count;\n";
kernel kernel = k.compile(context);
// setup result buffer
scalar<uint_> result(context);
kernel.set_arg(result_arg, result.get_buffer());
// run kernel
queue.enqueue_task(kernel);
// read index
return result.read(queue);
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_COUNT_IF_HPP
@@ -0,0 +1,87 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_FIND_EXTREMA_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_FIND_EXTREMA_HPP
#include <boost/compute/command_queue.hpp>
#include <boost/compute/types/fundamental.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/container/detail/scalar.hpp>
namespace boost {
namespace compute {
namespace detail {
// Finds the position of the extreme element of [first, last) with a single
// device task.
//
// \param first,last range to search
// \param compare binary comparison emitted into the kernel
// \param find_minimum when false the kernel is compiled with
//        -DBOOST_COMPUTE_FIND_MAXIMUM, which swaps the comparison operands
// \param queue command queue used to compile and run the kernel
//
// \return iterator to the selected element; \p first when the range is empty
template<class InputIterator, class Compare>
inline InputIterator serial_find_extrema(InputIterator first,
                                         InputIterator last,
                                         Compare compare,
                                         const bool find_minimum,
                                         command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    // the generated kernel unconditionally loads element 0, so an empty
    // range must be answered on the host without launching anything
    size_t count = iterator_range_size(first, last);
    if(count == 0){
        return first;
    }

    const context &context = queue.get_context();

    meta_kernel k("serial_find_extrema");

    k <<
        k.decl<value_type>("value") << " = " << first[k.expr<uint_>("0")] << ";\n" <<
        k.decl<uint_>("value_index") << " = 0;\n" <<
        "for(uint i = 1; i < size; i++){\n" <<
        " " << k.decl<value_type>("candidate") << "="
            << first[k.expr<uint_>("i")] << ";\n" <<
        "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
        " if(" << compare(k.var<value_type>("candidate"),
                          k.var<value_type>("value")) << "){\n" <<
        "#else\n" <<
        " if(" << compare(k.var<value_type>("value"),
                          k.var<value_type>("candidate")) << "){\n" <<
        "#endif\n" <<
        " value = candidate;\n" <<
        " value_index = i;\n" <<
        " }\n" <<
        "}\n" <<
        "*index = value_index;\n";

    size_t index_arg_index = k.add_arg<uint_ *>(memory_object::global_memory, "index");
    size_t size_arg_index = k.add_arg<uint_>("size");

    // select the maximum-finding variant through a preprocessor define
    std::string options;
    if(!find_minimum){
        options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
    }
    kernel kernel = k.compile(context, options);

    // setup index buffer
    scalar<uint_> index(context);
    kernel.set_arg(index_arg_index, index.get_buffer());

    // setup count
    kernel.set_arg(size_arg_index, static_cast<uint_>(count));

    // run kernel
    queue.enqueue_task(kernel);

    // read index and return iterator
    return first + static_cast<difference_type>(index.read(queue));
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_FIND_EXTREMA_HPP
@@ -0,0 +1,97 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_SERIAL_MERGE_HPP
#define BOOST_COMPUTE_ALGORITHM_SERIAL_MERGE_HPP
#include <iterator>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
namespace boost {
namespace compute {
namespace detail {
// Merges the ranges [first1, last1) and [first2, last2) into the range
// beginning at result using a single generated kernel, and returns an
// iterator size1 + size2 elements past result.
//
// The kernel walks both inputs with one index each (j and k), writes to the
// output index i, and uses comp to decide which current value is emitted
// next; whatever remains in either input is appended afterwards.
template<class InputIterator1,
class InputIterator2,
class OutputIterator,
class Compare>
inline OutputIterator serial_merge(InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
InputIterator2 last2,
OutputIterator result,
Compare comp,
command_queue &queue)
{
typedef typename
std::iterator_traits<InputIterator1>::value_type
input_type1;
typedef typename
std::iterator_traits<InputIterator2>::value_type
input_type2;
typedef typename
std::iterator_traits<OutputIterator>::difference_type
result_difference_type;
std::ptrdiff_t size1 = std::distance(first1, last1);
std::ptrdiff_t size2 = std::distance(first2, last2);
meta_kernel k("serial_merge");
// both sizes are passed to the kernel as uint_ values
k.add_set_arg<uint_>("size1", static_cast<uint_>(size1));
k.add_set_arg<uint_>("size2", static_cast<uint_>(size2));
k <<
"uint i = 0;\n" << // index in result range
"uint j = 0;\n" << // index in first input range
"uint k = 0;\n" << // index in second input range
// fetch initial values from each range
// NOTE(review): these two loads are emitted unconditionally, so they are
// executed even when size1 or size2 is 0 - confirm callers never pass an
// empty input range.
k.decl<input_type1>("j_value") << " = " << first1[0] << ";\n" <<
k.decl<input_type2>("k_value") << " = " << first2[0] << ";\n" <<
// merge values from both input ranges to the result range
// NOTE(review): after the last element of a range is written, the "++j" /
// "++k" prefetch loads index size1 / size2 before the loop condition is
// re-checked - TODO confirm that this one-past-the-end load is acceptable.
"while(j < size1 && k < size2){\n" <<
" if(" << comp(k.var<input_type1>("j_value"),
k.var<input_type2>("k_value")) << "){\n" <<
" " << result[k.var<uint_>("i++")] << " = j_value;\n" <<
" j_value = " << first1[k.var<uint_>("++j")] << ";\n" <<
" }\n" <<
" else{\n"
" " << result[k.var<uint_>("i++")] << " = k_value;\n"
" k_value = " << first2[k.var<uint_>("++k")] << ";\n" <<
" }\n"
"}\n"
// copy any remaining values from first range
"while(j < size1){\n" <<
result[k.var<uint_>("i++")] << " = " <<
first1[k.var<uint_>("j++")] << ";\n" <<
"}\n"
// copy any remaining values from second range
"while(k < size2){\n" <<
result[k.var<uint_>("i++")] << " = " <<
first2[k.var<uint_>("k++")] << ";\n" <<
"}\n";
// run kernel
k.exec(queue);
// end of the merged output: one element per input element
return result + static_cast<result_difference_type>(size1 + size2);
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_SERIAL_MERGE_HPP
@@ -0,0 +1,62 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_HPP
#include <boost/compute/command_queue.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/type_traits/result_of.hpp>
namespace boost {
namespace compute {
namespace detail {
template<class InputIterator, class OutputIterator, class BinaryFunction>
inline void serial_reduce(InputIterator first,
InputIterator last,
OutputIterator result,
BinaryFunction function,
command_queue &queue)
{
typedef typename
std::iterator_traits<InputIterator>::value_type T;
typedef typename
::boost::compute::result_of<BinaryFunction(T, T)>::type result_type;
const context &context = queue.get_context();
size_t count = detail::iterator_range_size(first, last);
if(count == 0){
return;
}
meta_kernel k("serial_reduce");
size_t count_arg = k.add_arg<cl_uint>("count");
k <<
k.decl<result_type>("result") << " = " << first[0] << ";\n" <<
"for(uint i = 1; i < count; i++)\n" <<
" result = " << function(k.var<T>("result"),
first[k.var<uint_>("i")]) << ";\n" <<
result[0] << " = result;\n";
kernel kernel = k.compile(context);
kernel.set_arg(count_arg, static_cast<uint_>(count));
queue.enqueue_task(kernel);
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_HPP
@@ -0,0 +1,108 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_BY_KEY_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_BY_KEY_HPP
#include <iterator>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/container/detail/scalar.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/type_traits/result_of.hpp>
namespace boost {
namespace compute {
namespace detail {
// Reduces runs of consecutive keys with a single generated kernel.
//
// The kernel walks the keys in order. While predicate(previous_key, key)
// holds, the value is folded into the running result with function;
// otherwise the finished (key, result) pair is written to the output ranges
// and a new run is started. The number of runs is written to a device
// scalar, read back and returned. An empty key range returns 0 without
// launching anything.
template<class InputKeyIterator, class InputValueIterator,
class OutputKeyIterator, class OutputValueIterator,
class BinaryFunction, class BinaryPredicate>
inline size_t serial_reduce_by_key(InputKeyIterator keys_first,
InputKeyIterator keys_last,
InputValueIterator values_first,
OutputKeyIterator keys_result,
OutputValueIterator values_result,
BinaryFunction function,
BinaryPredicate predicate,
command_queue &queue)
{
typedef typename
std::iterator_traits<InputValueIterator>::value_type value_type;
typedef typename
std::iterator_traits<InputKeyIterator>::value_type key_type;
typedef typename
::boost::compute::result_of<BinaryFunction(value_type, value_type)>::type result_type;
const context &context = queue.get_context();
size_t count = detail::iterator_range_size(keys_first, keys_last);
// nothing to reduce: count is 0 here, so 0 is returned
if(count < 1){
return count;
}
meta_kernel k("serial_reduce_by_key");
size_t count_arg = k.add_arg<uint_>("count");
size_t result_size_arg = k.add_arg<uint_ *>(memory_object::global_memory,
"result_size");
// input values are converted to the reduction's result type before use
convert<result_type> to_result_type;
k <<
// seed the first run with element 0 and pre-write it to output slot 0
k.decl<result_type>("result") <<
" = " << to_result_type(values_first[0]) << ";\n" <<
k.decl<key_type>("previous_key") << " = " << keys_first[0] << ";\n" <<
k.decl<result_type>("value") << ";\n" <<
k.decl<key_type>("key") << ";\n" <<
k.decl<uint_>("size") << " = 1;\n" <<
keys_result[0] << " = previous_key;\n" <<
values_result[0] << " = result;\n" <<
"for(ulong i = 1; i < count; i++) {\n" <<
" value = " << to_result_type(values_first[k.var<uint_>("i")]) << ";\n" <<
" key = " << keys_first[k.var<uint_>("i")] << ";\n" <<
// same run: fold the value into the running result
" if (" << predicate(k.var<key_type>("previous_key"),
k.var<key_type>("key")) << ") {\n" <<
" result = " << function(k.var<result_type>("result"),
k.var<result_type>("value")) << ";\n" <<
" }\n " <<
// new run: flush the finished run, then restart from the current value
" else { \n" <<
keys_result[k.var<uint_>("size - 1")] << " = previous_key;\n" <<
values_result[k.var<uint_>("size - 1")] << " = result;\n" <<
" result = value;\n" <<
" size++;\n" <<
" } \n" <<
" previous_key = key;\n" <<
"}\n" <<
// flush the last run and publish the number of runs
keys_result[k.var<uint_>("size - 1")] << " = previous_key;\n" <<
values_result[k.var<uint_>("size - 1")] << " = result;\n" <<
"*result_size = size;";
kernel kernel = k.compile(context);
// device scalar receiving the number of runs written by the kernel
scalar<uint_> result_size(context);
kernel.set_arg(result_size_arg, result_size.get_buffer());
kernel.set_arg(count_arg, static_cast<uint_>(count));
queue.enqueue_task(kernel);
return static_cast<size_t>(result_size.read(queue));
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_BY_KEY_HPP
@@ -0,0 +1,103 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_SCAN_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_SCAN_HPP
#include <iterator>
#include <boost/compute/device.hpp>
#include <boost/compute/kernel.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
namespace boost {
namespace compute {
namespace detail {
template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
inline OutputIterator serial_scan(InputIterator first,
InputIterator last,
OutputIterator result,
bool exclusive,
T init,
BinaryOperator op,
command_queue &queue)
{
if(first == last){
return result;
}
typedef typename
std::iterator_traits<InputIterator>::value_type input_type;
typedef typename
std::iterator_traits<OutputIterator>::value_type output_type;
const context &context = queue.get_context();
// create scan kernel
meta_kernel k("serial_scan");
// Arguments
size_t n_arg = k.add_arg<ulong_>("n");
size_t init_arg = k.add_arg<output_type>("initial_value");
if(!exclusive){
k <<
k.decl<const ulong_>("start_idx") << " = 1;\n" <<
k.decl<output_type>("sum") << " = " << first[0] << ";\n" <<
result[0] << " = sum;\n";
}
else {
k <<
k.decl<const ulong_>("start_idx") << " = 0;\n" <<
k.decl<output_type>("sum") << " = initial_value;\n";
}
k <<
"for(ulong i = start_idx; i < n; i++){\n" <<
k.decl<const input_type>("x") << " = "
<< first[k.var<ulong_>("i")] << ";\n";
if(exclusive){
k << result[k.var<ulong_>("i")] << " = sum;\n";
}
k << " sum = "
<< op(k.var<output_type>("sum"), k.var<output_type>("x"))
<< ";\n";
if(!exclusive){
k << result[k.var<ulong_>("i")] << " = sum;\n";
}
k << "}\n";
// compile scan kernel
kernel scan_kernel = k.compile(context);
// setup kernel arguments
size_t n = detail::iterator_range_size(first, last);
scan_kernel.set_arg<ulong_>(n_arg, n);
scan_kernel.set_arg<output_type>(init_arg, static_cast<output_type>(init));
// execute the kernel
queue.enqueue_1d_range_kernel(scan_kernel, 0, 1, 1);
// return iterator pointing to the end of the result range
return result + n;
}
} // end detail namespace
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_SCAN_HPP
@@ -0,0 +1,53 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_EQUAL_HPP
#define BOOST_COMPUTE_ALGORITHM_EQUAL_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/mismatch.hpp>
namespace boost {
namespace compute {
/// Returns \c true if every element of [\p first1, \p last1) matches the
/// corresponding element of the range beginning at \p first2, i.e. if
/// mismatch() finds no differing position before \p last1.
template<class InputIterator1, class InputIterator2>
inline bool equal(InputIterator1 first1,
                  InputIterator1 last1,
                  InputIterator2 first2,
                  command_queue &queue = system::default_queue())
{
    const bool reached_end =
        ::boost::compute::mismatch(first1, last1, first2, queue).first == last1;

    return reached_end;
}
/// \overload
///
/// Ranges of different length are never equal; the element comparison is
/// only performed when both ranges contain the same number of elements.
template<class InputIterator1, class InputIterator2>
inline bool equal(InputIterator1 first1,
                  InputIterator1 last1,
                  InputIterator2 first2,
                  InputIterator2 last2,
                  command_queue &queue = system::default_queue())
{
    const bool same_length =
        std::distance(first1, last1) == std::distance(first2, last2);

    return same_length
        && ::boost::compute::equal(first1, last1, first2, queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_EQUAL_HPP
@@ -0,0 +1,42 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_EQUAL_RANGE_HPP
#define BOOST_COMPUTE_ALGORITHM_EQUAL_RANGE_HPP
#include <utility>
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/lower_bound.hpp>
#include <boost/compute/algorithm/upper_bound.hpp>
namespace boost {
namespace compute {
/// Returns the pair (lower_bound, upper_bound) for \p value in the sorted
/// range [\p first, \p last), i.e. the sub-range of elements equal to
/// \p value.
template<class InputIterator, class T>
inline std::pair<InputIterator, InputIterator>
equal_range(InputIterator first,
            InputIterator last,
            const T &value,
            command_queue &queue = system::default_queue())
{
    InputIterator range_begin =
        ::boost::compute::lower_bound(first, last, value, queue);
    InputIterator range_end =
        ::boost::compute::upper_bound(first, last, value, queue);

    return std::make_pair(range_begin, range_end);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_EQUAL_RANGE_HPP
@@ -0,0 +1,96 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_EXCLUSIVE_SCAN_HPP
#define BOOST_COMPUTE_ALGORITHM_EXCLUSIVE_SCAN_HPP
#include <boost/compute/functional.hpp>
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/detail/scan.hpp>
namespace boost {
namespace compute {
/// Performs an exclusive scan of the elements in the range [\p first, \p last)
/// and stores the results in the range beginning at \p result.
///
/// Each element in the output is assigned the combination of all the
/// previous values in the input (their sum, by default).
///
/// \param first first element in the range to scan
/// \param last last element in the range to scan
/// \param result first element in the result range
/// \param init value used to initialize the scan sequence
/// \param binary_op associative binary operator
/// \param queue command queue to perform the operation
///
/// \return \c OutputIterator to the end of the result range
///
/// The default operation is to add the elements up.
///
/// \snippet test/test_scan.cpp exclusive_scan_int
///
/// A different associative operation can be specified as \p binary_op
/// instead (e.g., multiplication, maximum, minimum). The value used to
/// initialize the scan sequence can also be specified.
///
/// \snippet test/test_scan.cpp exclusive_scan_int_multiplies
///
/// \see inclusive_scan()
template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
inline OutputIterator
exclusive_scan(InputIterator first,
               InputIterator last,
               OutputIterator result,
               T init,
               BinaryOperator binary_op,
               command_queue &queue = system::default_queue())
{
    // detail::scan() serves both scan flavours; request the exclusive one
    const bool exclusive = true;

    return detail::scan(first, last, result, exclusive, init, binary_op, queue);
}
/// \overload
template<class InputIterator, class OutputIterator, class T>
inline OutputIterator
exclusive_scan(InputIterator first,
InputIterator last,
OutputIterator result,
T init,
command_queue &queue = system::default_queue())
{
typedef typename
std::iterator_traits<OutputIterator>::value_type output_type;
return detail::scan(first, last, result, true,
init, boost::compute::plus<output_type>(),
queue);
}
/// \overload
template<class InputIterator, class OutputIterator>
inline OutputIterator
exclusive_scan(InputIterator first,
InputIterator last,
OutputIterator result,
command_queue &queue = system::default_queue())
{
typedef typename
std::iterator_traits<OutputIterator>::value_type output_type;
return detail::scan(first, last, result, true,
output_type(0), boost::compute::plus<output_type>(),
queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_EXCLUSIVE_SCAN_HPP
@@ -0,0 +1,306 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_FILL_HPP
#define BOOST_COMPUTE_ALGORITHM_FILL_HPP
#include <iterator>
#include <boost/mpl/int.hpp>
#include <boost/mpl/vector.hpp>
#include <boost/mpl/contains.hpp>
#include <boost/utility/enable_if.hpp>
#include <boost/compute/cl.hpp>
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/async/future.hpp>
#include <boost/compute/iterator/constant_iterator.hpp>
#include <boost/compute/iterator/discard_iterator.hpp>
#include <boost/compute/detail/is_buffer_iterator.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
namespace boost {
namespace compute {
namespace detail {
namespace mpl = boost::mpl;
// fills the range [first, first + count) with value using copy()
template<class BufferIterator, class T>
inline void fill_with_copy(BufferIterator first,
size_t count,
const T &value,
command_queue &queue)
{
::boost::compute::copy(
::boost::compute::make_constant_iterator(value, 0),
::boost::compute::make_constant_iterator(value, count),
first,
queue
);
}
// fills the range [first, first + count) with value through copy_async(),
// reading from a constant_iterator range that yields value count times;
// the returned future is the one produced by copy_async()
template<class BufferIterator, class T>
inline future<void> fill_async_with_copy(BufferIterator first,
                                         size_t count,
                                         const T &value,
                                         command_queue &queue)
{
    constant_iterator<T> source_begin =
        ::boost::compute::make_constant_iterator(value, 0);
    constant_iterator<T> source_end =
        ::boost::compute::make_constant_iterator(value, count);

    return ::boost::compute::copy_async(source_begin, source_end, first, queue);
}
#if defined(CL_VERSION_1_2)
// meta-function returning true if Iterator points to a range of values
// that can be filled using clEnqueueFillBuffer(). to meet this criteria
// it must have a buffer accessible through iter.get_buffer() and the
// size of its value_type must be in {1, 2, 4, 8, 16, 32, 64, 128}.
template<class Iterator>
struct is_valid_fill_buffer_iterator :
public mpl::and_<
is_buffer_iterator<Iterator>,
// sizeof(value_type) must appear in the list of supported pattern sizes
mpl::contains<
mpl::vector<
mpl::int_<1>,
mpl::int_<2>,
mpl::int_<4>,
mpl::int_<8>,
mpl::int_<16>,
mpl::int_<32>,
mpl::int_<64>,
mpl::int_<128>
>,
mpl::int_<
sizeof(typename std::iterator_traits<Iterator>::value_type)
>
>
>::type { };
// discard_iterator is explicitly excluded from the fill-buffer path
template<>
struct is_valid_fill_buffer_iterator<discard_iterator> : public boost::false_type {};
// overload for iterators accepted by is_valid_fill_buffer_iterator: fills
// [first, first + count) with clEnqueueFillBuffer when the device supports
// OpenCL 1.2, and falls back to fill_with_copy() otherwise
template<class BufferIterator, class T>
inline void
dispatch_fill(BufferIterator first,
              size_t count,
              const T &value,
              command_queue &queue,
              typename boost::enable_if<
                  is_valid_fill_buffer_iterator<BufferIterator>
              >::type* = 0)
{
    typedef typename std::iterator_traits<BufferIterator>::value_type value_type;

    // nothing to do
    if(count == 0){
        return;
    }

    // enqueue_fill_buffer requires OpenCL 1.2 on the device
    if(!queue.check_device_version(1, 2)){
        fill_with_copy(first, count, value, queue);
        return;
    }

    value_type pattern = static_cast<value_type>(value);
    const size_t element_size = sizeof(value_type);
    const size_t byte_offset =
        static_cast<size_t>(first.get_index()) * element_size;

    if(count != 1){
        queue.enqueue_fill_buffer(
            first.get_buffer(),
            &pattern,
            element_size,
            byte_offset,
            count * element_size
        );
        return;
    }

    // use clEnqueueWriteBuffer() directly when writing a single value
    // to the device buffer. this is potentially more efficient and also
    // works around a bug in the intel opencl driver.
    queue.enqueue_write_buffer(
        first.get_buffer(),
        byte_offset,
        element_size,
        &pattern
    );
}
// asynchronous counterpart of the fill-buffer dispatch_fill(): enqueues a
// clEnqueueFillBuffer command for [first, first + count) and returns a
// future tied to its event.
//
// an empty range enqueues nothing and yields a default-constructed future,
// matching the synchronous overload and the svm_ptr overload. devices
// without OpenCL 1.2 fall back to fill_async_with_copy().
template<class BufferIterator, class T>
inline future<void>
dispatch_fill_async(BufferIterator first,
                    size_t count,
                    const T &value,
                    command_queue &queue,
                    typename boost::enable_if<
                        is_valid_fill_buffer_iterator<BufferIterator>
                    >::type* = 0)
{
    typedef typename std::iterator_traits<BufferIterator>::value_type value_type;

    // nothing to do: do not enqueue a zero-sized fill command
    if(count == 0){
        return future<void>();
    }

    // check if the device supports OpenCL 1.2 (required for enqueue_fill_buffer)
    if(!queue.check_device_version(1, 2)){
        return fill_async_with_copy(first, count, value, queue);
    }

    value_type pattern = static_cast<value_type>(value);
    size_t offset = static_cast<size_t>(first.get_index());

    event event_ =
        queue.enqueue_fill_buffer(first.get_buffer(),
                                  &pattern,
                                  sizeof(value_type),
                                  offset * sizeof(value_type),
                                  count * sizeof(value_type));

    return future<void>(event_);
}
#ifdef CL_VERSION_2_0
// specializations for svm_ptr<T>

// fills count elements starting at the shared-virtual-memory pointer first
// with value through enqueue_svm_fill(); an empty range enqueues nothing
template<class T>
inline void dispatch_fill(svm_ptr<T> first,
                          size_t count,
                          const T &value,
                          command_queue &queue)
{
    if(count != 0){
        const size_t byte_count = count * sizeof(T);
        queue.enqueue_svm_fill(first.get(), &value, sizeof(T), byte_count);
    }
}
// asynchronous svm_ptr<T> fill: returns a future tied to the event of the
// enqueue_svm_fill() command, or a default-constructed future when there is
// nothing to fill
template<class T>
inline future<void> dispatch_fill_async(svm_ptr<T> first,
                                        size_t count,
                                        const T &value,
                                        command_queue &queue)
{
    if(count != 0){
        const size_t byte_count = count * sizeof(T);
        event fill_event =
            queue.enqueue_svm_fill(first.get(), &value, sizeof(T), byte_count);

        return future<void>(fill_event);
    }

    return future<void>();
}
#endif // CL_VERSION_2_0
// default implementations

// used for every iterator rejected by is_valid_fill_buffer_iterator:
// the fill is performed with fill_with_copy()
template<class BufferIterator, class T>
inline void
dispatch_fill(BufferIterator first,
              size_t count,
              const T &value,
              command_queue &queue,
              typename boost::disable_if<
                  is_valid_fill_buffer_iterator<BufferIterator>
              >::type* = 0)
{
    return fill_with_copy(first, count, value, queue);
}
// asynchronous default: used for every iterator rejected by
// is_valid_fill_buffer_iterator, forwards to fill_async_with_copy()
template<class BufferIterator, class T>
inline future<void>
dispatch_fill_async(BufferIterator first,
                    size_t count,
                    const T &value,
                    command_queue &queue,
                    typename boost::disable_if<
                        is_valid_fill_buffer_iterator<BufferIterator>
                    >::type* = 0)
{
    future<void> pending = fill_async_with_copy(first, count, value, queue);
    return pending;
}
#else
// build without CL_VERSION_1_2: every fill goes through fill_with_copy()
template<class BufferIterator, class T>
inline void dispatch_fill(BufferIterator first,
                          size_t count,
                          const T &value,
                          command_queue &queue)
{
    return fill_with_copy(first, count, value, queue);
}
// build without CL_VERSION_1_2: every asynchronous fill goes through
// fill_async_with_copy()
template<class BufferIterator, class T>
inline future<void> dispatch_fill_async(BufferIterator first,
                                        size_t count,
                                        const T &value,
                                        command_queue &queue)
{
    future<void> pending = fill_async_with_copy(first, count, value, queue);
    return pending;
}
#endif // !defined(CL_VERSION_1_2)
} // end detail namespace
/// Fills the range [\p first, \p last) with \p value.
///
/// \param first first element in the range to fill
/// \param last last element in the range to fill
/// \param value value to copy to each element
/// \param queue command queue to perform the operation
///
/// An empty range is a no-op. For example, to fill a vector on the device
/// with sevens:
/// \code
/// // vector on the device
/// boost::compute::vector<int> vec(10, context);
///
/// // fill vector with sevens
/// boost::compute::fill(vec.begin(), vec.end(), 7, queue);
/// \endcode
///
/// \see boost::compute::fill_n()
template<class BufferIterator, class T>
inline void fill(BufferIterator first,
                 BufferIterator last,
                 const T &value,
                 command_queue &queue = system::default_queue())
{
    const size_t element_count = detail::iterator_range_size(first, last);

    if(element_count != 0){
        detail::dispatch_fill(first, element_count, value, queue);
    }
}
/// Asynchronous counterpart of fill(): assigns \p value to every element of
/// the range [\p first, \p last) and returns a future for the operation.
///
/// \param first iterator to the first element to fill
/// \param last iterator one past the final element to fill
/// \param value value copied into each element
/// \param queue command queue on which the fill is performed
///
/// \return the future produced by the fill dispatcher, or a
///         default-constructed future when the range is empty
template<class BufferIterator, class T>
inline future<void> fill_async(BufferIterator first,
                               BufferIterator last,
                               const T &value,
                               command_queue &queue = system::default_queue())
{
    const size_t element_count = detail::iterator_range_size(first, last);

    if(element_count != 0){
        return detail::dispatch_fill_async(first, element_count, value, queue);
    }

    // empty range: nothing to enqueue
    return future<void>();
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_FILL_HPP
@@ -0,0 +1,36 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_FILL_N_HPP
#define BOOST_COMPUTE_ALGORITHM_FILL_N_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/fill.hpp>
namespace boost {
namespace compute {
/// Assigns \p value to the \p count elements starting at \p first, i.e. to
/// the range [\p first, \p first + \p count).
///
/// Implemented in terms of fill().
///
/// \see fill()
template<class BufferIterator, class Size, class T>
inline void fill_n(BufferIterator first,
                   Size count,
                   const T &value,
                   command_queue &queue = system::default_queue())
{
    // end of the range to fill
    const BufferIterator last = first + count;

    ::boost::compute::fill(first, last, value, queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_FILL_N_HPP
@@ -0,0 +1,57 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_FIND_HPP
#define BOOST_COMPUTE_ALGORITHM_FIND_HPP
#include <boost/compute/lambda.hpp>
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/find_if.hpp>
#include <boost/compute/type_traits/vector_size.hpp>
namespace boost {
namespace compute {
/// Searches the range [\p first, \p last) for the first element equal to
/// \p value and returns an iterator to it (the result of the underlying
/// find_if() call).
///
/// For element types whose vector_size is not 1 the comparison is wrapped
/// in lambda::all(), so every component has to compare equal.
template<class InputIterator, class T>
inline InputIterator find(InputIterator first,
                          InputIterator last,
                          const T &value,
                          command_queue &queue = system::default_queue())
{
    typedef typename std::iterator_traits<InputIterator>::value_type element_type;

    using ::boost::compute::_1;
    using ::boost::compute::lambda::all;

    // vector-valued elements: reduce the component-wise comparison with all()
    if(vector_size<element_type>::value != 1){
        return ::boost::compute::find_if(first, last, all(_1 == value), queue);
    }

    // scalar elements: plain equality predicate
    return ::boost::compute::find_if(first, last, _1 == value, queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_FIND_HPP
@@ -0,0 +1,136 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_FIND_END_HPP
#define BOOST_COMPUTE_ALGORITHM_FIND_END_HPP
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/algorithm/detail/search_all.hpp>
#include <boost/compute/container/detail/scalar.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/system.hpp>
namespace boost {
namespace compute {
namespace detail {
///
/// \brief Helper function for find_end
///
/// Basically a copy of find_if which returns last occurence
/// instead of first occurence
///
template<class InputIterator, class UnaryPredicate>
inline InputIterator find_end_helper(InputIterator first,
InputIterator last,
UnaryPredicate predicate,
command_queue &queue)
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
size_t count = detail::iterator_range_size(first, last);
if(count == 0){
return last;
}
const context &context = queue.get_context();
detail::meta_kernel k("find_end");
size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");
atomic_max<int_> atomic_max_int;
k << k.decl<const int_>("i") << " = get_global_id(0);\n"
<< k.decl<const value_type>("value") << "="
<< first[k.var<const int_>("i")] << ";\n"
<< "if(" << predicate(k.var<const value_type>("value")) << "){\n"
<< " " << atomic_max_int(k.var<int_ *>("index"), k.var<int_>("i")) << ";\n"
<< "}\n";
kernel kernel = k.compile(context);
scalar<int_> index(context);
kernel.set_arg(index_arg, index.get_buffer());
index.write(static_cast<int_>(-1), queue);
queue.enqueue_1d_range_kernel(kernel, 0, count, 0);
int result = static_cast<int>(index.read(queue));
if(result == -1){
return last;
}
else {
return first + static_cast<difference_type>(result);
}
}
} // end detail namespace
///
/// \brief Substring matching algorithm
///
/// Searches for the last match of the pattern [p_first, p_last)
/// in text [t_first, t_last).
/// \return Iterator pointing to beginning of last occurrence, or \p t_last
///         if the pattern does not occur (in particular when the pattern is
///         longer than the text)
///
/// \param t_first Iterator pointing to start of text
/// \param t_last Iterator pointing to end of text
/// \param p_first Iterator pointing to start of pattern
/// \param p_last Iterator pointing to end of pattern
/// \param queue Queue on which to execute
///
template<class TextIterator, class PatternIterator>
inline TextIterator find_end(TextIterator t_first,
                             TextIterator t_last,
                             PatternIterator p_first,
                             PatternIterator p_last,
                             command_queue &queue = system::default_queue())
{
    const size_t text_size = detail::iterator_range_size(t_first, t_last);
    const size_t pattern_size = detail::iterator_range_size(p_first, p_last);

    // A pattern longer than the text can never match. Bail out here because
    // the unsigned expression "text_size + 1 - pattern_size" below would
    // otherwise wrap around and request an enormous index vector.
    if(pattern_size > text_size){
        return t_last;
    }

    const context &context = queue.get_context();

    // there is no need to check if pattern starts at last n - 1 indices
    vector<uint_> matching_indices(text_size + 1 - pattern_size, context);

    // one flag per candidate start position in the text
    detail::search_kernel<PatternIterator,
                          TextIterator,
                          vector<uint_>::iterator> kernel;

    kernel.set_range(p_first, p_last, t_first, t_last, matching_indices.begin());
    kernel.exec(queue);

    using boost::compute::_1;

    // locate the last flag equal to 1
    vector<uint_>::iterator index =
        detail::find_end_helper(
            matching_indices.begin(),
            matching_indices.end(),
            _1 == 1,
            queue
        );

    // pattern was not found
    if(index == matching_indices.end())
        return t_last;

    // translate the flag position back into a text iterator
    return t_first + detail::iterator_range_size(matching_indices.begin(), index);
}
} //end compute namespace
} //end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_FIND_END_HPP
@@ -0,0 +1,35 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_FIND_IF_HPP
#define BOOST_COMPUTE_ALGORITHM_FIND_IF_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/detail/find_if_with_atomics.hpp>
namespace boost {
namespace compute {
/// Searches the range [\p first, \p last) for the first element for which
/// \p predicate returns \c true and returns an iterator to it.
///
/// The search is delegated to detail::find_if_with_atomics(), whose result
/// is returned unchanged.
template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if(
    InputIterator first,
    InputIterator last,
    UnaryPredicate predicate,
    command_queue &queue = system::default_queue()
)
{
    return detail::find_if_with_atomics(first, last, predicate, queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_FIND_IF_HPP
@@ -0,0 +1,43 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_FIND_IF_NOT_HPP
#define BOOST_COMPUTE_ALGORITHM_FIND_IF_NOT_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/find_if.hpp>
namespace boost {
namespace compute {
/// Searches the range [\p first, \p last) for the first element for which
/// \p predicate returns \c false and returns an iterator to it.
///
/// Implemented as find_if() with the negated predicate.
///
/// \see find_if()
template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if_not(
    InputIterator first,
    InputIterator last,
    UnaryPredicate predicate,
    command_queue &queue = system::default_queue()
)
{
    // not1() wraps the predicate in its logical negation
    return ::boost::compute::find_if(first, last, not1(predicate), queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_FIND_IF_NOT_HPP
@@ -0,0 +1,65 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_FOR_EACH_HPP
#define BOOST_COMPUTE_ALGORITHM_FOR_EACH_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
namespace boost {
namespace compute {
namespace detail {
// Meta-kernel that applies \p function to every element of a range.
//
// The constructor records the number of elements in [first, last) and emits
// a kernel body that invokes the function on the element addressed by the
// work item's global id. exec() launches one work item per element.
template<class InputIterator, class Function>
struct for_each_kernel : public meta_kernel
{
    for_each_kernel(InputIterator first, InputIterator last, Function function)
        : meta_kernel("for_each"),
          m_count(detail::iterator_range_size(first, last)) // store range size
    {
        // setup kernel source
        *this << function(first[get_global_id(0)]) << ";\n";
    }

    // Runs the kernel on \p queue. An empty range is a no-op: like the other
    // kernels in this library (e.g. gather_kernel) we do not enqueue a
    // zero-sized range, which OpenCL runtimes may reject.
    void exec(command_queue &queue)
    {
        if(m_count == 0){
            return;
        }

        exec_1d(queue, 0, m_count);
    }

    // number of elements (and therefore work items) to process
    size_t m_count;
};
} // end detail namespace
/// Calls \p function on each element in the range [\p first, \p last).
///
/// \see transform()
template<class InputIterator, class UnaryFunction>
inline UnaryFunction for_each(InputIterator first,
InputIterator last,
UnaryFunction function,
command_queue &queue = system::default_queue())
{
detail::for_each_kernel<InputIterator, UnaryFunction> kernel(first, last, function);
kernel.exec(queue);
return function;
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_FOR_EACH_HPP
@@ -0,0 +1,35 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_FOR_EACH_N_HPP
#define BOOST_COMPUTE_ALGORITHM_FOR_EACH_N_HPP
#include <boost/compute/algorithm/for_each.hpp>
namespace boost {
namespace compute {
/// Invokes \p function on the \p count elements starting at \p first, i.e.
/// on the range [\p first, \p first \c + \p count).
///
/// \return whatever for_each() returns for that range
///
/// \see for_each()
template<class InputIterator, class Size, class UnaryFunction>
inline UnaryFunction for_each_n(InputIterator first,
                                Size count,
                                UnaryFunction function,
                                command_queue &queue = system::default_queue())
{
    // end of the range to visit
    const InputIterator last = first + count;

    return ::boost::compute::for_each(first, last, function, queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_FOR_EACH_N_HPP
@@ -0,0 +1,82 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_GATHER_HPP
#define BOOST_COMPUTE_ALGORITHM_GATHER_HPP
#include <boost/compute/command_queue.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/exception.hpp>
#include <boost/compute/iterator/buffer_iterator.hpp>
#include <boost/compute/system.hpp>
#include <boost/compute/type_traits/type_name.hpp>
namespace boost {
namespace compute {
namespace detail {
// Meta-kernel implementing gather: for every position i of the index map it
// emits "result[i] = input[map[i]]".
template<class InputIterator, class MapIterator, class OutputIterator>
class gather_kernel : public meta_kernel
{
public:
    gather_kernel() : meta_kernel("gather")
    {}

    // Records the size of the index map [first, last) and generates the
    // kernel source for the given input and output iterators.
    void set_range(MapIterator first,
                   MapIterator last,
                   InputIterator input,
                   OutputIterator result)
    {
        m_count = iterator_range_size(first, last);

        *this << "const uint i = get_global_id(0);\n";
        *this << result[expr<uint_>("i")] << "="
              << input[first[expr<uint_>("i")]] << ";\n";
    }

    // Launches one work item per map entry and returns the resulting event;
    // an empty map yields a default-constructed event without enqueuing.
    event exec(command_queue &queue)
    {
        if(m_count != 0) {
            return exec_1d(queue, 0, m_count);
        }

        return event();
    }

private:
    // number of map entries, set by set_range()
    size_t m_count;
};
} // end detail namespace
/// Copies the elements using the indices from the range [\p first, \p last)
/// to the range beginning at \p result using the input values from the range
/// beginning at \p input.
///
/// \see scatter()
template<class InputIterator, class MapIterator, class OutputIterator>
inline void gather(MapIterator first,
MapIterator last,
InputIterator input,
OutputIterator result,
command_queue &queue = system::default_queue())
{
detail::gather_kernel<InputIterator, MapIterator, OutputIterator> kernel;
kernel.set_range(first, last, input, result);
kernel.exec(queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_GATHER_HPP
@@ -0,0 +1,49 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_GENERATE_HPP
#define BOOST_COMPUTE_ALGORITHM_GENERATE_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/iterator/function_input_iterator.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
namespace boost {
namespace compute {
/// Writes the values produced by \p generator into every element of the
/// range [\p first, \p last).
///
/// The values are obtained by copying from a pair of function input
/// iterators positioned at the indices of \p first and \p last. An empty
/// range is a no-op.
template<class OutputIterator, class Generator>
inline void generate(OutputIterator first,
                     OutputIterator last,
                     Generator generator,
                     command_queue &queue = system::default_queue())
{
    // nothing to generate for an empty range
    if(detail::iterator_range_size(first, last) == 0){
        return;
    }

    ::boost::compute::copy(
        ::boost::compute::make_function_input_iterator(
            generator, first.get_index()
        ),
        ::boost::compute::make_function_input_iterator(
            generator, last.get_index()
        ),
        first,
        queue
    );
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_GENERATE_HPP
@@ -0,0 +1,35 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_GENERATE_N_HPP
#define BOOST_COMPUTE_ALGORITHM_GENERATE_N_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/generate.hpp>
namespace boost {
namespace compute {
/// Writes the values produced by \p generator into the \p count elements
/// starting at \p first, i.e. into the range
/// [\p first, \p first + \p count).
///
/// Implemented in terms of generate().
template<class OutputIterator, class Size, class Generator>
inline void generate_n(OutputIterator first,
                       Size count,
                       Generator generator,
                       command_queue &queue = system::default_queue())
{
    // end of the range to fill
    const OutputIterator last = first + count;

    ::boost::compute::generate(first, last, generator, queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_GENERATE_N_HPP
@@ -0,0 +1,155 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_INCLUDES_HPP
#define BOOST_COMPUTE_ALGORITHM_INCLUDES_HPP
#include <iterator>
#include <boost/compute/algorithm/detail/balanced_path.hpp>
#include <boost/compute/algorithm/fill_n.hpp>
#include <boost/compute/algorithm/find.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/read_write_single_value.hpp>
#include <boost/compute/system.hpp>
namespace boost {
namespace compute {
namespace detail {
///
/// \brief Serial includes kernel class
///
/// Subclass of meta_kernel to perform includes operation after tiling.
///
/// Each work item handles one tile: it walks the slice of the first range
/// delimited by consecutive entries of the first tile-boundary sequence and
/// the slice of the second range delimited by consecutive entries of the
/// second tile-boundary sequence, and writes 1 to the result for that tile
/// if every element of the second slice was matched, 0 otherwise.
///
/// Note that the base class is inherited privately (class default).
///
class serial_includes_kernel : meta_kernel
{
public:
serial_includes_kernel() : meta_kernel("includes")
{
}
// Generates the kernel source.
//
// first1 / first2           start of the first and second input range
// tile_first1 / tile_last1  tile boundaries into the first range
// tile_first2               tile boundaries into the second range
// result                    receives one 0/1 flag per tile
template<class InputIterator1, class InputIterator2,
class InputIterator3, class InputIterator4,
class OutputIterator>
void set_range(InputIterator1 first1,
InputIterator2 first2,
InputIterator3 tile_first1,
InputIterator3 tile_last1,
InputIterator4 tile_first2,
OutputIterator result)
{
// number of tiles = number of boundaries - 1
// NOTE(review): this is an unsigned subtraction; it assumes the boundary
// range is never empty - confirm against callers.
m_count = iterator_range_size(tile_first1, tile_last1) - 1;
*this <<
"uint i = get_global_id(0);\n" <<
// slice [start1, end1) of the first range and [start2, end2) of the second
"uint start1 = " << tile_first1[expr<uint_>("i")] << ";\n" <<
"uint end1 = " << tile_first1[expr<uint_>("i+1")] << ";\n" <<
"uint start2 = " << tile_first2[expr<uint_>("i")] << ";\n" <<
"uint end2 = " << tile_first2[expr<uint_>("i+1")] << ";\n" <<
"uint includes = 1;\n" <<
"while(start1<end1 && start2<end2)\n" <<
"{\n" <<
// equal elements: the element of the second slice is matched, advance both
" if(" << first1[expr<uint_>("start1")] << " == " <<
first2[expr<uint_>("start2")] << ")\n" <<
" {\n" <<
" start1++; start2++;\n" <<
" }\n" <<
// element of the first slice is smaller: skip it
" else if(" << first1[expr<uint_>("start1")] << " < " <<
first2[expr<uint_>("start2")] << ")\n" <<
" start1++;\n" <<
// otherwise the element of the second slice has no match in this tile
" else\n" <<
" {\n" <<
" includes = 0;\n" <<
" break;\n" <<
" }\n" <<
"}\n" <<
// leftover elements in the second slice were never matched
"if(start2<end2)\n" <<
" includes = 0;\n" <<
result[expr<uint_>("i")] << " = includes;\n";
}
// Launches one work item per tile and returns the resulting event; with no
// tiles nothing is enqueued and a default-constructed event is returned.
event exec(command_queue &queue)
{
if(m_count == 0) {
return event();
}
return exec_1d(queue, 0, m_count);
}
private:
// number of tiles, set by set_range()
size_t m_count;
};
} //end detail namespace
///
/// \brief Includes algorithm
///
/// Determines whether the sorted range [first1, last1) includes the sorted
/// range [first2, last2), i.e. whether [first1, last1) is a superset of
/// [first2, last2).
///
/// Both ranges are split into tiles of 1024 combined elements; each tile is
/// checked independently and the overall answer is true only if no tile
/// reported a failure.
///
/// \return True, if [first1, last1) includes [first2, last2). False otherwise.
///
/// \param first1 Iterator pointing to start of first set
/// \param last1 Iterator pointing to end of first set
/// \param first2 Iterator pointing to start of second set
/// \param last2 Iterator pointing to end of second set
/// \param queue Queue on which to execute
///
template<class InputIterator1, class InputIterator2>
inline bool includes(InputIterator1 first1,
                     InputIterator1 last1,
                     InputIterator2 first2,
                     InputIterator2 last2,
                     command_queue &queue = system::default_queue())
{
    const size_t tile_size = 1024;

    const size_t count1 = detail::iterator_range_size(first1, last1);
    const size_t count2 = detail::iterator_range_size(first2, last2);

    // number of tiles needed to cover both ranges (rounded up)
    const size_t tile_count = (count1 + count2 + tile_size - 1) / tile_size;

    // tile boundaries: one more entry than there are tiles
    vector<uint_> tile_a(tile_count + 1, queue.get_context());
    vector<uint_> tile_b(tile_count + 1, queue.get_context());

    // Tile the sets
    detail::balanced_path_kernel tiling_kernel;
    tiling_kernel.tile_size = static_cast<unsigned int>(tile_size);
    tiling_kernel.set_range(
        first1, last1, first2, last2, tile_a.begin() + 1, tile_b.begin() + 1
    );

    // first boundary of each sequence is 0 ...
    fill_n(tile_a.begin(), 1, uint_(0), queue);
    fill_n(tile_b.begin(), 1, uint_(0), queue);

    tiling_kernel.exec(queue);

    // ... and the last boundary is the size of the respective range
    fill_n(tile_a.end() - 1, 1, static_cast<uint_>(count1), queue);
    fill_n(tile_b.end() - 1, 1, static_cast<uint_>(count2), queue);

    // one 0/1 flag per tile
    vector<uint_> result(tile_count, queue.get_context());

    // Find individually
    detail::serial_includes_kernel includes_kernel;
    includes_kernel.set_range(
        first1, first2, tile_a.begin(), tile_a.end(), tile_b.begin(), result.begin()
    );
    includes_kernel.exec(queue);

    // included only if no tile wrote a 0
    return find(result.begin(), result.end(), 0, queue) == result.end();
}
} //end compute namespace
} //end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_INCLUDES_HPP
@@ -0,0 +1,81 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_INCLUSIVE_SCAN_HPP
#define BOOST_COMPUTE_ALGORITHM_INCLUSIVE_SCAN_HPP
#include <boost/compute/functional.hpp>
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/detail/scan.hpp>
namespace boost {
namespace compute {
/// Computes an inclusive scan over the range [\p first, \p last) and writes
/// the results to the range beginning at \p result.
///
/// Every output element is the combination (by default: the sum) of the
/// corresponding input element with all input elements preceding it.
///
/// \param first first element in the range to scan
/// \param last last element in the range to scan
/// \param result first element in the result range
/// \param binary_op associative binary operator
/// \param queue command queue to perform the operation
///
/// \return \c OutputIterator to the end of the result range
///
/// The default operation is to add the elements up.
///
/// \snippet test/test_scan.cpp inclusive_scan_int
///
/// But different associative operation can be specified as \p binary_op
/// instead (e.g., multiplication, maximum, minimum).
///
/// \snippet test/test_scan.cpp inclusive_scan_int_multiplies
///
/// \see exclusive_scan()
template<class InputIterator, class OutputIterator, class BinaryOperator>
inline OutputIterator
inclusive_scan(InputIterator first,
               InputIterator last,
               OutputIterator result,
               BinaryOperator binary_op,
               command_queue &queue = system::default_queue())
{
    typedef typename
        std::iterator_traits<OutputIterator>::value_type output_type;

    // initial value forwarded to the scan implementation
    const output_type initial_value = output_type(0);

    return detail::scan(
        first, last, result, false, initial_value, binary_op, queue
    );
}
/// \overload
///
/// Uses boost::compute::plus on the output value type as the binary
/// operation.
template<class InputIterator, class OutputIterator>
inline OutputIterator
inclusive_scan(InputIterator first,
               InputIterator last,
               OutputIterator result,
               command_queue &queue = system::default_queue())
{
    typedef typename
        std::iterator_traits<OutputIterator>::value_type output_type;

    // initial value forwarded to the scan implementation
    const output_type initial_value = output_type(0);

    return detail::scan(
        first, last, result, false,
        initial_value, boost::compute::plus<output_type>(),
        queue
    );
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_INCLUSIVE_SCAN_HPP
@@ -0,0 +1,93 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_INNER_PRODUCT_HPP
#define BOOST_COMPUTE_ALGORITHM_INNER_PRODUCT_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/accumulate.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/iterator/transform_iterator.hpp>
#include <boost/compute/iterator/zip_iterator.hpp>
#include <boost/compute/functional/detail/unpack.hpp>
namespace boost {
namespace compute {
/// Returns the inner product of the elements in the range
/// [\p first1, \p last1) with the elements in the range beginning
/// at \p first2.
template<class InputIterator1, class InputIterator2, class T>
inline T inner_product(InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
T init,
command_queue &queue = system::default_queue())
{
typedef typename std::iterator_traits<InputIterator1>::value_type input_type;
ptrdiff_t n = std::distance(first1, last1);
return ::boost::compute::accumulate(
::boost::compute::make_transform_iterator(
::boost::compute::make_zip_iterator(
boost::make_tuple(first1, first2)
),
detail::unpack(multiplies<input_type>())
),
::boost::compute::make_transform_iterator(
::boost::compute::make_zip_iterator(
boost::make_tuple(last1, first2 + n)
),
detail::unpack(multiplies<input_type>())
),
init,
queue
);
}
/// \overload
template<class InputIterator1,
class InputIterator2,
class T,
class BinaryAccumulateFunction,
class BinaryTransformFunction>
inline T inner_product(InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
T init,
BinaryAccumulateFunction accumulate_function,
BinaryTransformFunction transform_function,
command_queue &queue = system::default_queue())
{
typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
size_t count = detail::iterator_range_size(first1, last1);
vector<value_type> result(count, queue.get_context());
transform(first1,
last1,
first2,
result.begin(),
transform_function,
queue);
return ::boost::compute::accumulate(result.begin(),
result.end(),
init,
accumulate_function,
queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_INNER_PRODUCT_HPP
@@ -0,0 +1,60 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_INPLACE_MERGE_HPP
#define BOOST_COMPUTE_ALGORITHM_INPLACE_MERGE_HPP
#include <iterator>
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/merge.hpp>
#include <boost/compute/container/vector.hpp>
namespace boost {
namespace compute {
/// Merges the sorted values in the range [\p first, \p middle) with
/// the sorted values in the range [\p middle, \p last) in-place.
///
/// Both halves are copied into temporary vectors and merged back into the
/// range starting at \p first. If either half is empty the range is already
/// merged and the function returns without doing any work.
///
/// \param first start of the first sorted half
/// \param middle end of the first half and start of the second sorted half
/// \param last end of the second sorted half
/// \param queue command queue to perform the operation
template<class Iterator>
inline void inplace_merge(Iterator first,
                          Iterator middle,
                          Iterator last,
                          command_queue &queue = system::default_queue())
{
    // the three iterators must be ordered; empty halves are permitted
    BOOST_ASSERT(!(middle < first) && !(last < middle));

    typedef typename std::iterator_traits<Iterator>::value_type T;

    const ptrdiff_t left_size = std::distance(first, middle);
    const ptrdiff_t right_size = std::distance(middle, last);

    // merging with an empty half leaves the range unchanged
    if(left_size == 0 || right_size == 0){
        return;
    }

    const context &context = queue.get_context();

    // temporary copies of both halves, since the merge writes over the input
    vector<T> left(left_size, context);
    vector<T> right(right_size, context);

    copy(first, middle, left.begin(), queue);
    copy(middle, last, right.begin(), queue);

    ::boost::compute::merge(
        left.begin(),
        left.end(),
        right.begin(),
        right.end(),
        first,
        queue
    );
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_INPLACE_MERGE_HPP
@@ -0,0 +1,48 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_IOTA_HPP
#define BOOST_COMPUTE_ALGORITHM_IOTA_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/iterator/counting_iterator.hpp>
namespace boost {
namespace compute {
/// Writes consecutive values, beginning with \p value, into the range
/// [\p first, \p last).
///
/// For example, the following code:
/// \snippet test/test_iota.cpp iota
///
/// Will fill \c vec with the values (\c 0, \c 1, \c 2, \c ...).
///
/// Implemented by copying from a pair of counting iterators spanning
/// [\p value, \p value + n), where n is the size of the range.
template<class BufferIterator, class T>
inline void iota(BufferIterator first,
                 BufferIterator last,
                 const T &value,
                 command_queue &queue = system::default_queue())
{
    // range length expressed in the value type
    const T length = static_cast<T>(detail::iterator_range_size(first, last));

    copy(
        ::boost::compute::make_counting_iterator(value),
        ::boost::compute::make_counting_iterator(value + length),
        first,
        queue
    );
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_IOTA_HPP
@@ -0,0 +1,43 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_IS_PARTITIONED_HPP
#define BOOST_COMPUTE_ALGORITHM_IS_PARTITIONED_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/find_if.hpp>
#include <boost/compute/algorithm/find_if_not.hpp>
namespace boost {
namespace compute {
/// Checks whether the values in the range [\p first, \p last) are
/// partitioned according to \p predicate.
///
/// \return \c true if, after the first element for which \p predicate is
///         false, no element satisfies \p predicate; \c false otherwise
template<class InputIterator, class UnaryPredicate>
inline bool is_partitioned(InputIterator first,
                           InputIterator last,
                           UnaryPredicate predicate,
                           command_queue &queue = system::default_queue())
{
    // first element that does not satisfy the predicate
    const InputIterator boundary =
        ::boost::compute::find_if_not(first, last, predicate, queue);

    // partitioned iff nothing from there on satisfies the predicate
    return ::boost::compute::find_if(boundary, last, predicate, queue) == last;
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_IS_PARTITIONED_HPP
@@ -0,0 +1,67 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_IS_PERMUTATION_HPP
#define BOOST_COMPUTE_ALGORITHM_IS_PERMUTATION_HPP
#include <iterator>
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/algorithm/equal.hpp>
#include <boost/compute/algorithm/sort.hpp>
namespace boost {
namespace compute {
///
/// \brief Permutation checking algorithm
///
/// Checks if the range [first1, last1) can be permuted into the
/// range [first2, last2)
/// \return True, if it can be permuted. False, otherwise.
///
/// \param first1 Iterator pointing to start of first range
/// \param last1 Iterator pointing to end of first range
/// \param first2 Iterator pointing to start of second range
/// \param last2 Iterator pointing to end of second range
/// \param queue Queue on which to execute
///
template<class InputIterator1, class InputIterator2>
inline bool is_permutation(InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
InputIterator2 last2,
command_queue &queue = system::default_queue())
{
typedef typename std::iterator_traits<InputIterator1>::value_type value_type1;
typedef typename std::iterator_traits<InputIterator2>::value_type value_type2;
size_t count1 = detail::iterator_range_size(first1, last1);
size_t count2 = detail::iterator_range_size(first2, last2);
if(count1 != count2) return false;
vector<value_type1> temp1(first1, last1, queue);
vector<value_type2> temp2(first2, last2, queue);
sort(temp1.begin(), temp1.end(), queue);
sort(temp2.begin(), temp2.end(), queue);
return equal(temp1.begin(), temp1.end(),
temp2.begin(), queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_IS_PERMUTATION_HPP
@@ -0,0 +1,64 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_IS_SORTED_HPP
#define BOOST_COMPUTE_ALGORITHM_IS_SORTED_HPP
#include <boost/compute/command_queue.hpp>
#include <boost/compute/system.hpp>
#include <boost/compute/functional/bind.hpp>
#include <boost/compute/functional/operator.hpp>
#include <boost/compute/algorithm/adjacent_find.hpp>
namespace boost {
namespace compute {
/// Returns \c true if the values in the range [\p first, \p last)
/// are in sorted order.
///
/// \param first first element in the range to check
/// \param last last element in the range to check
/// \param compare comparison function (by default \c less)
/// \param queue command queue to perform the operation
///
/// \return \c true if the range [\p first, \p last) is sorted
///
/// \see sort()
template<class InputIterator, class Compare>
inline bool is_sorted(InputIterator first,
InputIterator last,
Compare compare,
command_queue &queue = system::default_queue())
{
using ::boost::compute::placeholders::_1;
using ::boost::compute::placeholders::_2;
return ::boost::compute::adjacent_find(
first, last, ::boost::compute::bind(compare, _2, _1), queue
) == last;
}
/// \overload
template<class InputIterator>
inline bool is_sorted(InputIterator first,
InputIterator last,
command_queue &queue = system::default_queue())
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
return ::boost::compute::is_sorted(
first, last, ::boost::compute::less<value_type>(), queue
);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_IS_SORTED_HPP
@@ -0,0 +1,117 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2014 Mageswaran.D <mageswaran1989@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#include <boost/compute/system.hpp>
#include <boost/compute/context.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/any_of.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/utility/program_cache.hpp>
namespace boost {
namespace compute {
namespace detail {
// OpenCL source of the "lexicographical_compare" kernel.
//
// T1 and T2 are not defined in this source; they are supplied as
// -DT1=/-DT2= build options by dispatch_lexicographical_compare().
//
// Each work-item i writes exactly one flag to result_buf[i]. Note the
// inverted encoding: a flag value of 0 means "true", 1 means "false".
//
// NOTE(review): the first branch is taken for every i that differs from
// both size1 and size2, so when the ranges have different lengths it looks
// like range1[i]/range2[i] may be indexed past the end of the shorter
// range - confirm against the buffer sizes used by callers.
const char lexicographical_compare_source[] =
"__kernel void lexicographical_compare(const uint size1,\n"
" const uint size2,\n"
" __global const T1 *range1,\n"
" __global const T2 *range2,\n"
" __global bool *result_buf)\n"
"{\n"
" const uint i = get_global_id(0);\n"
" if((i != size1) && (i != size2)){\n"
// Individual elements are compared and the per-element results are
// stored in parallel: 0 ("true") when range1[i] < range2[i], else 1.
" if(range1[i] < range2[i])\n"
" result_buf[i] = 0;\n"
" else\n"
" result_buf[i] = 1;\n"
" }\n"
" else\n"
// Here i equals size1 or size2. The flag becomes 0 ("true") only when
// i == size1 and i != size2; otherwise it becomes 1.
" result_buf[i] = !((i == size1) && (i != size2));\n"
"}\n";
// Builds (or fetches from the global program cache) the
// lexicographical_compare kernel for the value types of the two ranges,
// runs it with one work-item per position of the longer range, and
// reduces the per-position flags with any_of().
//
// Returns false when both ranges are empty; otherwise returns true if at
// least one flag written by the kernel equals 0 (the kernel's "true").
//
// NOTE(review): because the result is any_of(flag == 0), a single position
// with range1[i] < range2[i] yields true regardless of earlier positions -
// verify that this matches the intended lexicographical ordering.
template<class InputIterator1, class InputIterator2>
inline bool dispatch_lexicographical_compare(InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
InputIterator2 last2,
command_queue &queue)
{
const boost::compute::context &context = queue.get_context();
boost::shared_ptr<program_cache> cache =
program_cache::get_global_cache(context);
size_t iterator_size1 = iterator_range_size(first1, last1);
size_t iterator_size2 = iterator_range_size(first2, last2);
// one work-item (and one result flag) per position of the longer range
size_t max_size = (std::max)(iterator_size1, iterator_size2);
if(max_size == 0){
// both ranges are empty
return false;
}
// per-position result flags written by the kernel
boost::compute::vector<bool> result_vector(max_size, context);
typedef typename std::iterator_traits<InputIterator1>::value_type value_type1;
typedef typename std::iterator_traits<InputIterator2>::value_type value_type2;
// load (or create) lexicographical compare program; the cache key and the
// -DT1/-DT2 build options both depend on the two value types
std::string cache_key =
std::string("__boost_lexicographical_compare")
+ type_name<value_type1>() + type_name<value_type2>();
std::stringstream options;
options << " -DT1=" << type_name<value_type1>();
options << " -DT2=" << type_name<value_type2>();
program lexicographical_compare_program = cache->get_or_build(
cache_key, options.str(), lexicographical_compare_source, context
);
kernel lexicographical_compare_kernel(lexicographical_compare_program,
"lexicographical_compare");
// the range sizes are passed to the kernel as uint_ values
lexicographical_compare_kernel.set_arg<uint_>(0, iterator_size1);
lexicographical_compare_kernel.set_arg<uint_>(1, iterator_size2);
// NOTE(review): only the underlying buffers are passed here; an offset of
// first1/first2 into their buffers does not appear to be forwarded to the
// kernel - confirm that callers always pass iterators at index 0.
lexicographical_compare_kernel.set_arg(2, first1.get_buffer());
lexicographical_compare_kernel.set_arg(3, first2.get_buffer());
lexicographical_compare_kernel.set_arg(4, result_vector.get_buffer());
queue.enqueue_1d_range_kernel(lexicographical_compare_kernel,
0,
max_size,
0);
// a flag value of 0 is the kernel's encoding of "true"
return boost::compute::any_of(result_vector.begin(),
result_vector.end(),
_1 == 0,
queue);
}
} // end detail namespace
/// Checks if the first range [first1, last1) is lexicographically
/// less than the second range [first2, last2).
///
/// The work is delegated to detail::dispatch_lexicographical_compare().
///
/// \param first1 first element in the first range
/// \param last1 end of the first range
/// \param first2 first element in the second range
/// \param last2 end of the second range
/// \param queue command queue to perform the operation
///
/// \return the result reported by the dispatch function
template<class InputIterator1, class InputIterator2>
inline bool lexicographical_compare(InputIterator1 first1,
                                    InputIterator1 last1,
                                    InputIterator2 first2,
                                    InputIterator2 last2,
                                    command_queue &queue = system::default_queue())
{
    const bool is_less = detail::dispatch_lexicographical_compare(
        first1, last1, first2, last2, queue
    );
    return is_less;
}
} // end compute namespace
} // end boost namespace
@@ -0,0 +1,44 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_LOWER_BOUND_HPP
#define BOOST_COMPUTE_ALGORITHM_LOWER_BOUND_HPP
#include <boost/compute/lambda.hpp>
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/detail/binary_find.hpp>
namespace boost {
namespace compute {
/// Finds the first element in the sorted range [\p first, \p last) that
/// is not less than \p value, i.e. the first element \c e for which
/// \c e >= \p value holds.
///
/// \param first first element in the sorted input range
/// \param last end of the sorted input range
/// \param value value to compare the elements against
/// \param queue command queue to perform the operation
///
/// \return iterator returned by the binary search for that element
///
/// \see upper_bound()
template<class InputIterator, class T>
inline InputIterator
lower_bound(InputIterator first,
            InputIterator last,
            const T &value,
            command_queue &queue = system::default_queue())
{
    using ::boost::compute::_1;

    // binary search for the first element satisfying (element >= value)
    return detail::binary_find(first, last, _1 >= value, queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_LOWER_BOUND_HPP
@@ -0,0 +1,74 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_MAX_ELEMENT_HPP
#define BOOST_COMPUTE_ALGORITHM_MAX_ELEMENT_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/algorithm/detail/find_extrema.hpp>
namespace boost {
namespace compute {
/// Finds the element with the maximum value in the range
/// [\p first, \p last) and returns an iterator pointing to it.
///
/// \param first first element in the input range
/// \param last end of the input range
/// \param compare comparison function object which returns true if the first
///        argument is less than (i.e. is ordered before) the second.
/// \param queue command queue to perform the operation
///
/// For example, to find the \c int2 value with the maximum first component
/// in a given vector:
/// \code
/// // comparison function object
/// BOOST_COMPUTE_FUNCTION(bool, compare_first, (const int2_ &a, const int2_ &b),
/// {
///     return a.x < b.x;
/// });
///
/// // create vector
/// boost::compute::vector<int2_> data = ...
///
/// boost::compute::vector<int2_>::iterator max =
///     boost::compute::max_element(data.begin(), data.end(), compare_first, queue);
/// \endcode
///
/// \see min_element()
template<class InputIterator, class Compare>
inline InputIterator
max_element(InputIterator first,
            InputIterator last,
            Compare compare,
            command_queue &queue = system::default_queue())
{
    // find_extrema() serves both min_element() and max_element();
    // passing false here requests the maximum
    const bool find_minimum = false;
    return detail::find_extrema(first, last, compare, find_minimum, queue);
}
///\overload
template<class InputIterator>
inline InputIterator
max_element(InputIterator first,
InputIterator last,
command_queue &queue = system::default_queue())
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
return ::boost::compute::max_element(
first, last, ::boost::compute::less<value_type>(), queue
);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_MAX_ELEMENT_HPP
@@ -0,0 +1,105 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_MERGE_HPP
#define BOOST_COMPUTE_ALGORITHM_MERGE_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/algorithm/detail/merge_with_merge_path.hpp>
#include <boost/compute/algorithm/detail/serial_merge.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/parameter_cache.hpp>
namespace boost {
namespace compute {
/// Merges the sorted values in the range [\p first1, \p last1) with the sorted
/// values in the range [\p first2, last2) and stores the result in the range
/// beginning at \p result. Values are compared using the \p comp function. If
/// no comparision function is given, \c less is used.
///
/// \param first1 first element in the first range to merge
/// \param last1 last element in the first range to merge
/// \param first2 first element in the second range to merge
/// \param last2 last element in the second range to merge
/// \param result first element in the result range
/// \param comp comparison function (by default \c less)
/// \param queue command queue to perform the operation
///
/// \return \c OutputIterator to the end of the result range
///
/// \see inplace_merge()
template<class InputIterator1,
class InputIterator2,
class OutputIterator,
class Compare>
inline OutputIterator merge(InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
InputIterator2 last2,
OutputIterator result,
Compare comp,
command_queue &queue = system::default_queue())
{
typedef typename std::iterator_traits<InputIterator1>::value_type input1_type;
typedef typename std::iterator_traits<InputIterator2>::value_type input2_type;
typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
const device &device = queue.get_device();
std::string cache_key =
std::string("__boost_merge_") + type_name<input1_type>() + "_"
+ type_name<input2_type>() + "_" + type_name<output_type>();
boost::shared_ptr<detail::parameter_cache> parameters =
detail::parameter_cache::get_global_cache(device);
// default serial merge threshold depends on device type
size_t default_serial_merge_threshold = 32768;
if(device.type() & device::gpu) {
default_serial_merge_threshold = 2048;
}
// loading serial merge threshold parameter
const size_t serial_merge_threshold =
parameters->get(cache_key, "serial_merge_threshold",
static_cast<uint_>(default_serial_merge_threshold));
// choosing merge algorithm
const size_t total_count =
detail::iterator_range_size(first1, last1)
+ detail::iterator_range_size(first2, last2);
// for small inputs serial merge turns out to outperform
// merge with merge path algorithm
if(total_count <= serial_merge_threshold){
return detail::serial_merge(first1, last1, first2, last2, result, comp, queue);
}
return detail::merge_with_merge_path(first1, last1, first2, last2, result, comp, queue);
}
/// \overload
template<class InputIterator1, class InputIterator2, class OutputIterator>
inline OutputIterator merge(InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
InputIterator2 last2,
OutputIterator result,
command_queue &queue = system::default_queue())
{
typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
less<value_type> less_than;
return merge(first1, last1, first2, last2, result, less_than, queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_MERGE_HPP
@@ -0,0 +1,74 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_MIN_ELEMENT_HPP
#define BOOST_COMPUTE_ALGORITHM_MIN_ELEMENT_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/algorithm/detail/find_extrema.hpp>
namespace boost {
namespace compute {
/// Finds the element with the minimum value in the range
/// [\p first, \p last) and returns an iterator pointing to it.
///
/// \param first first element in the input range
/// \param last end of the input range
/// \param compare comparison function object which returns true if the first
///        argument is less than (i.e. is ordered before) the second.
/// \param queue command queue to perform the operation
///
/// For example, to find the \c int2 value with the minimum first component
/// in a given vector:
/// \code
/// // comparison function object
/// BOOST_COMPUTE_FUNCTION(bool, compare_first, (const int2_ &a, const int2_ &b),
/// {
///     return a.x < b.x;
/// });
///
/// // create vector
/// boost::compute::vector<int2_> data = ...
///
/// boost::compute::vector<int2_>::iterator min =
///     boost::compute::min_element(data.begin(), data.end(), compare_first, queue);
/// \endcode
///
/// \see max_element()
template<class InputIterator, class Compare>
inline InputIterator
min_element(InputIterator first,
            InputIterator last,
            Compare compare,
            command_queue &queue = system::default_queue())
{
    // find_extrema() serves both min_element() and max_element();
    // passing true here requests the minimum
    const bool find_minimum = true;
    return detail::find_extrema(first, last, compare, find_minimum, queue);
}
///\overload
template<class InputIterator>
inline InputIterator
min_element(InputIterator first,
InputIterator last,
command_queue &queue = system::default_queue())
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
return ::boost::compute::min_element(
first, last, ::boost::compute::less<value_type>(), queue
);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_MIN_ELEMENT_HPP
@@ -0,0 +1,70 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_MINMAX_ELEMENT_HPP
#define BOOST_COMPUTE_ALGORITHM_MINMAX_ELEMENT_HPP
#include <utility>
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/max_element.hpp>
#include <boost/compute/algorithm/min_element.hpp>
namespace boost {
namespace compute {
/// Finds both the minimum and the maximum element in the range
/// [\p first, \p last) and returns a pair of iterators: the first points
/// to the minimum element, the second to the maximum element. For an
/// empty range both iterators equal \p first.
///
/// \param first first element in the input range
/// \param last end of the input range
/// \param compare comparison function object which returns true if the first
///        argument is less than (i.e. is ordered before) the second.
/// \param queue command queue to perform the operation
///
/// \see max_element(), min_element()
template<class InputIterator, class Compare>
inline std::pair<InputIterator, InputIterator>
minmax_element(InputIterator first,
               InputIterator last,
               Compare compare,
               command_queue &queue = system::default_queue())
{
    // empty range
    if(first == last){
        return std::make_pair(first, first);
    }

    // the two extrema are located by two independent searches
    InputIterator smallest = min_element(first, last, compare, queue);
    InputIterator largest = max_element(first, last, compare, queue);

    return std::make_pair(smallest, largest);
}
///\overload
///
/// Uses the default comparison of min_element() and max_element().
template<class InputIterator>
inline std::pair<InputIterator, InputIterator>
minmax_element(InputIterator first,
               InputIterator last,
               command_queue &queue = system::default_queue())
{
    // empty range
    if(first == last){
        return std::make_pair(first, first);
    }

    // the two extrema are located by two independent searches
    InputIterator smallest = min_element(first, last, queue);
    InputIterator largest = max_element(first, last, queue);

    return std::make_pair(smallest, largest);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_MINMAX_ELEMENT_HPP
@@ -0,0 +1,89 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_MISMATCH_HPP
#define BOOST_COMPUTE_ALGORITHM_MISMATCH_HPP
#include <iterator>
#include <utility>
#include <boost/compute/system.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/find.hpp>
#include <boost/compute/iterator/transform_iterator.hpp>
#include <boost/compute/iterator/zip_iterator.hpp>
#include <boost/compute/functional/detail/unpack.hpp>
namespace boost {
namespace compute {
/// Returns a pair of iterators pointing to the first position where the
/// range [\p first1, \p last1) and the range starting at \p first2
/// differ.
///
/// The second range is taken to have the same length as the first one.
/// Elements are compared with \c equal_to on the value type of the first
/// range.
///
/// \param first1 first element in the first range
/// \param last1 end of the first range
/// \param first2 first element in the second range
/// \param queue command queue to perform the operation
///
/// \return pair of iterators into the first and second range, both at
///         the same offset from their respective range start
template<class InputIterator1, class InputIterator2>
inline std::pair<InputIterator1, InputIterator2>
mismatch(InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
command_queue &queue = system::default_queue())
{
typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
::boost::compute::equal_to<value_type> op;
// end of the second range, at the same distance as last1 is from first1
InputIterator2 last2 = first2 + std::distance(first1, last1);
// Zip both ranges, map every (a, b) pair to equal_to(a, b) and search the
// resulting sequence for the first `false`. base() unwraps the transform
// iterator to the zip iterator, whose iterator tuple holds the position in
// each range; element 0 is the position in the first range.
InputIterator1 iter =
boost::get<0>(
::boost::compute::find(
::boost::compute::make_transform_iterator(
::boost::compute::make_zip_iterator(
boost::make_tuple(first1, first2)
),
detail::unpack(op)
),
::boost::compute::make_transform_iterator(
::boost::compute::make_zip_iterator(
boost::make_tuple(last1, last2)
),
detail::unpack(op)
),
false,
queue
).base().get_iterator_tuple()
);
// the position in the second range is found by applying the same offset
return std::make_pair(iter, first2 + std::distance(first1, iter));
}
/// \overload
///
/// Takes the end of the second range as well; only as many elements as
/// the shorter of the two ranges contains are compared.
template<class InputIterator1, class InputIterator2>
inline std::pair<InputIterator1, InputIterator2>
mismatch(InputIterator1 first1,
         InputIterator1 last1,
         InputIterator2 first2,
         InputIterator2 last2,
         command_queue &queue = system::default_queue())
{
    const typename std::iterator_traits<InputIterator1>::difference_type
        length1 = std::distance(first1, last1);
    const typename std::iterator_traits<InputIterator2>::difference_type
        length2 = std::distance(first2, last2);

    if(length1 >= length2){
        // the second range is not longer: truncate the first range to it
        return ::boost::compute::mismatch(
            first1, first1 + length2, first2, queue
        );
    }

    // the first range is shorter: compare all of it
    return ::boost::compute::mismatch(first1, last1, first2, queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_MISMATCH_HPP
@@ -0,0 +1,170 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_NEXT_PERMUTATION_HPP
#define BOOST_COMPUTE_ALGORITHM_NEXT_PERMUTATION_HPP
#include <iterator>
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/detail/scalar.hpp>
#include <boost/compute/algorithm/reverse.hpp>
namespace boost {
namespace compute {
namespace detail {
///
/// \brief Helper function for next_permutation
///
/// To find rightmost element which is smaller
/// than its next element
///
/// Returns \p last if the range has fewer than two elements or if no
/// such element exists.
///
// NOTE(review): iterator_range_size, meta_kernel and atomic_max are used
// below, but this file's own include list does not name their headers -
// presumably they arrive transitively; confirm.
template<class InputIterator>
inline InputIterator next_permutation_helper(InputIterator first,
InputIterator last,
command_queue &queue)
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
size_t count = detail::iterator_range_size(first, last);
if(count == 0 || count == 1){
return last;
}
// one work-item per adjacent pair (i, i+1), hence one fewer than elements
count = count - 1;
const context &context = queue.get_context();
detail::meta_kernel k("next_permutation");
size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");
atomic_max<int_> atomic_max_int;
// Generated kernel: work-item i loads first[i] and first[i+1]; when the
// former is smaller, i is folded into *index with an atomic max, so the
// largest such i is what remains after the kernel has run.
k << k.decl<const int_>("i") << " = get_global_id(0);\n"
<< k.decl<const value_type>("cur_value") << "="
<< first[k.var<const int_>("i")] << ";\n"
<< k.decl<const value_type>("next_value") << "="
<< first[k.expr<const int_>("i+1")] << ";\n"
<< "if(cur_value < next_value){\n"
<< " " << atomic_max_int(k.var<int_ *>("index"), k.var<int_>("i")) << ";\n"
<< "}\n";
kernel kernel = k.compile(context);
scalar<int_> index(context);
kernel.set_arg(index_arg, index.get_buffer());
// -1 marks "no ascending pair found"
index.write(static_cast<int_>(-1), queue);
queue.enqueue_1d_range_kernel(kernel, 0, count, 0);
int result = static_cast<int>(index.read(queue));
if(result == -1) return last;
else return first + result;
}
///
/// \brief Helper function for next_permutation
///
/// To find the smallest element to the right of the element found above
/// that is greater than it
///
/// Returns \p last for an empty range. Otherwise returns \p first offset
/// by the index left in the device-side scalar, which starts at 0; if no
/// work-item updates it, the result is therefore \p first.
///
template<class InputIterator, class ValueType>
inline InputIterator np_ceiling(InputIterator first,
InputIterator last,
ValueType value,
command_queue &queue)
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
size_t count = detail::iterator_range_size(first, last);
if(count == 0){
return last;
}
const context &context = queue.get_context();
detail::meta_kernel k("np_ceiling");
size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");
size_t value_arg = k.add_arg<value_type>(memory_object::private_memory, "value");
atomic_max<int_> atomic_max_int;
// Generated kernel: work-item i loads first[i]; if it is greater than
// `value` and not greater than the element at the currently recorded
// index, i is folded into *index with an atomic max.
// NOTE(review): *index is read here while other work-items may be
// updating it through the atomic max - confirm that the result does not
// depend on that ordering for the inputs next_permutation() passes in.
k << k.decl<const int_>("i") << " = get_global_id(0);\n"
<< k.decl<const value_type>("cur_value") << "="
<< first[k.var<const int_>("i")] << ";\n"
<< "if(cur_value <= " << first[k.expr<int_>("*index")]
<< " && cur_value > value){\n"
<< " " << atomic_max_int(k.var<int_ *>("index"), k.var<int_>("i")) << ";\n"
<< "}\n";
kernel kernel = k.compile(context);
scalar<int_> index(context);
kernel.set_arg(index_arg, index.get_buffer());
// the recorded index starts at the first element of the range
index.write(static_cast<int_>(0), queue);
kernel.set_arg(value_arg, value);
queue.enqueue_1d_range_kernel(kernel, 0, count, 0);
int result = static_cast<int>(index.read(queue));
return first + result;
}
} // end detail namespace
///
/// \brief Permutation generating algorithm
///
/// Transforms the range [first, last) into the next permutation from the
/// set of all permutations arranged in lexicographic order. When no
/// element smaller than its successor can be found, the whole range is
/// reversed instead.
/// \return \c true if the range was advanced to the next permutation;
///         \c false if the range is empty or if it was reversed because
///         no element smaller than its successor was found
///
/// \param first Iterator pointing to start of range
/// \param last Iterator pointing to end of range
/// \param queue Queue on which to execute
///
template<class InputIterator>
inline bool next_permutation(InputIterator first,
                             InputIterator last,
                             command_queue &queue = system::default_queue())
{
    typedef typename std::iterator_traits<InputIterator>::value_type element_type;

    // nothing to permute
    if(first == last){
        return false;
    }

    // rightmost position whose element is smaller than its successor
    InputIterator pivot = detail::next_permutation_helper(first, last, queue);
    if(pivot == last){
        // no such position: reverse the whole range and report it
        reverse(first, last, queue);
        return false;
    }

    const element_type pivot_value = pivot.read(queue);

    // element to the right of the pivot that will take its place
    InputIterator successor =
        detail::np_ceiling(pivot + 1, last, pivot_value, queue);
    const element_type successor_value = successor.read(queue);

    // exchange the two elements
    pivot.write(successor_value, queue);
    successor.write(pivot_value, queue);

    // finally reverse everything to the right of the pivot
    reverse(pivot + 1, last, queue);
    return true;
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_NEXT_PERMUTATION_HPP
@@ -0,0 +1,36 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_NONE_OF_HPP
#define BOOST_COMPUTE_ALGORITHM_NONE_OF_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/algorithm/find_if.hpp>
namespace boost {
namespace compute {
/// Checks that \p predicate returns \c true for none of the elements in
/// the range [\p first, \p last).
///
/// \param first first element in the input range
/// \param last end of the input range
/// \param predicate unary predicate to test the elements with
/// \param queue command queue to perform the operation
///
/// \return \c true if no element satisfying \p predicate is found
///
/// \see all_of(), any_of()
template<class InputIterator, class UnaryPredicate>
inline bool none_of(InputIterator first,
                    InputIterator last,
                    UnaryPredicate predicate,
                    command_queue &queue = system::default_queue())
{
    // no element matches when the search runs off the end of the range
    InputIterator match =
        ::boost::compute::find_if(first, last, predicate, queue);
    return match == last;
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_NONE_OF_HPP
@@ -0,0 +1,87 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_NTH_ELEMENT_HPP
#define BOOST_COMPUTE_ALGORITHM_NTH_ELEMENT_HPP
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/fill_n.hpp>
#include <boost/compute/algorithm/find.hpp>
#include <boost/compute/algorithm/partition.hpp>
#include <boost/compute/algorithm/sort.hpp>
#include <boost/compute/functional/bind.hpp>
namespace boost {
namespace compute {
/// Rearranges the elements in the range [\p first, \p last) such that
/// the \p nth element would be in that position in a sorted sequence.
///
/// Does nothing when \p nth equals \p last.
///
/// \param first first element in the range
/// \param nth position whose element is to be put in its sorted place
/// \param last end of the range
/// \param compare comparison function used to partition the range
/// \param queue command queue to perform the operation
template<class Iterator, class Compare>
inline void nth_element(Iterator first,
Iterator nth,
Iterator last,
Compare compare,
command_queue &queue = system::default_queue())
{
if(nth == last) return;
typedef typename std::iterator_traits<Iterator>::value_type value_type;
// Repeatedly partition the current range around the value found at nth,
// narrowing [first, last) towards nth on every pass.
while(1)
{
// pivot: the value currently stored at the nth position
value_type value = nth.read(queue);
using boost::compute::placeholders::_1;
// move the elements for which compare(element, value) holds to the front;
// new_nth is the partition point
Iterator new_nth = partition(
first, last, ::boost::compute::bind(compare, _1, value), queue
);
// locate the pivot value in the second part and exchange it with the
// element at the partition point (two single-element fills)
Iterator old_nth = find(new_nth, last, value, queue);
value_type new_value = new_nth.read(queue);
fill_n(new_nth, 1, value, queue);
fill_n(old_nth, 1, new_value, queue);
// done once the value at nth is the pivot value again
// NOTE(review): termination depends on this equality test - behavior for
// value types where operator== is unusual has not been verified here.
new_value = nth.read(queue);
if(value == new_value) break;
// otherwise continue in the part of the range that contains nth
if(std::distance(first, nth) < std::distance(first, new_nth))
{
last = new_nth;
}
else
{
first = new_nth;
}
}
}
/// \overload
template<class Iterator>
inline void nth_element(Iterator first,
Iterator nth,
Iterator last,
command_queue &queue = system::default_queue())
{
if(nth == last) return;
typedef typename std::iterator_traits<Iterator>::value_type value_type;
less<value_type> less_than;
return nth_element(first, nth, last, less_than, queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_NTH_ELEMENT_HPP
@@ -0,0 +1,37 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_PARTIAL_SUM_HPP
#define BOOST_COMPUTE_ALGORITHM_PARTIAL_SUM_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/inclusive_scan.hpp>
namespace boost {
namespace compute {
/// Computes the cumulative sum of the elements in the range [\p first,
/// \p last) and writes the resulting values to the range beginning at
/// \p result. The work is delegated to inclusive_scan().
///
/// \param first first element in the input range
/// \param last end of the input range
/// \param result first element in the output range
/// \param queue command queue to perform the operation
///
/// \return the iterator returned by inclusive_scan()
template<class InputIterator, class OutputIterator>
inline OutputIterator
partial_sum(InputIterator first,
            InputIterator last,
            OutputIterator result,
            command_queue &queue = system::default_queue())
{
    OutputIterator result_end =
        ::boost::compute::inclusive_scan(first, last, result, queue);
    return result_end;
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_PARTIAL_SUM_HPP
@@ -0,0 +1,39 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_PARTITION_HPP
#define BOOST_COMPUTE_ALGORITHM_PARTITION_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/stable_partition.hpp>
namespace boost {
namespace compute {
///
/// Rearranges the elements in the range [\p first, \p last) according to
/// \p predicate. Callers must not rely on the relative order of elements
/// being kept, even though the current implementation simply forwards to
/// stable_partition() and returns its result.
///
/// \see is_partitioned() and stable_partition()
///
template<class Iterator, class UnaryPredicate>
inline Iterator partition(Iterator first,
                          Iterator last,
                          UnaryPredicate predicate,
                          command_queue &queue = system::default_queue())
{
    Iterator boundary = stable_partition(first, last, predicate, queue);
    return boundary;
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_PARTITION_HPP
@@ -0,0 +1,63 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_PARTITION_COPY_HPP
#define BOOST_COMPUTE_ALGORITHM_PARTITION_COPY_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/copy_if.hpp>
namespace boost {
namespace compute {
/// Splits the range [\p first, \p last) into two output ranges: elements
/// for which \p predicate returns \c true are copied to the range starting
/// at \p first_true, all remaining elements are copied to the range
/// starting at \p first_false.
///
/// The input is traversed twice with copy_if(): once with \p predicate and
/// once with its negation. The returned pair holds the two iterators
/// produced by those copy_if() calls (true range first, false range second).
///
/// \see partition()
template<class InputIterator,
         class OutputIterator1,
         class OutputIterator2,
         class UnaryPredicate>
inline std::pair<OutputIterator1, OutputIterator2>
partition_copy(InputIterator first,
               InputIterator last,
               OutputIterator1 first_true,
               OutputIterator2 first_false,
               UnaryPredicate predicate,
               command_queue &queue = system::default_queue())
{
    // pass 1: elements accepted by the predicate
    OutputIterator1 true_end =
        ::boost::compute::copy_if(first, last, first_true, predicate, queue);

    // pass 2: elements rejected by the predicate
    OutputIterator2 false_end =
        ::boost::compute::copy_if(first, last, first_false, not1(predicate), queue);

    return std::pair<OutputIterator1, OutputIterator2>(true_end, false_end);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_PARTITION_COPY_HPP
@@ -0,0 +1,46 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_PARTITION_POINT_HPP
#define BOOST_COMPUTE_ALGORITHM_PARTITION_POINT_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/detail/binary_find.hpp>
namespace boost {
namespace compute {
///
/// \brief Partition point algorithm
///
/// Locates the end of the true values in the partitioned range
/// [first, last) by running detail::binary_find() with the negated
/// predicate.
/// \return Iterator pointing to end of true values
///
/// \param first Iterator pointing to start of range
/// \param last Iterator pointing to end of range
/// \param predicate Unary predicate to be applied on each element
/// \param queue Queue on which to execute
///
/// \see partition() and stable_partition()
///
template<class InputIterator, class UnaryPredicate>
inline InputIterator partition_point(InputIterator first,
                                     InputIterator last,
                                     UnaryPredicate predicate,
                                     command_queue &queue = system::default_queue())
{
    InputIterator true_end =
        detail::binary_find(first, last, not1(predicate), queue);
    return true_end;
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_PARTITION_POINT_HPP
@@ -0,0 +1,170 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_PREV_PERMUTATION_HPP
#define BOOST_COMPUTE_ALGORITHM_PREV_PERMUTATION_HPP
#include <iterator>
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/detail/scalar.hpp>
#include <boost/compute/algorithm/reverse.hpp>
namespace boost {
namespace compute {
namespace detail {
///
/// \brief Helper function for prev_permutation
///
/// To find rightmost element which is greater
/// than its next element
///
template<class InputIterator>
inline InputIterator prev_permutation_helper(InputIterator first,
InputIterator last,
command_queue &queue)
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
size_t count = detail::iterator_range_size(first, last);
if(count == 0 || count == 1){
return last;
}
count = count - 1;
const context &context = queue.get_context();
detail::meta_kernel k("prev_permutation");
size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");
atomic_max<int_> atomic_max_int;
k << k.decl<const int_>("i") << " = get_global_id(0);\n"
<< k.decl<const value_type>("cur_value") << "="
<< first[k.var<const int_>("i")] << ";\n"
<< k.decl<const value_type>("next_value") << "="
<< first[k.expr<const int_>("i+1")] << ";\n"
<< "if(cur_value > next_value){\n"
<< " " << atomic_max_int(k.var<int_ *>("index"), k.var<int_>("i")) << ";\n"
<< "}\n";
kernel kernel = k.compile(context);
scalar<int_> index(context);
kernel.set_arg(index_arg, index.get_buffer());
index.write(static_cast<int_>(-1), queue);
queue.enqueue_1d_range_kernel(kernel, 0, count, 0);
int result = static_cast<int>(index.read(queue));
if(result == -1) return last;
else return first + result;
}
///
/// \brief Helper function for prev_permutation
///
/// To find the largest element to the right of the element found above
/// that is smaller than it
///
template<class InputIterator, class ValueType>
inline InputIterator pp_floor(InputIterator first,
InputIterator last,
ValueType value,
command_queue &queue)
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
size_t count = detail::iterator_range_size(first, last);
if(count == 0){
return last;
}
const context &context = queue.get_context();
detail::meta_kernel k("pp_floor");
size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");
size_t value_arg = k.add_arg<value_type>(memory_object::private_memory, "value");
atomic_max<int_> atomic_max_int;
k << k.decl<const int_>("i") << " = get_global_id(0);\n"
<< k.decl<const value_type>("cur_value") << "="
<< first[k.var<const int_>("i")] << ";\n"
<< "if(cur_value >= " << first[k.expr<int_>("*index")]
<< " && cur_value < value){\n"
<< " " << atomic_max_int(k.var<int_ *>("index"), k.var<int_>("i")) << ";\n"
<< "}\n";
kernel kernel = k.compile(context);
scalar<int_> index(context);
kernel.set_arg(index_arg, index.get_buffer());
index.write(static_cast<int_>(0), queue);
kernel.set_arg(value_arg, value);
queue.enqueue_1d_range_kernel(kernel, 0, count, 0);
int result = static_cast<int>(index.read(queue));
return first + result;
}
} // end detail namespace
///
/// \brief Permutation generating algorithm
///
/// Transforms the range [first, last) into the previous permutation from
/// the set of all permutations arranged in lexicographic order
/// \return \c true when the previous permutation was generated; \c false
///         when the range is empty or when no position with an element
///         greater than its successor exists, in which case the range is
///         reversed instead
///
/// \param first Iterator pointing to start of range
/// \param last Iterator pointing to end of range
/// \param queue Queue on which to execute
///
template<class InputIterator>
inline bool prev_permutation(InputIterator first,
                             InputIterator last,
                             command_queue &queue = system::default_queue())
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;

    if(first == last){
        return false;
    }

    // rightmost position whose element is greater than its successor
    InputIterator pivot = detail::prev_permutation_helper(first, last, queue);
    if(pivot == last){
        // no such position: wrap around by reversing the whole range
        reverse(first, last, queue);
        return false;
    }

    // choose the swap partner from the elements after the pivot
    value_type pivot_value = pivot.read(queue);
    InputIterator partner =
        detail::pp_floor(pivot + 1, last, pivot_value, queue);
    value_type partner_value = partner.read(queue);

    // exchange pivot and partner, then reverse everything after the pivot
    pivot.write(partner_value, queue);
    partner.write(pivot_value, queue);
    reverse(pivot + 1, last, queue);

    return true;
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_PREV_PERMUTATION_HPP
@@ -0,0 +1,75 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_RANDOM_SHUFFLE_HPP
#define BOOST_COMPUTE_ALGORITHM_RANDOM_SHUFFLE_HPP
#include <vector>
#include <algorithm>
#if __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L)
#include <random>
#endif
#include <boost/range/algorithm_ext/iota.hpp>
#include <boost/compute/system.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/algorithm/scatter.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
namespace boost {
namespace compute {
/// Randomly shuffles the elements in the range [\p first, \p last).
///
/// A random permutation of the indices [0, count) is generated on the
/// host, copied to the device and then used to scatter() a temporary copy
/// of the input back into the original range.
///
/// When compiled as C++11 or later the permutation is produced with
/// \c std::shuffle and an engine seeded from \c std::random_device, because
/// \c std::random_shuffle was deprecated in C++14 and removed in C++17.
/// Older language modes keep using \c std::random_shuffle.
///
/// \see scatter()
template<class Iterator>
inline void random_shuffle(Iterator first,
                           Iterator last,
                           command_queue &queue = system::default_queue())
{
    typedef typename std::iterator_traits<Iterator>::value_type value_type;

    size_t count = detail::iterator_range_size(first, last);
    if(count == 0){
        return;
    }

    // generate shuffled indices on the host
    std::vector<cl_uint> random_indices(count);
    boost::iota(random_indices, 0);
#if __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L)
    // std::random_shuffle is not available in C++17 and later
    std::random_device seed_source;
    std::default_random_engine engine(seed_source());
    std::shuffle(random_indices.begin(), random_indices.end(), engine);
#else
    std::random_shuffle(random_indices.begin(), random_indices.end());
#endif

    // copy random indices to the device
    const context &context = queue.get_context();
    vector<cl_uint> indices(count, context);
    ::boost::compute::copy(random_indices.begin(),
                           random_indices.end(),
                           indices.begin(),
                           queue);

    // make a copy of the values on the device
    vector<value_type> tmp(count, context);
    ::boost::compute::copy(first,
                           last,
                           tmp.begin(),
                           queue);

    // write values to their new locations
    ::boost::compute::scatter(tmp.begin(),
                              tmp.end(),
                              indices.begin(),
                              first,
                              queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_RANDOM_SHUFFLE_HPP
@@ -0,0 +1,301 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
#define BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
#include <iterator>
#include <boost/compute/system.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/array.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/algorithm/copy_n.hpp>
#include <boost/compute/algorithm/detail/inplace_reduce.hpp>
#include <boost/compute/algorithm/detail/reduce_on_gpu.hpp>
#include <boost/compute/algorithm/detail/reduce_on_cpu.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/memory/local_buffer.hpp>
#include <boost/compute/type_traits/result_of.hpp>
namespace boost {
namespace compute {
namespace detail {
/// Performs a single block-wise reduction pass over the \p count elements
/// starting at \p first and writes the partial results to \p result.
///
/// Every work-group of \p block_size work-items combines
/// 2 * \p block_size input elements with \p function and stores its value
/// at \c output[get_group_id(0)]. Elements that do not fill a complete
/// block are reduced serially by one extra task whose value is stored at
/// index \c block_count, directly after the full-block results.
///
/// \return the number of partial results written: the number of complete
///         blocks plus one if leftover elements exist
///
/// The count is computed with integer arithmetic only. Going through
/// \c float cannot represent every \c size_t above 2^24 and could report
/// one result fewer than the kernels actually write.
template<class InputIterator, class OutputIterator, class BinaryFunction>
size_t reduce(InputIterator first,
              size_t count,
              OutputIterator result,
              size_t block_size,
              BinaryFunction function,
              command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type
        input_type;
    typedef typename
        boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
        result_type;

    const context &context = queue.get_context();

    // number of complete blocks; each one consumes 2 * block_size elements
    size_t block_count = count / 2 / block_size;

    // true when a tail of elements is left over after the complete blocks
    const bool has_leftovers = block_count * block_size * 2 < count;

    // exact ceil(count / (2 * block_size)) without floating point
    size_t total_block_count = block_count + (has_leftovers ? 1 : 0);

    if(block_count != 0){
        meta_kernel k("block_reduce");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");

        k <<
            "const uint gid = get_global_id(0);\n" <<
            "const uint lid = get_local_id(0);\n" <<

            // copy values to local memory
            "block[lid] = " <<
                function(first[k.make_var<uint_>("gid*2+0")],
                         first[k.make_var<uint_>("gid*2+1")]) << ";\n" <<

            // perform reduction
            "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" <<
            " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            " uint mask = (i << 1) - 1;\n" <<
            " if((lid & mask) == 0){\n" <<
            " block[lid] = " <<
                function(k.expr<input_type>("block[lid]"),
                         k.expr<input_type>("block[lid+i]")) << ";\n" <<
            " }\n" <<
            "}\n" <<

            // write block result to global output
            "if(lid == 0)\n" <<
            " output[get_group_id(0)] = block[0];\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(block_arg, local_buffer<input_type>(block_size));

        queue.enqueue_1d_range_kernel(kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);
    }

    // serially reduce any leftovers
    if(has_leftovers){
        size_t last_block_start = block_count * block_size * 2;

        meta_kernel k("extra_serial_reduce");
        size_t count_arg = k.add_arg<uint_>("count");
        size_t offset_arg = k.add_arg<uint_>("offset");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t output_offset_arg = k.add_arg<uint_>("output_offset");

        k <<
            k.decl<result_type>("result") << " = \n" <<
                first[k.expr<uint_>("offset")] << ";\n" <<
            "for(uint i = offset + 1; i < count; i++)\n" <<
            " result = " <<
                function(k.var<result_type>("result"),
                         first[k.var<uint_>("i")]) << ";\n" <<
            "output[output_offset] = result;\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(count_arg, static_cast<uint_>(count));
        kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start));
        kernel.set_arg(output_arg, result.get_buffer());
        // the leftover value goes right after the full-block results
        kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count));

        queue.enqueue_task(kernel);
    }

    return total_block_count;
}
/// Runs one block-wise reduction pass (see detail::reduce()) over the
/// \p count elements starting at \p first and returns the partial results
/// in a newly allocated device vector.
///
/// The vector holds ceil(count / (2 * block_size)) elements. The size is
/// computed with integer arithmetic: a \c float cannot represent every
/// \c size_t above 2^24, so a float-based ceiling could allocate one
/// element too few for the results that detail::reduce() writes.
template<class InputIterator, class BinaryFunction>
inline vector<
    typename boost::compute::result_of<
        BinaryFunction(
            typename std::iterator_traits<InputIterator>::value_type,
            typename std::iterator_traits<InputIterator>::value_type
        )
    >::type
>
block_reduce(InputIterator first,
             size_t count,
             size_t block_size,
             BinaryFunction function,
             command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type
        input_type;
    typedef typename
        boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
        result_type;

    const context &context = queue.get_context();

    // each block consumes 2 * block_size elements; round up for a partial
    // trailing block
    const size_t elements_per_block = block_size * 2;
    size_t total_block_count = count / elements_per_block;
    if(count % elements_per_block != 0){
        ++total_block_count;
    }

    vector<result_type> result_vector(total_block_count, context);

    reduce(first, count, result_vector.begin(), block_size, function, queue);

    return result_vector;
}
template<class InputIterator, class OutputIterator, class BinaryFunction>
inline void generic_reduce(InputIterator first,
InputIterator last,
OutputIterator result,
BinaryFunction function,
command_queue &queue)
{
typedef typename
std::iterator_traits<InputIterator>::value_type
input_type;
typedef typename
boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
result_type;
const device &device = queue.get_device();
const context &context = queue.get_context();
size_t count = detail::iterator_range_size(first, last);
if(device.type() & device::cpu){
array<result_type, 1> value(context);
detail::reduce_on_cpu(first, last, value.begin(), function, queue);
boost::compute::copy_n(value.begin(), 1, result, queue);
}
else {
size_t block_size = 256;
// first pass
vector<result_type> results = detail::block_reduce(first,
count,
block_size,
function,
queue);
if(results.size() > 1){
detail::inplace_reduce(results.begin(),
results.end(),
function,
queue);
}
boost::compute::copy_n(results.begin(), 1, result, queue);
}
}
template<class InputIterator, class OutputIterator, class T>
inline void dispatch_reduce(InputIterator first,
InputIterator last,
OutputIterator result,
const plus<T> &function,
command_queue &queue)
{
const context &context = queue.get_context();
const device &device = queue.get_device();
// reduce to temporary buffer on device
array<T, 1> value(context);
if(device.type() & device::cpu){
detail::reduce_on_cpu(first, last, value.begin(), function, queue);
}
else {
reduce_on_gpu(first, last, value.begin(), function, queue);
}
// copy to result iterator
copy_n(value.begin(), 1, result, queue);
}
/// Fallback overload for every reduction function other than \c plus<T>:
/// forwards all arguments unchanged to generic_reduce().
template<class InputIterator, class OutputIterator, class BinaryFunction>
inline void dispatch_reduce(InputIterator first,
                            InputIterator last,
                            OutputIterator result,
                            BinaryFunction function,
                            command_queue &queue)
{
    generic_reduce(
        first, last, result, function, queue
    );
}
} // end detail namespace
/// Returns the result of applying \p function to the elements in the
/// range [\p first, \p last).
///
/// If no function is specified, \c plus will be used.
///
/// \param first first element in the input range
/// \param last last element in the input range
/// \param result iterator pointing to the output
/// \param function binary reduction function
/// \param queue command queue to perform the operation
///
/// The \c reduce() algorithm assumes that the binary reduction function is
/// associative. When used with non-associative functions the result may
/// be non-deterministic and vary in precision. Notably this affects the
/// \c plus<float>() function as floating-point addition is not associative
/// and may produce slightly different results than a serial algorithm.
///
/// This algorithm supports both host and device iterators for the
/// result argument. This allows for values to be reduced and copied
/// to the host all with a single function call.
///
/// For example, to calculate the sum of the values in a device vector and
/// copy the result to a value on the host:
///
/// \snippet test/test_reduce.cpp sum_int
///
/// Note that while the \c reduce() algorithm is conceptually identical to
/// the \c accumulate() algorithm, its implementation is substantially more
/// efficient on parallel hardware. For more information, see the documentation
/// on the \c accumulate() algorithm.
///
/// \see accumulate()
template<class InputIterator, class OutputIterator, class BinaryFunction>
inline void reduce(InputIterator first,
                   InputIterator last,
                   OutputIterator result,
                   BinaryFunction function,
                   command_queue &queue = system::default_queue())
{
    // an empty range leaves \p result untouched
    if(!(first == last)){
        detail::dispatch_reduce(first, last, result, function, queue);
    }
}
/// \overload
///
/// Reduces with \c plus over the value type of \p InputIterator.
template<class InputIterator, class OutputIterator>
inline void reduce(InputIterator first,
                   InputIterator last,
                   OutputIterator result,
                   command_queue &queue = system::default_queue())
{
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;

    // an empty range leaves \p result untouched
    if(!(first == last)){
        detail::dispatch_reduce(first, last, result, plus<input_type>(), queue);
    }
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
@@ -0,0 +1,118 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_REDUCE_BY_KEY_HPP
#define BOOST_COMPUTE_ALGORITHM_REDUCE_BY_KEY_HPP
#include <iterator>
#include <utility>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/device.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/system.hpp>
#include <boost/compute/algorithm/detail/reduce_by_key.hpp>
namespace boost {
namespace compute {
/// The \c reduce_by_key() algorithm performs reduction for each contiguous
/// subsequence of values determined by equivalent keys.
///
/// Returns a pair of iterators at the end of the ranges [\p keys_result, \p keys_result_last)
/// and [\p values_result, \p values_result_last).
///
/// If no function is specified, \c plus will be used.
/// If no predicate is specified, \c equal_to will be used.
///
/// \param keys_first the first key
/// \param keys_last the last key
/// \param values_first the first input value
/// \param keys_result iterator pointing to the key output
/// \param values_result iterator pointing to the reduced value output
/// \param function binary reduction function
/// \param predicate binary predicate which returns true only if two keys are equal
/// \param queue command queue to perform the operation
///
/// The \c reduce_by_key() algorithm assumes that the binary reduction function
/// is associative. When used with non-associative functions the result may
/// be non-deterministic and vary in precision. Notably this affects the
/// \c plus<float>() function as floating-point addition is not associative
/// and may produce slightly different results than a serial algorithm.
///
/// For example, to calculate the sum of the values for each key:
///
/// \snippet test/test_reduce_by_key.cpp reduce_by_key_int
///
/// \see reduce()
template<class InputKeyIterator, class InputValueIterator,
         class OutputKeyIterator, class OutputValueIterator,
         class BinaryFunction, class BinaryPredicate>
inline std::pair<OutputKeyIterator, OutputValueIterator>
reduce_by_key(InputKeyIterator keys_first,
              InputKeyIterator keys_last,
              InputValueIterator values_first,
              OutputKeyIterator keys_result,
              OutputValueIterator values_result,
              BinaryFunction function,
              BinaryPredicate predicate,
              command_queue &queue = system::default_queue())
{
    // all of the work happens in the detail dispatcher; its pair of end
    // iterators is returned as-is
    std::pair<OutputKeyIterator, OutputValueIterator> result_ends =
        detail::dispatch_reduce_by_key(
            keys_first, keys_last, values_first,
            keys_result, values_result,
            function, predicate,
            queue
        );
    return result_ends;
}
/// \overload
template<class InputKeyIterator, class InputValueIterator,
class OutputKeyIterator, class OutputValueIterator,
class BinaryFunction>
inline std::pair<OutputKeyIterator, OutputValueIterator>
reduce_by_key(InputKeyIterator keys_first,
InputKeyIterator keys_last,
InputValueIterator values_first,
OutputKeyIterator keys_result,
OutputValueIterator values_result,
BinaryFunction function,
command_queue &queue = system::default_queue())
{
typedef typename std::iterator_traits<InputKeyIterator>::value_type key_type;
return reduce_by_key(keys_first, keys_last, values_first,
keys_result, values_result,
function, equal_to<key_type>(),
queue);
}
/// \overload
template<class InputKeyIterator, class InputValueIterator,
class OutputKeyIterator, class OutputValueIterator>
inline std::pair<OutputKeyIterator, OutputValueIterator>
reduce_by_key(InputKeyIterator keys_first,
InputKeyIterator keys_last,
InputValueIterator values_first,
OutputKeyIterator keys_result,
OutputValueIterator values_result,
command_queue &queue = system::default_queue())
{
typedef typename std::iterator_traits<InputKeyIterator>::value_type key_type;
typedef typename std::iterator_traits<InputValueIterator>::value_type value_type;
return reduce_by_key(keys_first, keys_last, values_first,
keys_result, values_result,
plus<value_type>(), equal_to<key_type>(),
queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_REDUCE_BY_KEY_HPP
@@ -0,0 +1,54 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_REMOVE_HPP
#define BOOST_COMPUTE_ALGORITHM_REMOVE_HPP
#include <boost/compute/lambda.hpp>
#include <boost/compute/system.hpp>
#include <boost/compute/algorithm/remove_if.hpp>
#include <boost/compute/type_traits/vector_size.hpp>
namespace boost {
namespace compute {
/// Removes every element that compares equal to \p value from the range
/// [\p first, \p last) and returns the iterator produced by remove_if().
///
/// For value types whose \c vector_size is 1 the plain comparison
/// <tt>_1 == value</tt> is used as the predicate; for all other value types
/// the comparison is wrapped in \c lambda::all().
///
/// \see remove_if()
template<class Iterator, class T>
inline Iterator remove(Iterator first,
                       Iterator last,
                       const T &value,
                       command_queue &queue = system::default_queue())
{
    typedef typename std::iterator_traits<Iterator>::value_type value_type;

    using ::boost::compute::_1;
    using ::boost::compute::lambda::all;

    if(vector_size<value_type>::value != 1){
        // vector-valued elements: wrap the comparison in all()
        return ::boost::compute::remove_if(first, last, all(_1 == value), queue);
    }

    // scalar elements: the comparison itself is the predicate
    return ::boost::compute::remove_if(first, last, _1 == value, queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_REMOVE_HPP
@@ -0,0 +1,47 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_REMOVE_IF_HPP
#define BOOST_COMPUTE_ALGORITHM_REMOVE_IF_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/algorithm/copy_if.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/functional/logical.hpp>
namespace boost {
namespace compute {
/// Removes each element for which \p predicate returns \c true in the
/// range [\p first, \p last).
///
/// \see remove()
template<class Iterator, class Predicate>
inline Iterator remove_if(Iterator first,
Iterator last,
Predicate predicate,
command_queue &queue = system::default_queue())
{
typedef typename std::iterator_traits<Iterator>::value_type value_type;
// temporary storage for the input data
::boost::compute::vector<value_type> tmp(first, last, queue);
return ::boost::compute::copy_if(tmp.begin(),
tmp.end(),
first,
not1(predicate),
queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_REMOVE_IF_HPP
@@ -0,0 +1,90 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_REPLACE_HPP
#define BOOST_COMPUTE_ALGORITHM_REPLACE_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
namespace boost {
namespace compute {
namespace detail {
/// Meta-kernel that overwrites every element equal to \c old_value with
/// \c new_value.
///
/// Usage: call set_range(), set_old_value() and set_new_value(), then
/// exec(). One work-item is launched per element of the range.
template<class Iterator, class T>
class replace_kernel : public meta_kernel
{
public:
    replace_kernel()
        : meta_kernel("replace"),
          m_count(0)
    {
    }

    /// Records the size of [first, last) and emits the kernel source that
    /// compares and conditionally overwrites the element at the global id.
    void set_range(Iterator first, Iterator last)
    {
        m_count = detail::iterator_range_size(first, last);

        *this <<
            "const uint i = get_global_id(0);\n" <<
            "if(" << first[var<cl_uint>("i")] << " == " << var<T>("old_value") << ")\n" <<
            " " << first[var<cl_uint>("i")] << '=' << var<T>("new_value") << ";\n";
    }

    /// Binds the value to search for to the "old_value" kernel argument.
    void set_old_value(const T &old_value)
    {
        add_set_arg<T>("old_value", old_value);
    }

    /// Binds the replacement to the "new_value" kernel argument.
    void set_new_value(const T &new_value)
    {
        add_set_arg<T>("new_value", new_value);
    }

    /// Launches the kernel over the recorded range; an empty range is a
    /// no-op.
    void exec(command_queue &queue)
    {
        if(m_count > 0){
            exec_1d(queue, 0, m_count);
        }
    }

private:
    // number of elements in the range passed to set_range()
    size_t m_count;
};
} // end detail namespace
/// Replaces each instance of \p old_value in the range [\p first,
/// \p last) with \p new_value.
template<class Iterator, class T>
inline void replace(Iterator first,
Iterator last,
const T &old_value,
const T &new_value,
command_queue &queue = system::default_queue())
{
detail::replace_kernel<Iterator, T> kernel;
kernel.set_range(first, last);
kernel.set_old_value(old_value);
kernel.set_new_value(new_value);
kernel.exec(queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_REPLACE_HPP
@@ -0,0 +1,62 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_REPLACE_COPY_HPP
#define BOOST_COMPUTE_ALGORITHM_REPLACE_COPY_HPP
#include <iterator>
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/algorithm/replace.hpp>
namespace boost {
namespace compute {
/// Copies the values in the range [\p first, \p last) to the range
/// beginning at \p result, substituting \p new_value for every element
/// equal to \p old_value.
///
/// Implemented as a copy() followed by an in-place replace() on the
/// output. Returns \p result itself for an empty input, otherwise
/// \p result advanced by the number of input elements.
///
/// \see replace()
template<class InputIterator, class OutputIterator, class T>
inline OutputIterator
replace_copy(InputIterator first,
             InputIterator last,
             OutputIterator result,
             const T &old_value,
             const T &new_value,
             command_queue &queue = system::default_queue())
{
    typedef typename std::iterator_traits<OutputIterator>::difference_type difference_type;

    difference_type length = std::distance(first, last);
    if(length == 0){
        return result;
    }

    // step 1: plain copy into the output range
    ::boost::compute::copy(first, last, result, queue);

    // step 2: substitute inside the freshly written output
    OutputIterator result_end = result + length;
    ::boost::compute::replace(result, result_end, old_value, new_value, queue);

    return result_end;
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_REPLACE_COPY_HPP
@@ -0,0 +1,74 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_REVERSE_HPP
#define BOOST_COMPUTE_ALGORITHM_REVERSE_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
namespace boost {
namespace compute {
namespace detail {
template<class Iterator>
struct reverse_kernel : public meta_kernel
{
reverse_kernel(Iterator first, Iterator last)
: meta_kernel("reverse")
{
typedef typename std::iterator_traits<Iterator>::value_type value_type;
// store size of the range
m_size = detail::iterator_range_size(first, last);
add_set_arg<const cl_uint>("size", static_cast<const cl_uint>(m_size));
*this <<
decl<cl_uint>("i") << " = get_global_id(0);\n" <<
decl<cl_uint>("j") << " = size - get_global_id(0) - 1;\n" <<
decl<value_type>("tmp") << "=" << first[var<cl_uint>("i")] << ";\n" <<
first[var<cl_uint>("i")] << "=" << first[var<cl_uint>("j")] << ";\n" <<
first[var<cl_uint>("j")] << "= tmp;\n";
}
void exec(command_queue &queue)
{
exec_1d(queue, 0, m_size / 2);
}
size_t m_size;
};
} // end detail namespace
/// Reverses the elements in the range [\p first, \p last).
///
/// \see reverse_copy()
template<class Iterator>
inline void reverse(Iterator first,
Iterator last,
command_queue &queue = system::default_queue())
{
size_t count = detail::iterator_range_size(first, last);
if(count < 2){
return;
}
detail::reverse_kernel<Iterator> kernel(first, last);
kernel.exec(queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_REVERSE_HPP
@@ -0,0 +1,79 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_REVERSE_COPY_HPP
#define BOOST_COMPUTE_ALGORITHM_REVERSE_COPY_HPP
#include <iterator>
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/algorithm/reverse.hpp>
namespace boost {
namespace compute {
namespace detail {
template<class Iterator, class OutputIterator>
struct reverse_copy_kernel : public meta_kernel
{
reverse_copy_kernel(Iterator first, Iterator last, OutputIterator result)
: meta_kernel("reverse_copy")
{
// store size of the range
m_size = detail::iterator_range_size(first, last);
add_set_arg<const cl_uint>("size", static_cast<const cl_uint>(m_size));
*this <<
decl<cl_uint>("i") << " = get_global_id(0);\n" <<
decl<cl_uint>("j") << " = size - get_global_id(0) - 1;\n" <<
result[var<cl_uint>("j")] << "=" << first[var<cl_uint>("i")] << ";\n";
}
void exec(command_queue &queue)
{
exec_1d(queue, 0, m_size);
}
size_t m_size;
};
} // end detail namespace
/// Copies the elements in the range [\p first, \p last) in reversed
/// order to the range beginning at \p result.
///
/// An empty input range is a no-op: no kernel is launched and \p result is
/// returned unchanged.
///
/// \param first iterator to the start of the input range
/// \param last iterator to the end of the input range
/// \param result iterator to the start of the output range
/// \param queue command queue on which the copy is executed
///
/// \return \p result advanced by the number of elements in the input range
///
/// \see reverse()
template<class InputIterator, class OutputIterator>
inline OutputIterator
reverse_copy(InputIterator first,
             InputIterator last,
             OutputIterator result,
             command_queue &queue = system::default_queue())
{
    typedef typename std::iterator_traits<OutputIterator>::difference_type difference_type;

    difference_type count = std::distance(first, last);

    // nothing to copy: skip the launch entirely, since a zero-sized launch
    // may be rejected by the OpenCL runtime
    if(count == 0){
        return result;
    }

    detail::reverse_copy_kernel<InputIterator, OutputIterator>
        kernel(first, last, result);

    // run kernel
    kernel.exec(queue);

    // return iterator to the end of result
    return result + count;
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_REVERSE_COPY_HPP
@@ -0,0 +1,54 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_ROTATE_HPP
#define BOOST_COMPUTE_ALGORITHM_ROTATE_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/container/vector.hpp>
namespace boost {
namespace compute {
/// Performs left rotation such that element at \p n_first comes to the
/// beginning.
///
/// \see rotate_copy()
template<class InputIterator>
inline void rotate(InputIterator first,
InputIterator n_first,
InputIterator last,
command_queue &queue = system::default_queue())
{
//Handle trivial cases
if (n_first==first || n_first==last)
{
return;
}
//Handle others
typedef typename std::iterator_traits<InputIterator>::value_type T;
size_t count = detail::iterator_range_size(first, n_first);
size_t count2 = detail::iterator_range_size(first, last);
const context &context = queue.get_context();
vector<T> temp(count2, context);
::boost::compute::copy(first, last, temp.begin(), queue);
::boost::compute::copy(temp.begin()+count, temp.end(), first, queue);
::boost::compute::copy(temp.begin(), temp.begin()+count, last-count, queue);
}
} //end compute namespace
} //end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_ROTATE_HPP
@@ -0,0 +1,41 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_ROTATE_COPY_HPP
#define BOOST_COMPUTE_ALGORITHM_ROTATE_COPY_HPP
#include <boost/compute/system.hpp>
#include <boost/compute/algorithm/copy.hpp>
namespace boost {
namespace compute {
/// Writes a left-rotated copy of [\p first, \p last) to the range starting
/// at \p result: the elements from \p n_first up to \p last come first,
/// followed by the elements from \p first up to \p n_first. The input range
/// itself is not modified.
///
/// \see rotate()
template<class InputIterator, class OutputIterator>
inline void rotate_copy(InputIterator first,
                        InputIterator n_first,
                        InputIterator last,
                        OutputIterator result,
                        command_queue &queue = system::default_queue())
{
    const size_t head_length = detail::iterator_range_size(first, n_first);
    const size_t tail_length = detail::iterator_range_size(n_first, last);

    // position of the rotation point, expressed relative to first
    const InputIterator pivot = first + head_length;

    // the trailing part goes to the front of the output ...
    ::boost::compute::copy(pivot, last, result, queue);

    // ... and the leading part is placed right behind it
    ::boost::compute::copy(first, pivot, result + tail_length, queue);
}
} //end compute namespace
} //end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_ROTATE_COPY_HPP
@@ -0,0 +1,99 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_SCATTER_HPP
#define BOOST_COMPUTE_ALGORITHM_SCATTER_HPP
#include <boost/algorithm/string/replace.hpp>
#include <boost/compute/system.hpp>
#include <boost/compute/exception.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/iterator/buffer_iterator.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
namespace boost {
namespace compute {
namespace detail {
template<class InputIterator, class MapIterator, class OutputIterator>
class scatter_kernel : meta_kernel
{
public:
scatter_kernel() : meta_kernel("scatter")
{}
void set_range(InputIterator first,
InputIterator last,
MapIterator map,
OutputIterator result)
{
m_count = iterator_range_size(first, last);
m_input_offset = first.get_index();
m_output_offset = result.get_index();
m_input_offset_arg = add_arg<uint_>("input_offset");
m_output_offset_arg = add_arg<uint_>("output_offset");
*this <<
"const uint i = get_global_id(0);\n" <<
"uint i1 = " << map[expr<uint_>("i")] <<
" + output_offset;\n" <<
"uint i2 = i + input_offset;\n" <<
result[expr<uint_>("i1")] << "=" <<
first[expr<uint_>("i2")] << ";\n";
}
event exec(command_queue &queue)
{
if(m_count == 0) {
return event();
}
set_arg(m_input_offset_arg, uint_(m_input_offset));
set_arg(m_output_offset_arg, uint_(m_output_offset));
return exec_1d(queue, 0, m_count);
}
private:
size_t m_count;
size_t m_input_offset;
size_t m_input_offset_arg;
size_t m_output_offset;
size_t m_output_offset_arg;
};
} // end detail namespace
/// Copies the elements from the range [\p first, \p last) to the range
/// beginning at \p result using the output indices from the range beginning
/// at \p map.
///
/// \see gather()
template<class InputIterator, class MapIterator, class OutputIterator>
inline void scatter(InputIterator first,
InputIterator last,
MapIterator map,
OutputIterator result,
command_queue &queue = system::default_queue())
{
detail::scatter_kernel<InputIterator, MapIterator, OutputIterator> kernel;
kernel.set_range(first, last, map, result);
kernel.exec(queue);
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_SCATTER_HPP
@@ -0,0 +1,119 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2015 Jakub Pola <jakub.pola@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_SCATTER_IF_HPP
#define BOOST_COMPUTE_ALGORITHM_SCATTER_IF_HPP
#include <boost/algorithm/string/replace.hpp>
#include <boost/compute/system.hpp>
#include <boost/compute/exception.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/iterator/buffer_iterator.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
namespace boost {
namespace compute {
namespace detail {
template<class InputIterator, class MapIterator, class StencilIterator, class OutputIterator, class Predicate>
class scatter_if_kernel : meta_kernel
{
public:
scatter_if_kernel() : meta_kernel("scatter_if")
{}
void set_range(InputIterator first,
InputIterator last,
MapIterator map,
StencilIterator stencil,
OutputIterator result,
Predicate predicate)
{
m_count = iterator_range_size(first, last);
m_input_offset = first.get_index();
m_output_offset = result.get_index();
m_input_offset_arg = add_arg<uint_>("input_offset");
m_output_offset_arg = add_arg<uint_>("output_offset");
*this <<
"const uint i = get_global_id(0);\n" <<
"uint i1 = " << map[expr<uint_>("i")] <<
" + output_offset;\n" <<
"uint i2 = i + input_offset;\n" <<
if_(predicate(stencil[expr<uint_>("i")])) << "\n" <<
result[expr<uint_>("i1")] << "=" <<
first[expr<uint_>("i2")] << ";\n";
}
event exec(command_queue &queue)
{
if(m_count == 0) {
return event();
}
set_arg(m_input_offset_arg, uint_(m_input_offset));
set_arg(m_output_offset_arg, uint_(m_output_offset));
return exec_1d(queue, 0, m_count);
}
private:
size_t m_count;
size_t m_input_offset;
size_t m_input_offset_arg;
size_t m_output_offset;
size_t m_output_offset_arg;
};
} // end detail namespace
/// Copies the elements from the range [\p first, \p last) to the range
/// beginning at \p result using the output indices from the range beginning
/// at \p map if stencil is resolved to true. By default the predicate is
/// an identity
///
///
template<class InputIterator, class MapIterator, class StencilIterator, class OutputIterator,
class Predicate>
inline void scatter_if(InputIterator first,
InputIterator last,
MapIterator map,
StencilIterator stencil,
OutputIterator result,
Predicate predicate,
command_queue &queue = system::default_queue())
{
detail::scatter_if_kernel<InputIterator, MapIterator, StencilIterator, OutputIterator, Predicate> kernel;
kernel.set_range(first, last, map, stencil, result, predicate);
kernel.exec(queue);
}
/// Overload of scatter_if() without an explicit predicate: the stencil
/// value itself decides whether an element is written, by forwarding to the
/// predicate overload with the identity function for the stencil's value
/// type.
template<class InputIterator, class MapIterator, class StencilIterator, class OutputIterator>
inline void scatter_if(InputIterator first,
                       InputIterator last,
                       MapIterator map,
                       StencilIterator stencil,
                       OutputIterator result,
                       command_queue &queue = system::default_queue())
{
    typedef typename std::iterator_traits<StencilIterator>::value_type
        stencil_value_type;

    scatter_if(
        first, last, map, stencil, result,
        identity<stencil_value_type>(),
        queue
    );
}
} // end compute namespace
} // end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_SCATTER_IF_HPP
@@ -0,0 +1,73 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_SEARCH_HPP
#define BOOST_COMPUTE_ALGORITHM_SEARCH_HPP
#include <boost/compute/algorithm/detail/search_all.hpp>
#include <boost/compute/algorithm/find.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/system.hpp>
namespace boost {
namespace compute {
///
/// \brief Substring matching algorithm
///
/// Searches for the first match of the pattern [p_first, p_last)
/// in text [t_first, t_last).
/// \return Iterator pointing to beginning of first occurrence, or
/// \p t_last if the pattern does not occur in the text (which includes
/// the case where the pattern is longer than the text)
///
/// \param t_first Iterator pointing to start of text
/// \param t_last Iterator pointing to end of text
/// \param p_first Iterator pointing to start of pattern
/// \param p_last Iterator pointing to end of pattern
/// \param queue Queue on which to execute
///
template<class TextIterator, class PatternIterator>
inline TextIterator search(TextIterator t_first,
                           TextIterator t_last,
                           PatternIterator p_first,
                           PatternIterator p_last,
                           command_queue &queue = system::default_queue())
{
    const size_t text_size = detail::iterator_range_size(t_first, t_last);
    const size_t pattern_size = detail::iterator_range_size(p_first, p_last);

    // a pattern longer than the text can never match; returning here also
    // keeps the unsigned size computation below from wrapping around
    if(pattern_size > text_size){
        return t_last;
    }

    // there is no need to check if pattern starts at last n - 1 indices
    vector<uint_> matching_indices(
        text_size - pattern_size + 1,
        queue.get_context()
    );

    // search_kernel puts value 1 at every index in vector where pattern starts at
    detail::search_kernel<PatternIterator,
                          TextIterator,
                          vector<uint_>::iterator> kernel;

    kernel.set_range(p_first, p_last, t_first, t_last, matching_indices.begin());
    kernel.exec(queue);

    // the first index flagged with 1 is the first occurrence of the pattern
    vector<uint_>::iterator index = ::boost::compute::find(
        matching_indices.begin(), matching_indices.end(), uint_(1), queue
    );

    // pattern was not found
    if(index == matching_indices.end())
        return t_last;

    return t_first + detail::iterator_range_size(matching_indices.begin(), index);
}
} //end compute namespace
} //end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_SEARCH_HPP
@@ -0,0 +1,140 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SEARCH_N_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_SEARCH_N_HPP
#include <iterator>
#include <boost/compute/algorithm/find.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/system.hpp>
namespace boost {
namespace compute {
namespace detail {
///
/// \brief Search kernel class
///
/// Subclass of meta_kernel which is capable of performing search_n
///
/// For every candidate start index i the generated kernel writes 1 to
/// result[i] when the n elements starting at t_first[i] all equal the
/// searched value, and 0 otherwise.
///
template<class TextIterator, class OutputIterator>
class search_n_kernel : public meta_kernel
{
public:
    typedef typename std::iterator_traits<TextIterator>::value_type value_type;

    search_n_kernel() : meta_kernel("search_n")
    {}

    /// Records the search parameters and generates the kernel source.
    ///
    /// \param t_first Iterator pointing to start of text
    /// \param t_last Iterator pointing to end of text
    /// \param value Value which repeats
    /// \param n Number of times value repeats
    /// \param result Iterator to the start of the per-index match flags
    void set_range(TextIterator t_first,
                   TextIterator t_last,
                   value_type value,
                   size_t n,
                   OutputIterator result)
    {
        m_n = n;
        m_n_arg = add_arg<uint_>("n");

        m_value = value;
        m_value_arg = add_arg<value_type>("value");

        // number of start positions at which a run of n elements still fits
        // in the text; clamped to zero when n exceeds the text length so the
        // unsigned subtraction cannot wrap around
        const size_t text_size = iterator_range_size(t_first, t_last);
        m_count = (m_n > text_size) ? 0 : text_size + 1 - m_n;

        // on a mismatch j is pushed past n, which both ends the loop and
        // makes the final (j == n) test fail
        *this <<
            "uint i = get_global_id(0);\n" <<
            "uint i1 = i;\n" <<
            "uint j;\n" <<
            "for(j = 0; j<n; j++,i++)\n" <<
            "{\n" <<
            "   if(value != " << t_first[expr<uint_>("i")] << ")\n" <<
            "       j = n + 1;\n" <<
            "}\n" <<
            "if(j == n)\n" <<
            result[expr<uint_>("i1")] << " = 1;\n" <<
            "else\n" <<
            result[expr<uint_>("i1")] << " = 0;\n";
    }

    /// Binds the n and value arguments and launches one work-item per
    /// candidate start index; returns a default-constructed event when
    /// there are no candidates.
    event exec(command_queue &queue)
    {
        if(m_count == 0) {
            return event();
        }

        set_arg(m_n_arg, uint_(m_n));
        set_arg(m_value_arg, m_value);

        return exec_1d(queue, 0, m_count);
    }

private:
    size_t m_n;
    size_t m_n_arg;
    size_t m_count;
    value_type m_value;
    size_t m_value_arg;
};
} //end detail namespace
///
/// \brief Substring matching algorithm
///
/// Searches for the first occurrence of n consecutive occurrences of
/// value in text [t_first, t_last).
/// \return Iterator pointing to beginning of first occurrence, or
/// \p t_last if there is no such run (which includes the case where
/// \p n is larger than the length of the text)
///
/// \param t_first Iterator pointing to start of text
/// \param t_last Iterator pointing to end of text
/// \param n Number of times value repeats
/// \param value Value which repeats
/// \param queue Queue on which to execute
///
template<class TextIterator, class ValueType>
inline TextIterator search_n(TextIterator t_first,
                             TextIterator t_last,
                             size_t n,
                             ValueType value,
                             command_queue &queue = system::default_queue())
{
    const size_t text_size = detail::iterator_range_size(t_first, t_last);

    // a run longer than the text can never occur; returning here also keeps
    // the unsigned size computation below from wrapping around
    if(n > text_size){
        return t_last;
    }

    // there is no need to check if pattern starts at last n - 1 indices
    vector<uint_> matching_indices(
        text_size + 1 - n,
        queue.get_context()
    );

    // search_n_kernel puts value 1 at every index in vector where pattern
    // of n values starts at
    detail::search_n_kernel<TextIterator,
                            vector<uint_>::iterator> kernel;

    kernel.set_range(t_first, t_last, value, n, matching_indices.begin());
    kernel.exec(queue);

    // the first index flagged with 1 is the start of the first run
    vector<uint_>::iterator index = ::boost::compute::find(
        matching_indices.begin(), matching_indices.end(), uint_(1), queue
    );

    // pattern was not found
    if(index == matching_indices.end())
        return t_last;

    return t_first + detail::iterator_range_size(matching_indices.begin(), index);
}
} //end compute namespace
} //end boost namespace
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SEARCH_N_HPP

Some files were not shown because too many files have changed in this diff Show More