stabilize build system: depends, installer, boost/bdb fixes, cross targets groundwork

2026-02-24 18:38:47 +00:00
parent da8c28aaeb
commit 65cb2619a7
13106 changed files with 2484322 additions and 1804 deletions
@@ -0,0 +1,162 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_BALANCED_PATH_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_BALANCED_PATH_HPP
+
+#include <iterator>
+
+#include <boost/compute/algorithm/find_if.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/lambda.hpp>
+#include <boost/compute/system.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Balanced Path kernel class
+///
+/// Subclass of meta_kernel to break two sets into tiles according
+/// to their balanced path.
+///
+/// Usage as visible in this class: call set_range() once to record the
+/// sizes of both inputs and to generate the kernel source, then call
+/// exec() to bind the "a_count"/"b_count" arguments and launch the kernel.
+/// Work item i writes one (a_index, b_index) pair to result_a[i] and
+/// result_b[i].
+///
+class balanced_path_kernel : public meta_kernel
+{
+public:
+    /// Number of combined elements per tile; work item i searches the
+    /// split point for target = (i+1)*tile_size. Read when set_range()
+    /// generates the source and again in exec() to size the launch.
+    unsigned int tile_size;
+
+    /// Creates the kernel named "balanced_path" with a tile size of 4.
+    /// NOTE(review): the m_*_count members stay uninitialized until
+    /// set_range() is called.
+    balanced_path_kernel() : meta_kernel("balanced_path")
+    {
+        tile_size = 4;
+    }
+
+    ///
+    /// \brief Generates the kernel source for the two input ranges
+    ///
+    /// \param first1 Iterator to the start of the first range ("a")
+    /// \param last1 Iterator to the end of the first range
+    /// \param first2 Iterator to the start of the second range ("b")
+    /// \param last2 Iterator to the end of the second range
+    /// \param result_a Output receiving the a_index computed by each work item
+    /// \param result_b Output receiving the b_index computed by each work item
+    /// \param comp Comparison used to generate the ordering tests
+    ///
+    /// Presumably both ranges must be sorted with respect to \p comp, since
+    /// the generated code binary-searches them -- TODO confirm with callers.
+    ///
+    template<class InputIterator1, class InputIterator2,
+             class OutputIterator1, class OutputIterator2,
+             class Compare>
+    void set_range(InputIterator1 first1,
+                   InputIterator1 last1,
+                   InputIterator2 first2,
+                   InputIterator2 last2,
+                   OutputIterator1 result_a,
+                   OutputIterator2 result_b,
+                   Compare comp)
+    {
+        // the temporary "x" in the kernel is declared with the value type
+        // of the first range even though it is read from the second range
+        typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
+
+        // sizes are remembered on the host; the matching kernel arguments
+        // are only bound later, in exec()
+        m_a_count = iterator_range_size(first1, last1);
+        m_a_count_arg = add_arg<uint_>("a_count");
+
+        m_b_count = iterator_range_size(first2, last2);
+        m_b_count_arg = add_arg<uint_>("b_count");
+
+        // Generated kernel, in three phases:
+        //  1. binary search over a_index in
+        //     [max(0, target - b_count), min(target, a_count)) for the
+        //     first position where comp(b[target - a_index - 1], a[a_index])
+        //     holds, giving a split with a_index + b_index == target;
+        //  2. if b_index is still inside b, take x = b[b_index] and count
+        //     how many elements directly before the split in a and in b are
+        //     not ordered before x, then redistribute that run between the
+        //     two inputs ("star" adds one extra element of b in one case);
+        //  3. store the final a_index / b_index for this work item.
+        *this <<
+            "uint i = get_global_id(0);\n" <<
+            "uint target = (i+1)*" << tile_size << ";\n" <<
+            "uint start = max(convert_int(0),convert_int(target)-convert_int(b_count));\n" <<
+            "uint end = min(target,a_count);\n" <<
+            "uint a_index, b_index;\n" <<
+            "while(start<end)\n" <<
+            "{\n" <<
+            "   a_index = (start + end)/2;\n" <<
+            "   b_index = target - a_index - 1;\n" <<
+            "   if(!(" << comp(first2[expr<uint_>("b_index")],
+                              first1[expr<uint_>("a_index")]) << "))\n" <<
+            "       start = a_index + 1;\n" <<
+            "   else end = a_index;\n" <<
+            "}\n" <<
+            "a_index = start;\n" <<
+            "b_index = target - start;\n" <<
+            "if(b_index < b_count)\n" <<
+            "{\n" <<
+            "   " << decl<const value_type>("x") << " = " <<
+                        first2[expr<uint_>("b_index")] << ";\n" <<
+            "   uint a_start = 0, a_end = a_index, a_mid;\n" <<
+            "   uint b_start = 0, b_end = b_index, b_mid;\n" <<
+            // first position in a[0, a_index) whose element is not before x
+            "   while(a_start<a_end)\n" <<
+            "   {\n" <<
+            "       a_mid = (a_start + a_end)/2;\n" <<
+            "       if(" << comp(first1[expr<uint_>("a_mid")], expr<value_type>("x")) << ")\n" <<
+            "           a_start = a_mid+1;\n" <<
+            "       else a_end = a_mid;\n" <<
+            "   }\n" <<
+            // first position in b[0, b_index) whose element is not before x
+            "   while(b_start<b_end)\n" <<
+            "   {\n" <<
+            "       b_mid = (b_start + b_end)/2;\n" <<
+            "       if(" << comp(first2[expr<uint_>("b_mid")], expr<value_type>("x")) << ")\n" <<
+            "           b_start = b_mid+1;\n" <<
+            "       else b_end = b_mid;\n" <<
+            "   }\n" <<
+            "   uint a_run = a_index - a_start;\n" <<
+            "   uint b_run = b_index - b_start;\n" <<
+            "   uint x_count = a_run + b_run;\n" <<
+            "   uint b_advance = max(x_count / 2, x_count - a_run);\n" <<
+            "   b_end = min(b_count, b_start + b_advance + 1);\n" <<
+            // the next literal has no trailing "\n"; the following literal
+            // starts with whitespace, so the generated tokens stay separated
+            "   uint temp_start = b_index, temp_end = b_end, temp_mid;" <<
+            // last position in [b_index, b_end) that x is not ordered before
+            "   while(temp_start < temp_end)\n" <<
+            "   {\n" <<
+            "       temp_mid = (temp_start + temp_end + 1)/2;\n" <<
+            "       if(" << comp(expr<value_type>("x"), first2[expr<uint_>("temp_mid")]) << ")\n" <<
+            "           temp_end = temp_mid-1;\n" <<
+            "       else temp_start = temp_mid;\n" <<
+            "   }\n" <<
+            "   b_run = temp_start - b_start + 1;\n" <<
+            "   b_advance = min(b_advance, b_run);\n" <<
+            "   uint a_advance = x_count - b_advance;\n" <<
+            "   uint star = convert_uint((a_advance == b_advance + 1) " <<
+                                            "&& (b_advance < b_run));\n" <<
+            "   a_index = a_start + a_advance;\n" <<
+            "   b_index = target - a_index + star;\n" <<
+            "}\n" <<
+            result_a[expr<uint_>("i")] << " = a_index;\n" <<
+            result_b[expr<uint_>("i")] << " = b_index;\n";
+
+    }
+
+    ///
+    /// \brief Overload of set_range() using boost::compute::less
+    ///
+    /// Forwards to the comparator overload with a less<value_type> functor,
+    /// where value_type is the value type of \c InputIterator1.
+    ///
+    template<class InputIterator1, class InputIterator2,
+             class OutputIterator1, class OutputIterator2>
+    void set_range(InputIterator1 first1,
+                   InputIterator1 last1,
+                   InputIterator2 first2,
+                   InputIterator2 last2,
+                   OutputIterator1 result_a,
+                   OutputIterator2 result_b)
+    {
+        typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
+        ::boost::compute::less<value_type> less_than;
+        set_range(first1, last1, first2, last2, result_a, result_b, less_than);
+    }
+
+    ///
+    /// \brief Binds the range sizes and launches the kernel
+    ///
+    /// Launches (a_count + b_count) / tile_size work items (integer
+    /// division). When that quotient is zero nothing is enqueued and a
+    /// default-constructed event is returned.
+    ///
+    /// \param queue Queue on which to execute
+    /// \return Event of the enqueued kernel, or an empty event
+    ///
+    event exec(command_queue &queue)
+    {
+        if((m_a_count + m_b_count)/tile_size == 0) {
+            return event();
+        }
+
+        // sizes are narrowed to uint_ to match the kernel argument type
+        set_arg(m_a_count_arg, uint_(m_a_count));
+        set_arg(m_b_count_arg, uint_(m_b_count));
+
+        return exec_1d(queue, 0, (m_a_count + m_b_count)/tile_size);
+    }
+
+private:
+    size_t m_a_count;     // number of elements in [first1, last1)
+    size_t m_a_count_arg; // argument index returned by add_arg for "a_count"
+    size_t m_b_count;     // number of elements in [first2, last2)
+    size_t m_b_count_arg; // argument index returned by add_arg for "b_count"
+};
+
+} //end detail namespace
+} //end compute namespace
+} //end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_BALANCED_PATH_HPP
@@ -0,0 +1,133 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_BINARY_FIND_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_BINARY_FIND_HPP
+
+#include <boost/compute/functional.hpp>
+#include <boost/compute/algorithm/find_if.hpp>
+#include <boost/compute/algorithm/transform.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail{
+
+///
+/// \brief Kernel executing one probing pass of binary_find
+///
+/// Subclass of meta_kernel. Work item g inspects the element at offset
+/// g * block from \c first; when \c predicate holds for that element the
+/// offset is merged into the value behind the "index" argument using
+/// atomic_min.
+///
+template<class InputIterator, class UnaryPredicate>
+class binary_find_kernel : public meta_kernel
+{
+public:
+    /// Generates the kernel source. \c last is accepted but not used by
+    /// the generated code.
+    binary_find_kernel(InputIterator first,
+                       InputIterator last,
+                       UnaryPredicate predicate)
+        : meta_kernel("binary_find")
+    {
+        typedef typename std::iterator_traits<InputIterator>::value_type element_type;
+
+        // argument order: global result slot first, then the probing stride
+        m_index_arg = add_arg<uint_ *>(memory_object::global_memory, "index");
+        m_block_arg = add_arg<uint_>("block");
+
+        atomic_min<uint_> keep_smallest;
+
+        // offset probed by this work item
+        *this << "uint i = get_global_id(0) * block;\n";
+
+        // load the probed element
+        *this << decl<element_type>("value") << "="
+              << first[var<uint_>("i")] << ";\n";
+
+        // record the offset when the predicate matches
+        *this << "if(" << predicate(var<element_type>("value")) << ") {\n";
+        *this << keep_smallest(var<uint_ *>("index"), var<uint_>("i")) << ";\n";
+        *this << "}\n";
+    }
+
+    /// Argument index of the global "index" result buffer
+    size_t m_index_arg;
+    /// Argument index of the "block" stride
+    size_t m_block_arg;
+};
+
+///
+/// \brief Binary find algorithm
+///
+/// Finds the end of true values in the partitioned range [first, last).
+/// \return Iterator pointing to end of true values
+///
+/// \param first Iterator pointing to start of range
+/// \param last Iterator pointing to end of range
+/// \param predicate Predicate according to which the range is partitioned
+/// \param queue Queue on which to execute
+///
+/// The range is narrowed by repeated probing passes of binary_find_kernel
+/// until at most 128 elements remain; the result is then obtained with
+/// find_if(search_first, search_last, predicate, queue).
+///
+/// NOTE(review): both the kernel and the final find_if look for an element
+/// for which \p predicate is true, so the returned iterator is the first
+/// match of \p predicate. Presumably callers pass the negation of the
+/// partitioning predicate -- confirm against the callers.
+///
+template<class InputIterator, class UnaryPredicate>
+inline InputIterator binary_find(InputIterator first,
+                                 InputIterator last,
+                                 UnaryPredicate predicate,
+                                 command_queue &queue = system::default_queue())
+{
+    const device &device = queue.get_device();
+
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    const std::string cache_key = "__boost_binary_find";
+
+    // ranges of at most find_if_limit elements are handed to find_if
+    size_t find_if_limit = 128;
+    // number of work items per probing pass ("tpb" parameter, default 128)
+    // NOTE(review): the loop divides by (threads - 1); a cached value of 1
+    // would divide by zero -- confirm the cache cannot hold that.
+    size_t threads = parameters->get(cache_key, "tpb", 128);
+    size_t count = iterator_range_size(first, last);
+
+    InputIterator search_first = first;
+    InputIterator search_last = last;
+
+    // device-side slot receiving the smallest matching offset of a pass
+    scalar<uint_> index(queue.get_context());
+
+    // construct and compile binary_find kernel
+    // NOTE(review): the kernel source is generated once, from search_first
+    // as it is here (equal to first); later passes change search_first on
+    // the host only -- confirm that probing relative to the original
+    // iterator is intended.
+    binary_find_kernel<InputIterator, UnaryPredicate>
+        binary_find_kernel(search_first, search_last, predicate);
+    ::boost::compute::kernel kernel = binary_find_kernel.compile(queue.get_context());
+
+    // set buffer for index
+    kernel.set_arg(binary_find_kernel.m_index_arg, index.get_buffer());
+
+    while(count > find_if_limit) {
+        // count doubles as the "no match" value: the kernel only ever
+        // lowers the stored index through atomic_min
+        index.write(static_cast<uint_>(count), queue);
+
+        // set block and run binary_find kernel
+        // block is the stride between probed elements
+        uint_ block = static_cast<uint_>((count - 1)/(threads - 1));
+        kernel.set_arg(binary_find_kernel.m_block_arg, block);
+        queue.enqueue_1d_range_kernel(kernel, 0, threads, 0);
+
+        size_t i = index.read(queue);
+
+        if(i == count) {
+            // no probe matched: keep only the tail of
+            // (count - 1) % (threads - 1) elements and stop probing
+            search_first = search_last - ((count - 1)%(threads - 1));
+            break;
+        } else {
+            // a probe matched at offset i: continue with the stride-sized
+            // window that ends at that probe
+            search_last = search_first + i;
+            search_first = search_last - ((count - 1)/(threads - 1));
+        }
+
+        // Make sure that first and last stay within the input range
+        search_last = (std::min)(search_last, last);
+        search_last = (std::max)(search_last, first);
+
+        search_first = (std::max)(search_first, first);
+        search_first = (std::min)(search_first, last);
+
+        count = iterator_range_size(search_first, search_last);
+    }
+
+    // finish with a plain find_if over the remaining window
+    return find_if(search_first, search_last, predicate, queue);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_BINARY_FIND_HPP
@@ -0,0 +1,77 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COMPACT_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_COMPACT_HPP
+
+#include <iterator>
+
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/system.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Compact kernel class
+///
+/// Subclass of meta_kernel to compact the result of set kernels to
+/// get actual sets.
+///
+/// Work item i copies elements of \c start, beginning at offset
+/// i * tile_size, into \c result positions [counts[i], counts[i+1]).
+///
+class compact_kernel : public meta_kernel
+{
+public:
+    /// Stride, in elements of \c start, between the regions read by
+    /// consecutive work items. Read when set_range() generates the source.
+    unsigned int tile_size;
+
+    /// Creates the kernel named "compact" with a tile size of 4 and no
+    /// work scheduled, so exec() is a no-op until set_range() is called.
+    compact_kernel() : meta_kernel("compact"), tile_size(4), m_count(0)
+    {
+    }
+
+    ///
+    /// \brief Generates the kernel source and records the launch size
+    ///
+    /// \param start Iterator to the tiled input produced by a set kernel
+    /// \param counts_begin Iterator to the first output offset
+    /// \param counts_end Iterator past the last output offset
+    /// \param result Iterator to the start of the compacted output
+    ///
+    /// One work item is scheduled per adjacent pair of offsets, i.e. the
+    /// number of offsets minus one. An empty offset range schedules nothing.
+    ///
+    template<class InputIterator1, class InputIterator2, class OutputIterator>
+    void set_range(InputIterator1 start,
+                   InputIterator2 counts_begin,
+                   InputIterator2 counts_end,
+                   OutputIterator result)
+    {
+        // guard the subtraction: with an empty range the unsigned
+        // "size - 1" would wrap around to the largest size_t value
+        const size_t offsets = iterator_range_size(counts_begin, counts_end);
+        m_count = offsets > 0 ? offsets - 1 : 0;
+
+        *this <<
+            "uint i = get_global_id(0);\n" <<
+            "uint count = i*" << tile_size << ";\n" <<
+            "for(uint j = " << counts_begin[expr<uint_>("i")] << "; j<" <<
+                counts_begin[expr<uint_>("i+1")] << "; j++, count++)\n" <<
+            "{\n" <<
+                result[expr<uint_>("j")] << " = " << start[expr<uint_>("count")]
+                    << ";\n" <<
+            "}\n";
+    }
+
+    ///
+    /// \brief Launches the kernel with one work item per tile
+    ///
+    /// \param queue Queue on which to execute
+    /// \return Event of the enqueued kernel, or a default-constructed
+    ///         event when there is nothing to do
+    ///
+    event exec(command_queue &queue)
+    {
+        if(m_count == 0) {
+            return event();
+        }
+
+        return exec_1d(queue, 0, m_count);
+    }
+
+private:
+    size_t m_count; // number of work items to launch
+};
+
+} //end detail namespace
+} //end compute namespace
+} //end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COMPACT_HPP
@@ -0,0 +1,190 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
+
+#include <iterator>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/async/future.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/iterator/discard_iterator.hpp>
+#include <boost/compute/memory/svm_ptr.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
+#include <boost/compute/detail/work_size.hpp>
+#include <boost/compute/detail/vendor.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Device-to-device copy kernel used for CPU devices
+///
+/// Every work item copies one contiguous block of elements. A single
+/// work item is used for up to 1024 elements, otherwise one per compute
+/// unit of the queue's device.
+///
+/// \param first Iterator to the start of the source range
+/// \param result Iterator to the start of the destination range
+/// \param count Number of elements to copy
+/// \param queue Queue on which to execute
+/// \return Event of the enqueued kernel
+///
+template<class InputIterator, class OutputIterator>
+inline event copy_on_device_cpu(InputIterator first,
+                                OutputIterator result,
+                                size_t count,
+                                command_queue &queue)
+{
+    meta_kernel k("copy");
+
+    // The block size is ceil(count / global_size), computed in integer
+    // arithmetic. A float-based ceil() cannot represent counts above 2^24
+    // exactly and may round the block size down, leaving trailing
+    // elements uncopied.
+    k <<
+        "uint global_size = (uint)get_global_size(0);\n" <<
+        "uint block = count / global_size + " <<
+            "((count % global_size) != 0 ? 1u : 0u);\n" <<
+        "uint index = get_global_id(0) * block;\n" <<
+        "uint end = min(count, index + block);\n" <<
+        "while(index < end){\n" <<
+            result[k.var<uint_>("index")] << '=' <<
+                first[k.var<uint_>("index")] << ";\n" <<
+            "index++;\n" <<
+        "}\n";
+
+    k.add_set_arg<const uint_>("count", static_cast<uint_>(count));
+
+    // small copies are not worth spreading over several work items
+    size_t global_work_size = queue.get_device().compute_units();
+    if(count <= 1024) global_work_size = 1;
+    return k.exec_1d(queue, 0, global_work_size);
+}
+
+///
+/// \brief Device-to-device copy kernel used for non-CPU devices
+///
+/// Each work item copies up to "vpt" elements, stepping by "tpb" elements
+/// between them. Both values come from the per-device parameter cache
+/// (defaults 4 and 128), keyed by the size of the input value type.
+///
+/// \param first Iterator to the start of the source range
+/// \param result Iterator to the start of the destination range
+/// \param count Number of elements to copy
+/// \param queue Queue on which to execute
+/// \return Event of the enqueued kernel
+///
+template<class InputIterator, class OutputIterator>
+inline event copy_on_device_gpu(InputIterator first,
+                                OutputIterator result,
+                                size_t count,
+                                command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
+
+    // look up the tuning parameters for this element size
+    const device& target = queue.get_device();
+    boost::shared_ptr<parameter_cache> tuning =
+        detail::parameter_cache::get_global_cache(target);
+    std::string tuning_key =
+        "__boost_copy_kernel_" + boost::lexical_cast<std::string>(sizeof(input_type));
+
+    uint_ values_per_thread = tuning->get(tuning_key, "vpt", 4);
+    uint_ threads_per_block = tuning->get(tuning_key, "tpb", 128);
+
+    meta_kernel k("copy");
+
+    // first element handled by this work item
+    k << "uint index = get_local_id(0) + "
+      << "(" << values_per_thread * threads_per_block
+      << " * get_group_id(0));\n";
+
+    // bounded, strided copy loop
+    k << "for(uint i = 0; i < " << values_per_thread << "; i++){\n"
+      << "    if(index < count){\n"
+      << result[k.var<uint_>("index")] << '='
+      << first[k.var<uint_>("index")] << ";\n"
+      << "       index += " << threads_per_block << ";\n    }\n}\n";
+
+    k.add_set_arg<const uint_>("count", static_cast<uint_>(count));
+
+    const size_t global_work_size =
+        calculate_work_size(count, values_per_thread, threads_per_block);
+    return k.exec_1d(queue, 0, global_work_size, threads_per_block);
+}
+
+///
+/// \brief Selects and runs the copy kernel suited to the queue's device
+///
+/// Returns a default-constructed event for an empty range. Otherwise uses
+/// copy_on_device_cpu() for CPU devices that are not Apple platform
+/// devices, and copy_on_device_gpu() for everything else.
+///
+/// \param first Iterator to the start of the source range
+/// \param last Iterator to the end of the source range
+/// \param result Iterator to the start of the destination range
+/// \param queue Queue on which to execute
+/// \return Event of the enqueued kernel, or an empty event
+///
+template<class InputIterator, class OutputIterator>
+inline event dispatch_copy_on_device(InputIterator first,
+                                     InputIterator last,
+                                     OutputIterator result,
+                                     command_queue &queue)
+{
+    const size_t element_count = detail::iterator_range_size(first, last);
+    if(element_count == 0){
+        // empty range: no kernel is enqueued
+        return event();
+    }
+
+    const device& target = queue.get_device();
+
+    // copy_on_device_cpu() is skipped for CPU devices on the Apple
+    // platform because of a bug in that platform's compiler.
+    // See https://github.com/boostorg/compute/pull/626
+    const bool use_cpu_kernel =
+        (target.type() & device::cpu) && !is_apple_platform_device(target);
+
+    if(!use_cpu_kernel)
+    {
+        return copy_on_device_gpu(first, result, element_count, queue);
+    }
+    return copy_on_device_cpu(first, result, element_count, queue);
+}
+
+///
+/// \brief Copies [first, last) to \p result on the device
+///
+/// \return Iterator to the end of the destination range
+///
+template<class InputIterator, class OutputIterator>
+inline OutputIterator copy_on_device(InputIterator first,
+                                     InputIterator last,
+                                     OutputIterator result,
+                                     command_queue &queue)
+{
+    // the event of the enqueued kernel is intentionally not kept
+    dispatch_copy_on_device(first, last, result, queue);
+
+    OutputIterator result_end = result + std::distance(first, last);
+    return result_end;
+}
+
+///
+/// \brief copy_on_device() overload for a discard_iterator destination
+///
+/// Nothing is enqueued; only the advanced iterator is computed.
+///
+/// \return \p result advanced by the length of [first, last)
+///
+template<class InputIterator>
+inline discard_iterator copy_on_device(InputIterator first,
+                                       InputIterator last,
+                                       discard_iterator result,
+                                       command_queue &queue)
+{
+    // the queue is part of the common signature but unused here
+    (void) queue;
+
+    discard_iterator result_end = result + std::distance(first, last);
+    return result_end;
+}
+
+///
+/// \brief Copies [first, last) to \p result on the device, returning a future
+///
+/// \return Future holding the end of the destination range together with
+///         the event returned by dispatch_copy_on_device()
+///
+template<class InputIterator, class OutputIterator>
+inline future<OutputIterator> copy_on_device_async(InputIterator first,
+                                                   InputIterator last,
+                                                   OutputIterator result,
+                                                   command_queue &queue)
+{
+    event copy_done = dispatch_copy_on_device(first, last, result, queue);
+
+    OutputIterator result_end = result + std::distance(first, last);
+    return make_future(result_end, copy_done);
+}
+
+#ifdef CL_VERSION_2_0
+// copy_on_device() specialization for svm_ptr
+///
+/// \brief copy_on_device() overload for shared-virtual-memory pointers
+///
+/// Copies the range with command_queue::enqueue_svm_memcpy(). An empty
+/// range enqueues nothing.
+///
+/// \return Pointer to the end of the destination range
+///
+template<class T>
+inline svm_ptr<T> copy_on_device(svm_ptr<T> first,
+                                 svm_ptr<T> last,
+                                 svm_ptr<T> result,
+                                 command_queue &queue)
+{
+    const size_t element_count = iterator_range_size(first, last);
+
+    if(element_count != 0){
+        const size_t byte_count = element_count * sizeof(T);
+        queue.enqueue_svm_memcpy(result.get(), first.get(), byte_count);
+        return result + element_count;
+    }
+
+    return result;
+}
+
+///
+/// \brief copy_on_device_async() overload for shared-virtual-memory pointers
+///
+/// Copies the range with command_queue::enqueue_svm_memcpy_async().
+///
+/// \return Future holding the end of the destination range; for an empty
+///         range a default-constructed future is returned
+///
+template<class T>
+inline future<svm_ptr<T> > copy_on_device_async(svm_ptr<T> first,
+                                                svm_ptr<T> last,
+                                                svm_ptr<T> result,
+                                                command_queue &queue)
+{
+    const size_t element_count = iterator_range_size(first, last);
+    if(element_count == 0){
+        // nothing to enqueue
+        return future<svm_ptr<T> >();
+    }
+
+    const size_t byte_count = element_count * sizeof(T);
+    event copy_done =
+        queue.enqueue_svm_memcpy_async(result.get(), first.get(), byte_count);
+
+    return make_future(result + element_count, copy_done);
+}
+#endif // CL_VERSION_2_0
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
@@ -0,0 +1,193 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_DEVICE_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_DEVICE_HPP
+
+#include <iterator>
+
+#include <boost/utility/addressof.hpp>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/async/future.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/memory/svm_ptr.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Copies the host range [first, last) into a device buffer
+///
+/// Writes starting at the address of \c *first with
+/// command_queue::enqueue_write_buffer(). An empty range enqueues nothing.
+///
+/// \param first Iterator to the start of the host range
+/// \param last Iterator to the end of the host range
+/// \param result Device iterator to the start of the destination
+/// \param queue Queue on which to execute
+/// \return Device iterator to the end of the destination range
+///
+template<class HostIterator, class DeviceIterator>
+inline DeviceIterator copy_to_device(HostIterator first,
+                                     HostIterator last,
+                                     DeviceIterator result,
+                                     command_queue &queue)
+{
+    typedef typename std::iterator_traits<DeviceIterator>::value_type value_type;
+    typedef typename std::iterator_traits<DeviceIterator>::difference_type difference_type;
+
+    const size_t element_count = iterator_range_size(first, last);
+    if(element_count == 0){
+        return result;
+    }
+
+    // position and extent of the destination region, in bytes
+    const size_t byte_offset = result.get_index() * sizeof(value_type);
+    const size_t byte_count = element_count * sizeof(value_type);
+
+    queue.enqueue_write_buffer(
+        result.get_buffer(), byte_offset, byte_count, ::boost::addressof(*first)
+    );
+
+    return result + static_cast<difference_type>(element_count);
+}
+
+///
+/// \brief Copies the host range [first, last) into a mapped device buffer
+///
+/// Maps the destination region with CL_MAP_WRITE, fills it with
+/// std::copy, then unmaps it and waits for the unmap event.
+///
+/// \param first Iterator to the start of the host range
+/// \param last Iterator to the end of the host range
+/// \param result Device iterator to the start of the destination
+/// \param queue Queue on which to execute
+/// \return Device iterator to the end of the destination range
+///
+template<class HostIterator, class DeviceIterator>
+inline DeviceIterator copy_to_device_map(HostIterator first,
+                                         HostIterator last,
+                                         DeviceIterator result,
+                                         command_queue &queue)
+{
+    typedef typename std::iterator_traits<DeviceIterator>::value_type value_type;
+    typedef typename std::iterator_traits<DeviceIterator>::difference_type difference_type;
+
+    const size_t element_count = iterator_range_size(first, last);
+    if(element_count == 0){
+        return result;
+    }
+
+    // position and extent of the destination region, in bytes
+    const size_t byte_offset = result.get_index() * sizeof(value_type);
+    const size_t byte_count = element_count * sizeof(value_type);
+
+    // map the destination region into host memory
+    void *mapping = queue.enqueue_map_buffer(
+        result.get_buffer(), CL_MAP_WRITE, byte_offset, byte_count
+    );
+
+    // fill the mapped region from [first; last)
+    std::copy(first, last, static_cast<value_type*>(mapping));
+
+    // release the mapping and wait until the unmap has finished
+    queue.enqueue_unmap_buffer(result.get_buffer(), mapping).wait();
+
+    return result + static_cast<difference_type>(element_count);
+}
+
+///
+/// \brief Copies the host range [first, last) to the device, returning a future
+///
+/// Uses command_queue::enqueue_write_buffer_async(), reading from the
+/// address of \c *first.
+///
+/// \return Future holding the end of the destination range; for an empty
+///         range a default-constructed future is returned
+///
+template<class HostIterator, class DeviceIterator>
+inline future<DeviceIterator> copy_to_device_async(HostIterator first,
+                                                   HostIterator last,
+                                                   DeviceIterator result,
+                                                   command_queue &queue)
+{
+    typedef typename std::iterator_traits<DeviceIterator>::value_type value_type;
+    typedef typename std::iterator_traits<DeviceIterator>::difference_type difference_type;
+
+    const size_t element_count = iterator_range_size(first, last);
+    if(element_count == 0){
+        // nothing to enqueue
+        return future<DeviceIterator>();
+    }
+
+    // position and extent of the destination region, in bytes
+    const size_t byte_offset = result.get_index() * sizeof(value_type);
+    const size_t byte_count = element_count * sizeof(value_type);
+
+    event write_done = queue.enqueue_write_buffer_async(
+        result.get_buffer(), byte_offset, byte_count, ::boost::addressof(*first)
+    );
+
+    DeviceIterator result_end = result + static_cast<difference_type>(element_count);
+    return make_future(result_end, write_done);
+}
+
+#ifdef CL_VERSION_2_0
+// copy_to_device() specialization for svm_ptr
+///
+/// \brief copy_to_device() overload for a shared-virtual-memory destination
+///
+/// Copies from the address of \c *first with
+/// command_queue::enqueue_svm_memcpy(). An empty range enqueues nothing.
+///
+/// \return Pointer to the end of the destination range
+///
+template<class HostIterator, class T>
+inline svm_ptr<T> copy_to_device(HostIterator first,
+                                 HostIterator last,
+                                 svm_ptr<T> result,
+                                 command_queue &queue)
+{
+    const size_t element_count = iterator_range_size(first, last);
+
+    if(element_count != 0){
+        const size_t byte_count = element_count * sizeof(T);
+        queue.enqueue_svm_memcpy(
+            result.get(), ::boost::addressof(*first), byte_count
+        );
+        return result + element_count;
+    }
+
+    return result;
+}
+
+///
+/// \brief copy_to_device_async() overload for a shared-virtual-memory destination
+///
+/// Copies from the address of \c *first with
+/// command_queue::enqueue_svm_memcpy_async().
+///
+/// \return Future holding the end of the destination range; for an empty
+///         range a default-constructed future is returned
+///
+template<class HostIterator, class T>
+inline future<svm_ptr<T> > copy_to_device_async(HostIterator first,
+                                                HostIterator last,
+                                                svm_ptr<T> result,
+                                                command_queue &queue)
+{
+    const size_t element_count = iterator_range_size(first, last);
+    if(element_count == 0){
+        // nothing to enqueue
+        return future<svm_ptr<T> >();
+    }
+
+    const size_t byte_count = element_count * sizeof(T);
+    event copy_done = queue.enqueue_svm_memcpy_async(
+        result.get(), ::boost::addressof(*first), byte_count
+    );
+
+    return make_future(result + element_count, copy_done);
+}
+
+///
+/// \brief copy_to_device_map() overload for a shared-virtual-memory destination
+///
+/// Maps the destination with CL_MAP_WRITE, fills it with std::copy, then
+/// unmaps it and waits for the unmap event.
+///
+/// \return Pointer to the end of the destination range
+///
+template<class HostIterator, class T>
+inline svm_ptr<T> copy_to_device_map(HostIterator first,
+                                              HostIterator last,
+                                              svm_ptr<T> result,
+                                              command_queue &queue)
+{
+    const size_t element_count = iterator_range_size(first, last);
+    if(element_count == 0){
+        return result;
+    }
+
+    // make the destination region writable from the host
+    const size_t byte_count = element_count * sizeof(T);
+    queue.enqueue_svm_map(result.get(), byte_count, CL_MAP_WRITE);
+
+    // fill the mapped region from [first; last)
+    T *destination = static_cast<T*>(result.get());
+    std::copy(first, last, destination);
+
+    // release the mapping and wait until the unmap has finished
+    queue.enqueue_svm_unmap(result.get()).wait();
+
+    return result + element_count;
+}
+#endif // CL_VERSION_2_0
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_DEVICE_HPP
@@ -0,0 +1,198 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_HOST_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_HOST_HPP
+
+#include <iterator>
+
+#include <boost/utility/addressof.hpp>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/async/future.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/memory/svm_ptr.hpp>
+#include <boost/compute/detail/iterator_plus_distance.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Copies the device range [first, last) to host memory
+///
+/// Reads into the address of \c *result with
+/// command_queue::enqueue_read_buffer(). An empty range enqueues nothing.
+///
+/// \param first Device iterator to the start of the source range
+/// \param last Device iterator to the end of the source range
+/// \param result Host iterator to the start of the destination
+/// \param queue Queue on which to execute
+/// \return Host iterator to the end of the destination range
+///
+template<class DeviceIterator, class HostIterator>
+inline HostIterator copy_to_host(DeviceIterator first,
+                                 DeviceIterator last,
+                                 HostIterator result,
+                                 command_queue &queue)
+{
+    typedef typename std::iterator_traits<DeviceIterator>::value_type value_type;
+
+    const size_t element_count = iterator_range_size(first, last);
+    if(element_count == 0){
+        return result;
+    }
+
+    // position and extent of the source region, in bytes
+    const size_t byte_offset = first.get_index() * sizeof(value_type);
+    const size_t byte_count = element_count * sizeof(value_type);
+
+    queue.enqueue_read_buffer(
+        first.get_buffer(), byte_offset, byte_count, ::boost::addressof(*result)
+    );
+
+    return iterator_plus_distance(result, element_count);
+}
+
+///
+/// \brief Copies the device range [first, last) to the host through a mapping
+///
+/// Maps the source region with CL_MAP_READ, copies it out with std::copy,
+/// then unmaps it and waits for the unmap event.
+///
+/// \param first Device iterator to the start of the source range
+/// \param last Device iterator to the end of the source range
+/// \param result Host iterator to the start of the destination
+/// \param queue Queue on which to execute
+/// \return Host iterator to the end of the destination range
+///
+template<class DeviceIterator, class HostIterator>
+inline HostIterator copy_to_host_map(DeviceIterator first,
+                                     DeviceIterator last,
+                                     HostIterator result,
+                                     command_queue &queue)
+{
+    typedef typename std::iterator_traits<DeviceIterator>::value_type value_type;
+    typedef typename std::iterator_traits<DeviceIterator>::difference_type difference_type;
+
+    const size_t element_count = iterator_range_size(first, last);
+    if(element_count == 0){
+        return result;
+    }
+
+    // position and extent of the source region, in bytes
+    const size_t byte_offset = first.get_index() * sizeof(value_type);
+    const size_t byte_count = element_count * sizeof(value_type);
+
+    // map the source region into host memory
+    void *mapping = queue.enqueue_map_buffer(
+        first.get_buffer(), CL_MAP_READ, byte_offset, byte_count
+    );
+
+    // copy the mapped elements to the destination
+    value_type *mapped_begin = static_cast<value_type*>(mapping);
+    value_type *mapped_end = mapped_begin + static_cast<difference_type>(element_count);
+    std::copy(mapped_begin, mapped_end, result);
+
+    // release the mapping and wait until the unmap has finished
+    queue.enqueue_unmap_buffer(first.get_buffer(), mapping).wait();
+
+    return iterator_plus_distance(result, element_count);
+}
+
+///
+/// \brief Copies the device range [first, last) to the host, returning a future
+///
+/// Uses command_queue::enqueue_read_buffer_async(), writing to the address
+/// of \c *result.
+///
+/// \return Future holding the end of the destination range; for an empty
+///         range a default-constructed future is returned
+///
+template<class DeviceIterator, class HostIterator>
+inline future<HostIterator> copy_to_host_async(DeviceIterator first,
+                                               DeviceIterator last,
+                                               HostIterator result,
+                                               command_queue &queue)
+{
+    typedef typename std::iterator_traits<DeviceIterator>::value_type value_type;
+
+    const size_t element_count = iterator_range_size(first, last);
+    if(element_count == 0){
+        // nothing to enqueue
+        return future<HostIterator>();
+    }
+
+    // position and extent of the source region, in bytes
+    const size_t byte_offset = first.get_index() * sizeof(value_type);
+    const size_t byte_count = element_count * sizeof(value_type);
+
+    event read_done = queue.enqueue_read_buffer_async(
+        first.get_buffer(), byte_offset, byte_count, ::boost::addressof(*result)
+    );
+
+    return make_future(iterator_plus_distance(result, element_count), read_done);
+}
+
+#ifdef CL_VERSION_2_0
+// copy_to_host() specialization for svm_ptr
+///
+/// \brief copy_to_host() overload for a shared-virtual-memory source
+///
+/// Copies into the address of \c *result with
+/// command_queue::enqueue_svm_memcpy(). An empty range enqueues nothing.
+///
+/// \param first Pointer to the start of the source range
+/// \param last Pointer to the end of the source range
+/// \param result Host iterator to the start of the destination
+/// \param queue Queue on which to execute
+/// \return Host iterator to the end of the destination range
+///
+template<class T, class HostIterator>
+inline HostIterator copy_to_host(svm_ptr<T> first,
+                                 svm_ptr<T> last,
+                                 HostIterator result,
+                                 command_queue &queue)
+{
+    size_t count = iterator_range_size(first, last);
+    if(count == 0){
+        return result;
+    }
+
+    queue.enqueue_svm_memcpy(
+        ::boost::addressof(*result), first.get(), count * sizeof(T)
+    );
+
+    // advance with iterator_plus_distance() like the other copy_to_host
+    // overloads in this header, instead of adding an unsigned size_t
+    // directly to the host iterator
+    return iterator_plus_distance(result, count);
+}
+
+///
+/// \brief copy_to_host_async() overload for a shared-virtual-memory source
+///
+/// Copies into the address of \c *result with
+/// command_queue::enqueue_svm_memcpy_async().
+///
+/// \return Future holding the end of the destination range; for an empty
+///         range a default-constructed future is returned
+///
+template<class T, class HostIterator>
+inline future<HostIterator> copy_to_host_async(svm_ptr<T> first,
+                                               svm_ptr<T> last,
+                                               HostIterator result,
+                                               command_queue &queue)
+{
+    const size_t element_count = iterator_range_size(first, last);
+    if(element_count == 0){
+        // nothing to enqueue
+        return future<HostIterator>();
+    }
+
+    const size_t byte_count = element_count * sizeof(T);
+    event copy_done = queue.enqueue_svm_memcpy_async(
+        ::boost::addressof(*result), first.get(), byte_count
+    );
+
+    return make_future(iterator_plus_distance(result, element_count), copy_done);
+}
+
+///
+/// \brief copy_to_host_map() overload for a shared-virtual-memory source
+///
+/// Maps the source with CL_MAP_READ, copies it out with std::copy, then
+/// unmaps it and waits for the unmap event.
+///
+/// \return Host iterator to the end of the destination range
+///
+template<class T, class HostIterator>
+inline HostIterator copy_to_host_map(svm_ptr<T> first,
+                                     svm_ptr<T> last,
+                                     HostIterator result,
+                                     command_queue &queue)
+{
+    const size_t element_count = iterator_range_size(first, last);
+    if(element_count == 0){
+        return result;
+    }
+
+    // make the source region readable from the host
+    const size_t byte_count = element_count * sizeof(T);
+    queue.enqueue_svm_map(first.get(), byte_count, CL_MAP_READ);
+
+    // copy [first; last) to the destination
+    T *source_begin = static_cast<T*>(first.get());
+    T *source_end = static_cast<T*>(last.get());
+    std::copy(source_begin, source_end, result);
+
+    // release the mapping and wait until the unmap has finished
+    queue.enqueue_svm_unmap(first.get()).wait();
+
+    return iterator_plus_distance(result, element_count);
+}
+#endif // CL_VERSION_2_0
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_HOST_HPP
@@ -0,0 +1,78 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_BALLOT_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_BALLOT_HPP
+
+#include <boost/compute/context.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/algorithm/reduce.hpp>
+#include <boost/compute/functional/detail/nvidia_ballot.hpp>
+#include <boost/compute/functional/detail/nvidia_popcount.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// \internal_
+/// Counts the elements of [\p first, \p last) for which \p predicate holds.
+///
+/// The input is processed in blocks of 32 work-items. Every work-item
+/// evaluates the predicate for its element, the per-block votes are combined
+/// with nvidia_ballot, and the first work-item of each block stores the
+/// nvidia_popcount of the resulting bit mask. The per-block counts are then
+/// summed with reduce().
+/// NOTE(review): the helper names suggest this path is NVIDIA-specific and
+/// that 32 is the warp size - confirm with the caller.
+///
+/// \return the number of matching elements; 0 for an empty range
+template<class InputIterator, class Predicate>
+inline size_t count_if_with_ballot(InputIterator first,
+                                   InputIterator last,
+                                   Predicate predicate,
+                                   command_queue &queue)
+{
+    size_t count = iterator_range_size(first, last);
+
+    // an empty range has no matches; returning early also avoids enqueuing
+    // a kernel with a zero-sized global range and returning a value that
+    // reduce() never wrote
+    if(count == 0){
+        return 0;
+    }
+
+    // number of blocks, rounded up so that a partial last block is covered
+    size_t block_size = 32;
+    size_t block_count = count / block_size;
+    if(block_count * block_size != count){
+        block_count++;
+    }
+
+    const ::boost::compute::context &context = queue.get_context();
+
+    // one partial count per block
+    ::boost::compute::vector<uint_> counts(block_count, context);
+
+    ::boost::compute::detail::nvidia_popcount<uint_> popc;
+    ::boost::compute::detail::nvidia_ballot<uint_> ballot;
+
+    meta_kernel k("count_if_with_ballot");
+    k <<
+        "const uint gid = get_global_id(0);\n" <<
+
+        // work-items past the end of the input vote "false"
+        "bool value = false;\n" <<
+        "if(gid < count)\n" <<
+        "    value = " << predicate(first[k.var<const uint_>("gid")]) << ";\n" <<
+
+        "uint bits = " << ballot(k.var<const uint_>("value")) << ";\n" <<
+
+        "if(get_local_id(0) == 0)\n" <<
+            counts.begin()[k.var<uint_>("get_group_id(0)") ]
+                << " = " << popc(k.var<uint_>("bits")) << ";\n";
+
+    k.add_set_arg<const uint_>("count", count);
+
+    k.exec_1d(queue, 0, block_size * block_count, block_size);
+
+    // sum the per-block counts
+    uint_ result = 0;
+    ::boost::compute::reduce(
+        counts.begin(),
+        counts.end(),
+        &result,
+        queue
+    );
+    return result;
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_BALLOT_HPP
@@ -0,0 +1,87 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_REDUCE_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_REDUCE_HPP
+
+#include <boost/compute/algorithm/reduce.hpp>
+#include <boost/compute/iterator/transform_iterator.hpp>
+#include <boost/compute/types/fundamental.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// \internal_
+/// Pairs a predicate with the argument it is to be applied to. The pair is
+/// turned into kernel source by the operator<< overload taking an
+/// invoked_countable_predicate.
+template<class Predicate, class Arg>
+struct invoked_countable_predicate
+{
+    Predicate predicate;
+    Arg arg;
+
+    invoked_countable_predicate(Predicate predicate_, Arg arg_)
+        : predicate(predicate_), arg(arg_)
+    {
+    }
+};
+
+/// \internal_
+/// Writes the invoked predicate to \p kernel as the parenthesized
+/// expression "(<predicate applied to arg> ? 1 : 0)".
+template<class Predicate, class Arg>
+inline meta_kernel& operator<<(meta_kernel &kernel,
+                               const invoked_countable_predicate<Predicate, Arg> &expr)
+{
+    kernel << "(";
+    kernel << expr.predicate(expr.arg);
+    kernel << " ? 1 : 0)";
+    return kernel;
+}
+
+// countable_predicate adapts Predicate for use with reduce(): calling it
+// does not return a bool but an invoked_countable_predicate, which is
+// emitted as a 1-or-0 expression, and result_type is declared as ulong_
+template<class Predicate>
+struct countable_predicate
+{
+    typedef ulong_ result_type;
+
+    Predicate m_predicate;
+
+    countable_predicate(Predicate predicate)
+        : m_predicate(predicate)
+    {
+    }
+
+    // binds the wrapped predicate to arg
+    template<class Arg>
+    invoked_countable_predicate<Predicate, Arg> operator()(const Arg &arg) const
+    {
+        typedef invoked_countable_predicate<Predicate, Arg> invoked_type;
+        return invoked_type(m_predicate, arg);
+    }
+};
+
+// returns how many elements of [first, last) match predicate: the input is
+// viewed through a transform iterator that maps each element to 1 or 0 and
+// those values are summed with reduce()
+template<class InputIterator, class Predicate>
+inline size_t count_if_with_reduce(InputIterator first,
+                                   InputIterator last,
+                                   Predicate predicate,
+                                   command_queue &queue)
+{
+    typedef countable_predicate<Predicate> counting_predicate;
+    counting_predicate as_one_or_zero(predicate);
+
+    ulong_ matches = 0;
+    ::boost::compute::reduce(
+        ::boost::compute::make_transform_iterator(first, as_one_or_zero),
+        ::boost::compute::make_transform_iterator(last, as_one_or_zero),
+        &matches,
+        ::boost::compute::plus<ulong_>(),
+        queue
+    );
+
+    return static_cast<size_t>(matches);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_REDUCE_HPP
@@ -0,0 +1,129 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP
+
+#include <numeric>
+
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/container/vector.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// \internal_
+/// Meta-kernel that counts predicate matches with one work-item per
+/// contiguous block of the input. Each work-item writes its count to a
+/// "counts" buffer; exec() copies that buffer to the host and sums it.
+///
+/// Usage: call set_args() once to generate the kernel source, then exec().
+template<class InputIterator, class Predicate>
+class count_if_with_threads_kernel : meta_kernel
+{
+public:
+    typedef typename
+        std::iterator_traits<InputIterator>::value_type
+        value_type;
+
+    count_if_with_threads_kernel()
+        : meta_kernel("count_if_with_threads"),
+          m_size(0),
+          m_size_arg(0),
+          m_counts_arg(0)
+    {
+    }
+
+    /// Records the size of [\p first, \p last), declares the "size" and
+    /// "counts" kernel arguments and generates the kernel source that
+    /// applies \p predicate to every element of the work-item's block.
+    void set_args(InputIterator first,
+                  InputIterator last,
+                  Predicate predicate)
+
+    {
+        typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+        m_size = detail::iterator_range_size(first, last);
+
+        m_size_arg = add_arg<const ulong_>("size");
+        m_counts_arg = add_arg<ulong_ *>(memory_object::global_memory, "counts");
+
+        *this <<
+            // thread parameters: every work-item gets size / global_size
+            // elements, the last one additionally takes the remainder
+            "const uint gid = get_global_id(0);\n" <<
+            "const uint block_size = size / get_global_size(0);\n" <<
+            "const uint start = block_size * gid;\n" <<
+            "uint end = 0;\n" <<
+            "if(gid == get_global_size(0) - 1)\n" <<
+            "    end = size;\n" <<
+            "else\n" <<
+            "    end = block_size * gid + block_size;\n" <<
+
+            // count values
+            "uint count = 0;\n" <<
+            "for(uint i = start; i < end; i++){\n" <<
+                decl<const T>("value") << "="
+                    << first[expr<uint_>("i")] << ";\n" <<
+                if_(predicate(var<const T>("value"))) << "{\n" <<
+                    "count++;\n" <<
+                "}\n" <<
+            "}\n" <<
+
+            // write count
+            "counts[gid] = count;\n";
+    }
+
+    /// Runs the kernel on \p queue and returns the total number of matches.
+    size_t exec(command_queue &queue)
+    {
+        const device &device = queue.get_device();
+        const context &context = queue.get_context();
+
+        // start with one work-item per compute unit
+        size_t threads = device.compute_units();
+
+        // if that would give each work-item fewer than minimum_block_size
+        // elements, use ceil(m_size / minimum_block_size) work-items
+        // instead (at least one). Computed with integer arithmetic so the
+        // result is exact and no floating-point helpers are needed.
+        const size_t minimum_block_size = 2048;
+        if(m_size / threads < minimum_block_size){
+            const size_t full_blocks = m_size / minimum_block_size;
+            const size_t partial_block =
+                (m_size % minimum_block_size != 0) ? 1 : 0;
+            threads = (std::max)(full_blocks + partial_block, size_t(1));
+        }
+
+        // storage for counts
+        ::boost::compute::vector<ulong_> counts(threads, context);
+
+        // exec kernel with one work-item per work-group
+        set_arg(m_size_arg, static_cast<ulong_>(m_size));
+        set_arg(m_counts_arg, counts.get_buffer());
+        exec_1d(queue, 0, threads, 1);
+
+        // copy counts to the host
+        std::vector<ulong_> host_counts(threads);
+        ::boost::compute::copy(counts.begin(), counts.end(), host_counts.begin(), queue);
+
+        // return sum of counts
+        return std::accumulate(host_counts.begin(), host_counts.end(), size_t(0));
+    }
+
+private:
+    size_t m_size;        // number of input elements, set by set_args()
+    size_t m_size_arg;    // index of the "size" kernel argument
+    size_t m_counts_arg;  // index of the "counts" kernel argument
+};
+
+// returns the number of values in [first, last) matching the predicate,
+// using count_if_with_threads_kernel (one thread per block of input). this
+// is optimized for cpu-type devices with a small number of compute units.
+template<class InputIterator, class Predicate>
+inline size_t count_if_with_threads(InputIterator first,
+                                    InputIterator last,
+                                    Predicate predicate,
+                                    command_queue &queue)
+{
+    typedef count_if_with_threads_kernel<InputIterator, Predicate> kernel_type;
+
+    kernel_type counter;
+    counter.set_args(first, last, predicate);
+
+    const size_t matches = counter.exec(queue);
+    return matches;
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP
@@ -0,0 +1,70 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_HPP
+
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/algorithm/detail/find_extrema_on_cpu.hpp>
+#include <boost/compute/algorithm/detail/find_extrema_with_reduce.hpp>
+#include <boost/compute/algorithm/detail/find_extrema_with_atomics.hpp>
+#include <boost/compute/algorithm/detail/serial_find_extrema.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// \internal_
+/// Chooses a find-extrema implementation for the device of \p queue and
+/// the size of [\p first, \p last), and returns its result.
+///
+/// Ranges with fewer than two elements return \p first directly. CPU
+/// devices use find_extrema_on_cpu(). On other devices inputs shorter than
+/// 512 elements are handled serially, then the reduce-based version is used
+/// if its requirements are met, and otherwise the atomics-based version
+/// (or the serial one when built against OpenCL 1.0 headers).
+template<class InputIterator, class Compare>
+inline InputIterator find_extrema(InputIterator first,
+                                  InputIterator last,
+                                  Compare compare,
+                                  const bool find_minimum,
+                                  command_queue &queue)
+{
+    const size_t input_size = iterator_range_size(first, last);
+
+    // empty and single-element ranges need no search
+    if(input_size < 2){
+        return first;
+    }
+
+    const device &target = queue.get_device();
+
+    // CPU
+    const bool runs_on_cpu = (target.type() & device::cpu) != 0;
+    if(runs_on_cpu){
+        return find_extrema_on_cpu(first, last, compare, find_minimum, queue);
+    }
+
+    // GPU: small inputs are searched serially
+    const size_t serial_limit = 512;
+    if(input_size < serial_limit){
+        return serial_find_extrema(first, last, compare, find_minimum, queue);
+    }
+
+    // the reduce-based version is only used when the device qualifies
+    if(find_extrema_with_reduce_requirements_met(first, last, queue)){
+        return find_extrema_with_reduce(first, last, compare, find_minimum, queue);
+    }
+
+    // OpenCL version 1.0 falls back to the serial method due to
+    // problems with atomic_cmpxchg()
+    #ifdef CL_VERSION_1_1
+        return find_extrema_with_atomics(first, last, compare, find_minimum, queue);
+    #else
+        return serial_find_extrema(first, last, compare, find_minimum, queue);
+    #endif
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_HPP
@@ -0,0 +1,138 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_ON_CPU_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_ON_CPU_HPP
+
+#include <algorithm>
+
+#include <boost/compute/algorithm/detail/find_extrema_with_reduce.hpp>
+#include <boost/compute/algorithm/detail/find_extrema_with_atomics.hpp>
+#include <boost/compute/algorithm/detail/serial_find_extrema.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// \internal_
+/// Finds the extremum of [\p first, \p last) on a CPU device.
+///
+/// Inputs shorter than a cached threshold (never less than twice the number
+/// of compute units) are passed to serial_find_extrema(). Otherwise one
+/// work-item per compute unit scans a contiguous block of the input and
+/// stores its best value and that value's index; serial_find_extrema() then
+/// picks the winner among those per-work-item results and the stored index
+/// of the winner is used to build the returned iterator.
+///
+/// \param compare      comparison emitted into the kernel source
+/// \param find_minimum when false the kernel is compiled with
+///                     -DBOOST_COMPUTE_FIND_MAXIMUM, which swaps the
+///                     arguments passed to \p compare
+template<class InputIterator, class Compare>
+inline InputIterator find_extrema_on_cpu(InputIterator first,
+                                         InputIterator last,
+                                         Compare compare,
+                                         const bool find_minimum,
+                                         command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
+    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
+    size_t count = iterator_range_size(first, last);
+
+    const device &device = queue.get_device();
+    const uint_ compute_units = device.compute_units();
+
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+    std::string cache_key =
+        "__boost_find_extrema_cpu_"
+            + boost::lexical_cast<std::string>(sizeof(input_type));
+
+    // for inputs smaller than serial_find_extrema_threshold
+    // serial_find_extrema algorithm is used
+    uint_ serial_find_extrema_threshold = parameters->get(
+        cache_key,
+        "serial_find_extrema_threshold",
+        16384 * sizeof(input_type)
+    );
+    serial_find_extrema_threshold =
+        (std::max)(serial_find_extrema_threshold, uint_(2 * compute_units));
+
+    const context &context = queue.get_context();
+    if(count < serial_find_extrema_threshold) {
+        return serial_find_extrema(first, last, compare, find_minimum, queue);
+    }
+
+    // per-work-item results: best value and the input index it came from
+    meta_kernel k("find_extrema_on_cpu");
+    buffer output(context, sizeof(input_type) * compute_units);
+    buffer output_idx(
+        context, sizeof(uint_) * compute_units,
+        buffer::read_write | buffer::alloc_host_ptr
+    );
+
+    size_t count_arg = k.add_arg<uint_>("count");
+    size_t output_arg =
+        k.add_arg<input_type *>(memory_object::global_memory, "output");
+    size_t output_idx_arg =
+        k.add_arg<uint_ *>(memory_object::global_memory, "output_idx");
+
+    k <<
+        // elements per work-item: ceil(count / work-items). Integer
+        // arithmetic keeps this exact; a float division rounds counts above
+        // 2^24 and could leave the tail of the input unscanned.
+        "const uint work_items = get_global_size(0);\n" <<
+        "uint block = count / work_items + " <<
+            "((count % work_items) != 0 ? 1 : 0);\n" <<
+        // a work-item whose block would start past the end of the input
+        // re-examines the last element instead of reading out of range
+        "uint index = min((uint)get_global_id(0) * block, count - 1);\n" <<
+        "uint end = min(count, index + block);\n" <<
+
+        "uint value_index = index;\n" <<
+        k.decl<input_type>("value") << " = " << first[k.var<uint_>("index")] << ";\n" <<
+
+        "index++;\n" <<
+        "while(index < end){\n" <<
+            k.decl<input_type>("candidate") <<
+                " = " << first[k.var<uint_>("index")] << ";\n" <<
+        "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
+            "bool compare = " << compare(k.var<input_type>("candidate"),
+                                         k.var<input_type>("value")) << ";\n" <<
+        "#else\n" <<
+            "bool compare = " << compare(k.var<input_type>("value"),
+                                         k.var<input_type>("candidate")) << ";\n" <<
+        "#endif\n" <<
+            "value = compare ? candidate : value;\n" <<
+            "value_index = compare ? index : value_index;\n" <<
+            "index++;\n" <<
+        "}\n" <<
+        "output[get_global_id(0)] = value;\n" <<
+        "output_idx[get_global_id(0)] = value_index;\n";
+
+    size_t global_work_size = compute_units;
+    std::string options;
+    if(!find_minimum){
+        options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
+    }
+    kernel kernel = k.compile(context, options);
+
+    kernel.set_arg(count_arg, static_cast<uint_>(count));
+    kernel.set_arg(output_arg, output);
+    kernel.set_arg(output_idx_arg, output_idx);
+    queue.enqueue_1d_range_kernel(kernel, 0, global_work_size, 0);
+
+    // pick the winner among the per-work-item results
+    buffer_iterator<input_type> result = serial_find_extrema(
+        make_buffer_iterator<input_type>(output),
+        make_buffer_iterator<input_type>(output, global_work_size),
+        compare,
+        find_minimum,
+        queue
+    );
+
+    // look up the input index that belongs to the winning result
+    uint_* output_idx_host_ptr =
+        static_cast<uint_*>(
+            queue.enqueue_map_buffer(
+                output_idx, command_queue::map_read,
+                0, global_work_size * sizeof(uint_)
+            )
+        );
+
+    const difference_type extremum_idx =
+        static_cast<difference_type>(*(output_idx_host_ptr + result.get_index()));
+
+    // the value has been read; release the mapping again
+    queue.enqueue_unmap_buffer(output_idx, output_idx_host_ptr);
+
+    return first + extremum_idx;
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_ON_CPU_HPP
@@ -0,0 +1,108 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_ATOMICS_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_ATOMICS_HPP
+
+#include <boost/compute/types.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/container/detail/scalar.hpp>
+#include <boost/compute/functional/atomic.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+/**
+ * \internal_
+ * Finds the extremum of [first, last) with a kernel that runs one work-item
+ * per element. A single uint "index" in global memory holds the index of the
+ * current best element; every work-item compares its own element with the
+ * element at that index and uses atomic_cmpxchg() to store its own id there
+ * while its element wins, or ties and has the lower index.
+ *
+ * \param first,last   input range
+ * \param compare      comparison emitted into the kernel source
+ * \param find_minimum when false the kernel is compiled with
+ *                     -DBOOST_COMPUTE_FIND_MAXIMUM
+ * \param queue        queue whose context the kernel is compiled in and on
+ *                     which it is run
+ * \return first advanced by the index read back from the index buffer
+ */
+template<class InputIterator, class Compare>
+inline InputIterator find_extrema_with_atomics(InputIterator first,
+                                               InputIterator last,
+                                               Compare compare,
+                                               const bool find_minimum,
+                                               command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
+
+    const context &context = queue.get_context();
+
+    meta_kernel k("find_extrema");
+    atomic_cmpxchg<uint_> atomic_cmpxchg_uint;
+    // kernel source: "old" is the element at the shared index, "new" is
+    // the element of this work-item
+    k <<
+        "const uint gid = get_global_id(0);\n" <<
+        "uint old_index = *index;\n" <<
+
+        k.decl<value_type>("old") <<
+            " = " << first[k.var<uint_>("old_index")] << ";\n" <<
+        k.decl<value_type>("new") <<
+            " = " << first[k.var<uint_>("gid")] << ";\n" <<
+        // the two while conditions differ only in the argument order
+        // passed to compare
+        k.decl<bool>("compare_result") << ";\n" <<
+        "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
+        "while(" <<
+            "(compare_result = " << compare(k.var<value_type>("old"),
+                                            k.var<value_type>("new")) << ")" <<
+            " || (!(compare_result" <<
+                      " || " << compare(k.var<value_type>("new"),
+                                        k.var<value_type>("old")) << ") "
+                  "&& gid < old_index)){\n" <<
+        "#else\n" <<
+        // while condition explained for minimum case with less (<)
+        // as comparison function:
+        // while(new_value < old_value
+        //       OR (new_value == old_value AND new_index < old_index))
+        "while(" <<
+            "(compare_result = " << compare(k.var<value_type>("new"),
+                                            k.var<value_type>("old"))  << ")" <<
+            " || (!(compare_result" <<
+                      " || " << compare(k.var<value_type>("old"),
+                                        k.var<value_type>("new")) << ") "
+                  "&& gid < old_index)){\n" <<
+        "#endif\n" <<
+        // loop body: try to replace old_index with gid; stop on success,
+        // otherwise reload the shared index and its element and test again
+        "  if(" << atomic_cmpxchg_uint(k.var<uint_ *>("index"),
+                                       k.var<uint_>("old_index"),
+                                       k.var<uint_>("gid")) << " == old_index)\n" <<
+        "      break;\n" <<
+        "  else\n" <<
+        "    old_index = *index;\n" <<
+        "old = " << first[k.var<uint_>("old_index")] << ";\n" <<
+        "}\n";
+    // "index" is the shared best-index cell, passed as a global uint pointer
+    size_t index_arg_index = k.add_arg<uint_ *>(memory_object::global_memory, "index");
+    // minimum/maximum search is selected when the kernel is compiled
+    std::string options;
+    if(!find_minimum){
+        options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
+    }
+    kernel kernel = k.compile(context, options);
+
+    // setup index buffer
+    scalar<uint_> index(context);
+    kernel.set_arg(index_arg_index, index.get_buffer());
+
+    // initialize index so that element 0 is the starting candidate
+    index.write(0, queue);
+
+    // run kernel with a global size equal to the number of input elements
+    size_t count = iterator_range_size(first, last);
+    queue.enqueue_1d_range_kernel(kernel, 0, count, 0);
+
+    // read index and return iterator
+    return first + static_cast<difference_type>(index.read(queue));
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_ATOMICS_HPP
@@ -0,0 +1,443 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
+
+#include <algorithm>
+
+#include <boost/compute/types.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/allocator/pinned_allocator.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
+#include <boost/compute/memory/local_buffer.hpp>
+#include <boost/compute/type_traits/type_name.hpp>
+#include <boost/compute/utility/program_cache.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// \internal_
+/// Tells whether the reduce-based find_extrema should be used on the device
+/// of \p queue for elements of InputIterator's value type.
+///
+/// The device must report dedicated local memory, and four times the local
+/// memory needed by one work-group (one uint index plus one value per
+/// work-item) must fit into the device's local memory. \p first and
+/// \p last are only used to deduce the iterator type.
+template<class InputIterator>
+bool find_extrema_with_reduce_requirements_met(InputIterator first,
+                                               InputIterator last,
+                                               command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
+
+    (void) first;
+    (void) last;
+
+    const device &target = queue.get_device();
+
+    // without dedicated local memory the reduction would be highly
+    // inefficient
+    const bool has_dedicated_local_memory =
+        target.get_info<CL_DEVICE_LOCAL_MEM_TYPE>() == CL_LOCAL;
+    if(!has_dedicated_local_memory){
+        return false;
+    }
+
+    // preferred work-group size from the parameter cache, capped by what
+    // the device supports
+    const std::string cache_key =
+        std::string("__boost_find_extrema_reduce_") + type_name<input_type>();
+    boost::shared_ptr<parameter_cache> tuning =
+        detail::parameter_cache::get_global_cache(target);
+    const size_t preferred_group_size = tuning->get(cache_key, "wgsize", 256);
+    const size_t device_group_limit =
+        target.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
+    const size_t group_size =
+        (std::min)(device_group_limit, preferred_group_size);
+
+    // each work-item needs room for one index and one value
+    const size_t bytes_per_work_item = sizeof(uint_) + sizeof(input_type);
+    const size_t bytes_per_group = bytes_per_work_item * group_size;
+
+    // local memory size in bytes (per compute unit)
+    const size_t local_memory_bytes =
+        target.get_info<CL_DEVICE_LOCAL_MEM_SIZE>();
+
+    // at least 4 work groups per compute unit otherwise reduction
+    // would be highly inefficient
+    return (bytes_per_group * 4) <= local_memory_bytes;
+}
+
+/// \internal_
+/// Runs one reduction pass: each work-group writes the first extremum (the
+/// one with the lowest index) of the elements it processed to \p result and
+/// that element's index to \p result_idx, at position get_group_id(0).
+/// If \p use_input_idx is false, it's assumed that input data is ordered by
+/// increasing index and \p input_idx is not used in the algorithm.
+template<class InputIterator, class ResultIterator, class Compare>
+inline void find_extrema_with_reduce(InputIterator input,
+                                     vector<uint_>::iterator input_idx,
+                                     size_t count,
+                                     ResultIterator result,
+                                     vector<uint_>::iterator result_idx,
+                                     size_t work_groups_no,
+                                     size_t work_group_size,
+                                     Compare compare,
+                                     const bool find_minimum,
+                                     const bool use_input_idx,
+                                     command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
+
+    const context &context = queue.get_context();
+    // kernel arguments: the element count and two local-memory scratch
+    // arrays, one for values ("block") and one for their indices
+    meta_kernel k("find_extrema_reduce");
+    size_t count_arg = k.add_arg<uint_>("count");
+    size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");
+    size_t block_idx_arg = k.add_arg<uint_ *>(memory_object::local_memory, "block_idx");
+
+    k <<
+        // Work item global id
+        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
+
+        // Index of element that will be read from input buffer
+        k.decl<uint_>("idx") << " = gid;\n" <<
+
+        k.decl<input_type>("acc") << ";\n" <<
+        k.decl<uint_>("acc_idx") << ";\n" <<
+        "if(gid < count) {\n" <<
+            // Real index of currently best element
+            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
+            k.var<uint_>("acc_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
+            "#else\n" <<
+            k.var<uint_>("acc_idx") << " = idx;\n" <<
+            "#endif\n" <<
+
+            // Init accumulator with first[get_global_id(0)]
+            "acc = " << input[k.var<uint_>("idx")] << ";\n" <<
+            "idx += get_global_size(0);\n" <<
+        "}\n" <<
+        // each work-item walks the input in steps of get_global_size(0),
+        // keeping its best element in acc and that element's index in acc_idx
+        k.decl<bool>("compare_result") << ";\n" <<
+        k.decl<bool>("equal") << ";\n\n" <<
+        "while( idx < count ){\n" <<
+            // Next element
+            k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" <<
+            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
+            k.decl<uint_>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
+            "#endif\n" <<
+
+            // Comparison between currently best element (acc) and next element
+            "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
+            "compare_result = " << compare(k.var<input_type>("next"),
+                                           k.var<input_type>("acc")) << ";\n" <<
+            "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
+            "equal = !compare_result && !" <<
+                compare(k.var<input_type>("acc"),
+                        k.var<input_type>("next")) << ";\n" <<
+            "# endif\n" <<
+            "#else\n" <<
+            "compare_result = " << compare(k.var<input_type>("acc"),
+                                           k.var<input_type>("next")) << ";\n" <<
+            "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
+            "equal = !compare_result && !" <<
+                compare(k.var<input_type>("next"),
+                        k.var<input_type>("acc")) << ";\n" <<
+            "# endif\n" <<
+            "#endif\n" <<
+
+            // save the winner
+            "acc = compare_result ? acc : next;\n" <<
+            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
+            "acc_idx = compare_result ? " <<
+                "acc_idx : " <<
+                "(equal ? min(acc_idx, next_idx) : next_idx);\n" <<
+            "#else\n" <<
+            "acc_idx = compare_result ? acc_idx : idx;\n" <<
+            "#endif\n" <<
+            "idx += get_global_size(0);\n" <<
+        "}\n\n" <<
+        // publish every work-item's best element to local memory
+        // Work item local id
+        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
+        "block[lid] = acc;\n" <<
+        "block_idx[lid] = acc_idx;\n" <<
+        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+        // group_offset: count minus this group's first global id; the
+        // reduction below only reads slots with lid + offset below it
+        k.decl<uint_>("group_offset") <<
+            " = count - (get_local_size(0) * get_group_id(0));\n\n";
+    // pairwise reduction in local memory, halving offset on every step
+    k <<
+        "#pragma unroll\n"
+        "for(" << k.decl<uint_>("offset") << " = " << uint_(work_group_size) << " / 2; offset > 0; " <<
+             "offset = offset / 2) {\n" <<
+             "if((lid < offset) && ((lid + offset) < group_offset)) { \n" <<
+                 k.decl<input_type>("mine") << " = block[lid];\n" <<
+                 k.decl<input_type>("other") << " = block[lid+offset];\n" <<
+                 "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
+                 "compare_result = " << compare(k.var<input_type>("other"),
+                                                k.var<input_type>("mine")) << ";\n" <<
+                 "equal = !compare_result && !" <<
+                     compare(k.var<input_type>("mine"),
+                             k.var<input_type>("other")) << ";\n" <<
+                 "#else\n" <<
+                 "compare_result = " << compare(k.var<input_type>("mine"),
+                                                k.var<input_type>("other")) << ";\n" <<
+                 "equal = !compare_result && !" <<
+                     compare(k.var<input_type>("other"),
+                             k.var<input_type>("mine")) << ";\n" <<
+                 "#endif\n" <<
+                 "block[lid] = compare_result ? mine : other;\n" <<
+                 k.decl<uint_>("mine_idx") << " = block_idx[lid];\n" <<
+                 k.decl<uint_>("other_idx") << " = block_idx[lid+offset];\n" <<
+                 "block_idx[lid] = compare_result ? " <<
+                     "mine_idx : " <<
+                     "(equal ? min(mine_idx, other_idx) : other_idx);\n" <<
+             "}\n"
+             "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+        "}\n\n" <<
+
+         // write block result to global output
+        "if(lid == 0){\n" <<
+            result[k.var<uint_>("get_group_id(0)")] << " = block[0];\n" <<
+            result_idx[k.var<uint_>("get_group_id(0)")] << " = block_idx[0];\n" <<
+        "}";
+    // maximum search and use of input_idx are selected at kernel compile time
+    std::string options;
+    if(!find_minimum){
+        options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
+    }
+    if(use_input_idx){
+        options += " -DBOOST_COMPUTE_USE_INPUT_IDX";
+    }
+
+    kernel kernel = k.compile(context, options);
+    // both local scratch arrays hold work_group_size entries
+    kernel.set_arg(count_arg, static_cast<uint_>(count));
+    kernel.set_arg(block_arg, local_buffer<input_type>(work_group_size));
+    kernel.set_arg(block_idx_arg, local_buffer<uint_>(work_group_size));
+    // launch work_groups_no work-groups of work_group_size work-items each
+    queue.enqueue_1d_range_kernel(kernel,
+                                  0,
+                                  work_groups_no * work_group_size,
+                                  work_group_size);
+}
+
+/// Overload of find_extrema_with_reduce() for callers that have no separate
+/// buffer of input indices. It forwards to the full overload with the
+/// input-index flag set to \c false; the full overload still needs an index
+/// iterator argument, so \p result_idx is passed as a placeholder.
+template<class InputIterator, class ResultIterator, class Compare>
+inline void find_extrema_with_reduce(InputIterator input,
+                                     size_t count,
+                                     ResultIterator result,
+                                     vector<uint_>::iterator result_idx,
+                                     size_t work_groups_no,
+                                     size_t work_group_size,
+                                     Compare compare,
+                                     const bool find_minimum,
+                                     command_queue &queue)
+{
+    // placeholder only: it will not be used since input indices are disabled
+    buffer_iterator<uint_> unused_input_idx = result_idx;
+    const bool use_input_idx = false;
+
+    find_extrema_with_reduce(
+        input, unused_input_idx, count, result, result_idx,
+        work_groups_no, work_group_size, compare, find_minimum,
+        use_input_idx, queue
+    );
+}
+
+///
+/// \brief Finds the extremum of [\p first, \p last) with a two-phase reduction.
+///
+/// Phase I runs the reduction kernel over the input with several work-groups;
+/// each work-group stores one candidate value and its index. Phase II runs the
+/// kernel once more with a single work-group over those candidates and writes
+/// the final value and index into pinned one-element buffers, of which only
+/// the index is read back on the host.
+///
+/// The work-group size and the number of work-groups per compute unit are
+/// read from the device's parameter cache ("wgsize", default 256, and
+/// "wgpcu", default 100).
+///
+/// \param first start of the input range
+/// \param last end of the input range
+/// \param compare comparison function forwarded to the reduction kernel
+/// \param find_minimum \c true to search for the minimum, \c false for the
+///        maximum
+/// \param queue command queue used for all device operations
+///
+/// \return iterator to the extremum, or \p last if the range is empty
+///
+template<class InputIterator, class Compare>
+InputIterator find_extrema_with_reduce(InputIterator first,
+                                       InputIterator last,
+                                       Compare compare,
+                                       const bool find_minimum,
+                                       command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
+    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
+
+    const size_t count = detail::iterator_range_size(first, last);
+
+    // nothing to reduce; also avoids launching a kernel with zero work-items
+    if(count == 0){
+        return last;
+    }
+
+    const context &context = queue.get_context();
+    const device &device = queue.get_device();
+
+    // Getting information about used queue and device
+    const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
+    const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
+
+    std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
+        + type_name<input_type>();
+
+    // load parameters
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    // get preferred work group size and preferred number
+    // of work groups per compute unit
+    size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
+    size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 100);
+
+    // calculate work group size and number of work groups
+    work_group_size = (std::min)(max_work_group_size, work_group_size);
+
+    // ceiling division done in integers: converting count to float would
+    // lose precision for ranges larger than 2^24 elements
+    const size_t work_groups_for_count =
+        count / work_group_size + ((count % work_group_size != 0) ? 1 : 0);
+    const size_t work_groups_no = (std::min)(
+        compute_units_no * work_groups_per_cu,
+        work_groups_for_count
+    );
+
+    // phase I: finding candidates for extremum
+
+    // device buffers for extremum candidates and their indices
+    // each work-group computes its candidate
+    vector<input_type> candidates(work_groups_no, context);
+    vector<uint_> candidates_idx(work_groups_no, context);
+
+    // finding candidates for first extremum and their indices
+    find_extrema_with_reduce(
+        first, count, candidates.begin(), candidates_idx.begin(),
+        work_groups_no, work_group_size, compare, find_minimum, queue
+    );
+
+    // phase II: finding extremum from among the candidates
+
+    // zero-copy buffers for final result (value and index)
+    vector<input_type, ::boost::compute::pinned_allocator<input_type> >
+        result(1, context);
+    vector<uint_, ::boost::compute::pinned_allocator<uint_> >
+        result_idx(1, context);
+
+    // get extremum from among the candidates; a single work-group is used
+    // and the candidates' stored indices are passed as input indices
+    find_extrema_with_reduce(
+        candidates.begin(), candidates_idx.begin(), work_groups_no, result.begin(),
+        result_idx.begin(), 1, work_group_size, compare, find_minimum, true, queue
+    );
+
+    // mapping extremum index to host
+    uint_* result_idx_host_ptr =
+        static_cast<uint_*>(
+            queue.enqueue_map_buffer(
+                result_idx.get_buffer(), command_queue::map_read,
+                0, sizeof(uint_)
+            )
+        );
+
+    const difference_type extremum_offset =
+        static_cast<difference_type>(*result_idx_host_ptr);
+
+    // release the mapping before the pinned buffer goes out of scope
+    queue.enqueue_unmap_buffer(result_idx.get_buffer(), result_idx_host_ptr);
+
+    return first + extremum_offset;
+}
+
+///
+/// \brief Finds the extremum of [\p first, \p last) when the comparison is
+///        ::boost::compute::less.
+///
+/// Phase I is the same device reduction as in the generic overload, except
+/// that the per-work-group candidates are written to pinned buffers. Phase II
+/// is then done serially on the host over the mapped candidates using the
+/// built-in \c <, \c > and \c == operators, so no second kernel is launched.
+/// When several candidates hold the same value the one with the smallest
+/// index is chosen.
+///
+/// The work-group size and the number of work-groups per compute unit are
+/// read from the device's parameter cache ("wgsize", default 256, and
+/// "wgpcu", default 64).
+///
+/// \param first start of the input range
+/// \param last end of the input range
+/// \param compare comparison function forwarded to the reduction kernel
+/// \param find_minimum \c true to search for the minimum, \c false for the
+///        maximum
+/// \param queue command queue used for all device operations
+///
+/// \return iterator to the extremum, or \p last if the range is empty
+///
+template<class InputIterator>
+InputIterator find_extrema_with_reduce(InputIterator first,
+                                       InputIterator last,
+                                       ::boost::compute::less<
+                                           typename std::iterator_traits<
+                                               InputIterator
+                                           >::value_type
+                                       >
+                                       compare,
+                                       const bool find_minimum,
+                                       command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
+    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
+
+    const size_t count = detail::iterator_range_size(first, last);
+
+    // nothing to reduce; also keeps the host loop below from reading a
+    // candidate that was never written
+    if(count == 0){
+        return last;
+    }
+
+    const context &context = queue.get_context();
+    const device &device = queue.get_device();
+
+    // Getting information about used queue and device
+    const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
+    const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
+
+    std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
+        + type_name<input_type>();
+
+    // load parameters
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    // get preferred work group size and preferred number
+    // of work groups per compute unit
+    size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
+    size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 64);
+
+    // calculate work group size and number of work groups
+    work_group_size = (std::min)(max_work_group_size, work_group_size);
+
+    // ceiling division done in integers: converting count to float would
+    // lose precision for ranges larger than 2^24 elements
+    const size_t work_groups_for_count =
+        count / work_group_size + ((count % work_group_size != 0) ? 1 : 0);
+    const size_t work_groups_no = (std::min)(
+        compute_units_no * work_groups_per_cu,
+        work_groups_for_count
+    );
+
+    // phase I: finding candidates for extremum
+
+    // device buffers for extremum candidates and their indices
+    // each work-group computes its candidate
+    // zero-copy buffers are used to eliminate copying data back to host
+    vector<input_type, ::boost::compute::pinned_allocator<input_type> >
+        candidates(work_groups_no, context);
+    vector<uint_, ::boost::compute::pinned_allocator <uint_> >
+        candidates_idx(work_groups_no, context);
+
+    // finding candidates for first extremum and their indices
+    find_extrema_with_reduce(
+        first, count, candidates.begin(), candidates_idx.begin(),
+        work_groups_no, work_group_size, compare, find_minimum, queue
+    );
+
+    // phase II: finding extremum from among the candidates
+
+    // mapping candidates and their indices to host
+    input_type* candidates_host_ptr =
+        static_cast<input_type*>(
+            queue.enqueue_map_buffer(
+                candidates.get_buffer(), command_queue::map_read,
+                0, work_groups_no * sizeof(input_type)
+            )
+        );
+
+    uint_* candidates_idx_host_ptr =
+        static_cast<uint_*>(
+            queue.enqueue_map_buffer(
+                candidates_idx.get_buffer(), command_queue::map_read,
+                0, work_groups_no * sizeof(uint_)
+            )
+        );
+
+    // find extremum (serial) from among the candidates on host
+    input_type extremum = candidates_host_ptr[0];
+    uint_ extremum_idx = candidates_idx_host_ptr[0];
+    for(size_t group = 1; group < work_groups_no; group++){
+        const input_type next = candidates_host_ptr[group];
+        const uint_ next_idx = candidates_idx_host_ptr[group];
+
+        const bool is_better =
+            find_minimum ? (next < extremum) : (next > extremum);
+        if(is_better){
+            extremum = next;
+            extremum_idx = next_idx;
+        }
+        else if(next == extremum && next_idx < extremum_idx){
+            // equal values: prefer the element that comes first in the input
+            extremum_idx = next_idx;
+        }
+    }
+
+    // release both mappings before the pinned buffers go out of scope
+    queue.enqueue_unmap_buffer(candidates.get_buffer(), candidates_host_ptr);
+    queue.enqueue_unmap_buffer(
+        candidates_idx.get_buffer(), candidates_idx_host_ptr
+    );
+
+    return first + static_cast<difference_type>(extremum_idx);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
@@ -0,0 +1,212 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
+
+#include <iterator>
+
+#include <boost/compute/types.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/container/detail/scalar.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/type_traits/type_name.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief find_if() kernel that checks exactly one value per work-item.
+///
+/// One work-item is launched per input element. Every work-item whose value
+/// satisfies \p predicate performs an atomic minimum on a shared index, which
+/// is initialised to \p count, so the index of the first match is left there.
+///
+/// \param first start of the input range
+/// \param last end of the input range (not used; \p count gives the size)
+/// \param predicate unary predicate applied to each value
+/// \param count number of elements in the range
+/// \param queue command queue used to run the kernel
+///
+/// \return iterator to the first matching element, or \p first + \p count
+///         when no element matches
+///
+template<class InputIterator, class UnaryPredicate>
+inline InputIterator find_if_with_atomics_one_vpt(InputIterator first,
+                                                  InputIterator last,
+                                                  UnaryPredicate predicate,
+                                                  const size_t count,
+                                                  command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
+
+    (void) last;
+
+    const context &context = queue.get_context();
+
+    detail::meta_kernel k("find_if");
+    // declared as uint_ * so that it matches the scalar<uint_> bound to it
+    // and the unsigned atomic_min used on it below
+    size_t index_arg = k.add_arg<uint_ *>(memory_object::global_memory, "index");
+    atomic_min<uint_> atomic_min_uint;
+
+    k << k.decl<const uint_>("i") << " = get_global_id(0);\n"
+      << k.decl<const value_type>("value") << "="
+      <<     first[k.var<const uint_>("i")] << ";\n"
+      << "if(" << predicate(k.var<const value_type>("value")) << "){\n"
+      << "    " << atomic_min_uint(k.var<uint_ *>("index"), k.var<uint_>("i")) << ";\n"
+      << "}\n";
+
+    kernel kernel = k.compile(context);
+
+    scalar<uint_> index(context);
+    kernel.set_arg(index_arg, index.get_buffer());
+
+    // initialize index to the last iterator's index
+    index.write(static_cast<uint_>(count), queue);
+    queue.enqueue_1d_range_kernel(kernel, 0, count, 0);
+
+    // read index and return iterator
+    return first + static_cast<difference_type>(index.read(queue));
+}
+
+///
+/// \brief find_if() kernel in which every work-item checks up to \p vpt values.
+///
+/// A shared index, initialised to \p count, receives the atomic minimum of
+/// the positions at which \p predicate holds. Two kernel variants exist:
+///  - on GPU devices each work-group covers a tile of (local size * vpt)
+///    values and its work-items step through it with a stride equal to the
+///    local size; a work-group returns early when the index found so far is
+///    already smaller than its position;
+///  - on other devices each work-item scans \p vpt consecutive values and
+///    stops as soon as the shared index is not greater than its position.
+///
+/// \param first start of the input range
+/// \param last end of the input range (not used; \p count gives the size)
+/// \param predicate unary predicate applied to each value
+/// \param count number of elements in the range
+/// \param vpt number of values handled per work-item; must be non-zero
+/// \param queue command queue used to run the kernel
+///
+/// \return iterator to the first matching element, or \p first + \p count
+///         when no element matches
+///
+template<class InputIterator, class UnaryPredicate>
+inline InputIterator find_if_with_atomics_multiple_vpt(InputIterator first,
+                                                       InputIterator last,
+                                                       UnaryPredicate predicate,
+                                                       const size_t count,
+                                                       const size_t vpt,
+                                                       command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
+
+    (void) last;
+
+    const context &context = queue.get_context();
+    const device &device = queue.get_device();
+
+    detail::meta_kernel k("find_if");
+    size_t index_arg = k.add_arg<uint_ *>(memory_object::global_memory, "index");
+    size_t count_arg = k.add_arg<const uint_>("count");
+    size_t vpt_arg = k.add_arg<const uint_>("vpt");
+    atomic_min<uint_> atomic_min_uint;
+
+    // for GPUs reads from global memory are coalesced
+    if(device.type() & device::gpu) {
+        k <<
+            k.decl<const uint_>("lsize") << " = get_local_size(0);\n" <<
+            k.decl<uint_>("id") << " = get_local_id(0) + get_group_id(0) * lsize * vpt;\n" <<
+            k.decl<const uint_>("end") << " = min(" <<
+                    "id + (lsize *" << k.var<uint_>("vpt") << ")," <<
+                    "count" <<
+            ");\n" <<
+
+            // checking if the index is already found
+            "__local uint local_index;\n" <<
+            "if(get_local_id(0) == 0){\n" <<
+            "    local_index = *index;\n " <<
+            "};\n" <<
+            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+            "if(local_index < id){\n" <<
+            "    return;\n" <<
+            "}\n" <<
+
+            "while(id < end){\n" <<
+            "    " << k.decl<const value_type>("value") << " = " <<
+                      first[k.var<const uint_>("id")] << ";\n"
+            "    if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
+            "        " << atomic_min_uint(k.var<uint_ *>("index"),
+                                          k.var<uint_>("id")) << ";\n" <<
+            "        return;\n"
+            "    }\n" <<
+            "    id+=lsize;\n" <<
+            "}\n";
+    // for CPUs (and other devices) reads are ordered so the big cache is
+    // efficiently used.
+    } else {
+        k <<
+            k.decl<uint_>("id") << " = get_global_id(0) * " << k.var<uint_>("vpt") << ";\n" <<
+            k.decl<const uint_>("end") << " = min(" <<
+                    "id + " << k.var<uint_>("vpt") << "," <<
+                    "count" <<
+            ");\n" <<
+            "while(id < end && (*index) > id){\n" <<
+            "    " << k.decl<const value_type>("value") << " = " <<
+                      first[k.var<const uint_>("id")] << ";\n"
+            "    if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
+            "        " << atomic_min_uint(k.var<uint_ *>("index"),
+                                          k.var<uint_>("id")) << ";\n" <<
+            "        return;\n" <<
+            "    }\n" <<
+            "    id++;\n" <<
+            "}\n";
+    }
+
+    kernel kernel = k.compile(context);
+
+    scalar<uint_> index(context);
+    kernel.set_arg(index_arg, index.get_buffer());
+    kernel.set_arg(count_arg, static_cast<uint_>(count));
+    kernel.set_arg(vpt_arg, static_cast<uint_>(vpt));
+
+    // initialize index to the last iterator's index
+    index.write(static_cast<uint_>(count), queue);
+
+    // number of work-items = ceil(count / vpt), computed in integers:
+    // float(count) is inexact above 2^24 and could round the result down,
+    // leaving the last elements of the range unchecked
+    const size_t global_wg_size =
+        count / vpt + ((count % vpt != 0) ? 1 : 0);
+    queue.enqueue_1d_range_kernel(kernel, 0, global_wg_size, 0);
+
+    // read index and return iterator
+    return first + static_cast<difference_type>(index.read(queue));
+}
+
+///
+/// \brief Finds the first element of [\p first, \p last) satisfying
+///        \p predicate, choosing a kernel variant based on the device.
+///
+/// On GPU devices ranges of at most "one_vpt_threshold" elements (parameter
+/// cache value, default 1048576) use the one-value-per-work-item kernel;
+/// larger ranges use the multiple-values kernel with the cached "vpt"
+/// parameter (default 32). On other devices the range is split evenly across
+/// the device's compute units.
+///
+/// \param first start of the input range
+/// \param last end of the input range
+/// \param predicate unary predicate applied to each value
+/// \param queue command queue used to run the kernel
+///
+/// \return iterator to the first matching element, or \p last if the range
+///         is empty or nothing matches
+///
+template<class InputIterator, class UnaryPredicate>
+inline InputIterator find_if_with_atomics(InputIterator first,
+                                          InputIterator last,
+                                          UnaryPredicate predicate,
+                                          command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+    size_t count = detail::iterator_range_size(first, last);
+    if(count == 0){
+        return last;
+    }
+
+    const device &device = queue.get_device();
+
+    // load cached parameters
+    std::string cache_key = std::string("__boost_find_if_with_atomics_")
+        + type_name<value_type>();
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    // for relatively small inputs on GPUs kernel checking one value per thread
+    // (work-item) is more efficient than its multiple values per thread version
+    if(device.type() & device::gpu){
+        const size_t one_vpt_threshold =
+            parameters->get(cache_key, "one_vpt_threshold", 1048576);
+        if(count <= one_vpt_threshold){
+            return find_if_with_atomics_one_vpt(
+                first, last, predicate, count, queue
+            );
+        }
+    }
+
+    // values per thread
+    size_t vpt;
+    if(device.type() & device::gpu){
+        // get vpt parameter
+        vpt = parameters->get(cache_key, "vpt", 32);
+    } else {
+        // for CPUs work is split equally between compute units
+        const size_t max_compute_units =
+            device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
+
+        // ceil(count / max_compute_units) in integer arithmetic; going
+        // through float loses precision above 2^24 elements and could make
+        // vpt too small to cover the whole range
+        vpt = count / max_compute_units
+            + ((count % max_compute_units != 0) ? 1 : 0);
+    }
+
+    return find_if_with_atomics_multiple_vpt(
+        first, last, predicate, count, vpt, queue
+    );
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
@@ -0,0 +1,136 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP
+
+#include <iterator>
+
+#include <boost/utility/result_of.hpp>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/memory/local_buffer.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Reduces [\p first, \p last) with \p function and stores the result
+///        in the first element of the range.
+///
+/// The reduction runs in passes. In each pass every work-item combines up to
+/// eight consecutive values, each work-group of 64 work-items combines those
+/// partial results in local memory, and one value per work-group is written
+/// out. The passes alternate between the input buffer and a temporary
+/// vector until a single value remains; if that value ends up in the
+/// temporary vector it is copied back to \p first.
+///
+/// Ranges with fewer than two elements are left untouched.
+///
+/// NOTE(review): the kernel is bound to \c first.get_buffer() and indexes it
+/// from element 0, so this looks like it expects \p first to refer to the
+/// start of its buffer - confirm against callers.
+///
+/// \param first start of the range to reduce
+/// \param last end of the range to reduce
+/// \param function binary function used to combine two values
+/// \param queue command queue used to run the kernels
+///
+template<class Iterator, class BinaryFunction>
+inline void inplace_reduce(Iterator first,
+                           Iterator last,
+                           BinaryFunction function,
+                           command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<Iterator>::value_type
+        value_type;
+
+    size_t input_size = iterator_range_size(first, last);
+    if(input_size < 2){
+        return;
+    }
+
+    const context &context = queue.get_context();
+
+    const size_t block_size = 64;
+    const size_t values_per_thread = 8;
+    const size_t values_per_block = block_size * values_per_thread;
+
+    // number of work-groups needed to cover the input (ceiling division)
+    size_t block_count = input_size / values_per_block;
+    if(input_size % values_per_block != 0)
+        block_count++;
+
+    vector<value_type> output(block_count, context);
+
+    meta_kernel k("inplace_reduce");
+    size_t input_arg = k.add_arg<value_type *>(memory_object::global_memory, "input");
+    size_t input_size_arg = k.add_arg<const uint_>("input_size");
+    size_t output_arg = k.add_arg<value_type *>(memory_object::global_memory, "output");
+    size_t scratch_arg = k.add_arg<value_type *>(memory_object::local_memory, "scratch");
+    k <<
+        "const uint gid = get_global_id(0);\n" <<
+        "const uint lid = get_local_id(0);\n" <<
+        "const uint values_per_thread =\n"
+            << uint_(values_per_thread) << ";\n" <<
+
+        // thread reduce
+        "const uint index = gid * values_per_thread;\n" <<
+        "if(index < input_size){\n" <<
+            k.decl<value_type>("sum") << " = input[index];\n" <<
+            "for(uint i = 1;\n" <<
+                 "i < values_per_thread && (index + i) < input_size;\n" <<
+                 "i++){\n" <<
+            "    sum = " <<
+                     function(k.var<value_type>("sum"),
+                              k.var<value_type>("input[index+i]")) << ";\n" <<
+            "}\n" <<
+            "scratch[lid] = sum;\n" <<
+        "}\n" <<
+
+        // local reduce
+        "for(uint i = 1; i < get_local_size(0); i <<= 1){\n" <<
+        "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+        "    uint mask = (i << 1) - 1;\n" <<
+        "    uint next_index = (gid + i) * values_per_thread;\n"
+        "    if((lid & mask) == 0 && next_index < input_size){\n" <<
+        "        scratch[lid] = " <<
+                     function(k.var<value_type>("scratch[lid]"),
+                              k.var<value_type>("scratch[lid+i]")) << ";\n" <<
+        "    }\n" <<
+        "}\n" <<
+
+        // write output for block
+        "if(lid == 0){\n" <<
+        "    output[get_group_id(0)] = scratch[0];\n" <<
+        "}\n"
+        ;
+
+    const buffer *input_buffer = &first.get_buffer();
+    const buffer *output_buffer = &output.get_buffer();
+
+    kernel kernel = k.compile(context);
+
+    while(input_size > 1){
+        kernel.set_arg(input_arg, *input_buffer);
+        kernel.set_arg(input_size_arg, static_cast<uint_>(input_size));
+        kernel.set_arg(output_arg, *output_buffer);
+        kernel.set_arg(scratch_arg, local_buffer<value_type>(block_size));
+
+        queue.enqueue_1d_range_kernel(kernel,
+                                      0,
+                                      block_count * block_size,
+                                      block_size);
+
+        // each work-group wrote exactly one value, so the next pass has
+        // block_count inputs; using the integer count (rather than a float
+        // ceiling) keeps it in step with what was actually launched
+        input_size = block_count;
+
+        block_count = input_size / values_per_block;
+        if(input_size % values_per_block != 0)
+            block_count++;
+
+        // the buffer just written becomes the input of the next pass
+        std::swap(input_buffer, output_buffer);
+    }
+
+    // after the final swap input_buffer designates the buffer holding the
+    // result; copy it back when that is the temporary vector
+    if(input_buffer != &first.get_buffer()){
+        ::boost::compute::copy(output.begin(),
+                               output.begin() + 1,
+                               first,
+                               queue);
+    }
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP
@@ -0,0 +1,165 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_INSERTION_SORT_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_INSERTION_SORT_HPP
+
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/program.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/memory/local_buffer.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Sorts [\p first, \p last) with an insertion sort executed as a single
+/// task: the kernel copies the range into local memory, sorts it there
+/// according to \p compare, and copies it back. Ranges with fewer than two
+/// elements are returned unchanged.
+template<class Iterator, class Compare>
+inline void serial_insertion_sort(Iterator first,
+                                  Iterator last,
+                                  Compare compare,
+                                  command_queue &queue)
+{
+    typedef typename std::iterator_traits<Iterator>::value_type element_type;
+
+    const size_t n = iterator_range_size(first, last);
+    if(n < 2){
+        return;
+    }
+
+    const context &context = queue.get_context();
+
+    meta_kernel k("serial_insertion_sort");
+    const size_t data_arg_index =
+        k.add_arg<element_type *>(memory_object::local_memory, "data");
+    const size_t n_arg_index = k.add_arg<uint_>("n");
+
+    // stage 1: load the range into local memory
+    k << "for(uint i = 0; i < n; i++){\n"
+      << "    data[i] = " << first[k.var<uint_>("i")] << ";\n"
+      << "}\n";
+
+    // stage 2: insertion sort on the local copy
+    k << "for(uint i = 1; i < n; i++){\n"
+      << "    " << k.decl<const element_type>("value") << " = data[i];\n"
+      << "    uint pos = i;\n"
+      << "    while(pos > 0 && "
+      <<            compare(k.var<const element_type>("value"),
+                            k.var<const element_type>("data[pos-1]"))
+      << "){\n"
+      << "        data[pos] = data[pos-1];\n"
+      << "        pos--;\n"
+      << "    }\n"
+      << "    data[pos] = value;\n"
+      << "}\n";
+
+    // stage 3: store the sorted values back into the range
+    k << "for(uint i = 0; i < n; i++){\n"
+      << "    " << first[k.var<uint_>("i")] << " = data[i];\n"
+      << "}\n";
+
+    ::boost::compute::kernel sort_kernel = k.compile(context);
+    sort_kernel.set_arg(data_arg_index, local_buffer<element_type>(n));
+    sort_kernel.set_arg(n_arg_index, static_cast<uint_>(n));
+
+    queue.enqueue_task(sort_kernel);
+}
+
+/// Sorts [\p first, \p last) in ascending order by forwarding to the
+/// comparator overload with ::boost::compute::less for the value type.
+template<class Iterator>
+inline void serial_insertion_sort(Iterator first,
+                                  Iterator last,
+                                  command_queue &queue)
+{
+    typedef typename std::iterator_traits<Iterator>::value_type element_type;
+
+    serial_insertion_sort(
+        first,
+        last,
+        ::boost::compute::less<element_type>(),
+        queue
+    );
+}
+
+///
+/// \brief Sorts keys and their associated values with a single-task
+///        insertion sort.
+///
+/// The kernel copies the keys from [\p keys_first, \p keys_last) and the
+/// same number of values starting at \p values_first into local memory,
+/// sorts both arrays together by comparing keys with \p compare, and writes
+/// them back. Ranges with fewer than two keys are left untouched.
+///
+/// \param keys_first start of the key range
+/// \param keys_last end of the key range
+/// \param values_first start of the value range (one value per key)
+/// \param compare comparison function applied to keys
+/// \param queue command queue used to run the kernel
+///
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void serial_insertion_sort_by_key(KeyIterator keys_first,
+                                         KeyIterator keys_last,
+                                         ValueIterator values_first,
+                                         Compare compare,
+                                         command_queue &queue)
+{
+    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
+    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;
+
+    size_t count = iterator_range_size(keys_first, keys_last);
+    if(count < 2){
+        return;
+    }
+
+    meta_kernel k("serial_insertion_sort_by_key");
+    size_t local_keys_arg = k.add_arg<key_type *>(memory_object::local_memory, "keys");
+    size_t local_data_arg = k.add_arg<value_type *>(memory_object::local_memory, "data");
+    size_t count_arg = k.add_arg<uint_>("n");
+
+    k <<
+        // copy data to local memory
+        "for(uint i = 0; i < n; i++){\n" <<
+        "    keys[i] = " << keys_first[k.var<uint_>("i")] << ";\n"
+        "    data[i] = " << values_first[k.var<uint_>("i")] << ";\n"
+        "}\n"
+
+        // sort data in local memory
+        "for(uint i = 1; i < n; i++){\n" <<
+        "    " << k.decl<const key_type>("key") << " = keys[i];\n" <<
+        "    " << k.decl<const value_type>("value") << " = data[i];\n" <<
+        "    uint pos = i;\n" <<
+        "    while(pos > 0 && " <<
+                   compare(k.var<const key_type>("key"),
+                           k.var<const key_type>("keys[pos-1]")) << "){\n" <<
+        "        keys[pos] = keys[pos-1];\n" <<
+        "        data[pos] = data[pos-1];\n" <<
+        "        pos--;\n" <<
+        "    }\n" <<
+        "    keys[pos] = key;\n" <<
+        "    data[pos] = value;\n" <<
+        "}\n" <<
+
+        // copy sorted data to output
+        "for(uint i = 0; i < n; i++){\n" <<
+        "    " << keys_first[k.var<uint_>("i")] << " = keys[i];\n"
+        "    " << values_first[k.var<uint_>("i")] << " = data[i];\n"
+        "}\n";
+
+    const context &context = queue.get_context();
+    ::boost::compute::kernel kernel = k.compile(context);
+
+    // size the local-memory arguments with local_buffer<>, as
+    // serial_insertion_sort() does, instead of a raw (size, null) pair
+    // with a narrowing cast
+    kernel.set_arg(local_keys_arg, local_buffer<key_type>(count));
+    kernel.set_arg(local_data_arg, local_buffer<value_type>(count));
+    kernel.set_arg(count_arg, static_cast<uint_>(count));
+
+    queue.enqueue_task(kernel);
+}
+
+/// Sorts keys (and their values) in ascending key order by forwarding to the
+/// comparator overload with boost::compute::less for the key type.
+template<class KeyIterator, class ValueIterator>
+inline void serial_insertion_sort_by_key(KeyIterator keys_first,
+                                         KeyIterator keys_last,
+                                         ValueIterator values_first,
+                                         command_queue &queue)
+{
+    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
+
+    boost::compute::less<key_type> ascending;
+
+    serial_insertion_sort_by_key(
+        keys_first, keys_last, values_first, ascending, queue
+    );
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_INSERTION_SORT_HPP
@@ -0,0 +1,116 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_PATH_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_PATH_HPP
+
+#include <iterator>
+
+#include <boost/compute/algorithm/find_if.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/lambda.hpp>
+#include <boost/compute/system.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Merge Path kernel class
+///
+/// Subclass of meta_kernel to break two sets into tiles according
+/// to their merge path.
+///
+/// Usage: call set_range() once to generate the kernel source for a pair of
+/// input ranges, then exec() to run it. Work-item \c i looks at the first
+/// (i+1)*tile_size elements of the merged sequence and writes how many of
+/// them come from the first range to \c result_a[i] and how many from the
+/// second range to \c result_b[i].
+///
+class merge_path_kernel : public meta_kernel
+{
+public:
+    /// Number of merged elements between two consecutive split points.
+    unsigned int tile_size;
+
+    /// Creates the kernel with a tile size of 4 and empty ranges, so that
+    /// exec() is a harmless no-op until set_range() has been called.
+    merge_path_kernel()
+        : meta_kernel("merge_path"),
+          tile_size(4),
+          m_a_count(0),
+          m_a_count_arg(0),
+          m_b_count(0),
+          m_b_count_arg(0)
+    {
+    }
+
+    /// Generates the kernel source for the ranges [\p first1, \p last1) and
+    /// [\p first2, \p last2), ordered by \p comp.
+    ///
+    /// For its target position each work-item binary-searches for the number
+    /// of elements taken from the first range and stores the split in
+    /// \p result_a / \p result_b.
+    template<class InputIterator1, class InputIterator2,
+             class OutputIterator1, class OutputIterator2,
+             class Compare>
+    void set_range(InputIterator1 first1,
+                   InputIterator1 last1,
+                   InputIterator2 first2,
+                   InputIterator2 last2,
+                   OutputIterator1 result_a,
+                   OutputIterator2 result_b,
+                   Compare comp)
+    {
+        m_a_count = iterator_range_size(first1, last1);
+        m_a_count_arg = add_arg<uint_>("a_count");
+
+        m_b_count = iterator_range_size(first2, last2);
+        m_b_count_arg = add_arg<uint_>("b_count");
+
+        *this <<
+            "uint i = get_global_id(0);\n" <<
+            "uint target = (i+1)*" << tile_size << ";\n" <<
+            // search bounds for the number of elements taken from range a
+            "uint start = max(convert_int(0),convert_int(target)-convert_int(b_count));\n" <<
+            "uint end = min(target,a_count);\n" <<
+            "uint a_index, b_index;\n" <<
+            // binary search over the candidate split points
+            "while(start<end)\n" <<
+            "{\n" <<
+            "   a_index = (start + end)/2;\n" <<
+            "   b_index = target - a_index - 1;\n" <<
+            "   if(!(" << comp(first2[expr<uint_>("b_index")],
+                              first1[expr<uint_>("a_index")]) << "))\n" <<
+            "       start = a_index + 1;\n" <<
+            "   else end = a_index;\n" <<
+            "}\n" <<
+            result_a[expr<uint_>("i")] << " = start;\n" <<
+            result_b[expr<uint_>("i")] << " = target - start;\n";
+    }
+
+    /// Same as the overload above, ordering elements with
+    /// ::boost::compute::less on the first range's value type.
+    template<class InputIterator1, class InputIterator2,
+             class OutputIterator1, class OutputIterator2>
+    void set_range(InputIterator1 first1,
+                   InputIterator1 last1,
+                   InputIterator2 first2,
+                   InputIterator2 last2,
+                   OutputIterator1 result_a,
+                   OutputIterator2 result_b)
+    {
+        typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
+        ::boost::compute::less<value_type> less_than;
+        set_range(first1, last1, first2, last2, result_a, result_b, less_than);
+    }
+
+    /// Runs the kernel with one work-item per complete tile of the merged
+    /// sequence. Returns a default-constructed event without launching
+    /// anything when both ranges together hold fewer than tile_size elements.
+    event exec(command_queue &queue)
+    {
+        if((m_a_count + m_b_count)/tile_size == 0) {
+            return event();
+        }
+
+        set_arg(m_a_count_arg, uint_(m_a_count));
+        set_arg(m_b_count_arg, uint_(m_b_count));
+
+        return exec_1d(queue, 0, (m_a_count + m_b_count)/tile_size);
+    }
+
+private:
+    size_t m_a_count;      // number of elements in the first range
+    size_t m_a_count_arg;  // kernel argument index of "a_count"
+    size_t m_b_count;      // number of elements in the second range
+    size_t m_b_count_arg;  // kernel argument index of "b_count"
+};
+
+} //end detail namespace
+} //end compute namespace
+} //end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_PATH_HPP
@@ -0,0 +1,366 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
+
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/program.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/merge_with_merge_path.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Enqueues a kernel in which every work-item merges one pair of adjacent
+/// sorted blocks of the input into the output.
+///
+/// Work-item g merges the block starting at 2 * g * block_size with the block
+/// that directly follows it; both blocks are clipped to \p count. When keys
+/// compare equivalent the element of the first block is emitted first, so the
+/// merge is stable.
+///
+/// \param keys_first first key of the input range
+/// \param values_first first value of the input range; only dereferenced
+///        when \p sort_by_key is true
+/// \param keys_result first key of the output range
+/// \param values_result first value of the output range; only dereferenced
+///        when \p sort_by_key is true
+/// \param compare comparison function for keys
+/// \param count number of elements in the input range
+/// \param block_size number of elements in one sorted block; must be > 0
+/// \param sort_by_key if true, values are moved together with their keys
+/// \param queue command queue to perform the operation
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void merge_blocks(KeyIterator keys_first,
+                         ValueIterator values_first,
+                         KeyIterator keys_result,
+                         ValueIterator values_result,
+                         Compare compare,
+                         size_t count,
+                         const size_t block_size,
+                         const bool sort_by_key,
+                         command_queue &queue)
+{
+    (void) values_result;
+    (void) values_first;
+
+    meta_kernel k("merge_sort_on_cpu_merge_blocks");
+    size_t count_arg = k.add_arg<const uint_>("count");
+    size_t block_size_arg = k.add_arg<uint_>("block_size");
+
+    k <<
+        k.decl<uint_>("b1_start") << " = get_global_id(0) * block_size * 2;\n" <<
+        k.decl<uint_>("b1_end") << " = min(count, b1_start + block_size);\n" <<
+        k.decl<uint_>("b2_start") << " = min(count, b1_start + block_size);\n" <<
+        k.decl<uint_>("b2_end") << " = min(count, b2_start + block_size);\n" <<
+        k.decl<uint_>("result_idx") << " = b1_start;\n" <<
+
+        // merging block 1 and block 2 (stable): an element of block 2 is
+        // taken only when it is strictly ordered before the head of block 1
+        "while(b1_start < b1_end && b2_start < b2_end){\n" <<
+        "    if( " << compare(keys_first[k.var<uint_>("b2_start")],
+                              keys_first[k.var<uint_>("b1_start")]) << "){\n" <<
+        "        " << keys_result[k.var<uint_>("result_idx")] <<  " = " <<
+                      keys_first[k.var<uint_>("b2_start")] << ";\n";
+    if(sort_by_key){
+        k <<
+        "        " << values_result[k.var<uint_>("result_idx")] <<  " = " <<
+                      values_first[k.var<uint_>("b2_start")] << ";\n";
+    }
+    k <<
+        "        b2_start++;\n" <<
+        "    }\n" <<
+        "    else {\n" <<
+        "        " << keys_result[k.var<uint_>("result_idx")] <<  " = " <<
+                      keys_first[k.var<uint_>("b1_start")] << ";\n";
+    if(sort_by_key){
+        k <<
+        "        " << values_result[k.var<uint_>("result_idx")] <<  " = " <<
+                      values_first[k.var<uint_>("b1_start")] << ";\n";
+    }
+    k <<
+        "        b1_start++;\n" <<
+        "    }\n" <<
+        "    result_idx++;\n" <<
+        "}\n" <<
+        // copy whatever is left of block 1
+        "while(b1_start < b1_end){\n" <<
+        "    " << keys_result[k.var<uint_>("result_idx")] <<  " = " <<
+                 keys_first[k.var<uint_>("b1_start")] << ";\n";
+    if(sort_by_key){
+        k <<
+        "    " << values_result[k.var<uint_>("result_idx")] <<  " = " <<
+                      values_first[k.var<uint_>("b1_start")] << ";\n";
+    }
+    k <<
+        "    b1_start++;\n" <<
+        "    result_idx++;\n" <<
+        "}\n" <<
+        // copy whatever is left of block 2
+        "while(b2_start < b2_end){\n" <<
+        "    " << keys_result[k.var<uint_>("result_idx")] <<  " = " <<
+                 keys_first[k.var<uint_>("b2_start")] << ";\n";
+    if(sort_by_key){
+        k <<
+        "    " << values_result[k.var<uint_>("result_idx")] <<  " = " <<
+                      values_first[k.var<uint_>("b2_start")] << ";\n";
+    }
+    k <<
+        "    b2_start++;\n" <<
+        "    result_idx++;\n" <<
+        "}\n";
+
+    const context &context = queue.get_context();
+    ::boost::compute::kernel kernel = k.compile(context);
+    kernel.set_arg(count_arg, static_cast<const uint_>(count));
+    kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
+
+    // One work-item per pair of blocks, rounded up. This is done in integer
+    // arithmetic on purpose: float(count) is inexact for counts above 2^24
+    // and could round the work size down, leaving the last pair unmerged.
+    const size_t pair_size = 2 * block_size;
+    const size_t global_size =
+        count / pair_size + (count % pair_size != 0 ? 1 : 0);
+    queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
+}
+
+/// Keys-only variant of merge_blocks(): merges adjacent sorted blocks of
+/// \p first into \p result.
+///
+/// \p sort_by_key is accepted for signature symmetry but ignored; the call is
+/// always forwarded with sort_by_key == false.
+template<class Iterator, class Compare>
+inline void merge_blocks(Iterator first,
+                         Iterator result,
+                         Compare compare,
+                         size_t count,
+                         const size_t block_size,
+                         const bool sort_by_key,
+                         command_queue &queue)
+{
+    (void) sort_by_key;
+
+    // The value iterators are never dereferenced when sorting keys only, so
+    // a default-constructed placeholder is passed for both of them.
+    Iterator unused_values;
+    merge_blocks(first, unused_values,
+                 result, unused_values,
+                 compare, count, block_size,
+                 false /* sort_by_key */, queue);
+}
+
+/// Merges adjacent pairs of sorted blocks of \p block_size elements from
+/// \p first into \p result, picking one of two strategies.
+///
+/// merge_with_merge_path() is called once per pair of blocks when at most
+/// \p blocks_no_threshold blocks are left AND \p count is at least
+/// \p input_size_threshold; otherwise a single merge_blocks() kernel handles
+/// all pairs.
+///
+/// \param first first element of the input range
+/// \param result first element of the output range
+/// \param compare comparison function
+/// \param count number of elements in the input range
+/// \param block_size number of elements in one sorted block; must be > 0
+/// \param input_size_threshold minimal \p count for the merge-path strategy
+/// \param blocks_no_threshold maximal number of blocks for the merge-path
+///        strategy
+/// \param queue command queue to perform the operation
+template<class Iterator, class Compare>
+inline void dispatch_merge_blocks(Iterator first,
+                                  Iterator result,
+                                  Compare compare,
+                                  size_t count,
+                                  const size_t block_size,
+                                  const size_t input_size_threshold,
+                                  const size_t blocks_no_threshold,
+                                  command_queue &queue)
+{
+    // Number of sorted blocks, rounded up. Integer arithmetic is used because
+    // float(count) is inexact for counts above 2^24, which is exactly the
+    // input size range this threshold check is meant for.
+    const size_t blocks_no =
+        count / block_size + (count % block_size != 0 ? 1 : 0);
+    // merge with merge path should be used only for the large arrays and at
+    // the end of merging part when there are only a few big blocks left to be
+    // merged
+    if(blocks_no <= blocks_no_threshold && count >= input_size_threshold){
+        Iterator last = first + count;
+        for(size_t i = 0; i < count; i+= 2*block_size)
+        {
+            // [first1, last1) and [first2, last2) are the two neighbouring
+            // blocks, clipped to the end of the input
+            Iterator first1 = (std::min)(first + i, last);
+            Iterator last1 = (std::min)(first1 + block_size, last);
+            Iterator first2 = last1;
+            Iterator last2 = (std::min)(first2 + block_size, last);
+            Iterator block_result = (std::min)(result + i, result + count);
+            merge_with_merge_path(first1, last1, first2, last2,
+                                  block_result, compare, queue);
+        }
+    }
+    else {
+        merge_blocks(first, result, compare, count, block_size, false, queue);
+    }
+}
+
+/// Enqueues a kernel in which every work-item sorts one block of
+/// \p block_size consecutive elements in place with insertion sort.
+///
+/// The last block is clipped to \p count. An element is only shifted past
+/// elements it is strictly ordered before, so the sort is stable.
+///
+/// \param keys_first first key of the range to sort
+/// \param values_first first value of the range to sort; only dereferenced
+///        when \p sort_by_key is true
+/// \param compare comparison function for keys
+/// \param count number of elements in the range
+/// \param block_size number of elements in one block; must be > 0
+/// \param sort_by_key if true, values are moved together with their keys
+/// \param queue command queue to perform the operation
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void block_insertion_sort(KeyIterator keys_first,
+                                 ValueIterator values_first,
+                                 Compare compare,
+                                 const size_t count,
+                                 const size_t block_size,
+                                 const bool sort_by_key,
+                                 command_queue &queue)
+{
+    (void) values_first;
+
+    typedef typename std::iterator_traits<KeyIterator>::value_type K;
+    typedef typename std::iterator_traits<ValueIterator>::value_type T;
+
+    meta_kernel k("merge_sort_on_cpu_block_insertion_sort");
+    size_t count_arg = k.add_arg<uint_>("count");
+    size_t block_size_arg = k.add_arg<uint_>("block_size");
+
+    k <<
+        k.decl<uint_>("start") << " = get_global_id(0) * block_size;\n" <<
+        k.decl<uint_>("end") << " = min(count, start + block_size);\n" <<
+
+        // block insertion sort (stable)
+        "for(uint i = start+1; i < end; i++){\n" <<
+        "    " << k.decl<const K>("key") << " = " <<
+                  keys_first[k.var<uint_>("i")] << ";\n";
+    if(sort_by_key){
+        k <<
+        "    " << k.decl<const T>("value") << " = " <<
+                  values_first[k.var<uint_>("i")] << ";\n";
+    }
+    k <<
+        // shift larger elements one slot to the right until the insertion
+        // position of key is found
+        "    uint pos = i;\n" <<
+        "    while(pos > start && " <<
+                   compare(k.var<const K>("key"),
+                           keys_first[k.var<uint_>("pos-1")]) << "){\n" <<
+        "        " << keys_first[k.var<uint_>("pos")] << " = " <<
+                      keys_first[k.var<uint_>("pos-1")] << ";\n";
+    if(sort_by_key){
+        k <<
+        "        " << values_first[k.var<uint_>("pos")] << " = " <<
+                      values_first[k.var<uint_>("pos-1")] << ";\n";
+    }
+    k <<
+        "        pos--;\n" <<
+        "    }\n" <<
+        "    " << keys_first[k.var<uint_>("pos")] << " = key;\n";
+    if(sort_by_key) {
+        k <<
+        "    " << values_first[k.var<uint_>("pos")] << " = value;\n";
+    }
+    k <<
+        "}\n"; // block insertion sort
+
+    const context &context = queue.get_context();
+    ::boost::compute::kernel kernel = k.compile(context);
+    kernel.set_arg(count_arg, static_cast<uint_>(count));
+    kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
+
+    // One work-item per block, rounded up. Integer arithmetic is used because
+    // float(count) is inexact for counts above 2^24 and could round the work
+    // size down, leaving the trailing block unsorted.
+    const size_t global_size =
+        count / block_size + (count % block_size != 0 ? 1 : 0);
+    queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
+}
+
+/// Keys-only variant of block_insertion_sort(): sorts every block of
+/// \p block_size elements of the range starting at \p first in place.
+template<class Iterator, class Compare>
+inline void block_insertion_sort(Iterator first,
+                                 Compare compare,
+                                 const size_t count,
+                                 const size_t block_size,
+                                 command_queue &queue)
+{
+    // The value iterator is never dereferenced when sorting keys only, so a
+    // default-constructed placeholder is passed in its place.
+    Iterator unused_values;
+    block_insertion_sort(first, unused_values, compare,
+                         count, block_size,
+                         false /* sort_by_key */, queue);
+}
+
+// Sorts [first, last) according to compare. This sort is stable.
+//
+// Ranges of up to 512 elements are handled by one insertion-sorted block.
+// Larger ranges are insertion-sorted block by block and the blocks are then
+// merged pairwise, doubling the block size every pass, with the data
+// alternating between the input range and a temporary buffer.
+template<class Iterator, class Compare>
+inline void merge_sort_on_cpu(Iterator first,
+                              Iterator last,
+                              Compare compare,
+                              command_queue &queue)
+{
+    typedef typename std::iterator_traits<Iterator>::value_type value_type;
+
+    const size_t count = iterator_range_size(first, last);
+
+    // nothing to sort
+    if(count < 2){
+        return;
+    }
+
+    // for small input size only insertion sort is performed
+    if(count <= 512){
+        block_insertion_sort(first, compare, count, count, queue);
+        return;
+    }
+
+    const context &ctx = queue.get_context();
+    const device &dev = queue.get_device();
+
+    // loading parameters
+    std::string cache_key("__boost_merge_sort_on_cpu_");
+    cache_key += type_name<value_type>();
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(dev);
+
+    // merge_with_merge_path() is used to merge sorted blocks once there are
+    // blocks_no_threshold or fewer blocks left AND the input holds
+    // input_size_threshold or more elements; merge_blocks() is used otherwise.
+    const size_t blocks_no_threshold =
+        parameters->get(cache_key, "merge_with_merge_path_blocks_no_threshold", 8);
+    const size_t input_size_threshold =
+        parameters->get(cache_key, "merge_with_merge_path_input_size_threshold", 2097152);
+
+    const size_t block_size =
+        parameters->get(cache_key, "insertion_sort_block_size", 64);
+    block_insertion_sort(first, compare, count, block_size, queue);
+
+    // temporary buffer for merge result
+    vector<value_type> scratch(count, ctx);
+
+    // true while the most recent merge result lives in scratch
+    bool result_in_scratch = false;
+
+    size_t sorted_run = block_size;
+    while(sorted_run < count){
+        if(result_in_scratch) {
+            dispatch_merge_blocks(scratch.begin(), first, compare, count,
+                                  sorted_run,
+                                  input_size_threshold,
+                                  blocks_no_threshold,
+                                  queue);
+        } else {
+            dispatch_merge_blocks(first, scratch.begin(), compare, count,
+                                  sorted_run,
+                                  input_size_threshold,
+                                  blocks_no_threshold,
+                                  queue);
+        }
+        result_in_scratch = !result_in_scratch;
+        sorted_run *= 2;
+    }
+
+    // an odd number of passes leaves the sorted data in the scratch buffer
+    if(result_in_scratch) {
+        copy(scratch.begin(), scratch.end(), first, queue);
+    }
+}
+
+// Sorts the keys in [keys_first, keys_last) according to compare and applies
+// the same reordering to the values starting at values_first. This sort is
+// stable.
+//
+// Ranges of up to 512 elements are handled by one insertion-sorted block.
+// Larger ranges are insertion-sorted block by block and the blocks are then
+// merged pairwise, doubling the block size every pass, with the data
+// alternating between the input ranges and temporary buffers.
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void merge_sort_by_key_on_cpu(KeyIterator keys_first,
+                                     KeyIterator keys_last,
+                                     ValueIterator values_first,
+                                     Compare compare,
+                                     command_queue &queue)
+{
+    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
+    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;
+
+    const size_t count = iterator_range_size(keys_first, keys_last);
+
+    // nothing to sort
+    if(count < 2){
+        return;
+    }
+
+    // for small input size only insertion sort is performed
+    if(count <= 512){
+        block_insertion_sort(keys_first, values_first, compare,
+                             count, count, true, queue);
+        return;
+    }
+
+    const context &ctx = queue.get_context();
+    const device &dev = queue.get_device();
+
+    // loading parameters
+    std::string cache_key("__boost_merge_sort_by_key_on_cpu_");
+    cache_key += type_name<value_type>();
+    cache_key += "_with_";
+    cache_key += type_name<key_type>();
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(dev);
+
+    const size_t block_size =
+        parameters->get(cache_key, "insertion_sort_by_key_block_size", 64);
+    block_insertion_sort(keys_first, values_first, compare,
+                         count, block_size, true, queue);
+
+    // temporary buffers for merge results
+    vector<value_type> values_scratch(count, ctx);
+    vector<key_type> keys_scratch(count, ctx);
+
+    // true while the most recent merge result lives in the scratch buffers
+    bool result_in_scratch = false;
+
+    size_t sorted_run = block_size;
+    while(sorted_run < count){
+        if(result_in_scratch) {
+            merge_blocks(keys_scratch.begin(), values_scratch.begin(),
+                         keys_first, values_first,
+                         compare, count, sorted_run, true, queue);
+        } else {
+            merge_blocks(keys_first, values_first,
+                         keys_scratch.begin(), values_scratch.begin(),
+                         compare, count, sorted_run, true, queue);
+        }
+        result_in_scratch = !result_in_scratch;
+        sorted_run *= 2;
+    }
+
+    // an odd number of passes leaves the sorted data in the scratch buffers
+    if(result_in_scratch) {
+        copy(keys_scratch.begin(), keys_scratch.end(), keys_first, queue);
+        copy(values_scratch.begin(), values_scratch.end(), values_first, queue);
+    }
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
@@ -0,0 +1,590 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_
+
+#include <algorithm>
+
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/program.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/memory/local_buffer.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Picks the block (work-group) size used by the bitonic block sort.
+///
+/// Starting from \p proposed_wg the size is halved, first while four blocks
+/// do not fit into \p lmem_size bytes and the size is above 64, then while a
+/// single block does not fit. The result is rounded down to a power of two
+/// and capped at 256; it is never smaller than 1.
+///
+/// The local memory needed per element is sizeof(KeyType), plus
+/// sizeof(ValueType) when \p sort_by_key is true. The per-element cost is
+/// applied on every halving step, so the value storage is always accounted
+/// for.
+///
+/// \param proposed_wg upper bound for the block size
+/// \param lmem_size amount of local memory available, in bytes
+/// \param sort_by_key whether per-element value storage is needed as well
+/// \return chosen block size: a power of two in [1, 256]
+template<class KeyType, class ValueType>
+inline size_t pick_bitonic_block_sort_block_size(size_t proposed_wg,
+                                                 size_t lmem_size,
+                                                 bool sort_by_key)
+{
+    const size_t bytes_per_element =
+        sizeof(KeyType) + (sort_by_key ? sizeof(ValueType) : 0);
+
+    size_t n = proposed_wg;
+
+    // try to force at least 4 work-groups of >64 elements
+    // for better occupancy
+    while(lmem_size < (n * bytes_per_element * 4) && (n > 64)) {
+        n /= 2;
+    }
+    // below that, only require that a single block fits
+    while(lmem_size < (n * bytes_per_element) && (n > 1)) {
+        n /= 2;
+    }
+
+    // largest power of two that is <= n, clamped to the range [1, 256]
+    size_t block_size = 1;
+    while(block_size < 256 && block_size * 2 <= n) {
+        block_size *= 2;
+    }
+    return block_size;
+}
+
+
+/// Performs bitonic block sort according to \p compare.
+///
+/// The range is split into blocks, one per work-group, and every block is
+/// sorted independently in local memory. The block size is chosen internally
+/// with pick_bitonic_block_sort_block_size() and is always a power of two.
+///
+/// Bitonic sort can only be performed when the number of elements is equal
+/// to 2^n. Requiring \p count to be an exact multiple of the block size would
+/// not be great, so the kernel also contains an odd-even sort which is used
+/// for a block whose element count is not a power of two (this can only be
+/// the last block). That way bitonic_block_sort() works for input of any
+/// size.
+///
+/// This is NOT stable.
+///
+/// \param keys_first first key element in the range to sort
+/// \param values_first first value element in the range to sort; only
+///        dereferenced when \p sort_by_key is true
+/// \param compare comparison function for keys
+/// \param count number of elements in the range; count > 0
+/// \param sort_by_key if true, values are reordered together with their keys
+/// \param queue command queue to perform the operation
+///
+/// \return size of the sorted blocks (the work-group size that was used)
+template<class KeyIterator, class ValueIterator, class Compare>
+inline size_t bitonic_block_sort(KeyIterator keys_first,
+                                 ValueIterator values_first,
+                                 Compare compare,
+                                 const size_t count,
+                                 const bool sort_by_key,
+                                 command_queue &queue)
+{
+    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
+    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;
+
+    meta_kernel k("bitonic_block_sort");
+    size_t count_arg = k.add_arg<const uint_>("count");
+
+    size_t local_keys_arg = k.add_arg<key_type *>(memory_object::local_memory, "lkeys");
+    size_t local_vals_arg = 0;
+    if(sort_by_key) {
+        local_vals_arg = k.add_arg<uchar_ *>(memory_object::local_memory, "lidx");
+    }
+
+    k <<
+        // Work item global and local ids
+        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
+        k.decl<const uint_>("lid") << " = get_local_id(0);\n";
+
+    // declare my_key and my_value
+    k <<
+        k.decl<key_type>("my_key") << ";\n";
+    // Instead of copying values (my_value) in local memory with keys
+    // we save local index (uchar) and copy my_value at the end at
+    // final index. This saves local memory.
+    if(sort_by_key)
+    {
+        k <<
+            k.decl<uchar_>("my_index") << " = (uchar)(lid);\n";
+    }
+
+    // load key
+    k <<
+        "if(gid < count) {\n" <<
+            k.var<key_type>("my_key") <<  " = " <<
+                keys_first[k.var<const uint_>("gid")] << ";\n" <<
+        "}\n";
+
+    // load key and index to local memory
+    k <<
+        "lkeys[lid] = my_key;\n";
+    if(sort_by_key)
+    {
+        k <<
+            "lidx[lid] = my_index;\n";
+    }
+    k <<
+        // offset: index of the first element of this block,
+        // n: number of elements of the range that fall into this block
+        k.decl<const uint_>("offset") << " = get_group_id(0) * get_local_size(0);\n" <<
+        k.decl<const uint_>("n") << " = min((uint)(get_local_size(0)),(count - offset));\n";
+
+    // When the number of elements in the block is a power of 2 bitonic
+    // sorter can be used; otherwise, slower odd-even sort is used.
+
+    k <<
+        // check if n is power of 2
+        "if(((n != 0) && ((n & (~n + 1)) == n))) {\n";
+
+    // bitonic sort, not stable
+    k <<
+        // wait for keys and vals to be stored in local memory
+        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+
+        "#pragma unroll\n" <<
+        "for(" <<
+            k.decl<uint_>("length") << " = 1; " <<
+            "length < n; " <<
+            "length <<= 1" <<
+        ") {\n" <<
+            // direction of sort: false -> asc, true -> desc
+            k.decl<bool>("direction") << "= ((lid & (length<<1)) != 0);\n" <<
+            "for(" <<
+                k.decl<uint_>("k") << " = length; " <<
+                "k > 0; " <<
+                "k >>= 1" <<
+            ") {\n" <<
+
+            // sibling to compare with my key
+            k.decl<uint_>("sibling_idx") << " = lid ^ k;\n" <<
+            k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" <<
+            k.decl<bool>("compare") << " = " <<
+                compare(k.var<key_type>("sibling_key"),
+                        k.var<key_type>("my_key")) << ";\n" <<
+            k.decl<bool>("swap") <<
+                " = compare ^ (sibling_idx < lid) ^ direction;\n" <<
+            "my_key = swap ? sibling_key : my_key;\n";
+    if(sort_by_key)
+    {
+        k <<
+            "my_index = swap ? lidx[sibling_idx] : my_index;\n";
+    }
+    k <<
+            // all reads of the old local data must finish before it is
+            // overwritten, and the new data must be visible before next step
+            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+            "lkeys[lid] = my_key;\n";
+    if(sort_by_key)
+    {
+        k <<
+            "lidx[lid] = my_index;\n";
+    }
+    k <<
+            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+            "}\n" <<
+         "}\n";
+
+    // end of bitonic sort
+
+    // odd-even sort, not stable
+    k <<
+        "}\n" <<
+        "else { \n";
+
+    k <<
+        k.decl<bool>("lid_is_even") << " = (lid%2) == 0;\n" <<
+        k.decl<uint_>("oddsibling_idx") << " = " <<
+            "(lid_is_even) ? max(lid,(uint)(1)) - 1 : min(lid+1,n-1);\n" <<
+        k.decl<uint_>("evensibling_idx") << " = " <<
+            "(lid_is_even) ? min(lid+1,n-1) : max(lid,(uint)(1)) - 1;\n" <<
+
+        // wait for keys and vals to be stored in local memory
+        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+
+        "#pragma unroll\n" <<
+        "for(" <<
+            k.decl<uint_>("i") << " = 0; " <<
+            "i < n; " <<
+            "i++" <<
+        ") {\n" <<
+            k.decl<uint_>("sibling_idx") <<
+                " = i%2 == 0 ? evensibling_idx : oddsibling_idx;\n" <<
+            k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" <<
+            k.decl<bool>("compare") << " = " <<
+                compare(k.var<key_type>("sibling_key"),
+                        k.var<key_type>("my_key")) << ";\n" <<
+            k.decl<bool>("swap") <<
+                " = compare ^ (sibling_idx < lid);\n" <<
+            "my_key = swap ? sibling_key : my_key;\n";
+    if(sort_by_key)
+    {
+        k <<
+            "my_index = swap ? lidx[sibling_idx] : my_index;\n";
+    }
+    k <<
+            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+            "lkeys[lid] = my_key;\n";
+    if(sort_by_key)
+    {
+        k <<
+            "lidx[lid] = my_index;\n";
+    }
+    k <<
+            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+        "}\n" <<  // for
+
+    "}\n"; // else
+    // end of odd-even sort
+
+    // fetch the value that belongs to the final key
+    if(sort_by_key)
+    {
+        // Each work-item reads its value from the slot of a different
+        // work-item of the same group (offset + my_index). All of these reads
+        // have to finish before any slot is overwritten, so the value is
+        // first read into a private variable and the group is synchronized.
+        // The barrier is placed outside of the bounds check so that every
+        // work-item of the group reaches it.
+        k <<
+            k.decl<value_type>("my_value") << ";\n" <<
+            "if(gid < count) {\n" <<
+                "my_value = " <<
+                    values_first[k.var<const uint_>("offset + my_index")] << ";\n" <<
+            "}\n" <<
+            "barrier(CLK_GLOBAL_MEM_FENCE);\n";
+    }
+
+    // save key and value
+    k <<
+        "if(gid < count) {\n" <<
+        keys_first[k.var<const uint_>("gid")] << " = " <<
+            k.var<key_type>("my_key") << ";\n";
+    if(sort_by_key)
+    {
+        k << values_first[k.var<const uint_>("gid")] << " = my_value;\n";
+    }
+    k <<
+        // end if
+        "}\n";
+
+    const context &context = queue.get_context();
+    const device &device = queue.get_device();
+    ::boost::compute::kernel kernel = k.compile(context);
+
+    // uchar_ is passed as the value type because only a one-byte local index
+    // is kept per element in local memory
+    const size_t work_group_size =
+        pick_bitonic_block_sort_block_size<key_type, uchar_>(
+            kernel.get_work_group_info<size_t>(
+                device, CL_KERNEL_WORK_GROUP_SIZE
+            ),
+            device.get_info<size_t>(CL_DEVICE_LOCAL_MEM_SIZE),
+            sort_by_key
+        );
+
+    // Number of work-groups, rounded up. Integer arithmetic is used because
+    // float(count) is inexact for counts above 2^24 and could round the work
+    // size down, leaving the trailing block unsorted.
+    const size_t group_count =
+        count / work_group_size + (count % work_group_size != 0 ? 1 : 0);
+    const size_t global_size = work_group_size * group_count;
+
+    kernel.set_arg(count_arg, static_cast<uint_>(count));
+    kernel.set_arg(local_keys_arg, local_buffer<key_type>(work_group_size));
+    if(sort_by_key) {
+        kernel.set_arg(local_vals_arg, local_buffer<uchar_>(work_group_size));
+    }
+
+    queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size);
+    // return size of the block
+    return work_group_size;
+}
+
+/// Sorts the input block by block and returns the size of the sorted blocks.
+///
+/// For a non-stable sort this forwards to bitonic_block_sort(). For a stable
+/// sort no kernel is run at all and 1 is returned, i.e. every element counts
+/// as an already sorted block of its own.
+template<class KeyIterator, class ValueIterator, class Compare>
+inline size_t block_sort(KeyIterator keys_first,
+                         ValueIterator values_first,
+                         Compare compare,
+                         const size_t count,
+                         const bool sort_by_key,
+                         const bool stable,
+                         command_queue &queue)
+{
+    if(!stable) {
+        return bitonic_block_sort(keys_first, values_first,
+                                  compare, count,
+                                  sort_by_key, queue);
+    }
+
+    // TODO: Implement stable block sort (stable odd-even merge sort)
+    return size_t(1);
+}
+
+/// Merges adjacent pairs of sorted blocks of \p block_size elements, using
+/// one work-item per element.
+///
+/// Every work-item takes the element at its global id, binary-searches the
+/// neighbouring block of the pair for the number of elements that have to
+/// precede it and writes the element directly to its final position in the
+/// output. Elements of an even block search for the lower bound; elements of
+/// an odd block additionally advance past the keys equivalent to their own.
+/// A block without a neighbour is copied unchanged.
+///
+/// space: O(n + m); n - number of keys, m - number of values
+///
+/// \param keys_first first key of the input range
+/// \param values_first first value of the input range; only dereferenced
+///        when \p sort_by_key is true
+/// \param out_keys_first first key of the output range
+/// \param out_values_first first value of the output range; only
+///        dereferenced when \p sort_by_key is true
+/// \param compare comparison function for keys
+/// \param count number of elements in the input range
+/// \param block_size number of elements in one sorted block
+/// \param sort_by_key if true, values are moved together with their keys
+/// \param queue command queue to perform the operation
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void merge_blocks_on_gpu(KeyIterator keys_first,
+                                ValueIterator values_first,
+                                KeyIterator out_keys_first,
+                                ValueIterator out_values_first,
+                                Compare compare,
+                                const size_t count,
+                                const size_t block_size,
+                                const bool sort_by_key,
+                                command_queue &queue)
+{
+    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
+    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;
+
+    meta_kernel k("merge_blocks");
+    size_t count_arg = k.add_arg<const uint_>("count");
+    size_t block_size_arg = k.add_arg<const uint_>("block_size");
+
+    k <<
+        // get global id
+        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
+        "if(gid >= count) {\n" <<
+            "return;\n" <<
+        "}\n" <<
+
+        k.decl<const key_type>("my_key") << " = " <<
+            keys_first[k.var<const uint_>("gid")] << ";\n";
+
+    if(sort_by_key) {
+        k <<
+            k.decl<const value_type>("my_value") << " = " <<
+                values_first[k.var<const uint_>("gid")] << ";\n";
+    }
+
+    k <<
+        // get my block idx
+        k.decl<const uint_>("my_block_idx") << " = gid / block_size;\n" <<
+        k.decl<const bool>("my_block_idx_is_odd") << " = " <<
+            "my_block_idx & 0x1;\n" <<
+
+        k.decl<const uint_>("other_block_idx") << " = " <<
+            // if(my_block_idx is odd) {} else {}
+            "my_block_idx_is_odd ? my_block_idx - 1 : my_block_idx + 1;\n" <<
+
+        // get ranges of my block and the other block
+        // [my_block_start; my_block_end)
+        // [other_block_start; other_block_end)
+        k.decl<const uint_>("my_block_start") << " = " <<
+            "min(my_block_idx * block_size, count);\n" << // including
+        k.decl<const uint_>("my_block_end") << " = " <<
+            "min((my_block_idx + 1) * block_size, count);\n" << // excluding
+
+        k.decl<const uint_>("other_block_start") << " = " <<
+            "min(other_block_idx * block_size, count);\n" << // including
+        k.decl<const uint_>("other_block_end") << " = " <<
+            "min((other_block_idx + 1) * block_size, count);\n" << // excluding
+
+        // other block is empty, nothing to merge here
+        "if(other_block_start == count){\n" <<
+            out_keys_first[k.var<uint_>("gid")] << " = my_key;\n";
+        if(sort_by_key) {
+            k <<
+                out_values_first[k.var<uint_>("gid")] << " = my_value;\n";
+        }
+
+        k <<
+        "return;\n" <<
+        "}\n" <<
+
+        // lower bound
+        // left_idx - lower bound
+        k.decl<uint_>("left_idx") << " = other_block_start;\n" <<
+        k.decl<uint_>("right_idx") << " = other_block_end;\n" <<
+        "while(left_idx < right_idx) {\n" <<
+            k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" <<
+            k.decl<key_type>("mid_key") << " = " <<
+                    keys_first[k.var<const uint_>("mid_idx")] << ";\n" <<
+            k.decl<bool>("smaller") << " = " <<
+                compare(k.var<key_type>("mid_key"),
+                        k.var<key_type>("my_key")) << ";\n" <<
+            "left_idx = smaller ? mid_idx + 1 : left_idx;\n" <<
+            "right_idx = smaller ? right_idx :  mid_idx;\n" <<
+        "}\n" <<
+        // left_idx is found position in other block
+
+        // if my_block is odd we need to get the upper bound
+        "right_idx = other_block_end;\n" <<
+        "if(my_block_idx_is_odd && left_idx != right_idx) {\n" <<
+            k.decl<key_type>("upper_key") << " = " <<
+                keys_first[k.var<const uint_>("left_idx")] << ";\n" <<
+            "while(" <<
+                "!(" << compare(k.var<key_type>("upper_key"),
+                                k.var<key_type>("my_key")) <<
+                ") && " <<
+                "!(" << compare(k.var<key_type>("my_key"),
+                                k.var<key_type>("upper_key")) <<
+                ") && " <<
+                     "left_idx < right_idx" <<
+                ")" <<
+            "{\n" <<
+                k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" <<
+                k.decl<key_type>("mid_key") << " = " <<
+                    keys_first[k.var<const uint_>("mid_idx")] << ";\n" <<
+                k.decl<bool>("equal") << " = " <<
+                    "!(" << compare(k.var<key_type>("mid_key"),
+                                    k.var<key_type>("my_key")) <<
+                    ") && " <<
+                    "!(" << compare(k.var<key_type>("my_key"),
+                                    k.var<key_type>("mid_key")) <<
+                    ");\n" <<
+                "left_idx = equal ? mid_idx + 1 : left_idx + 1;\n" <<
+                "right_idx = equal ? right_idx : mid_idx;\n" <<
+                "upper_key = equal ? upper_key : " <<
+                    keys_first[k.var<const uint_>("left_idx")] << ";\n" <<
+            "}\n" <<
+        "}\n" <<
+
+        // final position = start of the merged pair
+        //                + position inside my block
+        //                + number of preceding elements in the other block
+        k.decl<uint_>("offset") << " = 0;\n" <<
+        "offset += gid - my_block_start;\n" <<
+        "offset += left_idx - other_block_start;\n" <<
+        "offset += min(my_block_start, other_block_start);\n" <<
+        out_keys_first[k.var<uint_>("offset")] << " = my_key;\n";
+    if(sort_by_key) {
+        k <<
+            out_values_first[k.var<uint_>("offset")] << " = my_value;\n";
+    }
+
+    const context &context = queue.get_context();
+    ::boost::compute::kernel kernel = k.compile(context);
+
+    const size_t work_group_size = (std::min)(
+        size_t(256),
+        kernel.get_work_group_info<size_t>(
+            queue.get_device(), CL_KERNEL_WORK_GROUP_SIZE
+        )
+    );
+    // Number of work-groups, rounded up. Integer arithmetic is used because
+    // float(count) is inexact for counts above 2^24 and could round the work
+    // size down, leaving the trailing elements out of the output.
+    const size_t group_count =
+        count / work_group_size + (count % work_group_size != 0 ? 1 : 0);
+    const size_t global_size = work_group_size * group_count;
+
+    kernel.set_arg(count_arg, static_cast<uint_>(count));
+    kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
+    queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size);
+}
+
+/// Sorts the keys in [\p keys_first, \p keys_last) and reorders the values
+/// starting at \p values_first alongside them. The input is first sorted in
+/// blocks by block_sort(), then neighbouring blocks are merged, doubling the
+/// block size on every pass.
+///
+/// \param keys_first   start of the key range
+/// \param keys_last    end of the key range
+/// \param values_first start of the value range
+/// \param compare      comparison function forwarded to the sort/merge steps
+/// \param stable       forwarded to block_sort() as its "stable" flag
+/// \param queue        command queue on which all work is enqueued
+///
+/// The final copy back into the caller's ranges is enqueued with copy_async()
+/// and is not waited for here.
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void merge_sort_by_key_on_gpu(KeyIterator keys_first,
+                                     KeyIterator keys_last,
+                                     ValueIterator values_first,
+                                     Compare compare,
+                                     bool stable,
+                                     command_queue &queue)
+{
+    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
+    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;
+
+    // zero or one element: nothing to do
+    size_t count = iterator_range_size(keys_first, keys_last);
+    if(count <= 1){
+        return;
+    }
+
+    // sort the input block-wise; the returned value is the block size used
+    size_t sorted_run = block_sort(keys_first, values_first,
+                                   compare, count,
+                                   true /* sort_by_key */, stable /* stable */,
+                                   queue);
+
+    // the whole input fitted into a single block, so it is already sorted
+    if(sorted_run >= count) {
+        return;
+    }
+
+    // scratch storage: merge passes ping-pong between it and the caller's ranges
+    ::boost::compute::vector<key_type> scratch_keys(count, queue.get_context());
+    ::boost::compute::vector<value_type> scratch_values(count, queue.get_context());
+
+    // true while the most recent merge result lives in the scratch vectors
+    bool in_scratch = false;
+    while(sorted_run < count) {
+        if(in_scratch) {
+            merge_blocks_on_gpu(scratch_keys.begin(), scratch_values.begin(),
+                                keys_first, values_first,
+                                compare, count, sorted_run,
+                                true /* sort_by_key */, queue);
+        } else {
+            merge_blocks_on_gpu(keys_first, values_first,
+                                scratch_keys.begin(), scratch_values.begin(),
+                                compare, count, sorted_run,
+                                true /* sort_by_key */, queue);
+        }
+        in_scratch = !in_scratch;
+        sorted_run *= 2;
+    }
+
+    // an odd number of passes leaves the result in scratch: copy it back
+    if(in_scratch) {
+        copy_async(scratch_keys.begin(), scratch_keys.end(), keys_first, queue);
+        copy_async(scratch_values.begin(), scratch_values.end(), values_first, queue);
+    }
+}
+
+/// Sorts the range [\p first, \p last). The input is first sorted in blocks
+/// by block_sort(), then neighbouring blocks are merged, doubling the block
+/// size on every pass.
+///
+/// \param first   start of the range to sort
+/// \param last    end of the range to sort
+/// \param compare comparison function forwarded to the sort/merge steps
+/// \param stable  forwarded to block_sort() as its "stable" flag
+/// \param queue   command queue on which all work is enqueued
+///
+/// The final copy back into the caller's range is enqueued with copy_async()
+/// and is not waited for here.
+template<class Iterator, class Compare>
+inline void merge_sort_on_gpu(Iterator first,
+                              Iterator last,
+                              Compare compare,
+                              bool stable,
+                              command_queue &queue)
+{
+    typedef typename std::iterator_traits<Iterator>::value_type key_type;
+
+    // zero or one element: nothing to do
+    size_t count = iterator_range_size(first, last);
+    if(count <= 1){
+        return;
+    }
+
+    // placeholder passed wherever a values iterator is expected; the calls
+    // below all pass sort_by_key == false
+    Iterator no_values;
+
+    // sort the input block-wise; the returned value is the block size used
+    size_t sorted_run = block_sort(first, no_values,
+                                   compare, count,
+                                   false /* sort_by_key */, stable /* stable */,
+                                   queue);
+
+    // the whole input fitted into a single block, so it is already sorted
+    if(sorted_run >= count) {
+        return;
+    }
+
+    // scratch storage: merge passes ping-pong between it and the caller's range
+    ::boost::compute::vector<key_type> scratch_keys(count, queue.get_context());
+
+    // true while the most recent merge result lives in the scratch vector
+    bool in_scratch = false;
+    while(sorted_run < count) {
+        if(in_scratch) {
+            merge_blocks_on_gpu(scratch_keys.begin(), no_values, first, no_values,
+                                compare, count, sorted_run,
+                                false /* sort_by_key */, queue);
+        } else {
+            merge_blocks_on_gpu(first, no_values, scratch_keys.begin(), no_values,
+                                compare, count, sorted_run,
+                                false /* sort_by_key */, queue);
+        }
+        in_scratch = !in_scratch;
+        sorted_run *= 2;
+    }
+
+    // an odd number of passes leaves the result in scratch: copy it back
+    if(in_scratch) {
+        copy_async(scratch_keys.begin(), scratch_keys.end(), first, queue);
+    }
+}
+
+/// \overload
+///
+/// Forwards to the overload taking a \c stable flag, with that flag set to
+/// false.
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void merge_sort_by_key_on_gpu(KeyIterator keys_first,
+                                     KeyIterator keys_last,
+                                     ValueIterator values_first,
+                                     Compare compare,
+                                     command_queue &queue)
+{
+    const bool stable = false; // not stable
+    merge_sort_by_key_on_gpu(keys_first, keys_last, values_first,
+                             compare, stable, queue);
+}
+
+/// \overload
+///
+/// Forwards to the overload taking a \c stable flag, with that flag set to
+/// false.
+template<class Iterator, class Compare>
+inline void merge_sort_on_gpu(Iterator first,
+                              Iterator last,
+                              Compare compare,
+                              command_queue &queue)
+{
+    const bool stable = false; // not stable
+    merge_sort_on_gpu(first, last, compare, stable, queue);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif /* BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_ */
@@ -0,0 +1,203 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_WIH_MERGE_PATH_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_WIH_MERGE_PATH_HPP
+
+#include <iterator>
+
+#include <boost/compute/algorithm/detail/merge_path.hpp>
+#include <boost/compute/algorithm/fill_n.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/system.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Serial merge kernel class
+///
+/// Subclass of meta_kernel to perform serial merge after tiling: every
+/// work-item merges one tile, whose boundaries in both inputs are read from
+/// the tile arrays passed to set_range().
+///
+/// Typical use: set \c tile_size, call set_range() once, then exec().
+///
+class serial_merge_kernel : public meta_kernel
+{
+public:
+    /// Number of output slots per tile. It is baked into the kernel source by
+    /// set_range() (work-item \c i starts writing at \c i*tile_size), so it
+    /// has to be assigned before set_range() is called.
+    unsigned int tile_size;
+
+    serial_merge_kernel()
+        : meta_kernel("merge"),
+          tile_size(4),
+          m_count(0) // exec() is a no-op until set_range() has been called
+    {
+    }
+
+    /// Generates the kernel source for merging \p first1 and \p first2.
+    ///
+    /// \param first1      start of the first sorted input
+    /// \param first2      start of the second sorted input
+    /// \param tile_first1 start of the tile boundaries into the first input
+    /// \param tile_last1  end of the tile boundaries into the first input
+    /// \param tile_first2 start of the tile boundaries into the second input
+    /// \param result      start of the output range
+    /// \param comp        comparison function; on ties the element from the
+    ///                    first input is written first
+    ///
+    /// Tile \c i covers [tile[i], tile[i+1]) of each input, so N boundaries
+    /// describe N-1 tiles and N-1 work-items are launched by exec().
+    template<class InputIterator1, class InputIterator2,
+             class InputIterator3, class InputIterator4,
+             class OutputIterator, class Compare>
+    void set_range(InputIterator1 first1,
+                   InputIterator2 first2,
+                   InputIterator3 tile_first1,
+                   InputIterator3 tile_last1,
+                   InputIterator4 tile_first2,
+                   OutputIterator result,
+                   Compare comp)
+    {
+        // guard against an empty boundary range: "size - 1" on an unsigned
+        // zero would wrap around to a huge work size
+        const size_t boundary_count = iterator_range_size(tile_first1, tile_last1);
+        m_count = (boundary_count > 0) ? (boundary_count - 1) : 0;
+
+        *this <<
+        "uint i = get_global_id(0);\n" <<
+        "uint start1 = " << tile_first1[expr<uint_>("i")] << ";\n" <<
+        "uint end1 = " << tile_first1[expr<uint_>("i+1")] << ";\n" <<
+        "uint start2 = " << tile_first2[expr<uint_>("i")] << ";\n" <<
+        "uint end2 = " << tile_first2[expr<uint_>("i+1")] << ";\n" <<
+        "uint index = i*" << tile_size << ";\n" <<
+        "while(start1<end1 && start2<end2)\n" <<
+        "{\n" <<
+        "   if(!(" << comp(first2[expr<uint_>("start2")],
+                            first1[expr<uint_>("start1")]) << "))\n" <<
+        "   {\n" <<
+                result[expr<uint_>("index")] <<
+                    " = " << first1[expr<uint_>("start1")] << ";\n" <<
+        "       index++;\n" <<
+        "       start1++;\n" <<
+        "   }\n" <<
+        "   else\n" <<
+        "   {\n" <<
+                result[expr<uint_>("index")] <<
+                    " = " << first2[expr<uint_>("start2")] << ";\n" <<
+        "       index++;\n" <<
+        "       start2++;\n" <<
+        "   }\n" <<
+        "}\n" <<
+        // drain whatever is left of the first input's tile
+        "while(start1<end1)\n" <<
+        "{\n" <<
+            result[expr<uint_>("index")] <<
+                " = " << first1[expr<uint_>("start1")] << ";\n" <<
+        "   index++;\n" <<
+        "   start1++;\n" <<
+        "}\n" <<
+        // drain whatever is left of the second input's tile
+        "while(start2<end2)\n" <<
+        "{\n" <<
+            result[expr<uint_>("index")] <<
+                " = " << first2[expr<uint_>("start2")] << ";\n" <<
+        "   index++;\n" <<
+        "   start2++;\n" <<
+        "}\n";
+    }
+
+    /// \overload
+    ///
+    /// Uses ::boost::compute::less on the value type of \p first1 as the
+    /// comparison function.
+    template<class InputIterator1, class InputIterator2,
+             class InputIterator3, class InputIterator4,
+             class OutputIterator>
+    void set_range(InputIterator1 first1,
+                   InputIterator2 first2,
+                   InputIterator3 tile_first1,
+                   InputIterator3 tile_last1,
+                   InputIterator4 tile_first2,
+                   OutputIterator result)
+    {
+        typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
+        ::boost::compute::less<value_type> less_than;
+        set_range(first1, first2, tile_first1, tile_last1, tile_first2, result, less_than);
+    }
+
+    /// Enqueues the merge kernel on \p queue with one work-item per tile.
+    /// Returns a default-constructed event when there are no tiles to merge.
+    event exec(command_queue &queue)
+    {
+        if(m_count == 0) {
+            return event();
+        }
+
+        return exec_1d(queue, 0, m_count);
+    }
+
+private:
+    size_t m_count; ///< number of tiles, i.e. work-items launched by exec()
+};
+
+///
+/// \brief Merge algorithm with merge path
+///
+/// Merges the sorted values in the range [\p first1, \p last1) with
+/// the sorted values in the range [\p first2, \p last2) and stores the
+/// result in the range beginning at \p result. The inputs are first split
+/// into tiles by merge_path_kernel, then every tile is merged by
+/// serial_merge_kernel.
+///
+/// \param first1 Iterator pointing to start of first set
+/// \param last1 Iterator pointing to end of first set
+/// \param first2 Iterator pointing to start of second set
+/// \param last2 Iterator pointing to end of second set
+/// \param result Iterator pointing to start of range in which the result
+/// will be stored
+/// \param comp Comparator which performs less than function
+/// \param queue Queue on which to execute
+///
+/// \return Iterator to the end of the merged output, i.e. \p result advanced
+/// by the combined size of both inputs
+///
+template<class InputIterator1, class InputIterator2, class OutputIterator, class Compare>
+inline OutputIterator
+merge_with_merge_path(InputIterator1 first1,
+                      InputIterator1 last1,
+                      InputIterator2 first2,
+                      InputIterator2 last2,
+                      OutputIterator result,
+                      Compare comp,
+                      command_queue &queue = system::default_queue())
+{
+    typedef typename
+        std::iterator_traits<OutputIterator>::difference_type result_difference_type;
+
+    const size_t tile_size = 1024;
+
+    const size_t count1 = iterator_range_size(first1, last1);
+    const size_t count2 = iterator_range_size(first2, last2);
+    const size_t total = count1 + count2;
+
+    // one boundary per (possibly partial) tile plus the closing boundary
+    const size_t boundary_count = (total + tile_size - 1) / tile_size + 1;
+
+    vector<uint_> tile_a(boundary_count, queue.get_context());
+    vector<uint_> tile_b(boundary_count, queue.get_context());
+
+    // Tile the sets: the kernel writes from slot 1 onwards, slot 0 of each
+    // boundary array is set to zero separately
+    merge_path_kernel tiling_kernel;
+    tiling_kernel.tile_size = static_cast<unsigned int>(tile_size);
+    tiling_kernel.set_range(first1, last1, first2, last2,
+                            tile_a.begin()+1, tile_b.begin()+1, comp);
+    fill_n(tile_a.begin(), 1, uint_(0), queue);
+    fill_n(tile_b.begin(), 1, uint_(0), queue);
+    tiling_kernel.exec(queue);
+
+    // the closing boundaries are the input sizes
+    fill_n(tile_a.end()-1, 1, static_cast<uint_>(count1), queue);
+    fill_n(tile_b.end()-1, 1, static_cast<uint_>(count2), queue);
+
+    // Merge every tile serially
+    serial_merge_kernel merge_kernel;
+    merge_kernel.tile_size = static_cast<unsigned int>(tile_size);
+    merge_kernel.set_range(first1, first2, tile_a.begin(), tile_a.end(),
+                           tile_b.begin(), result, comp);
+
+    merge_kernel.exec(queue);
+
+    return result + static_cast<result_difference_type>(total);
+}
+
+/// \overload
+///
+/// Merges using ::boost::compute::less on the value type of the first input
+/// as the comparison function.
+template<class InputIterator1, class InputIterator2, class OutputIterator>
+inline OutputIterator
+merge_with_merge_path(InputIterator1 first1,
+                      InputIterator1 last1,
+                      InputIterator2 first2,
+                      InputIterator2 last2,
+                      OutputIterator result,
+                      command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator1>::value_type input_value_type;
+
+    ::boost::compute::less<input_value_type> default_comp;
+    return merge_with_merge_path(
+        first1, last1, first2, last2, result, default_comp, queue
+    );
+}
+
+} //end detail namespace
+} //end compute namespace
+} //end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_WIH_MERGE_PATH_HPP
@@ -0,0 +1,461 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_RADIX_SORT_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_RADIX_SORT_HPP
+
+#include <iterator>
+
+#include <boost/assert.hpp>
+#include <boost/type_traits/is_signed.hpp>
+#include <boost/type_traits/is_floating_point.hpp>
+
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/program.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/exclusive_scan.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
+#include <boost/compute/type_traits/type_name.hpp>
+#include <boost/compute/type_traits/is_fundamental.hpp>
+#include <boost/compute/type_traits/is_vector_type.hpp>
+#include <boost/compute/utility/program_cache.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// meta-function returning true if type T is radix-sortable: T has to satisfy
+// ::boost::compute::is_fundamental and must not be a vector type (as reported
+// by is_vector_type)
+template<class T>
+struct is_radix_sortable :
+    boost::mpl::and_<
+        typename ::boost::compute::is_fundamental<T>::type,
+        typename boost::mpl::not_<typename is_vector_type<T>::type>::type
+    >
+{
+};
+
+// Maps a key size in bytes (N) to the unsigned integer type of the same size
+// that the radix sort kernels use to work on the key's bits. The primary
+// template deliberately has no nested "type", so only sizes 1, 2, 4 and 8
+// are usable.
+template<size_t N>
+struct radix_sort_value_type {};
+
+// 1-byte keys
+template<>
+struct radix_sort_value_type<1> { typedef uchar_ type; };
+
+// 2-byte keys
+template<>
+struct radix_sort_value_type<2> { typedef ushort_ type; };
+
+// 4-byte keys
+template<>
+struct radix_sort_value_type<4> { typedef uint_ type; };
+
+// 8-byte keys
+template<>
+struct radix_sort_value_type<8> { typedef ulong_ type; };
+
+// Returns the compiler option that defines T2_double for the radix sort
+// program. Generic case: T is not double, so T2_double is defined as 0.
+template<typename T>
+inline const char* enable_double()
+{
+    const char *const option = " -DT2_double=0";
+    return option;
+}
+
+// double: T2_double is defined as 1, which makes the program source enable
+// the cl_khr_fp64 extension.
+template<>
+inline const char* enable_double<double>()
+{
+    const char *const option = " -DT2_double=1";
+    return option;
+}
+
+// OpenCL C source of the radix sort program. It defines the helper radix()
+// and three kernels:
+//   count   - per work-group histogram of the current digit
+//   scan    - turns the per-bucket totals into exclusive global offsets
+//   scatter - writes every element to its position for the current digit
+//
+// The source is configured through preprocessor macros supplied at build
+// time: K_BITS (bits per digit), T (key type the bit operations work on) and
+// BLOCK_SIZE (size of scatter's local array) are used unconditionally; ASC,
+// IS_SIGNED, IS_FLOATING_POINT and SORT_BY_KEY select code paths; T2 is the
+// value type when SORT_BY_KEY is defined and T2_double enables cl_khr_fp64.
+const char radix_sort_source[] =
+"#if T2_double\n"
+"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+"#endif\n"
+"#define K2_BITS (1 << K_BITS)\n"
+"#define RADIX_MASK ((((T)(1)) << K_BITS) - 1)\n"
+"#define SIGN_BIT ((sizeof(T) * CHAR_BIT) - 1)\n"
+
+"#if defined(ASC)\n" // asc order
+
+"inline uint radix(const T x, const uint low_bit)\n"
+"{\n"
+"#if defined(IS_FLOATING_POINT)\n"
+"    const T mask = -(x >> SIGN_BIT) | (((T)(1)) << SIGN_BIT);\n"
+"    return ((x ^ mask) >> low_bit) & RADIX_MASK;\n"
+"#elif defined(IS_SIGNED)\n"
+"    return ((x ^ (((T)(1)) << SIGN_BIT)) >> low_bit) & RADIX_MASK;\n"
+"#else\n"
+"    return (x >> low_bit) & RADIX_MASK;\n"
+"#endif\n"
+"}\n"
+
+"#else\n" // desc order
+
+// For signed types we just negate the x and for unsigned types we
+// subtract the x from max value of its type ((T)(-1) is a max value
+// of type T when T is an unsigned type).
+"inline uint radix(const T x, const uint low_bit)\n"
+"{\n"
+"#if defined(IS_FLOATING_POINT)\n"
+"    const T mask = -(x >> SIGN_BIT) | (((T)(1)) << SIGN_BIT);\n"
+"    return (((-x) ^ mask) >> low_bit) & RADIX_MASK;\n"
+"#elif defined(IS_SIGNED)\n"
+"    return (((-x) ^ (((T)(1)) << SIGN_BIT)) >> low_bit) & RADIX_MASK;\n"
+"#else\n"
+"    return (((T)(-1) - x) >> low_bit) & RADIX_MASK;\n"
+"#endif\n"
+"}\n"
+
+"#endif\n" // #if defined(ASC)
+
+"__kernel void count(__global const T *input,\n"
+"                    const uint input_offset,\n"
+"                    const uint input_size,\n"
+"                    __global uint *global_counts,\n"
+"                    __global uint *global_offsets,\n"
+"                    __local uint *local_counts,\n"
+"                    const uint low_bit)\n"
+"{\n"
+     // work-item parameters
+"    const uint gid = get_global_id(0);\n"
+"    const uint lid = get_local_id(0);\n"
+
+     // zero local counts
+"    if(lid < K2_BITS){\n"
+"        local_counts[lid] = 0;\n"
+"    }\n"
+"    barrier(CLK_LOCAL_MEM_FENCE);\n"
+
+     // reduce local counts
+"    if(gid < input_size){\n"
+"        T value = input[input_offset+gid];\n"
+"        uint bucket = radix(value, low_bit);\n"
+"        atomic_inc(local_counts + bucket);\n"
+"    }\n"
+"    barrier(CLK_LOCAL_MEM_FENCE);\n"
+
+     // write block-relative offsets
+"    if(lid < K2_BITS){\n"
+"        global_counts[K2_BITS*get_group_id(0) + lid] = local_counts[lid];\n"
+
+         // write global offsets
+"        if(get_group_id(0) == (get_num_groups(0) - 1)){\n"
+"            global_offsets[lid] = local_counts[lid];\n"
+"        }\n"
+"    }\n"
+"}\n"
+
+"__kernel void scan(__global const uint *block_offsets,\n"
+"                   __global uint *global_offsets,\n"
+"                   const uint block_count)\n"
+"{\n"
+"    __global const uint *last_block_offsets =\n"
+"        block_offsets + K2_BITS * (block_count - 1);\n"
+
+     // calculate and scan global_offsets
+"    uint sum = 0;\n"
+"    for(uint i = 0; i < K2_BITS; i++){\n"
+"        uint x = global_offsets[i] + last_block_offsets[i];\n"
+"        global_offsets[i] = sum;\n"
+"        sum += x;\n"
+"    }\n"
+"}\n"
+
+"__kernel void scatter(__global const T *input,\n"
+"                      const uint input_offset,\n"
+"                      const uint input_size,\n"
+"                      const uint low_bit,\n"
+"                      __global const uint *counts,\n"
+"                      __global const uint *global_offsets,\n"
+"#ifndef SORT_BY_KEY\n"
+"                      __global T *output,\n"
+"                      const uint output_offset)\n"
+"#else\n"
+"                      __global T *keys_output,\n"
+"                      const uint keys_output_offset,\n"
+"                      __global T2 *values_input,\n"
+"                      const uint values_input_offset,\n"
+"                      __global T2 *values_output,\n"
+"                      const uint values_output_offset)\n"
+"#endif\n"
+"{\n"
+     // work-item parameters
+"    const uint gid = get_global_id(0);\n"
+"    const uint lid = get_local_id(0);\n"
+
+     // copy input to local memory
+"    T value;\n"
+"    uint bucket;\n"
+"    __local uint local_input[BLOCK_SIZE];\n"
+"    if(gid < input_size){\n"
+"        value = input[input_offset+gid];\n"
+"        bucket = radix(value, low_bit);\n"
+"        local_input[lid] = bucket;\n"
+"    }\n"
+
+     // copy block counts to local memory
+"    __local uint local_counts[(1 << K_BITS)];\n"
+"    if(lid < K2_BITS){\n"
+"        local_counts[lid] = counts[get_group_id(0) * K2_BITS + lid];\n"
+"    }\n"
+
+     // wait until local memory is ready
+"    barrier(CLK_LOCAL_MEM_FENCE);\n"
+
+"    if(gid >= input_size){\n"
+"        return;\n"
+"    }\n"
+
+     // get global offset
+"    uint offset = global_offsets[bucket] + local_counts[bucket];\n"
+
+     // calculate local offset
+"    uint local_offset = 0;\n"
+"    for(uint i = 0; i < lid; i++){\n"
+"        if(local_input[i] == bucket)\n"
+"            local_offset++;\n"
+"    }\n"
+
+"#ifndef SORT_BY_KEY\n"
+     // write value to output
+"    output[output_offset + offset + local_offset] = value;\n"
+"#else\n"
+     // write key and value if doing sort_by_key
+"    keys_output[keys_output_offset+offset + local_offset] = value;\n"
+"    values_output[values_output_offset+offset + local_offset] =\n"
+"        values_input[values_input_offset+gid];\n"
+"#endif\n"
+"}\n";
+
+/// Radix-sorts the keys in [\p first, \p last), optionally reordering the
+/// values starting at \p values_first alongside them.
+///
+/// \param first        start of the key range
+/// \param last         end of the key range
+/// \param values_first start of the value range; an iterator without a
+///                     buffer (e.g. default-constructed) selects a keys-only
+///                     sort
+/// \param ascending    true for ascending order, false for descending order
+/// \param queue        command queue on which all work is enqueued
+///
+/// The keys are processed k bits at a time, starting with the lowest bits.
+/// Every pass runs the "count", "scan" and "scatter" kernels and then swaps
+/// the roles of the input and output buffers. The sorted data ends up back in
+/// the caller's buffers because the number of passes is even for every
+/// supported combination (k in {1, 2, 4}, key size of 1, 2, 4 or 8 bytes).
+template<class T, class T2>
+inline void radix_sort_impl(const buffer_iterator<T> first,
+                            const buffer_iterator<T> last,
+                            const buffer_iterator<T2> values_first,
+                            const bool ascending,
+                            command_queue &queue)
+{
+
+    typedef T value_type;
+    typedef typename radix_sort_value_type<sizeof(T)>::type sort_type;
+
+    // A range with fewer than two elements is already sorted. Returning here
+    // also keeps an empty range from enqueueing kernels with a global work
+    // size of zero and from running the "scan" kernel with block_count == 0,
+    // where it would index block (block_count - 1).
+    const size_t count = detail::iterator_range_size(first, last);
+    if(count < 2){
+        return;
+    }
+
+    const device &device = queue.get_device();
+    const context &context = queue.get_context();
+
+
+    // if we have a valid values iterator then we are doing a
+    // sort by key and have to set up the values buffer
+    bool sort_by_key = (values_first.get_buffer().get() != 0);
+
+    // load (or create) radix sort program
+    std::string cache_key =
+        std::string("__boost_radix_sort_") + type_name<value_type>();
+
+    if(sort_by_key){
+        cache_key += std::string("_with_") + type_name<T2>();
+    }
+
+    boost::shared_ptr<program_cache> cache =
+        program_cache::get_global_cache(context);
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    // sort parameters: k is the number of bits per pass, k2 the resulting
+    // number of buckets and block_size the work-group size
+    const uint_ k = parameters->get(cache_key, "k", 4);
+    const uint_ k2 = 1 << k;
+    const uint_ block_size = parameters->get(cache_key, "tpb", 128);
+
+    // sort program compiler options
+    std::stringstream options;
+    options << "-DK_BITS=" << k;
+    options << " -DT=" << type_name<sort_type>();
+    options << " -DBLOCK_SIZE=" << block_size;
+
+    if(boost::is_floating_point<value_type>::value){
+        options << " -DIS_FLOATING_POINT";
+    }
+
+    if(boost::is_signed<value_type>::value){
+        options << " -DIS_SIGNED";
+    }
+
+    if(sort_by_key){
+        options << " -DSORT_BY_KEY";
+        options << " -DT2=" << type_name<T2>();
+        options << enable_double<T2>();
+    }
+
+    if(ascending){
+        options << " -DASC";
+    }
+
+    // load radix sort program
+    program radix_sort_program = cache->get_or_build(
+        cache_key, options.str(), radix_sort_source, context
+    );
+
+    kernel count_kernel(radix_sort_program, "count");
+    kernel scan_kernel(radix_sort_program, "scan");
+    kernel scatter_kernel(radix_sort_program, "scatter");
+
+    // number of work-groups, rounded up so that a partial block is covered
+    uint_ block_count = static_cast<uint_>(count / block_size);
+    if(block_count * block_size != count){
+        block_count++;
+    }
+
+    // setup temporary buffers
+    vector<value_type> output(count, context);
+    vector<T2> values_output(sort_by_key ? count : 0, context);
+    vector<uint_> offsets(k2, context);
+    vector<uint_> counts(block_count * k2, context);
+
+    const buffer *input_buffer = &first.get_buffer();
+    uint_ input_offset = static_cast<uint_>(first.get_index());
+    const buffer *output_buffer = &output.get_buffer();
+    uint_ output_offset = 0;
+    const buffer *values_input_buffer = &values_first.get_buffer();
+    uint_ values_input_offset = static_cast<uint_>(values_first.get_index());
+    const buffer *values_output_buffer = &values_output.get_buffer();
+    uint_ values_output_offset = 0;
+
+    // one pass per group of k bits, starting with the lowest bits
+    for(uint_ i = 0; i < sizeof(sort_type) * CHAR_BIT / k; i++){
+        // write counts
+        count_kernel.set_arg(0, *input_buffer);
+        count_kernel.set_arg(1, input_offset);
+        count_kernel.set_arg(2, static_cast<uint_>(count));
+        count_kernel.set_arg(3, counts);
+        count_kernel.set_arg(4, offsets);
+        count_kernel.set_arg(5, block_size * sizeof(uint_), 0);
+        count_kernel.set_arg(6, i * k);
+        queue.enqueue_1d_range_kernel(count_kernel,
+                                      0,
+                                      block_count * block_size,
+                                      block_size);
+
+        // scan counts: the k2 counters of each block are scanned as one
+        // vector element, so the vector width has to equal k2
+        if(k == 1){
+            typedef uint2_ counter_type;
+            ::boost::compute::exclusive_scan(
+                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
+                make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 2),
+                make_buffer_iterator<counter_type>(counts.get_buffer()),
+                queue
+            );
+        }
+        else if(k == 2){
+            typedef uint4_ counter_type;
+            ::boost::compute::exclusive_scan(
+                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
+                make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 4),
+                make_buffer_iterator<counter_type>(counts.get_buffer()),
+                queue
+            );
+        }
+        else if(k == 4){
+            typedef uint16_ counter_type;
+            ::boost::compute::exclusive_scan(
+                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
+                make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 16),
+                make_buffer_iterator<counter_type>(counts.get_buffer()),
+                queue
+            );
+        }
+        else {
+            BOOST_ASSERT(false && "unknown k");
+            break;
+        }
+
+        // scan global offsets
+        scan_kernel.set_arg(0, counts);
+        scan_kernel.set_arg(1, offsets);
+        scan_kernel.set_arg(2, block_count);
+        queue.enqueue_task(scan_kernel);
+
+        // scatter values
+        scatter_kernel.set_arg(0, *input_buffer);
+        scatter_kernel.set_arg(1, input_offset);
+        scatter_kernel.set_arg(2, static_cast<uint_>(count));
+        scatter_kernel.set_arg(3, i * k);
+        scatter_kernel.set_arg(4, counts);
+        scatter_kernel.set_arg(5, offsets);
+        scatter_kernel.set_arg(6, *output_buffer);
+        scatter_kernel.set_arg(7, output_offset);
+        if(sort_by_key){
+            scatter_kernel.set_arg(8, *values_input_buffer);
+            scatter_kernel.set_arg(9, values_input_offset);
+            scatter_kernel.set_arg(10, *values_output_buffer);
+            scatter_kernel.set_arg(11, values_output_offset);
+        }
+        queue.enqueue_1d_range_kernel(scatter_kernel,
+                                      0,
+                                      block_count * block_size,
+                                      block_size);
+
+        // swap buffers: this pass's output is the next pass's input
+        std::swap(input_buffer, output_buffer);
+        std::swap(values_input_buffer, values_output_buffer);
+        std::swap(input_offset, output_offset);
+        std::swap(values_input_offset, values_output_offset);
+    }
+}
+
+/// Radix-sorts [\p first, \p last) in ascending order on \p queue.
+template<class Iterator>
+inline void radix_sort(Iterator first,
+                       Iterator last,
+                       command_queue &queue)
+{
+    // a default-constructed values iterator selects the keys-only sort
+    const bool ascending = true;
+    radix_sort_impl(first, last, buffer_iterator<int>(), ascending, queue);
+}
+
+/// Radix-sorts the keys in [\p keys_first, \p keys_last) in ascending order
+/// on \p queue, reordering the values starting at \p values_first alongside.
+template<class KeyIterator, class ValueIterator>
+inline void radix_sort_by_key(KeyIterator keys_first,
+                              KeyIterator keys_last,
+                              ValueIterator values_first,
+                              command_queue &queue)
+{
+    const bool ascending = true;
+    radix_sort_impl(keys_first, keys_last, values_first, ascending, queue);
+}
+
+/// Radix-sorts [\p first, \p last) on \p queue, in ascending order when
+/// \p ascending is true and in descending order otherwise.
+template<class Iterator>
+inline void radix_sort(Iterator first,
+                       Iterator last,
+                       const bool ascending,
+                       command_queue &queue)
+{
+    // a default-constructed values iterator selects the keys-only sort
+    radix_sort_impl(
+        first, last, buffer_iterator<int>(), ascending, queue
+    );
+}
+
+/// Radix-sorts the keys in [\p keys_first, \p keys_last) on \p queue,
+/// reordering the values starting at \p values_first alongside. The order is
+/// ascending when \p ascending is true and descending otherwise.
+template<class KeyIterator, class ValueIterator>
+inline void radix_sort_by_key(KeyIterator keys_first,
+                              KeyIterator keys_last,
+                              ValueIterator values_first,
+                              const bool ascending,
+                              command_queue &queue)
+{
+    radix_sort_impl(
+        keys_first, keys_last, values_first, ascending, queue
+    );
+}
+
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_RADIX_SORT_HPP
@@ -0,0 +1,57 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_RANDOM_FILL_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_RANDOM_FILL_HPP
+
+#include <iterator>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/random/default_random_engine.hpp>
+#include <boost/compute/random/uniform_real_distribution.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Fills [\p first, \p last) by delegating to the generator's own
+/// \c fill(first, last, queue) member.
+template<class OutputIterator, class Generator>
+inline void
+random_fill(OutputIterator first,
+            OutputIterator last,
+            Generator &g,
+            command_queue &queue)
+{
+    g.fill(first, last, queue);
+}
+
+/// Fills [\p first, \p last) using a uniform_real_distribution constructed
+/// with (\p lo, \p hi) and driven by a default_random_engine created on
+/// \p queue.
+template<class OutputIterator>
+inline void
+random_fill(OutputIterator first,
+            OutputIterator last,
+            typename std::iterator_traits<OutputIterator>::value_type lo,
+            typename std::iterator_traits<OutputIterator>::value_type hi,
+            command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<OutputIterator>::value_type value_type;
+
+    // the engine supplies the random numbers, the distribution maps them
+    // according to its (lo, hi) parameters
+    boost::compute::default_random_engine engine(queue);
+    boost::compute::uniform_real_distribution<value_type> distribution(lo, hi);
+
+    distribution.fill(first, last, engine, queue);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_RANDOM_FILL_HPP
@@ -0,0 +1,119 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_HPP
+
+#include <algorithm>
+#include <iterator>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/algorithm/detail/serial_reduce_by_key.hpp>
+#include <boost/compute/algorithm/detail/reduce_by_key_with_scan.hpp>
+#include <boost/compute/type_traits.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// \internal_
+/// GPU code path of reduce_by_key: forwards every argument unchanged to
+/// detail::reduce_by_key_with_scan() and returns its result.
+template<class InputKeyIterator, class InputValueIterator,
+         class OutputKeyIterator, class OutputValueIterator,
+         class BinaryFunction, class BinaryPredicate>
+size_t reduce_by_key_on_gpu(InputKeyIterator keys_first,
+                            InputKeyIterator keys_last,
+                            InputValueIterator values_first,
+                            OutputKeyIterator keys_result,
+                            OutputValueIterator values_result,
+                            BinaryFunction function,
+                            BinaryPredicate predicate,
+                            command_queue &queue)
+{
+    const size_t reduced_count =
+        detail::reduce_by_key_with_scan(
+            keys_first, keys_last,
+            values_first,
+            keys_result, values_result,
+            function, predicate,
+            queue
+        );
+    return reduced_count;
+}
+
+/// \internal_
+/// Returns true when the scan-based (GPU) reduce_by_key path should be used:
+/// the input has more than 256 elements, the queue's device is not a CPU
+/// device, and reduce_by_key_with_scan_requirements_met() accepts the device.
+template<class InputKeyIterator, class InputValueIterator,
+         class OutputKeyIterator, class OutputValueIterator>
+bool reduce_by_key_on_gpu_requirements_met(InputKeyIterator keys_first,
+                                           InputValueIterator values_first,
+                                           OutputKeyIterator keys_result,
+                                           OutputValueIterator values_result,
+                                           const size_t count,
+                                           command_queue &queue)
+{
+    const device &device = queue.get_device();
+    // The checks short-circuit in this order, so the device is only queried
+    // by the scan requirements check for large, non-CPU inputs.
+    return (count > 256)
+               && !(device.type() & device::cpu)
+               && reduce_by_key_with_scan_requirements_met(keys_first, values_first,
+                                                           keys_result,values_result,
+                                                           count, queue);
+}
+
+/// \internal_
+/// Picks the reduce_by_key implementation for the given input and runs it.
+///
+/// Inputs with fewer than two keys are copied straight to the outputs.
+/// Otherwise the GPU path is used when
+/// reduce_by_key_on_gpu_requirements_met() returns true, and
+/// serial_reduce_by_key() when it does not.
+///
+/// Returns a pair of iterators obtained by advancing \p keys_result and
+/// \p values_result by the number of output elements.
+template<class InputKeyIterator, class InputValueIterator,
+         class OutputKeyIterator, class OutputValueIterator,
+         class BinaryFunction, class BinaryPredicate>
+inline std::pair<OutputKeyIterator, OutputValueIterator>
+dispatch_reduce_by_key(InputKeyIterator keys_first,
+                       InputKeyIterator keys_last,
+                       InputValueIterator values_first,
+                       OutputKeyIterator keys_result,
+                       OutputValueIterator values_result,
+                       BinaryFunction function,
+                       BinaryPredicate predicate,
+                       command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<OutputKeyIterator>::difference_type key_difference_type;
+    typedef typename
+        std::iterator_traits<OutputValueIterator>::difference_type value_difference_type;
+
+    const size_t count = detail::iterator_range_size(keys_first, keys_last);
+
+    // number of (key, value) pairs written to the output ranges
+    size_t result_size = 0;
+
+    if(count < 2){
+        // zero or one pair: nothing to combine, copy the input through
+        boost::compute::copy_n(keys_first, count, keys_result, queue);
+        boost::compute::copy_n(values_first, count, values_result, queue);
+        result_size = count;
+    }
+    else if(reduce_by_key_on_gpu_requirements_met(keys_first, values_first,
+                                                  keys_result, values_result,
+                                                  count, queue)){
+        result_size =
+            detail::reduce_by_key_on_gpu(keys_first, keys_last, values_first,
+                                         keys_result, values_result, function,
+                                         predicate, queue);
+    }
+    else {
+        result_size =
+            detail::serial_reduce_by_key(keys_first, keys_last, values_first,
+                                         keys_result, values_result, function,
+                                         predicate, queue);
+    }
+
+    // one-past-the-end iterators of both output ranges
+    return std::pair<OutputKeyIterator, OutputValueIterator>(
+        keys_result + static_cast<key_difference_type>(result_size),
+        values_result + static_cast<value_difference_type>(result_size)
+    );
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_HPP
@@ -0,0 +1,541 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_WITH_SCAN_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_WITH_SCAN_HPP
+
+#include <algorithm>
+#include <iterator>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/algorithm/inclusive_scan.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/container/detail/scalar.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/read_write_single_value.hpp>
+#include <boost/compute/type_traits.hpp>
+#include <boost/compute/utility/program_cache.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// \internal_
+///
+/// Fills \p new_keys_first with unsigned integer keys generated from the vector
+/// of original keys \p keys_first. New keys can be distinguished by a simple
+/// equality predicate.
+///
+/// \param keys_first iterator pointing to the first key
+/// \param number_of_keys number of keys
+/// \param predicate binary predicate for key comparison
+/// \param new_keys_first iterator pointing to the new keys vector
+/// \param preferred_work_group_size preferred work group size
+/// \param queue command queue to perform the operation
+///
+/// Binary function \p predicate must take two keys as arguments and
+/// return true only if they are considered the same.
+///
+/// The first new key equals zero and the last equals number of unique keys
+/// minus one.
+///
+/// No local memory usage.
+template<class InputKeyIterator, class BinaryPredicate>
+inline void generate_uint_keys(InputKeyIterator keys_first,
+                               size_t number_of_keys,
+                               BinaryPredicate predicate,
+                               vector<uint_>::iterator new_keys_first,
+                               size_t preferred_work_group_size,
+                               command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<InputKeyIterator>::value_type key_type;
+
+    detail::meta_kernel k("reduce_by_key_new_key_flags");
+    k.add_set_arg<const uint_>("count", uint_(number_of_keys));
+
+    // Each work-item writes a flag: 1 if its key differs from the previous
+    // key (according to the predicate), 0 otherwise. Element 0 always gets 0.
+    k <<
+        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
+        k.decl<uint_>("value") << " = 0;\n" <<
+        "if(gid >= count){\n    return;\n}\n" <<
+        "if(gid > 0){ \n" <<
+        k.decl<key_type>("key") << " = " <<
+                                keys_first[k.var<const uint_>("gid")] << ";\n" <<
+        k.decl<key_type>("previous_key") << " = " <<
+                                keys_first[k.var<const uint_>("gid - 1")] << ";\n" <<
+        "    value = " << predicate(k.var<key_type>("previous_key"),
+                                    k.var<key_type>("key")) <<
+                          " ? 0 : 1;\n" <<
+        "}\n else {\n" <<
+        "    value = 0;\n" <<
+        "}\n" <<
+        new_keys_first[k.var<const uint_>("gid")] << " = value;\n";
+
+    const context &context = queue.get_context();
+    kernel kernel = k.compile(context);
+
+    // Integer ceiling division. A float cannot represent every count above
+    // 2^24, so a float-based ceil could leave trailing keys unprocessed.
+    const size_t work_group_size = preferred_work_group_size;
+    const size_t work_groups_no =
+        (number_of_keys + work_group_size - 1) / work_group_size;
+
+    queue.enqueue_1d_range_kernel(kernel,
+                                  0,
+                                  work_groups_no * work_group_size,
+                                  work_group_size);
+
+    // The running sum of the flags is the new key of every element.
+    inclusive_scan(new_keys_first, new_keys_first + number_of_keys,
+                   new_keys_first, queue);
+}
+
+/// \internal_
+/// Calculate carry-out for each work group.
+/// Carry-out is a pair of the last key processed by a work group and sum of all
+/// values under this key in this work group.
+///
+/// \param keys_first iterator pointing to the first (unsigned integer) key
+/// \param values_first iterator pointing to the first value
+/// \param count number of key-value pairs
+/// \param carry_out_keys_first output: one key per launched work group
+/// \param carry_out_values_first output: one value per launched work group
+/// \param function binary function used to combine values
+/// \param work_group_size work group size used for the launch
+/// \param queue command queue to perform the operation
+///
+/// One work group is launched per \p work_group_size input elements (rounded
+/// up), so both carry-out ranges must hold at least that many elements.
+template<class InputValueIterator, class OutputValueIterator, class BinaryFunction>
+inline void carry_outs(vector<uint_>::iterator keys_first,
+                       InputValueIterator values_first,
+                       size_t count,
+                       vector<uint_>::iterator carry_out_keys_first,
+                       OutputValueIterator carry_out_values_first,
+                       BinaryFunction function,
+                       size_t work_group_size,
+                       command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<OutputValueIterator>::value_type value_out_type;
+
+    detail::meta_kernel k("reduce_by_key_with_scan_carry_outs");
+    k.add_set_arg<const uint_>("count", uint_(count));
+    size_t local_keys_arg = k.add_arg<uint_ *>(memory_object::local_memory, "lkeys");
+    size_t local_vals_arg = k.add_arg<value_out_type *>(memory_object::local_memory, "lvals");
+
+    k <<
+        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
+        k.decl<const uint_>("wg_size") << " = get_local_size(0);\n" <<
+        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
+        k.decl<const uint_>("group_id") << " = get_group_id(0);\n" <<
+
+        k.decl<uint_>("key") << ";\n" <<
+        k.decl<value_out_type>("value") << ";\n" <<
+        "if(gid < count){\n" <<
+            k.var<uint_>("key") << " = " <<
+                keys_first[k.var<const uint_>("gid")] << ";\n" <<
+            k.var<value_out_type>("value") << " = " <<
+                values_first[k.var<const uint_>("gid")] << ";\n" <<
+            "lkeys[lid] = key;\n" <<
+            "lvals[lid] = value;\n" <<
+        "}\n" <<
+
+        // Calculate carry out for each work group by performing Hillis/Steele scan
+        // where only last element (key-value pair) is saved
+        k.decl<value_out_type>("result") << " = value;\n" <<
+        k.decl<uint_>("other_key") << ";\n" <<
+        k.decl<value_out_type>("other_value") << ";\n" <<
+
+        "for(" << k.decl<uint_>("offset") << " = 1; " <<
+                  "offset < wg_size; offset *= 2){\n"
+        "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+        "    if(lid >= offset){\n"
+        "        other_key = lkeys[lid - offset];\n" <<
+        "        if(other_key == key){\n" <<
+        "            other_value = lvals[lid - offset];\n" <<
+        "            result = " << function(k.var<value_out_type>("result"),
+                                            k.var<value_out_type>("other_value")) << ";\n" <<
+        "        }\n" <<
+        "    }\n" <<
+        "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+        "    lvals[lid] = result;\n" <<
+        "}\n" <<
+
+        // save carry out
+        "if(lid == (wg_size - 1)){\n" <<
+        carry_out_keys_first[k.var<const uint_>("group_id")] << " = key;\n" <<
+        carry_out_values_first[k.var<const uint_>("group_id")] << " = result;\n" <<
+        "}\n";
+
+    // Integer ceiling division (float is inexact above 2^24 elements).
+    const size_t work_groups_no =
+        (count + work_group_size - 1) / work_group_size;
+
+    const context &context = queue.get_context();
+    kernel kernel = k.compile(context);
+    kernel.set_arg(local_keys_arg, local_buffer<uint_>(work_group_size));
+    kernel.set_arg(local_vals_arg, local_buffer<value_out_type>(work_group_size));
+
+    queue.enqueue_1d_range_kernel(kernel,
+                                  0,
+                                  work_groups_no * work_group_size,
+                                  work_group_size);
+}
+
+/// \internal_
+/// Calculate carry-in by performing inclusive scan by key on carry-outs vector.
+///
+/// \param carry_out_keys_first iterator pointing to the first carry-out key
+/// \param carry_out_values_first iterator pointing to the first carry-out value
+/// \param carry_in_values_first output: iterator pointing to the first carry-in
+/// \param carry_out_size number of carry-out pairs
+/// \param function binary function used to combine values
+/// \param work_group_size size of the single work group that is launched
+/// \param queue command queue to perform the operation
+///
+/// The kernel runs as one work group; every work item handles a contiguous
+/// run of ceil(carry_out_size / work_group_size) carry-outs.
+template<class OutputValueIterator, class BinaryFunction>
+inline void carry_ins(vector<uint_>::iterator carry_out_keys_first,
+                      OutputValueIterator carry_out_values_first,
+                      OutputValueIterator carry_in_values_first,
+                      size_t carry_out_size,
+                      BinaryFunction function,
+                      size_t work_group_size,
+                      command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<OutputValueIterator>::value_type value_out_type;
+
+    // Integer ceiling division (float is inexact above 2^24 elements).
+    const uint_ values_per_work_item = static_cast<uint_>(
+        (carry_out_size + work_group_size - 1) / work_group_size
+    );
+
+    detail::meta_kernel k("reduce_by_key_with_scan_carry_ins");
+    k.add_set_arg<const uint_>("carry_out_size", uint_(carry_out_size));
+    k.add_set_arg<const uint_>("values_per_work_item", values_per_work_item);
+    size_t local_keys_arg = k.add_arg<uint_ *>(memory_object::local_memory, "lkeys");
+    size_t local_vals_arg = k.add_arg<value_out_type *>(memory_object::local_memory, "lvals");
+
+    k <<
+        k.decl<uint_>("id") << " = get_global_id(0) * values_per_work_item;\n" <<
+        k.decl<uint_>("idx") << " = id;\n" <<
+        k.decl<const uint_>("wg_size") << " = get_local_size(0);\n" <<
+        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
+        k.decl<const uint_>("group_id") << " = get_group_id(0);\n" <<
+
+        k.decl<uint_>("key") << ";\n" <<
+        k.decl<value_out_type>("value") << ";\n" <<
+        k.decl<uint_>("previous_key") << ";\n" <<
+        k.decl<value_out_type>("result") << ";\n" <<
+
+        "if(id < carry_out_size){\n" <<
+            k.var<uint_>("previous_key") << " = " <<
+                carry_out_keys_first[k.var<const uint_>("id")] << ";\n" <<
+            k.var<value_out_type>("result") << " = " <<
+                carry_out_values_first[k.var<const uint_>("id")] << ";\n" <<
+            carry_in_values_first[k.var<const uint_>("id")] << " = result;\n" <<
+        "}\n" <<
+
+        k.decl<const uint_>("end") << " = (id + values_per_work_item) <= carry_out_size" <<
+                                      " ? (values_per_work_item + id) :  carry_out_size;\n" <<
+
+        // serial scan by key over this work item's own run of carry-outs
+        "for(idx = idx + 1; idx < end; idx += 1){\n" <<
+        "    key = " << carry_out_keys_first[k.var<const uint_>("idx")] << ";\n" <<
+        "    value = " << carry_out_values_first[k.var<const uint_>("idx")] << ";\n" <<
+        "    if(previous_key == key){\n" <<
+        "        result = " << function(k.var<value_out_type>("result"),
+                                        k.var<value_out_type>("value")) << ";\n" <<
+        "    }\n else { \n" <<
+        "        result = value;\n"
+        "    }\n" <<
+        "    " << carry_in_values_first[k.var<const uint_>("idx")] << " = result;\n" <<
+        "    previous_key = key;\n"
+        "}\n" <<
+
+        // save the last key and result to local memory
+        "lkeys[lid] = previous_key;\n" <<
+        "lvals[lid] = result;\n" <<
+
+        // Hillis/Steele scan
+        "for(" << k.decl<uint_>("offset") << " = 1; " <<
+                  "offset < wg_size; offset *= 2){\n"
+        "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+        "    if(lid >= offset){\n"
+        "        key = lkeys[lid - offset];\n" <<
+        "        if(previous_key == key){\n" <<
+        "            value = lvals[lid - offset];\n" <<
+        "            result = " << function(k.var<value_out_type>("result"),
+                                            k.var<value_out_type>("value")) << ";\n" <<
+        "        }\n" <<
+        "    }\n" <<
+        "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+        "    lvals[lid] = result;\n" <<
+        "}\n" <<
+        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+
+        "if(lid > 0){\n" <<
+        // load key-value reduced by previous work item
+        "    previous_key = lkeys[lid - 1];\n" <<
+        "    result       = lvals[lid - 1];\n" <<
+        "}\n" <<
+
+        // add key-value reduced by previous work item
+        "for(idx = id; idx < id + values_per_work_item; idx += 1){\n" <<
+        // make sure all carry-ins are saved in global memory
+        "    barrier( CLK_GLOBAL_MEM_FENCE );\n" <<
+        "    if(lid > 0 && idx < carry_out_size) {\n"
+        "        key = " << carry_out_keys_first[k.var<const uint_>("idx")] << ";\n" <<
+        "        value = " << carry_in_values_first[k.var<const uint_>("idx")] << ";\n" <<
+        "        if(previous_key == key){\n" <<
+        "            value = " << function(k.var<value_out_type>("result"),
+                                           k.var<value_out_type>("value")) << ";\n" <<
+        "        }\n" <<
+        "        " << carry_in_values_first[k.var<const uint_>("idx")] << " = value;\n" <<
+        "    }\n" <<
+        "}\n";
+
+
+    const context &context = queue.get_context();
+    kernel kernel = k.compile(context);
+    kernel.set_arg(local_keys_arg, local_buffer<uint_>(work_group_size));
+    kernel.set_arg(local_vals_arg, local_buffer<value_out_type>(work_group_size));
+
+    // exactly one work group
+    queue.enqueue_1d_range_kernel(kernel,
+                                  0,
+                                  work_group_size,
+                                  work_group_size);
+}
+
+/// \internal_
+///
+/// Perform final reduction by key. Each work item:
+/// 1. Perform local work-group reduction (Hillis/Steele scan)
+/// 2. Add carry-in (if keys are right)
+/// 3. Save reduced value if next key is different than processed one
+///
+/// \param keys_first iterator pointing to the first original key
+/// \param values_first iterator pointing to the first value
+/// \param keys_result output keys, indexed by the new unsigned integer key
+/// \param values_result output values, indexed by the new unsigned integer key
+/// \param count number of key-value pairs
+/// \param function binary function used to combine values
+/// \param new_keys_first iterator pointing to the first new (uint) key
+/// \param carry_in_keys_first per-work-group keys matching the carry-in values
+/// \param carry_in_values_first per-work-group carry-in values
+/// \param carry_in_size number of carry-ins (currently unused)
+/// \param work_group_size work group size used for the launch
+/// \param queue command queue to perform the operation
+template<class InputKeyIterator, class InputValueIterator,
+         class OutputKeyIterator, class OutputValueIterator,
+         class BinaryFunction>
+inline void final_reduction(InputKeyIterator keys_first,
+                            InputValueIterator values_first,
+                            OutputKeyIterator keys_result,
+                            OutputValueIterator values_result,
+                            size_t count,
+                            BinaryFunction function,
+                            vector<uint_>::iterator new_keys_first,
+                            vector<uint_>::iterator carry_in_keys_first,
+                            OutputValueIterator carry_in_values_first,
+                            size_t carry_in_size,
+                            size_t work_group_size,
+                            command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<OutputValueIterator>::value_type value_out_type;
+
+    // kept in the signature for callers; not needed by the kernel
+    (void) carry_in_size;
+
+    detail::meta_kernel k("reduce_by_key_with_scan_final_reduction");
+    k.add_set_arg<const uint_>("count", uint_(count));
+    size_t local_keys_arg = k.add_arg<uint_ *>(memory_object::local_memory, "lkeys");
+    size_t local_vals_arg = k.add_arg<value_out_type *>(memory_object::local_memory, "lvals");
+
+    k <<
+        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
+        k.decl<const uint_>("wg_size") << " = get_local_size(0);\n" <<
+        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
+        k.decl<const uint_>("group_id") << " = get_group_id(0);\n" <<
+
+        k.decl<uint_>("key") << ";\n" <<
+        k.decl<value_out_type>("value") << ";\n"
+
+        "if(gid < count){\n" <<
+            k.var<uint_>("key") << " = " <<
+                new_keys_first[k.var<const uint_>("gid")] << ";\n" <<
+            k.var<value_out_type>("value") << " = " <<
+                values_first[k.var<const uint_>("gid")] << ";\n" <<
+            "lkeys[lid] = key;\n" <<
+            "lvals[lid] = value;\n" <<
+        "}\n" <<
+
+        // Hillis/Steele scan
+        k.decl<value_out_type>("result") << " = value;\n" <<
+        k.decl<uint_>("other_key") << ";\n" <<
+        k.decl<value_out_type>("other_value") << ";\n" <<
+
+        "for(" << k.decl<uint_>("offset") << " = 1; " <<
+                 "offset < wg_size ; offset *= 2){\n"
+        "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+        "    if(lid >= offset) {\n" <<
+        "        other_key = lkeys[lid - offset];\n" <<
+        "        if(other_key == key){\n" <<
+        "            other_value = lvals[lid - offset];\n" <<
+        "            result = " << function(k.var<value_out_type>("result"),
+                                            k.var<value_out_type>("other_value")) << ";\n" <<
+        "        }\n" <<
+        "    }\n" <<
+        "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+        "    lvals[lid] = result;\n" <<
+        "}\n" <<
+
+        "if(gid >= count) {\n return;\n};\n" <<
+
+        k.decl<const bool>("save") << " = (gid < (count - 1)) ?"
+                                   << new_keys_first[k.var<const uint_>("gid + 1")] << " != key" <<
+                                   ": true;\n" <<
+
+        // Add carry in
+        k.decl<uint_>("carry_in_key") << ";\n" <<
+        "if(group_id > 0 && save) {\n" <<
+        "    carry_in_key = " << carry_in_keys_first[k.var<const uint_>("group_id - 1")] << ";\n" <<
+        "    if(key == carry_in_key){\n" <<
+        "        other_value = " << carry_in_values_first[k.var<const uint_>("group_id - 1")] << ";\n" <<
+        "        result = " << function(k.var<value_out_type>("result"),
+                                        k.var<value_out_type>("other_value")) << ";\n" <<
+        "    }\n" <<
+        "}\n" <<
+
+        // Save result only if the next key is different or it's the last element.
+        "if(save){\n" <<
+        keys_result[k.var<uint_>("key")] << " = " << keys_first[k.var<const uint_>("gid")] << ";\n" <<
+        values_result[k.var<uint_>("key")] << " = result;\n" <<
+        "}\n"
+        ;
+
+    // Integer ceiling division (float is inexact above 2^24 elements and
+    // could leave the trailing elements without a work-item).
+    const size_t work_groups_no =
+        (count + work_group_size - 1) / work_group_size;
+
+    const context &context = queue.get_context();
+    kernel kernel = k.compile(context);
+    kernel.set_arg(local_keys_arg, local_buffer<uint_>(work_group_size));
+    kernel.set_arg(local_vals_arg, local_buffer<value_out_type>(work_group_size));
+
+    queue.enqueue_1d_range_kernel(kernel,
+                                  0,
+                                  work_groups_no * work_group_size,
+                                  work_group_size);
+}
+
+/// \internal_
+/// Returns preferred work group size for reduce by key with scan algorithm.
+///
+/// The value is the "wgsize" entry of the device's parameter cache (256 when
+/// absent), limited to CL_DEVICE_MAX_WORK_GROUP_SIZE so that it can always be
+/// passed as a local work size, and never less than one.
+template<class KeyType, class ValueType>
+inline size_t get_work_group_size(const device& device)
+{
+    std::string cache_key = std::string("__boost_reduce_by_key_with_scan")
+        + "k_" + type_name<KeyType>() + "_v_" + type_name<ValueType>();
+
+    // load parameters
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    const size_t preferred_size =
+        static_cast<size_t>(parameters->get(cache_key, "wgsize", 256));
+    const size_t device_limit =
+        static_cast<size_t>(device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
+
+    // A work group larger than the device limit cannot be enqueued, so the
+    // preferred size is an upper bound, not a lower one. The outer max guards
+    // the divisions by the work group size against a zero cache entry.
+    return (std::max)(
+        static_cast<size_t>(1),
+        (std::min)(preferred_size, device_limit)
+    );
+}
+
+/// \internal_
+///
+/// 1. For each work group carry-out value is calculated (it's done by key-oriented
+/// Hillis/Steele scan). Carry-out is a pair of the last key processed by work
+/// group and sum of all values under this key in work group.
+/// 2. From every carry-out carry-in is calculated by performing inclusive scan
+/// by key.
+/// 3. Final reduction by key is performed (key-oriented Hillis/Steele scan),
+/// carry-in values are added where needed.
+///
+/// Returns the number of reduced key-value pairs written to \p keys_result
+/// and \p values_result (zero for an empty input).
+template<class InputKeyIterator, class InputValueIterator,
+         class OutputKeyIterator, class OutputValueIterator,
+         class BinaryFunction, class BinaryPredicate>
+inline size_t reduce_by_key_with_scan(InputKeyIterator keys_first,
+                                      InputKeyIterator keys_last,
+                                      InputValueIterator values_first,
+                                      OutputKeyIterator keys_result,
+                                      OutputValueIterator values_result,
+                                      BinaryFunction function,
+                                      BinaryPredicate predicate,
+                                      command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<InputValueIterator>::value_type value_type;
+    typedef typename
+        std::iterator_traits<InputKeyIterator>::value_type key_type;
+    typedef typename
+        std::iterator_traits<OutputValueIterator>::value_type value_out_type;
+
+    const context &context = queue.get_context();
+    size_t count = detail::iterator_range_size(keys_first, keys_last);
+
+    if(count == 0){
+        return size_t(0);
+    }
+
+    const device &device = queue.get_device();
+    // Template arguments are <KeyType, ValueType>; the same order is used by
+    // reduce_by_key_with_scan_requirements_met() so both read one cache entry.
+    size_t work_group_size = get_work_group_size<key_type, value_type>(device);
+
+    // Replace original key with unsigned integer keys generated based on given
+    // predicate. New key is also an index for keys_result and values_result vectors,
+    // which points to place where reduced value should be saved.
+    vector<uint_> new_keys(count, context);
+    vector<uint_>::iterator new_keys_first = new_keys.begin();
+    generate_uint_keys(keys_first, count, predicate, new_keys_first,
+                       work_group_size, queue);
+
+    // Calculate carry-out and carry-in vectors size: one entry per work group
+    // (integer ceiling division, matching the launch size used by carry_outs).
+    const size_t carry_out_size =
+        (count + work_group_size - 1) / work_group_size;
+    vector<uint_> carry_out_keys(carry_out_size, context);
+    vector<value_out_type> carry_out_values(carry_out_size, context);
+    carry_outs(new_keys_first, values_first, count, carry_out_keys.begin(),
+               carry_out_values.begin(), function, work_group_size, queue);
+
+    vector<value_out_type> carry_in_values(carry_out_size, context);
+    carry_ins(carry_out_keys.begin(), carry_out_values.begin(),
+              carry_in_values.begin(), carry_out_size, function, work_group_size,
+              queue);
+
+    final_reduction(keys_first, values_first, keys_result, values_result,
+                    count, function, new_keys_first, carry_out_keys.begin(),
+                    carry_in_values.begin(), carry_out_size, work_group_size,
+                    queue);
+
+    // The last new key is the index of the last output element.
+    const size_t result = read_single_value<uint_>(new_keys.get_buffer(),
+                                                   count - 1, queue);
+    return result + 1;
+}
+
+/// \internal_
+/// Checks whether reduce by key with scan can run on the queue's device.
+///
+/// Returns false when the device does not report CL_LOCAL as its local
+/// memory type. Otherwise returns true only if the local memory needed by one
+/// work group of the preferred size (one uint key plus one output value per
+/// work item) does not exceed CL_DEVICE_LOCAL_MEM_SIZE.
+///
+/// The iterator arguments and \p count only take part in type deduction.
+template<class InputKeyIterator, class InputValueIterator,
+         class OutputKeyIterator, class OutputValueIterator>
+bool reduce_by_key_with_scan_requirements_met(InputKeyIterator keys_first,
+                                              InputValueIterator values_first,
+                                              OutputKeyIterator keys_result,
+                                              OutputValueIterator values_result,
+                                              const size_t count,
+                                              command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<InputKeyIterator>::value_type key_type;
+    typedef typename
+        std::iterator_traits<InputValueIterator>::value_type value_type;
+    typedef typename
+        std::iterator_traits<OutputValueIterator>::value_type value_out_type;
+
+    // only the types of these arguments are used
+    (void) keys_first;
+    (void) values_first;
+    (void) keys_result;
+    (void) values_result;
+    (void) count;
+
+    const device &device = queue.get_device();
+
+    // device must have dedicated local memory storage
+    if(device.get_info<CL_DEVICE_LOCAL_MEM_TYPE>() != CL_LOCAL){
+        return false;
+    }
+
+    // preferred work group size
+    const size_t work_group_size =
+        get_work_group_size<key_type, value_type>(device);
+
+    // bytes of local memory one work group needs: keys + reduced values
+    const size_t bytes_needed =
+        (sizeof(uint_) + sizeof(value_out_type)) * work_group_size;
+
+    // local memory size in bytes (per compute unit)
+    const size_t bytes_available =
+        device.get_info<CL_DEVICE_LOCAL_MEM_SIZE>();
+
+    return bytes_needed <= bytes_available;
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_WITH_SCAN_HPP
@@ -0,0 +1,110 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_CPU_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_CPU_HPP
+
+#include <algorithm>
+
+#include <boost/compute/buffer.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/type_traits/result_of.hpp>
+#include <boost/compute/algorithm/detail/serial_reduce.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// \internal_
+/// Reduces the range [\p first, \p last) with \p function and stores the
+/// single result at \p result.
+///
+/// Empty input is a no-op. Inputs shorter than the cached
+/// "serial_reduce_threshold" (never less than the number of compute units)
+/// are handled by serial_reduce(). Larger inputs are split into contiguous
+/// chunks, one per work-item, with at most one work-item per compute unit;
+/// the per-chunk results are then combined by serial_reduce().
+template<class InputIterator, class OutputIterator, class BinaryFunction>
+inline void reduce_on_cpu(InputIterator first,
+                          InputIterator last,
+                          OutputIterator result,
+                          BinaryFunction function,
+                          command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<InputIterator>::value_type T;
+    typedef typename
+        ::boost::compute::result_of<BinaryFunction(T, T)>::type result_type;
+
+    const device &device = queue.get_device();
+    const uint_ compute_units = device.compute_units();
+
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    std::string cache_key =
+        "__boost_reduce_cpu_" + boost::lexical_cast<std::string>(sizeof(T));
+
+    // for inputs smaller than serial_reduce_threshold
+    // serial_reduce algorithm is used
+    uint_ serial_reduce_threshold =
+        parameters->get(cache_key, "serial_reduce_threshold", 16384 * sizeof(T));
+    serial_reduce_threshold =
+        (std::max)(serial_reduce_threshold, uint_(compute_units));
+
+    const context &context = queue.get_context();
+    size_t count = detail::iterator_range_size(first, last);
+    if(count == 0){
+        return;
+    }
+    else if(count < serial_reduce_threshold) {
+        return serial_reduce(first, last, result, function, queue);
+    }
+
+    // From here on count >= compute_units. The chunk size and the number of
+    // work-items are computed on the host with integer arithmetic:
+    //  - a float-based ceil is inexact above 2^24 elements and would skip
+    //    the trailing elements;
+    //  - launching only ceil(count / block) work-items guarantees that every
+    //    work-item starts inside the input range.
+    const size_t block = (count + compute_units - 1) / compute_units;
+    const size_t global_work_size = (count + block - 1) / block;
+
+    meta_kernel k("reduce_on_cpu");
+    buffer output(context, sizeof(result_type) * global_work_size);
+
+    size_t count_arg = k.add_arg<uint_>("count");
+    size_t block_arg = k.add_arg<uint_>("block");
+    size_t output_arg =
+        k.add_arg<result_type *>(memory_object::global_memory, "output");
+
+    k <<
+        "uint index = get_global_id(0) * block;\n" <<
+        "uint end = min(count, index + block);\n" <<
+
+        k.decl<result_type>("result") << " = " << first[k.var<uint_>("index")] << ";\n" <<
+        "index++;\n" <<
+        "while(index < end){\n" <<
+             "result = " << function(k.var<T>("result"),
+                                     first[k.var<uint_>("index")]) << ";\n" <<
+             "index++;\n" <<
+        "}\n" <<
+        "output[get_global_id(0)] = result;\n";
+
+    kernel kernel = k.compile(context);
+
+    // reduction to global_work_size elements
+    kernel.set_arg(count_arg, static_cast<uint_>(count));
+    kernel.set_arg(block_arg, static_cast<uint_>(block));
+    kernel.set_arg(output_arg, output);
+    queue.enqueue_1d_range_kernel(kernel, 0, global_work_size, 0);
+
+    // final reduction: at most compute_units partial results remain, which
+    // is never above the serial threshold, so they are reduced serially.
+    // Calling serial_reduce directly (instead of recursing) also terminates
+    // when the cached threshold is not larger than compute_units.
+    serial_reduce(
+        make_buffer_iterator<result_type>(output),
+        make_buffer_iterator<result_type>(output, global_work_size),
+        result,
+        function,
+        queue
+    );
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_CPU_HPP
@@ -0,0 +1,286 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
+
+#include <iterator>
+
+#include <boost/compute/utility/source.hpp>
+#include <boost/compute/program.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/vendor.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
+#include <boost/compute/detail/work_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/type_traits/type_name.hpp>
+#include <boost/compute/utility/program_cache.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// \internal
+/// Generic work-group reduction body.
+///
+/// body() returns OpenCL C source for a tree reduction over the local
+/// array \c scratch: the stride \c i doubles on every pass and each
+/// work-item whose \c lid is a multiple of \c 2*i adds \c scratch[lid+i]
+/// into \c scratch[lid], leaving the total in \c scratch[0]. The snippet
+/// relies on \c TPB, \c lid and \c scratch being defined by the kernel it
+/// is pasted into.
+template<typename T,bool isNvidiaDevice>
+struct ReduceBody
+{
+    static std::string body()
+    {
+        // every pass starts with a barrier so that the writes of the
+        // previous pass are visible before they are read
+        std::string source;
+        source += "for(int i = 1; i < TPB; i <<= 1){\n";
+        source += "   barrier(CLK_LOCAL_MEM_FENCE);\n";
+        source += "   uint mask = (i << 1) - 1;\n";
+        source += "   if((lid & mask) == 0){\n";
+        source += "       scratch[lid] += scratch[lid+i];\n";
+        source += "   }\n";
+        source += "}\n";
+        return source;
+    }
+};
+
+/// \internal
+/// Reduction body selected for NVIDIA devices.
+///
+/// body() returns OpenCL C source that first halves the active range with
+/// barrier-separated steps (compiled in only for the strides that the
+/// compile-time constant \c TPB makes necessary) and then finishes the last
+/// 32 work-items through a \c volatile local pointer without barriers --
+/// the "unsafe" memory optimisation. The snippet relies on \c TPB, \c lid,
+/// \c sum and \c scratch being defined by the kernel it is pasted into.
+template<typename T>
+struct ReduceBody<T,true>
+{
+    static std::string body()
+    {
+        std::stringstream source;
+
+        // stage 1: strides larger than 32, each step followed by a barrier
+        source << "barrier(CLK_LOCAL_MEM_FENCE);\n";
+        source << "if(TPB >= 1024){\n"
+               << "if(lid < 512) { sum += scratch[lid + 512]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n";
+        source << "if(TPB >= 512){\n"
+               << "if(lid < 256) { sum += scratch[lid + 256]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n";
+        source << "if(TPB >= 256){\n"
+               << "if(lid < 128) { sum += scratch[lid + 128]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n";
+        source << "if(TPB >= 128){\n"
+               << "if(lid < 64) { sum += scratch[lid + 64]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);} \n";
+
+        // stage 2: the first 32 work-items finish the reduction; the local
+        // memory is accessed through a volatile pointer instead of using
+        // barriers between the steps
+        source << "if(lid < 32){\n";
+        source << "volatile __local " << type_name<T>() << " *lmem = scratch;\n";
+        source << "if(TPB >= 64) { lmem[lid] = sum = sum + lmem[lid+32];} \n";
+        source << "if(TPB >= 32) { lmem[lid] = sum = sum + lmem[lid+16];} \n";
+        source << "if(TPB >= 16) { lmem[lid] = sum = sum + lmem[lid+ 8];} \n";
+        source << "if(TPB >=  8) { lmem[lid] = sum = sum + lmem[lid+ 4];} \n";
+        source << "if(TPB >=  4) { lmem[lid] = sum = sum + lmem[lid+ 2];} \n";
+        source << "if(TPB >=  2) { lmem[lid] = sum = sum + lmem[lid+ 1];} \n";
+        source << "}\n";
+
+        return source.str();
+    }
+};
+
+/// \internal
+/// First reduction pass for arbitrary input iterators.
+///
+/// Generates and runs an "initial_reduce" kernel in which each work-group of
+/// \p tpb work-items adds up to \p vpt * \p tpb input values and writes one
+/// partial sum to \p result at the index of the work-group.
+///
+/// \param first,last    input range
+/// \param result        buffer receiving one partial sum per work-group
+/// \param function      unused, the generated kernel always accumulates
+///                      with '+'
+/// \param reduce_kernel unused by this overload
+/// \param vpt           values handled per work-item (passed as -DVPT)
+/// \param tpb           work-items per work-group (passed as -DTPB)
+/// \param queue         command queue the kernel is compiled for and run on
+template<class InputIterator, class Function>
+inline void initial_reduce(InputIterator first,
+                           InputIterator last,
+                           buffer result,
+                           const Function &function,
+                           kernel &reduce_kernel,
+                           const uint_ vpt,
+                           const uint_ tpb,
+                           command_queue &queue)
+{
+    (void) function;
+    (void) reduce_kernel;
+
+    typedef typename std::iterator_traits<InputIterator>::value_type Arg;
+    typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T;
+
+    const size_t input_size = std::distance(first, last);
+
+    detail::meta_kernel src("initial_reduce");
+    src.add_set_arg<const uint_>("count", uint_(input_size));
+    const size_t output_slot =
+        src.add_arg<T *>(memory_object::global_memory, "output");
+
+    // work-item indices and the local scratch array
+    src << src.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n"
+        << src.decl<const uint_>("lid") << " = get_local_id(0);\n";
+    src << "__local " << type_name<T>() << " scratch[TPB];\n";
+
+    // private reduction: each work-item strides through its group's slice
+    src << src.decl<T>("sum") << " = 0;\n"
+        << "for(uint i = 0; i < VPT; i++){\n"
+        << "    if(offset + lid + i*TPB < count){\n"
+        << "        sum = sum + " << first[src.var<uint_>("offset+lid+i*TPB")] << ";\n"
+        << "    }\n"
+        << "}\n";
+    src << "scratch[lid] = sum;\n";
+
+    // local reduction over the scratch array
+    src << ReduceBody<T,false>::body();
+
+    // the first work-item of the group publishes the group's sum
+    src << "if(lid == 0){\n"
+        << "    output[get_group_id(0)] = scratch[0];\n"
+        << "}\n";
+
+    // VPT and TPB are compile-time constants of the generated kernel
+    std::stringstream build_flags;
+    build_flags << "-DVPT=" << vpt << " -DTPB=" << tpb;
+
+    const context &context = queue.get_context();
+    kernel generic_reduce_kernel = src.compile(context, build_flags.str());
+    generic_reduce_kernel.set_arg(output_slot, result);
+
+    const size_t global_size = calculate_work_size(input_size, vpt, tpb);
+    queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, global_size, tpb);
+}
+
+/// \internal
+/// First reduction pass for plain buffer iterators summed with plus<T>.
+///
+/// Instead of generating a new kernel this overload runs the caller's
+/// \p reduce_kernel directly on the buffer behind \p first.
+///
+/// \param first,last    input range inside one buffer
+/// \param result        buffer receiving one partial sum per work-group
+/// \param function      unused
+/// \param reduce_kernel kernel to run; arguments 0-4 are set here
+/// \param vpt           values handled per work-item
+/// \param tpb           work-items per work-group
+/// \param queue         command queue the kernel is enqueued on
+template<class T>
+inline void initial_reduce(const buffer_iterator<T> &first,
+                           const buffer_iterator<T> &last,
+                           const buffer &result,
+                           const plus<T> &function,
+                           kernel &reduce_kernel,
+                           const uint_ vpt,
+                           const uint_ tpb,
+                           command_queue &queue)
+{
+    (void) function;
+
+    const size_t input_size = std::distance(first, last);
+    const size_t global_size = calculate_work_size(input_size, vpt, tpb);
+
+    // input buffer and the position of `first` inside it
+    reduce_kernel.set_arg(0, first.get_buffer());
+    reduce_kernel.set_arg(1, uint_(first.get_index()));
+    // number of values to reduce
+    reduce_kernel.set_arg(2, uint_(input_size));
+    // partial sums are written from the start of `result`
+    reduce_kernel.set_arg(3, result);
+    reduce_kernel.set_arg(4, uint_(0));
+
+    queue.enqueue_1d_range_kernel(reduce_kernel, 0, global_size, tpb);
+}
+
+/// \internal
+/// Sums the values in [first, last) on a GPU and stores the total at
+/// \p result.
+///
+/// A "reduce" kernel is generated and obtained through the program cache.
+/// In it every work-group of TPB work-items accumulates up to VPT * TPB
+/// consecutive values and writes one partial sum. The kernel is applied
+/// repeatedly, alternating between two temporary buffers, until the
+/// remaining values fit into a single work-group, which writes the final
+/// value to \p result.
+///
+/// \param first,last input range
+/// \param result     location receiving the reduced value
+/// \param function   forwarded to initial_reduce(); the kernel generated
+///                   here always accumulates with '+'
+/// \param queue      command queue used to build and run the kernels
+template<class InputIterator, class T, class Function>
+inline void reduce_on_gpu(InputIterator first,
+                          InputIterator last,
+                          buffer_iterator<T> result,
+                          Function function,
+                          command_queue &queue)
+{
+    const device &device = queue.get_device();
+    const context &context = queue.get_context();
+
+    detail::meta_kernel k("reduce");
+    k.add_arg<const T*>(memory_object::global_memory, "input");
+    k.add_arg<const uint_>("offset");
+    k.add_arg<const uint_>("count");
+    k.add_arg<T*>(memory_object::global_memory, "output");
+    k.add_arg<const uint_>("output_offset");
+
+    k <<
+        k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" <<
+        "__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" <<
+        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
+
+        "__local " << type_name<T>() << " scratch[TPB];\n" <<
+        // private reduction
+        k.decl<T>("sum") << " = 0;\n" <<
+        "for(uint i = 0; i < VPT; i++){\n" <<
+        "    if(block_offset + lid + i*TPB < count){\n" <<
+        "        sum = sum + block[lid+i*TPB]; \n" <<
+        "    }\n" <<
+        "}\n" <<
+
+        "scratch[lid] = sum;\n";
+
+    // discrimination on vendor name
+    if(is_nvidia_device(device))
+        k << ReduceBody<T,true>::body();
+    else
+        k << ReduceBody<T,false>::body();
+
+    k <<
+        // write sum to output
+         "if(lid == 0){\n" <<
+         "    output[output_offset + get_group_id(0)] = scratch[0];\n" <<
+         "}\n";
+
+    std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>();
+
+    // load parameters
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    uint_ vpt = parameters->get(cache_key, "vpt", 8);
+    uint_ tpb = parameters->get(cache_key, "tpb", 128);
+
+    // number of input values consumed by one work-group
+    const size_t values_per_group = static_cast<size_t>(vpt) * tpb;
+
+    // reduce program compiler flags
+    std::stringstream options;
+    options << "-DT=" << type_name<T>()
+            << " -DVPT=" << vpt
+            << " -DTPB=" << tpb;
+
+    // load program
+    boost::shared_ptr<program_cache> cache =
+        program_cache::get_global_cache(context);
+
+    program reduce_program = cache->get_or_build(
+        cache_key, options.str(), k.source(), context
+    );
+
+    // create reduce kernel
+    kernel reduce_kernel(reduce_program, "reduce");
+
+    size_t count = std::distance(first, last);
+
+    // first pass, reduce from input to ping. initial_reduce() launches
+    // calculate_work_size(count, vpt, tpb) work-items in groups of tpb and
+    // every group writes one value, so the buffer size and the new count are
+    // taken from that launch geometry instead of a float approximation.
+    const size_t first_pass_groups =
+        calculate_work_size(count, vpt, tpb) / tpb;
+    buffer ping(context, first_pass_groups * sizeof(T));
+    initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue);
+
+    // update count after initial reduce
+    count = first_pass_groups;
+
+    // middle pass(es), reduce between ping and pong. pong must hold one value
+    // per work-group of the first middle pass, i.e. the count rounded UP to
+    // whole groups (rounding down made the kernel write past the buffer).
+    const buffer *input_buffer = &ping;
+    buffer pong(
+        context,
+        (count + values_per_group - 1) / values_per_group * sizeof(T)
+    );
+    const buffer *output_buffer = &pong;
+    while(count > values_per_group){
+        reduce_kernel.set_arg(0, *input_buffer);
+        reduce_kernel.set_arg(1, uint_(0));
+        reduce_kernel.set_arg(2, uint_(count));
+        reduce_kernel.set_arg(3, *output_buffer);
+        reduce_kernel.set_arg(4, uint_(0));
+
+        // whole work-groups needed to cover `count` values
+        const size_t groups =
+            (count + values_per_group - 1) / values_per_group;
+        queue.enqueue_1d_range_kernel(reduce_kernel, 0, groups * tpb, tpb);
+
+        std::swap(input_buffer, output_buffer);
+        count = groups;
+    }
+
+    // final pass, reduce from ping/pong to result with a single work-group
+    reduce_kernel.set_arg(0, *input_buffer);
+    reduce_kernel.set_arg(1, uint_(0));
+    reduce_kernel.set_arg(2, uint_(count));
+    reduce_kernel.set_arg(3, result.get_buffer());
+    reduce_kernel.set_arg(4, uint_(result.get_index()));
+
+    queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
@@ -0,0 +1,45 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_HPP
+
+#include <boost/compute/device.hpp>
+#include <boost/compute/algorithm/detail/scan_on_cpu.hpp>
+#include <boost/compute/algorithm/detail/scan_on_gpu.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// \internal
+/// Scans [first, last) into \p result, choosing the implementation from the
+/// type of the device behind \p queue: scan_on_cpu() for CPU devices and
+/// scan_on_gpu() for everything else. All arguments are forwarded unchanged
+/// and the iterator returned by the chosen implementation is passed back.
+template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
+inline OutputIterator scan(InputIterator first,
+                           InputIterator last,
+                           OutputIterator result,
+                           bool exclusive,
+                           T init,
+                           BinaryOperator op,
+                           command_queue &queue)
+{
+    const bool run_on_cpu = (queue.get_device().type() & device::cpu) != 0;
+
+    if(run_on_cpu){
+        return scan_on_cpu(first, last, result, exclusive, init, op, queue);
+    }
+
+    return scan_on_gpu(first, last, result, exclusive, init, op, queue);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_HPP
@@ -0,0 +1,207 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
+
+#include <iterator>
+
+#include <boost/compute/device.hpp>
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/serial_scan.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// \internal
+/// Computes an inclusive or exclusive scan of [first, last) into \p result
+/// on a CPU device.
+///
+/// Inputs shorter than the "serial_scan_threshold" tuning parameter, and any
+/// input on a device reporting fewer than two compute units, are handled by
+/// serial_scan(). Otherwise the input is split into compute_units + 1 blocks
+/// and two kernels are run with compute_units work-items each:
+///   1. work-item g accumulates block g into block_partial_sums[g];
+///      work-item 0 also writes the scanned values of block 0,
+///   2. work-item g combines block_partial_sums[0..g] into its starting
+///      value and writes the scanned values of block g + 1.
+///
+/// \param first,last input range
+/// \param result     start of the output range
+/// \param exclusive  true for an exclusive scan, false for an inclusive one
+/// \param init       initial value, used by the exclusive scan
+/// \param op         binary operator combining two values
+/// \param queue      command queue used to build and run the kernels
+/// \return iterator to the end of the output range (\p result for an empty
+///         input)
+template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
+inline OutputIterator scan_on_cpu(InputIterator first,
+                                  InputIterator last,
+                                  OutputIterator result,
+                                  bool exclusive,
+                                  T init,
+                                  BinaryOperator op,
+                                  command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<InputIterator>::value_type input_type;
+    typedef typename
+        std::iterator_traits<OutputIterator>::value_type output_type;
+
+    const context &context = queue.get_context();
+    const device &device = queue.get_device();
+    const size_t compute_units = queue.get_device().compute_units();
+
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    std::string cache_key =
+        "__boost_scan_cpu_" + boost::lexical_cast<std::string>(sizeof(T));
+
+    // for inputs smaller than serial_scan_threshold
+    // serial_scan algorithm is used
+    uint_ serial_scan_threshold =
+        parameters->get(cache_key, "serial_scan_threshold", 16384 * sizeof(T));
+    serial_scan_threshold =
+        (std::max)(serial_scan_threshold, uint_(compute_units));
+
+    size_t count = detail::iterator_range_size(first, last);
+    if(count == 0){
+        return result;
+    }
+    else if(count < serial_scan_threshold || compute_units < 2) {
+        // With a single compute unit the blocked algorithm below would split
+        // the input into two blocks but scan only the first one before
+        // returning, so that case is handled serially as well.
+        return serial_scan(first, last, result, exclusive, init, op, queue);
+    }
+
+    buffer block_partial_sums(context, sizeof(output_type) * compute_units );
+
+    // create scan kernel
+    meta_kernel k("scan_on_cpu_block_scan");
+
+    // Arguments
+    size_t count_arg = k.add_arg<uint_>("count");
+    size_t init_arg = k.add_arg<output_type>("initial_value");
+    size_t block_partial_sums_arg =
+        k.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");
+
+    // the input is divided into one block more than there are work-items;
+    // the last block is only touched by the final kernel
+    k <<
+        "uint block = " <<
+            "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
+        "uint index = get_global_id(0) * block;\n" <<
+        "uint end = min(count, index + block);\n";
+
+    if(!exclusive){
+        // every work-item seeds its sum with the first value of its block
+        // (and stores it; for blocks other than block 0 that element is
+        // written again by the final kernel)
+        k <<
+            k.decl<output_type>("sum") << " = " <<
+                first[k.var<uint_>("index")] << ";\n" <<
+            result[k.var<uint_>("index")] << " = sum;\n" <<
+            "index++;\n";
+    }
+    else {
+        // only the very first block starts from the initial value
+        k <<
+            k.decl<output_type>("sum") << ";\n" <<
+            "if(index == 0){\n" <<
+                "sum = initial_value;\n" <<
+            "}\n" <<
+            "else {\n" <<
+                "sum = " << first[k.var<uint_>("index")] << ";\n" <<
+                "index++;\n" <<
+            "}\n";
+    }
+
+    k <<
+        "while(index < end){\n" <<
+            // load next value
+            k.decl<const input_type>("value") << " = "
+                << first[k.var<uint_>("index")] << ";\n";
+
+    if(exclusive){
+        // exclusive: store the running sum before adding the current value
+        k <<
+            "if(get_global_id(0) == 0){\n" <<
+                result[k.var<uint_>("index")] << " = sum;\n" <<
+            "}\n";
+    }
+    k <<
+            "sum = " << op(k.var<output_type>("sum"),
+                           k.var<output_type>("value")) << ";\n";
+
+    if(!exclusive){
+        // inclusive: store the running sum after adding the current value
+        k <<
+            "if(get_global_id(0) == 0){\n" <<
+                result[k.var<uint_>("index")] << " = sum;\n" <<
+            "}\n";
+    }
+
+    k <<
+            "index++;\n" <<
+        "}\n" << // end while
+        "block_partial_sums[get_global_id(0)] = sum;\n";
+
+    // compile scan kernel
+    kernel block_scan_kernel = k.compile(context);
+
+    // setup kernel arguments
+    block_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
+    block_scan_kernel.set_arg(init_arg, static_cast<output_type>(init));
+    block_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);
+
+    // execute the kernel
+    size_t global_work_size = compute_units;
+    queue.enqueue_1d_range_kernel(block_scan_kernel, 0, global_work_size, 0);
+
+    // final scan kernel
+    meta_kernel l("scan_on_cpu_final_scan");
+
+    // Arguments
+    count_arg = l.add_arg<uint_>("count");
+    block_partial_sums_arg =
+        l.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");
+
+    // work-item g processes block g + 1, starting from the combined partial
+    // sums of blocks 0..g
+    l <<
+        "uint block = " <<
+            "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
+        "uint index = block + get_global_id(0) * block;\n" <<
+        "uint end = min(count, index + block);\n" <<
+
+        l.decl<output_type>("sum") << " = block_partial_sums[0];\n" <<
+        "for(uint i = 0; i < get_global_id(0); i++) {\n" <<
+            "sum = " << op(l.var<output_type>("sum"),
+                           l.var<output_type>("block_partial_sums[i + 1]")) << ";\n" <<
+        "}\n" <<
+
+        "while(index < end){\n";
+    if(exclusive){
+        l <<
+            l.decl<output_type>("value") << " = "
+                << first[l.var<uint_>("index")] << ";\n" <<
+            result[l.var<uint_>("index")] << " = sum;\n" <<
+            "sum = " << op(l.var<output_type>("sum"),
+                           l.var<output_type>("value")) << ";\n";
+    }
+    else {
+        l <<
+            "sum = " << op(l.var<output_type>("sum"),
+                           first[l.var<uint_>("index")]) << ";\n" <<
+            result[l.var<uint_>("index")] << " = sum;\n";
+    }
+    l <<
+            "index++;\n" <<
+        "}\n";
+
+
+    // compile scan kernel
+    kernel final_scan_kernel = l.compile(context);
+
+    // setup kernel arguments
+    final_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
+    final_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);
+
+    // execute the kernel
+    global_work_size = compute_units;
+    queue.enqueue_1d_range_kernel(final_scan_kernel, 0, global_work_size, 0);
+
+    // return iterator pointing to the end of the result range
+    return result + count;
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
@@ -0,0 +1,330 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_GPU_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_GPU_HPP
+
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/memory/local_buffer.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// \internal
+/// Kernel performing an independent scan inside every work-group.
+///
+/// Each work-group loads block_size values into the local array \c scratch,
+/// scans them in place with \p op, writes the scanned values to \p result
+/// and stores the total of its block in \c block_sums at the index of the
+/// work-group. The m_*_arg members hold the indices of the kernel arguments
+/// that the caller has to set before launching the kernel.
+template<class InputIterator, class OutputIterator, class BinaryOperator>
+class local_scan_kernel : public meta_kernel
+{
+public:
+    /// Generates the kernel source.
+    ///
+    /// \param first     start of the input range
+    /// \param last      end of the input range (unused)
+    /// \param result    start of the output range
+    /// \param exclusive true for an exclusive scan, false for inclusive
+    /// \param op        binary operator combining two values
+    local_scan_kernel(InputIterator first,
+                      InputIterator last,
+                      OutputIterator result,
+                      bool exclusive,
+                      BinaryOperator op)
+        : meta_kernel("local_scan")
+    {
+        typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+        (void) last;
+
+        // bounds checks against `count` are always generated
+        const bool checked = true;
+
+        m_block_sums_arg = add_arg<T *>(memory_object::global_memory, "block_sums");
+        m_scratch_arg = add_arg<T *>(memory_object::local_memory, "scratch");
+        m_block_size_arg = add_arg<const cl_uint>("block_size");
+        m_count_arg = add_arg<const cl_uint>("count");
+        m_init_value_arg = add_arg<const T>("init");
+
+        // work-item parameters
+        *this <<
+            "const uint gid = get_global_id(0);\n" <<
+            "const uint lid = get_local_id(0);\n";
+
+        // check against data size
+        if(checked){
+            *this <<
+                "if(gid < count){\n";
+        }
+
+        // copy values from input to local memory
+        if(exclusive){
+            // exclusive: shift the input by one element; the first slot of
+            // a block holds `init` for the very first block and 0 otherwise
+            *this <<
+                decl<const T>("local_init") << "= (gid == 0) ? init : 0;\n" <<
+                "if(lid == 0){ scratch[lid] = local_init; }\n" <<
+                "else { scratch[lid] = " << first[expr<cl_uint>("gid-1")] << "; }\n";
+        }
+        else{
+            *this <<
+                "scratch[lid] = " << first[expr<cl_uint>("gid")] << ";\n";
+        }
+
+        if(checked){
+            // work-items past the end of the data contribute 0
+            *this <<
+                "}\n"
+                "else {\n" <<
+                "    scratch[lid] = 0;\n" <<
+                "}\n";
+        }
+
+        // wait for all threads to read from input
+        *this <<
+            "barrier(CLK_LOCAL_MEM_FENCE);\n";
+
+        // perform scan: on each pass every work-item combines its value
+        // with the one `i` slots to its left; the barriers separate the
+        // reads of a pass from its writes
+        *this <<
+            "for(uint i = 1; i < block_size; i <<= 1){\n" <<
+            "    " << decl<const T>("x") << " = lid >= i ? scratch[lid-i] : 0;\n" <<
+            "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+            "    if(lid >= i){\n" <<
+            "        scratch[lid] = " << op(var<T>("scratch[lid]"), var<T>("x")) << ";\n" <<
+            "    }\n" <<
+            "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+            "}\n";
+
+        // copy results to output
+        if(checked){
+            *this <<
+                "if(gid < count){\n";
+        }
+
+        *this <<
+            result[expr<cl_uint>("gid")] << " = scratch[lid];\n";
+
+        if(checked){
+            *this << "}\n";
+        }
+
+        // store sum for the block
+        if(exclusive){
+            // The exclusive result lacks the block's last input value, so it
+            // is added back here. The read of first[gid] is guarded by
+            // `gid < count`: without the guard the last work-item of a
+            // partially filled block would read past the end of the input.
+            *this <<
+                "if(lid == block_size - 1 && gid < count){\n" <<
+                "    block_sums[get_group_id(0)] = " <<
+                       op(first[expr<cl_uint>("gid")], var<T>("scratch[lid]")) <<
+                       ";\n" <<
+                "}\n";
+        }
+        else {
+            *this <<
+                "if(lid == block_size - 1){\n" <<
+                "    block_sums[get_group_id(0)] = scratch[lid];\n" <<
+                "}\n";
+        }
+    }
+
+    size_t m_block_sums_arg;  ///< index of the "block_sums" argument
+    size_t m_scratch_arg;     ///< index of the local "scratch" argument
+    size_t m_block_size_arg;  ///< index of the "block_size" argument
+    size_t m_count_arg;       ///< index of the "count" argument
+    size_t m_init_value_arg;  ///< index of the "init" argument
+};
+
+/// \internal
+/// Kernel combining a per-block value with every element of that block.
+///
+/// For each work-item with \c gid below \c count the generated source
+/// replaces \c output[gid] by \c op(block_sums[block_id], output[gid]),
+/// where \c block_id is the index of the work-item's work-group. The
+/// m_*_arg members hold the indices of the kernel arguments that the caller
+/// has to set before launching the kernel.
+template<class T, class BinaryOperator>
+class write_scanned_output_kernel : public meta_kernel
+{
+public:
+    /// Generates the kernel source for the binary operator \p op.
+    write_scanned_output_kernel(BinaryOperator op)
+        : meta_kernel("write_scanned_output")
+    {
+        m_output_arg = add_arg<T *>(memory_object::global_memory, "output");
+        m_block_sums_arg = add_arg<const T *>(memory_object::global_memory, "block_sums");
+        m_count_arg = add_arg<const cl_uint>("count");
+
+        meta_kernel &source = *this;
+
+        // work-item parameters
+        source << "const uint gid = get_global_id(0);\n"
+               << "const uint block_id = get_group_id(0);\n";
+
+        // only work-items inside the data range write
+        source << "if(gid < count){\n";
+        source << "output[gid] = "
+               << op(var<T>("block_sums[block_id]"), var<T>("output[gid] "))
+               << ";\n";
+        source << "}\n";
+    }
+
+    size_t m_output_arg;      ///< index of the "output" argument
+    size_t m_block_sums_arg;  ///< index of the "block_sums" argument
+    size_t m_count_arg;       ///< index of the "count" argument
+};
+
+/// \internal
+/// Chooses the work-group size used to scan [first, last).
+///
+/// \return 0 for an empty range, otherwise the smallest power of two that is
+///         not less than the number of elements, limited to 256
+template<class InputIterator>
+inline size_t pick_scan_block_size(InputIterator first, InputIterator last)
+{
+    const size_t count = iterator_range_size(first, last);
+    if(count == 0){
+        return 0;
+    }
+
+    // double the candidate until it covers the range or reaches the limit
+    size_t block_size = 1;
+    while(block_size < count && block_size < 256){
+        block_size <<= 1;
+    }
+    return block_size;
+}
+
+/// \internal
+/// Scans [first, last) into \p result using work-group sized blocks.
+///
+/// The range is cut into blocks of pick_scan_block_size() elements. Every
+/// block is scanned independently by local_scan_kernel, which also records
+/// the total of each block. If there is more than one block, the block
+/// totals are scanned recursively (inclusive) and then combined with the
+/// elements of all blocks but the first by write_scanned_output_kernel.
+///
+/// \param first,last input range, must not be empty (the block size would
+///                   be 0)
+/// \param result     start of the output range
+/// \param exclusive  true for an exclusive scan, false for an inclusive one
+/// \param init       initial value, used by the exclusive scan
+/// \param op         binary operator combining two values
+/// \param queue      command queue used to build and run the kernels
+/// \return iterator to the end of the output range
+template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
+inline OutputIterator scan_impl(InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                bool exclusive,
+                                T init,
+                                BinaryOperator op,
+                                command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<InputIterator>::value_type
+        input_type;
+    typedef typename
+        std::iterator_traits<InputIterator>::difference_type
+        difference_type;
+
+    const context &context = queue.get_context();
+    const size_t count = detail::iterator_range_size(first, last);
+
+    // number of blocks, rounded up to cover a partially filled last block
+    size_t block_size = pick_scan_block_size(first, last);
+    size_t block_count = count / block_size;
+
+    if(block_count * block_size < count){
+        block_count++;
+    }
+
+    ::boost::compute::vector<input_type> block_sums(block_count, context);
+
+    // zero block sums
+    input_type zero;
+    std::memset(&zero, 0, sizeof(input_type));
+    ::boost::compute::fill(block_sums.begin(), block_sums.end(), zero, queue);
+
+    // local scan
+    local_scan_kernel<InputIterator, OutputIterator, BinaryOperator>
+        local_scan_kernel(first, last, result, exclusive, op);
+
+    ::boost::compute::kernel kernel = local_scan_kernel.compile(context);
+    kernel.set_arg(local_scan_kernel.m_scratch_arg, local_buffer<input_type>(block_size));
+    kernel.set_arg(local_scan_kernel.m_block_sums_arg, block_sums);
+    kernel.set_arg(local_scan_kernel.m_block_size_arg, static_cast<cl_uint>(block_size));
+    kernel.set_arg(local_scan_kernel.m_count_arg, static_cast<cl_uint>(count));
+    // local_scan_kernel declares "init" with the input value type, so the
+    // value has to be passed as that type; passing the output value type
+    // hands the kernel an argument of the wrong size or representation
+    // whenever the two types differ
+    kernel.set_arg(local_scan_kernel.m_init_value_arg, static_cast<input_type>(init));
+
+    queue.enqueue_1d_range_kernel(kernel,
+                                  0,
+                                  block_count * block_size,
+                                  block_size);
+
+    // inclusive scan block sums
+    if(block_count > 1){
+        scan_impl(block_sums.begin(),
+                  block_sums.end(),
+                  block_sums.begin(),
+                  false,
+                  init,
+                  op,
+                  queue
+        );
+    }
+
+    // add block sums to each block
+    if(block_count > 1){
+        write_scanned_output_kernel<input_type, BinaryOperator>
+            write_output_kernel(op);
+        kernel = write_output_kernel.compile(context);
+        // NOTE(review): only the buffer is passed here, not
+        // result.get_index(); this looks wrong for a `result` that does not
+        // start at the beginning of its buffer -- confirm.
+        kernel.set_arg(write_output_kernel.m_output_arg, result.get_buffer());
+        kernel.set_arg(write_output_kernel.m_block_sums_arg, block_sums);
+        kernel.set_arg(write_output_kernel.m_count_arg, static_cast<cl_uint>(count));
+
+        // the global offset of one block skips block 0, which needs no
+        // correction
+        queue.enqueue_1d_range_kernel(kernel,
+                                      block_size,
+                                      block_count * block_size,
+                                      block_size);
+    }
+
+    return result + static_cast<difference_type>(count);
+}
+
+/// \internal
+/// Overload for input and output iterators of different types: forwards
+/// all arguments unchanged to scan_impl() and returns its result.
+template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
+inline OutputIterator dispatch_scan(InputIterator first,
+                                    InputIterator last,
+                                    OutputIterator result,
+                                    bool exclusive,
+                                    T init,
+                                    BinaryOperator op,
+                                    command_queue &queue)
+{
+    return scan_impl(first, last, result, exclusive, init, op, queue);
+}
+
+/// \internal
+/// Overload for input and output iterators of the same type.
+///
+/// When \p result equals \p first the scan is performed in place: the input
+/// is copied to a temporary vector first and the scan reads from that copy
+/// while writing to \p first. Otherwise the arguments are forwarded to
+/// scan_impl() unchanged.
+///
+/// \return the iterator returned by scan_impl()
+template<class InputIterator, class T, class BinaryOperator>
+inline InputIterator dispatch_scan(InputIterator first,
+                                   InputIterator last,
+                                   InputIterator result,
+                                   bool exclusive,
+                                   T init,
+                                   BinaryOperator op,
+                                   command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+    // distinct output location: scan input to output directly
+    if(!(first == result)){
+        return scan_impl(first, last, result, exclusive, init, op, queue);
+    }
+
+    // in-place scan: make a temporary copy of the input ...
+    const size_t input_size = iterator_range_size(first, last);
+    vector<value_type> snapshot(input_size, queue.get_context());
+    copy(first, last, snapshot.begin(), queue);
+
+    // ... and scan from the temporary values back into the input range
+    return scan_impl(
+        snapshot.begin(), snapshot.end(), first, exclusive, init, op, queue
+    );
+}
+
+/// \internal
+/// Entry point of the GPU scan.
+///
+/// An empty input range produces no output and \p result is returned as is;
+/// any other range is handed to dispatch_scan().
+///
+/// \return iterator to the end of the output range
+template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
+inline OutputIterator scan_on_gpu(InputIterator first,
+                                  InputIterator last,
+                                  OutputIterator result,
+                                  bool exclusive,
+                                  T init,
+                                  BinaryOperator op,
+                                  command_queue &queue)
+{
+    const bool has_input = !(first == last);
+    if(has_input){
+        return dispatch_scan(first, last, result, exclusive, init, op, queue);
+    }
+
+    // nothing to scan
+    return result;
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_GPU_HPP
@@ -0,0 +1,86 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SEARCH_ALL_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_SEARCH_ALL_HPP
+
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/lambda.hpp>
+#include <boost/compute/system.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Search kernel class
+///
+/// Subclass of meta_kernel which is capable of performing pattern matching.
+///
+/// The generated kernel lets work-item \c i compare the pattern against the
+/// text starting at offset \c i and stores 1 in \c result[i] when every
+/// pattern element matched, 0 otherwise.
+///
+template<class PatternIterator, class TextIterator, class OutputIterator>
+class search_kernel : public meta_kernel
+{
+public:
+    /// Creates the kernel with all counters zeroed, so that calling exec()
+    /// before set_range() is a harmless no-op instead of reading
+    /// uninitialised members.
+    search_kernel()
+        : meta_kernel("search"),
+          m_p_count(0),
+          m_p_count_arg(0),
+          m_count(0)
+    {}
+
+    /// Records the pattern/text sizes and emits the kernel source.
+    ///
+    /// \param p_first,p_last pattern range
+    /// \param t_first,t_last text range
+    /// \param result         output range receiving one 0/1 flag per
+    ///                       candidate start offset
+    void set_range(PatternIterator p_first,
+                   PatternIterator p_last,
+                   TextIterator t_first,
+                   TextIterator t_last,
+                   OutputIterator result)
+    {
+        m_p_count = iterator_range_size(p_first, p_last);
+        m_p_count_arg = add_arg<uint_>("p_count");
+
+        // Number of start offsets at which the whole pattern still fits in
+        // the text. A pattern longer than the text fits nowhere; computing
+        // "text + 1 - pattern" in unsigned arithmetic would wrap around to
+        // a huge work size in that case, so clamp it to zero instead.
+        const size_t text_count = iterator_range_size(t_first, t_last);
+        m_count = (m_p_count <= text_count)
+            ? (text_count - m_p_count + 1)
+            : 0;
+
+        *this <<
+            "uint i = get_global_id(0);\n" <<
+            "const uint i1 = i;\n" <<
+            "uint j;\n" <<
+            "for(j = 0; j<p_count; j++,i++)\n" <<
+            "{\n" <<
+            "   if(" << p_first[expr<uint_>("j")] << " != " <<
+                    t_first[expr<uint_>("i")] << ")\n" <<
+            // on a mismatch push j past p_count so the loop ends and the
+            // "j == p_count" test below fails
+            "       j = p_count + 1;\n" <<
+            "}\n" <<
+            "if(j == p_count)\n" <<
+            result[expr<uint_>("i1")] << " = 1;\n" <<
+            "else\n" <<
+            result[expr<uint_>("i1")] << " = 0;\n";
+    }
+
+    /// Runs the kernel over every candidate start offset.
+    ///
+    /// \return the event from exec_1d(), or a default-constructed event
+    ///         when there is no offset to test
+    event exec(command_queue &queue)
+    {
+        if(m_count == 0) {
+            return event();
+        }
+
+        set_arg(m_p_count_arg, uint_(m_p_count));
+
+        return exec_1d(queue, 0, m_count);
+    }
+
+private:
+    size_t m_p_count;      // number of elements in the pattern
+    size_t m_p_count_arg;  // index of the "p_count" kernel argument
+    size_t m_count;        // number of start offsets to test (work size)
+};
+
+} //end detail namespace
+} //end compute namespace
+} //end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SEARCH_ALL_HPP
@@ -0,0 +1,56 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_ACCUMULATE_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_ACCUMULATE_HPP
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Accumulates the range [first, last) with a kernel enqueued as a single
+/// task.
+///
+/// The generated kernel starts from \p init, folds every input element
+/// into a running value with \p function (running value as first operand)
+/// and stores the final value in \c result[0].
+///
+/// \param first,last input range
+/// \param result     output iterator; only its first element is written
+/// \param init       starting value, passed as the "init" kernel argument
+/// \param function   binary function applied as function(running, element)
+/// \param queue      command queue used to build and enqueue the kernel
+template<class InputIterator, class OutputIterator, class T, class BinaryFunction>
+inline void serial_accumulate(InputIterator first,
+                              InputIterator last,
+                              OutputIterator result,
+                              T init,
+                              BinaryFunction function,
+                              command_queue &queue)
+{
+    meta_kernel k("serial_accumulate");
+
+    // "init" is registered before "count"
+    const size_t init_index = k.add_arg<T>("init");
+    const size_t count_index = k.add_arg<cl_uint>("count");
+
+    // emit the kernel body one generated statement at a time
+    k << k.decl<T>("result") << " = init;\n";
+    k << "for(uint i = 0; i < count; i++)\n";
+    k << "    result = "
+      << function(k.var<T>("result"), first[k.var<cl_uint>("i")])
+      << ";\n";
+    k << result[0] << " = result;\n";
+
+    kernel accumulate_kernel = k.compile(queue.get_context());
+
+    accumulate_kernel.set_arg(init_index, init);
+    accumulate_kernel.set_arg(
+        count_index,
+        static_cast<cl_uint>(detail::iterator_range_size(first, last))
+    );
+
+    queue.enqueue_task(accumulate_kernel);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_ACCUMULATE_HPP
@@ -0,0 +1,68 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_COUNT_IF_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_COUNT_IF_HPP
+
+#include <iterator>
+
+#include <boost/compute/container/detail/scalar.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Counts the values in [first, last) for which the predicate holds, using
+// a kernel enqueued as a single task. The kernel walks the range with one
+// loop, writes the total to a one-element device buffer, and that value is
+// read back and returned.
+template<class InputIterator, class Predicate>
+inline size_t serial_count_if(InputIterator first,
+                              InputIterator last,
+                              Predicate predicate,
+                              command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+    meta_kernel k("serial_count_if");
+
+    // the range length is bound immediately; the result pointer is set
+    // once the output buffer exists
+    k.add_set_arg("size", static_cast<uint_>(iterator_range_size(first, last)));
+    const size_t result_index =
+        k.add_arg<uint_ *>(memory_object::global_memory, "result");
+
+    // emit the counting loop
+    k << "uint count = 0;\n";
+    k << "for(uint i = 0; i < size; i++){\n";
+    k << k.decl<const value_type>("value") << "="
+      << first[k.var<uint_>("i")] << ";\n";
+    k << "if(" << predicate(k.var<const value_type>("value")) << "){\n";
+    k << "count++;\n";
+    k << "}\n";
+    k << "}\n";
+    k << "*result = count;\n";
+
+    const context &ctx = queue.get_context();
+    kernel count_kernel = k.compile(ctx);
+
+    // one-element device buffer receiving the count
+    scalar<uint_> device_count(ctx);
+    count_kernel.set_arg(result_index, device_count.get_buffer());
+
+    // run the kernel as a single task
+    queue.enqueue_task(count_kernel);
+
+    // read the count back to the host
+    return device_count.read(queue);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_COUNT_IF_HPP
@@ -0,0 +1,87 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_FIND_EXTREMA_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_FIND_EXTREMA_HPP
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/types/fundamental.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/container/detail/scalar.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Finds the position of an extremum of [first, last) with a kernel
+/// enqueued as a single task.
+///
+/// The kernel takes \c first[0] as the current best value and then visits
+/// the remaining elements in order. When searching for the minimum an
+/// element replaces the best value if \c compare(element, best) holds;
+/// when searching for the maximum the operands are swapped, i.e. it
+/// replaces it if \c compare(best, element) holds. The variant is chosen
+/// at kernel build time through the BOOST_COMPUTE_FIND_MAXIMUM define.
+///
+/// \param first,last   input range
+/// \param compare      binary comparison used as described above
+/// \param find_minimum true to search for the minimum, false for the maximum
+/// \param queue        command queue used to build and enqueue the kernel
+///
+/// \return iterator to the selected element; \p first when the range is
+///         empty
+template<class InputIterator, class Compare>
+inline InputIterator serial_find_extrema(InputIterator first,
+                                         InputIterator last,
+                                         Compare compare,
+                                         const bool find_minimum,
+                                         command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
+
+    // The kernel unconditionally reads first[0], so an empty range must not
+    // reach the device. Returning first matches what the kernel path would
+    // produce (index 0).
+    const size_t count = iterator_range_size(first, last);
+    if(count == 0){
+        return first;
+    }
+
+    const context &context = queue.get_context();
+
+    meta_kernel k("serial_find_extrema");
+
+    // NOTE: the source is emitted before the explicit add_arg() calls
+    // below; keep this order so kernel argument indices stay as they were.
+    k <<
+        k.decl<value_type>("value") << " = " << first[k.expr<uint_>("0")] << ";\n" <<
+        k.decl<uint_>("value_index") << " = 0;\n" <<
+        "for(uint i = 1; i < size; i++){\n" <<
+        "  " << k.decl<value_type>("candidate") << "="
+             << first[k.expr<uint_>("i")] << ";\n" <<
+
+        // minimum search: candidate wins if compare(candidate, value)
+        "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
+        "  if(" << compare(k.var<value_type>("candidate"),
+                           k.var<value_type>("value")) << "){\n" <<
+        // maximum search: candidate wins if compare(value, candidate)
+        "#else\n" <<
+        "  if(" << compare(k.var<value_type>("value"),
+                           k.var<value_type>("candidate")) << "){\n" <<
+        "#endif\n" <<
+
+        "    value = candidate;\n" <<
+        "    value_index = i;\n" <<
+        "  }\n" <<
+        "}\n" <<
+        "*index = value_index;\n";
+
+    size_t index_arg_index = k.add_arg<uint_ *>(memory_object::global_memory, "index");
+    size_t size_arg_index = k.add_arg<uint_>("size");
+
+    // select the maximum variant of the kernel source via a build option
+    std::string options;
+    if(!find_minimum){
+        options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
+    }
+    kernel kernel = k.compile(context, options);
+
+    // setup index buffer
+    scalar<uint_> index(context);
+    kernel.set_arg(index_arg_index, index.get_buffer());
+
+    // setup count
+    kernel.set_arg(size_arg_index, static_cast<uint_>(count));
+
+    // run kernel
+    queue.enqueue_task(kernel);
+
+    // read index and return iterator
+    return first + static_cast<difference_type>(index.read(queue));
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_FIND_EXTREMA_HPP
@@ -0,0 +1,97 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_SERIAL_MERGE_HPP
+#define BOOST_COMPUTE_ALGORITHM_SERIAL_MERGE_HPP
+
+#include <iterator>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Merges [first1, last1) and [first2, last2) into the range starting at
+/// \p result with a kernel that uses three running indices in one loop
+/// nest.
+///
+/// While both inputs have elements left, the element of the first range is
+/// emitted when \c comp(first_value, second_value) holds and the element
+/// of the second range otherwise; afterwards the remainder of whichever
+/// range is not yet exhausted is copied.
+///
+/// \return \p result advanced by the combined size of both input ranges
+template<class InputIterator1,
+         class InputIterator2,
+         class OutputIterator,
+         class Compare>
+inline OutputIterator serial_merge(InputIterator1 first1,
+                                   InputIterator1 last1,
+                                   InputIterator2 first2,
+                                   InputIterator2 last2,
+                                   OutputIterator result,
+                                   Compare comp,
+                                   command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<InputIterator1>::value_type
+        input_type1;
+    typedef typename
+        std::iterator_traits<InputIterator2>::value_type
+        input_type2;
+    typedef typename
+        std::iterator_traits<OutputIterator>::difference_type
+        result_difference_type;
+
+    const std::ptrdiff_t size1 = std::distance(first1, last1);
+    const std::ptrdiff_t size2 = std::distance(first2, last2);
+
+    meta_kernel kern("serial_merge");
+    kern.add_set_arg<uint_>("size1", static_cast<uint_>(size1));
+    kern.add_set_arg<uint_>("size2", static_cast<uint_>(size2));
+
+    // i: index in result range, j: index in first input range,
+    // k: index in second input range
+    kern << "uint i = 0;\n"
+         << "uint j = 0;\n"
+         << "uint k = 0;\n";
+
+    // prefetch the head element of each input range
+    kern << kern.decl<input_type1>("j_value") << " = " << first1[0] << ";\n"
+         << kern.decl<input_type2>("k_value") << " = " << first2[0] << ";\n";
+
+    // main merge loop: runs while both ranges still have elements
+    kern << "while(j < size1 && k < size2){\n"
+         << "    if(" << comp(kern.var<input_type1>("j_value"),
+                              kern.var<input_type2>("k_value")) << "){\n"
+         << "        " << result[kern.var<uint_>("i++")] << " = j_value;\n"
+         << "        j_value = " << first1[kern.var<uint_>("++j")] << ";\n"
+         << "    }\n"
+         << "    else{\n"
+         << "        " << result[kern.var<uint_>("i++")] << " = k_value;\n"
+         << "        k_value = " << first2[kern.var<uint_>("++k")] << ";\n"
+         << "    }\n"
+         << "}\n";
+
+    // drain whatever is left of the first range
+    kern << "while(j < size1){\n"
+         << result[kern.var<uint_>("i++")] << " = "
+         << first1[kern.var<uint_>("j++")] << ";\n"
+         << "}\n";
+
+    // drain whatever is left of the second range
+    kern << "while(k < size2){\n"
+         << result[kern.var<uint_>("i++")] << " = "
+         << first2[kern.var<uint_>("k++")] << ";\n"
+         << "}\n";
+
+    // build and run the kernel
+    kern.exec(queue);
+
+    return result + static_cast<result_difference_type>(size1 + size2);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_SERIAL_MERGE_HPP
@@ -0,0 +1,62 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_HPP
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/type_traits/result_of.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Reduces the range [first, last) with a kernel enqueued as a single
+/// task.
+///
+/// The generated kernel seeds its running value with \c first[0], combines
+/// it with each following element through \p function (running value as
+/// first operand) and stores the outcome in \c result[0]. Nothing is
+/// enqueued and \p result is left untouched when the range is empty.
+///
+/// \param first,last input range
+/// \param result     output iterator; only its first element is written
+/// \param function   binary function applied as function(running, element)
+/// \param queue      command queue used to build and enqueue the kernel
+template<class InputIterator, class OutputIterator, class BinaryFunction>
+inline void serial_reduce(InputIterator first,
+                          InputIterator last,
+                          OutputIterator result,
+                          BinaryFunction function,
+                          command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<InputIterator>::value_type T;
+    typedef typename
+        ::boost::compute::result_of<BinaryFunction(T, T)>::type result_type;
+
+    // an empty range has no first element to seed the reduction with
+    const size_t count = detail::iterator_range_size(first, last);
+    if(count == 0){
+        return;
+    }
+
+    meta_kernel k("serial_reduce");
+    const size_t count_index = k.add_arg<cl_uint>("count");
+
+    // emit the kernel body one generated statement at a time
+    k << k.decl<result_type>("result") << " = " << first[0] << ";\n";
+    k << "for(uint i = 1; i < count; i++)\n";
+    k << "    result = "
+      << function(k.var<T>("result"), first[k.var<uint_>("i")])
+      << ";\n";
+    k << result[0] << " = result;\n";
+
+    kernel reduce_kernel = k.compile(queue.get_context());
+
+    reduce_kernel.set_arg(count_index, static_cast<uint_>(count));
+
+    queue.enqueue_task(reduce_kernel);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_HPP
@@ -0,0 +1,108 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_BY_KEY_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_BY_KEY_HPP
+
+#include <iterator>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/container/detail/scalar.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/type_traits/result_of.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Reduces runs of consecutive keys with a kernel enqueued as a single
+/// task.
+///
+/// The kernel walks the keys in order. While \c predicate(previous_key,
+/// key) holds, the current value is folded into the running result with
+/// \p function; otherwise the finished (key, result) pair is stored in the
+/// output ranges and a new run starts with the current value. The last run
+/// is stored after the loop.
+///
+/// \param keys_first,keys_last input key range
+/// \param values_first         start of the input values (indexed like the
+///                             keys)
+/// \param keys_result          start of the output keys
+/// \param values_result        start of the output values
+/// \param function             binary function combining two values
+/// \param predicate            binary predicate comparing two keys
+/// \param queue                command queue used to build and enqueue the
+///                             kernel
+///
+/// \return number of (key, value) pairs written; the input size itself
+///         (i.e. 0) when the key range is empty
+template<class InputKeyIterator, class InputValueIterator,
+         class OutputKeyIterator, class OutputValueIterator,
+         class BinaryFunction, class BinaryPredicate>
+inline size_t serial_reduce_by_key(InputKeyIterator keys_first,
+                                   InputKeyIterator keys_last,
+                                   InputValueIterator values_first,
+                                   OutputKeyIterator keys_result,
+                                   OutputValueIterator values_result,
+                                   BinaryFunction function,
+                                   BinaryPredicate predicate,
+                                   command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<InputValueIterator>::value_type value_type;
+    typedef typename
+        std::iterator_traits<InputKeyIterator>::value_type key_type;
+    typedef typename
+        ::boost::compute::result_of<BinaryFunction(value_type, value_type)>::type result_type;
+
+    const size_t count = detail::iterator_range_size(keys_first, keys_last);
+    if(count < 1){
+        return count;
+    }
+
+    meta_kernel k("serial_reduce_by_key");
+    const size_t count_index = k.add_arg<uint_>("count");
+    const size_t result_size_index =
+        k.add_arg<uint_ *>(memory_object::global_memory, "result_size");
+
+    convert<result_type> to_result_type;
+
+    // kernel locals, seeded from the first key/value pair
+    k << k.decl<result_type>("result")
+      << " = " << to_result_type(values_first[0]) << ";\n";
+    k << k.decl<key_type>("previous_key") << " = " << keys_first[0] << ";\n";
+    k << k.decl<result_type>("value") << ";\n";
+    k << k.decl<key_type>("key") << ";\n";
+    k << k.decl<uint_>("size") << " = 1;\n";
+
+    // provisional first output pair
+    k << keys_result[0] << " = previous_key;\n";
+    k << values_result[0] << " = result;\n";
+
+    // walk the remaining pairs
+    k << "for(ulong i = 1; i < count; i++) {\n";
+    k << "    value = " << to_result_type(values_first[k.var<uint_>("i")]) << ";\n";
+    k << "    key = " << keys_first[k.var<uint_>("i")] << ";\n";
+
+    // same run: fold the value into the running result
+    k << "    if (" << predicate(k.var<key_type>("previous_key"),
+                                 k.var<key_type>("key")) << ") {\n";
+    k << "        result = " << function(k.var<result_type>("result"),
+                                         k.var<result_type>("value")) << ";\n";
+    k << "    }\n ";
+
+    // new run: store the finished pair and restart from the current value
+    k << "    else { \n";
+    k << keys_result[k.var<uint_>("size - 1")] << " = previous_key;\n";
+    k << values_result[k.var<uint_>("size - 1")] << " = result;\n";
+    k << "        result = value;\n";
+    k << "        size++;\n";
+    k << "    } \n";
+    k << "    previous_key = key;\n";
+    k << "}\n";
+
+    // store the last run and report how many pairs were produced
+    k << keys_result[k.var<uint_>("size - 1")] << " = previous_key;\n";
+    k << values_result[k.var<uint_>("size - 1")] << " = result;\n";
+    k << "*result_size = size;";
+
+    const context &ctx = queue.get_context();
+    kernel reduce_kernel = k.compile(ctx);
+
+    // one-element device buffer receiving the number of output pairs
+    scalar<uint_> device_size(ctx);
+    reduce_kernel.set_arg(result_size_index, device_size.get_buffer());
+    reduce_kernel.set_arg(count_index, static_cast<uint_>(count));
+
+    queue.enqueue_task(reduce_kernel);
+
+    return static_cast<size_t>(device_size.read(queue));
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_BY_KEY_HPP
@@ -0,0 +1,103 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_SCAN_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_SCAN_HPP
+
+#include <iterator>
+
+#include <boost/compute/device.hpp>
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Computes a scan (prefix combination) of [first, last) with a kernel
+/// launched on a one-element 1D range.
+///
+/// Exclusive mode starts the running value at \p init and writes it to
+/// each output slot before combining it with the input element. Inclusive
+/// mode starts from \c first[0], writes that to \c result[0], and from
+/// index 1 on writes the running value after combining.
+///
+/// \param first,last input range
+/// \param result     start of the output range
+/// \param exclusive  true for an exclusive scan, false for an inclusive one
+/// \param init       initial value, passed as the "initial_value" argument
+/// \param op         binary operator applied as op(sum, element)
+/// \param queue      command queue used to build and enqueue the kernel
+///
+/// \return \p result unchanged for an empty range, otherwise \p result
+///         advanced by the size of the input range
+template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
+inline OutputIterator serial_scan(InputIterator first,
+                                  InputIterator last,
+                                  OutputIterator result,
+                                  bool exclusive,
+                                  T init,
+                                  BinaryOperator op,
+                                  command_queue &queue)
+{
+    if(first == last){
+        return result;
+    }
+
+    typedef typename
+        std::iterator_traits<InputIterator>::value_type input_type;
+    typedef typename
+        std::iterator_traits<OutputIterator>::value_type output_type;
+
+    const size_t n = detail::iterator_range_size(first, last);
+
+    meta_kernel k("serial_scan");
+
+    // kernel arguments: element count, then the initial value
+    const size_t n_index = k.add_arg<ulong_>("n");
+    const size_t init_index = k.add_arg<output_type>("initial_value");
+
+    // prologue: where the loop starts and what the running sum starts as
+    if(exclusive){
+        k << k.decl<const ulong_>("start_idx") << " = 0;\n";
+        k << k.decl<output_type>("sum") << " = initial_value;\n";
+    }
+    else {
+        k << k.decl<const ulong_>("start_idx") << " = 1;\n";
+        k << k.decl<output_type>("sum") << " = " << first[0] << ";\n";
+        k << result[0] << " = sum;\n";
+    }
+
+    // loop header and element fetch
+    k << "for(ulong i = start_idx; i < n; i++){\n";
+    k << k.decl<const input_type>("x") << " = "
+      << first[k.var<ulong_>("i")] << ";\n";
+
+    // loop body: exclusive stores before combining, inclusive after
+    if(exclusive){
+        k << result[k.var<ulong_>("i")] << " = sum;\n";
+        k << "    sum = "
+          << op(k.var<output_type>("sum"), k.var<output_type>("x"))
+          << ";\n";
+    }
+    else {
+        k << "    sum = "
+          << op(k.var<output_type>("sum"), k.var<output_type>("x"))
+          << ";\n";
+        k << result[k.var<ulong_>("i")] << " = sum;\n";
+    }
+
+    k << "}\n";
+
+    // build the kernel and bind its arguments
+    kernel scan_kernel = k.compile(queue.get_context());
+    scan_kernel.set_arg<ulong_>(n_index, n);
+    scan_kernel.set_arg<output_type>(init_index, static_cast<output_type>(init));
+
+    // launch with a global and local size of one
+    queue.enqueue_1d_range_kernel(scan_kernel, 0, 1, 1);
+
+    // iterator pointing to the end of the result range
+    return result + n;
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_SCAN_HPP