Thrust is a very handy STL-like template library for rapid data processing on GPUs.
In real applications, the same data processing step (in Thrust's terminology, a transform) often has to be performed multiple times on different datasets. Algorithms that involve a reduction step (reduce, sort, transform-reduce, etc.) require temporary array allocations. By default, these allocations are performed inside each individual algorithm call, adding cudaMalloc/cudaFree pairs that can be quite expensive.
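To see why this matters, here is a minimal sketch of the default behavior (our own illustration, not part of the example below): every call to thrust::reduce internally allocates and frees its own scratch buffer, so a loop of reductions pays one cudaMalloc/cudaFree pair per iteration.

#include <thrust/device_vector.h>
#include <thrust/reduce.h>

int main()
{
    thrust::device_vector<int> data(1 << 22, 1);

    // each call allocates temporary storage on entry and frees it on
    // exit -- five reductions, five cudaMalloc/cudaFree pairs
    for (int i = 0; i < 5; ++i)
        thrust::reduce(data.begin(), data.end());

    return 0;
}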
An obvious optimization here is to reuse a single set of temporary arrays across multiple transforms. One could do this by hooking cudaMalloc/cudaFree at the application level, but Thrust offers a more elegant solution, recently pointed out to us by Eli Graser:
#include <thrust/system/cuda/vector.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/host_vector.h>
#include <thrust/generate.h>
#include <thrust/pair.h>
#include <iostream>
#include <map>
#include <cstdlib> // for rand()
// Example by Nathan Bell and Jared Hoberock
// (modified by Mihail Ivakhnenko)
//
// This example demonstrates how to intercept calls to get_temporary_buffer
// and return_temporary_buffer to control how Thrust allocates temporary storage
// during algorithms such as thrust::reduce. The idea will be to create a simple
// cache of allocations to search when temporary storage is requested. If a hit
// is found in the cache, we quickly return the cached allocation instead of
// resorting to the more expensive thrust::cuda::malloc.
//
// Note: this implementation of cached_allocator is not thread-safe. If
// multiple (host) threads use the same cached_allocator, they should gain
// exclusive access to the allocator before calling its methods.
// cached_allocator: a simple allocator for caching allocation requests
class cached_allocator
{
public:
    // just allocate bytes
    typedef char value_type;

    cached_allocator() {}

    ~cached_allocator()
    {
        // free all allocations when cached_allocator goes out of scope
        free_all();
    }
    char* allocate(std::ptrdiff_t num_bytes)
    {
        char* result = 0;

        // search the cache for a free block of the requested size
        free_blocks_type::iterator free_block = free_blocks.find(num_bytes);
        if (free_block != free_blocks.end())
        {
            std::cout << "cached_allocator::allocate(): found a hit" << std::endl;

            // get the pointer
            result = free_block->second;

            // erase from the free_blocks map
            free_blocks.erase(free_block);
        }
        else
        {
            // no block of the right size exists: create a new one with
            // cuda::malloc, which throws if it can't satisfy the request
            std::cout << "cached_allocator::allocate(): no free block found; calling cuda::malloc" << std::endl;

            // allocate memory and convert cuda::pointer to raw pointer
            result = thrust::cuda::malloc<char>(num_bytes).get();
        }

        // insert the allocated pointer into the allocated_blocks map
        allocated_blocks.insert(std::make_pair(result, num_bytes));
        return result;
    }
    void deallocate(char* ptr, size_t n)
    {
        // erase the allocated block from the allocated_blocks map;
        // the block's size is looked up there, so n is not used
        allocated_blocks_type::iterator iter = allocated_blocks.find(ptr);
        std::ptrdiff_t num_bytes = iter->second;
        allocated_blocks.erase(iter);

        // insert the block into the free_blocks map
        free_blocks.insert(std::make_pair(num_bytes, ptr));
    }
private:
    typedef std::multimap<std::ptrdiff_t, char*> free_blocks_type;
    typedef std::map<char*, std::ptrdiff_t>      allocated_blocks_type;

    free_blocks_type      free_blocks;
    allocated_blocks_type allocated_blocks;

    void free_all()
    {
        std::cout << "cached_allocator::free_all(): cleaning up after ourselves..." << std::endl;

        // deallocate all outstanding blocks in both containers
        for (free_blocks_type::iterator i = free_blocks.begin();
             i != free_blocks.end(); ++i)
        {
            // transform the pointer to cuda::pointer before calling cuda::free
            thrust::cuda::free(thrust::cuda::pointer<char>(i->second));
        }
        for (allocated_blocks_type::iterator i = allocated_blocks.begin();
             i != allocated_blocks.end(); ++i)
        {
            // transform the pointer to cuda::pointer before calling cuda::free
            thrust::cuda::free(thrust::cuda::pointer<char>(i->first));
        }
    }
};
int main()
{
#if defined(THRUST_GCC_VERSION) && (THRUST_GCC_VERSION < 40400)
    std::cout << "This feature requires gcc >= 4.4" << std::endl;
    return 0;
#endif

    size_t n = 1 << 22;

    // create a cached_allocator object
    cached_allocator alloc;

    // generate and reduce random input
    thrust::host_vector<int> a_r_input(n);
    thrust::generate(a_r_input.begin(), a_r_input.end(), rand);
    thrust::cuda::vector<int> a_input = a_r_input;
    thrust::reduce(thrust::cuda::par(alloc), a_input.begin(), a_input.end());

    // generate and reduce random input
    thrust::host_vector<int> b_r_input(n);
    thrust::generate(b_r_input.begin(), b_r_input.end(), rand);
    thrust::cuda::vector<int> b_input = b_r_input;
    thrust::reduce(thrust::cuda::par(alloc), b_input.begin(), b_input.end());

    // generate and reduce random input
    thrust::host_vector<int> c_r_input(n);
    thrust::generate(c_r_input.begin(), c_r_input.end(), rand);
    thrust::cuda::vector<int> c_input = c_r_input;
    thrust::reduce(thrust::cuda::par(alloc), c_input.begin(), c_input.end());

    // generate and reduce random input
    thrust::host_vector<int> d_r_input(n);
    thrust::generate(d_r_input.begin(), d_r_input.end(), rand);
    thrust::cuda::vector<int> d_input = d_r_input;
    thrust::reduce(thrust::cuda::par(alloc), d_input.begin(), d_input.end());

    // generate and reduce random input
    thrust::host_vector<int> e_r_input(n);
    thrust::generate(e_r_input.begin(), e_r_input.end(), rand);
    thrust::cuda::vector<int> e_input = e_r_input;
    thrust::reduce(thrust::cuda::par(alloc), e_input.begin(), e_input.end());

    return 0;
}
With this custom allocator, cudaMalloc and cudaFree are each called only once for the temporary storage: the first reduction allocates the buffer, the remaining four reuse it from the cache, and no allocation calls happen between the reductions.
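The cache is not tied to thrust::reduce, either: any Thrust algorithm that accepts an execution policy can draw its temporary storage from the same allocator. As a small sketch (reusing alloc, a_input and b_input from the listing above, and additionally including thrust/sort.h):

// The first sort will most likely miss the cache, since its scratch
// space differs in size from the reduction's, but repeated sorts of
// equally-sized inputs then reuse the cached block:
thrust::sort(thrust::cuda::par(alloc), a_input.begin(), a_input.end());
thrust::sort(thrust::cuda::par(alloc), b_input.begin(), b_input.end());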
In a real financial application that we develop, this optimization resulted in an 8% performance improvement.
If you need help with Machine Learning, Computer Vision, or GPU computing in general, please reach out to us at Applied Parallel Computing LLC.