import pycuda.gpuarray as gpuarray
import pycuda.driver as drv
import pycuda.autoinit
from scikits.cuda import misc, cublas
import numpy as np
def get_gpu_ptrs(arr, offset=(0, 0)):
    """Return a GPU array holding the device pointer of each array in `arr`.

    `offset` is a (row, column) offset applied to each array before its
    pointer is taken.
    """
    m, n = offset
shape = arr[0].shape
if len(shape) == 1:
ptrs = [int(x[m:].gpudata) for x in arr]
else:
ptrs = [int(x[m:, n:].gpudata) for x in arr]
    # Pointer arrays for the batched CUBLAS calls must hold
    # pointer-sized integers.
    return gpuarray.to_gpu(np.array(ptrs, dtype=np.intp))
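# A minimal usage sketch for get_gpu_ptrs. The _demo_* helpers below are
# illustrative additions, not part of the module API: gemmBatched-style CUBLAS
# calls take a device array of pointers, one per matrix in the batch, which is
# exactly what get_gpu_ptrs builds.
def _demo_get_gpu_ptrs():
    a_arr_gpu = [gpuarray.to_gpu(np.random.rand(4, 2).astype(np.float32))
                 for _ in range(8)]
    a_ptr_gpu = get_gpu_ptrs(a_arr_gpu)  # one device pointer per matrix
    return a_arr_gpu, a_ptr_gpu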
def m_dot_batch(*args):
    """Chain batched products over (arrays, pointers, trans) triples.

    Computes op(A0[i]) . op(A1[i]) . ... for each i in the batch, folding
    left to right with `dot_batch`.
    """
    tmp_arr, tmp_ptrs, _trans = args[0]
    for next_arr, next_ptrs, trans in args[1:]:
        tmp_arr, tmp_ptrs = dot_batch(tmp_arr, next_arr, tmp_ptrs, next_ptrs,
                                      transa=_trans, transb=trans)
        # The accumulated result is stored untransposed, so the first
        # factor's flag is applied exactly once.
        _trans = 'N'
    return tmp_arr, tmp_ptrs
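# An illustrative sketch of chaining products with m_dot_batch; assumes
# scikits.cuda.linalg.init() (or misc.init()) has been called so a global
# CUBLAS handle exists. This computes A[i]^T . B[i] . C[i] for each i.
def _demo_m_dot_batch():
    def make(r, c):
        return [gpuarray.to_gpu(np.random.rand(r, c).astype(np.float32))
                for _ in range(8)]
    a, b, c = make(3, 4), make(3, 5), make(5, 2)
    res_arr, res_ptrs = m_dot_batch((a, get_gpu_ptrs(a), 'T'),
                                    (b, get_gpu_ptrs(b), 'N'),
                                    (c, get_gpu_ptrs(c), 'N'))
    return res_arr  # a batch of 4x2 results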
def dot_batch(a_arr_gpu, b_arr_gpu, a_ptr_gpu, b_ptr_gpu,
              transa='N', transb='N'):
    """Allocate outputs and compute c[i] = op(a[i]) . op(b[i]) for a batch.

    Returns the list of result arrays together with a GPU array of their
    device pointers.
    """
N = len(a_arr_gpu)
a_shape = a_arr_gpu[0].shape
a_dtype = a_arr_gpu[0].dtype
if len(a_shape) == 1:
a_shape = (1, a_shape[0])
b_shape = b_arr_gpu[0].shape
if len(b_shape) == 1:
b_shape = (1, b_shape[0])
    transa = transa.lower()
    transb = transb.lower()
if transb in ['t', 'c']:
m, k = b_shape
elif transb in ['n']:
k, m = b_shape
else:
raise ValueError('invalid value for transb')
if transa in ['t', 'c']:
l, n = a_shape
elif transa in ['n']:
n, l = a_shape
else:
raise ValueError('invalid value for transa')
c_shape = (n, m)
c_arr_gpu = [gpuarray.empty(c_shape, a_dtype) for _ in range(N)]
c_ptr_gpu = get_gpu_ptrs(c_arr_gpu)
    dot_batch_nocheck(a_arr_gpu, b_arr_gpu, c_arr_gpu,
                      a_ptr_gpu, b_ptr_gpu, c_ptr_gpu,
                      transa, transb, b=0)
return c_arr_gpu, c_ptr_gpu
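# An illustrative one-shot batched product with dot_batch, which allocates the
# outputs itself; assumes an initialised CUBLAS handle as above.
def _demo_dot_batch():
    a = [gpuarray.to_gpu(np.random.rand(4, 3).astype(np.float32)) for _ in range(8)]
    b = [gpuarray.to_gpu(np.random.rand(3, 5).astype(np.float32)) for _ in range(8)]
    c_arr, c_ptrs = dot_batch(a, b, get_gpu_ptrs(a), get_gpu_ptrs(b))
    # Each result is a 4x5 GPUArray; compare one against numpy:
    assert np.allclose(c_arr[0].get(), np.dot(a[0].get(), b[0].get()), atol=1e-5)
    return c_arr, c_ptrs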
def dot_batch_nocheck(a_arr_gpu, b_arr_gpu, c_arr_gpu, a_ptr_gpu, b_ptr_gpu, c_ptr_gpu,
                      transa='N', transb='N', a=1, b=1, handle=None):
"""
    Batched matrix products on the GPU.

    Computes ``c_arr_gpu[i] = a * op(a_arr_gpu[i]) . op(b_arr_gpu[i]) + b * c_arr_gpu[i]``
    for every i in the batch with a single CUBLAS gemmBatched call.

Parameters
----------
a_arr_gpu : list of pycuda.gpuarray.GPUArray
Input array.
b_arr_gpu : list of pycuda.gpuarray.GPUArray
Input array.
c_arr_gpu : list of pycuda.gpuarray.GPUArray
Input/Output array.
a_ptr_gpu : pycuda.gpuarray.GPUArray
Array of pointers to arrays in a_arr_gpu
b_ptr_gpu : pycuda.gpuarray.GPUArray
Array of pointers to arrays in b_arr_gpu
c_ptr_gpu : pycuda.gpuarray.GPUArray
Array of pointers to arrays in c_arr_gpu
    transa : char
        If 'T', compute the product of the transpose of `a_arr_gpu[i]`.
        If 'C', compute the product of the Hermitian of `a_arr_gpu[i]`.
        If 'N' (the default), use `a_arr_gpu[i]` as-is.
    transb : char
        If 'T', compute the product of the transpose of `b_arr_gpu[i]`.
        If 'C', compute the product of the Hermitian of `b_arr_gpu[i]`.
        If 'N' (the default), use `b_arr_gpu[i]` as-is.
    a : scalar
        Scale factor applied to each product (default 1).
    b : scalar
        Scale factor applied to the existing contents of `c_arr_gpu[i]`
        (default 1).
handle : int
CUBLAS context. If no context is specified, the default handle from
`scikits.cuda.misc._global_cublas_handle` is used.

    Returns
    -------
    None. The results are written into the arrays of `c_arr_gpu`.

    Notes
    -----
    All matrices must contain elements of the same data type, and all
    matrices within a given list must have the same shape.

Examples
--------
>>> import pycuda.driver
>>> import pycuda.autoinit
>>> from pycuda import gpuarray
>>> import numpy as np
>>> from scikits.cuda import linalg
>>> linalg.init()
>>> a_arr = [np.asarray(np.random.rand(4, 2), np.float32) for i in range(10)]
>>> b_arr = [np.asarray(np.random.rand(2, 2), np.float32) for i in range(10)]
>>> c_arr = [np.asarray(np.random.rand(4, 2), np.float32) for i in range(10)]
>>> a_arr_gpu = [gpuarray.to_gpu(a_gpu) for a_gpu in a_arr]
>>> b_arr_gpu = [gpuarray.to_gpu(b_gpu) for b_gpu in b_arr]
>>> c_arr_gpu = [gpuarray.to_gpu(c_gpu) for c_gpu in c_arr]
>>> a_ptr_gpu = gpuarray.to_gpu(np.asarray([int(a_gpu.gpudata) for a_gpu in a_arr_gpu]))
>>> b_ptr_gpu = gpuarray.to_gpu(np.asarray([int(b_gpu.gpudata) for b_gpu in b_arr_gpu]))
>>> c_ptr_gpu = gpuarray.to_gpu(np.asarray([int(c_gpu.gpudata) for c_gpu in c_arr_gpu]))
>>> linalg.dot_batch_nocheck(a_arr_gpu, b_arr_gpu, c_arr_gpu, a_ptr_gpu, b_ptr_gpu, c_ptr_gpu)
>>> for i in range(10):
    ...     print(np.allclose(np.dot(a_arr[i], b_arr[i]) + c_arr[i], c_arr_gpu[i].get()))
...
True
True
True
True
True
True
True
True
True
True
"""
if handle is None:
handle = misc._global_cublas_handle
N = len(a_arr_gpu)
a_shape = a_arr_gpu[0].shape
a_dtype = a_arr_gpu[0].dtype
b_shape = b_arr_gpu[0].shape
b_dtype = b_arr_gpu[0].dtype
c_shape = c_arr_gpu[0].shape
c_dtype = c_arr_gpu[0].dtype
for i in range(N):
assert a_arr_gpu[i].shape == a_shape
assert a_arr_gpu[i].dtype == a_dtype
assert b_arr_gpu[i].shape == b_shape
assert b_arr_gpu[i].dtype == b_dtype
assert c_arr_gpu[i].shape == c_shape
assert c_arr_gpu[i].dtype == c_dtype
assert a_arr_gpu[i].flags.c_contiguous
assert b_arr_gpu[i].flags.c_contiguous
assert c_arr_gpu[i].flags.c_contiguous
if len(a_shape) == 1:
a_shape = (1, a_shape[0])
if len(b_shape) == 1:
b_shape = (1, b_shape[0])
if len(c_shape) == 1:
c_shape = (1, c_shape[0])
    transa = transa.lower()
    transb = transb.lower()
if transb in ['t', 'c']:
m, k = b_shape
elif transb in ['n']:
k, m = b_shape
else:
raise ValueError('invalid value for transb')
if transa in ['t', 'c']:
l, n = a_shape
elif transa in ['n']:
n, l = a_shape
else:
raise ValueError('invalid value for transa')
    p, q = c_shape
    if l != k or p != n or q != m:
        raise ValueError('objects are not aligned')
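    # CUBLAS expects column-major storage, while these GPUArrays are
    # C (row-major) contiguous.  A row-major buffer read column-major is its
    # transpose, so the call below computes C^T = op(B)^T . op(A)^T with the
    # operands swapped; the leading dimensions are therefore the row strides
    # (i.e. column counts) of the row-major arrays.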
if transb == 'n':
lda = max(1, m)
else:
lda = max(1, k)
if transa == 'n':
ldb = max(1, k)
else:
ldb = max(1, n)
ldc = max(1, m)
    if a_dtype == b_dtype == c_dtype == np.complex64:
        cublas_func = cublas.cublasCgemmBatched
        alpha = np.complex64(a)
        beta = np.complex64(b)
    elif a_dtype == b_dtype == c_dtype == np.float32:
        cublas_func = cublas.cublasSgemmBatched
        alpha = np.float32(a)
        beta = np.float32(b)
    elif a_dtype == b_dtype == c_dtype == np.complex128:
        cublas_func = cublas.cublasZgemmBatched
        alpha = np.complex128(a)
        beta = np.complex128(b)
    elif a_dtype == b_dtype == c_dtype == np.float64:
        cublas_func = cublas.cublasDgemmBatched
        alpha = np.float64(a)
        beta = np.float64(b)
    else:
        raise ValueError('unsupported combination of input types')
cublas_func(handle, transb, transa, m, n, k, alpha, b_ptr_gpu.gpudata, lda,
a_ptr_gpu.gpudata, ldb, beta, c_ptr_gpu.gpudata, ldc, N)
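# A pure-numpy sketch (illustrative only) of the identity behind the operand
# swap above: (A . B)^T == B^T . A^T, so asking a column-major library for
# op(B)^T . op(A)^T and reading the result as row-major yields op(A) . op(B).
def _demo_rowmajor_trick():
    a = np.random.rand(4, 3)
    b = np.random.rand(3, 5)
    assert np.allclose(np.dot(b.T, a.T).T, np.dot(a, b))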
def batch_sum(a_arr_gpu, a_ptr_gpu):
    """
    Compute the sum of the elements of each array in `a_arr_gpu`.

    Returns a list with one scalar per array, obtained by multiplying each
    flattened array against a vector of ones via `dot_batch`.
    """
    if len(a_arr_gpu[0].shape) != 1:
        n, m = a_arr_gpu[0].shape
        total_size = n * m
        # ravel() on a contiguous GPUArray returns a view, so the device
        # pointers in a_ptr_gpu stay valid for the flattened arrays.
        flat_a_gpu = [a.ravel() for a in a_arr_gpu]
    else:
        total_size = a_arr_gpu[0].shape[0]
        flat_a_gpu = a_arr_gpu
    # Use a synchronous copy (the source array is not page-locked) and match
    # the input dtype so dot_batch can dispatch the right CUBLAS routine.
    ones_vec = gpuarray.to_gpu(np.ones((total_size, 1),
                                       dtype=a_arr_gpu[0].dtype))
    ones_arr_gpu = [ones_vec for _ in range(len(a_arr_gpu))]
    ones_ptr_gpu = get_gpu_ptrs(ones_arr_gpu)
    res_arr, res_ptrs = dot_batch(flat_a_gpu, ones_arr_gpu, a_ptr_gpu,
                                  ones_ptr_gpu)
    return [r.get()[0, 0] for r in res_arr]
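# An illustrative sketch of batch_sum: one elementwise total per matrix in the
# batch; assumes an initialised CUBLAS handle.
def _demo_batch_sum():
    a = [gpuarray.to_gpu(np.random.rand(4, 2).astype(np.float32)) for _ in range(8)]
    sums = batch_sum(a, get_gpu_ptrs(a))
    assert np.allclose(sums, [x.get().sum() for x in a], atol=1e-4)
    return sums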
def gemm(a_gpu, b_gpu, c_gpu, transa='N', transb='N', alpha=1, beta=0, handle=None):
    """
    General matrix product: c_gpu = alpha * op(a_gpu) . op(b_gpu) + beta * c_gpu.

    Row-major (C) and column-major (F) operands are both supported, but
    `a_gpu` and `b_gpu` must share the same memory order.
    """
    if handle is None:
        handle = misc._global_cublas_handle
    if len(a_gpu.shape) == 1 and len(b_gpu.shape) == 1:
        raise RuntimeError('vector-vector dot products are not supported by gemm')
# Get the shapes of the arguments (accounting for the
# possibility that one of them may only have one dimension):
a_shape = a_gpu.shape
b_shape = b_gpu.shape
if len(a_shape) == 1:
a_shape = (1, a_shape[0])
if len(b_shape) == 1:
b_shape = (1, b_shape[0])
    # Select the CUBLAS function matching the input dtypes:
if (a_gpu.dtype == np.complex64 and b_gpu.dtype == np.complex64):
cublas_func = cublas.cublasCgemm
alpha = np.complex64(alpha)
beta = np.complex64(beta)
elif (a_gpu.dtype == np.float32 and b_gpu.dtype == np.float32):
cublas_func = cublas.cublasSgemm
alpha = np.float32(alpha)
beta = np.float32(beta)
elif (a_gpu.dtype == np.complex128 and b_gpu.dtype == np.complex128):
cublas_func = cublas.cublasZgemm
alpha = np.complex128(alpha)
beta = np.complex128(beta)
elif (a_gpu.dtype == np.float64 and b_gpu.dtype == np.float64):
cublas_func = cublas.cublasDgemm
alpha = np.float64(alpha)
beta = np.float64(beta)
else:
raise ValueError('unsupported combination of input types')
transa = transa.lower()
transb = transb.lower()
if a_gpu.flags.c_contiguous != b_gpu.flags.c_contiguous:
raise ValueError('unsupported combination of input order')
if a_gpu.flags.f_contiguous:
if transa in ['t', 'c']:
k, m = a_shape
elif transa in ['n']:
m, k = a_shape
else:
raise ValueError('invalid value for transa')
if transb in ['t', 'c']:
n, l = b_shape
elif transb in ['n']:
l, n = b_shape
else:
raise ValueError('invalid value for transb')
if l != k:
raise ValueError('objects are not aligned')
lda = max(1, a_shape[0])
ldb = max(1, b_shape[0])
ldc = max(1, m)
if c_gpu.shape != (m, n) or c_gpu.dtype != a_gpu.dtype:
raise ValueError('invalid value for c_gpu')
if a_gpu.flags.f_contiguous != c_gpu.flags.f_contiguous:
raise ValueError('invalid order for c_gpu')
cublas_func(handle, transa, transb, m, n, k, alpha, a_gpu.gpudata,
lda, b_gpu.gpudata, ldb, beta, c_gpu.gpudata, ldc)
else:
if transb in ['t', 'c']:
m, k = b_shape
elif transb in ['n']:
k, m = b_shape
else:
raise ValueError('invalid value for transb')
if transa in ['t', 'c']:
l, n = a_shape
elif transa in ['n']:
n, l = a_shape
else:
raise ValueError('invalid value for transa')
if l != k:
raise ValueError('objects are not aligned')
lda = max(1, b_shape[1])
ldb = max(1, a_shape[1])
ldc = max(1, m)
        # Note that the desired shape of the output matrix is the transpose
        # of what CUBLAS assumes, since the inputs here are row-major:
        if c_gpu.shape != (n, m) or c_gpu.dtype != a_gpu.dtype:
raise ValueError('invalid value for c_gpu')
if a_gpu.flags.f_contiguous != c_gpu.flags.f_contiguous:
raise ValueError('invalid order for c_gpu')
cublas_func(handle, transb, transa, m, n, k, alpha, b_gpu.gpudata,
lda, a_gpu.gpudata, ldb, beta, c_gpu.gpudata, ldc)
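# An illustrative usage sketch for gemm with row-major (C-contiguous) inputs;
# the caller allocates the output, whose shape must be (rows of op(a), cols of
# op(b)). Assumes an initialised CUBLAS handle.
def _demo_gemm():
    a_gpu = gpuarray.to_gpu(np.random.rand(4, 3).astype(np.float32))
    b_gpu = gpuarray.to_gpu(np.random.rand(3, 5).astype(np.float32))
    c_gpu = gpuarray.empty((4, 5), np.float32)
    gemm(a_gpu, b_gpu, c_gpu)  # c = 1 * a . b + 0 * c
    assert np.allclose(c_gpu.get(), np.dot(a_gpu.get(), b_gpu.get()), atol=1e-5)
    return c_gpu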
def geam(a_gpu, b_gpu, c_gpu, transa='N', transb='N', alpha=1, beta=1, handle=None):
    """
    Matrix addition: c_gpu = alpha * op(a_gpu) + beta * op(b_gpu).

    Row-major (C) and column-major (F) operands are both supported, but
    `a_gpu` and `b_gpu` must share the same memory order.
    """
    if handle is None:
        handle = misc._global_cublas_handle
    if len(a_gpu.shape) == 1 and len(b_gpu.shape) == 1:
        raise RuntimeError('one-dimensional arrays are not supported by geam')
# Get the shapes of the arguments (accounting for the
# possibility that one of them may only have one dimension):
a_shape = a_gpu.shape
b_shape = b_gpu.shape
if len(a_shape) == 1:
a_shape = (1, a_shape[0])
if len(b_shape) == 1:
b_shape = (1, b_shape[0])
    # Select the CUBLAS function matching the input dtypes:
if (a_gpu.dtype == np.complex64 and b_gpu.dtype == np.complex64):
cublas_func = cublas.cublasCgeam
alpha = np.complex64(alpha)
beta = np.complex64(beta)
elif (a_gpu.dtype == np.float32 and b_gpu.dtype == np.float32):
cublas_func = cublas.cublasSgeam
alpha = np.float32(alpha)
beta = np.float32(beta)
elif (a_gpu.dtype == np.complex128 and b_gpu.dtype == np.complex128):
cublas_func = cublas.cublasZgeam
alpha = np.complex128(alpha)
beta = np.complex128(beta)
elif (a_gpu.dtype == np.float64 and b_gpu.dtype == np.float64):
cublas_func = cublas.cublasDgeam
alpha = np.float64(alpha)
beta = np.float64(beta)
else:
raise ValueError('unsupported combination of input types')
transa = transa.lower()
transb = transb.lower()
if a_gpu.flags.c_contiguous != b_gpu.flags.c_contiguous:
raise ValueError('unsupported combination of input order')
if a_gpu.flags.f_contiguous:
if transa in ['t', 'c']:
k, m = a_shape
elif transa in ['n']:
m, k = a_shape
else:
raise ValueError('invalid value for transa')
if transb in ['t', 'c']:
n, l = b_shape
elif transb in ['n']:
l, n = b_shape
else:
raise ValueError('invalid value for transb')
if m != l or k != n:
raise ValueError('objects are not aligned')
lda = max(1, a_shape[0])
ldb = max(1, b_shape[0])
ldc = max(1, m)
if c_gpu.shape != (m, n) or c_gpu.dtype != a_gpu.dtype:
raise ValueError('invalid value for c_gpu')
if a_gpu.flags.f_contiguous != c_gpu.flags.f_contiguous:
raise ValueError('invalid order for c_gpu')
cublas_func(handle, transa, transb, m, n, alpha, a_gpu.gpudata,
lda, beta, b_gpu.gpudata, ldb, c_gpu.gpudata, ldc)
else:
if transb in ['t', 'c']:
m, k = b_shape
elif transb in ['n']:
k, m = b_shape
else:
raise ValueError('invalid value for transb')
if transa in ['t', 'c']:
l, n = a_shape
elif transa in ['n']:
n, l = a_shape
else:
raise ValueError('invalid value for transa')
if m != l or k != n:
raise ValueError('objects are not aligned')
lda = max(1, b_shape[1])
ldb = max(1, a_shape[1])
ldc = max(1, m)
        # Note that the desired shape of the output matrix is the transpose
        # of what CUBLAS assumes, since the inputs here are row-major:
        if c_gpu.shape != (n, m) or c_gpu.dtype != a_gpu.dtype:
raise ValueError('invalid value for c_gpu')
if a_gpu.flags.f_contiguous != c_gpu.flags.f_contiguous:
raise ValueError('invalid order for c_gpu')
        # As in gemm above, row-major inputs are handled through their
        # column-major (transposed) views, so the operands and their scale
        # factors are swapped in the call:
        cublas_func(handle, transb, transa, m, n, beta, b_gpu.gpudata,
                    lda, alpha, a_gpu.gpudata, ldb, c_gpu.gpudata, ldc)
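# An illustrative usage sketch for geam (matrix addition) with row-major
# inputs, computing c = 2*a + b for same-shaped operands; assumes an
# initialised CUBLAS handle.
def _demo_geam():
    a_gpu = gpuarray.to_gpu(np.random.rand(4, 3).astype(np.float32))
    b_gpu = gpuarray.to_gpu(np.random.rand(4, 3).astype(np.float32))
    c_gpu = gpuarray.empty((4, 3), np.float32)
    geam(a_gpu, b_gpu, c_gpu, alpha=2, beta=1)
    assert np.allclose(c_gpu.get(), 2 * a_gpu.get() + b_gpu.get(), atol=1e-5)
    return c_gpu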
if __name__ == '__main__':
import pycuda.autoinit
import numpy as np
from scikits.cuda import linalg
linalg.init()
N = 100
a_arr = [np.asarray(np.random.rand(4, 2), np.float32) for _ in range(N)]
b_arr = [np.asarray(np.random.rand(2, 2), np.float32) for _ in range(N)]
c_arr = [np.asarray(np.random.rand(4, 2), np.float32) for _ in range(N)]
a_arr_gpu = [gpuarray.to_gpu(a_gpu) for a_gpu in a_arr]
b_arr_gpu = [gpuarray.to_gpu(b_gpu) for b_gpu in b_arr]
c_arr_gpu = [gpuarray.to_gpu(c_gpu) for c_gpu in c_arr]
a_ptr_gpu = gpuarray.to_gpu(np.asarray([int(a_gpu.gpudata) for a_gpu in a_arr_gpu]))
b_ptr_gpu = gpuarray.to_gpu(np.asarray([int(b_gpu.gpudata) for b_gpu in b_arr_gpu]))
c_ptr_gpu = gpuarray.to_gpu(np.asarray([int(c_gpu.gpudata) for c_gpu in c_arr_gpu]))
dot_batch_nocheck(a_arr_gpu, b_arr_gpu, c_arr_gpu, a_ptr_gpu, b_ptr_gpu, c_ptr_gpu)
success = True
for i in range(N):
success &= np.allclose(np.dot(a_arr[i], b_arr[i]) + c_arr[i], c_arr_gpu[i].get())
print "Test Successful:\t{}".format(success)