CUDA Fortran

PGI CUDA Fortran and PGI Accelerator
for programming CUDA-enabled NVIDIA GPUs
Dmitry Mikushin
Plan
● PGI CUDA Fortran
  ● Simple program example
  ● CUDA Fortran programming model
  ● CUDA Fortran in-depth
  ● Compilation
● PGI Accelerator
PGI CUDA Fortran –
a natural Fortran language extension for
Compute Unified Device Architecture (CUDA)
Example: vecadd (cpu)
subroutine cpu_vecadd(A,B,C,N)
  real(4) :: A(N), B(N), C(N)
  integer :: N
  integer :: i
  do i = 1,N
    C(i) = A(i) + B(i)
  enddo
end subroutine
Example: vecadd (gpu:device)
module gpu_vecadd_module
  use cudafor
contains
  attributes(global) subroutine gpu_vecadd_kernel(A,B,C,N)
    real(4), device :: A(N), B(N), C(N)
    integer, value :: N
    integer :: i
    i = (blockidx%x-1)*32 + threadidx%x
    if( i <= N ) C(i) = A(i) + B(i)
  end subroutine
end module
Example: vecadd (gpu:host)
subroutine gpu_vecadd_host( A, B, C )
  use gpu_vecadd_module
  real(4), dimension(:) :: A, B, C
  real(4), device, allocatable :: Ad(:), Bd(:), Cd(:)
  integer :: N
  N = size( A, 1 )
  allocate( Ad(N), Bd(N), Cd(N) )
  Ad = A(1:N)
  Bd = B(1:N)
  call gpu_vecadd_kernel<<<(N + 31) / 32, 32>>>( Ad, Bd, Cd, N )
  C(1:N) = Cd
  deallocate( Ad, Bd, Cd )
end subroutine
Typical host program
● Select GPU (for multi-GPU systems; see the sketch after this list)
● Allocate device global memory
● Copy data to device global memory
● Launch device kernel
● Copy results back from device global memory
● Release previously allocated memory
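The device-selection step is not shown in the vecadd example. A minimal sketch using the cudafor runtime API (device 0 is chosen here purely for illustration):

use cudafor
integer :: istat, ndev
! Query the number of visible CUDA devices,
! then bind this host thread to one of them
istat = cudaGetDeviceCount( ndev )
istat = cudaSetDevice( 0 )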
Example: vecadd (gpu:host), annotated
subroutine gpu_vecadd_host( A, B, C )
  use gpu_vecadd_module
  real(4), dimension(:) :: A, B, C
  real(4), device, allocatable :: Ad(:), Bd(:), Cd(:)
  integer :: N
  N = size( A, 1 )
  ! Allocate device global memory
  allocate( Ad(N), Bd(N), Cd(N) )
  ! Copy data from host to device memory
  Ad = A(1:N)
  Bd = B(1:N)
  ! Launch GPU kernel on device
  call gpu_vecadd_kernel<<<(N + 31) / 32, 32>>>( Ad, Bd, Cd, N )
  ! Copy data back from device memory to host
  C(1:N) = Cd
  ! Release device memory
  deallocate( Ad, Bd, Cd )
end subroutine
CUDA kernel specifics in brief
● All threads execute the same kernel code
● Threads are grouped into blocks; blocks are grouped into a grid of blocks
● The grid of blocks can be 1D, 2D or 3D, with a maximum of 65535 blocks per dimension
● A thread block can be 1D, 2D or 3D, with at most 512 or 1024 threads in total (depending on compute capability)
● The built-in variable threadidx (%x,%y,%z) holds the thread index within its block; blockidx (%x,%y,%z) holds the block index within the grid (see the 2D sketch below)
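For instance, in a hypothetical 2D kernel (not part of the vecadd example) these built-ins combine into global array indices as follows:

attributes(global) subroutine gpu_set_zero( A, n, m )
  real(4), device :: A(n,m)
  integer, value :: n, m
  integer :: i, j
  ! Global (i,j) position of this thread within the whole compute grid
  i = (blockidx%x-1)*blockdim%x + threadidx%x
  j = (blockidx%y-1)*blockdim%y + threadidx%y
  ! Guard threads that fall outside the array bounds
  if( i <= n .and. j <= m ) A(i,j) = 0.0
end subroutine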
CUDA kernel specifics in brief
● The device and its driver are responsible for parallel thread execution
● Blocks of threads are scheduled for execution on GPU multiprocessors
Example: vecadd (gpu:device), annotated
module gpu_vecadd_module
  use cudafor
contains
  ! Attribute “global” means “function to execute on GPU”
  attributes(global) subroutine gpu_vecadd_kernel(A,B,C,N)
    ! Arrays A, B, C reside in GPU memory
    real(4), device :: A(N), B(N), C(N)
    ! Argument N is passed by value, not by address (the default)
    integer, value :: N
    integer :: i
    ! blockidx%x in range 1 .. (N + 31) / 32, threadidx%x in range 1 .. 32
    i = (blockidx%x-1)*32 + threadidx%x
    if( i <= N ) C(i) = A(i) + B(i)
  end subroutine
end module
CUDA Fortran components
● Host code
  ● Declare and allocate GPU memory buffers
  ● Pass data between host and device
  ● Pinned-memory buffers (see the sketch after this list)
  ● Launch device kernels
  ● CUDA Runtime API
● Device code
  ● Attributes
  ● Kernel procedures and device subprograms
  ● Block shared memory
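Pinned (page-locked) host buffers are not shown elsewhere in these slides; a minimal sketch using the “pinned” attribute, which speeds up host-device transfers:

real, pinned, allocatable :: h(:)
real, device, allocatable :: d(:)
allocate( h(n), d(n) )
h = 1.0
! A copy from a pinned host buffer avoids an extra staging copy
d = h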
Declare data in GPU memory
● Variables and arrays can be declared in GPU memory using the “device” attribute:
real, device, allocatable :: a(:)
! or, equivalently:
real, allocatable :: a(:)
attributes(device) :: a
● In host code:
  ● Allocatable or automatic variables can be declared in GPU memory
  ● Variables and arrays in GPU memory can be passed to kernel procedures or to other host-side functions
Declaring GPU data in modules
● GPU data in modules must have fixed size or be allocatable (a completed sketch follows this slide):
module mm
  real, device, allocatable :: a(:)
  real, device :: x, y(10)
  real, constant :: c1, c2(10)
  integer, device :: n
contains
  attributes(global) subroutine s( b )
  ...
● A variable or array declaration can also carry the “constant” attribute, denoting GPU constant memory
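A hypothetical completion of the truncated module above: a kernel in the same module can reference the module's device and constant data directly (the launch is assumed to cover exactly size(a) threads):

module mm
  real, device, allocatable :: a(:)
  real, constant :: c1
contains
  attributes(global) subroutine s( b )
    real, device :: b(*)
    integer :: i
    i = (blockidx%x-1)*blockdim%x + threadidx%x
    ! Module-level device data (a) and constant data (c1) are visible here
    b(i) = c1 * a(i)
  end subroutine
end module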
Allocating memory on GPU
● Operators allocate / deallocate:
real, device, allocatable :: a(:,:), b
allocate( a(1:n,1:m), b )
...
deallocate( a, b )
● Memory for variables and arrays with the “device” attribute is allocated on the GPU:
  ● GPU memory allocation is performed by the host program
  ● There is no virtual memory on the GPU, so allocation can fail with an “out of memory” error
  ● The optional argument STAT=ivar can be used for error checking (see the sketch after this list)
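A minimal sketch of error checking with STAT (variable names are illustrative):

real, device, allocatable :: a(:,:)
integer :: istat
allocate( a(1:n,1:m), stat=istat )
if( istat /= 0 ) then
  print *, 'GPU allocation failed: out of device memory?'
  stop
end if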
Copying data between host and GPU (Fortran-style)
● Copying with the generalized assignment operator:
real, device, allocatable :: a(:,:), b
allocate( a(1:n,1:m), b )
a(1:n,1:m) = x(1:n,1:m)   ! copies to device
b = 99.0
...
x(1:n,1:m) = a(1:n,1:m)   ! copies from device
y = b
deallocate( a, b )
● Copying non-contiguous array sections results in multiple separate transfers and can therefore be slower (see the sketch after this list)
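For example (an illustrative sketch): with Fortran's column-major layout, a column section is contiguous and copies in one transfer, while a row section is strided:

real, device, allocatable :: a(:,:)
real :: x(n,m)
allocate( a(n,m) )
a(1:n,1) = x(1:n,1)   ! contiguous column: a single transfer
a(1,1:m) = x(1,1:m)   ! strided row: may split into m separate transfers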
Copying data between host and GPU (CUDA API)
● The cudafor module defines functions similar to the CUDA API:
use cudafor
real, allocatable, device :: a(:)
real :: b(10), b2(2), c(10)
...
istat = cudaMalloc( a, 10 )
istat = cudaMemcpy( a, b, 10 )
istat = cudaMemcpy( a(2), b2, 2 )
istat = cudaMemcpy( c, a, 10 )
istat = cudaFree( a )
Launching GPU kernels
● A kernel procedure is launched with the compute grid configuration in triple angle brackets:
call gpu_vecadd_kernel <<< (N+31)/32, 32 >>> ( A, B, C, N )

type(dim3) :: g, b
g = dim3( (N+31)/32, 1, 1 )
b = dim3( 32, 1, 1 )
call gpu_vecadd_kernel <<< g, b >>> ( A, B, C, N )
● The kernel procedure must have an explicit interface or share a module with the caller
● Compute grid configuration: grid dimensions can be plain integers or values of type dim3
GPU kernels in CUDA Fortran
● Kernel procedures are identified with the “global” attribute:
attributes(global) subroutine kernel ( A, B, C, N )
● A kernel may define scalars and fixed-size arrays
● A kernel may define variables in shared memory, i.e. memory shared between threads of the same block (see the sketch after this slide):
real, shared :: sm(16,16)
● Kernel data may use the simple types below, or derived types (structures) based on them:
integer(1,2,4,8), logical(1,2,4,8), real(4,8), complex(4,8), character(len=1)
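A hypothetical kernel sketch using block shared memory (assuming n is a multiple of the 32-thread block size):

attributes(global) subroutine gpu_block_reverse( a, n )
  real, device :: a(n)
  integer, value :: n
  real, shared :: sm(32)   ! one tile per thread block
  integer :: base
  base = (blockidx%x-1)*32
  sm(threadidx%x) = a(base + threadidx%x)
  ! Wait until every thread of the block has filled its slot of sm
  call syncthreads()
  ! Write the tile back in reversed order within the block
  a(base + threadidx%x) = sm(32 - threadidx%x + 1)
end subroutine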
GPU kernels in CUDA Fortran
● Built-in variables: blockidx, threadidx, griddim, blockdim, warpsize
● The following Fortran language elements are supported in kernel code (a device-subprogram sketch follows this list):
  ● Assignment operator
  ● do, if, goto, case
  ● call (subprograms with the “device” attribute)
  ● intrinsics
  ● where, forall
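A minimal hypothetical sketch of a “device” subprogram called from a kernel; both live in the same module, per the visibility rules below:

module ops
  use cudafor
contains
  attributes(device) function fma_op( a, x, y ) result( r )
    real :: a, x, y, r
    r = a*x + y
  end function

  attributes(global) subroutine apply_fma( a, x, y, n )
    real, value :: a
    real, device :: x(n), y(n)
    integer, value :: n
    integer :: i
    i = (blockidx%x-1)*blockdim%x + threadidx%x
    if( i <= n ) y(i) = fma_op( a, x(i), y(i) )
  end subroutine
end module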
Visibility rules
● Kernel procedure (with “global” attribute) in a module:
  ● Can use data in GPU memory defined in the same module
  ● Can call “device” subprograms defined in the same module
● Subprogram with “device” attribute in a module:
  ● Can use data in GPU memory defined in the same module
  ● Can call “device” subprograms defined in the same module
  ● Implicitly defined as private (not visible outside the module)
Visibility rules
● Kernel procedure with “global” attribute, not in a module:
  ● Can only access GPU memory passed through its arguments
● Subprogram executed on the host:
  ● Can launch any kernel procedure
  ● Can access any data in modules
  ● Can also launch CUDA C kernels for which an explicit interface is available
CUDA C interop
● C → Fortran: declare an interface for the CUDA C kernel:
interface
  attributes(global) subroutine saxpy(a,x,y,n) bind(c)
    real, device :: x(*), y(*)
    real, value :: a
    integer, value :: n
  end subroutine
end interface
...
call saxpy<<<grid,block>>>( aa, xx, yy, nn )
● Fortran → C: declare the CUDA Fortran kernel prototype (note the trailing underscore from Fortran name mangling) and launch it with a grid configuration:
extern __global__ void saxpy_(
  float a, float* x, float* y, int n );
...
saxpy_<<<grid,block>>>( a, x, y, n );
Compilation
● Build objects with kernels in CUDA Fortran (a full build sketch follows this list):
pgfortran -Mcuda[=[emu|cc10|cc11|cc12|cc13|cc20]] a.cuf
● Suffix .cuf ~ CUDA Fortran (free form)
● Suffix .CUF ~ the same, plus preprocessing
● -Mfixed ~ fixed form (Fortran 77)
● Applications must also be linked with pgfortran -Mcuda
● Requires an installed CUDA Toolkit (the PGI installer can install one itself)
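A hypothetical two-step build of the vecadd example (file names are illustrative):

pgfortran -Mcuda=cc20 -c gpu_vecadd.cuf
pgfortran -Mcuda=cc20 main.f90 gpu_vecadd.o -o vecadd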
CUDA C vs CUDA Fortran

CUDA C                                      | CUDA Fortran
Textures, texture memory                    | Textures available since PGI 12.8
CUDA Runtime API support                    | CUDA Runtime API support (“use cudafor”)
Driver API support                          | Driver API unsupported
cudaMalloc, cudaFree                        | allocate, deallocate
cudaMemcpy                                  | Assignment operator
OpenGL and Direct3D support                 | OpenGL and Direct3D unsupported
Zero-based arrays and block/thread indexing | One-based arrays and block/thread indexing
CUDA-enabled math libraries                 | CUDA-enabled math libraries via modules, e.g. “use cublas” (sketch below)
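An illustrative sketch of the module-based library access mentioned in the last row, assuming the cublas module's legacy cublasSaxpy interface:

use cublas
real, device, allocatable :: x_d(:), y_d(:)
allocate( x_d(n), y_d(n) )
x_d = x
y_d = y
! Computes y_d = 2.0*x_d + y_d on the GPU via CUBLAS
call cublasSaxpy( n, 2.0, x_d, 1, y_d, 1 )
y = y_d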
Compact form (CUF kernel directives)
● A loop nest can be mapped onto the GPU compute grid with the !$cuf directive:
!$cuf kernel do[(n)] <<< grid, block >>>

!$cuf kernel do(2) <<< (*,*), (32,4) >>>
do j = 1, m
  do i = 1, n
    a(i,j) = b(i,j) + c(i,j)
  end do
end do
● The 2D loop nest is mapped onto a 2D compute grid of 32 x 4 thread blocks
● Grid dimensions (*,*) are computed at runtime by dividing the m x n loop dimensions by the block dimensions
Implement GPU kernel manually
or with auto-parallelizing compiler?
● Implementing GPU kernels manually:
  + Potentially high performance, if hand-tuned
  - Works only with CUDA, resulting in codebase fragmentation
Implement GPU kernel manually
or with auto-parallelizing compiler?
Compilers can generate GPU kernels from CPU source code implicitly, fully automatically, or guided by annotations/directives (see the sketch below)
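For example, with the PGI Accelerator directives covered in the next part, the original CPU loop can be annotated instead of rewritten (a minimal sketch):

!$acc region
do i = 1, n
  C(i) = A(i) + B(i)
end do
!$acc end region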
Implement GPU kernel manually
or with auto-parallelizing compiler?
● Automatic GPU kernel generation:
  + Good performance is still possible
  + Source code stays universal and fully compatible with the CPU version
  - Hard to control/tune