PGI CUDA Fortran and PGI Accelerator for programming CUDA-enabled NVIDIA GPUs

Dmitry Mikushin

Plan
● PGI CUDA Fortran
  ● Simple program example
  ● CUDA Fortran programming model
  ● CUDA Fortran in-depth
  ● Compilation
● PGI Accelerator

PGI CUDA Fortran – a natural Fortran language extension for the Compute Unified Device Architecture (CUDA)

Example: vecadd (cpu)

  subroutine cpu_vecadd(A,B,C,N)
    real(4) :: A(N), B(N), C(N)
    integer :: N
    integer :: i
    do i = 1,N
      C(i) = A(i) + B(i)
    enddo
  end subroutine

Example: vecadd (gpu:device)

  module gpu_vecadd_module
    use cudafor
  contains
    ! Attribute "global" marks a subroutine to be executed on the GPU
    attributes(global) subroutine gpu_vecadd_kernel(A,B,C,N)
      ! Arrays with the "device" attribute reside in GPU memory
      real(4), device :: A(N), B(N), C(N)
      ! "value" arguments are passed by value, not by address (the default)
      integer, value :: N
      integer :: i
      ! blockidx%x is in range 1 .. (N + 31) / 32, threadidx%x in range 1 .. 32
      i = (blockidx%x-1)*32 + threadidx%x
      if( i <= N ) C(i) = A(i) + B(i)
    end subroutine
  end module

Example: vecadd (gpu:host)

  subroutine gpu_vecadd_host( A, B, C )
    use gpu_vecadd_module
    real(4), dimension(:) :: A, B, C
    real(4), device, allocatable :: Ad(:), Bd(:), Cd(:)
    integer :: N
    N = size( A, 1 )
    ! Allocate device global memory
    allocate( Ad(N), Bd(N), Cd(N) )
    ! Copy data from host to device memory
    Ad = A(1:N)
    Bd = B(1:N)
    ! Launch the GPU kernel on the device
    call gpu_vecadd_kernel<<<(N + 31) / 32, 32>>>( Ad, Bd, Cd, N )
    ! Copy results back from device memory to the host
    C(1:N) = Cd
    ! Release device memory
    deallocate( Ad, Bd, Cd )
  end subroutine

Typical host program
● Select GPU (for multi-GPU systems; see the sketch below)
● Allocate device global memory
● Copy data to device global memory
● Launch device kernel
● Copy results back from device global memory
● Release previously allocated memory
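The vecadd host subroutine above covers every step except GPU selection. A minimal sketch of that step, using the cudaGetDeviceCount, cudaGetDeviceProperties and cudaSetDevice functions from the cudafor module (the program name and the choice of device 0 are illustrative):

  program select_gpu
    use cudafor
    implicit none
    integer :: istat, ndevices
    type(cudaDeviceProp) :: prop

    ! Query how many CUDA devices are visible to the process
    istat = cudaGetDeviceCount( ndevices )
    if( istat /= cudaSuccess .or. ndevices == 0 ) stop 'no CUDA device found'

    ! Inspect device 0 and make it current: all subsequent device
    ! allocations and kernel launches will target this device
    istat = cudaGetDeviceProperties( prop, 0 )
    print *, 'using device 0: ', trim(prop%name)
    istat = cudaSetDevice( 0 )
  end program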
CUDA kernel specifics in brief
● All threads execute the same kernel code
● Threads are grouped into blocks, and blocks are grouped into a grid of blocks
● A grid of blocks can be 1D, 2D or 3D, with a maximum dimension of 65535
● A block of threads can be 1D, 2D or 3D, with at most 512 or 1024 threads per block, depending on compute capability
● The builtin variable threadidx (%x, %y, %z) denotes the thread index within its block; blockidx (%x, %y, %z) denotes the block index within the grid of blocks

CUDA kernel specifics in brief
● The device and its driver are responsible for parallel thread execution
● Blocks of threads are scheduled for execution on GPU multiprocessors

CUDA Fortran components
● Host code
  ● Declare and allocate GPU memory buffers
  ● Pass data between host and device
  ● Pinned-memory buffers
  ● Launch device kernels
  ● CUDA Runtime API
● Device code
  ● Attributes
  ● Kernel procedures and device subprograms
  ● Block shared memory

Declare data in GPU memory
● Variables and arrays can be declared in GPU memory using the "device" attribute:

  real, device, allocatable :: a(:)

  real, allocatable :: a(:)
  attributes(device) :: a

● In host code:
  ● Allocatable or automatic variables can be declared in GPU memory
  ● Variables and arrays in GPU memory can be passed to kernel procedures or to other host-side functions

Declaring GPU data in modules
● GPU data in modules must have a fixed size or be allocatable:

  module mm
    real, device, allocatable :: a(:)
    real, device :: x, y(10)
    real, constant :: c1, c2(10)
    integer, device :: n
  contains
    attributes(global) subroutine s( b )
    ...

● A variable or array declaration can also carry the "constant" attribute, placing it in GPU constant memory

Allocating memory on GPU
● Memory for variables and arrays with the "device" attribute is allocated on the GPU with the allocate / deallocate statements:

  real, device, allocatable :: a(:,:), b
  allocate( a(1:n,1:m), b )
  ...
  deallocate( a, b )

● GPU memory allocation is performed by the host program
● There is no virtual memory on the GPU ⇒ an allocation can fail with an "out of memory" error
● The optional argument STAT=ivar can be used for error checking (see the sketch below)
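A minimal sketch of checked device allocation via STAT= (the array name and the handling of the failure are illustrative):

  real, device, allocatable :: a(:,:)
  integer :: istat
  allocate( a(1:n,1:m), stat=istat )
  if( istat /= 0 ) then
    ! Allocation failed, most likely because device memory is exhausted
    print *, 'device allocation failed, stat = ', istat
    stop
  end if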
Copying data between host and GPU (Fortran-style)
● Copying is expressed with the generalized assignment operator:

  real, device, allocatable :: a(:,:), b
  allocate( a(1:n,1:m), b )
  a(1:n,1:m) = x(1:n,1:m) ! copies to device
  b = 99.0
  ...
  x(1:n,1:m) = a(1:n,1:m) ! copies from device
  y = b
  deallocate( a, b )

● Copying a non-contiguous shape results in multiple separate copies and can therefore be slower

Copying data between host and GPU (CUDA API)
● The cudafor module defines functions mirroring the CUDA API:

  use cudafor
  real, allocatable, device :: a(:)
  real :: b(10), b2(2), c(10)
  ...
  istat = cudaMalloc( a, 10 )
  istat = cudaMemcpy( a, b, 10 )
  istat = cudaMemcpy( a(2), b2, 2 )
  istat = cudaMemcpy( c, a, 10 )
  istat = cudaFree( a )

Launching GPU kernels
● A kernel procedure is launched with the compute grid configuration in triple angle brackets:

  call gpu_vecadd_kernel <<< (N+31)/32, 32 >>> ( A, B, C, N )

  type(dim3) :: g, b
  g = dim3( (N+31)/32, 1, 1 )
  b = dim3( 32, 1, 1 )
  call gpu_vecadd_kernel <<< g, b >>> ( A, B, C, N )

● The kernel procedure must have an explicit interface or share a module with the caller
● Compute grid configuration: the grid dimensions can be plain integers or values of type dim3

GPU kernels in CUDA Fortran
● Kernel procedures are identified by the "global" attribute:

  attributes(global) subroutine kernel ( A, B, C, N )

● A kernel may declare scalars and fixed-size arrays
● A kernel may declare variables in shared memory – memory shared between the threads of the same block (see the reduction sketch below):

  real, shared :: sm(16,16)

● Kernel data may use the simple data types below, or derived types (structures) built from them: integer(1,2,4,8), logical(1,2,4,8), real(4,8), complex(4,8), character(len=1)

GPU kernels in CUDA Fortran
● Builtin variables: blockidx, threadidx, griddim, blockdim, warpsize
● The following Fortran language elements are supported in kernel code:
  ● Assignment
  ● do, if, goto, case
  ● call (subprograms with the "device" attribute)
  ● intrinsics
  ● where, forall
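To illustrate shared memory, the builtin variables and syncthreads() working together, here is a hedged sketch of a block-level sum reduction; the kernel name, the fixed block size of 256, and the assumption that n is a multiple of 256 are all illustrative choices, not part of the slides:

  ! Each block sums 256 elements of x into partial(blockidx%x)
  attributes(global) subroutine block_sum_kernel( x, partial, n )
    real(4), device :: x(n), partial(*)
    integer, value :: n
    real(4), shared :: sm(256)   ! shared by the threads of one block
    integer :: tid, i, s
    tid = threadidx%x
    i = (blockidx%x-1)*256 + tid
    sm(tid) = x(i)
    call syncthreads()
    ! Tree reduction in shared memory
    s = 128
    do while( s >= 1 )
      if( tid <= s ) sm(tid) = sm(tid) + sm(tid+s)
      call syncthreads()
      s = s / 2
    end do
    if( tid == 1 ) partial(blockidx%x) = sm(1)
  end subroutine

A launch of the form call block_sum_kernel<<< n/256, 256 >>>( xd, pd, n ) then leaves one partial sum per block in pd, to be combined on the host or by a second kernel.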
Visibility rules
● A kernel procedure (with the "global" attribute) in a module:
  ● Can use data in GPU memory defined in the same module
  ● Can call "device" subprograms defined in the same module
● A subprogram with the "device" attribute in a module:
  ● Can use data in GPU memory defined in the same module
  ● Can call "device" subprograms defined in the same module
  ● Is implicitly private (not visible outside of the module)

Visibility rules
● A kernel procedure with the "global" attribute outside of a module:
  ● Can only access GPU memory passed through its arguments
● A subprogram executed on the host:
  ● Can launch any kernel procedure
  ● Can access any data in modules
  ● Can also launch CUDA C kernels for which an explicit interface is available

CUDA C interop
● C → Fortran: interface

  interface
    attributes(global) subroutine saxpy( a, x, y, n ) bind(c)
      real, device :: x(*), y(*)
      real, value :: a
      integer, value :: n
    end subroutine
  end interface
  ...
  call saxpy<<<grid,block>>>( aa, xx, yy, nn )

● Fortran → C: declaration

  extern __global__ void saxpy_( float a, float* x, float* y, int n );
  ...
  saxpy_<<<grid,block>>>( a, x, y, n );

Compilation
● Build objects with CUDA Fortran kernels:

  pgfortran -Mcuda[=[emu|cc10|cc11|cc12|cc13|cc20]] a.cuf

● Suffix .cuf ~ CUDA Fortran (free form)
● Suffix .CUF ~ the same, with the preprocessor applied
● -Mfixed ~ fixed form (Fortran 77)
● Applications must also be linked with pgfortran -Mcuda
● Requires an installed CUDA Toolkit (the PGI installer can install one itself)

CUDA C vs CUDA Fortran

  CUDA C                               | CUDA Fortran
  -------------------------------------+--------------------------------------------
  Textures, texture memory             | Textures available since PGI 12.8
  CUDA Runtime API support             | CUDA Runtime API support ("use cudafor")
  Driver API support                   | Driver API unsupported
  cudaMalloc, cudaFree                 | allocate, deallocate
  cudaMemcpy                           | Assignment operator
  OpenGL and Direct3D support          | OpenGL and Direct3D unsupported
  Zero-based arrays and                | One-based arrays and
  blocks/threads indexing              | blocks/threads indexing
  CUDA-enabled math libraries          | CUDA-enabled math libraries, accessible
                                       | through modules, e.g. "use cublas"

Compact form
● A loop nest can be mapped onto the GPU with the CUF kernel directive:

  !$cuf kernel do[(n)] <<< grid, block >>>

  !$cuf kernel do(2) <<< (*,*), (32,4) >>>
  do j = 1, m
    do i = 1, n
      a(i,j) = b(i,j) + c(i,j)
    end do
  end do

● The 2D loop nest is mapped onto 2D blocks of 32 x 4 threads
● The grid dimensions (*,*) are computed at runtime by dividing the m x n loop dimensions by the block dimensions

Implement GPU kernels manually or with an auto-parallelizing compiler?
● Implementing GPU kernels manually:
  + Potentially high performance, if hand-tuned
  − Works only with CUDA, resulting in codebase fragmentation

Implement GPU kernels manually or with an auto-parallelizing compiler?
● Compilers can generate GPU kernels from CPU source code implicitly, fully automatically, or guided by annotations/directives (see the PGI Accelerator sketch below)

Implement GPU kernels manually or with an auto-parallelizing compiler?
● Automatic GPU kernel generation:
  + Good performance is still possible
  + The source code stays universal and fully compatible with the CPU version
  − Hard to control/tune
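As a sketch of the directive-guided approach, here is the vecadd loop under the PGI Accelerator programming model; this is a minimal illustration (the subroutine name is made up), assuming compilation with pgfortran -ta=nvidia:

  ! The compiler generates the GPU kernel and the host/device data
  ! movement for the annotated region
  subroutine acc_vecadd( A, B, C, N )
    real(4) :: A(N), B(N), C(N)
    integer :: N
    integer :: i
  !$acc region
    do i = 1, N
      C(i) = A(i) + B(i)
    end do
  !$acc end region
  end subroutine

The same source still compiles and runs unchanged on the CPU when the directive is ignored, which is exactly the portability advantage listed above.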