Skip to content

Commit

Permalink
Merge pull request #88 from ecmwf-ifs/nams_stack_omp
Browse files Browse the repository at this point in the history
new variants OpenACC and OpenMP
  • Loading branch information
reuterbal authored Nov 5, 2024
2 parents 3d5c82a + e43bbe6 commit 1ae7d16
Show file tree
Hide file tree
Showing 12 changed files with 12,167 additions and 5 deletions.
4 changes: 4 additions & 0 deletions bundle.yml
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,12 @@ options :
FIELD_API_ENABLE_ACC=ON
ENABLE_CLOUDSC_GPU_SCC=ON
ENABLE_CLOUDSC_GPU_SCC_HOIST=ON
ENABLE_CLOUDSC_GPU_SCC_STACK=ON
ENABLE_CLOUDSC_GPU_SCC_K_CACHING=ON
ENABLE_CLOUDSC_GPU_OMP_SCC=ON
ENABLE_CLOUDSC_GPU_OMP_SCC_HOIST=ON
ENABLE_CLOUDSC_GPU_OMP_SCC_STACK=ON
ENABLE_CLOUDSC_GPU_OMP_SCC_K_CACHING=ON
- with-cuda :
help : Enable GPU kernel variants based on CUDA and CUDA-Fortran
Expand Down
158 changes: 157 additions & 1 deletion src/cloudsc_gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ ecbuild_add_option( FEATURE CLOUDSC_GPU_SCC_CUF_K_CACHING
CONDITION HAVE_CUDA AND ( Serialbox_FOUND OR HDF5_FOUND )
)

# Feature switch for the OpenACC SCC variant that uses a pool allocator for
# temporary arrays. Off by default; it can only be enabled when Serialbox or
# HDF5 was found. The build logic for this variant is guarded by
# HAVE_CLOUDSC_GPU_SCC_STACK.
ecbuild_add_option(
    FEATURE     CLOUDSC_GPU_SCC_STACK
    DESCRIPTION "Build optimized GPU version of CLOUDSC using SCC with pool allocator for temporary arrays and OpenACC"
    DEFAULT     OFF
    CONDITION   Serialbox_FOUND OR HDF5_FOUND
)

ecbuild_add_option( FEATURE CLOUDSC_GPU_SCC_HOIST
DESCRIPTION "Build optimized GPU version of CLOUDSC using SCC with hoisted temporary arrays and OpenACC" DEFAULT OFF
CONDITION Serialbox_FOUND OR HDF5_FOUND
Expand All @@ -43,11 +48,26 @@ ecbuild_add_option( FEATURE CLOUDSC_GPU_SCC_K_CACHING
CONDITION Serialbox_FOUND OR HDF5_FOUND
)

# Feature switch for the OpenMP SCC-layout variant. Off by default; it can
# only be enabled when Serialbox or HDF5 was found.
ecbuild_add_option(
    FEATURE     CLOUDSC_GPU_OMP_SCC
    DESCRIPTION "Build optimized GPU version of CLOUDSC using SCC layout and OpenMP"
    DEFAULT     OFF
    CONDITION   Serialbox_FOUND OR HDF5_FOUND
)

# Feature switch for the OpenMP SCC variant that uses a pool allocator for
# temporary arrays. Off by default; it can only be enabled when Serialbox or
# HDF5 was found.
ecbuild_add_option(
    FEATURE     CLOUDSC_GPU_OMP_SCC_STACK
    DESCRIPTION "Build optimized GPU version of CLOUDSC using SCC with pool allocator for temporary arrays and OpenMP"
    DEFAULT     OFF
    CONDITION   Serialbox_FOUND OR HDF5_FOUND
)

# Feature switch for the OpenMP-offload SCC variant with hoisted temporary
# arrays. Off by default; it can only be enabled when Serialbox or HDF5 was
# found.
ecbuild_add_option(
    FEATURE     CLOUDSC_GPU_OMP_SCC_HOIST
    DESCRIPTION "Build optimized GPU version of CLOUDSC using SCC with hoisted temporary arrays and OpenMP offload"
    DEFAULT     OFF
    CONDITION   Serialbox_FOUND OR HDF5_FOUND
)

# Feature switch for the OpenMP SCC k-caching variant (described as a further
# optimized version). Off by default; it can only be enabled when Serialbox or
# HDF5 was found.
ecbuild_add_option(
    FEATURE     CLOUDSC_GPU_OMP_SCC_K_CACHING
    DESCRIPTION "Build (further) optimized GPU version of CLOUDSC using SCC layout with OpenMP"
    DEFAULT     OFF
    CONDITION   Serialbox_FOUND OR HDF5_FOUND
)

ecbuild_add_option( FEATURE CLOUDSC_GPU_SCC_FIELD
DESCRIPTION "Build optimized GPU version of CLOUDSC using SCC with FIELD API" DEFAULT ON
CONDITION HAVE_FIELD_API AND field_api_HAVE_ACC AND ( Serialbox_FOUND OR HDF5_FOUND )
Expand Down Expand Up @@ -99,6 +119,33 @@ if( HAVE_CLOUDSC_GPU_SCC )
)
endif()

# OpenACC SCC variant with pool-allocated temporaries: builds the
# dwarf-cloudsc-gpu-scc-stack executable and registers one test that runs it.
if( HAVE_CLOUDSC_GPU_SCC_STACK )

    ecbuild_add_executable(
        TARGET dwarf-cloudsc-gpu-scc-stack
        SOURCES
            dwarf_cloudsc_gpu.F90
            cloudsc_driver_gpu_scc_stack_mod.F90
            cloudsc_gpu_scc_stack_mod.F90
        DEFINITIONS
            ${CLOUDSC_DEFINITIONS}
            CLOUDSC_GPU_SCC_STACK
        LIBS
            cloudsc-common-lib
    )

    # Turn on Cray-pointer support for the compilers that need an explicit
    # flag; other compilers get no extra option.
    # NOTE(review): presumably required by the pool allocator in the
    # *_stack_mod sources -- confirm against the Fortran code.
    if( CMAKE_Fortran_COMPILER_ID MATCHES "GNU" )
        target_compile_options( dwarf-cloudsc-gpu-scc-stack PRIVATE "-fcray-pointer" )
    elseif( CMAKE_Fortran_COMPILER_ID MATCHES "NVHPC|PGI" )
        target_compile_options( dwarf-cloudsc-gpu-scc-stack PRIVATE "-Mcray=pointer" )
    endif()

    # The command path is relative, so the test is launched from three
    # directory levels above this directory's binary dir.
    ecbuild_add_test(
        TARGET            dwarf-cloudsc-gpu-scc-stack-serial
        COMMAND           bin/dwarf-cloudsc-gpu-scc-stack
        ARGS              1 1000 128
        OMP               1
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/../../..
    )

endif()

if( HAVE_CLOUDSC_GPU_SCC_HOIST )
ecbuild_add_executable(
Expand Down Expand Up @@ -130,7 +177,7 @@ if( HAVE_CLOUDSC_GPU_SCC_K_CACHING )
cloudsc_gpu_scc_k_caching_mod.F90
LIBS
cloudsc-common-lib
DEFINITIONS ${CLOUDSC_DEFINITIONS} CLOUDSC_GPU_SCC_K_CACHING
DEFINITIONS ${CLOUDSC_DEFINITIONS} CLOUDSC_GPU_SCC_K_CACHING
)

ecbuild_add_test(
Expand All @@ -142,7 +189,83 @@ if( HAVE_CLOUDSC_GPU_SCC_K_CACHING )
)
endif()


# OpenMP SCC variant: builds the dwarf-cloudsc-gpu-omp-scc executable and
# registers one test that runs it.
if( HAVE_CLOUDSC_GPU_OMP_SCC )

    # Collect the preprocessor definitions for this variant. The bind(...)
    # capabilities are only forwarded when the `loop` construct itself is
    # available.
    list( APPEND CLOUDSC_GPU_OMP_SCC_DEFINITIONS CLOUDSC_GPU_OMP_SCC )
    if( HAVE_OMP_TARGET_LOOP_CONSTRUCT )
        list( APPEND CLOUDSC_GPU_OMP_SCC_DEFINITIONS HAVE_OMP_TARGET_LOOP_CONSTRUCT )
    endif()
    if( HAVE_OMP_TARGET_LOOP_CONSTRUCT AND HAVE_OMP_TARGET_LOOP_CONSTRUCT_BIND_PARALLEL )
        list( APPEND CLOUDSC_GPU_OMP_SCC_DEFINITIONS HAVE_OMP_TARGET_LOOP_CONSTRUCT_BIND_PARALLEL )
    endif()
    if( HAVE_OMP_TARGET_LOOP_CONSTRUCT AND HAVE_OMP_TARGET_LOOP_CONSTRUCT_BIND_THREAD )
        list( APPEND CLOUDSC_GPU_OMP_SCC_DEFINITIONS HAVE_OMP_TARGET_LOOP_CONSTRUCT_BIND_THREAD )
    endif()

    ecbuild_add_executable(
        TARGET dwarf-cloudsc-gpu-omp-scc
        SOURCES
            dwarf_cloudsc_gpu.F90
            cloudsc_driver_gpu_omp_scc_mod.F90
            cloudsc_gpu_omp_scc_mod.F90
        DEFINITIONS
            ${CLOUDSC_DEFINITIONS}
            ${CLOUDSC_GPU_OMP_SCC_DEFINITIONS}
        LIBS
            cloudsc-common-lib
    )

    # The command path is relative, so the test is launched from three
    # directory levels above this directory's binary dir.
    # NOTE(review): NVCOMPILER_ACC_CUDA_HEAPSIZE presumably enlarges the
    # device heap under the NVIDIA runtime -- confirm it is still required.
    ecbuild_add_test(
        TARGET            dwarf-cloudsc-gpu-omp-scc-serial
        COMMAND           bin/dwarf-cloudsc-gpu-omp-scc
        ARGS              1 1000 128
        OMP               1
        ENVIRONMENT       "NVCOMPILER_ACC_CUDA_HEAPSIZE=8G"
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/../../..
    )

endif()


# OpenMP SCC variant with pool-allocated temporaries: builds the
# dwarf-cloudsc-gpu-omp-scc-stack executable and registers one test that
# runs it.
if( HAVE_CLOUDSC_GPU_OMP_SCC_STACK )

    # Collect the preprocessor definitions for this variant. The bind(...)
    # capabilities are only forwarded when the `loop` construct itself is
    # available.
    list( APPEND CLOUDSC_GPU_OMP_SCC_STACK_DEFINITIONS CLOUDSC_GPU_OMP_SCC_STACK )
    if( HAVE_OMP_TARGET_LOOP_CONSTRUCT )
        list( APPEND CLOUDSC_GPU_OMP_SCC_STACK_DEFINITIONS HAVE_OMP_TARGET_LOOP_CONSTRUCT )
    endif()
    if( HAVE_OMP_TARGET_LOOP_CONSTRUCT AND HAVE_OMP_TARGET_LOOP_CONSTRUCT_BIND_PARALLEL )
        list( APPEND CLOUDSC_GPU_OMP_SCC_STACK_DEFINITIONS HAVE_OMP_TARGET_LOOP_CONSTRUCT_BIND_PARALLEL )
    endif()
    if( HAVE_OMP_TARGET_LOOP_CONSTRUCT AND HAVE_OMP_TARGET_LOOP_CONSTRUCT_BIND_THREAD )
        list( APPEND CLOUDSC_GPU_OMP_SCC_STACK_DEFINITIONS HAVE_OMP_TARGET_LOOP_CONSTRUCT_BIND_THREAD )
    endif()

    ecbuild_add_executable(
        TARGET dwarf-cloudsc-gpu-omp-scc-stack
        SOURCES
            dwarf_cloudsc_gpu.F90
            cloudsc_driver_gpu_omp_scc_stack_mod.F90
            cloudsc_gpu_omp_scc_stack_mod.F90
        DEFINITIONS
            ${CLOUDSC_DEFINITIONS}
            ${CLOUDSC_GPU_OMP_SCC_STACK_DEFINITIONS}
        LIBS
            cloudsc-common-lib
    )

    # Turn on Cray-pointer support for the compilers that need an explicit
    # flag; other compilers get no extra option.
    # NOTE(review): presumably required by the pool allocator in the
    # *_stack_mod sources -- confirm against the Fortran code.
    if( CMAKE_Fortran_COMPILER_ID MATCHES "GNU" )
        target_compile_options( dwarf-cloudsc-gpu-omp-scc-stack PRIVATE "-fcray-pointer" )
    elseif( CMAKE_Fortran_COMPILER_ID MATCHES "NVHPC|PGI" )
        target_compile_options( dwarf-cloudsc-gpu-omp-scc-stack PRIVATE "-Mcray=pointer" )
    endif()

    # The command path is relative, so the test is launched from three
    # directory levels above this directory's binary dir.
    ecbuild_add_test(
        TARGET            dwarf-cloudsc-gpu-omp-scc-stack-serial
        COMMAND           bin/dwarf-cloudsc-gpu-omp-scc-stack
        ARGS              1 1000 128
        OMP               1
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/../../..
    )

endif()

if( HAVE_CLOUDSC_GPU_OMP_SCC_HOIST )

list( APPEND CLOUDSC_GPU_OMP_SCC_HOIST_DEFINITIONS CLOUDSC_GPU_OMP_SCC_HOIST )
if( HAVE_OMP_TARGET_LOOP_CONSTRUCT )
list( APPEND CLOUDSC_GPU_OMP_SCC_HOIST_DEFINITIONS HAVE_OMP_TARGET_LOOP_CONSTRUCT )
Expand Down Expand Up @@ -176,8 +299,41 @@ if( HAVE_CLOUDSC_GPU_OMP_SCC_HOIST )
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/../../..
OMP 1
)

endif()

# OpenMP SCC k-caching variant: builds the dwarf-cloudsc-gpu-omp-scc-k-caching
# executable and registers one test that runs it.
if( HAVE_CLOUDSC_GPU_OMP_SCC_K_CACHING )

    # Collect the preprocessor definitions for this variant. The bind(...)
    # capabilities are only forwarded when the `loop` construct itself is
    # available.
    list( APPEND CLOUDSC_GPU_OMP_SCC_K_CACHING_DEFINITIONS CLOUDSC_GPU_OMP_SCC_K_CACHING )
    if( HAVE_OMP_TARGET_LOOP_CONSTRUCT )
        list( APPEND CLOUDSC_GPU_OMP_SCC_K_CACHING_DEFINITIONS HAVE_OMP_TARGET_LOOP_CONSTRUCT )
    endif()
    if( HAVE_OMP_TARGET_LOOP_CONSTRUCT AND HAVE_OMP_TARGET_LOOP_CONSTRUCT_BIND_PARALLEL )
        list( APPEND CLOUDSC_GPU_OMP_SCC_K_CACHING_DEFINITIONS HAVE_OMP_TARGET_LOOP_CONSTRUCT_BIND_PARALLEL )
    endif()
    if( HAVE_OMP_TARGET_LOOP_CONSTRUCT AND HAVE_OMP_TARGET_LOOP_CONSTRUCT_BIND_THREAD )
        list( APPEND CLOUDSC_GPU_OMP_SCC_K_CACHING_DEFINITIONS HAVE_OMP_TARGET_LOOP_CONSTRUCT_BIND_THREAD )
    endif()

    ecbuild_add_executable(
        TARGET dwarf-cloudsc-gpu-omp-scc-k-caching
        SOURCES
            dwarf_cloudsc_gpu.F90
            cloudsc_driver_gpu_omp_scc_k_caching_mod.F90
            cloudsc_gpu_omp_scc_k_caching_mod.F90
        DEFINITIONS
            ${CLOUDSC_DEFINITIONS}
            ${CLOUDSC_GPU_OMP_SCC_K_CACHING_DEFINITIONS}
        LIBS
            cloudsc-common-lib
    )

    # The command path is relative, so the test is launched from three
    # directory levels above this directory's binary dir.
    ecbuild_add_test(
        TARGET            dwarf-cloudsc-gpu-omp-scc-k-caching-serial
        COMMAND           bin/dwarf-cloudsc-gpu-omp-scc-k-caching
        ARGS              1 1000 128
        OMP               1
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/../../..
    )

endif()

if( HAVE_CLOUDSC_GPU_SCC_CUF )
# Compile CUDA fortran files with -cuda.
Expand Down
4 changes: 2 additions & 2 deletions src/cloudsc_gpu/cloudsc_driver_gpu_omp_scc_hoist_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,9 @@ SUBROUTINE CLOUDSC_DRIVER_GPU_SCC_HOIST( &
CALL TIMER%THREAD_START(TID)

#ifdef HAVE_OMP_TARGET_LOOP_CONSTRUCT
!$omp target teams loop bind(teams)
!$omp target teams loop bind(teams) thread_limit(nproma)
#else
!$omp target teams distribute
!$omp target teams distribute thread_limit(nproma)
#endif
DO JKGLO=1,NGPTOT,NPROMA
IBL=(JKGLO-1)/NPROMA+1
Expand Down
Loading

0 comments on commit 1ae7d16

Please sign in to comment.