Commit bbde6a0a authored by cloirec's avatar cloirec
Browse files

ajout exemple hello mpi,openmp,cuda pour skl224

parent 4c4ad981
############################
# begin deprecated interface
#
# Translate the legacy per-arch flags (x86_64=1, ARMv7=1, aarch64=1,
# ppc64le=1) and the legacy GCC variable into the current TARGET_ARCH /
# HOST_COMPILER interface, warning the user; the removed "abi" variable
# is a hard error.
############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
# Detect the host architecture, default TARGET_ARCH to it, derive the word
# size (TARGET_SIZE) and validate against the supported architecture list.
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
# Cross build: word size follows the target architecture.
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
# Native build: take the word size from the running userspace.
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# Only a fixed set of host->target cross-compilation pairs is supported.
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
# Lower-cased kernel name; TARGET_OS defaults to the host OS.
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
# Pick the host-side C++ compiler: clang++ on recent darwin (Xcode >= 5),
# a cross toolchain when TARGET_ARCH differs from HOST_ARCH, plain g++
# otherwise.  All CUDA compilation is then driven through nvcc -ccbin.
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
# QNX builds require the QNX host/target toolchain locations in the env.
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
# Final fallback, then the nvcc driver with the chosen host compiler.
HOST_COMPILER ?= g++
NVCC := nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
# NOTE(review): -fopenmp is the GCC spelling (forwarded to nvcc's host
# compiler via -Xcompiler below) while -qopenmp is the Intel spelling used
# at link time through mpicxx.  This mix only works when mpicxx wraps a
# compiler that accepts -qopenmp (e.g. Intel icpc, as in the accompanying
# job script which loads intel/18.1) - TODO confirm on other MPI stacks.
CCFLAGS := -fopenmp
LDFLAGS := -qopenmp
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
# Cross-compilation: when a target filesystem (TARGET_FS) is provided, point
# the compiler and linker at it as a sysroot.
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
# Old GCC (<= 4.6) needs --sysroot at compile time as well.
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu
endif
endif
# Install directory of different arch
# Map the target arch/OS pair to the CUDA toolkit's per-target install
# subdirectory (used by cross builds; empty for a native x86_64 build).
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
# dbg=1 turns on device-side (-G) and host-side (-g) debug info.
ifeq ($(dbg),1)
NVCCFLAGS += -G
CCFLAGS += -g
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
# Assemble the nvcc compile line; host-compiler flags are forwarded with
# -Xcompiler so nvcc passes them through untouched.
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
# This sample is not supported on QNX
ifeq ($(TARGET_OS),qnx)
$(info >>> WARNING - hello_mpi_omp_cuda is not supported on QNX - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
# Link flags are passed verbatim; the -Xlinker variants are kept commented
# out, presumably because the final link goes through mpicxx rather than
# nvcc - TODO confirm.
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(LDFLAGS)
#ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
#ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Separate flag sets for the mpicxx-compiled translation unit and link step.
MPI_CCFLAGS :=
MPI_CCFLAGS += $(CCFLAGS)
MPI_CCFLAGS += $(EXTRA_CCFLAGS)
MPI_LDFLAGS :=
MPI_LDFLAGS += $(LDFLAGS)
#MPI_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
#MPI_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I./common/inc
LIBRARIES :=
################################################################################
# MPI check and binaries
# Locate mpicxx on PATH.  MPI samples cannot be cross-compiled: a TARGET_ARCH
# that differs from HOST_ARCH waives the sample and turns the build into a
# dry run (MPICXX is set to a placeholder so the echoed commands look sane).
MPICXX ?= $(shell which mpicxx 2>/dev/null)
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
$(info -----------------------------------------------------------------------------------------------)
$(info WARNING - Cross Compilation not supported for MPI Samples.)
$(info -----------------------------------------------------------------------------------------------)
$(info Waiving the build )
$(info This will be a dry-run of the Makefile.)
$(info For more information on how to set up your environment to build and run this )
$(info sample, please refer the CUDA Samples documentation and release notes)
$(info -----------------------------------------------------------------------------------------------)
MPICXX=mpicxx
SAMPLE_ENABLED := 0
endif
# MPI compiler availability check.  If no mpicxx was found the sample is
# waived and the rest of the Makefile dry-runs with a placeholder compiler.
# (Fix: the waiver message previously named a non-existent sample
# "heel_superhybrid"; it now names this sample, matching the QNX waiver
# message and the build target.)
ifeq ($(MPICXX),)
$(info -----------------------------------------------------------------------------------------------)
$(info WARNING - No MPI compiler found.)
$(info -----------------------------------------------------------------------------------------------)
$(info CUDA Sample "hello_mpi_omp_cuda" cannot be built without an MPI Compiler.)
$(info This will be a dry-run of the Makefile.)
$(info For more information on how to set up your environment to build and run this )
$(info sample, please refer the CUDA Samples documentation and release notes)
$(info -----------------------------------------------------------------------------------------------)
MPICXX=mpicxx
SAMPLE_ENABLED := 0
else
# When mpicxx does not wrap gcc, force libstdc++ so the MPI objects use the
# same C++ runtime as the CUDA objects.  NOTE(review): upstream CUDA samples
# apply this only on darwin; here it is applied everywhere *except* darwin -
# TODO confirm the inversion is intentional for this cluster.
MPI_GCC := $(shell $(MPICXX) -v 2>&1 | grep gcc | wc -l | tr -d ' ')
ifeq ($(MPI_GCC),0)
ifneq ($(TARGET_OS),darwin)
MPI_CCFLAGS += -stdlib=libstdc++
MPI_LDFLAGS += -stdlib=libstdc++
endif
endif
endif
# Gencode arguments
# Default SM list spans Kepler (30) through Turing (75), matching CUDA 10.x.
SMS ?= 30 35 37 50 52 60 61 70 75
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
# Use the 64-bit CUDA library directory (lib64) for 64-bit non-darwin targets.
LIBSIZE :=
ifneq ($(TARGET_OS),darwin)
ifeq ($(TARGET_SIZE),64)
LIBSIZE := 64
endif
endif
LIBRARIES += -L$(CUDA_PATH)/lib$(LIBSIZE) -L$(CUDA_LIB) -lcudart
# When the sample is waived, prefix every recipe command with an echo so the
# whole build becomes a dry run.
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: hello_mpi_omp_cuda
# Report (at make time) whether the sample was waived by the checks above.
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
# The MPI translation unit is compiled with mpicxx, the CUDA one with nvcc,
# and the final link goes through mpicxx (hence MPI_LDFLAGS, not ALL_LDFLAGS).
hello_mpi_omp_cuda_mpi.o:hello_mpi_omp_cuda.cpp
$(EXEC) $(MPICXX) $(INCLUDES) $(MPI_CCFLAGS) -o $@ -c $<
hello_mpi_omp_cuda.o:hello_mpi_omp_cuda.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
# Link, then install a copy into the shared samples bin tree.
hello_mpi_omp_cuda: hello_mpi_omp_cuda_mpi.o hello_mpi_omp_cuda.o
$(EXEC) $(MPICXX) $(MPI_LDFLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./hello_mpi_omp_cuda
clean:
rm -f hello_mpi_omp_cuda hello_mpi_omp_cuda_mpi.o hello_mpi_omp_cuda.o
rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/hello_mpi_omp_cuda
clobber: clean
// MPI include
#include <mpi.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/unistd.h>
#include <iostream>
using std::cout;
using std::cerr;
using std::endl;
#include "hello_superhybrid.h"
// Hybrid MPI + OpenMP + CUDA "hello world": each MPI rank reports the host,
// the CPU it runs on and the GPU it is associated with, then each of its
// OpenMP threads reports its own CPU.  Output is serialized rank-by-rank
// (MPI barriers) and thread-by-thread (shared ticket counter) to keep the
// log readable.
int main(int argc, char *argv[])
{
int mpiRank, mpiSize;
char hostname[128];
int nthreads, tid, cpuid;
// j is the shared "ticket" counter used below to print threads in tid order.
int i, j=0;
MPI_Init(&argc,&argv);
MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank);
MPI_Comm_size(MPI_COMM_WORLD, &mpiSize);
gethostname(hostname, sizeof hostname);
char DGPU[100];
int nb_GPU;
nb_GPU=hello_countGPU();
// Grab the OpenMP team size.  Every thread stores the same value, so the
// unsynchronized writes are benign in practice (still a data race formally).
#pragma omp parallel
{
nthreads = omp_get_num_threads();
}
if ( mpiRank== 0)
printf("Run executed using %d MPI processes, with %d threads per process \n", mpiSize, nthreads);
// Single-GPU node: every rank reports device 0.  The loop + collective
// barrier + (i == mpiRank) guard lets exactly one rank print per iteration.
if (nb_GPU == 1)
{
for(i = 0; i < mpiSize; i++)
{
MPI_Barrier(MPI_COMM_WORLD);
if (i == mpiRank)
{
hello_displayGPU(0, DGPU, sizeof(DGPU));
printf("%s: MPI n° %d -> cpuid %d -> on GPU n°%s\n",hostname, mpiRank,sched_getcpu(),DGPU);
// Threads print in tid order: each busy-waits on the shared counter j
// until its turn, prints, then increments j.  The flush directives make
// j's updates visible across threads (spin-wait; fine for a hello demo,
// though the unprotected read of j is formally racy).
#pragma omp parallel private(tid, nthreads, cpuid) shared(i)
{
tid=omp_get_thread_num();
nthreads = omp_get_num_threads();
cpuid = sched_getcpu();
while(j < tid){
#pragma omp flush(j)
}
printf("\t thread n° %d -> cpuid %d on MPI n° %d on %s\n", tid, cpuid, mpiRank,hostname);
j++;
#pragma omp flush(j)
}
}
}
}
// Dual-GPU node: the first half of the ranks report device 0 and the second
// half device 1.  All ranks execute both loops, so every rank performs the
// same total number of MPI_Barrier calls and the collective cannot deadlock.
else if (nb_GPU == 2)
{
for(i = 0; i < mpiSize/2; i++)
{
MPI_Barrier(MPI_COMM_WORLD);
if (i == mpiRank)
{
hello_displayGPU(0, DGPU, sizeof(DGPU));
printf("%s: MPI n° %d -> cpuid %d -> on GPU n°%s\n",hostname, mpiRank,sched_getcpu(),DGPU);
// Same tid-ordered printing scheme as above; each rank enters at most
// one of these parallel regions, so reusing j across loops is safe.
#pragma omp parallel private(tid, nthreads, cpuid) shared(i)
{
tid=omp_get_thread_num();
nthreads = omp_get_num_threads();
cpuid = sched_getcpu();
while(j < tid){
#pragma omp flush(j)
}
printf("\t thread n° %d -> cpuid %d on MPI n° %d on %s\n", tid, cpuid, mpiRank,hostname);
j++;
#pragma omp flush(j)
}
}
}
for(i = mpiSize/2; i < mpiSize; i++)
{
MPI_Barrier(MPI_COMM_WORLD);
if (i == mpiRank)
{
hello_displayGPU(1, DGPU, sizeof(DGPU));
printf("%s: MPI n° %d -> cpuid %d -> on GPU n°%s\n",hostname, mpiRank,sched_getcpu(),DGPU);
#pragma omp parallel private(tid, nthreads, cpuid) shared(i)
{
tid=omp_get_thread_num();
nthreads = omp_get_num_threads();
cpuid = sched_getcpu();
while(j < tid){
#pragma omp flush(j)
}
printf("\t thread n° %d -> cpuid %d on MPI n° %d on %s\n", tid, cpuid, mpiRank,hostname);
j++;
#pragma omp flush(j)
}
}
}
}
// NOTE(review): nodes with 0 or more than 2 GPUs print nothing per-rank;
// presumably only the 1- and 2-GPU skl224 nodes are targeted - confirm.
MPI_Finalize();
}
#include <iostream>
using std::cerr;
using std::endl;
#include "hello_superhybrid.h"
// CUDA runtime
#include <cuda_runtime.h>
// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
#include <timer.h>
// Error handling macro
// Error handling macro.
// Fixes two defects in the original: (1) it reported cudaGetLastError()
// instead of the status actually returned by `call`, which can mask the real
// error (and clears the sticky error as a side effect); (2) the bare
// if-statement form was unsafe inside an if/else (dangling-else) and after a
// plain `if (x) CUDA_CHECK(y);`.  The do/while(0) wrapper makes the macro a
// single statement, and `call` is evaluated exactly once.
#define CUDA_CHECK(call) \
do { \
cudaError_t err = (call); \
if (err != cudaSuccess) { \
cerr << "CUDA error calling \""#call"\", code is " << err << endl; \
my_abort(err); \
} \
} while (0)
// Return the number of CUDA-capable devices visible to this process.
// On query failure the error is printed and the whole program exits.
int hello_countGPU()
{
    int count = 0;
    const cudaError_t status = cudaGetDeviceCount(&count);
    if (status == cudaSuccess)
    {
        return count;
    }
    printf("cudaGetDeviceCount returned %d\n-> %s\n",
           static_cast<int>(status), cudaGetErrorString(status));
    printf("Result = FAIL\n");
    exit(EXIT_FAILURE);
}
// Write a short human-readable label for CUDA device `gpuid` into the
// caller-supplied buffer DGPU of `len` bytes.  Always returns 0.
// Fixes: the original used strcpy (ignoring `len`) and wrote NOTHING when
// pciBusID was neither 27 nor 56, leaving DGPU uninitialized — the caller
// then printed garbage.  The two hard-coded bus ids match the skl224 nodes
// this example targets; any other device now gets a generic label built
// from its index and actual bus id.
int hello_displayGPU(int gpuid, char* DGPU, size_t len)
{
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, gpuid);
if (deviceProp.pciBusID == 27)
{
// snprintf honours `len`; the label is unchanged for the original
// 100-byte buffers.
snprintf(DGPU, len, " 0, pciBusID 27");
}
else if (deviceProp.pciBusID == 56)
{
snprintf(DGPU, len, " 1, pciBusID 56");
}
else
{
// Fallback for unrecognized hardware: report index and real bus id.
snprintf(DGPU, len, " %d, pciBusID %d", gpuid, deviceProp.pciBusID);
}
return 0;
}
// C-linkage declarations for the CUDA helpers defined in the .cu translation
// unit and called from the MPI/OpenMP one.
extern "C" {
// Returns the number of CUDA devices; exits the program on query failure.
int hello_countGPU();
// Writes a label for device `gpuid` into DGPU (a buffer of `len` bytes).
int hello_displayGPU(int gpuid, char* DGPU, size_t len);
}
#!/bin/bash
# SLURM job script for the hybrid MPI + OpenMP + CUDA hello-world on a
# skl224 node: 1 node, 8 MPI ranks x 7 OpenMP threads, 2 GPUs.
#SBATCH -J hello_world
#SBATCH -C SKL224
#SBATCH -t 00:30:00
#SBATCH --nodes=1
#SBATCH --ntasks=8
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=7
#SBATCH --threads-per-core=1
#SBATCH --gres=gpu:2
#SBATCH --mem=100GB

# Start from a clean module environment, then load the toolchain the
# Makefile expects (Intel compiler + Intel MPI, gcc as nvcc host, CUDA).
module purge
module load intel/18.1 intelmpi/2018.1.163 gcc/6.2.0 cuda/10.1

# One OpenMP thread per allocated core.  (The original exported
# OMP_NUM_THREADS twice; the duplicate has been removed.)
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
# Use one hardware thread per core and pin threads compactly.
export KMP_HW_SUBSET=1T
export KMP_AFFINITY=compact,1,0,granularity=fine

srun -n $SLURM_NTASKS ./hello_mpi_omp_cuda

# Alternative interactive launches for exercising individual GPUs:
#CUDA_VISIBLE_DEVICES=0 mpirun -np 2 ./hello_mpi_omp_cuda
#CUDA_VISIBLE_DEVICES=1 mpirun -np 2 ./hello_mpi_omp_cuda
#CUDA_VISIBLE_DEVICES=0,1 mpirun -np 2 ./hello_mpi_omp_cuda
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment