PROGRAM MUMPS_BENCH !$ USE OMP_LIB IMPLICIT NONE INCLUDE 'mpif.h' INCLUDE 'dmumps_struc.h' C C C Purpose: C ======= C C Given MPI_PER_NODE, the number of MPI processes with the C constraint that the number of MPI processes per node is C constant, C Given K, a factor (smaller than MPI_PER_NODE) defining the C number of MPI processes per node that will belong to C the same subcommunicator, C C This code creates NB_COMM_MAX internode communicators, C such that NB_COMM_MAX = MPI_PER_NODE/K C C Once the new communicators (noted COMM) are defined we may run the C same instance of MUMPS on 1,2, ... NB_COMM_MAX communicators. Each C instance consists in factorizing and solving A x = b, where the C system matrix A results from an 11-point Laplacian discretization C on a 3D grid whose size is (NX, NY, NZ). C C Notes: C ===== C C * Each MPI process may have some threads. C * If K > 1 and MPI_PER_NODE is not divisible by K, some C MPI processes will not be used. C * To avoid 100% core usage or in case of tests on machines C with not enough physical cores, it may be worth to have C barriers and collectives waiting instead of polling C constantly, this can be done with: C export OMPI_MCA_mpi_yield_when_idle=1 (openmpi) C export I_MPI_WAIT_MODE=1 (intelmpi) C C Control parameters and file input format: C ======================================== C C The main control parameters are read from stdin on the C MPI process with rank 0 in MPI_COMM_WORLD. C Each input file defines the grid size, the factor K and the C number of tests to be performed. Each test beeing characterized C by a number of instances and if a check of the solution C need be performed. C C EXAMPLE OF INPUT FILE: C on 40nodes with 24 MPI per node C K=2 leads to a maximum of 12 instances (NB_COMM_MAX= 12) C C 80 80 500 # NX NY NZ C 2 # K C 8 # iteration count C 1 # NB_INSTANCES C 1 # CHECK (0=OFF) C 1 # NB_INSTANCES C 0 # CHECK (0=OFF) C 2 # NB_INSTANCES C 0 # CHECK (0=OFF) C 4 # NB_INSTANCES C 0 # CHECK (0=OFF) C 8 # NB_INSTANCES C 0 # CHECK (0=OFF) C MAX # NB_INSTANCES (MAX=NB_COMM_MAX) C 0 # CHECK (0=OFF) C MAX # NB_INSTANCES (MAX=NB_COMM_MAX) C 0 # CHECK (0=OFF) C MAX # NB_INSTANCES (MAX=NB_COMM_MAX) C 0 # CHECK (0=OFF) C C The objective of this test is to measure the C "TIME (seconds) FOR ITERATION" as a function of the C number of instances. C The num C In the output file lines with "BENCH:" are most interesting C for the benchers. C C C ERROR return: C ============ C - CONFIGURATION ERROR: C The number of MPI process per node is not constant C - ERROR during MUMPS ANALYSIS: C see INFOG(1:2) in Section "Error Diagnostics" of C MUMPS Users' guide C - ERROR DURING MUMPS FACTORIZATION: C see INFOG(1:2) in Section "Error Diagnostics" of C MUMPS Users' guide C - ERROR RESIDUAL (RINFOG(6)) TOO LARGE: C RINFOG(6) (see MUMPS Users' guide) should be smaller C than 1e-12. C - ERROR FLOPS_REFERENCE DIFFERENCE is NONZERO C The number of operation performed during C factorization should be constant. C C Grid size INTEGER :: NX, NY, NZ ! e.g. 80 80 500 C C K factor INTEGER :: K C C NB_ITERATIONS: the number of iterations, i.e., C the number of tests to be performd. C INTEGER :: NB_ITERATIONS C C NB_INSTANCES and CHECK will be read from the file C NB_INSTANCES(I): between 1 and MPI_PER_NODE / K, larger values C are treated as MPI_PER_NODE / K C CHECK(I) : = 0 or 1. It indicates whether bwd error should C be computed and checked (done if equal to 1) INTEGER, ALLOCATABLE, DIMENSION(:) :: NB_INSTANCES INTEGER, ALLOCATABLE, DIMENSION(:) :: CHECK C C C C ================================================================== C C C FLOPS_REFERENCE is the reference number of flops (for a run C where the bwd error was computed) DOUBLE PRECISION :: FLOPS_REFERENCE INTEGER :: I C C Each MPI process declares a MUMPS structure. C MUMPS structures will be combined to have one MUMPS C instance on each communicator. TYPE (DMUMPS_STRUC) :: id INTEGER, PARAMETER :: NRHS = 1 C Iteration-related INTEGER :: IT ! loop index INTEGER :: CHECK_IT, NB_COMM_IT C Timing: INTEGER :: t_start, t_end, t_rate C C OpenMP related !$ INTEGER NOMP C C MPI-related INTEGER :: THREAD_SUPPORT, IERR INTEGER :: NPROCS_WORLD, MYID_WORLD INTEGER :: NPROCS_COMM, MYID_COMM, NB_COMM_MAX, & MPI_PER_NODE, MPI_PER_COMM INTEGER :: COMM ! the new communicators INTEGER :: MY_COMM_COLOR ! set to process color C Each MPI must know the name of the node it is on. C For validation purposes, use -DSIMULATE_SEVERAL_NODES CHARACTER(len=80) :: TMP_STRING CHARACTER(len=MPI_MAX_PROCESSOR_NAME) :: MYNAME INTEGER :: MYNAME_LENGTH C C COLOR(rank in MPI_COMM_WORLD) => communicator number C C COLOR(i)==COLOR(j) if i and j are in the same communicator. C Process i is associated to communicator COLOR(i)=0...NB_COMM_MAX-1 C and will participate to the computations only if COLOR(i) <= NB_INSTANCES(IT) INTEGER, ALLOCATABLE, DIMENSION(:) :: COLOR C C C Only one thread in each process calls MPI: CALL MPI_INIT_THREAD(MPI_THREAD_FUNNELED, THREAD_SUPPORT, IERR) CALL MPI_COMM_SIZE( MPI_COMM_WORLD, NPROCS_WORLD, IERR ) CALL MPI_COMM_RANK( MPI_COMM_WORLD, MYID_WORLD, IERR ) IF ( THREAD_SUPPORT .EQ. MPI_THREAD_SINGLE ) THEN IF (MYID_WORLD .EQ. 0) THEN WRITE(*,'(A)') "BENCH: Warning: MPI_THREAD_FUNNELED expected," WRITE(*,'(A)') "BENCH but MPI_THREAD_SINGLE provided" ENDIF ENDIF FLOPS_REFERENCE = 0.0D0 ! special value meaning: not computed yet C ALLOCATE(COLOR(0:NPROCS_WORLD-1)) #if defined(SIMULATE_SEVERAL_NODES) C For local testing only, not to be activated. IF (NPROCS_WORLD.LE.1) THEN WRITE(*,'(A)') ' BENCH: More than 1 MPI processs needed ' CALL MPI_FINALIZE(IERR) STOP ENDIF WRITE(MYNAME, '(A,I4)') "Node ",MYID_WORLD/(NPROCS_WORLD/2) MYNAME_LENGTH = 9 #else CALL MPI_GET_PROCESSOR_NAME(MYNAME, MYNAME_LENGTH, IERR ) #endif WRITE(*,'(I4,A)') MYID_WORLD, & " : NODE NAME="//MYNAME(1:MYNAME_LENGTH) C C Read configuration file C IF (MYID_WORLD .EQ. 0) THEN WRITE(*,'(A)') "BENCH: Reading configuration file" READ(*,*,ERR=200, END=200) NX, NY, NZ WRITE(*,'(A,3I3)') "BENCH: Read NX NY NZ =",NX,NY,NZ READ(*,*,ERR=200, END=200) K WRITE(*,'(A,I3)') "BENCH: Read K =", K READ(*,*, ERR=200, END=200) NB_ITERATIONS WRITE(*,'(A,I3)') "BENCH: Read NB_ITERATIONS =", NB_ITERATIONS ALLOCATE (NB_INSTANCES(NB_ITERATIONS), & CHECK(NB_ITERATIONS)) DO IT = 1, NB_ITERATIONS READ(*,*,ERR=200,END=200) TMP_STRING WRITE(*,'(A,I3,A,A)') "BENCH: Read #instances for IT.", & IT,"=", TMP_STRING IF (TMP_STRING(1:3) .EQ. "MAX") THEN NB_INSTANCES(IT) = huge(NB_INSTANCES(IT)) ELSE READ(TMP_STRING,*,ERR=200) NB_INSTANCES(IT) ENDIF READ(*,*,ERR=200) CHECK(IT) WRITE(*,'(A,I3,A,I1)') "BENCH: Read CHECK value for IT.", & IT,"=",CHECK(IT) ENDDO GOTO 300 200 CONTINUE C Error while reading configuration file WRITE(*,'(A)') "BENCH: ERROR IN INPUT FILE FORMAT" WRITE(*,'(A,I4)') "BENCH: ** Aborting on MPI rank ", & MYID_WORLD CALL MPI_ABORT(IERR) 300 CONTINUE ENDIF C Broadcast main parameters CALL MPI_BCAST( NX, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, IERR) CALL MPI_BCAST( NY, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, IERR) CALL MPI_BCAST( NZ, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, IERR) CALL MPI_BCAST( K, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, IERR) CALL MPI_BCAST( NB_ITERATIONS, 1, MPI_INTEGER, 0, & MPI_COMM_WORLD, IERR) CALL BUILD_COMMUNICATORS( MYNAME, MYNAME_LENGTH, & MPI_COMM_WORLD, COMM, & K, MYID_WORLD, NPROCS_WORLD, & MPI_PER_NODE, MPI_PER_COMM, NB_COMM_MAX, COLOR ) IF (MYID_WORLD .EQ. 0) THEN !$ NOMP = OMP_GET_MAX_THREADS() WRITE(*,'(A)') "BENCH:======================================" WRITE(*,'(A,I4)') "BENCH:#MPI processes per node :", & MPI_PER_NODE WRITE(*,'(A,I4)') "BENCH:#OpenMP threads per MPI process :", & NOMP WRITE(*,'(A,I4)') "BENCH:#MPI processes per communicator :", & MPI_PER_COMM WRITE(*,'(A,I4)') "BENCH:NB_COMM_MAX (#communic. created):", & NB_COMM_MAX WRITE(*,'(A,I4)') "BENCH:shmem communications ratio :", K WRITE(*,'(A,I4)') "BENCH:NB_ITERATIONS :", & NB_ITERATIONS WRITE(*,'(A)') "BENCH:======================================" WRITE(*,*) C MPI proces with rank k is in communicator COLOR(k) WRITE(*,'(A,300I5)') & "BENCH: COLOR array defining communicators:", COLOR IF ( NB_COMM_MAX * K .NE. MPI_PER_NODE ) THEN WRITE(*,*) "BENCH: Note: ", MPI_PER_NODE/NB_COMM_MAX, & " MPI ranks per node unused" ENDIF ENDIF CALL MPI_COMM_SIZE( COMM, NPROCS_COMM, IERR ) CALL MPI_COMM_RANK( COMM, MYID_COMM, IERR ) C IF (MPI_PER_COMM .NE. NPROCS_COMM) THEN WRITE(*,'(A,I4,A,I4)') & "BENCH: ERROR IN BUILD_COMMUNICATORS: MPI_PER_COMM =", & MPI_PER_COMM, " but MPI_COMM_SIZE returns",NPROCS_COMM WRITE(*,'(A,I4)') "BENCH: ** Aborting on MPI rank ", & MYID_WORLD CALL MPI_ABORT(MPI_COMM_WORLD, IERR) ENDIF C --------------------------------------------- C Initialize a MUMPS instance on all the MPI C processes that are part of a new communicator C --------------------------------------------- MY_COMM_COLOR = COLOR(MYID_WORLD) IF ( MY_COMM_COLOR .LT. NB_COMM_MAX ) THEN C NB_COMM_MAX = 1 => communicator 0 (processes with color 0) C NB_COMM_MAX = 2 => communicators 0,1 (processes with colors 0,1) C NB_COMM_MAX = 3 => communicators 0,1,2, etc. id%JOB = -1 id%SYM = 0 id%CNTL(1) = 0.0D0 id%PAR = 1 id%COMM = COMM CALL DMUMPS(id) C --------------------------------------- C Initialize problem and perform analysis C --------------------------------------- id%JOB = 1 IF ( MYID_COMM .EQ. 0 ) THEN C Matrix is initialized on each master CALL ALLOCATE_INSTANCE( id, NX, NY, NZ, NRHS, MYID_WORLD ) ENDIF IF (MYID_WORLD .EQ. 0) THEN WRITE(*,*) WRITE(*,'(A)') " ====================================" WRITE(*,'(A)') " 11pt discr. pb characteristics " WRITE(*,'(A, I9)')" NX =", NX WRITE(*,'(A, I9)')" NY =", NY WRITE(*,'(A, I9)')" NZ =", NZ WRITE(*,'(A, I9)')" NRHS =", NRHS WRITE(*,'(A, I9)')" id%N =", id%N WRITE(*,'(A, I9)')" id%NNZ =", id%NNZ WRITE(*,'(A)') " ====================================" WRITE(*,*) ENDIF id%ICNTL(7)=5 ! Metis ordering should be used id%KEEP(3)=32 id%KEEP(6)=16 id%KEEP(5)=16 id%KEEP(4)=32 if (id%SYM .EQ. 0) THEN id%KEEP(9)=350 else id%KEEP(9)=200 endif id%KEEP(102)=110 id%KEEP(375)=1 id%KEEP(77)=75 id%KEEP(78)=2 id%KEEP(83)=min(8,NPROCS_COMM) id%KEEP(68)=25 id%KEEP(263)=0 id%KEEP(213) = 101 id%KEEP(85)=-4 id%KEEP(376)=1 IF (MYID_WORLD.EQ.0) THEN WRITE(*,*) " CONFIGURATION: MUMPS internal SETTING:" DO I=1, 500 WRITE(*,*) " ... KEEP(", I, ")=", id%KEEP(I) ENDDO ENDIF CALL DMUMPS(id) IF (id%INFOG(1) .LT. 0) THEN WRITE(*,'(A,I4,I4)') & "BENCH: ERROR DURING MUMPS ANALYSIS, INFOG(1:2)=", & id%INFOG(1:2) WRITE(*,'(A,I4)') "BENCH: ** Aborting on MPI rank ", & MYID_WORLD CALL MPI_ABORT(MPI_COMM_WORLD, IERR) ENDIF ENDIF C DO IT= 1, NB_ITERATIONS C For each iteration, perform a factorization and possibly a solve C Number of communicators is limited by MPI_PER_NODE/K=NB_COMM_MAX IF (MYID_WORLD.EQ. 0) THEN NB_COMM_IT = min(NB_INSTANCES(IT), NB_COMM_MAX) CHECK_IT = CHECK(IT) WRITE(*,*) WRITE(*,'(A)') "BENCH:" WRITE(*,'(A)') "BENCH:" WRITE(*,'(A)') "BENCH:======================================" WRITE(*,'(A,I3, A,I3)') & "BENCH: Starting it.", IT, & " with #simultaneous instances= ", & NB_COMM_IT WRITE(*,'(A,I1)') "BENCH: CHECK = ",CHECK_IT WRITE(*,'(A)') "BENCH:======================================" WRITE(*,*) WRITE(*,*) ENDIF CALL MPI_BCAST( NB_COMM_IT, 1, MPI_INTEGER, 0, & MPI_COMM_WORLD, IERR) CALL MPI_BCAST( CHECK_IT, 1, MPI_INTEGER, 0, & MPI_COMM_WORLD, IERR) IF (CHECK_IT.EQ.0) THEN id%KEEP(201)=-1 ELSE id%KEEP(201)=0 ENDIF CALL SYSTEM_CLOCK( t_start ) IF ( MY_COMM_COLOR .LT. NB_COMM_IT ) THEN WRITE(*,'(A,I4,A,I4)') "BENCH:",MYID_WORLD, & " participates to it.",IT C --------------------------- C JOB=2: MATRIX FACTORIZATION C --------------------------- id%JOB=2 ! id%ICNTL(1:4)=0 CALL DMUMPS(id) C IF (MYID_WORLD .EQ. 0.AND.id%ICNTL(35).GT.0) THEN C - print BLR flops WRITE(*,'(A,ES10.3,ES10.3)') " ** RINFOG(3)(14)= ", & id%RINFOG(3),id%RINFOG(14) ENDIF C IF (id%INFOG(1) .LT. 0) THEN WRITE(*,'(A,I4,I4)') & "BENCH: ERROR DURING MUMPS FACTORIZATION: INFOG(1:2)=", & id%INFOG(1:2) WRITE(*,'(A,I4)') "BENCH: ** Aborting on MPI rank ", & MYID_WORLD CALL MPI_ABORT(MPI_COMM_WORLD, IERR) ENDIF CALL SYSTEM_CLOCK(t_end, t_rate) IF (MYID_COMM .EQ. 0) THEN C Print for each instance WRITE(*,'(A,I3,A,I3,A,I3,A,F10.3)') & "BENCH: TIME (seconds) factorization for", & MYID_WORLD," (COMM_COLOR ", MY_COMM_COLOR, & ", it.",IT,") = ", & dble( t_end - t_start ) / dble( t_rate ) ENDIF ENDIF C C Synchronization between iterations C CALL MPI_BARRIER(MPI_COMM_WORLD, IERR) IF (MYID_WORLD .EQ. 0) THEN CALL SYSTEM_CLOCK(t_end, t_rate) WRITE(*,'(A)') "BENCH:" WRITE(*,'(A)') "BENCH:" WRITE(*,'(A)') "BENCH: ==================================== " WRITE(*,'(A,I3,A,I3,A,F10.3)') & "BENCH: TIME (seconds) FOR ITERATION ", & IT, & " with #simultaneous instances=", & min(NB_INSTANCES(IT), NB_COMM_MAX ), & "=", dble( t_end - t_start ) / dble(t_rate) WRITE(*,'(A)') " ==================================== " WRITE(*,'(A)') "BENCH:" WRITE(*,'(A)') "BENCH:" ENDIF IF (CHECK_IT .EQ. 1 ) THEN IF ( MY_COMM_COLOR .LT. NB_COMM_IT ) THEN C --------------- C Perform a solve C --------------- IF ( MYID_COMM .EQ. 0 ) THEN CALL RANDOM_NUMBER(id%RHS) ENDIF id%ICNTL(11) = 2 !id%ICNTL(1:4)=0 id%JOB=3 CALL DMUMPS(id) IF (MYID_COMM .EQ. 0) THEN WRITE(*,'(A)') "BENCH:" WRITE(*,'(A,ES10.3,A,I4,A,I4,A)') & "BENCH: Scaled Residual (<1D-12): ", id%RINFOG(6), & " (COMM_COLOR ", MY_COMM_COLOR, ", it.",IT,")" IF (id%RINFOG(6).GE.1D-12) THEN WRITE(*,'(A)') "BENCH:" WRITE(*,'(A)') & "BENCH: ERROR RESIDUAL (RINFOG(6)) TOO LARGE " WRITE(*,'(A)') "BENCH:" ENDIF IF (FLOPS_REFERENCE .NE. 0.0D0) THEN WRITE(*,'(A,ES16.5,A,I4,A,I4,A)') & "BENCH: FLOPS_REFERENCE DIFFERENCE=", & FLOPS_REFERENCE - id%RINFOG(3), & " (COMM_COLOR ", MY_COMM_COLOR, ", it.",IT,")" IF (abs(FLOPS_REFERENCE - id%RINFOG(3)).GT.1.0D0) & THEN WRITE(*,'(A)') & "BENCH: ERROR FLOPS_REFERENCE DIFFERENCE is NONZERO" ENDIF ELSE C Set flops reference WRITE(*,'(A,I3,A,ES16.5,A,I4,A,I4,A)') & "BENCH:",MYID_WORLD, & " setting FLOPS_REFERENCE =", id%RINFOG(3), & " (COMM_COLOR ", MY_COMM_COLOR, ", it.",IT,")" FLOPS_REFERENCE = id%RINFOG(3) ENDIF ENDIF ENDIF C This instance was checked, use FLOPS_REFERENCE C from process with rank 0 on all processes CALL MPI_BCAST( FLOPS_REFERENCE, 1, MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, IERR) ELSE IF ( MY_COMM_COLOR .LT. NB_COMM_IT ) THEN IF (MYID_COMM .EQ. 0) THEN IF ( FLOPS_REFERENCE .NE. 0.0D0 ) THEN WRITE(*,'(A,ES16.5,A,I4,A,I4,A)') & "BENCH: FLOPS_REFERENCE DIFFERENCE=", & FLOPS_REFERENCE - id%RINFOG(3), & " (COMM_COLOR ", MY_COMM_COLOR, ", it.",IT,")" IF (abs(FLOPS_REFERENCE - id%RINFOG(3)).GT.1.0D0) & THEN WRITE(*,'(A)') & "BENCH: ERROR FLOPS_REFERENCE DIFFERENCE is NONZERO" ENDIF ENDIF ENDIF ENDIF ENDIF ENDDO C IF (MYID_WORLD .EQ. 0) THEN WRITE(*,'(A)') "BENCH:" WRITE(*,'(A)') "BENCH:" WRITE(*,'(A)') "BENCH: =====================================" WRITE(*,'(A)') "BENCH: All iterations have finished. " WRITE(*,'(A)') "BENCH: Destroying instances and finalizing " WRITE(*,'(A)') "BENCH: =====================================" ENDIF C C --------------------------- C Destroy all MUMPS instances C --------------------------- IF ( COLOR(MYID_WORLD) .LT. NB_COMM_MAX ) THEN id%JOB=-2 CALL DMUMPS(id) IF (MYID_COMM .EQ. 0) THEN C Free matrix and right-hand side DEALLOCATE(id%IRN, id%JCN, id%A, id%RHS) NULLIFY(id%IRN, id%JCN, id%A, id%RHS) ENDIF ENDIF IF (MYID_WORLD .EQ. 0) THEN DEALLOCATE(NB_INSTANCES) DEALLOCATE(CHECK) ENDIF CALL MPI_FINALIZE(IERR) END PROGRAM MUMPS_BENCH C C ---------------------------------------------------------------- C SUBROUTINE BUILD_COMMUNICATORS( MYNAME, MYNAME_LENGTH, & OLD_COMM, NEW_COMM, & K, MYID, NPROCS, & MPI_PER_NODE, & MPI_PER_COMM, & NB_COMMUNICATORS, COLOR ) IMPLICIT NONE INCLUDE 'mpif.h' INTEGER, INTENT(IN) :: OLD_COMM, MYID, NPROCS, K INTEGER, INTENT(OUT) :: NEW_COMM INTEGER, INTENT(OUT) :: MPI_PER_NODE, & MPI_PER_COMM, & NB_COMMUNICATORS INTEGER, INTENT(OUT) :: COLOR(0:NPROCS-1) CHARACTER(len=MPI_MAX_PROCESSOR_NAME), INTENT(IN) :: MYNAME INTEGER, INTENT(IN) :: MYNAME_LENGTH C C Local declarations C ================== C INTEGER :: IERR C INTEGER :: NB_ON_THIS_NODE, MAX_ON_A_NODE, MIN_ON_A_NODE, NB_NODES INTEGER :: CURRENT_INODE_REP INTEGER :: IPROC ! for loops between 0 and NPROCS-1 INTEGER, ALLOCATABLE, DIMENSION(:) :: IS_ON_SAME_NODE INTEGER, ALLOCATABLE, DIMENSION(:) :: TABLE_OF_PROCESSES C ALLOCATE(IS_ON_SAME_NODE(0:NPROCS-1)) CALL SET_IS_ON_SAME_NODE( MYNAME, MYNAME_LENGTH, & MYID, NPROCS, IS_ON_SAME_NODE, & MPI_COMM_WORLD ) C On exit, IS_ON_SAME_NODE is an array of size NPROCS, C identical on all MPI ranks. We illustrate its content C in the example below, with NPROCS=12. C Assuming we have: C MPI rank : 0 1 2 3 4 5 6 7 8 9 10 11 C node name : A D B B B A C D D C A C C Then, IS_ON_SAME_NODE = 0 1 2 2 2 0 6 1 1 6 0 6 C It is such that IS_ON_SAME_NODE(i)=IS_ONS_SAME_NODE(j)=k if i C and j and k are on the same node, and k is the smallest rank C among the MPI processes present on that node. C ALLOCATE(TABLE_OF_PROCESSES(0:NPROCS-1)) DO IPROC = 0, NPROCS-1 TABLE_OF_PROCESSES(IPROC)=IPROC ENDDO CALL BUBBLE_SORT_INT( NPROCS, IS_ON_SAME_NODE(0), & TABLE_OF_PROCESSES(0) ) C C MPI rank : 0 1 2 3 4 5 6 7 8 9 10 11 C node name : A D B B B A C D D C A C C C corresponds to: C C TABLE_OF_PROCESSES = 0 5 10 1 7 8 2 3 4 6 9 11 C node name : A A A D D D B B B C C C C IS_ON_SAME_NODE = 0 0 0 1 1 1 2 2 2 6 6 6 C C COLOR (see below) = 0 1 2 0 1 2 0 1 2 0 1 2 C Indeed, this means: meaning COLOR(0) = 0, COLOR(5)=1, C COLOR(10)=2, COLOR(1)=1, COLOR(7) = 1, etc. C C Check Number of MPI per node C CURRENT_INODE_REP = -99999 MAX_ON_A_NODE = -99999 MIN_ON_A_NODE = 99999 NB_ON_THIS_NODE = -99999 C IS_ON_SAME_NODE and TABLE_OF_PROCESSES are now sorted so C that entries corresponding to a given node are contiguous DO IPROC = 0, NPROCS-1 IF ( CURRENT_INODE_REP .NE. IS_ON_SAME_NODE(IPROC) ) THEN C Beginning of a new node CURRENT_INODE_REP = IS_ON_SAME_NODE(IPROC) NB_ON_THIS_NODE = 1 ELSE NB_ON_THIS_NODE = NB_ON_THIS_NODE + 1 ENDIF IF (IPROC .EQ. NPROCS-1) THEN C Last IPROC, NB_ON_THIS_NODE is valid MIN_ON_A_NODE = min( MIN_ON_A_NODE, NB_ON_THIS_NODE ) MAX_ON_A_NODE = max( MAX_ON_A_NODE, NB_ON_THIS_NODE ) ELSE IF ( CURRENT_INODE_REP .NE. IS_ON_SAME_NODE(IPROC+1) ) THEN C Last IPROC for that node, NB_ON_THIS_NODE is valid MIN_ON_A_NODE = min( MIN_ON_A_NODE, NB_ON_THIS_NODE ) MAX_ON_A_NODE = max( MAX_ON_A_NODE, NB_ON_THIS_NODE ) ENDIF C Define COLOR entry C Processes with same color will form communicators: C 1st on node i with 1st on node j and 1st on node k, C 2nd on node i with 2nd on node j, etc. COLOR(TABLE_OF_PROCESSES(IPROC)) = NB_ON_THIS_NODE - 1 ENDDO C IF (MAX_ON_A_NODE.NE. MIN_ON_A_NODE) THEN IF (MYID .EQ. 0) THEN WRITE(*,'(A,A,I3,A,I3)') & "BENCH: CONFIGURATION ERROR:", & "#MPI per node not constant: #MPI min=", & MIN_ON_A_NODE, " and #MPI max=", MAX_ON_A_NODE WRITE(*,'(A,I4)') "BENCH: ** Aborting on MPI rank ", & MYID CALL MPI_ABORT(IERR) ENDIF ENDIF MPI_PER_NODE = MIN_ON_A_NODE NB_COMMUNICATORS = MPI_PER_NODE / K NB_NODES = NPROCS / MPI_PER_NODE ! should be exact MPI_PER_COMM = K * NB_NODES ! output argument for checking C C Depending on the factor K, we create as many communicators C as possible of size NPROCS / K. C K = 1 => 3 communicators of size 4 C K = 2 => 1 communicator of size 8, 4 MPI processes per node unused C K = 3 => 1 communicator of size 12 C C --------------------------------------------- C Create the inter-node communicators: CALL MPI_COMM_SPLIT( OLD_COMM, & COLOR(MYID)/K, ! Color & COLOR(MYID), ! Key & NEW_COMM, & IERR ) C In the call above, COLOR(MYID)/K is the color to create C communicators (same color => same communicator), and C COLOR(MYID) is used as a key to sort MPI ranks within C a communicator). C C Fix COLOR output array, which should be divided C by K to be interpreted correctly on exit (COLOR). COLOR = COLOR / K RETURN END SUBROUTINE BUILD_COMMUNICATORS C C ---------------------------------------------------------------- C SUBROUTINE SET_IS_ON_SAME_NODE( MYNAME, MYNAME_LENGTH, & MYID, NPROCS, & IS_ON_SAME_NODE, COMM ) IMPLICIT NONE INCLUDE 'mpif.h' C C Given a communicator COMM, build IS_ON_SAME_NODE on all C MPI processes, where on exit, IS_ON_SAME_NODE( i ) = j C means that MPI rank i is on the same node as MPI rank j, C where j is the smallest MPI rank of the processes present C on the node. C CHARACTER(len=MPI_MAX_PROCESSOR_NAME), INTENT(IN) :: MYNAME INTEGER, INTENT(IN) :: MYNAME_LENGTH INTEGER, INTENT(IN) :: MYID, NPROCS, COMM INTEGER, INTENT(OUT):: IS_ON_SAME_NODE(0:NPROCS-1) C INTEGER :: IERR C CHARACTER(len=MPI_MAX_PROCESSOR_NAME) :: TMPNAME INTEGER :: TMPNAME_LENGTH INTEGER :: IPROC, IPROC_MIN DO IPROC = 0, NPROCS - 1 IF ( IPROC. EQ. MYID ) THEN TMPNAME_LENGTH = MYNAME_LENGTH TMPNAME = MYNAME ENDIF CALL MPI_BCAST( TMPNAME_LENGTH, 1, MPI_INTEGER, IPROC, & MPI_COMM_WORLD, IERR) CALL MPI_BCAST( TMPNAME, TMPNAME_LENGTH, MPI_CHARACTER, IPROC, & MPI_COMM_WORLD, IERR) IS_ON_SAME_NODE( IPROC ) = 0 C Test below includes myself, leading to C IS_ON_SAME_PROC(MYID) .eqv. .true. IF ( TMPNAME_LENGTH .EQ. MYNAME_LENGTH ) THEN IF ( TMPNAME(1:TMPNAME_LENGTH) .EQ. MYNAME(1:MYNAME_LENGTH) ) & THEN IS_ON_SAME_NODE(IPROC) = 1 ENDIF ENDIF ENDDO C For each node, define a representant C (one of the MPI ranks on that node) C We choose IPROC_MIN, ie, the one with smallest rank DO IPROC = 0, NPROCS - 1 IF (IS_ON_SAME_NODE(IPROC) .EQ. 1) THEN IPROC_MIN = IPROC EXIT ENDIF ENDDO DO IPROC = 0, NPROCS - 1 IF (IS_ON_SAME_NODE(IPROC) .EQ. 1) THEN IS_ON_SAME_NODE(IPROC) = IPROC_MIN ENDIF ENDDO CALL MPI_ALLREDUCE( MPI_IN_PLACE, IS_ON_SAME_NODE(0), NPROCS, & MPI_INTEGER, MPI_MAX, COMM, IERR ) RETURN END SUBROUTINE SET_IS_ON_SAME_NODE C C ---------------------------------------------------------------- C SUBROUTINE ALLOCATE_INSTANCE( id, N1, N2, N3, NRHS, MYID_WORLD ) INCLUDE 'dmumps_struc.h' INTEGER, INTENT(in) :: N1, N2, N3, NRHS, MYID_WORLD C C Prepares a MUMPS instance with user parameters on a C master process. Assumes that id%SYM has been set. C INCLUDE 'mpif.h' INTEGER :: IERR, I TYPE (DMUMPS_STRUC) :: id INTEGER :: allocok INTEGER :: NMAX INTEGER(8) :: NNZMAX INTEGER(8) :: NNZ, j8, i8 INTEGER(8), ALLOCATABLE, DIMENSION(:) :: IP INTEGER, POINTER, DIMENSION(:) :: IRN, JCN DOUBLE PRECISION, POINTER, DIMENSION(:) :: A C CALL PT11AD_ANA(N1,N2,N3,id%N,id%NNZ,.TRUE.) NMAX = id%N NNZMAX = id%NNZ id%NRHS = NRHS id%LRHS = id%N ALLOCATE(id%IRN(id%NNZ),id%JCN(id%NNZ),id%A(id%NNZ), & IP(id%N+1),id%RHS(id%N*id%NRHS), stat=allocok) IF (allocok > 0 ) THEN WRITE(*,'(A)') "BENCH: ERROR: failure allocating matrix", & allocok WRITE(*,'(A,I4)') "BENCH: ** Aborting on MPI rank ", & MYID_WORLD CALL MPI_ABORT(MPI_COMM_WORLD, IERR) ENDIF CALL PT11AD(NMAX, NNZMAX, IERR, N1, N2, N3, id%N, id%NNZ, IP, & id%IRN(1), id%A(1), 6, .TRUE., 1, .FALSE.) C Convert from CSC to coordinate format !$OMP PARALLEL DO DO I = 1, id%N id%JCN(IP(I):IP(I+1)-1) = I ENDDO !$OMP END PARALLEL DO DEALLOCATE(IP) C Case of symmetric matrix: update matrix to keep C only lower triangular part + diagonal IF ((id%SYM.EQ.1).OR.(id%SYM.EQ.2)) THEN C{ NNZ=(id%NNZ-id%N)/2+id%N ALLOCATE(IRN(NNZ), JCN(NNZ), A(NNZ), stat=allocok) IF (allocok > 0 ) THEN WRITE(*,'(A,I3)') & "BENCH: ERROR: failure allocating sym. matrix", & allocok WRITE(*,'(A,I4)') "BENCH: ** Aborting on MPI rank ", & MYID_WORLD CALL MPI_ABORT(MPI_COMM_WORLD, IERR) ENDIF j8=1 DO i8=1,id%NNZ C keep only lower triangualr matrix if(id%IRN(i8).GE.id%JCN(i8)) then IRN(j8)=id%IRN(i8) JCN(j8)=id%JCN(i8) A(j8)=id%A(i8) j8=j8+1 endif ENDDO IF (j8-1.ne.NNZ) THEN WRITE(6,'(A,I8,I8)') & "BENCH: ERROR / internal error NNZ, j8-1=", & NNZ, j8-1 WRITE(*,'(A,I4)') "BENCH: ** Aborting on MPI rank ", & MYID_WORLD CALL MPI_ABORT(MPI_COMM_WORLD, IERR) ENDIF id%NNZ = NNZ DEALLOCATE(id%IRN, id%JCN, id%A) id%IRN=>IRN id%JCN=>JCN id%A=>A ENDIF RETURN END SUBROUTINE ALLOCATE_INSTANCE C C ---------------------------------------------------------------- C SUBROUTINE PT11AD( NMAX, NZMAX, IFLAG, & Nx, Ny, Nz, & N, NNZ, IRST, JCN, A, LP, YESA, IPROB,GEN_VERBOSE ) * * * Produces Positive-Definite matrix from eleven point * discretization on an M by N by P grid, * with a nine point discretization in X/Y directions. * The matrix A is generated BY ROWS. * If YESA is .FALSE. only structure is generated. * * Order of A = NX*NY*NZ * Nonzeros in A = (3*NX -2)*(3*ny-2)*nz +nx*ny*(2*nz-2) * * Input Parameters (not modified in MUP11A) * ---------------- * NMAX, NZMAX INTEGER variables that need to be set to * respectively the maximum order and * the maximum number of nonzeros in the output matrix * Nx, Ny, Nz INTEGER variable taht need be set to the * dimension of the grid. * LP INTEGER variable, Output unit for printing. * YESA LOGICAL variable. * If YESA is .FALSE. only the structure of the output * matrix is generated * IPROB INTEGER variable that need be set on entry. * IF IPROB=1 then 3D-Poisson operator with 11-pt discretization * stored in XYZ order is generated. * ELSE ZYX storage is used. * * * Output Parameters (need not be set on entry) * ----------------- * IFLAG INTEGER variable that is set * to the Error return (0 if no problem detected) * N, NNZ INTEGER variables. * N is the order of the output matrix, NNZ is the * number of nonzeros. * IRST, JCN, A: Output matrix in HB format (by rows). * INTEGER NMAX,IFLAG INTEGER Nx, Ny, Nz INTEGER N, IPROB, LP INTEGER(8) :: NNZ,NZMAX, NTEMP_8,IPOS,IFLAG_8 INTEGER(8) IRST(NMAX+1) INTEGER JCN(NZMAX) DOUBLE PRECISION A(NZMAX) LOGICAL YESA * INTEGER I, J, K, IncI, IncJ, IncK, & IRow, IPT, JPT, KPT INTEGER NTEMP * DOUBLE PRECISION ONE #if 1 PARAMETER( ONE = 1.0D0 ) #else PARAMETER( ONE = (1.0D0,0.0D0) ) #endif LOGICAL,intent(in) :: GEN_VERBOSE * * * check size of arrays IFLAG = 0 NTEMP = Nx * Ny * Nz IF (NMAX .LT. NTEMP) THEN IFLAG = - Nx * Ny * Nz IF (LP.GT.0) WRITE(LP,'(A,I10)') 'Increase NMAX to', NTEMP GOTO 500 ENDIF NTEMP_8 = (3*int(Nx,8)-2)*(3*int(Ny,8)-2)*int(Nz,8) & + int(Nx,8)*int(Ny,8)*(2*int(Nz,8)-2) IF (NZMAX .LT. NTEMP_8) THEN IFLAG_8 =-NTEMP_8 IF (LP.GT.0) WRITE(LP,'(A,I10)') 'Increase NZMAX to', NTEMP_8 GOTO 500 ENDIF * * IF (IPROB.EQ.1) THEN if(GEN_VERBOSE) then WRITE( 6, 1100 ) endif 1100 FORMAT(' 3D-Poisson operator with 11-point discretization', & ' --- Stored in XYZ order' ) * N = Nx * Ny * Nz IPOS=1 IncI = 1 IncJ = Nx IncK = Nx * Ny * DO 110 K = 1, Nz KPT = (K-1) * IncK * DO 111 J = 1, Ny JPT = (J-1) * IncJ * DO 112 I = 1, Nx IRow = KPT + JPT + I IRST(IRow) = IPOS * IF (K.EQ.1) GOTO 120 IF (YESA) A(IPOS) = -2*ONE JCN(IPOS) = IRow - IncK IPOS = IPOS + 1 * 120 IF (J.EQ.1) GOTO 130 IF (I.GT.1) THEN IF (YESA) A(IPOS) = -2*ONE JCN(IPOS) = IRow - IncJ - IncI IPOS = IPOS + 1 ENDIF IF (YESA) A(IPOS) = -2*ONE JCN(IPOS) = IRow - IncJ IPOS = IPOS + 1 IF (I.LT.Nx) THEN IF (YESA) A(IPOS) = -2*ONE JCN(IPOS) = IRow - IncJ + IncI IPOS = IPOS + 1 ENDIF * 130 IF (I.EQ.1) GOTO 140 IF (YESA) A(IPOS) = -2*ONE JCN(IPOS) = IRow - IncI IPOS = IPOS + 1 * 140 CONTINUE #if 1 IF (YESA) A(IPOS) = 22.0D0 #else IF (YESA) A(IPOS) = (22.0D0,22.0D0) #endif JCN(IPOS) = IRow IPOS = IPOS + 1 * IF (I.EQ.Nx) GO TO 150 IF (YESA) A(IPOS) = -2*ONE JCN(IPOS) = IRow + IncI IPOS = IPOS + 1 * 150 IF (J.EQ.Ny) GO TO 160 IF (I.GT.1) THEN IF (YESA) A(IPOS) = -2*ONE JCN(IPOS) = IRow + IncJ - IncI IPOS = IPOS + 1 ENDIF IF (YESA) A(IPOS) = -2*ONE JCN(IPOS) = IRow + IncJ IPOS = IPOS + 1 IF (I.LT.Nx) THEN IF (YESA) A(IPOS) = -2*ONE JCN(IPOS) = IRow + IncJ + IncI IPOS = IPOS + 1 ENDIF * 160 IF (K.EQ.Nz) GO TO 112 IF (YESA) A(IPOS) = -2*ONE JCN(IPOS) = IRow + IncK IPOS = IPOS + 1 * 112 CONTINUE 111 CONTINUE 110 CONTINUE * NNZ = IPOS - 1 IRST(N+1) = NNZ + 1 ENDIF * * * * * IF (IPROB.NE.1) THEN if(GEN_VERBOSE) then WRITE( 6, 1200 ) endif 1200 FORMAT(' 3D-Poisson operator with 11-point ' & //'discretization', & ' --- Stored in ZYX order' ) * N = Nx * Ny * Nz IPOS=1 IncK = 1 IncJ = Nz IncI = Nz * Ny * DO 210 I = 1, Nx IPT = (I-1) * IncI * DO 211 J = 1, Ny JPT = (J-1) * IncJ * DO 212 K = 1, Nz IRow = IPT + JPT + K IRST(IRow) = IPOS * IF (K.EQ.1) GOTO 220 IF (YESA) A(IPOS) = -2*ONE JCN(IPOS) = IRow - IncK IPOS = IPOS + 1 * 220 IF (J.EQ.1) GOTO 230 IF (I.GT.1) THEN IF (YESA) A(IPOS) = -2*ONE JCN(IPOS) = IRow - IncJ - IncI IPOS = IPOS + 1 ENDIF IF (YESA) A(IPOS) = -2*ONE JCN(IPOS) = IRow - IncJ IPOS = IPOS + 1 IF (I.LT.Nx) THEN IF (YESA) A(IPOS) = -2*ONE JCN(IPOS) = IRow - IncJ + IncI IPOS = IPOS + 1 ENDIF * 230 IF (I.EQ.1) GOTO 240 IF (YESA) A(IPOS) = -2*ONE JCN(IPOS) = IRow - IncI IPOS = IPOS + 1 * 240 CONTINUE #if 1 IF (YESA) A(IPOS) = 22.0D0 #else IF (YESA) A(IPOS) = (22.0D0,0.0D0) #endif JCN(IPOS) = IRow IPOS = IPOS + 1 * IF (I.EQ.Nx) GO TO 250 IF (YESA) A(IPOS) = -2*ONE JCN(IPOS) = IRow + IncI IPOS = IPOS + 1 * 250 IF (J.EQ.Ny) GO TO 260 IF (I.GT.1) THEN IF (YESA) A(IPOS) = -2*ONE JCN(IPOS) = IRow + IncJ - IncI IPOS = IPOS + 1 ENDIF IF (YESA) A(IPOS) = -2*ONE JCN(IPOS) = IRow + IncJ IPOS = IPOS + 1 IF (I.LT.Nx) THEN IF (YESA) A(IPOS) = -2*ONE JCN(IPOS) = IRow + IncJ + IncI IPOS = IPOS + 1 ENDIF * 260 IF (K.EQ.Nz) GO TO 212 IF (YESA) A(IPOS) = -2*ONE JCN(IPOS) = IRow + IncK IPOS = IPOS + 1 * 212 CONTINUE 211 CONTINUE 210 CONTINUE * NNZ = IPOS - 1 IRST(N+1) = NNZ + 1 ENDIF * * * if(GEN_VERBOSE) then IF (LP.GT.0) WRITE(LP,60) Nx, Ny, Nz IF (LP.GT.0) WRITE(LP,70) N, NNZ endif 60 FORMAT( ' NX= ',I5,' NY= ',I5,' NZ= ',I5) 70 FORMAT(' ORDER OF MATRIX IS ',I10, & ' NUMBER OF NON-ZEROS IS ',I12) * * * 500 RETURN END C C ---------------------------------------------------------------- C SUBROUTINE PT11AD_ANA(N1, N2, N3, NNN, NNZ, GEN_VERBOSE) C INTEGER, INTENT(IN) :: N1,N2,N3 INTEGER, INTENT(OUT) :: NNN INTEGER(8), INTENT(OUT) :: NNZ LOGICAL,INTENT(IN) :: GEN_VERBOSE C integer(8)::n1_8,n2_8,n3_8 C n1_8=int(N1,8) n2_8=int(N2,8) n3_8=int(N3,8) NNN=n1*n2*n3 NNZ=(3*n1_8-2)*(3*n2_8-2)*n3_8+n1_8*n2_8*(2*n3_8-2) if(GEN_VERBOSE) then write(*,*) 'N = ', NNN write(*,*) 'NNZ = ',NNZ endif return END SUBROUTINE PT11AD_ANA C C ---------------------------------------------------------------- C SUBROUTINE BUBBLE_SORT_INT( N, VAL, ID ) C C Basic sort of VAL values, reporting changes to ID. C Not the most efficient possible sort, but only done once. C INTEGER N INTEGER ID( N ) INTEGER VAL( N ) INTEGER I, ISWAP INTEGER SWAP LOGICAL DONE DONE = .FALSE. DO WHILE ( .NOT. DONE ) DONE = .TRUE. DO I = 1, N - 1 IF ( VAL( I ) .GT. VAL( I + 1 ) ) THEN DONE = .FALSE. ISWAP = ID( I ) ID ( I ) = ID ( I + 1 ) ID ( I + 1 ) = ISWAP SWAP = VAL( I ) VAL( I ) = VAL( I + 1 ) VAL( I + 1 ) = SWAP END IF END DO END DO RETURN END SUBROUTINE BUBBLE_SORT_INT