PROGRAM MUMPS_BENCH
!$    USE OMP_LIB
      IMPLICIT NONE
      INCLUDE 'mpif.h'
      INCLUDE 'dmumps_struc.h'
C
C
C     Purpose:
C     =======
C
C     Given MPI_PER_NODE, the number of MPI processes per node
C           (constrained to be the same on every node),
C     Given K, a factor (not larger than MPI_PER_NODE) defining the 
C           number of MPI processes per node that will belong to
C           the same subcommunicator,
C
C     This code creates NB_COMM_MAX internode communicators,
C     such that NB_COMM_MAX = MPI_PER_NODE/K
C
C     Once the new communicators (noted COMM) are defined, the same
C     MUMPS computation may be run simultaneously on 1, 2, ...,
C     NB_COMM_MAX communicators (one instance per communicator). Each
C     instance consists of factorizing and solving A x = b, where the
C     system matrix A results from an 11-point Laplacian discretization
C     on a 3D grid whose size is (NX, NY, NZ).
C
C     Notes:
C     =====
C
C     * Each MPI process may have some threads.
C     * If K > 1 and MPI_PER_NODE is not divisible by K, some
C     MPI processes will not be used.
C     * To avoid 100% core usage, or for tests on machines
C     with not enough physical cores, it may be worthwhile to have
C     barriers and collectives wait instead of polling
C     constantly; this can be done with:
C       export OMPI_MCA_mpi_yield_when_idle=1  (openmpi)
C       export I_MPI_WAIT_MODE=1 (intelmpi)
C
C     Control parameters and file input format:
C     ========================================
C
C     The main control parameters are read from stdin on the
C     MPI process with rank 0 in MPI_COMM_WORLD.
C     The input file defines the grid size, the factor K and the
C     number of tests to be performed. Each test is characterized
C     by a number of instances and by whether a check of the
C     solution should be performed (see the example input file and
C     the sample launch command below).
C
C     EXAMPLE OF INPUT FILE: 
C       on 40 nodes with 24 MPI processes per node,
C       K=2 leads to a maximum of 12 instances (NB_COMM_MAX = 12)
C
C         80 80 500     # NX NY NZ
C         2             # K
C         8             # iteration count
C            1             # NB_INSTANCES
C            1             # CHECK (0=OFF)
C            1             # NB_INSTANCES
C            0             # CHECK (0=OFF)
C            2             # NB_INSTANCES
C            0             # CHECK (0=OFF)
C            4             # NB_INSTANCES
C            0             # CHECK (0=OFF)
C            8             # NB_INSTANCES
C            0             # CHECK (0=OFF)
C            MAX           # NB_INSTANCES (MAX=NB_COMM_MAX)
C            0             # CHECK (0=OFF)
C            MAX           # NB_INSTANCES (MAX=NB_COMM_MAX)
C            0             # CHECK (0=OFF)
C            MAX           # NB_INSTANCES (MAX=NB_COMM_MAX)
C            0             # CHECK (0=OFF)
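C
C     A possible way to launch the benchmark with such an input file
C     (hypothetical command line, to be adapted to the local MPI
C     installation and batch system; the executable and file names
C     are placeholders):
C
C       export OMP_NUM_THREADS=2
C       mpirun -np 960 ./mumps_bench < input.txt  # 40 nodes x 24 MPI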
C
C     The objective of this test is to measure the 
C     "TIME (seconds) FOR ITERATION" as a function of the 
C     number of simultaneous instances.
C     In the output, the lines containing "BENCH:" are the most
C     interesting ones for benchmarkers.
C     
C
C     ERROR return:
C     ============
C        - CONFIGURATION ERROR:
C           The number of MPI processes per node is not constant
C        - ERROR during MUMPS ANALYSIS:
C           see INFOG(1:2) in Section "Error Diagnostics" of 
C           MUMPS Users' guide
C        - ERROR DURING MUMPS FACTORIZATION:
C           see INFOG(1:2) in Section "Error Diagnostics" of 
C           MUMPS Users' guide
C        - ERROR RESIDUAL (RINFOG(6)) TOO LARGE:
C          RINFOG(6) (see MUMPS Users' guide) should be smaller 
C          than 1e-12.
C        - ERROR FLOPS_REFERENCE DIFFERENCE is NONZERO
C          The number of operations performed during
C          factorization should be constant across runs.
C
C     Grid size
      INTEGER :: NX, NY, NZ ! e.g. 80 80 500
C
C     K factor
      INTEGER :: K
C
C     NB_ITERATIONS: the number of iterations, i.e.,
C     the number of tests to be performed.
C
      INTEGER :: NB_ITERATIONS
C
C     NB_INSTANCES and CHECK will be read from the file
C     NB_INSTANCES(I): between 1 and MPI_PER_NODE / K, larger values
C                      are treated as MPI_PER_NODE / K
C     CHECK(I)       : = 0 or 1. It indicates whether bwd error should
C                        be computed and checked (done if equal to 1)
      INTEGER, ALLOCATABLE, DIMENSION(:) :: NB_INSTANCES
      INTEGER, ALLOCATABLE, DIMENSION(:) :: CHECK
C
C
C
C     ==================================================================
C
C
C     FLOPS_REFERENCE is the reference number of flops (for a run
C                     where the bwd error was computed)
      DOUBLE PRECISION :: FLOPS_REFERENCE
      INTEGER          :: I
C
C     Each MPI process declares a MUMPS structure.
C     MUMPS structures will be combined to have one MUMPS
C     instance on each communicator.
      TYPE (DMUMPS_STRUC) :: id
      INTEGER, PARAMETER :: NRHS = 1
C     Iteration-related
      INTEGER :: IT ! loop index
      INTEGER :: CHECK_IT, NB_COMM_IT
C     Timing:
      INTEGER :: t_start, t_end, t_rate
C    
C     OpenMP related
!$    INTEGER NOMP
C
C     MPI-related
      INTEGER :: THREAD_SUPPORT, IERR
      INTEGER :: NPROCS_WORLD, MYID_WORLD
      INTEGER :: NPROCS_COMM, MYID_COMM, NB_COMM_MAX,
     &           MPI_PER_NODE, MPI_PER_COMM
      INTEGER :: COMM ! the new communicators
      INTEGER :: MY_COMM_COLOR ! set to process color
C     Each MPI process must know the name of the node it is on.
C     For validation purposes, use -DSIMULATE_SEVERAL_NODES
      CHARACTER(len=80) :: TMP_STRING
      CHARACTER(len=MPI_MAX_PROCESSOR_NAME) :: MYNAME
      INTEGER :: MYNAME_LENGTH
C
C     COLOR(rank in MPI_COMM_WORLD) => communicator number
C
C     COLOR(i)==COLOR(j) if i and j are in the same communicator.
C     Process i is associated with communicator COLOR(i)=0..NB_COMM_MAX-1
C     and participates in the computations only if COLOR(i) < NB_INSTANCES(IT)
      INTEGER, ALLOCATABLE, DIMENSION(:) :: COLOR
C
C
C     Only one thread in each process calls MPI:
      CALL MPI_INIT_THREAD(MPI_THREAD_FUNNELED, THREAD_SUPPORT, IERR)
      CALL MPI_COMM_SIZE( MPI_COMM_WORLD, NPROCS_WORLD, IERR )
      CALL MPI_COMM_RANK( MPI_COMM_WORLD, MYID_WORLD, IERR )
      IF ( THREAD_SUPPORT .EQ. MPI_THREAD_SINGLE ) THEN
       IF (MYID_WORLD .EQ. 0) THEN
         WRITE(*,'(A)') "BENCH: Warning: MPI_THREAD_FUNNELED expected,"
         WRITE(*,'(A)') "BENCH but MPI_THREAD_SINGLE provided"
       ENDIF
      ENDIF
      FLOPS_REFERENCE = 0.0D0  ! special value meaning: not computed yet
C
      ALLOCATE(COLOR(0:NPROCS_WORLD-1))
#if defined(SIMULATE_SEVERAL_NODES)
C     For local testing only, not to be activated.
      IF (NPROCS_WORLD.LE.1) THEN
        WRITE(*,'(A)') ' BENCH: More than 1 MPI process needed '
        CALL MPI_FINALIZE(IERR)
        STOP
      ENDIF
      WRITE(MYNAME, '(A,I4)') "Node ",MYID_WORLD/(NPROCS_WORLD/2)
      MYNAME_LENGTH = 9
#else
      CALL MPI_GET_PROCESSOR_NAME(MYNAME, MYNAME_LENGTH, IERR )
#endif
      WRITE(*,'(I4,A)') MYID_WORLD,
     &                    " : NODE NAME="//MYNAME(1:MYNAME_LENGTH)
C
C     Read configuration file
C
      IF (MYID_WORLD .EQ. 0) THEN
        WRITE(*,'(A)') "BENCH: Reading configuration file"
        READ(*,*,ERR=200, END=200) NX, NY, NZ
        WRITE(*,'(A,3I3)') "BENCH: Read NX NY NZ      =",NX,NY,NZ
        READ(*,*,ERR=200, END=200) K
        WRITE(*,'(A,I3)')  "BENCH: Read K             =", K
        READ(*,*, ERR=200, END=200) NB_ITERATIONS
        WRITE(*,'(A,I3)')  "BENCH: Read NB_ITERATIONS =", NB_ITERATIONS
        ALLOCATE (NB_INSTANCES(NB_ITERATIONS),
     &            CHECK(NB_ITERATIONS))
        DO IT = 1, NB_ITERATIONS
          READ(*,*,ERR=200,END=200) TMP_STRING
          WRITE(*,'(A,I3,A,A)')  "BENCH: Read #instances  for IT.",
     &    IT,"=", TMP_STRING
          IF (TMP_STRING(1:3) .EQ. "MAX") THEN
            NB_INSTANCES(IT) = huge(NB_INSTANCES(IT))
          ELSE
            READ(TMP_STRING,*,ERR=200) NB_INSTANCES(IT)
          ENDIF
          READ(*,*,ERR=200) CHECK(IT)
          WRITE(*,'(A,I3,A,I1)') "BENCH: Read CHECK value for IT.",
     &    IT,"=",CHECK(IT)
        ENDDO
        GOTO 300
 200    CONTINUE
C       Error while reading configuration file
        WRITE(*,'(A)') "BENCH: ERROR IN INPUT FILE FORMAT"
        WRITE(*,'(A,I4)') "BENCH: ** Aborting on MPI rank ",
     &  MYID_WORLD
        CALL MPI_ABORT(MPI_COMM_WORLD, IERR)
 300    CONTINUE
      ENDIF
C     Broadcast main parameters
      CALL MPI_BCAST( NX, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, IERR)
      CALL MPI_BCAST( NY, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, IERR)
      CALL MPI_BCAST( NZ, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, IERR)
      CALL MPI_BCAST( K, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, IERR)
      CALL MPI_BCAST( NB_ITERATIONS, 1, MPI_INTEGER, 0,
     &                MPI_COMM_WORLD, IERR)

      CALL BUILD_COMMUNICATORS( MYNAME, MYNAME_LENGTH,
     &     MPI_COMM_WORLD, COMM,
     &     K, MYID_WORLD, NPROCS_WORLD,
     &     MPI_PER_NODE, MPI_PER_COMM, NB_COMM_MAX, COLOR )

      IF (MYID_WORLD .EQ. 0) THEN
!$      NOMP = OMP_GET_MAX_THREADS()
        WRITE(*,'(A)')    "BENCH:======================================"
        WRITE(*,'(A,I4)') "BENCH:#MPI processes per node         :", 
     &                     MPI_PER_NODE
        WRITE(*,'(A,I4)') "BENCH:#OpenMP threads per MPI process :", 
     &                     NOMP
        WRITE(*,'(A,I4)') "BENCH:#MPI processes per communicator :",
     &                     MPI_PER_COMM
        WRITE(*,'(A,I4)') "BENCH:NB_COMM_MAX (#communic. created):",
     &                     NB_COMM_MAX
        WRITE(*,'(A,I4)') "BENCH:shmem communications ratio      :", K
        WRITE(*,'(A,I4)') "BENCH:NB_ITERATIONS                   :",
     &                     NB_ITERATIONS
        WRITE(*,'(A)')    "BENCH:======================================"
        WRITE(*,*)
C       MPI process with rank k is in communicator COLOR(k)
        WRITE(*,'(A,300I5)')
     &  "BENCH: COLOR array defining communicators:", COLOR
        IF ( NB_COMM_MAX * K .NE. MPI_PER_NODE ) THEN
          WRITE(*,*) "BENCH: Note: ", MPI_PER_NODE/NB_COMM_MAX,
     &               " MPI ranks per node unused"
        ENDIF
      ENDIF

      CALL MPI_COMM_SIZE( COMM, NPROCS_COMM, IERR )
      CALL MPI_COMM_RANK( COMM, MYID_COMM, IERR )
C
      IF (MPI_PER_COMM .NE. NPROCS_COMM) THEN
        WRITE(*,'(A,I4,A,I4)')
     &  "BENCH: ERROR IN BUILD_COMMUNICATORS: MPI_PER_COMM =",
     &  MPI_PER_COMM, "  but MPI_COMM_SIZE returns",NPROCS_COMM
        WRITE(*,'(A,I4)') "BENCH: ** Aborting on MPI rank ",
     &  MYID_WORLD
        CALL MPI_ABORT(MPI_COMM_WORLD, IERR)
      ENDIF


C     ---------------------------------------------
C     Initialize a MUMPS instance on all the MPI
C     processes that are part of a new communicator
C     ---------------------------------------------
      MY_COMM_COLOR = COLOR(MYID_WORLD)
      IF ( MY_COMM_COLOR .LT. NB_COMM_MAX ) THEN
C        NB_COMM_MAX = 1 => communicator 0 (processes with color 0)
C        NB_COMM_MAX = 2 => communicators 0,1 (processes with colors 0,1)
C        NB_COMM_MAX = 3 => communicators 0,1,2, etc.
         id%JOB  = -1
         id%SYM  =  0 
         id%CNTL(1) = 0.0D0
         id%PAR  =  1
         id%COMM = COMM
         CALL DMUMPS(id)
C        ---------------------------------------
C        Initialize problem and perform analysis
C        ---------------------------------------
         id%JOB = 1
         IF ( MYID_COMM .EQ. 0 ) THEN
C          Matrix is initialized on each master
           CALL ALLOCATE_INSTANCE( id, NX, NY, NZ, NRHS, MYID_WORLD )
         ENDIF
         IF (MYID_WORLD .EQ. 0) THEN
           WRITE(*,*)
           WRITE(*,'(A)')    " ===================================="
           WRITE(*,'(A)')    " 11pt discr. pb characteristics      "
           WRITE(*,'(A, I9)')"   NX     =", NX
           WRITE(*,'(A, I9)')"   NY     =", NY
           WRITE(*,'(A, I9)')"   NZ     =", NZ
           WRITE(*,'(A, I9)')"   NRHS   =", NRHS
           WRITE(*,'(A, I9)')"   id%N   =", id%N
           WRITE(*,'(A, I9)')"   id%NNZ =", id%NNZ
           WRITE(*,'(A)')    " ===================================="
           WRITE(*,*)
         ENDIF
         id%ICNTL(7)=5 ! Metis ordering should be used
         id%KEEP(3)=32
         id%KEEP(6)=16
         id%KEEP(5)=16
         id%KEEP(4)=32
         if (id%SYM .EQ. 0) THEN
           id%KEEP(9)=350
         else
           id%KEEP(9)=200
         endif
         id%KEEP(102)=110
         id%KEEP(375)=1
         id%KEEP(77)=75
         id%KEEP(78)=2
         id%KEEP(83)=min(8,NPROCS_COMM)
         id%KEEP(68)=25
         id%KEEP(263)=0
         id%KEEP(213) = 101
         id%KEEP(85)=-4
         id%KEEP(376)=1
         IF (MYID_WORLD.EQ.0) THEN
           WRITE(*,*) " CONFIGURATION: MUMPS internal SETTING:"
           DO I=1, 500
             WRITE(*,*) " ... KEEP(", I, ")=", id%KEEP(I)
           ENDDO
         ENDIF
         CALL DMUMPS(id)
         IF (id%INFOG(1) .LT. 0) THEN
           WRITE(*,'(A,I4,I4)') 
     &         "BENCH: ERROR DURING MUMPS ANALYSIS, INFOG(1:2)=", 
     &          id%INFOG(1:2)
           WRITE(*,'(A,I4)') "BENCH: ** Aborting on MPI rank ",
     &          MYID_WORLD
           CALL MPI_ABORT(MPI_COMM_WORLD, IERR)
         ENDIF
      ENDIF
C
      DO IT= 1, NB_ITERATIONS
C       For each iteration, perform a factorization and possibly a solve
C       Number of communicators is limited by MPI_PER_NODE/K=NB_COMM_MAX
        IF (MYID_WORLD.EQ. 0) THEN
          NB_COMM_IT = min(NB_INSTANCES(IT), NB_COMM_MAX)
          CHECK_IT = CHECK(IT)
          WRITE(*,*)
          WRITE(*,'(A)') "BENCH:"
          WRITE(*,'(A)') "BENCH:"
          WRITE(*,'(A)') "BENCH:======================================"
          WRITE(*,'(A,I3, A,I3)') 
     &           "BENCH: Starting it.", IT,
     &           " with #simultaneous instances= ",
     &            NB_COMM_IT
          WRITE(*,'(A,I1)') "BENCH: CHECK = ",CHECK_IT
          WRITE(*,'(A)') "BENCH:======================================"
          WRITE(*,*)
          WRITE(*,*)
        ENDIF
        CALL MPI_BCAST( NB_COMM_IT, 1, MPI_INTEGER, 0,
     &                  MPI_COMM_WORLD, IERR)
        CALL MPI_BCAST( CHECK_IT, 1, MPI_INTEGER, 0,
     &                  MPI_COMM_WORLD, IERR)
        IF (CHECK_IT.EQ.0) THEN
         id%KEEP(201)=-1
        ELSE
         id%KEEP(201)=0
        ENDIF

        CALL SYSTEM_CLOCK( t_start )
        IF ( MY_COMM_COLOR .LT. NB_COMM_IT ) THEN
          WRITE(*,'(A,I4,A,I4)') "BENCH:",MYID_WORLD,
     &    " participates to it.",IT
C         ---------------------------
C         JOB=2: MATRIX FACTORIZATION
C         ---------------------------
          id%JOB=2
          ! id%ICNTL(1:4)=0
          CALL DMUMPS(id)
C
          IF (MYID_WORLD .EQ. 0 .AND. id%ICNTL(35).GT.0) THEN
C           - print BLR flops
            WRITE(*,'(A,ES10.3,ES10.3)') " ** RINFOG(3)(14)= ",
     &            id%RINFOG(3),id%RINFOG(14)
          ENDIF
C
          IF (id%INFOG(1) .LT. 0) THEN
            WRITE(*,'(A,I4,I4)') 
     &       "BENCH: ERROR DURING MUMPS FACTORIZATION: INFOG(1:2)=",
     &       id%INFOG(1:2)
            WRITE(*,'(A,I4)') "BENCH: ** Aborting on MPI rank ",
     &       MYID_WORLD
            CALL MPI_ABORT(MPI_COMM_WORLD, IERR)
          ENDIF
          CALL SYSTEM_CLOCK(t_end, t_rate)
          IF (MYID_COMM .EQ. 0) THEN
C           Print for each instance
            WRITE(*,'(A,I3,A,I3,A,I3,A,F10.3)')
     &      "BENCH: TIME (seconds) factorization for",
     &      MYID_WORLD," (COMM_COLOR ", MY_COMM_COLOR,
     &      ", it.",IT,") = ",
     &      dble( t_end - t_start ) / dble( t_rate )
          ENDIF
        ENDIF
C
C       Synchronization between iterations
C
        CALL MPI_BARRIER(MPI_COMM_WORLD, IERR)
        IF (MYID_WORLD .EQ. 0) THEN
          CALL SYSTEM_CLOCK(t_end, t_rate)
          WRITE(*,'(A)') "BENCH:"
          WRITE(*,'(A)') "BENCH:"
          WRITE(*,'(A)') "BENCH: ==================================== "
          WRITE(*,'(A,I3,A,I3,A,F10.3)')
     &          "BENCH: TIME (seconds) FOR ITERATION ",
     &           IT,
     &          " with #simultaneous instances=",
     &            min(NB_INSTANCES(IT), NB_COMM_MAX ),
     &          "=", dble( t_end - t_start ) / dble(t_rate)
          WRITE(*,'(A)') " ==================================== "
          WRITE(*,'(A)') "BENCH:"
          WRITE(*,'(A)') "BENCH:"
        ENDIF

        IF (CHECK_IT .EQ. 1 ) THEN
          IF ( MY_COMM_COLOR .LT. NB_COMM_IT ) THEN
C             ---------------
C             Perform a solve
C             ---------------
              IF ( MYID_COMM .EQ. 0 ) THEN
                CALL RANDOM_NUMBER(id%RHS)
              ENDIF
              id%ICNTL(11)  = 2
              !id%ICNTL(1:4)=0
              id%JOB=3
              CALL DMUMPS(id)
              IF (MYID_COMM .EQ. 0) THEN
                WRITE(*,'(A)') "BENCH:"
                WRITE(*,'(A,ES10.3,A,I4,A,I4,A)') 
     &              "BENCH: Scaled Residual (<1D-12): ", id%RINFOG(6),
     &           " (COMM_COLOR ", MY_COMM_COLOR, ", it.",IT,")"

                IF (id%RINFOG(6).GE.1D-12) THEN
                 WRITE(*,'(A)') "BENCH:"
                 WRITE(*,'(A)') 
     &            "BENCH: ERROR RESIDUAL (RINFOG(6)) TOO LARGE "
                 WRITE(*,'(A)') "BENCH:"
                ENDIF
                IF (FLOPS_REFERENCE .NE. 0.0D0) THEN
                  WRITE(*,'(A,ES16.5,A,I4,A,I4,A)')
     &            "BENCH: FLOPS_REFERENCE DIFFERENCE=",
     &                       FLOPS_REFERENCE - id%RINFOG(3),
     &            " (COMM_COLOR ", MY_COMM_COLOR, ", it.",IT,")"
                  IF (abs(FLOPS_REFERENCE - id%RINFOG(3)).GT.1.0D0) 
     &            THEN
                    WRITE(*,'(A)') 
     &      "BENCH: ERROR FLOPS_REFERENCE DIFFERENCE is NONZERO"
                  ENDIF
                ELSE
C                 Set flops reference
                  WRITE(*,'(A,I3,A,ES16.5,A,I4,A,I4,A)')
     &            "BENCH:",MYID_WORLD,
     &            " setting FLOPS_REFERENCE =", id%RINFOG(3),
     &           " (COMM_COLOR ", MY_COMM_COLOR, ", it.",IT,")"
                  FLOPS_REFERENCE = id%RINFOG(3)
                ENDIF
              ENDIF
          ENDIF
C         This instance was checked, use FLOPS_REFERENCE
C         from process with rank 0 on all processes
          CALL MPI_BCAST( FLOPS_REFERENCE, 1, MPI_DOUBLE_PRECISION, 0,
     &                  MPI_COMM_WORLD, IERR)
        ELSE
          IF ( MY_COMM_COLOR .LT. NB_COMM_IT ) THEN
            IF (MYID_COMM .EQ. 0) THEN
              IF ( FLOPS_REFERENCE .NE. 0.0D0 ) THEN
                WRITE(*,'(A,ES16.5,A,I4,A,I4,A)')
     &          "BENCH: FLOPS_REFERENCE DIFFERENCE=",
     &                      FLOPS_REFERENCE - id%RINFOG(3),
     &          " (COMM_COLOR ", MY_COMM_COLOR, ", it.",IT,")"
                IF (abs(FLOPS_REFERENCE - id%RINFOG(3)).GT.1.0D0) 
     &          THEN
                  WRITE(*,'(A)') 
     &        "BENCH: ERROR FLOPS_REFERENCE DIFFERENCE is NONZERO"
                ENDIF
              ENDIF
            ENDIF
          ENDIF
        ENDIF
      ENDDO
C
      IF (MYID_WORLD .EQ. 0) THEN
          WRITE(*,'(A)') "BENCH:"
          WRITE(*,'(A)') "BENCH:"
          WRITE(*,'(A)') "BENCH: ====================================="
          WRITE(*,'(A)') "BENCH: All iterations have finished.        "
          WRITE(*,'(A)') "BENCH: Destroying instances and finalizing  "
          WRITE(*,'(A)') "BENCH: ====================================="
      ENDIF
C
C     ---------------------------
C     Destroy all MUMPS instances
C     ---------------------------
      IF ( COLOR(MYID_WORLD) .LT. NB_COMM_MAX ) THEN
        id%JOB=-2
        CALL DMUMPS(id)
        IF (MYID_COMM .EQ. 0) THEN
C         Free matrix and right-hand side
          DEALLOCATE(id%IRN, id%JCN, id%A, id%RHS)
          NULLIFY(id%IRN, id%JCN, id%A, id%RHS)
        ENDIF
      ENDIF
      IF (MYID_WORLD .EQ. 0) THEN
        DEALLOCATE(NB_INSTANCES)
        DEALLOCATE(CHECK)
      ENDIF
      CALL MPI_FINALIZE(IERR)
      END PROGRAM MUMPS_BENCH
C
C      ----------------------------------------------------------------
C
      SUBROUTINE BUILD_COMMUNICATORS( MYNAME, MYNAME_LENGTH,
     &                                OLD_COMM, NEW_COMM,
     &                                K, MYID, NPROCS,
     &                                MPI_PER_NODE,
     &                                MPI_PER_COMM,
     &                                NB_COMMUNICATORS, COLOR )
      IMPLICIT NONE
      INCLUDE 'mpif.h'
      INTEGER, INTENT(IN)  :: OLD_COMM, MYID, NPROCS, K
      INTEGER, INTENT(OUT) :: NEW_COMM
      INTEGER, INTENT(OUT) :: MPI_PER_NODE,
     &                        MPI_PER_COMM,
     &                        NB_COMMUNICATORS
      INTEGER, INTENT(OUT) :: COLOR(0:NPROCS-1)
      CHARACTER(len=MPI_MAX_PROCESSOR_NAME), INTENT(IN) :: MYNAME
      INTEGER, INTENT(IN) :: MYNAME_LENGTH
C
C     Local declarations
C     ==================
C
      INTEGER :: IERR
C
      INTEGER :: NB_ON_THIS_NODE, MAX_ON_A_NODE, MIN_ON_A_NODE, NB_NODES
      INTEGER :: CURRENT_INODE_REP
      INTEGER :: IPROC ! for loops between 0 and NPROCS-1
      INTEGER, ALLOCATABLE, DIMENSION(:) :: IS_ON_SAME_NODE
      INTEGER, ALLOCATABLE, DIMENSION(:) :: TABLE_OF_PROCESSES
C
      ALLOCATE(IS_ON_SAME_NODE(0:NPROCS-1))
      CALL SET_IS_ON_SAME_NODE( MYNAME, MYNAME_LENGTH,
     &                          MYID, NPROCS, IS_ON_SAME_NODE,
     &                          MPI_COMM_WORLD )
C     On exit, IS_ON_SAME_NODE is an array of size NPROCS,
C     identical on all MPI ranks. We illustrate its content
C     in the example below, with NPROCS=12.
C     Assuming we have:
C               MPI rank    :  0  1  2  3  4  5  6  7  8  9 10 11
C               node name   :  A  D  B  B  B  A  C  D  D  C  A  C
C     Then, IS_ON_SAME_NODE =  0  1  2  2  2  0  6  1  1  6  0  6
C     It is such that IS_ON_SAME_NODE(i)=IS_ON_SAME_NODE(j)=k if i,
C     j and k are on the same node, and k is the smallest rank
C     among the MPI processes present on that node.
C
      ALLOCATE(TABLE_OF_PROCESSES(0:NPROCS-1))
      DO IPROC = 0, NPROCS-1
        TABLE_OF_PROCESSES(IPROC)=IPROC
      ENDDO
      CALL BUBBLE_SORT_INT( NPROCS, IS_ON_SAME_NODE(0), 
     &                     TABLE_OF_PROCESSES(0) )
C
C               MPI rank    :  0  1  2  3  4  5  6  7  8  9 10 11
C               node name   :  A  D  B  B  B  A  C  D  D  C  A  C
C
C     corresponds to:
C
C     TABLE_OF_PROCESSES    =  0  5 10  1  7  8  2  3  4  6  9 11
C               node name   :  A  A  A  D  D  D  B  B  B  C  C  C
C           IS_ON_SAME_NODE =  0  0  0  1  1  1  2  2  2  6  6  6
C
C         COLOR (see below) =  0  1  2  0  1  2  0  1  2  0  1  2
C         This means: COLOR(0)=0, COLOR(5)=1, COLOR(10)=2,
C                     COLOR(1)=0, COLOR(7)=1, etc.
C
C     Check Number of MPI per node
C
      CURRENT_INODE_REP = -99999
      MAX_ON_A_NODE     = -99999
      MIN_ON_A_NODE     = 99999
      NB_ON_THIS_NODE   = -99999
C     IS_ON_SAME_NODE and TABLE_OF_PROCESSES are now sorted so
C     that entries corresponding to a given node are contiguous
      DO IPROC = 0, NPROCS-1
        IF ( CURRENT_INODE_REP .NE. IS_ON_SAME_NODE(IPROC) ) THEN
C         Beginning of a new node
          CURRENT_INODE_REP = IS_ON_SAME_NODE(IPROC)
          NB_ON_THIS_NODE = 1
        ELSE
          NB_ON_THIS_NODE = NB_ON_THIS_NODE + 1
        ENDIF
        IF (IPROC .EQ. NPROCS-1) THEN
C           Last IPROC, NB_ON_THIS_NODE is valid
            MIN_ON_A_NODE = min( MIN_ON_A_NODE, NB_ON_THIS_NODE )
            MAX_ON_A_NODE = max( MAX_ON_A_NODE, NB_ON_THIS_NODE )
        ELSE IF ( CURRENT_INODE_REP .NE. IS_ON_SAME_NODE(IPROC+1) ) THEN
C           Last IPROC for that node, NB_ON_THIS_NODE is valid
            MIN_ON_A_NODE = min( MIN_ON_A_NODE, NB_ON_THIS_NODE )
            MAX_ON_A_NODE = max( MAX_ON_A_NODE, NB_ON_THIS_NODE )
        ENDIF
C       Define COLOR entry
C       Processes with same color will form communicators:
C       1st on node i with 1st on node j and 1st on node k,
C       2nd on node i with 2nd on node j, etc.
        COLOR(TABLE_OF_PROCESSES(IPROC)) = NB_ON_THIS_NODE - 1
      ENDDO
C
      IF (MAX_ON_A_NODE.NE. MIN_ON_A_NODE) THEN
        IF (MYID .EQ. 0) THEN
          WRITE(*,'(A,A,I3,A,I3)')
     &    "BENCH: CONFIGURATION ERROR:", 
     &    "#MPI per node not constant: #MPI min=",
     &    MIN_ON_A_NODE, " and #MPI max=", MAX_ON_A_NODE
          WRITE(*,'(A,I4)') "BENCH: ** Aborting on MPI rank ",
     &    MYID
          CALL MPI_ABORT(MPI_COMM_WORLD, IERR)
        ENDIF
      ENDIF

      MPI_PER_NODE = MIN_ON_A_NODE
      NB_COMMUNICATORS = MPI_PER_NODE / K

      NB_NODES = NPROCS / MPI_PER_NODE ! should be exact
      MPI_PER_COMM = K * NB_NODES ! output argument for checking

C
C     Depending on the factor K, we create as many communicators
C     as possible, each of size K * NB_NODES.
C     With the 12-process example above (4 nodes, 3 MPI per node):
C     K = 1 => 3 communicators of size 4
C     K = 2 => 1 communicator of size 8, 1 MPI process per node unused
C     K = 3 => 1 communicator of size 12
C
C     ---------------------------------------------
C     Create the inter-node communicators:
      CALL MPI_COMM_SPLIT( OLD_COMM, 
     &                     COLOR(MYID)/K,  ! Color
     &                     COLOR(MYID), ! Key
     &                     NEW_COMM,
     &                     IERR )
C     In the call above, COLOR(MYID)/K is the color used to create
C     the communicators (same color => same communicator), and
C     COLOR(MYID) is used as a key to sort MPI ranks within
C     each communicator.
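C
C     As an illustration (a sketch based on the 12-process example
C     above, assuming K=2):
C       MPI rank      :  0  1  2  3  4  5  6  7  8  9 10 11
C       COLOR(rank)   :  0  0  0  1  2  1  0  1  2  1  2  2
C       COLOR(rank)/K :  0  0  0  0  1  0  0  0  1  0  1  1
C     MPI_COMM_SPLIT then creates one communicator of size 8
C     (split color 0, i.e. K*NB_NODES ranks) and one of size 4
C     (split color 1). Since NB_COMMUNICATORS = 3/2 = 1, the ranks
C     of the second communicator are simply not used.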
C
C     Adjust the COLOR output array: on exit it must be divided
C     by K so that COLOR(i) gives the communicator index of rank i.
      COLOR = COLOR / K

      RETURN
      END SUBROUTINE BUILD_COMMUNICATORS
C
C      ----------------------------------------------------------------
C
      SUBROUTINE SET_IS_ON_SAME_NODE( MYNAME, MYNAME_LENGTH,
     &                                MYID, NPROCS,
     &                                IS_ON_SAME_NODE, COMM )
      IMPLICIT NONE
      INCLUDE 'mpif.h'
C
C     Given a communicator COMM, build IS_ON_SAME_NODE on all
C     MPI processes, where on exit, IS_ON_SAME_NODE( i ) = j
C     means that MPI rank i is on the same node as MPI rank j,
C     where j is the smallest MPI rank of the processes present
C     on the node.
C
      CHARACTER(len=MPI_MAX_PROCESSOR_NAME), INTENT(IN) :: MYNAME
      INTEGER, INTENT(IN) :: MYNAME_LENGTH
      INTEGER, INTENT(IN) :: MYID, NPROCS, COMM
      INTEGER, INTENT(OUT):: IS_ON_SAME_NODE(0:NPROCS-1)
C
      INTEGER :: IERR
C
      CHARACTER(len=MPI_MAX_PROCESSOR_NAME) :: TMPNAME
      INTEGER :: TMPNAME_LENGTH
      INTEGER :: IPROC, IPROC_MIN
      DO IPROC = 0, NPROCS - 1
        IF ( IPROC .EQ. MYID ) THEN
          TMPNAME_LENGTH = MYNAME_LENGTH
          TMPNAME = MYNAME
        ENDIF
        CALL MPI_BCAST( TMPNAME_LENGTH, 1, MPI_INTEGER, IPROC,
     &                  COMM, IERR)
        CALL MPI_BCAST( TMPNAME, TMPNAME_LENGTH, MPI_CHARACTER, IPROC,
     &                  COMM, IERR)
        IS_ON_SAME_NODE( IPROC ) = 0
C       The test below includes the local process itself, so that
C       IS_ON_SAME_NODE(MYID) is set to 1
        IF ( TMPNAME_LENGTH .EQ. MYNAME_LENGTH ) THEN
          IF ( TMPNAME(1:TMPNAME_LENGTH) .EQ. MYNAME(1:MYNAME_LENGTH) )
     &    THEN
            IS_ON_SAME_NODE(IPROC) = 1
          ENDIF
        ENDIF
      ENDDO
      
C     For each node, define a representative
C     (one of the MPI ranks on that node).
C     We choose IPROC_MIN, i.e., the one with the smallest rank.
      DO IPROC = 0, NPROCS - 1
        IF (IS_ON_SAME_NODE(IPROC) .EQ. 1) THEN
          IPROC_MIN = IPROC
          EXIT
        ENDIF
      ENDDO
      DO IPROC = 0, NPROCS - 1
        IF (IS_ON_SAME_NODE(IPROC) .EQ. 1) THEN
          IS_ON_SAME_NODE(IPROC) = IPROC_MIN
        ENDIF
      ENDDO
      CALL MPI_ALLREDUCE( MPI_IN_PLACE, IS_ON_SAME_NODE(0), NPROCS,
     &                    MPI_INTEGER, MPI_MAX, COMM, IERR )
      RETURN
      END SUBROUTINE SET_IS_ON_SAME_NODE
C
C      ----------------------------------------------------------------
C
      SUBROUTINE ALLOCATE_INSTANCE( id, N1, N2, N3, NRHS, MYID_WORLD )
      INCLUDE 'dmumps_struc.h'
      INTEGER, INTENT(in) :: N1, N2, N3, NRHS, MYID_WORLD
C
C     Prepares a MUMPS instance with user parameters on a
C     master process. Assumes that id%SYM has been set.
C
      INCLUDE 'mpif.h'
      INTEGER :: IERR, I
      TYPE (DMUMPS_STRUC) :: id
      INTEGER :: allocok
      INTEGER :: NMAX
      INTEGER(8) :: NNZMAX
      INTEGER(8) :: NNZ, j8, i8
      INTEGER(8), ALLOCATABLE, DIMENSION(:) :: IP
      INTEGER, POINTER, DIMENSION(:) :: IRN, JCN
      DOUBLE PRECISION, POINTER, DIMENSION(:) :: A
C
      CALL PT11AD_ANA(N1,N2,N3,id%N,id%NNZ,.TRUE.)
      NMAX    = id%N
      NNZMAX  = id%NNZ
      id%NRHS = NRHS
      id%LRHS = id%N
      ALLOCATE(id%IRN(id%NNZ),id%JCN(id%NNZ),id%A(id%NNZ),
     & IP(id%N+1),id%RHS(id%N*id%NRHS), stat=allocok)
      IF (allocok > 0 ) THEN
        WRITE(*,'(A)') "BENCH: ERROR: failure allocating matrix",
     &             allocok
        WRITE(*,'(A,I4)') "BENCH: ** Aborting on MPI rank ",
     &  MYID_WORLD
        CALL MPI_ABORT(MPI_COMM_WORLD, IERR)
      ENDIF
      CALL PT11AD(NMAX, NNZMAX, IERR, N1, N2, N3, id%N, id%NNZ, IP,
     & id%IRN(1), id%A(1), 6, .TRUE., 1, .FALSE.)
C     Convert from compressed (row pointer) to coordinate format
!$OMP PARALLEL DO
      DO I = 1, id%N
        id%JCN(IP(I):IP(I+1)-1) = I
      ENDDO
!$OMP END PARALLEL DO
      DEALLOCATE(IP)

C     Case of symmetric matrix: update matrix to keep
C     only lower triangular part + diagonal
      IF ((id%SYM.EQ.1).OR.(id%SYM.EQ.2)) THEN
C{
         NNZ=(id%NNZ-id%N)/2+id%N
         ALLOCATE(IRN(NNZ), JCN(NNZ), A(NNZ), stat=allocok)
         IF (allocok > 0 ) THEN
           WRITE(*,'(A,I3)')
     &             "BENCH: ERROR: failure allocating sym. matrix",
     &                allocok
           WRITE(*,'(A,I4)') "BENCH: ** Aborting on MPI rank ",
     &     MYID_WORLD
           CALL MPI_ABORT(MPI_COMM_WORLD, IERR)
         ENDIF
         j8=1
         DO i8=1,id%NNZ
C           keep only the lower triangular part (incl. diagonal)
            if(id%IRN(i8).GE.id%JCN(i8)) then
               IRN(j8)=id%IRN(i8)
               JCN(j8)=id%JCN(i8)
               A(j8)=id%A(i8)
               j8=j8+1
            endif
         ENDDO
         IF (j8-1.ne.NNZ) THEN
            WRITE(6,'(A,I8,I8)') 
     &      "BENCH: ERROR / internal error NNZ, j8-1=",
     &      NNZ, j8-1
            WRITE(*,'(A,I4)') "BENCH: ** Aborting on MPI rank ",
     &      MYID_WORLD
            CALL MPI_ABORT(MPI_COMM_WORLD, IERR) 
         ENDIF
         id%NNZ = NNZ
         DEALLOCATE(id%IRN, id%JCN, id%A)
         id%IRN=>IRN
         id%JCN=>JCN
         id%A=>A
      ENDIF


      


      RETURN
      END SUBROUTINE ALLOCATE_INSTANCE
C
C      ----------------------------------------------------------------
C
      SUBROUTINE PT11AD( NMAX, NZMAX, IFLAG, 
     &     Nx, Ny, Nz,
     &     N, NNZ, IRST, JCN, A, LP, YESA, IPROB,GEN_VERBOSE )
*     
*     
*     Produces a positive-definite matrix from an eleven-point
*     discretization on an Nx by Ny by Nz grid,
*     with a nine-point discretization in the X/Y directions.
*     The matrix A is generated BY ROWS.
*     If YESA is .FALSE. only the structure is generated.
*     
*     Order of A    = Nx*Ny*Nz
*     Nonzeros in A = (3*Nx-2)*(3*Ny-2)*Nz + Nx*Ny*(2*Nz-2)
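*     As a worked example (values derived from the two formulas
*     above, for the NX=80, NY=80, NZ=500 grid of the sample input
*     file shown in the main program header):
*       Order of A    = 80*80*500               = 3200000
*       Nonzeros in A = 238*238*500 + 80*80*998 = 34709200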
*     
*     Input Parameters (not modified in PT11AD)
*     ----------------
*     NMAX, NZMAX  INTEGER variables that need to be set,
*     respectively, to the maximum order and
*     the maximum number of nonzeros of the output matrix
*     Nx, Ny, Nz   INTEGER variables that need to be set to the 
*     dimensions of the grid. 
*     LP           INTEGER variable, Output unit for printing.
*     YESA         LOGICAL variable. 
*     If YESA is .FALSE. only the structure of the output 
*     matrix is generated
*     IPROB        INTEGER variable that needs to be set on entry. 
*     IF IPROB=1 then 3D-Poisson operator with 11-pt discretization
*     stored in XYZ order is generated.
*     ELSE ZYX storage is used.
*     
*     
*     Output Parameters (need not be set on entry)
*     -----------------
*     IFLAG       INTEGER variable that is set 
*     to the Error return (0 if no problem detected)
*     N, NNZ      INTEGER variables.
*     N is the order of the output matrix, NNZ is the 
*     number of nonzeros.
*     IRST, JCN, A: Output matrix in HB format (by rows).
*     
      INTEGER   NMAX,IFLAG
      INTEGER   Nx, Ny, Nz
      INTEGER   N, IPROB, LP
      INTEGER(8) :: NNZ,NZMAX, NTEMP_8,IPOS,IFLAG_8
      INTEGER(8) IRST(NMAX+1) 
      INTEGER JCN(NZMAX)
      DOUBLE PRECISION      A(NZMAX)
      LOGICAL   YESA
*     
      INTEGER   I, J, K, IncI, IncJ, IncK,
     &     IRow, IPT, JPT, KPT
      INTEGER   NTEMP
*     
      DOUBLE PRECISION ONE
#if 1
      PARAMETER( ONE = 1.0D0 )
#else
      PARAMETER( ONE = (1.0D0,0.0D0) )
#endif
      LOGICAL,intent(in) :: GEN_VERBOSE
*     
*     
*     check size of arrays
      IFLAG = 0
      NTEMP = Nx * Ny * Nz
      IF (NMAX .LT. NTEMP) THEN 
         IFLAG = - Nx * Ny * Nz
         IF (LP.GT.0) WRITE(LP,'(A,I10)') 'Increase NMAX to', NTEMP
         GOTO 500
      ENDIF
      NTEMP_8 = (3*int(Nx,8)-2)*(3*int(Ny,8)-2)*int(Nz,8)
     &     + int(Nx,8)*int(Ny,8)*(2*int(Nz,8)-2)
      IF (NZMAX .LT. NTEMP_8) THEN 
         IFLAG_8 =-NTEMP_8
         IF (LP.GT.0) WRITE(LP,'(A,I10)') 'Increase NZMAX to', NTEMP_8
         GOTO 500
      ENDIF
*     
*     
      IF (IPROB.EQ.1) THEN
         if(GEN_VERBOSE) then
            WRITE( 6, 1100 )
         endif
 1100       FORMAT(' 3D-Poisson operator with 11-point discretization',
     &           ' --- Stored in XYZ order' )
*     
         N = Nx * Ny * Nz
         IPOS=1
         IncI = 1
         IncJ = Nx
         IncK = Nx * Ny
*     
         DO 110 K = 1, Nz
            KPT = (K-1) * IncK
*     
            DO 111 J = 1, Ny
               JPT = (J-1) * IncJ
*     
               DO 112 I = 1, Nx
                  IRow = KPT + JPT + I
                  IRST(IRow) = IPOS
*     
                  IF (K.EQ.1) GOTO 120
                  IF (YESA) A(IPOS) = -2*ONE
                  JCN(IPOS) = IRow - IncK
                  IPOS = IPOS + 1
*     
 120              IF (J.EQ.1) GOTO 130
                  IF (I.GT.1) THEN
                     IF (YESA) A(IPOS) = -2*ONE
                     JCN(IPOS) = IRow - IncJ - IncI
                     IPOS = IPOS + 1
                  ENDIF
                  IF (YESA) A(IPOS) = -2*ONE
                  JCN(IPOS) = IRow - IncJ
                  IPOS = IPOS + 1
                  IF (I.LT.Nx) THEN
                     IF (YESA) A(IPOS) = -2*ONE
                     JCN(IPOS) = IRow - IncJ + IncI
                     IPOS = IPOS + 1
                  ENDIF
*     
 130              IF (I.EQ.1) GOTO 140
                  IF (YESA) A(IPOS) = -2*ONE
                  JCN(IPOS) = IRow - IncI
                  IPOS = IPOS + 1
*     
 140              CONTINUE
#if 1
                  IF (YESA) A(IPOS) = 22.0D0
#else
                  IF (YESA) A(IPOS) = (22.0D0,22.0D0)
#endif
                  JCN(IPOS) = IRow
                  IPOS = IPOS + 1
*     
                  IF (I.EQ.Nx) GO TO 150
                  IF (YESA) A(IPOS) = -2*ONE
                  JCN(IPOS) = IRow + IncI
                  IPOS = IPOS + 1
*     
 150              IF (J.EQ.Ny) GO TO 160
                  IF (I.GT.1) THEN
                     IF (YESA) A(IPOS) = -2*ONE
                     JCN(IPOS) = IRow + IncJ - IncI
                     IPOS = IPOS + 1
                  ENDIF
                  IF (YESA) A(IPOS) = -2*ONE
                  JCN(IPOS) = IRow + IncJ
                  IPOS = IPOS + 1
                  IF (I.LT.Nx) THEN
                     IF (YESA) A(IPOS) = -2*ONE
                     JCN(IPOS) = IRow + IncJ + IncI
                     IPOS = IPOS + 1
                  ENDIF
*     
 160              IF (K.EQ.Nz) GO TO 112
                  IF (YESA) A(IPOS) = -2*ONE
                  JCN(IPOS) = IRow + IncK
                  IPOS = IPOS + 1
*     
 112           CONTINUE
 111        CONTINUE
 110     CONTINUE
*     
         NNZ = IPOS - 1
         IRST(N+1) = NNZ + 1
      ENDIF
*     
*     
*     
*     
*     
      IF (IPROB.NE.1) THEN
         if(GEN_VERBOSE) then
            WRITE( 6, 1200 )
         endif
 1200       FORMAT(' 3D-Poisson operator with 11-point discretization',
     &           ' --- Stored in ZYX order' )
*     
         N = Nx * Ny * Nz
         IPOS=1
         IncK = 1
         IncJ = Nz
         IncI = Nz * Ny
*     
         DO 210 I = 1, Nx
            IPT = (I-1) * IncI
*     
            DO 211 J = 1, Ny
               JPT = (J-1) * IncJ
*     
               DO 212 K = 1, Nz
                  IRow = IPT + JPT + K
                  IRST(IRow) = IPOS
*     
                  IF (K.EQ.1) GOTO 220
                  IF (YESA) A(IPOS) = -2*ONE
                  JCN(IPOS) = IRow - IncK
                  IPOS = IPOS + 1
*     
 220              IF (J.EQ.1) GOTO 230
                  IF (I.GT.1) THEN
                     IF (YESA) A(IPOS) = -2*ONE
                     JCN(IPOS) = IRow - IncJ - IncI
                     IPOS = IPOS + 1
                  ENDIF
                  IF (YESA) A(IPOS) = -2*ONE
                  JCN(IPOS) = IRow - IncJ
                  IPOS = IPOS + 1
                  IF (I.LT.Nx) THEN
                     IF (YESA) A(IPOS) = -2*ONE
                     JCN(IPOS) = IRow - IncJ + IncI
                     IPOS = IPOS + 1
                  ENDIF
*     
 230              IF (I.EQ.1) GOTO 240
                  IF (YESA) A(IPOS) = -2*ONE
                  JCN(IPOS) = IRow - IncI
                  IPOS = IPOS + 1
*     
 240              CONTINUE
#if 1
                  IF (YESA) A(IPOS) = 22.0D0
#else
                  IF (YESA) A(IPOS) = (22.0D0,0.0D0)
#endif
                  JCN(IPOS) = IRow
                  IPOS = IPOS + 1
*     
                  IF (I.EQ.Nx) GO TO 250
                  IF (YESA) A(IPOS) = -2*ONE
                  JCN(IPOS) = IRow + IncI
                  IPOS = IPOS + 1
*     
 250              IF (J.EQ.Ny) GO TO 260
                  IF (I.GT.1) THEN
                     IF (YESA) A(IPOS) = -2*ONE
                     JCN(IPOS) = IRow + IncJ - IncI
                     IPOS = IPOS + 1
                  ENDIF
                  IF (YESA) A(IPOS) = -2*ONE
                  JCN(IPOS) = IRow + IncJ
                  IPOS = IPOS + 1
                  IF (I.LT.Nx) THEN
                     IF (YESA) A(IPOS) = -2*ONE
                     JCN(IPOS) = IRow + IncJ + IncI
                     IPOS = IPOS + 1
                  ENDIF
*     
 260              IF (K.EQ.Nz) GO TO 212
                  IF (YESA) A(IPOS) = -2*ONE
                  JCN(IPOS) = IRow + IncK
                  IPOS = IPOS + 1
*     
 212           CONTINUE
 211        CONTINUE
 210     CONTINUE
*     
         NNZ = IPOS - 1
         IRST(N+1) = NNZ + 1
      ENDIF
*     
*     
*     
      if(GEN_VERBOSE) then
         IF (LP.GT.0) WRITE(LP,60)  Nx, Ny, Nz
         IF (LP.GT.0) WRITE(LP,70)  N, NNZ
      endif
 60   FORMAT( ' NX= ',I5,'     NY= ',I5,'      NZ= ',I5)
 70   FORMAT(' ORDER OF MATRIX IS ',I10, 
     &     ' NUMBER OF NON-ZEROS IS ',I12)
*     
*     
*     
 500  RETURN
      END
C
C      ----------------------------------------------------------------
C
      SUBROUTINE PT11AD_ANA(N1, N2, N3, NNN, NNZ, GEN_VERBOSE)
C
      INTEGER, INTENT(IN) :: N1,N2,N3
      INTEGER, INTENT(OUT) :: NNN
      INTEGER(8), INTENT(OUT) :: NNZ
      LOGICAL,INTENT(IN) :: GEN_VERBOSE
C
      integer(8)::n1_8,n2_8,n3_8
C
      n1_8=int(N1,8)
      n2_8=int(N2,8)
      n3_8=int(N3,8)
      
      NNN=n1*n2*n3
      NNZ=(3*n1_8-2)*(3*n2_8-2)*n3_8+n1_8*n2_8*(2*n3_8-2)
      
      if(GEN_VERBOSE) then
         write(*,*) 'N = ', NNN
         write(*,*) 'NNZ = ',NNZ
      endif
      return
      END SUBROUTINE PT11AD_ANA
C
C      ----------------------------------------------------------------
C
      SUBROUTINE BUBBLE_SORT_INT( N, VAL, ID )
C     
C     Basic sort of VAL values, reporting changes to ID.
C     Not the most efficient possible sort, but only done once.
C     
      INTEGER N
      INTEGER ID( N )
      INTEGER VAL( N )
      INTEGER I, ISWAP
      INTEGER SWAP
      LOGICAL DONE
      DONE = .FALSE.
      DO WHILE ( .NOT. DONE )
        DONE = .TRUE.
        DO I = 1, N - 1
           IF ( VAL( I ) .GT. VAL( I + 1 ) ) THEN
              DONE = .FALSE.
              ISWAP = ID( I )
              ID ( I ) = ID ( I + 1 )
              ID ( I + 1 ) = ISWAP
              SWAP = VAL( I )
              VAL( I ) = VAL( I + 1 )
              VAL( I + 1 ) = SWAP
           END IF
        END DO
      END DO
      RETURN
      END SUBROUTINE BUBBLE_SORT_INT