c
c----------- Example to reproduce issue#46 ----------------------------------------------
c
      program issue46
      include 'mpif.h'
c     %-------------------------------%
c     | MPI INTERFACE                 |
c     | ILP64 is not supported by MPI |
c     | integer*4 must be imposed in  |
c     | all calls involving MPI.      |
c     |                               |
c     | Use ierr for MPI calls.       |
c     %-------------------------------%
      integer*4 ierr, comm, myid, nprocs, cnprocs, color
      call MPI_INIT( ierr )
      call MPI_COMM_RANK( MPI_COMM_WORLD, myid, ierr )
      call MPI_COMM_SIZE( MPI_COMM_WORLD, nprocs, ierr )
c     Create commuticator to run arnoldi only on the first CPU
      if(myid .lt. nprocs/2) then
      color = 1
      else
      color = 0
      endif
      write(*,*) "Hello", color
      call MPI_Comm_split(MPI_COMM_WORLD, color, myid, comm, ierr)
      call MPI_COMM_SIZE( comm, cnprocs, ierr )
      if(color .eq. 1) call parnoldi(comm)
      call MPI_Barrier(MPI_COMM_WORLD, ierr)
      comm = MPI_COMM_WORLD
      call parnoldi(comm)
      call MPI_FINALIZE(ierr)
      end


      subroutine parnoldi(comm)
      include 'mpif.h'
      include 'debug.h'
      include 'stat.h'

      integer*4         comm, myid, nprocs, rc, ierr
c
c     %-----------------------------%
c     | Define leading dimensions   |
c     | for all arrays.             |
c     | MAXN:   Maximum dimension   |
c     |         of the A allowed.   |
c     | MAXNEV: Maximum NEV allowed |
c     | MAXNCV: Maximum NCV allowed |
c     %-----------------------------%
c
      integer          maxnloc, maxnev, maxncv, ldv
      parameter       (maxnloc=256, maxnev=10, maxncv=25,
     &                 ldv=maxnloc )
c
c     %--------------%
c     | Local Arrays |
c     %--------------%
c
      Double precision
     &                 v(ldv,maxncv), workl(maxncv*(maxncv+8)),
     &                 workd(3*maxnloc), d(maxncv,2), resid(maxnloc),
     &                 ax(maxnloc)
      logical          select(maxncv)
      integer          iparam(11), ipntr(11)
c
c     %------------------------------------%
c     | Local Scalars                      |
c     |                                    |
c     | Use info if ILP64 can be supported |
c     | (call to BLAS, LAPACK, ARPACK).    |
c     %------------------------------------%
c
      character        bmat*1, which*2
      integer          ido, n, nev, ncv, lworkl, info, nloc, j,
     &                 nx, nconv, maxitr, mode, ishfts
      logical          rvec
      Double precision
     &                 tol, sigma
c
c     %----------------------------------------------%
c     | Local Buffers needed for MPI communication |
c     %----------------------------------------------%
c
      Double precision
     &                  mv_buf(maxnloc)
c
c     %------------%
c     | Parameters |
c     %------------%
c
      Double precision
     &                 zero
      parameter        (zero = 0.0)
c
c     %-----------------------------%
c     | BLAS & LAPACK routines used |
c     %-----------------------------%
c
      Double precision
     &                 pdnorm2
      external         pdnorm2, daxpy
c
c     %---------------------%
c     | Intrinsic Functions |
c     %---------------------%
c
      intrinsic         abs
c
c     %-----------------------%
c     | Executable Statements |
c     %-----------------------%
c
      call MPI_COMM_RANK( comm, myid, ierr )
      call MPI_COMM_SIZE( comm, nprocs, ierr )
c
      ndigit = -3
      logfil = 6
      msaupd = 1
c
c     %----------------------------------------------------%
c     | The number NX is the number of interior points     |
c     | in the discretization of the 2-dimensional         |
c     | Laplacian on the unit square with zero Dirichlet   |
c     | boundary condition.  The number N(=NX*NX) is the   |
c     | dimension of the matrix.  A standard eigenvalue    |
c     | problem is solved (BMAT = 'I'). NEV is the number  |
c     | of eigenvalues to be approximated.  The user can   |
c     | modify NEV, NCV, WHICH to solve problems of        |
c     | different sizes, and to get different parts of the |
c     | spectrum.  However, The following conditions must  |
c     | be satisfied:                                      |
c     |                   N <= MAXN,                       |
c     |                 NEV <= MAXNEV,                     |
c     |             NEV + 2 <= NCV <= MAXNCV               |
c     %----------------------------------------------------%
c
      nx = 10
      n = nx*nx
      nev =  4
      ncv =  20
c
c     %--------------------------------------%
c     | Set up distribution of data to nodes |
c     %--------------------------------------%
c
      nloc = (nx / nprocs)*nx
      if ( mod(nx, nprocs) .gt. myid ) nloc = nloc + nx
c
      if ( nloc .gt. maxnloc ) then
         print *, ' ERROR with _SDRV1: NLOC is greater than MAXNLOC '
         go to 9000
      else if ( nev .gt. maxnev ) then
         print *, ' ERROR with _SDRV1: NEV is greater than MAXNEV '
         go to 9000
      else if ( ncv .gt. maxncv ) then
         print *, ' ERROR with _SDRV1: NCV is greater than MAXNCV '
         go to 9000
      end if
      bmat = 'I'
      which = 'SM'
c
c     %--------------------------------------------------%
c     | The work array WORKL is used in PSSAUPD as       |
c     | workspace.  Its dimension LWORKL is set as       |
c     | illustrated below.  The parameter TOL determines |
c     | the stopping criterion.  If TOL<=0, machine      |
c     | precision is used.  The variable IDO is used for |
c     | reverse communication and is initially set to 0. |
c     | Setting INFO=0 indicates that a random vector is |
c     | generated in PSSAUPD to start the Arnoldi        |
c     | iteration.                                       |
c     %--------------------------------------------------%
c
      lworkl = ncv*(ncv+8)
      tol = zero
      info = 0
      ido = 0
c
c     %---------------------------------------------------%
c     | This program uses exact shifts with respect to    |
c     | the current Hessenberg matrix (IPARAM(1) = 1).    |
c     | IPARAM(3) specifies the maximum number of Arnoldi |
c     | iterations allowed.  Mode 1 of PSSAUPD is used    |
c     | (IPARAM(7) = 1).  All these options may be        |
c     | changed by the user. For details, see the         |
c     | documentation in PSSAUPD.                         |
c     %---------------------------------------------------%
c
      ishfts = 1
      maxitr = 300
      mode   = 1
c
      iparam(1) = ishfts
      iparam(3) = maxitr
      iparam(7) = mode
c
c     %-------------------------------------------%
c     | M A I N   L O O P (Reverse communication) |
c     %-------------------------------------------%
c
 10   continue
c
c        %---------------------------------------------%
c        | Repeatedly call the routine PSSAUPD and take|
c        | actions indicated by parameter IDO until    |
c        | either convergence is indicated or maxitr   |
c        | has been exceeded.                          |
c        %---------------------------------------------%
c
         call pdsaupd ( comm, ido, bmat, nloc, which, nev, tol, resid,
     &                 ncv, v, ldv, iparam, ipntr, workd, workl,
     &                 lworkl, info )
c
         if (ido .eq. -1 .or. ido .eq. 1) then
c
c           %--------------------------------------%
c           | Perform matrix vector multiplication |
c           |              y <--- OP*x             |
c           | The user should supply his/her own   |
c           | matrix vector multiplication routine |
c           | here that takes workd(ipntr(1)) as   |
c           | the input, and return the result to  |
c           | workd(ipntr(2)).                     |
c           %--------------------------------------%
c
            call av ( comm, nloc, nx, mv_buf,
     &               workd(ipntr(1)), workd(ipntr(2)))
c
c           %-----------------------------------------%
c           | L O O P   B A C K to call PSSAUPD again.|
c           %-----------------------------------------%
c
            go to 10
c
         end if
c
c     %----------------------------------------%
c     | Either we have convergence or there is |
c     | an error.                              |
c     %----------------------------------------%
c
      if ( info .lt. 0 ) then
c
c        %--------------------------%
c        | Error message. Check the |
c        | documentation in PSSAUPD.|
c        %--------------------------%
c
         if ( myid .eq. 0 ) then
            print *, ' '
            print *, ' Error with _saupd, info = ', info
            print *, ' Check documentation in _saupd '
            print *, ' '
         endif
c
      else
c
c        %-------------------------------------------%
c        | No fatal errors occurred.                 |
c        | Post-Process using PSSEUPD.               |
c        |                                           |
c        | Computed eigenvalues may be extracted.    |
c        |                                           |
c        | Eigenvectors may also be computed now if  |
c        | desired.  (indicated by rvec = .true.)    |
c        %-------------------------------------------%
c
         rvec = .true.
c
         call pdseupd ( comm, rvec, 'A', select,
     &        d, v, ldv, sigma,
     &        bmat, nloc, which, nev, tol, resid, ncv, v, ldv,
     &        iparam, ipntr, workd, workl, lworkl, info )
c        %----------------------------------------------%
c        | Eigenvalues are returned in the first column |
c        | of the two dimensional array D and the       |
c        | corresponding eigenvectors are returned in   |
c        | the first NEV columns of the two dimensional |
c        | array V if requested.  Otherwise, an         |
c        | orthogonal basis for the invariant subspace  |
c        | corresponding to the eigenvalues in D is     |
c        | returned in V.                               |
c        %----------------------------------------------%
c
         if ( info .ne. 0) then
c
c            %------------------------------------%
c            | Error condition:                   |
c            | Check the documentation of PSSEUPD.|
c            %------------------------------------%
c
c
            if ( myid .eq. 0 ) then
             	print *, ' '
             	print *, ' Error with _seupd, info = ', info
             	print *, ' Check the documentation of _seupd. '
             	print *, ' '
            endif
c
         else
c
             nconv =  iparam(5)
             do 20 j=1, nconv
c
c               %---------------------------%
c               | Compute the residual norm |
c               |                           |
c               |   ||  A*x - lambda*x ||   |
c               |                           |
c               | for the NCONV accurately  |
c               | computed eigenvalues and  |
c               | eigenvectors.  (iparam(5) |
c               | indicates how many are    |
c               | accurate to the requested |
c               | tolerance)                |
c               %---------------------------%
c
                call av(comm, nloc, nx, mv_buf, v(1,j), ax)
                call daxpy(nloc, -d(j,1), v(1,j), 1, ax, 1)
                d(j,2) = pdnorm2( comm, nloc, ax, 1 )
c
 20          continue
c
c            %-------------------------------%
c            | Display computed residuals    |
c            %-------------------------------%
c
             call pdmout(comm, 6, nconv, 2, d, maxncv, -6,
     &            'Ritz values and direct residuals')
         end if
c
c        %------------------------------------------%
c        | Print additional convergence information |
c        %------------------------------------------%
c
         if (myid .eq. 0)then
         if ( info .eq. 1) then
            print *, ' '
            print *, ' Maximum number of iterations reached.'
            print *, ' '
         else if ( info .eq. 3) then
            print *, ' '
            print *, ' No shifts could be applied during implicit
     &                 Arnoldi update, try increasing NCV.'
            print *, ' '
         end if
c
         print *, ' '
         print *, '_SDRV1 '
         print *, '====== '
         print *, ' '
         print *, ' Size of the matrix is ', n
         print *, ' The number of processors is ', nprocs
         print *, ' The number of Ritz values requested is ', nev
         print *, ' The number of Arnoldi vectors generated',
     &            ' (NCV) is ', ncv
         print *, ' What portion of the spectrum: ', which
         print *, ' The number of converged Ritz values is ',
     &              nconv
         print *, ' The number of Implicit Arnoldi update',
     &            ' iterations taken is ', iparam(3)
         print *, ' The number of OP*x is ', iparam(9)
         print *, ' The convergence criterion is ', tol
         print *, ' '
         endif
c
      end if
c
c     %---------------------------%
c     | Done with program pdsdrv1.|
c     %---------------------------%
c
 9000 continue
c
c
      end
c
c ------------------------------------------------------------------
c     parallel matrix vector subroutine
c
c     The matrix used is the 2 dimensional discrete Laplacian on unit
c     square with zero Dirichlet boundary condition.
c
c     Computes w <--- OP*v, where OP is the nx*nx by nx*nx block
c     tridiagonal matrix
c
c                  | T -I          |
c                  |-I  T -I       |
c             OP = |   -I  T       |
c                  |        ...  -I|
c                  |           -I T|
c
c     The subroutine TV is called to computed y<---T*x.
c-------------------------------------------------------------------
c
      subroutine av (comm, nloc, nx, mv_buf, v, w)
c
c     .. MPI Declarations ...
      include           'mpif.h'
      integer*4         comm, nprocs, myid, ierr,
     &                  status(MPI_STATUS_SIZE)
      integer           nloc, nx, np, j, lo, next, prev
      Double precision
     &                  v(nloc), w(nloc), mv_buf(nx), one
      parameter         (one = 1.0 )
      external          daxpy

      call MPI_COMM_RANK( comm, myid, ierr )
      call MPI_COMM_SIZE( comm, nprocs, ierr )
c
      np = nloc/nx
      call tv(nx,v(1),w(1))
      call daxpy(nx, -one, v(nx+1), 1, w(1), 1)
c
      if ( np .gt. 2) then
         do 10 j = 2, np-1
            lo = (j-1)*nx
            call tv(nx, v(lo+1), w(lo+1))
            call daxpy(nx, -one, v(lo-nx+1), 1, w(lo+1), 1)
            call daxpy(nx, -one, v(lo+nx+1), 1, w(lo+1), 1)
  10     continue
      end if
c
      if ( np .gt. 1) then
         lo = (np-1)*nx
         call tv(nx, v(lo+1), w(lo+1))
         call daxpy(nx, -one, v(lo-nx+1), 1, w(lo+1), 1)
      end if
c
      next = myid + 1
      prev = myid - 1
      if ( myid .lt. nprocs-1 ) then
         call mpi_send( v((np-1)*nx+1), nx, MPI_DOUBLE_PRECISION,
     &                  next, myid+1, comm, ierr )
      endif
      if ( myid .gt. 0 ) then
         call mpi_recv( mv_buf, nx, MPI_DOUBLE_PRECISION, prev, myid,
     &                  comm, status, ierr )
         call daxpy( nx, -one, mv_buf, 1, w(1), 1 )
      endif
c
      if ( myid .gt. 0 ) then
         call mpi_send( v(1), nx, MPI_DOUBLE_PRECISION,
     &                  prev, myid-1, comm, ierr )
      endif
      if ( myid .lt. nprocs-1 ) then
         call mpi_recv( mv_buf, nx, MPI_DOUBLE_PRECISION, next, myid,
     &                  comm, status, ierr )
         call daxpy( nx, -one, mv_buf, 1, w(lo+1), 1 )
      endif
c
      return
      end
c=========================================================================
      subroutine tv (nx, x, y)
c
      integer           nx, j
      Double precision
     &                  x(nx), y(nx), dd, dl, du
c
      Double precision
     &                 one
      parameter        (one = 1.0 )
c
c     Compute the matrix vector multiplication y<---T*x
c     where T is a nx by nx tridiagonal matrix with DD on the
c     diagonal, DL on the subdiagonal, and DU on the superdiagonal.
c
c
      dd  = 4.0
      dl  = -one
      du  = -one
c
      y(1) =  dd*x(1) + du*x(2)
      do 10 j = 2,nx-1
         y(j) = dl*x(j-1) + dd*x(j) + du*x(j+1)
 10   continue
      y(nx) =  dl*x(nx-1) + dd*x(nx)
      return
      end