c-----------------------------------------------------------------------
c\BeginDoc
c
c\Name: pssapps
c
c\Description:
c  Given the Arnoldi factorization
c
c     A*V_{k} - V_{k}*H_{k} = r_{k+p}*e_{k+p}^T,
c
c  apply NP shifts implicitly resulting in
c
c     A*(V_{k}*Q) - (V_{k}*Q)*(Q^T* H_{k}*Q) = r_{k+p}*e_{k+p}^T * Q
c
c  where Q is an orthogonal matrix of order KEV+NP. Q is the product of
c  rotations resulting from the NP bulge chasing sweeps.  The updated Arnoldi
c  factorization becomes:
c
c     A*VNEW_{k} - VNEW_{k}*HNEW_{k} = rnew_{k}*e_{k}^T.
c
c\Usage:
c  call pssapps
c     ( COMM, N, KEV, NP, SHIFT, V, LDV, H, LDH, RESID, Q, LDQ, WORKD )
c
c\Arguments
c  COMM    MPI Communicator for the processor grid.  (INPUT)
c
c  N       Integer.  (INPUT)
c          Problem size, i.e. dimension of matrix A.
c
c  KEV     Integer.  (INPUT)
c          INPUT: KEV+NP is the size of the input matrix H.
c          OUTPUT: KEV is the size of the updated matrix HNEW.
c
c  NP      Integer.  (INPUT)
c          Number of implicit shifts to be applied.
c
c  SHIFT   Real array of length NP.  (INPUT)
c          The shifts to be applied.
c
c  V       Real N by (KEV+NP) array.  (INPUT/OUTPUT)
c          INPUT: V contains the current KEV+NP Arnoldi vectors.
c          OUTPUT: VNEW = V(1:n,1:KEV); the updated Arnoldi vectors
c          are in the first KEV columns of V.
c
c  LDV     Integer.  (INPUT)
c          Leading dimension of V exactly as declared in the calling
c          program.
c
c  H       Real (KEV+NP) by 2 array.  (INPUT/OUTPUT)
c          INPUT: H contains the symmetric tridiagonal matrix of the
c          Arnoldi factorization with the subdiagonal in the 1st column
c          starting at H(2,1) and the main diagonal in the 2nd column.
c          OUTPUT: H contains the updated tridiagonal matrix in the
c          KEV leading submatrix.
c
c  LDH     Integer.  (INPUT)
c          Leading dimension of H exactly as declared in the calling
c          program.
c
c  RESID   Real array of length (N).  (INPUT/OUTPUT)
c          INPUT: RESID contains the the residual vector r_{k+p}.
c          OUTPUT: RESID is the updated residual vector rnew_{k}.
c
c  Q       Real KEV+NP by KEV+NP work array.  (WORKSPACE)
c          Work array used to accumulate the rotations during the bulge
c          chase sweep.
c
c  LDQ     Integer.  (INPUT)
c          Leading dimension of Q exactly as declared in the calling
c          program.
c
c  WORKD   Real work array of length 2*N.  (WORKSPACE)
c          Distributed array used in the application of the accumulated
c          orthogonal matrix Q.
c
c\EndDoc
c
c-----------------------------------------------------------------------
c
c\BeginLib
c
c\Local variables:
c     xxxxxx  real
c
c\References:
c  1. D.C. Sorensen, "Implicit Application of Polynomial Filters in
c     a k-Step Arnoldi Method", SIAM J. Matr. Anal. Apps., 13 (1992),
c     pp 357-385.
c  2. R.B. Lehoucq, "Analysis and Implementation of an Implicitly
c     Restarted Arnoldi Iteration", Rice University Technical Report
c     TR95-13, Department of Computational and Applied Mathematics.
c
c\Routines called:
c     pivout  Parallel ARPACK utility routine that prints integers.
c     arscnd  ARPACK utility routine for timing.
c     psvout  Parallel ARPACK utility routine that prints vectors.
c     pslamch10 ScaLAPACK routine that determines machine constants.
c     slartg  LAPACK Givens rotation construction routine.
c     slacpy  LAPACK matrix copy routine.
c     slaset  LAPACK matrix initialization routine.
c     sgemv   Level 2 BLAS routine for matrix vector multiplication.
c     saxpy   Level 1 BLAS that computes a vector triad.
c     scopy   Level 1 BLAS that copies one vector to another.
c     sscal   Level 1 BLAS that scales a vector.
c
c\Author
c     Danny Sorensen               Phuong Vu
c     Richard Lehoucq              CRPC / Rice University
c     Dept. of Computational &     Houston, Texas
c     Applied Mathematics
c     Rice University
c     Houston, Texas
c
c\Parallel Modifications
c     Kristi Maschhoff
c
c\Revision history:
c     Starting Point: Serial Code FILE: sapps.F   SID: 2.4
c
c\SCCS Information:
c FILE: sapps.F   SID: 1.3   DATE OF SID: 3/19/97
c
c\Remarks
c  1. In this version, each shift is applied to all the subblocks of
c     the tridiagonal matrix H and not just to the submatrix that it
c     comes from. This routine assumes that the subdiagonal elements
c     of H that are stored in h(1:kev+np,1) are nonegative upon input
c     and enforce this condition upon output. This version incorporates
c     deflation. See code for documentation.
c
c\EndLib
c
c-----------------------------------------------------------------------
c
      subroutine pssapps
     &  ( comm, n, kev, np, shift, v, ldv, h, ldh, resid, q, ldq, workd)
c
c     %--------------------%
c     | MPI Communicator |
c     %--------------------%
c
      include   'pcontext.h'
      integer   comm
c
c     %----------------------------------------------------%
c     | Include files for debugging and timing information |
c     %----------------------------------------------------%
c
      include   'debug.h'
      include   'stat.h'
c
c     %------------------%
c     | Scalar Arguments |
c     %------------------%
c
      integer    kev, ldh, ldq, ldv, n, np
c
c     %-----------------%
c     | Array Arguments |
c     %-----------------%
c
      Real
     &           h(ldh,2), q(ldq,kev+np), resid(n), shift(np),
     &           v(ldv,kev+np), workd(2*n)
c
c     %------------%
c     | Parameters |
c     %------------%
c
      Real
     &           one, zero
      parameter (one = 1.0, zero = 0.0)
c
c     %---------------%
c     | Local Scalars |
c     %---------------%
c
      integer    i, iend, istart, itop, j, jj, kplusp, msglvl
      Real
     &           a1, a2, a3, a4, big, c, epsmch, f, g, r, s
      save       epsmch
c
c
c     %----------------------%
c     | External Subroutines |
c     %----------------------%
c
      external   saxpy, scopy, sscal, slacpy, slartg, slaset, psvout,
     &           pivout, arscnd, sgemv
c
c     %--------------------%
c     | External Functions |
c     %--------------------%
c
      Real
     &           pslamch10
      external   pslamch10
c
c     %----------------------%
c     | Intrinsics Functions |
c     %----------------------%
c
      intrinsic  abs
c
c     %----------------%
c     | Data statements |
c     %----------------%
c
c
c     %-----------------------%
c     | Executable Statements |
c     %-----------------------%
c
      if (apps_first) then
         epsmch = pslamch10(comm, 'Epsilon-Machine')
         apps_first = .false.
      end if
      itop = 1
c
c     %-------------------------------%
c     | Initialize timing statistics  |
c     | & message level for debugging |
c     %-------------------------------%
c
      call arscnd (t0)
      msglvl = msapps
c
      kplusp = kev + np
c
c     %----------------------------------------------%
c     | Initialize Q to the identity matrix of order |
c     | kplusp used to accumulate the rotations.     |
c     %----------------------------------------------%
c
      call slaset ('All', kplusp, kplusp, zero, one, q, ldq)
c
c     %----------------------------------------------%
c     | Quick return if there are no shifts to apply |
c     %----------------------------------------------%
c
      if (np .eq. 0) go to 9000
c
c     %----------------------------------------------------------%
c     | Apply the np shifts implicitly. Apply each shift to the  |
c     | whole matrix and not just to the submatrix from which it |
c     | comes.                                                   |
c     %----------------------------------------------------------%
c
      do 90 jj = 1, np
c
         istart = itop
c
c        %----------------------------------------------------------%
c        | Check for splitting and deflation. Currently we consider |
c        | an off-diagonal element h(i+1,1) negligible if           |
c        |         h(i+1,1) .le. epsmch*( |h(i,2)| + |h(i+1,2)| )   |
c        | for i=1:KEV+NP-1.                                        |
c        | If above condition tests true then we set h(i+1,1) = 0.  |
c        | Note that h(1:KEV+NP,1) are assumed to be non negative.  |
c        %----------------------------------------------------------%
c
   20    continue
c
c        %------------------------------------------------%
c        | The following loop exits early if we encounter |
c        | a negligible off diagonal element.             |
c        %------------------------------------------------%
c
         do 30 i = istart, kplusp-1
            big   = abs(h(i,2)) + abs(h(i+1,2))
            if (h(i+1,1) .le. epsmch*big) then
               if (msglvl .gt. 0) then
                  call pivout (comm, logfil, 1, [i], ndigit,
     &                 '_sapps: deflation at row/column no.')
                  call pivout (comm, logfil, 1, [jj], ndigit,
     &                 '_sapps: occurred before shift number.')
                  call psvout (comm, logfil, 1, h(i+1,1), ndigit,
     &                 '_sapps: the corresponding off diagonal element')
               end if
               h(i+1,1) = zero
               iend = i
               go to 40
            end if
   30    continue
         iend = kplusp
   40    continue
c
         if (istart .lt. iend) then
c
c           %--------------------------------------------------------%
c           | Construct the plane rotation G'(istart,istart+1,theta) |
c           | that attempts to drive h(istart+1,1) to zero.          |
c           %--------------------------------------------------------%
c
             f = h(istart,2) - shift(jj)
             g = h(istart+1,1)
             call slartg (f, g, c, s, r)
c
c            %-------------------------------------------------------%
c            | Apply rotation to the left and right of H;            |
c            | H <- G' * H * G,  where G = G(istart,istart+1,theta). |
c            | This will create a "bulge".                           |
c            %-------------------------------------------------------%
c
             a1 = c*h(istart,2)   + s*h(istart+1,1)
             a2 = c*h(istart+1,1) + s*h(istart+1,2)
             a4 = c*h(istart+1,2) - s*h(istart+1,1)
             a3 = c*h(istart+1,1) - s*h(istart,2)
             h(istart,2)   = c*a1 + s*a2
             h(istart+1,2) = c*a4 - s*a3
             h(istart+1,1) = c*a3 + s*a4
c
c            %----------------------------------------------------%
c            | Accumulate the rotation in the matrix Q;  Q <- Q*G |
c            %----------------------------------------------------%
c
             do 60 j = 1, min(istart+jj,kplusp)
                a1            =   c*q(j,istart) + s*q(j,istart+1)
                q(j,istart+1) = - s*q(j,istart) + c*q(j,istart+1)
                q(j,istart)   = a1
   60        continue
c
c
c            %----------------------------------------------%
c            | The following loop chases the bulge created. |
c            | Note that the previous rotation may also be  |
c            | done within the following loop. But it is    |
c            | kept separate to make the distinction among  |
c            | the bulge chasing sweeps and the first plane |
c            | rotation designed to drive h(istart+1,1) to  |
c            | zero.                                        |
c            %----------------------------------------------%
c
             do 70 i = istart+1, iend-1
c
c               %----------------------------------------------%
c               | Construct the plane rotation G'(i,i+1,theta) |
c               | that zeros the i-th bulge that was created   |
c               | by G(i-1,i,theta). g represents the bulge.   |
c               %----------------------------------------------%
c
                f = h(i,1)
                g = s*h(i+1,1)
c
c               %----------------------------------%
c               | Final update with G(i-1,i,theta) |
c               %----------------------------------%
c
                h(i+1,1) = c*h(i+1,1)
                call slartg (f, g, c, s, r)
c
c               %-------------------------------------------%
c               | The following ensures that h(1:iend-1,1), |
c               | the first iend-2 off diagonal of elements |
c               | H, remain non negative.                   |
c               %-------------------------------------------%
c
                if (r .lt. zero) then
                   r = -r
                   c = -c
                   s = -s
                end if
c
c               %--------------------------------------------%
c               | Apply rotation to the left and right of H; |
c               | H <- G * H * G',  where G = G(i,i+1,theta) |
c               %--------------------------------------------%
c
                h(i,1) = r
c
                a1 = c*h(i,2)   + s*h(i+1,1)
                a2 = c*h(i+1,1) + s*h(i+1,2)
                a3 = c*h(i+1,1) - s*h(i,2)
                a4 = c*h(i+1,2) - s*h(i+1,1)
c
                h(i,2)   = c*a1 + s*a2
                h(i+1,2) = c*a4 - s*a3
                h(i+1,1) = c*a3 + s*a4
c
c               %----------------------------------------------------%
c               | Accumulate the rotation in the matrix Q;  Q <- Q*G |
c               %----------------------------------------------------%
c
                do 50 j = 1, min( i+jj, kplusp )
                   a1       =   c*q(j,i) + s*q(j,i+1)
                   q(j,i+1) = - s*q(j,i) + c*q(j,i+1)
                   q(j,i)   = a1
   50           continue
c
   70        continue
c
         end if
c
c        %--------------------------%
c        | Update the block pointer |
c        %--------------------------%
c
         istart = iend + 1
c
c        %------------------------------------------%
c        | Make sure that h(iend,1) is non-negative |
c        | If not then set h(iend,1) <-- -h(iend,1) |
c        | and negate the last column of Q.         |
c        | We have effectively carried out a        |
c        | similarity on transformation H           |
c        %------------------------------------------%
c
         if (h(iend,1) .lt. zero) then
             h(iend,1) = -h(iend,1)
             call sscal(kplusp, -one, q(1,iend), 1)
         end if
c
c        %--------------------------------------------------------%
c        | Apply the same shift to the next block if there is any |
c        %--------------------------------------------------------%
c
         if (iend .lt. kplusp) go to 20
c
c        %-----------------------------------------------------%
c        | Check if we can increase the the start of the block |
c        %-----------------------------------------------------%
c
         do 80 i = itop, kplusp-1
            if (h(i+1,1) .gt. zero) go to 90
            itop  = itop + 1
   80    continue
c
c        %-----------------------------------%
c        | Finished applying the jj-th shift |
c        %-----------------------------------%
c
   90 continue
c
c     %------------------------------------------%
c     | All shifts have been applied. Check for  |
c     | more possible deflation that might occur |
c     | after the last shift is applied.         |
c     %------------------------------------------%
c
      do 100 i = itop, kplusp-1
         big   = abs(h(i,2)) + abs(h(i+1,2))
         if (h(i+1,1) .le. epsmch*big) then
            if (msglvl .gt. 0) then
               call pivout (comm, logfil, 1, [i], ndigit,
     &              '_sapps: deflation at row/column no.')
               call psvout (comm, logfil, 1, h(i+1,1), ndigit,
     &              '_sapps: the corresponding off diagonal element')
            end if
            h(i+1,1) = zero
         end if
 100  continue
c
c     %-------------------------------------------------%
c     | Compute the (kev+1)-st column of (V*Q) and      |
c     | temporarily store the result in WORKD(N+1:2*N). |
c     | This is not necessary if h(kev+1,1) = 0.         |
c     %-------------------------------------------------%
c
      if ( h(kev+1,1) .gt. zero )
     &   call sgemv ('N', n, kplusp, one, v, ldv,
     &                q(1,kev+1), 1, zero, workd(n+1), 1)
c
c     %-------------------------------------------------------%
c     | Compute column 1 to kev of (V*Q) in backward order    |
c     | taking advantage that Q is an upper triangular matrix |
c     | with lower bandwidth np.                              |
c     | Place results in v(:,kplusp-kev:kplusp) temporarily.  |
c     %-------------------------------------------------------%
c
      do 130 i = 1, kev
         call sgemv ('N', n, kplusp-i+1, one, v, ldv,
     &               q(1,kev-i+1), 1, zero, workd, 1)
         call scopy (n, workd, 1, v(1,kplusp-i+1), 1)
  130 continue
c
c     %-------------------------------------------------%
c     |  Move v(:,kplusp-kev+1:kplusp) into v(:,1:kev). |
c     %-------------------------------------------------%
c
      call slacpy ('All', n, kev, v(1,np+1), ldv, v, ldv)
c
c     %--------------------------------------------%
c     | Copy the (kev+1)-st column of (V*Q) in the |
c     | appropriate place if h(kev+1,1) .ne. zero. |
c     %--------------------------------------------%
c
      if ( h(kev+1,1) .gt. zero )
     &     call scopy (n, workd(n+1), 1, v(1,kev+1), 1)
c
c     %-------------------------------------%
c     | Update the residual vector:         |
c     |    r <- sigmak*r + betak*v(:,kev+1) |
c     | where                               |
c     |    sigmak = (e_{kev+p}'*Q)*e_{kev}  |
c     |    betak = e_{kev+1}'*H*e_{kev}     |
c     %-------------------------------------%
c
      call sscal (n, q(kplusp,kev), resid, 1)
      if (h(kev+1,1) .gt. zero)
     &   call saxpy (n, h(kev+1,1), v(1,kev+1), 1, resid, 1)
c
      if (msglvl .gt. 1) then
         call psvout (comm, logfil, 1, q(kplusp,kev), ndigit,
     &      '_sapps: sigmak of the updated residual vector')
         call psvout (comm, logfil, 1, h(kev+1,1), ndigit,
     &      '_sapps: betak of the updated residual vector')
         call psvout (comm, logfil, kev, h(1,2), ndigit,
     &      '_sapps: updated main diagonal of H for next iteration')
         if (kev .gt. 1) then
         call psvout (comm, logfil, kev-1, h(2,1), ndigit,
     &      '_sapps: updated sub diagonal of H for next iteration')
         end if
      end if
c
      call arscnd (t1)
      tsapps = tsapps + (t1 - t0)
c
 9000 continue
      return
c
c     %----------------%
c     | End of pssapps |
c     %----------------%
c
      end