\documentclass[10pt]{article} \title{Alignment with dynamic programming} \author{Heng Li} \begin{document} \maketitle \section{General notations} Suppose we have two sequences: a \emph{target} sequence and a \emph{query} sequence. The length of the target sequence is $\ell_t$ with each residue indexed by $i$. The length of query is $\ell_q$ with each residue indexed by $j$. Gaps on the target sequence are \emph{deletions} and gaps on the query are \emph{insertions}. Function $S(i,j)$ gives the score between two residues on the target and the query, respectively. $q>0$ is the gap open/initiation penalty and $e>0$ the gap extension penalty. A gap of length $k$ costs $q+k\cdot e$. \section{Global alignment with affine-gap penalties} \subsection{Durbin's formulation} The original Durbin's formulation is: \begin{eqnarray*} M_{ij}&=&\max\{M_{i-1,j-1}, E_{i-1,j-1}, F_{i-1,j-1}\} + S(i,j)\\ E_{ij}&=&\max\{M_{i-1,j}-q, E_{i-1,j}\} - e\\ F_{ij}&=&\max\{M_{i,j-1}-q, F_{i,j-1}\} - e \end{eqnarray*} This formulation disallows a deletion immediately followed an insertion, or vice versa. A more general form is: \begin{eqnarray*} M_{ij}&=&\max\{M_{i-1,j-1}, E_{i-1,j-1}, F_{i-1,j-1}\} + S(i,j)\\ E_{ij}&=&\max\{M_{i-1,j}-q, E_{i-1,j}, F_{i-1,j}-q\} - e\\ F_{ij}&=&\max\{M_{i,j-1}-q, E_{i,j-1}-q, F_{i,j-1}\} - e \end{eqnarray*} \subsection{Green's formulation} If we define: \[H_{ij}=\max\{M_{ij},E_{ij},F_{ij}\}\] the Durbin's formulation can be transformed to \begin{eqnarray*} E_{ij} &=& \max\{H_{i-1,j}-q, E_{i-1,j}\} - e \\ F_{ij} &=& \max\{H_{i,j-1}-q, F_{i,j-1}\} - e \\ H_{ij} &=& \max\{H_{i-1,j-1}+S(i,j), E_{ij}, F_{ij}\} \end{eqnarray*} I first saw this formulation in Phrap developed by Phil Green, though it may have been used earlier. If we further introduce \begin{eqnarray*} E'_{ij}&=&E_{i+1,j}\\ F'_{ij}&=&F_{i,j+1} \end{eqnarray*} we have \begin{eqnarray*} H_{ij} &=& \max\{H_{i-1,j-1}+S(i,j),E'_{i-1,j},F'_{i,j-1}\}\\ E'_{ij}&=& \max\{H_{ij}-q,E'_{i-1,j}\}-e\\ F'_{ij}&=& \max\{H_{ij}-q,F'_{i,j-1}\}-e \end{eqnarray*} In fact, we more often use this set of equations in practical implementations. The initial conditions are \begin{eqnarray*} H_{-1,j}&=& \left\{\begin{array}{ll} 0 & (j=-1)\\ -q-(j+1)\cdot e & (0\le j<\ell_q) \end{array}\right.\\ H_{i,-1}&=& \left\{\begin{array}{ll} 0 & (i=-1)\\ -q-(i+1)\cdot e & (0\le i<\ell_t) \end{array}\right.\\ E'_{-1,j}&=&E_{0,j}=H_{-1,j}-q-e=-2q-(j+2)\cdot e\\ F'_{i,-1}&=&F_{i,0}=-2q-(i+2)\cdot e \end{eqnarray*} \subsection{Suzuki's formulation} \subsubsection{Standard coordinate} Now let \begin{eqnarray*} u'_{ij}&=&H_{ij}-H_{i-1,j}\\ v'_{ij}&=&H_{ij}-H_{i,j-1}\\ x'_{ij}&=&E'_{ij}-H_{ij}\\ y'_{ij}&=&F'_{ij}-H_{ij} \end{eqnarray*} We have \begin{eqnarray}\label{eq:x} x'_{ij}&=&\max\{-q,E'_{i-1,j}-H_{i-1,j}+H_{i-1,j}-H_{ij}\}-e\\\nonumber &=&\max\{-q,x'_{i-1,j}-u'_{ij}\}-e \end{eqnarray} Similarly \begin{equation} y'_{ij}=\max\{-q,y'_{i,j-1}-v'_{ij}\}-e \end{equation} To derive the equation to compute $u'(i,j)$ and $v'(i,j)$, we note that \begin{eqnarray*} H_{ij}-H_{i-1,j-1} &=&\max\{S(i,j),E'_{i-1,j}-H_{i-1,j-1},F'_{i,j-1}-H_{i-1,j-1}\}\\ &=&\max\{S(i,j),x'_{i-1,j}+v'_{i-1,j},y'_{i,j-1}+u'_{i,j-1}\} \end{eqnarray*} and \[H_{ij}-H_{i-1,j-1}=u'_{ij}+v'_{i-1,j}=v'_{ij}+u'_{i,j-1}\] We can derive the recursive equation for $u'_{ij}$ and $v'_{ij}$: \begin{eqnarray*} z'_{ij}&=&\max\{S(i,j),x'_{i-1,j}+v'_{i-1,j},y'_{i,j-1}+u'_{i,j-1}\}\\ u'_{ij}&=&z'_{ij}-v'_{i-1,j}\\ v'_{ij}&=&z'_{ij}-u'_{i,j-1}\\ x'_{ij}&=&\max\{0,x'_{i-1,j}+v'_{i-1,j}-z'_{ij}+q\}-q-e\\ y'_{ij}&=&\max\{0,y'_{i,j-1}+u'_{i,j-1}-z'_{ij}+q\}-q-e \end{eqnarray*} From eq.~(\ref{eq:x}) we can infer that $x'_{ij}\ge-q-e$ and similarly $y'_{ij}\ge-q-e$. We further have: \[ u'_{ij}=H_{ij}-H_{i-1,j-1}-v'_{i-1,j}\ge x'_{i-1,j}\ge-q-e \] Therefore, we have a lower bound $-q-e$ for $u'$, $v'$, $x'$ and $y'$. This motivates us to redefine the four variables as: \begin{eqnarray*} u''_{ij}&=&H_{ij}-H_{i-1,j}+q+e\\ v''_{ij}&=&H_{ij}-H_{i,j-1}+q+e\\ x''_{ij}&=&E'_{ij}-H_{ij}+q+e\\ y''_{ij}&=&F'_{ij}-H_{ij}+q+e \end{eqnarray*} The recursion becomes \begin{eqnarray*} z''_{ij}&=&\max\{S(i,j)+2q+2e,x''_{i-1,j}+v''_{i-1,j},y''_{i,j-1}+u''_{i,j-1}\}\\ u''_{ij}&=&z''_{ij}-v''_{i-1,j}\\ v''_{ij}&=&z''_{ij}-u''_{i,j-1}\\ x''_{ij}&=&\max\{0,x''_{i-1,j}-u''_{ij}+q\}=\max\{0,x''_{i-1,j}+v''_{i-1,j}-z''_{ij}+q\}\\ y''_{ij}&=&\max\{0,y''_{i,j-1}-v''_{ij}+q\}=\max\{0,y''_{i,j-1}+u''_{i,j-1}-z''_{ij}+q\} \end{eqnarray*} Here $z_{ij}$ is a temporary variable. $u''$, $v''$, $x''$ and $y''$ are all non-negtive. \subsubsection{Rotated coordinate} We let \begin{eqnarray*} r&=&i+j\\ t&=&i \end{eqnarray*} We have \begin{eqnarray*} z_{rt}&=&\max\{S(t,r-t)+2q+2e,x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}+u_{r-1,t}\}\\ u_{rt}&=&z_{rt}-v_{r-1,t-1}\\ v_{rt}&=&z_{rt}-u_{r-1,t}\\ x_{rt}&=&\max\{0,x_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+q\}\\ y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\} \end{eqnarray*} Due to the definition of $r$ and $t$, the following inequation must stand: \[0\le r-t \le\ell_q-1\] \[0\le t \le\ell_t-1\] where $\ell_t$ is the length of the sequence indexed by $i$ and $\ell_q$ the length indexed by $j$. In case of banded alignment with a fixed diagonal band of size $w$, \[-w\le j-i\le w\] In the $(r,t)$ coordinate, it is: \[\frac{r-w}{2}\le t\le \frac{r+w}{2}\] Putting these together: \[0\le r\le \ell_q+\ell_t-2\] \[\max\left\{0,r-\ell_q+1,\frac{r-w}{2}\right\}\le t\le\min\left\{\ell_t-1,r,\frac{r+w}{2}\right\}\] \subsubsection{Initial conditions} \[x_{r-1,-1}=x''_{-1,r}=E'_{-1,r}-H_{-1,r}+q+e=0\] \[y_{r-1,r}=y''_{r,-1}=0\] \[v_{r-1,-1}=v''_{-1,r}=H_{-1,r}-H_{-1,r-1}+q+e=\left\{\begin{array}{ll} q & (r>0) \\ 0 & (r=0) \end{array}\right.\] \[u_{r-1,r}=u''_{r,-1}=H_{r,-1}-H_{r-1,-1}+q+e=\left\{\begin{array}{ll} q & (r>0) \\ 0 & (r=0) \end{array}\right.\] \section{Alignment with dual affine-gap penalties} \subsection{Green's formulation} \begin{eqnarray*} H_{ij} &=& \max\{H_{i-1,j-1}+S(i,j),E'_{i-1,j},F'_{i,j-1},\tilde{E}'_{i-1,j},\tilde{F}'_{i,j-1}\}\\ E'_{ij}&=& \max\{H_{ij}-q,E'_{i-1,j}\}-e\\ F'_{ij}&=& \max\{H_{ij}-q,F'_{i,j-1}\}-e\\ \tilde{E}'_{ij}&=& \max\{H_{ij}-\tilde{q},\tilde{E}'_{i-1,j}\}-\tilde{e}\\ \tilde{F}'_{ij}&=& \max\{H_{ij}-\tilde{q},\tilde{F}'_{i,j-1}\}-\tilde{e} \end{eqnarray*} The initial conditions are: \begin{eqnarray*} H_{-1,j}&=& \left\{\begin{array}{ll} 0 & (j=-1)\\ \max\{-q-(j+1)\cdot e,-\tilde{q}-(j+1)\cdot\tilde{e}\} & (0\le j<\ell_q) \end{array}\right.\\ H_{i,-1}&=& \left\{\begin{array}{ll} 0 & (i=-1)\\ \max\{-q-(i+1)\cdot e,-\tilde{q}-(i+1)\cdot\tilde{e}\} & (0\le i<\ell_t) \end{array}\right.\\ E'_{-1,j}&=&E_{0,j}=H_{-1,j}-q-e\\ F'_{i,-1}&=&F_{i,0}=H_{i,-1}-q-e\\ \tilde{E}'_{-1,j}&=&\tilde{E}_{0,j}=H_{-1,j}-\tilde{q}-\tilde{e}\\ \tilde{F}'_{i,-1}&=&\tilde{F}_{i,0}=H_{i,-1}-\tilde{q}-\tilde{e} \end{eqnarray*} \subsection{Suzuki's formulation} \begin{eqnarray*} z'_{ij}&=&\max\{S(i,j),x'_{i-1,j}+v'_{i-1,j},y'_{i,j-1}+u'_{i,j-1},\\ &&\tilde{x}'_{i-1,j}+v'_{i-1,j},\tilde{y}'_{i,j-1}+u'_{i,j-1}\}\\ u'_{ij}&=&z'_{ij}-v'_{i-1,j}\\ v'_{ij}&=&z'_{ij}-u'_{i,j-1}\\ x'_{ij}&=&\max\{0,x'_{i-1,j}+v'_{i-1,j}-z'_{ij}+q\}-q-e\\ y'_{ij}&=&\max\{0,y'_{i,j-1}+u'_{i,j-1}-z'_{ij}+q\}-q-e\\ \tilde{x}'_{ij}&=&\max\{0,\tilde{x}'_{i-1,j}+v'_{i-1,j}-z'_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e}\\ \tilde{y}'_{ij}&=&\max\{0,\tilde{y}'_{i,j-1}+u'_{i,j-1}-z'_{ij}+\tilde{q}\}-\tilde{q}-\tilde{e} \end{eqnarray*} In the rotated coordinate: \begin{eqnarray*} z_{rt}&=&\max\{S(t,r-t),x_{r-1,t-1}+v_{r-1,t-1},y_{r-1,t}+u_{r-1i,t},\\ &&\tilde{x}_{r-1,t-1}+v_{r-1,t-1},\tilde{y}_{r-1,t}+u_{r-1,t}\}\\ u_{rt}&=&z_{rt}-v_{r-1,t-1}\\ v_{rt}&=&z_{rt}-u_{r-1,t}\\ x_{rt}&=&\max\{0,x_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+q\}-q-e\\ y_{rt}&=&\max\{0,y_{r-1,t}+u_{r-1,t}-z_{rt}+q\}-q-e\\ \tilde{x}_{rt}&=&\max\{0,\tilde{x}_{r-1,t-1}+v_{r-1,t-1}-z_{rt}+\tilde{q}\}-\tilde{q}-\tilde{e}\\ \tilde{y}_{rt}&=&\max\{0,\tilde{y}_{r-1,t}+u_{r-1,t}-z_{rt}+\tilde{q}\}-\tilde{q}-\tilde{e} \end{eqnarray*} By definition, it is easy to see the initial conditions except $u$ and $v$: \[x_{r-1,-1}=x'_{-1,r}=E'_{-1,r}-H_{-1,r}=-q-e\] \[y_{r-1,r}=y'_{r,-1}=F'_{r,-1}-H_{r,-1}=-q-e\] \[\tilde{x}_{r-1,-1}=-\tilde{q}-\tilde{e}\] \[\tilde{y}_{r-1,-1}=-\tilde{q}-\tilde{e}\] \[v_{r-1,-1}=H_{-1,r}-H_{-1,r-1}=\left\{\begin{array}{ll} \max\{-q-e,-\tilde{q}-\tilde{e}\} & (r=0)\\ -e & (r<\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil)\\ r(e-\tilde{e})-(\tilde{q}-q)-\tilde{e} & (r=\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil)\\ -\tilde{e} & (r>\lceil\frac{\tilde{q}-q}{e-\tilde{e}}-1\rceil) \end{array}\right.\] \end{document}