📜  序列比对问题

📅  最后修改于: 2021-09-17 07:11:29             🧑  作者: Mango

给定两个字符串作为输入, X = x_{1} x_{2}... x_{m} , 和Y = y_{1} y_{2}... y_{m} ,输出字符串的对准的,每个,从而使净惩罚为最小。罚款计算如下:
1. 罚款p_{gap} 如果在字符串之间插入间隙,则会发生。
2. 罚款p_{xy} 发生错误匹配的字符X Y .
例子:

Input : X = CG, Y = CA, p_gap = 3, p_xy = 7
Output : X = CG_, Y = C_A, Total penalty = 6

Input : X = AGGGCT, Y = AGGCA, p_gap = 3, p_xy = 2
Output : X = AGGGCT, Y = A_GGCA, Total penalty = 5

Input : X = CG, Y = CA, p_gap = 3, p_xy = 5
Output : X = CG, Y = CA, Total penalty = 5

关于问题历史的简要说明
序列比对问题是生物科学的基本问题之一,旨在发现两个氨基酸序列的相似性。比较氨基酸对人类至关重要,因为它提供了关于进化和发育的重要信息。 Saul B. Needleman 和 Christian D. Wunsch 针对这个问题设计了一个动态规划算法并于 1970 年发表。从那时起,已经进行了大量改进以提高时间复杂度和空间复杂度,但是这些超出了本文讨论的范围这个帖子。
解决方案我们可以使用动态规划来解决这个问题。可行的解决方案是在字符串引入间隙,以使长度相等。因为可以很容易地证明,在均衡长度后增加额外的间隙只会导致惩罚的增加。
最优子结构
可以从最优解中观察到,例如从给定的样本输入中,最优解缩小到只有三个候选。
1. x_{m} y_{n} .
2. x_{m} 和差距。
3. 差距和y_{n} .
最优子结构的证明。
我们可以很容易地用反证法来证明。让X - x_{m} X^' Y - y_{n} Y^' .假设诱导对齐X^' , Y^' 有一些惩罚P , 并且竞争者对齐有惩罚P^* , 和P^* < P .
现在,附加x_{m} y_{n} ,我们得到了惩罚的对齐P^* + p_{xy} < P + p_{xy} .这与原始对齐的最优性相矛盾X, Y .
因此,证明。
dp[i][j] 是最优对齐的惩罚X_{i} Y_{i} .然后,从最优子结构, dp[i][j] = min(dp[i-1][j-1] + p_{xy}, dp[i-1][j] + p_{gap}, dp[i][j-1] + p_{gap}) .
因此,总的最低惩罚是, dp[m][n] .
重构解决方案
重建,
1. 追溯填满的表格,开始dp[m][n] .
2.什么时候(i, j)
…..2a。如果它是使用 case 1 填充的,请转到(i-1, j-1) .
…..2b。如果它是使用案例 2 填充的,请转到(i-1, j) .
…..2c。如果它是使用 case 3 填充的,请转到(i, j-1) .
3. 如果 i = 0 或 j = 0,则用间隙匹配剩余的子串。
下面是上述解决方案的实现。

C++
// CPP program to implement sequence alignment
// problem.
#include 
 
using namespace std;
 
// function to find out the minimum penalty
void getMinimumPenalty(string x, string y, int pxy, int pgap)
{
    int i, j; // initialising variables
     
    int m = x.length(); // length of gene1
    int n = y.length(); // length of gene2
     
    // table for storing optimal substructure answers
    int dp[m+1][n+1] = {0};
 
    // initialising the table
    for (i = 0; i <= (n+m); i++)
    {
        dp[i][0] = i * pgap;
        dp[0][i] = i * pgap;
    }   
 
    // calcuting the minimum penalty
    for (i = 1; i <= m; i++)
    {
        for (j = 1; j <= n; j++)
        {
            if (x[i - 1] == y[j - 1])
            {
                dp[i][j] = dp[i - 1][j - 1];
            }
            else
            {
                dp[i][j] = min({dp[i - 1][j - 1] + pxy ,
                                dp[i - 1][j] + pgap    ,
                                dp[i][j - 1] + pgap    });
            }
        }
    }
 
    // Reconstructing the solution
    int l = n + m; // maximum possible length
     
    i = m; j = n;
     
    int xpos = l;
    int ypos = l;
 
    // Final answers for the respective strings
    int xans[l+1], yans[l+1];
     
    while ( !(i == 0 || j == 0))
    {
        if (x[i - 1] == y[j - 1])
        {
            xans[xpos--] = (int)x[i - 1];
            yans[ypos--] = (int)y[j - 1];
            i--; j--;
        }
        else if (dp[i - 1][j - 1] + pxy == dp[i][j])
        {
            xans[xpos--] = (int)x[i - 1];
            yans[ypos--] = (int)y[j - 1];
            i--; j--;
        }
        else if (dp[i - 1][j] + pgap == dp[i][j])
        {
            xans[xpos--] = (int)x[i - 1];
            yans[ypos--] = (int)'_';
            i--;
        }
        else if (dp[i][j - 1] + pgap == dp[i][j])
        {
            xans[xpos--] = (int)'_';
            yans[ypos--] = (int)y[j - 1];
            j--;
        }
    }
    while (xpos > 0)
    {
        if (i > 0) xans[xpos--] = (int)x[--i];
        else xans[xpos--] = (int)'_';
    }
    while (ypos > 0)
    {
        if (j > 0) yans[ypos--] = (int)y[--j];
        else yans[ypos--] = (int)'_';
    }
 
    // Since we have assumed the answer to be n+m long,
    // we need to remove the extra gaps in the starting
    // id represents the index from which the arrays
    // xans, yans are useful
    int id = 1;
    for (i = l; i >= 1; i--)
    {
        if ((char)yans[i] == '_' && (char)xans[i] == '_')
        {
            id = i + 1;
            break;
        }
    }
 
    // Printing the final answer
    cout << "Minimum Penalty in aligning the genes = ";
    cout << dp[m][n] << "\n";
    cout << "The aligned genes are :\n";
    for (i = id; i <= l; i++)
    {
        cout<<(char)xans[i];
    }
    cout << "\n";
    for (i = id; i <= l; i++)
    {
        cout << (char)yans[i];
    }
    return;
}
 
// Driver code
int main(){
    // input strings
    string gene1 = "AGGGCT";
    string gene2 = "AGGCA";
     
    // initialsing penalties of different types
    int misMatchPenalty = 3;
    int gapPenalty = 2;
 
    // calling the function to calculate the result
    getMinimumPenalty(gene1, gene2,
        misMatchPenalty, gapPenalty);
    return 0;
}


Java
// Java program to implement
// sequence alignment problem.
import java.io.*;
import java.util.*;
import java.lang.*;
 
class GFG
{
// function to find out
// the minimum penalty
static void getMinimumPenalty(String x, String y,
                              int pxy, int pgap)
{
    int i, j; // initialising variables
     
    int m = x.length(); // length of gene1
    int n = y.length(); // length of gene2
     
    // table for storing optimal
    // substructure answers
    int dp[][] = new int[n + m + 1][n + m + 1];
     
    for (int[] x1 : dp)
    Arrays.fill(x1, 0);
 
    // initialising the table
    for (i = 0; i <= (n + m); i++)
    {
        dp[i][0] = i * pgap;
        dp[0][i] = i * pgap;
    }
 
    // calcuting the
    // minimum penalty
    for (i = 1; i <= m; i++)
    {
        for (j = 1; j <= n; j++)
        {
            if (x.charAt(i - 1) == y.charAt(j - 1))
            {
                dp[i][j] = dp[i - 1][j - 1];
            }
            else
            {
                dp[i][j] = Math.min(Math.min(dp[i - 1][j - 1] + pxy ,
                                             dp[i - 1][j] + pgap) ,
                                             dp[i][j - 1] + pgap );
            }
        }
    }
 
    // Reconstructing the solution
    int l = n + m; // maximum possible length
     
    i = m; j = n;
     
    int xpos = l;
    int ypos = l;
 
    // Final answers for
    // the respective strings
    int xans[] = new int[l + 1];
    int yans[] = new int[l + 1];
     
    while ( !(i == 0 || j == 0))
    {
        if (x.charAt(i - 1) == y.charAt(j - 1))
        {
            xans[xpos--] = (int)x.charAt(i - 1);
            yans[ypos--] = (int)y.charAt(j - 1);
            i--; j--;
        }
        else if (dp[i - 1][j - 1] + pxy == dp[i][j])
        {
            xans[xpos--] = (int)x.charAt(i - 1);
            yans[ypos--] = (int)y.charAt(j - 1);
            i--; j--;
        }
        else if (dp[i - 1][j] + pgap == dp[i][j])
        {
            xans[xpos--] = (int)x.charAt(i - 1);
            yans[ypos--] = (int)'_';
            i--;
        }
        else if (dp[i][j - 1] + pgap == dp[i][j])
        {
            xans[xpos--] = (int)'_';
            yans[ypos--] = (int)y.charAt(j - 1);
            j--;
        }
    }
    while (xpos > 0)
    {
        if (i > 0) xans[xpos--] = (int)x.charAt(--i);
        else xans[xpos--] = (int)'_';
    }
    while (ypos > 0)
    {
        if (j > 0) yans[ypos--] = (int)y.charAt(--j);
        else yans[ypos--] = (int)'_';
    }
 
    // Since we have assumed the
    // answer to be n+m long,
    // we need to remove the extra
    // gaps in the starting id
    // represents the index from
    // which the arrays xans,
    // yans are useful
    int id = 1;
    for (i = l; i >= 1; i--)
    {
        if ((char)yans[i] == '_' &&
            (char)xans[i] == '_')
        {
            id = i + 1;
            break;
        }
    }
 
    // Printing the final answer
    System.out.print("Minimum Penalty in " +
                     "aligning the genes = ");
    System.out.print(dp[m][n] + "\n");
    System.out.println("The aligned genes are :");
    for (i = id; i <= l; i++)
    {
        System.out.print((char)xans[i]);
    }
    System.out.print("\n");
    for (i = id; i <= l; i++)
    {
        System.out.print((char)yans[i]);
    }
    return;
}
 
// Driver code
public static void main(String[] args)
{
    // input strings
    String gene1 = "AGGGCT";
    String gene2 = "AGGCA";
     
    // initialising penalties
    // of different types
    int misMatchPenalty = 3;
    int gapPenalty = 2;
 
    // calling the function to
    // calculate the result
    getMinimumPenalty(gene1, gene2,
        misMatchPenalty, gapPenalty);
}
}


C#
// C# program to implement sequence alignment
// problem.
using System;
   
class GFG
{
    // function to find out the minimum penalty
    public static void getMinimumPenalty(string x, string y, int pxy, int pgap)
    {
        int i, j; // initialising variables
           
        int m = x.Length; // length of gene1
        int n = y.Length; // length of gene2
           
        // table for storing optimal substructure answers
        int[,] dp = new int[n+m+1,n+m+1];
        for(int q = 0; q < n+m+1; q++)
            for(int w = 0; w < n+m+1; w++)
                dp[q,w] = 0;
       
        // initialising the table 
        for (i = 0; i <= (n+m); i++)
        {
            dp[i,0] = i * pgap;
            dp[0,i] = i * pgap;
        }    
       
        // calcuting the minimum penalty
        for (i = 1; i <= m; i++)
        {
            for (j = 1; j <= n; j++)
            {
                if (x[i - 1] == y[j - 1])
                {
                    dp[i,j] = dp[i - 1,j - 1];
                }
                else
                {
                    dp[i,j] = Math.Min(Math.Min(dp[i - 1,j - 1] + pxy , 
                                    dp[i - 1,j] + pgap)    , 
                                    dp[i,j - 1] + pgap    );
                }
            }
        }
       
        // Reconstructing the solution
        int l = n + m; // maximum possible length
           
        i = m; j = n;
           
        int xpos = l;
        int ypos = l;
       
        // Final answers for the respective strings
        int[] xans = new int[l+1];
        int [] yans = new int[l+1];
           
        while ( !(i == 0 || j == 0))
        {
            if (x[i - 1] == y[j - 1])
            {
                xans[xpos--] = (int)x[i - 1];
                yans[ypos--] = (int)y[j - 1];
                i--; j--;
            }
            else if (dp[i - 1,j - 1] + pxy == dp[i,j])
            {
                xans[xpos--] = (int)x[i - 1];
                yans[ypos--] = (int)y[j - 1];
                i--; j--;
            }
            else if (dp[i - 1,j] + pgap == dp[i,j])
            {
                xans[xpos--] = (int)x[i - 1];
                yans[ypos--] = (int)'_';
                i--;
            }
            else if (dp[i,j - 1] + pgap == dp[i,j])
            {
                xans[xpos--] = (int)'_';
                yans[ypos--] = (int)y[j - 1];
                j--;
            }
        }
        while (xpos > 0)
        {
            if (i > 0) xans[xpos--] = (int)x[--i];
            else xans[xpos--] = (int)'_';
        }
        while (ypos > 0)
        {
            if (j > 0) yans[ypos--] = (int)y[--j];
            else yans[ypos--] = (int)'_';
        }
       
        // Since we have assumed the answer to be n+m long, 
        // we need to remove the extra gaps in the starting 
        // id represents the index from which the arrays
        // xans, yans are useful
        int id = 1;
        for (i = l; i >= 1; i--)
        {
            if ((char)yans[i] == '_' && (char)xans[i] == '_')
            {
                id = i + 1;
                break;
            }
        }
       
        // Printing the final answer
        Console.Write("Minimum Penalty in aligning the genes = " + dp[m,n] + "\n");
        Console.Write("The aligned genes are :\n");
        for (i = id; i <= l; i++)
        {
            Console.Write((char)xans[i]);
        }
        Console.Write("\n");
        for (i = id; i <= l; i++)
        {
            Console.Write((char)yans[i]);
        }
        return;
    }
       
    // Driver code
    static void Main()
    {
        // input strings
        string gene1 = "AGGGCT";
        string gene2 = "AGGCA";
           
        // initialising penalties of different types
        int misMatchPenalty = 3;
        int gapPenalty = 2;
       
        // calling the function to calculate the result
        getMinimumPenalty(gene1, gene2, 
            misMatchPenalty, gapPenalty);
    }
    //This code is contributed by DrRoot_
}


PHP
 0)
    {
        if ($i > 0) $xans[$xpos--] = $x[--$i];
        else $xans[$xpos--] = '_';
    }
    while ($ypos > 0)
    {
        if ($j > 0)
            $yans[$ypos--] = $y[--$j];
        else
            $yans[$ypos--] = '_';
    }
 
    // Since we have assumed the
    // answer to be n+m long,
    // we need to remove the extra
    // gaps in the starting
    // id represents the index
    // from which the arrays
    // xans, yans are useful
    $id = 1;
    for ($i = $l; $i >= 1; $i--)
    {
        if ($yans[$i] == '_' &&
            $xans[$i] == '_')
        {
            $id = $i + 1;
            break;
        }
    }
 
    // Printing the final answer
    echo "Minimum Penalty in ".
         "aligning the genes = ";
    echo $dp[$m][$n] . "\n";
    echo "The aligned genes are :\n";
    for ($i = $id; $i <= $l; $i++)
    {
        echo $xans[$i];
    }
    echo "\n";
    for ($i = $id; $i <= $l; $i++)
    {
        echo $yans[$i];
    }
    return;
}
 
// Driver code
 
// input strings
$gene1 = "AGGGCT";
$gene2 = "AGGCA";
 
// initialising penalties
// of different types
$misMatchPenalty = 3;
$gapPenalty = 2;
 
// calling the function
// to calculate the result
getMinimumPenalty($gene1, $gene2,
    $misMatchPenalty, $gapPenalty);
 
// This code is contributed by Abhinav96
?>


输出:
Minimum Penalty in aligning the genes = 5
The aligned genes are :
AGGGCT
A_GGCA

时间复杂度: \mathcal{O}(n*m)
空间复杂度: \mathcal{O}(n*m)