📜  序列比对问题

📅  最后修改于: 2021-05-07 09:40:20             🧑  作者: Mango

给定两个字符串作为输入, X = x_{1} x_{2}... x_{m} , 和Y = y_{1} y_{2}... y_{m} ,输出字符串的对准的,每个,从而使净惩罚为最小。罚款计算如下:
1.罚款p_{gap}如果在字符串之间插入字符串,则会发生此错误。
2.罚款p_{xy}发生由于字符不匹配XY

例子:

Input : X = CG, Y = CA, p_gap = 3, p_xy = 7
Output : X = CG_, Y = C_A, Total penalty = 6

Input : X = AGGGCT, Y = AGGCA, p_gap = 3, p_xy = 2
Output : X = AGGGCT, Y = A_GGCA, Total penalty = 5

Input : X = CG, Y = CA, p_gap = 3, p_xy = 5
Output : X = CG, Y = CA, Total penalty = 5

关于问题历史的简要说明
序列比对问题是生物学科学的基本问题之一,旨在发现两个氨基酸序列的相似性。比较氨基酸对人类至关重要,因为它提供了有关进化和发展的重要信息。 Saul B. Needleman和Christian D. Wunsch设计了一种动态编程算法来解决该问题,并于1970年发布。此后,进行了许多改进,以改善时间复杂度和空间复杂度,但是这些超出了本文的讨论范围。这个帖子。

解决方案我们可以使用动态编程来解决此问题。可行的解决方案是在字符串引入间隙,以使长度相等。因为可以很容易地证明在均衡长度之后增加额外的间隙只会导致惩罚的增加。

最佳子结构
从最佳解决方案(例如从给定的样本输入)可以观察到,最佳解决方案仅缩小到三个候选值。
1。 x_{m}y_{n}
2。 x_{m}和差距。
3.差距和y_{n}

最佳子结构的证明。
我们可以很容易地通过矛盾证明。让X - x_{m}X^'Y - y_{n}Y^' 。假设诱导的对准X^'Y^'有一些惩罚P ,而竞争对手的定位会受到处罚P^* , 和P^* < P
现在,追加x_{m}y_{n} ,我们会与罚款保持一致P^* + p_{xy} < P  + p_{xy} 。这与原始对齐方式的最优性相矛盾。 X, Y
因此,证明了。

dp[i][j]是最佳对齐方式的代价X_{i}Y_{i} 。然后,从最佳子结构中dp[i][j] = min(dp[i-1][j-1] + p_{xy}, dp[i-1][j] + p_{gap}, dp[i][j-1] + p_{gap})
因此,最低总罚款额为dp[m][n]

重构解决方案
为了重建,
1.从填充表开始追溯dp[m][n]
2.什么时候(i, j)
…..2a。如果已使用案例1填充,请转到(i-1, j-1)
…..2b。如果已使用案例2填充,请转到(i-1, j)
…..2c。如果已使用案例3填充,请转到(i, j-1)
3.如果i = 0或j = 0,则将剩余的子字符串与空格匹配。

以下是上述解决方案的实现。

C++
// CPP program to implement sequence alignment
// problem.
#include 
  
using namespace std;
  
// function to find out the minimum penalty
void getMinimumPenalty(string x, string y, int pxy, int pgap)
{
    int i, j; // intialising variables
      
    int m = x.length(); // length of gene1
    int n = y.length(); // length of gene2
      
    // table for storing optimal substructure answers
    int dp[m+1][n+1] = {0};
  
    // intialising the table 
    for (i = 0; i <= (n+m); i++)
    {
        dp[i][0] = i * pgap;
        dp[0][i] = i * pgap;
    }    
  
    // calcuting the minimum penalty
    for (i = 1; i <= m; i++)
    {
        for (j = 1; j <= n; j++)
        {
            if (x[i - 1] == y[j - 1])
            {
                dp[i][j] = dp[i - 1][j - 1];
            }
            else
            {
                dp[i][j] = min({dp[i - 1][j - 1] + pxy , 
                                dp[i - 1][j] + pgap    , 
                                dp[i][j - 1] + pgap    });
            }
        }
    }
  
    // Reconstructing the solution
    int l = n + m; // maximum possible length
      
    i = m; j = n;
      
    int xpos = l;
    int ypos = l;
  
    // Final answers for the respective strings
    int xans[l+1], yans[l+1];
      
    while ( !(i == 0 || j == 0))
    {
        if (x[i - 1] == y[j - 1])
        {
            xans[xpos--] = (int)x[i - 1];
            yans[ypos--] = (int)y[j - 1];
            i--; j--;
        }
        else if (dp[i - 1][j - 1] + pxy == dp[i][j])
        {
            xans[xpos--] = (int)x[i - 1];
            yans[ypos--] = (int)y[j - 1];
            i--; j--;
        }
        else if (dp[i - 1][j] + pgap == dp[i][j])
        {
            xans[xpos--] = (int)x[i - 1];
            yans[ypos--] = (int)'_';
            i--;
        }
        else if (dp[i][j - 1] + pgap == dp[i][j])
        {
            xans[xpos--] = (int)'_';
            yans[ypos--] = (int)y[j - 1];
            j--;
        }
    }
    while (xpos > 0)
    {
        if (i > 0) xans[xpos--] = (int)x[--i];
        else xans[xpos--] = (int)'_';
    }
    while (ypos > 0)
    {
        if (j > 0) yans[ypos--] = (int)y[--j];
        else yans[ypos--] = (int)'_';
    }
  
    // Since we have assumed the answer to be n+m long, 
    // we need to remove the extra gaps in the starting 
    // id represents the index from which the arrays
    // xans, yans are useful
    int id = 1;
    for (i = l; i >= 1; i--)
    {
        if ((char)yans[i] == '_' && (char)xans[i] == '_')
        {
            id = i + 1;
            break;
        }
    }
  
    // Printing the final answer
    cout << "Minimum Penalty in aligning the genes = ";
    cout << dp[m][n] << "\n";
    cout << "The aligned genes are :\n";
    for (i = id; i <= l; i++)
    {
        cout<<(char)xans[i];
    }
    cout << "\n";
    for (i = id; i <= l; i++)
    {
        cout << (char)yans[i];
    }
    return;
}
  
// Driver code
int main(){
    // input strings
    string gene1 = "AGGGCT";
    string gene2 = "AGGCA";
      
    // intialsing penalties of different types
    int misMatchPenalty = 3;
    int gapPenalty = 2;
  
    // calling the function to calculate the result
    getMinimumPenalty(gene1, gene2, 
        misMatchPenalty, gapPenalty);
    return 0;
}


Java
// Java program to implement 
// sequence alignment problem.
import java.io.*;
import java.util.*;
import java.lang.*;
  
class GFG
{
// function to find out 
// the minimum penalty
static void getMinimumPenalty(String x, String y, 
                              int pxy, int pgap)
{
    int i, j; // intialising variables
      
    int m = x.length(); // length of gene1
    int n = y.length(); // length of gene2
      
    // table for storing optimal
    // substructure answers
    int dp[][] = new int[n + m + 1][n + m + 1];
      
    for (int[] x1 : dp)
    Arrays.fill(x1, 0);
  
    // intialising the table 
    for (i = 0; i <= (n + m); i++)
    {
        dp[i][0] = i * pgap;
        dp[0][i] = i * pgap;
    } 
  
    // calcuting the 
    // minimum penalty
    for (i = 1; i <= m; i++)
    {
        for (j = 1; j <= n; j++)
        {
            if (x.charAt(i - 1) == y.charAt(j - 1))
            {
                dp[i][j] = dp[i - 1][j - 1];
            }
            else
            {
                dp[i][j] = Math.min(Math.min(dp[i - 1][j - 1] + pxy , 
                                             dp[i - 1][j] + pgap) , 
                                             dp[i][j - 1] + pgap );
            }
        }
    }
  
    // Reconstructing the solution
    int l = n + m; // maximum possible length
      
    i = m; j = n;
      
    int xpos = l;
    int ypos = l;
  
    // Final answers for 
    // the respective strings
    int xans[] = new int[l + 1]; 
    int yans[] = new int[l + 1];
      
    while ( !(i == 0 || j == 0))
    {
        if (x.charAt(i - 1) == y.charAt(j - 1))
        {
            xans[xpos--] = (int)x.charAt(i - 1);
            yans[ypos--] = (int)y.charAt(j - 1);
            i--; j--;
        }
        else if (dp[i - 1][j - 1] + pxy == dp[i][j])
        {
            xans[xpos--] = (int)x.charAt(i - 1);
            yans[ypos--] = (int)y.charAt(j - 1);
            i--; j--;
        }
        else if (dp[i - 1][j] + pgap == dp[i][j])
        {
            xans[xpos--] = (int)x.charAt(i - 1);
            yans[ypos--] = (int)'_';
            i--;
        }
        else if (dp[i][j - 1] + pgap == dp[i][j])
        {
            xans[xpos--] = (int)'_';
            yans[ypos--] = (int)y.charAt(j - 1);
            j--;
        }
    }
    while (xpos > 0)
    {
        if (i > 0) xans[xpos--] = (int)x.charAt(--i);
        else xans[xpos--] = (int)'_';
    }
    while (ypos > 0)
    {
        if (j > 0) yans[ypos--] = (int)y.charAt(--j);
        else yans[ypos--] = (int)'_';
    }
  
    // Since we have assumed the 
    // answer to be n+m long, 
    // we need to remove the extra 
    // gaps in the starting id 
    // represents the index from 
    // which the arrays xans,
    // yans are useful
    int id = 1;
    for (i = l; i >= 1; i--)
    {
        if ((char)yans[i] == '_' && 
            (char)xans[i] == '_')
        {
            id = i + 1;
            break;
        }
    }
  
    // Printing the final answer
    System.out.print("Minimum Penalty in " + 
                     "aligning the genes = ");
    System.out.print(dp[m][n] + "\n");
    System.out.println("The aligned genes are :");
    for (i = id; i <= l; i++)
    {
        System.out.print((char)xans[i]);
    }
    System.out.print("\n");
    for (i = id; i <= l; i++)
    {
        System.out.print((char)yans[i]);
    }
    return;
}
  
// Driver code
public static void main(String[] args)
{
    // input strings
    String gene1 = "AGGGCT";
    String gene2 = "AGGCA";
      
    // intialsing penalties
    // of different types
    int misMatchPenalty = 3;
    int gapPenalty = 2;
  
    // calling the function to
    // calculate the result
    getMinimumPenalty(gene1, gene2, 
        misMatchPenalty, gapPenalty);
}
}


C#
// C# program to implement sequence alignment 
// problem. 
using System; 
    
class GFG 
{ 
    // function to find out the minimum penalty 
    public static void getMinimumPenalty(string x, string y, int pxy, int pgap) 
    { 
        int i, j; // intialising variables 
            
        int m = x.Length; // length of gene1 
        int n = y.Length; // length of gene2 
            
        // table for storing optimal substructure answers 
        int[,] dp = new int[n+m+1,n+m+1];
        for(int q = 0; q < n+m+1; q++)
            for(int w = 0; w < n+m+1; w++)
                dp[q,w] = 0;
        
        // intialising the table  
        for (i = 0; i <= (n+m); i++) 
        { 
            dp[i,0] = i * pgap; 
            dp[0,i] = i * pgap; 
        }     
        
        // calcuting the minimum penalty 
        for (i = 1; i <= m; i++) 
        { 
            for (j = 1; j <= n; j++) 
            { 
                if (x[i - 1] == y[j - 1]) 
                { 
                    dp[i,j] = dp[i - 1,j - 1]; 
                } 
                else
                { 
                    dp[i,j] = Math.Min(Math.Min(dp[i - 1,j - 1] + pxy ,  
                                    dp[i - 1,j] + pgap)    ,  
                                    dp[i,j - 1] + pgap    ); 
                } 
            } 
        } 
        
        // Reconstructing the solution 
        int l = n + m; // maximum possible length 
            
        i = m; j = n; 
            
        int xpos = l; 
        int ypos = l; 
        
        // Final answers for the respective strings 
        int[] xans = new int[l+1];
        int [] yans = new int[l+1]; 
            
        while ( !(i == 0 || j == 0)) 
        { 
            if (x[i - 1] == y[j - 1]) 
            { 
                xans[xpos--] = (int)x[i - 1]; 
                yans[ypos--] = (int)y[j - 1]; 
                i--; j--; 
            } 
            else if (dp[i - 1,j - 1] + pxy == dp[i,j]) 
            { 
                xans[xpos--] = (int)x[i - 1]; 
                yans[ypos--] = (int)y[j - 1]; 
                i--; j--; 
            } 
            else if (dp[i - 1,j] + pgap == dp[i,j]) 
            { 
                xans[xpos--] = (int)x[i - 1]; 
                yans[ypos--] = (int)'_'; 
                i--; 
            } 
            else if (dp[i,j - 1] + pgap == dp[i,j]) 
            { 
                xans[xpos--] = (int)'_'; 
                yans[ypos--] = (int)y[j - 1]; 
                j--; 
            } 
        } 
        while (xpos > 0) 
        { 
            if (i > 0) xans[xpos--] = (int)x[--i]; 
            else xans[xpos--] = (int)'_'; 
        } 
        while (ypos > 0) 
        { 
            if (j > 0) yans[ypos--] = (int)y[--j]; 
            else yans[ypos--] = (int)'_'; 
        } 
        
        // Since we have assumed the answer to be n+m long,  
        // we need to remove the extra gaps in the starting  
        // id represents the index from which the arrays 
        // xans, yans are useful 
        int id = 1; 
        for (i = l; i >= 1; i--) 
        { 
            if ((char)yans[i] == '_' && (char)xans[i] == '_') 
            { 
                id = i + 1; 
                break; 
            } 
        } 
        
        // Printing the final answer 
        Console.Write("Minimum Penalty in aligning the genes = " + dp[m,n] + "\n"); 
        Console.Write("The aligned genes are :\n"); 
        for (i = id; i <= l; i++) 
        { 
            Console.Write((char)xans[i]); 
        } 
        Console.Write("\n"); 
        for (i = id; i <= l; i++) 
        { 
            Console.Write((char)yans[i]); 
        } 
        return; 
    } 
        
    // Driver code
    static void Main() 
    { 
        // input strings 
        string gene1 = "AGGGCT"; 
        string gene2 = "AGGCA"; 
            
        // intialsing penalties of different types 
        int misMatchPenalty = 3; 
        int gapPenalty = 2; 
        
        // calling the function to calculate the result 
        getMinimumPenalty(gene1, gene2,  
            misMatchPenalty, gapPenalty);
    }
    //This code is contributed by DrRoot_
}


PHP
 0)
    {
        if ($i > 0) $xans[$xpos--] = $x[--$i];
        else $xans[$xpos--] = '_';
    }
    while ($ypos > 0)
    {
        if ($j > 0) 
            $yans[$ypos--] = $y[--$j];
        else 
            $yans[$ypos--] = '_';
    }
  
    // Since we have assumed the
    // answer to be n+m long, 
    // we need to remove the extra
    // gaps in the starting 
    // id represents the index 
    // from which the arrays
    // xans, yans are useful
    $id = 1;
    for ($i = $l; $i >= 1; $i--)
    {
        if ($yans[$i] == '_' && 
            $xans[$i] == '_')
        {
            $id = $i + 1;
            break;
        }
    }
  
    // Printing the final answer
    echo "Minimum Penalty in ". 
         "aligning the genes = ";
    echo $dp[$m][$n] . "\n";
    echo "The aligned genes are :\n";
    for ($i = $id; $i <= $l; $i++)
    {
        echo $xans[$i];
    }
    echo "\n";
    for ($i = $id; $i <= $l; $i++)
    {
        echo $yans[$i];
    }
    return;
}
  
// Driver code
  
// input strings
$gene1 = "AGGGCT";
$gene2 = "AGGCA";
  
// intialsing penalties
// of different types
$misMatchPenalty = 3;
$gapPenalty = 2;
  
// calling the function 
// to calculate the result
getMinimumPenalty($gene1, $gene2, 
    $misMatchPenalty, $gapPenalty);
  
// This code is contributed by Abhinav96
?>


输出:
Minimum Penalty in aligning the genes = 5
The aligned genes are :
AGGGCT
A_GGCA

时间复杂度: \mathcal{O}(n*m)
空间复杂度: \mathcal{O}(n*m)