[Bio] / FigKernelPackages / gjoseqlib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/gjoseqlib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.10, Fri Nov 23 00:33:31 2007 UTC revision 1.11, Thu Jan 3 21:36:41 2008 UTC
# Line 99  Line 99 
99  #  @seq_entries = read_qual(  $filename )  #  @seq_entries = read_qual(  $filename )
100  # \@seq_entries = read_qual(  $filename )  # \@seq_entries = read_qual(  $filename )
101  #  #
102    #  Evaluate nucleotide alignments:
103    #
104    #  $fraction_diff = fraction_nt_diff( $seq1, $seq2, \%options )
105    #  $fraction_diff = fraction_nt_diff( $seq1, $seq2 )
106    #  $fraction_diff = fraction_nt_diff( $seq1, $seq2, $gap_weight )
107    #  $fraction_diff = fraction_nt_diff( $seq1, $seq2, $gap_open, $gap_extend )
108    #
109    #  ( $npos, $nid, $ndif, $ngap, $nopen, $tgap, $topen ) = interpret_nt_align( $seq1, $seq2 )
110    #
111    
112  use strict;  use strict;
113  use Carp;  use Carp;
# Line 172  Line 181 
181          gb_location_2_seed          gb_location_2_seed
182    
183          read_qual          read_qual
184    
185            fraction_nt_diff
186            interpret_nt_align
187          );          );
188    
189  our @EXPORT_OK = qw(  our @EXPORT_OK = qw(
# Line 1829  Line 1841 
1841  }  }
1842    
1843    
1844    #-------------------------------------------------------------------------------
1845    #  Fraction difference for an alignment of two nucleotide sequences in terms of
1846    #  number of differing residues, number of gaps, and number of gap opennings.
1847    #
1848    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2, \%options )
1849    #
1850    #  or
1851    #
1852    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2 )
1853    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2, $gap_wgt )
1854    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2, $open_wgt, $extend_wgt )
1855    #
1856    #  Options:
1857    #
1858    #      gap      => $gap_wgt          # Gap open and extend weight (D = 0.5)
1859    #      open     => $open_wgt         # Gap openning weight (D = gap_wgt)
1860    #      extend   => $extend_wgt       # Gap extension weight (D = open_wgt)
1861    #      t_gap    => $term_gap_wgt     # Terminal open and extend weight
1862    #      t_open   => $term_open_wgt    # Terminal gap open weight (D = open_wgt)
1863    #      t_extend => $term_extend_wgt  # Terminal gap extend weight (D = extend_wgt)
1864    #
1865    #  Default gap open and gap extend weights are 1/2.  Beware that
1866    #
1867    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2, 1 )
1868    #
1869    #  and
1870    #
1871    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2, 1, 0 )
1872    #
1873    #  are different.  The first has equal openning and extension weights, whereas
1874    #  the second has an openning weight of 1, and and extension weight of 0 (it
1875    #  only penalizes the number of runs of gaps).
1876    #-------------------------------------------------------------------------------
1877    sub fraction_nt_diff
1878    {
1879        my ( $npos, $nid, $ndif, $ngap, $nopen, $tgap, $topen ) = interpret_nt_align( @_[0,1] );
1880    
1881        my $diff_scr;
1882        if ( ref( $_[2] ) eq 'HASH' )
1883        {
1884            my $opts = $_[2];
1885            my $gap_open    = defined $opts->{ open }     ? $opts->{ open }
1886                            : defined $opts->{ gap }      ? $opts->{ gap }
1887                            : 0.5;
1888            my $gap_extend  = defined $opts->{ extend }   ? $opts->{ extend }
1889                            : $gap_open;
1890            my $term_open   = defined $opts->{ t_open }   ? $opts->{ t_open }
1891                            : defined $opts->{ t_gap }    ? $opts->{ t_gap }
1892                            : $gap_open;
1893            my $term_extend = defined $opts->{ t_extend } ? $opts->{ t_extend }
1894                            : defined $opts->{ t_gap }    ? $opts->{ t_gap }
1895                            : $gap_extend;
1896    
1897            $nopen -= $topen;
1898            $ngap  -= $tgap;
1899            $diff_scr = $ndif + $gap_open  * $nopen + $gap_extend  * ($ngap-$nopen)
1900                              + $term_open * $topen + $term_extend * ($tgap-$topen);
1901        }
1902        else
1903        {
1904            my $gap_open   = defined( $_[2] ) ? $_[2] : 0.5;
1905            my $gap_extend = defined( $_[3] ) ? $_[3] : $gap_open;
1906            $diff_scr = $ndif + $gap_open * $nopen + $gap_extend * ($ngap-$nopen);
1907        }
1908        my $ttl_scr = $nid + $diff_scr;
1909    
1910        $ttl_scr ? $diff_scr / $ttl_scr : undef
1911    }
1912    
1913    
1914    #-------------------------------------------------------------------------------
1915    #  Interpret an alignment of two nucleotide sequences in terms of: useful
1916    #  aligned positions (unambiguous, and not a common gap), number of identical
1917    #  residues, number of differing residues, number of gaps, and number of gap
1918    #  opennings.
1919    #
1920    #     ( $npos, $nid, $ndif, $ngap, $nopen, $tgap, $topen ) = interpret_nt_align( $seq1, $seq2 )
1921    #
1922    #  $npos  = total aligned positons (= $nid + $ndif + $ngap)
1923    #  $nid   = number of positions with identical nucleotides (ignoring case)
1924    #  $ndif  = number of positions with differing nucleotides
1925    #  $ngap  = number of positions with gap in one sequence but not the other
1926    #  $nopen = number of runs of gaps
1927    #  $tgap  = number of gaps in runs adjacent to a terminus
1928    #  $topen = number of alignment ends with gaps
1929    #
1930    #  Some of the methods might seem overly complex, but are necessary for cases
1931    #  in which the gaps switch strands in the alignment:
1932    #
1933    #     seq1  ---ACGTGAC--TTGCAGAG
1934    #     seq2  TTT---TGACGG--GCAGGG
1935    #     mask  00000011110000111111
1936    #
1937    #     npos  = 20
1938    #     nid   =  9
1939    #     ndif  =  1
1940    #     ngap  = 10
1941    #     nopen =  4
1942    #     tgap  =  3
1943    #     topen =  1
1944    #
1945    #  Although there are 4 gap opennings, there are only 2 runs in the mask,
1946    #  and the terminal run is length 6, not 3.  (Why handle these?  Because
1947    #  pairs of sequences from a multiple sequence alignment can look like this.)
1948    #-------------------------------------------------------------------------------
1949    sub interpret_nt_align
1950    {
1951        #  Remove alignment columns that are not informative:
1952        my ( $s1, $s2 ) = useful_nt_align( @_[0,1] );
1953        my $nmat = length( $s1 );          # Useful alignment length
1954    
1955        my $m1 = $s1;
1956        $m1 =~ tr/ACGT/\377/;              # Nucleotides to all 1 bits
1957        $m1 =~ tr/\377/\000/c;             # Others (gaps) to null byte
1958        my $m2 = $s2;
1959        $m2 =~ tr/ACGT/\377/;              # Nucleotides to all 1 bits
1960        $m2 =~ tr/\377/\000/c;             # Others (gaps) to null byte
1961        $m1 &= $m2;                        # Gap in either sequence becomes null
1962        $s1 &= $m1;                        # Apply mask to sequence 1
1963        $s2 &= $m1;                        # Apply mask to sequence 2
1964        my $nopen = @{[ $s1 =~ /\000+/g ]}   # Gap opens in sequence 1
1965                  + @{[ $s2 =~ /\000+/g ]};  # Gap opens in sequence 2
1966        my ( $tgap, $topen ) = ( 0, 0 );
1967        if ( $s1 =~ /^(\000+)/ || $s2 =~ /^(\000+)/ ) { $tgap += length( $1 ); $topen++ }
1968        if ( $s1 =~ /(\000+)$/ || $s2 =~ /(\000+)$/ ) { $tgap += length( $1 ); $topen++ }
1969        $s1 =~ tr/\000//d;                 # Remove nulls (former gaps)
1970        $s2 =~ tr/\000//d;                 # Remove nulls (former gaps)
1971        my $ngap = $nmat - length( $s1 );  # Total gaps
1972    
1973        my $xor = $s1 ^ $s2;               # xor of identical residues is null byte
1974        my $nid = ( $xor =~ tr/\000//d );  # Count the nulls (identical residues)
1975        my $ndif = $nmat - $nid - $ngap;
1976    
1977        ( $nmat, $nid, $ndif, $ngap, $nopen, $tgap, $topen )
1978    }
1979    
1980    
1981    sub useful_nt_align
1982    {
1983        my ( $s1, $s2 ) = map { uc $_ } @_;
1984        $s1 =~ tr/U/T/;         # Convert U to T
1985        my $m1 = $s1;
1986        $m1 =~ tr/ACGT-/\377/;  # Allowed symbols to hex FF byte
1987        $m1 =~ tr/\377/\000/c;  # All else to null byte
1988        $s2 =~ tr/U/T/;         # Convert U to T
1989        my $m2 = $s2;
1990        $m2 =~ tr/ACGT-/\377/;  # Allowed symbols to hex FF byte
1991        $m2 =~ tr/\377/\000/c;  # All else to null byte
1992        $m1 &= $m2;             # Invalid in either sequence becomes null
1993        $s1 &= $m1;             # Apply mask to sequence 1
1994        $s1 =~ tr/\000//d;      # Delete nulls in sequence 1
1995        $s2 &= $m1;             # Apply mask to sequence 2
1996        $s2 =~ tr/\000//d;      # Delete nulls in sequence 2
1997        ( $s1, $s2 )
1998    }
1999    
2000    
2001  1;  1;

Legend:
Removed from v.1.10  
changed lines
  Added in v.1.11

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3