package gjoseqlib; # read_fasta_seqs( *FILEHANDLE ) # read_next_fasta_seq( *FILEHANDLE ) # parse_fasta_title( $title ) # split_fasta_title( $title ) # print_seq_list_as_fasta( *FILEHANDLE, @seq_entry_list ) # print_seq_as_fasta( *FILEHANDLE, $id, $desc, $seq ) # print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq ) # index_seq_list( @seq_entry_list ) # seq_entry_by_id( \%seq_index, $seq_id ) # seq_desc_by_id( \%seq_index, $seq_id ) # seq_data_by_id( \%seq_index, $seq_id ) # subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] ) # subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] ) # complement_DNA_entry( @seq_entry [, $fix_id] ) # complement_RNA_entry( @seq_entry [, $fix_id] ) # complement_DNA_seq( $NA_seq ) # complement_RNA_seq( $NA_seq ) # to_DNA_seq( $NA_seq ) # to_RNA_seq( $NA_seq ) # clean_ae_sequence( $seq ) # translate_seq( $seq [, $met_start] ) # translate_codon( $triplet ) # translate_seq_with_user_code( $seq, \%gen_code [, $met_start] ) # read_intervals( *FILEHANDLE ) # join_intervals( @interval_refs ) use gjolib qw(wrap_text); require Exporter; our @ISA = qw(Exporter); our @EXPORT = qw( read_fasta_seqs read_next_fasta_seq parse_fasta_title split_fasta_title print_seq_list_as_fasta print_seq_as_fasta print_gb_locus index_seq_list seq_entry_by_id seq_desc_by_id seq_data_by_id subseq_DNA_entry subseq_RNA_entry complement_DNA_entry complement_RNA_entry complement_DNA_seq complement_RNA_seq to_DNA_seq to_RNA_seq clean_ae_sequence translate_seq translate_codon translate_seq_with_user_code read_intervals join_intervals ); use strict; #----------------------------------------------------------------------------- # Read fasta sequences from a file. # Save the contents in a list of refs to arrays: (id, description, seq) # # @seqs = read_fasta_seqs(*FILEHANDLE) #----------------------------------------------------------------------------- sub read_fasta_seqs { my $fh = shift; wantarray || die "read_fasta_seqs requires list context\n"; my @seqs = (); my ($id, $desc, $seq) = ("", "", ""); while (<$fh>) { chomp; if (/^>\s*(\S+)(\s+(.*))?$/) { # new id if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] } ($id, $desc, $seq) = ($1, $3 ? $3 : "", ""); } else { tr/\t 0-9//d; $seq .= $_ ; } } if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] } return @seqs; } #----------------------------------------------------------------------------- # Read one fasta sequence at a time from a file. # Return the contents as an array: (id, description, seq) # # @seq_entry = read_next_fasta_seq(*FILEHANDLE) #----------------------------------------------------------------------------- # Reading always overshoots, so save next id and description # (these should really be hashes indexed by file handle): # my $next_fasta_seq_header = ""; sub read_next_fasta_seq { my $fh = shift; wantarray || die "read_next_fasta_seq requires list context\n"; my ( $id, $desc ) = parse_fasta_title( $next_fasta_seq_header ); my $seq = ""; while ( <$fh> ) { chomp; if ( /^>/ ) { # new id $next_fasta_seq_header = $_; if ( $id && $seq ) { return ($id, $desc, $seq) } ( $id, $desc ) = parse_fasta_title( $next_fasta_seq_header ); $seq = ""; } else { tr/\t 0-9//d; $seq .= $_ ; } } # Done with file, clear "next header" $next_fasta_seq_header = ""; return ($id && $seq) ? ($id, $desc, $seq) : () ; } #----------------------------------------------------------------------------- # Parse a fasta file header to id and definition parts # # ($id, $def) = parse_fasta_title( $title ) # ($id, $def) = split_fasta_title( $title ) #----------------------------------------------------------------------------- sub parse_fasta_title { my $title = shift; chomp; if ($title =~ /^>?\s*(\S+)(:?\s+(.*\S)\s*)?$/) { return ($1, $3 ? $3 : ""); } else { return ("", ""); } } sub split_fasta_title { parse_fasta_title ( shift ); } #----------------------------------------------------------------------------- # Print list of sequence entries in fasta format # # print_seq_list_as_fasta(*FILEHANDLE, @seq_entry_list); #----------------------------------------------------------------------------- sub print_seq_list_as_fasta { my $fh = shift; my @seq_list = @_; foreach my $seq_ptr (@seq_list) { print_seq_as_fasta($fh, @$seq_ptr); } } #----------------------------------------------------------------------------- # Print one sequence in fasta format to an open file # # print_seq_as_fasta(*FILEHANDLE, $id, $desc, $seq); # print_seq_as_fasta(*FILEHANDLE, @seq_entry); #----------------------------------------------------------------------------- sub print_seq_as_fasta { my $fh = shift; my ($id, $desc, $seq) = @_; printf $fh ($desc) ? ">$id $desc\n" : ">$id\n"; my $len = length($seq); for (my $i = 0; $i < $len; $i += 60) { print $fh substr($seq, $i, 60) . "\n"; } } #----------------------------------------------------------------------------- # Print one sequence in GenBank flat file format: # # print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq ) #----------------------------------------------------------------------------- sub print_gb_locus { my ($fh, $loc, $def, $acc, $seq) = @_; my ($len, $i, $imax); my $istep = 10; $len = length($seq); printf $fh "LOCUS %-10s%7d bp\n", substr($loc,0,10), $len; print $fh "DEFINITION " . substr(wrap_text($def,80,12), 12) . "\n"; if ($acc) { print $fh "ACCESSION $acc\n" } print $fh "ORIGIN\n"; for ($i = 1; $i <= $len; ) { printf $fh "%9d", $i; $imax = $i + 59; if ($imax > $len) { $imax = $len } for ( ; $i <= $imax; $i += $istep) { print $fh " " . substr($seq, $i-1, $istep); } print $fh "\n"; } print $fh "//\n"; } #----------------------------------------------------------------------------- # Build an index from seq_id to pointer to sequence entry: (id, desc, seq) # # my %seq_ind = index_seq_list(@seq_list); # # Usage example: # # my @seq_list = read_fasta_seqs(*STDIN); # list of pointers to entries # my %seq_ind = index_seq_list(@seq_list); # hash from names to pointers # my @chosen_seq = @{%seq_ind{"contig1"}}; # extract one entry # #----------------------------------------------------------------------------- sub index_seq_list { my %seq_index = map { @{$_}[0] => $_ } @_; return \%seq_index; } #----------------------------------------------------------------------------- # Three routines to access all or part of sequence entry by id: # # my @seq_entry = seq_entry_by_id( \%seq_index, $seq_id ); # my $seq_desc = seq_desc_by_id( \%seq_index, $seq_id ); # my $seq = seq_data_by_id( \%seq_index, $seq_id ); # #----------------------------------------------------------------------------- sub seq_entry_by_id { (my $ind_ref = shift) || die "No index supplied to seq_entry_by_id\n"; (my $id = shift) || die "No id supplied to seq_entry_by_id\n"; wantarray || die "entry_by_id requires list context\n"; return @{ $ind_ref->{$id} }; } sub seq_desc_by_id { (my $ind_ref = shift) || die "No index supplied to seq_desc_by_id\n"; (my $id = shift) || die "No id supplied to seq_desc_by_id\n"; return ${ $ind_ref->{$id} }[1]; } sub seq_data_by_id { (my $ind_ref = shift) || die "No index supplied to seq_data_by_id\n"; (my $id = shift) || die "No id supplied to seq_data_by_id\n"; return ${ $ind_ref->{$id} }[2]; } #----------------------------------------------------------------------------- # Some simple sequence manipulations: # # @entry = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] ); # @entry = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] ); # @entry = complement_DNA_entry( @seq_entry [, $fix_id] ); # @entry = complement_RNA_entry( @seq_entry [, $fix_id] ); # $DNAseq = complement_DNA_seq( $NA_seq ); # $RNAseq = complement_RNA_seq( $NA_seq ); # $DNAseq = to_DNA_seq( $NA_seq ); # $RNAseq = to_RNA_seq( $NA_seq ); # #----------------------------------------------------------------------------- sub subseq_DNA_entry { my ($id, $desc, @rest) = @_; wantarray || die "subseq_DNA_entry requires array context\n"; my $seq; ($id, $seq) = subseq_nt(1, $id, @rest); # 1 is for DNA, not RNA return ($id, $desc, $seq); } sub subseq_RNA_entry { my ($id, $desc, @rest) = @_; wantarray || die "subseq_RNA_entry requires array context\n"; my $seq; ($id, $seq) = subseq_nt(0, $id, @rest); # 0 is for not DNA, i.e., RNA return ($id, $desc, $seq); } sub subseq_nt { my ($DNA, $id, $seq, $from, $to, $fix_id) = @_; $fix_id ||= 0; # fix undef value my $len = length($seq); if ( ( $from eq '$' ) || ( $from eq "" ) ) { $from = $len } if (! $to || ( $to eq '$' ) || ( $to eq "" ) ) { $to = $len } my $left = ( $from < $to ) ? $from : $to; my $right = ( $from < $to ) ? $to : $from; if ( ( $right < 1 ) || ( $left > $len ) ) { return ($id, "") } if ( $right > $len ) { $right = $len } if ( $left < 1 ) { $left = 1 } $seq = substr($seq, $left-1, $right-$left+1); if ( $from > $to ) { $seq = reverse $seq; if ( $DNA ) { $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn] [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn]; } else { $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn] [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn]; } } if ( $fix_id ) { if ( ( $id =~ s/_([0-9]+)_([0-9]+)$// ) && ( abs($2-$1)+1 == $len ) ) { if ( $1 <= $2 ) { $from += $1 - 1; $to += $1 - 1 } else { $from = $1 + 1 - $from; $to = $1 + 1 - $to } } $id .= "_" . $from . "_" . $to; } return ($id, $seq); } sub complement_DNA_entry { my ($id, $desc, $seq, $fix_id) = @_; $fix_id ||= 0; # fix undef values wantarray || die "complement_DNA_entry requires array context\n"; $seq = reverse $seq; $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn] [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn]; if ($fix_id) { if ($id =~ s/_([0-9]+)_([0-9]+)$//) { $id .= "_" . $2 . "_" . $1; } else { $id .= "_" . length($seq) . "_1"; } } return ($id, $desc, $seq); } sub complement_RNA_entry { my ($id, $desc, $seq, $fix_id) = @_; $fix_id ||= 0; # fix undef values wantarray || die "complement_DNA_entry requires array context\n"; $seq = reverse $seq; $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn] [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn]; if ($fix_id) { if ($id =~ s/_([0-9]+)_([0-9]+)$//) { $id .= "_" . $2 . "_" . $1; } else { $id .= "_" . length($seq) . "_1"; } } return ($id, $desc, $seq); } sub complement_DNA_seq { my $seq = reverse shift; $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn] [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn]; return $seq; } sub complement_RNA_seq { my $seq = reverse shift; $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn] [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn]; return $seq; } sub to_DNA_seq { my $seq = shift; $seq =~ tr/Uu/Tt/; return $seq; } sub to_RNA_seq { my $seq = shift; $seq =~ tr/Tt/Uu/; return $seq; } sub clean_ae_sequence { $_ = shift; $_ = to7bit($_); s/[+]/1/g; s/[^0-9A-IK-NP-Za-ik-np-z~.-]/-/g; return $_; } sub to7bit { $_ = shift; my ($o, $c); while (/\\([0-3][0-7][0-7])/) { $o = oct($1) % 128; $c = sprintf("%c", $o); s/\\$1/$c/g; } return $_; } sub to8bit { $_ = shift; my ($o, $c); while (/\\([0-3][0-7][0-7])/) { $o = oct($1); $c = sprintf("%c", $o); s/\\$1/$c/g; } return $_; } #----------------------------------------------------------------------------- # Translate nucleotides to one letter protein: # # $seq = translate_seq( $seq [, $met_start] ) # $aa = translate_codon( $triplet ); # $aa = translate_uc_DNA_codon( $upcase_DNA_triplet ); # # User-supplied genetic code must be upper case index and match the # DNA versus RNA type of sequence # # $seq = translate_seq_with_user_code( $seq, $gen_code_hash [, $met_start] ) # #----------------------------------------------------------------------------- our %genetic_code = ( # work as DNA TTT => "F", TCT => "S", TAT => "Y", TGT => "C", TTC => "F", TCC => "S", TAC => "Y", TGC => "C", TTA => "L", TCA => "S", TAA => "*", TGA => "*", TTG => "L", TCG => "S", TAG => "*", TGG => "W", CTT => "L", CCT => "P", CAT => "H", CGT => "R", CTC => "L", CCC => "P", CAC => "H", CGC => "R", CTA => "L", CCA => "P", CAA => "Q", CGA => "R", CTG => "L", CCG => "P", CAG => "Q", CGG => "R", ATT => "I", ACT => "T", AAT => "N", AGT => "S", ATC => "I", ACC => "T", AAC => "N", AGC => "S", ATA => "I", ACA => "T", AAA => "K", AGA => "R", ATG => "M", ACG => "T", AAG => "K", AGG => "R", GTT => "V", GCT => "A", GAT => "D", GGT => "G", GTC => "V", GCC => "A", GAC => "D", GGC => "G", GTA => "V", GCA => "A", GAA => "E", GGA => "G", GTG => "V", GCG => "A", GAG => "E", GGG => "G", # The following ambiguous encodings are not necessary, but # speed up the processing of some common ambiguous triplets: TTY => "F", TCY => "S", TAY => "Y", TGY => "C", TTR => "L", TCR => "S", TAR => "*", TCN => "S", CTY => "L", CCY => "P", CAY => "H", CGY => "R", CTR => "L", CCR => "P", CAR => "Q", CGR => "R", CTN => "L", CCN => "P", CGN => "R", ATY => "I", ACY => "T", AAY => "N", AGY => "S", ACR => "T", AAR => "K", AGR => "R", ACN => "T", GTY => "V", GCY => "A", GAY => "D", GGY => "G", GTR => "V", GCR => "A", GAR => "E", GGR => "G", GTN => "V", GCN => "A", GGN => "G" ); my %DNA_letter_can_be = ( A => ["A"], a => ["a"], B => ["C", "G", "T"], b => ["c", "g", "t"], C => ["C"], c => ["c"], D => ["A", "G", "T"], d => ["a", "g", "t"], G => ["G"], g => ["g"], H => ["A", "C", "T"], h => ["a", "c", "t"], K => ["G", "T"], k => ["g", "t"], M => ["A", "C"], m => ["a", "c"], N => ["A", "C", "G", "T"], n => ["a", "c", "g", "t"], R => ["A", "G"], r => ["a", "g"], S => ["C", "G"], s => ["c", "g"], T => ["T"], t => ["t"], U => ["T"], u => ["t"], V => ["A", "C", "G"], v => ["a", "c", "g"], W => ["A", "T"], w => ["a", "t"], Y => ["C", "T"], y => ["c", "t"] ); my %RNA_letter_can_be = ( A => ["A"], a => ["a"], B => ["C", "G", "U"], b => ["c", "g", "u"], C => ["C"], c => ["c"], D => ["A", "G", "U"], d => ["a", "g", "u"], G => ["G"], g => ["g"], H => ["A", "C", "U"], h => ["a", "c", "u"], K => ["G", "U"], k => ["g", "u"], M => ["A", "C"], m => ["a", "c"], N => ["A", "C", "G", "U"], n => ["a", "c", "g", "u"], R => ["A", "G"], r => ["a", "g"], S => ["C", "G"], s => ["c", "g"], T => ["U"], t => ["u"], U => ["U"], u => ["u"], V => ["A", "C", "G"], v => ["a", "c", "g"], W => ["A", "U"], w => ["a", "u"], Y => ["C", "U"], y => ["c", "u"] ); my %one_letter_to_three_letter_aa = ( A => "Ala", B => "Asx", C => "Cys", D => "Asp", E => "Glu", F => "Phe", G => "Gly", H => "His", I => "Ile", K => "Lys", L => "Leu", M => "Met", N => "Asn", P => "Pro", Q => "Gln", R => "Arg", S => "Ser", T => "Thr", U => "Sec", V => "Val", W => "Trp", X => "Xxx", Y => "Tyr", Z => "Glx", "*" => "***" ); #----------------------------------------------------------------------------- # Translate nucleotides to one letter protein: # # $seq = translate_seq( $seq [, $met_start] ) # #----------------------------------------------------------------------------- sub translate_seq { my $seq = uc shift; $seq =~ tr/UX/TN/; # make it DNA, and allow X $seq =~ tr/-//d; # remove gaps my $met = shift || 0; # a second argument that is true # forces first amino acid to be Met # (note: undef is false) my $imax = length($seq) - 2; # will try to translate 2 nucleotides! my $pep = ($met && ($imax >= 0)) ? "M" : ""; my $aa; for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) { $pep .= translate_uc_DNA_codon( substr($seq,$i,3) ); } return $pep; } #----------------------------------------------------------------------------- # Translate a single triplet with "universal" genetic code # Uppercase and DNA are performed, then translate_uc_DNA_codon # is called. # # $aa = translate_codon( $triplet ) # #----------------------------------------------------------------------------- sub translate_codon { my $codon = uc shift; # Make it uppercase $codon =~ tr/UX/TN/; # Make it DNA, and allow X return translate_uc_DNA_codon($codon); } #----------------------------------------------------------------------------- # Translate a single triplet with "universal" genetic code # Uppercase and DNA assumed # Intended for private use by translate_codon and translate_seq # # $aa = translate_uc_DNA_codon( $triplet ) # #----------------------------------------------------------------------------- sub translate_uc_DNA_codon { my $codon = shift; my $aa; # Try a simple lookup: if ( $aa = $genetic_code{ $codon } ) { return $aa } # With the expanded code defined above, this catches simple N, R # and Y ambiguities in the third position. Other codes like # GG[KMSWBDHV], or even GG, might be unambiguously translated by # converting the last position to N and seeing if this is in the # (expanded) code table: if ( $aa = $genetic_code{ substr($codon,0,2) . "N" } ) { return $aa } # Test that codon is valid and might have unambiguous aa: if ( $codon !~ m/^[ACGTMY][ACGT][ACGTKMRSWYBDHVN]$/ ) { return "X" } # ^^ # |+- for leucine YTR # +-- for arginine MGR # Expand all ambiguous nucleotides to see if they all yield same aa. # Loop order tries to fail quickly with first position change. $aa = ""; for my $n2 ( @{ $DNA_letter_can_be{ substr($codon,1,1) } } ) { for my $n3 ( @{ $DNA_letter_can_be{ substr($codon,2,1) } } ) { for my $n1 ( @{ $DNA_letter_can_be{ substr($codon,0,1) } } ) { # set the first value of $aa if ($aa eq "") { $aa = $genetic_code{ $n1 . $n2 . $n3 } } # or break out if any other amino acid is detected elsif ($aa ne $genetic_code{ $n1 . $n2 . $n3 } ) { return "X" } } } } return $aa || "X"; } #----------------------------------------------------------------------------- # Translate with a user-supplied genetic code to translate a sequence. # Diagnose the use of upper versus lower, and T versus U in the supplied # code, and transform the supplied nucleotide sequence to match. # # translate_seq_with_user_code($seq, \%gen_code [, $start_with_met] ) # #----------------------------------------------------------------------------- sub translate_seq_with_user_code { my $seq = shift; $seq =~ tr/-//d; # remove gaps *** Why? $seq =~ tr/Xx/Nn/; # allow X my $gc = shift; # Reference to hash of DNA alphabet code if (! $gc || ref($gc) ne "HASH") { die "translate_seq_with_user_code needs genetic code hash as secondargument."; } # Test the type of code supplied: uppercase versus lowercase my ($RNA_F, $DNA_F, $M, $N, $X); if ($gc->{ "AAA" }) { # Looks like uppercase code table $seq = uc $seq; # Uppercase sequence $RNA_F = "UUU"; # Uppercase RNA Phe codon $DNA_F = "TTT"; # Uppercase DNA Phe codon $M = "M"; # Uppercase initiator $N = "N"; # Uppercase ambiguous nuc $X = "X"; # Uppercase ambiguous aa } elsif ($gc->{ "aaa" }) { # Looks like lowercase code table $seq = lc $seq; # Lowercase sequence $RNA_F = "uuu"; # Lowercase RNA Phe codon $DNA_F = "ttt"; # Lowercase DNA Phe codon $M = "m"; # Lowercase initiator $N = "n"; # Lowercase ambiguous nuc $X = "x"; # Lowercase ambiguous aa } else { die "User-supplied genetic code does not have aaa or AAA\n"; } # Test the type of code supplied: UUU versus TTT my ($ambigs); if ($gc->{ $RNA_F }) { # Looks like RNA code table $seq =~ tr/Tt/Uu/; $ambigs = \%RNA_letter_can_be; } elsif ($gc->{ $DNA_F }) { # Looks like DNA code table $seq =~ tr/Uu/Tt/; $ambigs = \%DNA_letter_can_be; } else { die "User-supplied genetic code does not have $RNA_F or $DNA_F\n"; } my $imax = length($seq) - 2; # will try to translate 2 nucleotides! my $met = shift; # a third argument that is true # forces first amino acid to be Met # (note: undef is false) my $pep = ($met && ($imax >= 0)) ? $M : ""; my $aa; for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) { $pep .= translate_codon_with_user_code( substr($seq,$i,3), $gc, $N, $X, $ambigs ); } return $pep; } #----------------------------------------------------------------------------- # Translate with user-supplied genetic code hash. For speed, no error # check on the hash. Calling programs should check for the hash at a # higher level. # # Should only be called through translate_seq_with_user_code # # translate_codon_with_user_code( $triplet, \%code, $N, $X, $ambig_table ) # # $triplet speaks for itself # $code ref to the hash with the codon translations # $N character to use for ambiguous nucleotide # $X character to use for ambiguous amino acid # $ambig_table ref to hash with lists of nucleotides for each ambig code #----------------------------------------------------------------------------- sub translate_codon_with_user_code { my $codon = shift; my $gc = shift; my $aa; # Try a simple lookup: if ( $aa = $gc->{ $codon } ) { return $aa } # Test that codon is valid and might have unambiguous aa: my ($N, $X, $ambigs) = @_; if ( $codon =~ m/^[ACGTUMY][ACGTU]$/i ) { $codon .= $N } if ( $codon !~ m/^[ACGTUMY][ACGTU][ACGTUKMRSWYBDHVN]$/i ) { return $X } # ^^ # |+- for leucine YTR # +-- for arginine MGR # Expand all ambiguous nucleotides to see if they all yield same aa. # Loop order tries to fail quickly with first position change. $aa = ""; for my $n2 ( @{ $ambigs->{ substr($codon,1,1) } } ) { for my $n3 ( @{ $ambigs->{ substr($codon,2,1) } } ) { for my $n1 ( @{ $ambigs->{ substr($codon,0,1) } } ) { # set the first value of $aa if ($aa eq "") { $aa = $gc->{ $n1 . $n2 . $n3 } } # break out if any other amino acid is detected elsif ($aa ne $gc->{ $n1 . $n2 . $n3 } ) { return "X" } } } } return $aa || $X; } #----------------------------------------------------------------------------- # Read a list of intervals from a file. # Allow id_start_end, or id \s start \s end formats # # @intervals = read_intervals(*FILEHANDLE) #----------------------------------------------------------------------------- sub read_intervals { my $fh = shift; my @intervals = (); while (<$fh>) { chomp; /^(\S+)_(\d+)_(\d+)(\s.*)?$/ # id_start_end WIT2 || /^(\S+)_(\d+)-(\d+)(\s.*)?$/ # id_start-end ??? || /^(\S+)=(\d+)=(\d+)(\s.*)?$/ # id=start=end Badger || /^(\S+)\s+(\d+)\s+(\d+)(\s.*)?$/ # id \s start \s end || next; # Matched a pattern. Store reference to (id, left, right): push @intervals, ($2 < $3) ? [ $1, $2+0, $3+0 ] : [ $1, $3+0, $2+0 ]; } return @intervals; } #----------------------------------------------------------------------------- # Take the union of a list of intervals # # @joined = join_intervals( @interval_refs ) #----------------------------------------------------------------------------- sub join_intervals { my @ordered = sort { $a->[0] cmp $b->[0] # first by id || $a->[1] <=> $b->[1] # next by left end || $b->[2] <=> $a->[2] # finally longest first } @_; my @joined = (); my $n_int = @ordered; my ($cur_id) = ""; my ($cur_left) = -1; my ($cur_right) = -1; my ($new_id, $new_left, $new_right); for (my $i = 0; $i < $n_int; $i++) { ($new_id, $new_left, $new_right) = @{$ordered[$i]}; # get the new data if ( ( $new_id ne $cur_id) # if new contig || ( $new_left > $cur_right + 1) # or not touching previous ) { # push the previous interval if ($cur_id) { push (@joined, [ $cur_id, $cur_left, $cur_right ]) } $cur_id = $new_id; # update the current interval $cur_left = $new_left; $cur_right = $new_right; } elsif ($new_right > $cur_right) { # extend the right end if necessary $cur_right = $new_right; } } if ($cur_id) { push (@joined, [$cur_id, $cur_left, $cur_right]) } return @joined; } 1;