Parent Directory
|
Revision Log
Revision 1.4 - (view) (download) (as text)
1 : | olson | 1.4 | # |
2 : | # Copyright (c) 2003-2006 University of Chicago and Fellowship | ||
3 : | # for Interpretations of Genomes. All Rights Reserved. | ||
4 : | # | ||
5 : | # This file is part of the SEED Toolkit. | ||
6 : | # | ||
7 : | # The SEED Toolkit is free software. You can redistribute | ||
8 : | # it and/or modify it under the terms of the SEED Toolkit | ||
9 : | # Public License. | ||
10 : | # | ||
11 : | # You should have received a copy of the SEED Toolkit Public License | ||
12 : | # along with this program; if not write to the University of Chicago | ||
13 : | # at info@ci.uchicago.edu or the Fellowship for Interpretation of | ||
14 : | # Genomes at veronika@thefig.info or download a copy from | ||
15 : | # http://www.theseed.org/LICENSE.TXT. | ||
16 : | # | ||
17 : | |||
18 : | overbeek | 1.1 | |
19 : | $usage = "match_dna_to_aligned_prots dna_filename aligned_prot_filename"; | ||
20 : | |||
21 : | (($dna_filename = shift @ARGV) && ($aligned_prot_filename = shift @ARGV)) || | ||
22 : | die $usage; | ||
23 : | |||
24 : | open(DNA,"<$dna_filename") or die "failed to open dna"; | ||
25 : | open(PROT,"<$aligned_prot_filename") or die "failed to open prot"; | ||
26 : | while (defined($line = <DNA>)) | ||
27 : | { | ||
28 : | $line =~ />(\S+)/; | ||
29 : | $id = $1; | ||
30 : | $seq = <DNA>; | ||
31 : | chomp $seq; | ||
32 : | $dnaseqs{$id} = $seq; | ||
33 : | } | ||
34 : | |||
35 : | while (defined($line = <PROT>)) | ||
36 : | { | ||
37 : | $line =~ />(\S+)/; | ||
38 : | $id = $1; | ||
39 : | $seq = <PROT>; | ||
40 : | chomp $seq; | ||
41 : | $protseqs{$id} = $seq; | ||
42 : | } | ||
43 : | |||
44 : | # for $id (keys(%protseqs)) { print "$id $protseqs{$id}\n"; } | ||
45 : | # for $id (keys(%protseqs)) { print "$id $protseqs{$id}\n" if ($id =~ /1048/); } | ||
46 : | ## for $id (keys(%protseqs)) | ||
47 : | ## { | ||
48 : | ## $protseqs{$id} =~ /(\-+)/; | ||
49 : | ## $subseq=$1; | ||
50 : | ## # print "$id NUM ", length($subseq), $subseq, "\n" if ($id =~ /155920.*1503/); | ||
51 : | ## print "$id NUM=", length($subseq), "\n"; | ||
52 : | ## } | ||
53 : | |||
54 : | for $id (keys(%protseqs)) | ||
55 : | { | ||
56 : | $protseq = $protseqs{$id}; | ||
57 : | @protseq = split('',$protseq); | ||
58 : | $newprotseq = ''; | ||
59 : | for ($i=0; $i < @protseq; $i++) | ||
60 : | { | ||
61 : | $newprotseq .= "$protseq[$i] "; | ||
62 : | } | ||
63 : | @newprotseq = split('',$newprotseq); | ||
64 : | $protseqs{$id} = $newprotseq; | ||
65 : | # print "$id $newprotseq\n" if ($id =~ /1048/); | ||
66 : | $dnaseq = $dnaseqs{$id}; | ||
67 : | $dnaidx = 0; | ||
68 : | $newdnaseq = ''; | ||
69 : | for ($i=0; $i < @protseq; $i++) | ||
70 : | { | ||
71 : | $c = $protseq[$i]; | ||
72 : | if ($c =~ /[A-Z]/) | ||
73 : | { | ||
74 : | $newdnaseq .= substr($dnaseq,$dnaidx,3); | ||
75 : | $dnaidx += 3; | ||
76 : | } | ||
77 : | elsif ($c eq '-') | ||
78 : | { | ||
79 : | $newdnaseq .= "---"; | ||
80 : | } | ||
81 : | else | ||
82 : | { | ||
83 : | print STDERR "unrecognized char $c\n"; | ||
84 : | } | ||
85 : | } | ||
86 : | $dnaseqs{$id} = $newdnaseq; | ||
87 : | # print "$id $newdnaseq\n" if ($id =~ /1048/); | ||
88 : | # sanity check | ||
89 : | if (length($newdnaseq) != length($newprotseq)) | ||
90 : | { | ||
91 : | print STDERR "BAD LEN $id ", length($newdnaseq), " ", length($newprotseq), "\n"; | ||
92 : | } | ||
93 : | overbeek | 1.3 | $dnaid = "DNA_" . $id; |
94 : | $proid = "PRO_" . $id; | ||
95 : | overbeek | 1.1 | print ">$dnaid\n$newdnaseq\n"; |
96 : | print ">$proid\n$newprotseq\n"; | ||
97 : | } |
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |