[Bio] / FigMetagenomeTools / m82genome.pl Repository:
ViewVC logotype

Annotation of /FigMetagenomeTools/m82genome.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : olson 1.1 #!/usr/bin/perl -w
2 :    
3 :     # convert BLASTs from M8 format to a summary of what and counts
4 :    
5 :     use strict;
6 :     use DBI;
7 :     use Bio::SeqIO;
8 :    
9 :     my $dbh=DBI->connect("DBI:mysql:ncbi", "rob", "forestry") or die "Can't connect to database\n";
10 :    
11 :     my $usage=<<EOF;
12 :    
13 :     $0 <options>
14 :    
15 :     -d directory of blast results in m8 (tab) format
16 :     -c cutoff (E value) below which to ignore results
17 :     -z count zeroes only
18 :     -s directory of sequences. If this is included the lengths of each sequence will be used
19 :    
20 :     EOF
21 :    
22 :    
23 :     my ($dir, $seqdir,$cutoff,$countzero)=('', '', 1000000,0);
24 :     while (@ARGV) {
25 :     my $test=shift(@ARGV);
26 :     if ($test eq "-d") {$dir=shift @ARGV}
27 :     elsif ($test eq "-s") {$seqdir=shift @ARGV}
28 :     elsif ($test eq "-c") {$cutoff = shift @ARGV}
29 :     elsif ($test eq "-z") {$countzero=1}
30 :     }
31 :    
32 :     die $usage unless $dir;
33 :    
34 :     &count_zero($dir) if $countzero;
35 :    
36 :     my $lengths;
37 :     if ($seqdir) {$lengths=seq_dir($seqdir)}
38 :    
39 :     opendir(DIR, $dir) || die "Can't open $dir";
40 :     while (my $file=readdir(DIR)) {
41 :     next if ($file =~ /^\./);
42 :     my $len='xx';
43 :     if ($lengths->{$file}) {$len=$lengths->{$file}}
44 :     if (-z "$dir/$file") # file has zero size
45 :     {
46 :     print "$file\t$len\t0\n";
47 :     next;
48 :     }
49 :     my %orghits; # hits by organism
50 :    
51 :     open (IN, "$dir/$file") || die "Can't open $dir/$file";
52 :     my %hits;
53 :     my %evalue;
54 :     my $sawhit;
55 :     my %totalhitlen;
56 :     while (<IN>) {
57 :     my @line=split /\t/;
58 :     next unless ($line[10] < $cutoff);
59 :     $sawhit++;
60 :     my $id;
61 :     if ($line[1] =~ /gi\|(\d+)\|/) {$id=$1}
62 :     else {print STDERR "Don't know how to handle $line[1]\n"; next}
63 :     $hits{$id}++;
64 :     $evalue{$id}=$line[10];
65 :     $totalhitlen{$id}=$line[3];
66 :     }
67 :     close IN;
68 :    
69 :     unless ($sawhit) {
70 :     # no significant hit
71 :     print "$file\t$len\t0\n";
72 :     print STDERR "Not a zero size but no sig hits less than $cutoff for $dir/$file\n";
73 :     next;
74 :     }
75 :    
76 :     foreach my $gi (keys %hits) {
77 :     my @row=$dbh->selectrow_array("select tax_id from gi_taxid_nucl where gi = $gi");
78 :     my $tax=$row[0];
79 :     unless ($tax) {
80 :     print STDERR "No tax gi for $gi\n";
81 :     print "$file\t$len\t$hits{$gi}\t$gi\t$evalue{$gi}\t\t$totalhitlen{$gi}\n";
82 :     next;
83 :     }
84 :     my $exc=$dbh->prepare("select * from names where tax_id = $tax");
85 :     $exc->execute || die $dbh->errstr;
86 :     my $name=''; my $badname='';
87 :     while (my @res=$exc->fetchrow_array) {
88 :     if ($res[4] eq "scientific name") {$name=$res[2]}
89 :     else {$badname=$res[2]}
90 :     }
91 :     if (!$name && $badname) {$name=$badname}
92 :     unless ($name) {
93 :     print STDERR "Couldn't parse tax id $tax for gi $gi\n";
94 :     print "$file\t$len\t$hits{$gi}\t$gi\t$evalue{$gi}\t\t$totalhitlen{$gi}\n";
95 :     }
96 :     push @{$orghits{$name}}, $gi;
97 :     unless (exists $evalue{$name}) {$evalue{$name}=$evalue{$gi}}
98 :     if ($evalue{$name} < $evalue{$gi}) {$evalue{$name}=$evalue{$gi}}
99 :     if ($gi == 45774915) {
100 :     print STDERR "For 45774915 ($name) Had $totalhitlen{$name} Now added $totalhitlen{$gi}\n";
101 :     }
102 :     $totalhitlen{$name}+=$totalhitlen{$gi};
103 :     if ($gi == 45774915) {
104 :     print STDERR "For 45774915 ($name) Had something, now have $totalhitlen{$name}\n";
105 :     }
106 :     }
107 :     foreach my $name (keys %orghits) {
108 :     print "$file\t$len\t", scalar(@{$orghits{$name}}), "\t", (join ", ", (@{$orghits{$name}})), "\t", $evalue{$name}, "\t$name\t$totalhitlen{$name}\n";
109 :     }
110 :     }
111 :    
112 :    
113 :    
114 :    
115 :     sub count_zero {
116 :     my $dir=shift;
117 :     my $count=0;
118 :     opendir(DIR, $dir) || die "Can't open %dir";
119 :     while (my $file=readdir(DIR)) {
120 :     next if ($file =~ /^\./);
121 :     $count++ if (-z "$dir/$file");
122 :     }
123 :     print "$count files had zero size\n";
124 :     exit(0);
125 :     }
126 :    
127 :     sub seq_dir {
128 :     my $seqdir=shift;
129 :     opendir(DIR, $seqdir) || die "Can't open $seqdir";
130 :     my $length;
131 :     while (my $file = readdir(DIR)) {
132 :     next if ($file =~ /^\./);
133 :     my $sin=Bio::SeqIO->new(-file=>"$seqdir/$file", -format=>"fasta");
134 :     while (my $seq=$sin->next_seq) {
135 :     $length->{$file}=$seq->length();
136 :     }
137 :     }
138 :     print STDERR "Read ", scalar(keys %$length), " lengths\n";
139 :     return $length;
140 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3