[Bio] / FigMetagenomeTools / m82genome.pl Repository:
ViewVC logotype

Annotation of /FigMetagenomeTools/m82genome.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : olson 1.1 #!/usr/bin/perl -w
2 :    
3 :     # convert BLASTs from M8 format to a summary of what and counts
4 :    
5 :     use strict;
6 :     use DBI;
7 :     use lib '/home/rob/perl';
8 :     use Bio::SeqIO;
9 :    
10 :     my $dbh=DBI->connect("DBI:mysql:ncbi", "rob", "forestry") or die "Can't connect to database\n";
11 :    
12 :     my $usage=<<EOF;
13 :    
14 :     $0 <options>
15 :    
16 :     -d directory of blast results in m8 (tab) format
17 :     -c cutoff (E value) below which to ignore results
18 :     -z count zeroes only
19 :     -s directory of sequences. If this is included the lengths of each sequence will be used
20 :    
21 :     EOF
22 :    
23 :    
24 :     my ($dir, $seqdir,$cutoff,$countzero)=('', '', 1000000,0);
25 :     while (@ARGV) {
26 :     my $test=shift(@ARGV);
27 :     if ($test eq "-d") {$dir=shift @ARGV}
28 :     elsif ($test eq "-s") {$seqdir=shift @ARGV}
29 :     elsif ($test eq "-c") {$cutoff = shift @ARGV}
30 :     elsif ($test eq "-z") {$countzero=1}
31 :     }
32 :    
33 :     die $usage unless $dir;
34 :    
35 :     &count_zero($dir) if $countzero;
36 :    
37 :     my $lengths;
38 :     if ($seqdir) {$lengths=seq_dir($seqdir)}
39 :    
40 :     opendir(DIR, $dir) || die "Can't open $dir";
41 :     while (my $file=readdir(DIR)) {
42 :     next if ($file =~ /^\./);
43 :     my $len='xx';
44 :     if ($lengths->{$file}) {$len=$lengths->{$file}}
45 :     if (-z "$dir/$file") # file has zero size
46 :     {
47 :     print "$file\t$len\t0\n";
48 :     next;
49 :     }
50 :     my %orghits; # hits by organism
51 :    
52 :     open (IN, "$dir/$file") || die "Can't open $dir/$file";
53 :     my %hits;
54 :     my %evalue;
55 :     my $sawhit;
56 :     my %totalhitlen;
57 :     while (<IN>) {
58 :     my @line=split /\t/;
59 :     next unless ($line[10] < $cutoff);
60 :     $sawhit++;
61 :     my $id;
62 :     if ($line[1] =~ /gi\|(\d+)\|/) {$id=$1}
63 :     else {print STDERR "Don't know how to handle $line[1]\n"; next}
64 :     $hits{$id}++;
65 :     $evalue{$id}=$line[10];
66 :     $totalhitlen{$id}=$line[3];
67 :     }
68 :     close IN;
69 :    
70 :     unless ($sawhit) {
71 :     # no significant hit
72 :     print "$file\t$len\t0\n";
73 :     print STDERR "Not a zero size but no sig hits less than $cutoff for $dir/$file\n";
74 :     next;
75 :     }
76 :    
77 :     foreach my $gi (keys %hits) {
78 :     my @row=$dbh->selectrow_array("select tax_id from gi_taxid_nucl where gi = $gi");
79 :     my $tax=$row[0];
80 :     unless ($tax) {
81 :     print STDERR "No tax gi for $gi\n";
82 :     print "$file\t$len\t$hits{$gi}\t$gi\t$evalue{$gi}\t\t$totalhitlen{$gi}\n";
83 :     next;
84 :     }
85 :     my $exc=$dbh->prepare("select * from names where tax_id = $tax");
86 :     $exc->execute || die $dbh->errstr;
87 :     my $name=''; my $badname='';
88 :     while (my @res=$exc->fetchrow_array) {
89 :     if ($res[4] eq "scientific name") {$name=$res[2]}
90 :     else {$badname=$res[2]}
91 :     }
92 :     if (!$name && $badname) {$name=$badname}
93 :     unless ($name) {
94 :     print STDERR "Couldn't parse tax id $tax for gi $gi\n";
95 :     print "$file\t$len\t$hits{$gi}\t$gi\t$evalue{$gi}\t\t$totalhitlen{$gi}\n";
96 :     }
97 :     push @{$orghits{$name}}, $gi;
98 :     unless (exists $evalue{$name}) {$evalue{$name}=$evalue{$gi}}
99 :     if ($evalue{$name} < $evalue{$gi}) {$evalue{$name}=$evalue{$gi}}
100 :     if ($gi == 45774915) {
101 :     print STDERR "For 45774915 ($name) Had $totalhitlen{$name} Now added $totalhitlen{$gi}\n";
102 :     }
103 :     $totalhitlen{$name}+=$totalhitlen{$gi};
104 :     if ($gi == 45774915) {
105 :     print STDERR "For 45774915 ($name) Had something, now have $totalhitlen{$name}\n";
106 :     }
107 :     }
108 :     foreach my $name (keys %orghits) {
109 :     print "$file\t$len\t", scalar(@{$orghits{$name}}), "\t", (join ", ", (@{$orghits{$name}})), "\t", $evalue{$name}, "\t$name\t$totalhitlen{$name}\n";
110 :     }
111 :     }
112 :    
113 :    
114 :    
115 :    
116 :     sub count_zero {
117 :     my $dir=shift;
118 :     my $count=0;
119 :     opendir(DIR, $dir) || die "Can't open %dir";
120 :     while (my $file=readdir(DIR)) {
121 :     next if ($file =~ /^\./);
122 :     $count++ if (-z "$dir/$file");
123 :     }
124 :     print "$count files had zero size\n";
125 :     exit(0);
126 :     }
127 :    
128 :     sub seq_dir {
129 :     my $seqdir=shift;
130 :     opendir(DIR, $seqdir) || die "Can't open $seqdir";
131 :     my $length;
132 :     while (my $file = readdir(DIR)) {
133 :     next if ($file =~ /^\./);
134 :     my $sin=Bio::SeqIO->new(-file=>"$seqdir/$file", -format=>"fasta");
135 :     while (my $seq=$sin->next_seq) {
136 :     $length->{$file}=$seq->length();
137 :     }
138 :     }
139 :     print STDERR "Read ", scalar(keys %$length), " lengths\n";
140 :     return $length;
141 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3