[Bio] / FigMetagenomeTools / renumber_sequences.pl Repository:
ViewVC logotype

Annotation of /FigMetagenomeTools/renumber_sequences.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : olson 1.1 #!/usr/bin/perl -w
2 :    
3 :     # renumber sequences and remove any exact duplicate sequences
4 :    
5 :     use strict;
6 :     use DBI;
7 :     use lib "$ENV{HOME}/MIG/Modules";
8 :     use Rob;
9 :    
10 :     my $dbh=DBI->connect("DBI:mysql:sequences", "rob", "forestry") or die "Can't connect to database sequences\n";
11 :    
12 :     my $usage=<<EOF;
13 :    
14 :     $0
15 :     -f fasta file
16 :     -q quality file
17 :    
18 :     OPTIONAL
19 :    
20 :     -n number of genome. If this is provided, then the data will not be added to the database, and only the files will be written.
21 :    
22 :     EOF
23 :    
24 :     my ($fastaf, $qualf, $genomeno);
25 :     while (@ARGV) {
26 :     my $t=shift;
27 :     if ($t eq "-f") {$fastaf=shift}
28 :     elsif ($t eq "-q") {$qualf=shift}
29 :     elsif ($t eq "-n") {$genomeno=shift}
30 :     }
31 :    
32 :     unless ($qualf) {$qualf = $fastaf.".qual"}
33 :    
34 :     die $usage unless (-e $fastaf && -e $qualf);
35 :     my ($count, $max);
36 :     if ($genomeno)
37 :     {
38 :     $count=$genomeno;
39 :     my $min=1E99;
40 :     $max=0;
41 :     my $exc=$dbh->prepare("select count, genome from DNA where genome = $genomeno") || die $dbh->errstr;
42 :     $exc->execute||die $dbh->errstr;
43 :     while (my @ret=$exc->fetchrow_array)
44 :     {
45 :     ($ret[0] < $min) ? ($min=$ret[0]) : 1;
46 :     ($ret[0] > $max) ? ($max=$ret[0]) : 1;
47 :     }
48 :     unless ($min && $max) {die "$genomeno does not appear to be in the database. Please confirm why we couldn't get min and max?"}
49 :     print STDERR "Reading files starting writing at sequence $min. There were ", $max-$min+1, " sequences\n";
50 :     $max=$min;
51 :     }
52 :     else
53 :     {
54 :     # set the information for the genome table
55 :     print "This is the information for the Genome table in the sequences database\nPlease enter the name of the genome:\n";
56 :     my $name=<STDIN>;
57 :     chomp($name);
58 :     my $exc=$dbh->prepare("select count from genome where name like '$name'");
59 :     $exc->execute || die $dbh->errstr;
60 :     ($count)=$exc->fetchrow_array;
61 :     die "The genome $name already appears in the database as number $count\n" if ($count);
62 :    
63 :     print "If the sequence is for a phage, and the host is known, please enter the host (or leave blank):\n";
64 :     my $host=<STDIN>;
65 :     chomp($host);
66 :    
67 :     print "If the sequence has been assembled, please enter the path to the assembled sequence (or leave blank):\n";
68 :     my $path=<STDIN>;
69 :     chomp($path);
70 :    
71 :     $exc=$dbh->prepare("insert into genome (count, name, host, assembled_sequence) values ('', '$name', '$host', '$path')");
72 :     $exc->execute || die $dbh->errstr;
73 :     $exc=$dbh->prepare("select count from genome where name like '$name'");
74 :     $exc->execute || die $dbh->errstr;
75 :     ($count)=$exc->fetchrow_array;
76 :    
77 :     $exc=$dbh->prepare("select count from DNA");
78 :     $exc->execute || die $dbh->errstr;
79 :     $max=0;
80 :     while (my @ret=$exc->fetchrow_array) {$max=$ret[0] if ($ret[0] > $max)}
81 :     $max++;
82 :    
83 :     print "The new genome is number $count and the first sequence will be $max\nReading and dereplicating the fasta file\n";
84 :     }
85 :    
86 :     # read and de replicate the fasta file
87 :     my $fasta=Rob->read_fasta($fastaf);
88 :     {
89 :     my $seqs;
90 :     map {$fasta->{$_} =~ s/\s+//g; $seqs->{$fasta->{$_}}=$_} keys %$fasta;
91 :     undef $fasta;
92 :     map {$fasta->{$seqs->{$_}}=$_} keys %$seqs;
93 :     }
94 :    
95 :     my $qual=Rob->read_fasta($qualf, 1);
96 :     map {$qual->{$_} =~ s/\s+/ /g} keys %$qual;
97 :    
98 :     print "Inserting data into database and rewriting files\n";
99 :     open (FA, ">$count.fa") || die "Can't open $count.fa for writing\n";
100 :     open (QU, ">$count.qual") || die "Can't open $count.qual for writing\n";
101 :    
102 :     foreach my $s (keys %$fasta)
103 :     {
104 :     unless ($qual->{$s}) {print STDERR "No quality for $s. Skipped\n"; next}
105 :     print FA ">$max\n", $fasta->{$s}, "\n";
106 :     print QU ">$max\n", $qual->{$s}, "\n";
107 :     unless ($genomeno)
108 :     {
109 :     my $exc=$dbh->prepare("insert into DNA (count, genome, original_name, trace_file, sequence_file, quality_file) values
110 :     ($max, '$count', '$s', '', '$count.fa', '$count.qual')");
111 :     $exc->execute || die $dbh->errstr;
112 :     }
113 :     $max++;
114 :     }
115 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3