[Bio] / FigMetagenomeTools / renumber_sequences.pl Repository:
ViewVC logotype

Annotation of /FigMetagenomeTools/renumber_sequences.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (view) (download) (as text)

1 : olson 1.1 #!/usr/bin/perl -w
2 :    
3 :     # renumber sequences and remove any exact duplicate sequences
4 :    
5 :     use strict;
6 :     use DBI;
7 :     use Rob;
8 :    
9 : olson 1.2 my @opts;
10 :    
11 :     my $dbh=DBI->connect($FIG_Config::metagenome_dsn, $FIG_Config::metagenome_dbuser, $FIG_Config::metagenome_dbpass,
12 :     { RaiseError => 1 })
13 :     or die "Can't connect to database sequences\n";
14 :    
15 : olson 1.1
16 :     my $usage=<<EOF;
17 :    
18 :     $0
19 :     -f fasta file
20 :     -q quality file
21 : olson 1.2 -g genome-name
22 :     -h host-name
23 :     -p assembled-path
24 : olson 1.1 OPTIONAL
25 :    
26 :     -n number of genome. If this is provided, then the data will not be added to the database, and only the files will be written.
27 :    
28 :     EOF
29 :    
30 : olson 1.2 my ($fastaf, $qualf, $genomeno, $genome_name, $genome_host, $asm_path);
31 : olson 1.1 while (@ARGV) {
32 :     my $t=shift;
33 :     if ($t eq "-f") {$fastaf=shift}
34 :     elsif ($t eq "-q") {$qualf=shift}
35 :     elsif ($t eq "-n") {$genomeno=shift}
36 : olson 1.2 elsif ($t eq "-g") {$genome_name = shift}
37 :     elsif ($t eq "-h") {$genome_host = shift}
38 :     elsif ($t eq "-p") {$asm_path = shift}
39 : olson 1.1 }
40 :    
41 : olson 1.5 #unless ($qualf) {$qualf = $fastaf.".qual"}
42 : olson 1.1
43 : olson 1.5 die $usage unless (-e $fastaf);
44 : olson 1.1 my ($count, $max);
45 : olson 1.3
46 :     $dbh->begin_work();
47 :    
48 : olson 1.1 if ($genomeno)
49 :     {
50 :     $count=$genomeno;
51 :     my $min=1E99;
52 :     $max=0;
53 :     my $exc=$dbh->prepare("select count, genome from DNA where genome = $genomeno") || die $dbh->errstr;
54 :     $exc->execute||die $dbh->errstr;
55 :     while (my @ret=$exc->fetchrow_array)
56 :     {
57 :     ($ret[0] < $min) ? ($min=$ret[0]) : 1;
58 :     ($ret[0] > $max) ? ($max=$ret[0]) : 1;
59 :     }
60 :     unless ($min && $max) {die "$genomeno does not appear to be in the database. Please confirm why we couldn't get min and max?"}
61 :     print STDERR "Reading files starting writing at sequence $min. There were ", $max-$min+1, " sequences\n";
62 :     $max=$min;
63 :     }
64 :     else
65 :     {
66 : olson 1.2 # set the information for the genome table#
67 :     #
68 :     # Assume that if the genome name was passed on the command line that genome_host
69 :     # was also passed (meaning that an undefined value for genome_host means it is not defined,
70 :     # not that we need to ask the user).
71 :     #
72 :    
73 :     my $ask_user;
74 :     if (!defined($genome_name))
75 :     {
76 :     $ask_user++;
77 : olson 1.1 print "This is the information for the Genome table in the sequences database\nPlease enter the name of the genome:\n";
78 : olson 1.2 $genome_name = <STDIN>;
79 :     chomp($genome_name);
80 :     }
81 : olson 1.5
82 :     #
83 :     # Allow duplicate genome names.
84 :     #
85 :     # my $res = $dbh->selectall_arrayref("select count from genome where name = ?", undef, $genome_name);
86 :     # if (@$res)
87 :     # {
88 :     # die "The genome $genome_name already appears in the database as number $count\n";
89 :     # }
90 : olson 1.2
91 : olson 1.5 # $count = $res->[0]->[0];
92 : olson 1.1
93 : olson 1.2 if ($ask_user)
94 :     {
95 : olson 1.1 print "If the sequence is for a phage, and the host is known, please enter the host (or leave blank):\n";
96 : olson 1.2 my $genome_host=<STDIN>;
97 :     chomp($genome_host);
98 : olson 1.1
99 :     print "If the sequence has been assembled, please enter the path to the assembled sequence (or leave blank):\n";
100 : olson 1.2 my $asm_path=<STDIN>;
101 :     chomp($asm_path);
102 :     }
103 :    
104 :     $dbh->do("insert into genome (name, host, assembled_sequence) values (?, ?, ?)", undef,
105 :     $genome_name, $genome_host, $asm_path);
106 :    
107 :     my $res = $dbh->selectcol_arrayref("select count from genome where name = ?", undef, $genome_name);
108 :     $res or die $dbh->errstr;
109 :     $count = $res->[0];
110 :    
111 :     $res = $dbh->selectcol_arrayref("select max(count) from DNA");
112 :     $res or die $dbh->errstr;
113 :     $max = $res->[0];
114 :    
115 :     $max++;
116 :    
117 :     print "The new genome is number $count and the first sequence will be $max\nReading and dereplicating the fasta file\n";
118 : olson 1.1 }
119 : olson 1.3 $dbh->commit();
120 : olson 1.1
121 :     # read and de replicate the fasta file
122 :     my $fasta=Rob->read_fasta($fastaf);
123 :     {
124 :     my $seqs;
125 :     map {$fasta->{$_} =~ s/\s+//g; $seqs->{$fasta->{$_}}=$_} keys %$fasta;
126 :     undef $fasta;
127 :     map {$fasta->{$seqs->{$_}}=$_} keys %$seqs;
128 :     }
129 :    
130 : olson 1.5 my $qual;
131 :     if ($qualf)
132 :     {
133 :     $qual=Rob->read_fasta($qualf, 1);
134 :     map {$qual->{$_} =~ s/\s+/ /g} keys %$qual;
135 :     }
136 : olson 1.1
137 :     print "Inserting data into database and rewriting files\n";
138 :     open (FA, ">$count.fa") || die "Can't open $count.fa for writing\n";
139 : olson 1.5 if ($qualf)
140 :     {
141 :     open (QU, ">$count.qual") || die "Can't open $count.qual for writing\n";
142 :     }
143 : olson 1.1
144 : olson 1.5 $dbh->begin_work();
145 : olson 1.4 my $exc=$dbh->prepare(qq(insert into DNA ( genome, original_name, trace_file, sequence_file, quality_file)
146 :     values (?, ?, ?, ?, ?)));
147 : olson 1.2
148 : olson 1.1 foreach my $s (keys %$fasta)
149 :     {
150 : olson 1.4 my $id;
151 :    
152 : olson 1.5 if ($qualf)
153 :     {
154 :     if (!$qual->{$s})
155 :     {
156 :     print STDERR "No quality for $s. Skipped\n";
157 :     next;
158 :     }
159 :     }
160 : olson 1.4
161 :     if ($genomeno)
162 :     {
163 :     $id = $max++;
164 :     }
165 :     else
166 :     {
167 :     $exc->execute($count, $s, '', "$count.fa", "$count.qual") || die $dbh->errstr;
168 :     $id = $dbh->{mysql_insertid};
169 :     }
170 :    
171 :     print FA ">$id\n", $fasta->{$s}, "\n";
172 : olson 1.5 if ($qualf)
173 :     {
174 :     print QU ">$id\n", $qual->{$s}, "\n";
175 :     }
176 : olson 1.1 }
177 : olson 1.5 close(FA);
178 :     close(QU) if $qualf;
179 :     $dbh->commit();

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3