[Bio] / FigMetagenomeTools / renumber_sequences.pl Repository:
ViewVC logotype

Annotation of /FigMetagenomeTools/renumber_sequences.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (view) (download) (as text)

1 : olson 1.1 #!/usr/bin/perl -w
2 :    
3 :     # renumber sequences and remove any exact duplicate sequences
4 :    
5 :     use strict;
6 :     use DBI;
7 :     use Rob;
8 :    
9 : olson 1.2 my @opts;
10 :    
11 :     my $dbh=DBI->connect($FIG_Config::metagenome_dsn, $FIG_Config::metagenome_dbuser, $FIG_Config::metagenome_dbpass,
12 :     { RaiseError => 1 })
13 :     or die "Can't connect to database sequences\n";
14 :    
15 : olson 1.1
16 :     my $usage=<<EOF;
17 :    
18 :     $0
19 :     -f fasta file
20 :     -q quality file
21 : olson 1.2 -g genome-name
22 :     -h host-name
23 :     -p assembled-path
24 : olson 1.1 OPTIONAL
25 :    
26 :     -n number of genome. If this is provided, then the data will not be added to the database, and only the files will be written.
27 :    
28 :     EOF
29 :    
30 : olson 1.2 my ($fastaf, $qualf, $genomeno, $genome_name, $genome_host, $asm_path);
31 : olson 1.1 while (@ARGV) {
32 :     my $t=shift;
33 :     if ($t eq "-f") {$fastaf=shift}
34 :     elsif ($t eq "-q") {$qualf=shift}
35 :     elsif ($t eq "-n") {$genomeno=shift}
36 : olson 1.2 elsif ($t eq "-g") {$genome_name = shift}
37 :     elsif ($t eq "-h") {$genome_host = shift}
38 :     elsif ($t eq "-p") {$asm_path = shift}
39 : olson 1.1 }
40 :    
41 :     unless ($qualf) {$qualf = $fastaf.".qual"}
42 :    
43 :     die $usage unless (-e $fastaf && -e $qualf);
44 :     my ($count, $max);
45 : olson 1.3
46 :     $dbh->begin_work();
47 :    
48 : olson 1.1 if ($genomeno)
49 :     {
50 :     $count=$genomeno;
51 :     my $min=1E99;
52 :     $max=0;
53 :     my $exc=$dbh->prepare("select count, genome from DNA where genome = $genomeno") || die $dbh->errstr;
54 :     $exc->execute||die $dbh->errstr;
55 :     while (my @ret=$exc->fetchrow_array)
56 :     {
57 :     ($ret[0] < $min) ? ($min=$ret[0]) : 1;
58 :     ($ret[0] > $max) ? ($max=$ret[0]) : 1;
59 :     }
60 :     unless ($min && $max) {die "$genomeno does not appear to be in the database. Please confirm why we couldn't get min and max?"}
61 :     print STDERR "Reading files starting writing at sequence $min. There were ", $max-$min+1, " sequences\n";
62 :     $max=$min;
63 :     }
64 :     else
65 :     {
66 : olson 1.2 # set the information for the genome table#
67 :     #
68 :     # Assume that if the genome name was passed on the command line that genome_host
69 :     # was also passed (meaning that an undefined value for genome_host means it is not defined,
70 :     # not that we need to ask the user).
71 :     #
72 :    
73 :     my $ask_user;
74 :     if (!defined($genome_name))
75 :     {
76 :     $ask_user++;
77 : olson 1.1 print "This is the information for the Genome table in the sequences database\nPlease enter the name of the genome:\n";
78 : olson 1.2 $genome_name = <STDIN>;
79 :     chomp($genome_name);
80 :     }
81 :     my $res = $dbh->selectall_arrayref("select count from genome where name = ?", undef, $genome_name);
82 :     if (@$res)
83 :     {
84 :     die "The genome $genome_name already appears in the database as number $count\n";
85 :     }
86 :    
87 :     $count = $res->[0]->[0];
88 : olson 1.1
89 : olson 1.2 if ($ask_user)
90 :     {
91 : olson 1.1 print "If the sequence is for a phage, and the host is known, please enter the host (or leave blank):\n";
92 : olson 1.2 my $genome_host=<STDIN>;
93 :     chomp($genome_host);
94 : olson 1.1
95 :     print "If the sequence has been assembled, please enter the path to the assembled sequence (or leave blank):\n";
96 : olson 1.2 my $asm_path=<STDIN>;
97 :     chomp($asm_path);
98 :     }
99 :    
100 :     $dbh->do("insert into genome (name, host, assembled_sequence) values (?, ?, ?)", undef,
101 :     $genome_name, $genome_host, $asm_path);
102 :    
103 :     my $res = $dbh->selectcol_arrayref("select count from genome where name = ?", undef, $genome_name);
104 :     $res or die $dbh->errstr;
105 :     $count = $res->[0];
106 :    
107 :     $res = $dbh->selectcol_arrayref("select max(count) from DNA");
108 :     $res or die $dbh->errstr;
109 :     $max = $res->[0];
110 :    
111 :     $max++;
112 :    
113 :     print "The new genome is number $count and the first sequence will be $max\nReading and dereplicating the fasta file\n";
114 : olson 1.1 }
115 : olson 1.3 $dbh->commit();
116 : olson 1.1
117 :     # read and de replicate the fasta file
118 :     my $fasta=Rob->read_fasta($fastaf);
119 :     {
120 :     my $seqs;
121 :     map {$fasta->{$_} =~ s/\s+//g; $seqs->{$fasta->{$_}}=$_} keys %$fasta;
122 :     undef $fasta;
123 :     map {$fasta->{$seqs->{$_}}=$_} keys %$seqs;
124 :     }
125 :    
126 :     my $qual=Rob->read_fasta($qualf, 1);
127 :     map {$qual->{$_} =~ s/\s+/ /g} keys %$qual;
128 :    
129 :     print "Inserting data into database and rewriting files\n";
130 :     open (FA, ">$count.fa") || die "Can't open $count.fa for writing\n";
131 :     open (QU, ">$count.qual") || die "Can't open $count.qual for writing\n";
132 :    
133 : olson 1.4 my $exc=$dbh->prepare(qq(insert into DNA ( genome, original_name, trace_file, sequence_file, quality_file)
134 :     values (?, ?, ?, ?, ?)));
135 : olson 1.2
136 : olson 1.1 foreach my $s (keys %$fasta)
137 :     {
138 : olson 1.4 my $id;
139 :    
140 :     unless ($qual->{$s}) {print STDERR "No quality for $s. Skipped\n"; next}
141 :    
142 :     if ($genomeno)
143 :     {
144 :     $id = $max++;
145 :     }
146 :     else
147 :     {
148 :     $exc->execute($count, $s, '', "$count.fa", "$count.qual") || die $dbh->errstr;
149 :     $id = $dbh->{mysql_insertid};
150 :     }
151 :    
152 :     print FA ">$id\n", $fasta->{$s}, "\n";
153 :     print QU ">$id\n", $qual->{$s}, "\n";
154 : olson 1.1 }
155 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3