[Bio] / FigMetagenomeTools / renumber_sequences.pl Repository:
ViewVC logotype

Annotation of /FigMetagenomeTools/renumber_sequences.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : olson 1.1 #!/usr/bin/perl -w
2 :    
3 :     # renumber sequences and remove any exact duplicate sequences
4 :    
5 :     use strict;
6 :     use DBI;
7 :     use Rob;
8 :    
9 : olson 1.2 my @opts;
10 :    
11 :     my $dbh=DBI->connect($FIG_Config::metagenome_dsn, $FIG_Config::metagenome_dbuser, $FIG_Config::metagenome_dbpass,
12 :     { RaiseError => 1 })
13 :     or die "Can't connect to database sequences\n";
14 :    
15 : olson 1.1
16 :     my $usage=<<EOF;
17 :    
18 :     $0
19 :     -f fasta file
20 :     -q quality file
21 : olson 1.2 -g genome-name
22 :     -h host-name
23 :     -p assembled-path
24 : olson 1.1 OPTIONAL
25 :    
26 :     -n number of genome. If this is provided, then the data will not be added to the database, and only the files will be written.
27 :    
28 :     EOF
29 :    
30 : olson 1.2 my ($fastaf, $qualf, $genomeno, $genome_name, $genome_host, $asm_path);
31 : olson 1.1 while (@ARGV) {
32 :     my $t=shift;
33 :     if ($t eq "-f") {$fastaf=shift}
34 :     elsif ($t eq "-q") {$qualf=shift}
35 :     elsif ($t eq "-n") {$genomeno=shift}
36 : olson 1.2 elsif ($t eq "-g") {$genome_name = shift}
37 :     elsif ($t eq "-h") {$genome_host = shift}
38 :     elsif ($t eq "-p") {$asm_path = shift}
39 : olson 1.1 }
40 :    
41 : olson 1.2 $dbh->begin_work();
42 :    
43 : olson 1.1 unless ($qualf) {$qualf = $fastaf.".qual"}
44 :    
45 :     die $usage unless (-e $fastaf && -e $qualf);
46 :     my ($count, $max);
47 :     if ($genomeno)
48 :     {
49 :     $count=$genomeno;
50 :     my $min=1E99;
51 :     $max=0;
52 :     my $exc=$dbh->prepare("select count, genome from DNA where genome = $genomeno") || die $dbh->errstr;
53 :     $exc->execute||die $dbh->errstr;
54 :     while (my @ret=$exc->fetchrow_array)
55 :     {
56 :     ($ret[0] < $min) ? ($min=$ret[0]) : 1;
57 :     ($ret[0] > $max) ? ($max=$ret[0]) : 1;
58 :     }
59 :     unless ($min && $max) {die "$genomeno does not appear to be in the database. Please confirm why we couldn't get min and max?"}
60 :     print STDERR "Reading files starting writing at sequence $min. There were ", $max-$min+1, " sequences\n";
61 :     $max=$min;
62 :     }
63 :     else
64 :     {
65 : olson 1.2 # set the information for the genome table#
66 :     #
67 :     # Assume that if the genome name was passed on the command line that genome_host
68 :     # was also passed (meaning that an undefined value for genome_host means it is not defined,
69 :     # not that we need to ask the user).
70 :     #
71 :    
72 :     my $ask_user;
73 :     if (!defined($genome_name))
74 :     {
75 :     $ask_user++;
76 : olson 1.1 print "This is the information for the Genome table in the sequences database\nPlease enter the name of the genome:\n";
77 : olson 1.2 $genome_name = <STDIN>;
78 :     chomp($genome_name);
79 :     }
80 :     my $res = $dbh->selectall_arrayref("select count from genome where name = ?", undef, $genome_name);
81 :     if (@$res)
82 :     {
83 :     die "The genome $genome_name already appears in the database as number $count\n";
84 :     }
85 :    
86 :     $count = $res->[0]->[0];
87 : olson 1.1
88 : olson 1.2 if ($ask_user)
89 :     {
90 : olson 1.1 print "If the sequence is for a phage, and the host is known, please enter the host (or leave blank):\n";
91 : olson 1.2 my $genome_host=<STDIN>;
92 :     chomp($genome_host);
93 : olson 1.1
94 :     print "If the sequence has been assembled, please enter the path to the assembled sequence (or leave blank):\n";
95 : olson 1.2 my $asm_path=<STDIN>;
96 :     chomp($asm_path);
97 :     }
98 :    
99 :     $dbh->do("insert into genome (name, host, assembled_sequence) values (?, ?, ?)", undef,
100 :     $genome_name, $genome_host, $asm_path);
101 :    
102 :     my $res = $dbh->selectcol_arrayref("select count from genome where name = ?", undef, $genome_name);
103 :     $res or die $dbh->errstr;
104 :     $count = $res->[0];
105 :    
106 :     $res = $dbh->selectcol_arrayref("select max(count) from DNA");
107 :     $res or die $dbh->errstr;
108 :     $max = $res->[0];
109 :    
110 :     $max++;
111 :    
112 :     print "The new genome is number $count and the first sequence will be $max\nReading and dereplicating the fasta file\n";
113 : olson 1.1 }
114 :    
115 :     # read and de replicate the fasta file
116 :     my $fasta=Rob->read_fasta($fastaf);
117 :     {
118 :     my $seqs;
119 :     map {$fasta->{$_} =~ s/\s+//g; $seqs->{$fasta->{$_}}=$_} keys %$fasta;
120 :     undef $fasta;
121 :     map {$fasta->{$seqs->{$_}}=$_} keys %$seqs;
122 :     }
123 :    
124 :     my $qual=Rob->read_fasta($qualf, 1);
125 :     map {$qual->{$_} =~ s/\s+/ /g} keys %$qual;
126 :    
127 :     print "Inserting data into database and rewriting files\n";
128 :     open (FA, ">$count.fa") || die "Can't open $count.fa for writing\n";
129 :     open (QU, ">$count.qual") || die "Can't open $count.qual for writing\n";
130 :    
131 : olson 1.2 my $exc=$dbh->prepare(qq(insert into DNA (count, genome, original_name, trace_file, sequence_file, quality_file)
132 :     values (?, ?, ?, ?, ?, ?)));
133 :    
134 : olson 1.1 foreach my $s (keys %$fasta)
135 :     {
136 :     unless ($qual->{$s}) {print STDERR "No quality for $s. Skipped\n"; next}
137 :     print FA ">$max\n", $fasta->{$s}, "\n";
138 :     print QU ">$max\n", $qual->{$s}, "\n";
139 :     unless ($genomeno)
140 :     {
141 : olson 1.2 $exc->execute($max, $count, $s, '', "$count.fa", "$count.qual") || die $dbh->errstr;
142 : olson 1.1 }
143 :     $max++;
144 :     }
145 :    
146 : olson 1.2 $dbh->commit();

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3