[Bio] / FigKernelScripts / svr_find_fused_genes.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/svr_find_fused_genes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (view) (download) (as text)

1 : fangfang 1.1 #
2 :     # This is a SAS Component
3 :     #
4 :    
5 :     #
6 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
7 :     # for Interpretations of Genomes. All Rights Reserved.
8 :     #
9 :     # This file is part of the SEED Toolkit.
10 :     #
11 :     # The SEED Toolkit is free software. You can redistribute
12 :     # it and/or modify it under the terms of the SEED Toolkit
13 :     # Public License.
14 :     #
15 :     # You should have received a copy of the SEED Toolkit Public License
16 :     # along with this program; if not write to the University of Chicago
17 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
18 :     # Genomes at veronika@thefig.info or download a copy from
19 :     # http://www.theseed.org/LICENSE.TXT.
20 :     #
21 :    
22 :     use strict;
23 :     use Data::Dumper;
24 :     use Carp;
25 :     use Getopt::Long;
26 :    
27 : fangfang 1.2 =head1 svr_find_fused_genes
28 :    
29 :     svr_find_fused_genes [options] [<seqs.fa] >fusion.tbl
30 :    
31 :     Find genes that are homologous to the query genes and prints a list
32 :     of fusions among them.
33 :    
34 :     Examples:
35 :    
36 :     svr_find_fused_genes -peg 'fig|83333.1.peg.784'
37 :    
38 :     svr_find_fused_genes < query.fasta > fusions.table
39 :    
40 :     svr_find_fused_genes -o hits.fasta -r report -t tree.html < query.fasta > fusion.table
41 :    
42 :     =head2 Introduction
43 :    
44 :     usage: svr_find_fused_genes [options] [<seqs.fa] >fusion.tbl
45 :    
46 :     -a n_processor - number of processors to use (D = 4)
47 :     -b database - database to search against: SEED (D), PSEED, PPSEED, FASTA file name, FIG genome ID
48 :     -i min_ident - minimum fraction identity (D = 0.1)
49 :     -u max_q_uncov - maximum unmatched query (D = 100)
50 :     -r report_file - output file of psiblast report
51 :     -o output - output psiblast search hits in fasta
52 :     -t html_tree - output html tree painted with fusion genes
53 :     -l - run search locally if a local database is specified
54 :     -peg fid - query gene ID
55 :    
56 :     =head2 Command-Line options
57 :    
58 :     =over 4
59 :    
60 :     =item -a n_processor
61 :    
62 : fangfang 1.3 Number of processors to use (D = 2 locally or 4 remotely)
63 : fangfang 1.2
64 :     =item -b database
65 :    
66 :     Database for psiblast to search against. It can be a FASTA file name,
67 :     a FIG genome ID, or a string, SEED, PSEED or PPSEED, to indicate one of the
68 :     preconfigured database of all protein sequences from complete
69 :     genomes. The default is SEED.
70 :    
71 :     =item -i min_ident
72 :    
73 :     Minimum fraction identity used in psi-blast search (D = 0.1).
74 :    
75 :     =item -l
76 :    
77 :     With the -l option, psiblast search is run locally. The database must
78 :     be a local FASTA file.
79 :    
80 :     =item -u max_q_uncov
81 :    
82 :     Maximum unmatched query, c-term or n-term (D = 100).
83 :    
84 :     =item -o output_file
85 :    
86 :     With the -o option, psiblast hits are written to a FASTA file.
87 :    
88 :     =item -r report_file
89 :    
90 :     Output file name for psiblast records produced as a 11-column table containing:
91 :    
92 :     [ subject_id, bit_score, e_value,
93 :     subject_length, status,
94 :     fraction_ident, fraction_positive,
95 :     query_uncov_n_term, query_uncov_c_term,
96 :     subject_uncov_n_term, subject_uncov_c_term ]
97 :    
98 :     =item -t html_tree
99 :    
100 :     With the -t option, a HTML tree is generated with the functional roles
101 :     of sequences colored and the fused genes highlighted.
102 :    
103 :     =back
104 :    
105 :     =head2 Input
106 :    
107 :     The input search profile is a FASTA file read from STDIN or a PEG ID
108 :     specified with the -peg option.
109 :    
110 :     =head2 Output
111 :    
112 :     The set of predicted fused genes is written to STDOUT.
113 :    
114 :     [ fused_gene_ID, subject_uncov_n_term, subject_uncov_c_term ]
115 :    
116 :     =cut
117 :    
118 : fangfang 1.1
119 :     use AlignTree;
120 :     use ATserver;
121 :     use SeedAware;
122 :     use SeedUtils;
123 :    
124 :     use ffxtree;
125 :     use gjoalignment;
126 :     use gjoseqlib;
127 :    
128 :     my $usage = <<"End_of_Usage";
129 :    
130 :     usage: svr_find_fused_genes [options] [<seqs.fa] >fusion.tbl
131 :    
132 :     -a n_processor - number of processors to use (D = 4)
133 :     -b database - database to search against: SEED (D), PSEED, PPSEED, FASTA file name, FIG genome ID
134 : fangfang 1.2 -i min_ident - minimum fraction identity (D = 0.1)
135 :     -u max_q_uncov - maximum unmatched query (D = 100)
136 : fangfang 1.1 -r report_file - output file of psiblast report
137 :     -o output - output psiblast search hits in fasta
138 :     -t html_tree - output html tree painted with fusion genes
139 :     -l - run search locally if a local database is specified
140 :     -peg fid - query gene ID
141 :    
142 :     End_of_Usage
143 :    
144 :     my ($help, $local, $db, $nthread, $uncov, $min_ident, $report_file, $html_tree, $output, $query_peg);
145 :    
146 :     GetOptions("h|help" => \$help,
147 :     "l|local" => \$local,
148 :     "a=i" => \$nthread,
149 :     "b|db=s" => \$db,
150 :     "i=f" => \$min_ident,
151 :     "u=i" => \$uncov,
152 :     "o=s" => \$output,
153 :     "r=s" => \$report_file,
154 :     "t=s" => \$html_tree,
155 :     "peg=s" => \$query_peg);
156 :    
157 :     $help and die $usage;
158 :    
159 :     my $opts;
160 :    
161 :     $opts->{nthread} = $nthread || ($local ? 2 : 4);
162 :     $opts->{db} = gjoseqlib::read_fasta($db) if -s $db;
163 :     $opts->{db} ||= $ENV{SAS_SERVER} || 'SEED';
164 :     $opts->{max_q_uncov} = $uncov || 100;
165 :     $opts->{min_ident} = $min_ident || 0.1;
166 :    
167 :     $opts->{incremental} = 1;
168 :     $opts->{fast} = 1;
169 :     $opts->{max_query_nseq} = 200;
170 :    
171 :     if ($query_peg) {
172 :     my $data = SeedAware::run_gathering_output("echo '$query_peg' | svr_fasta -protein -fasta") or die "Abort: no query sequences.\n";
173 :     $opts->{profile} = gjoseqlib::read_fasta(\$data);
174 :     } else {
175 :     $opts->{profile} = gjoseqlib::read_fasta();
176 :     }
177 :    
178 :     my ($AT, $hits, $report, $ret);
179 :    
180 :     if ($local) {
181 :     ($hits, $report) = AlignTree::psiblast_search($opts);
182 :     } else {
183 :     $AT = ATserver->new();
184 :     $ret = $AT->psiblast_search($opts);
185 :     $hits = $ret->{rv};
186 :     $report = $ret->{report};
187 :     }
188 :    
189 :     if ($report_file) {
190 :     my $report_string = join("\n", map { join "\t", @$_ } sort { $b->[1] <=> $a->[1] } values %$report) . "\n";
191 :     AlignTree::print_string($report_file, $report_string);
192 :     }
193 :    
194 :     gjoseqlib::print_alignment_as_fasta($output, $hits) if $output;
195 :    
196 :     my @incl = grep { $report->{$_}->[4] =~ /included/i } keys %$report or die "No sequences found.\n";
197 :     my @uncov1 = sort { $a <=> $b } map { $report->{$_}->[9] } @incl;
198 :     my @uncov2 = sort { $a <=> $b } map { $report->{$_}->[10] } @incl;
199 :     my $mid1 = $uncov1[int(@uncov1/2)];
200 :     my $mid2 = $uncov2[int(@uncov2/2)];
201 :    
202 :     my @fusions;
203 :     for (@incl) {
204 :     my ($u1, $u2) = @{$report->{$_}}[9, 10];
205 :     my $fu1 = $u1 if $u1 > max($mid1 + 50, 100);
206 :     my $fu2 = $u2 if $u2 > max($mid2 + 50, 100);
207 :     if ($fu1 || $fu2) {
208 :     print join("\t", $_, $u1, $u2) . "\n";
209 :     my $str = join("\t", $_, 'Fusion:');
210 :     $str .= " beg=$fu1" if $fu1;
211 :     $str .= " end=$fu2" if $fu2;
212 :     push @fusions, $str;
213 :     }
214 :     }
215 :    
216 :     if ($html_tree) {
217 :     my $tmpdir = SeedAware::location_of_tmp();
218 :     my $tmpin1 = SeedAware::new_file_name("$tmpdir/psiblast_hits", 'fa');
219 :     my $tmpin2 = SeedAware::new_file_name("$tmpdir/fusion_ends", 'txt');
220 :     my $fusion = join("", map { $_."\n" } @fusions);
221 :     gjoseqlib::print_alignment_as_fasta($tmpin1, $hits);
222 :     AlignTree::print_string($tmpin2, $fusion);
223 :     SeedUtils::run("svr_align_seqs -mafft <$tmpin1 | svr_tree | svr_tree_to_html -c role -nc 20 -anno -p none -f $tmpin2 -d $tmpin2 >$html_tree");
224 :     for ($tmpin1, $tmpin2) { unlink $_ if -e $_ };
225 :     }
226 :    
227 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3