[Bio] / FigKernelScripts / svr_find_hypos_for_cluster.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/svr_find_hypos_for_cluster.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : overbeek 1.1 use strict;
2 :     use SeedEnv;
3 :     use Data::Dumper;
4 :     use Carp;
5 :    
6 :     #
7 :     # This is a SAS Component
8 :     #
9 :    
10 :    
11 :     =head1 svr_find_hypos_for_cluster
12 :    
13 :     Get candidates for a specific role by finding genes with no real
14 :     assignment of function yet that are connected to a cluster. We will
15 :     consider a hypothetical "connected to a cluster" iff
16 :    
17 :     1. it has a strong functional coupling score to a member of the cluster or
18 :     2. it occurs between the bounding members of the cluster
19 :    
20 :     ------
21 :    
22 :     Example:
23 :    
24 :     svr_gap_filled_reactions_and_roles -g 273035.4 | svr_find_clusters_relevant_to_role -g 273035.4 -r 2 -n 100 | svr_find_hypos_for_cluster
25 :    
26 :     would produce a 5-column table [Genome,reaction,role,cluster-of-genes-connected-to-role,peg-for-hypothetical]
27 :    
28 :     ------
29 :    
30 :     The standard input should be a tab-separated table (i.e., each line
31 :     is a tab-separated set of fields). Normally, the last field in each
32 :     line would contain a cluster represented as a comma-separated list of genes.
33 :     If some other column contains the clusters, use
34 :    
35 :     -c N
36 :    
37 :     where N is the column (from 1) that contains the role in each case.
38 :    
39 :     This is a pipe command. The input is taken from the standard input, and the
40 :     output is to the standard output.
41 :    
42 :    
43 :     =head2 Command-Line Options
44 :    
45 :     =over 4
46 :    
47 :     =item -c N
48 :    
49 :     Specifies which column in the input table contains the clusters. Defaults
50 :     to the last column in the input file.
51 :    
52 :     =back
53 :    
54 :     =head2 Output Format
55 :    
56 :     The standard output is a tab-delimited file. Each line will contain
57 :     the input fields followed by a score and a PEG with a hypothetical function that
58 :     is connected to the cluster. The score is either the functional-coupling score or
59 :     0 (for a hypo that is embedded, but not functionally coupled).
60 :    
61 :     =cut
62 :    
63 :     use SAPserver;
64 :     my $sapO = SAPserver->new();
65 :     use Getopt::Long;
66 :    
67 :     my $usage = "usage: svr_find_hypos_for_cluster [-c N]";
68 :    
69 :     my $column;
70 :     my $rc = GetOptions('c=i' => \$column);
71 :    
72 :     if (! $rc) { print STDERR $usage; exit }
73 :    
74 :     my @lines = map { chomp; [split(/\t/,$_)] } <STDIN>;
75 :     my @clusters = map { $_->[$column-1] } @lines;
76 :     my @pegs = map { split(/,/,$_) } @clusters;
77 :     my %pegs_in_clust = map { ($_ => 1) } @pegs;
78 :     @pegs = keys(%pegs_in_clust);
79 :     my $fcH = $sapO->conserved_in_neighborhood( -ids => \@pegs );
80 :     my $locH = $sapO->fid_locations( -ids => \@pegs);
81 :     my %cluster_to_loc = map { ($_ => &loc_of_region_containing($_,$locH)) } @clusters;
82 :     my %locs = map { ($cluster_to_loc{$_} => 1) } keys(%cluster_to_loc);
83 :     my @regions = keys(%locs);
84 :     my $loc2G = $sapO->genes_in_region( -locations => \@regions );
85 :     my %other_genes = map { (! $pegs_in_clust{$_}) ? ($_ => 1) : () } map { @{$loc2G->{$_}} } keys(%$loc2G);
86 :     my @otherG = keys(%other_genes);
87 :     my $funcH = $sapO->ids_to_functions( -ids => \@otherG);
88 :     my @hypo = grep { &SeedUtils::hypo($funcH->{$_}) } keys(%$funcH);
89 :     my %hypoH = map { $_ => 1 } @hypo;
90 :     foreach my $line (@lines)
91 :     {
92 :     if (my $cluster1 = $line->[$column-1])
93 :     {
94 :     my %best_sc;
95 :     foreach my $peg (split(/,/,$cluster1))
96 :     {
97 :     my $conn = $fcH->{$peg};
98 :     if ($conn)
99 :     {
100 :     foreach my $tuple (@$conn)
101 :     {
102 :     my($sc,$peg2,$func2) = @$tuple;
103 :     if ((! $pegs_in_clust{$peg2}) && (&SeedUtils::hypo($func2)))
104 :     {
105 :     if ((! ($_ = $best_sc{$peg2})) || ($_ < $sc))
106 :     {
107 :     $best_sc{$peg2} = $sc;
108 :     }
109 :     }
110 :     }
111 :     }
112 :     }
113 :     my $loc1 = $cluster_to_loc{$cluster1};
114 :     my %other_genes1 = map { (! $pegs_in_clust{$_}) ? ($_ => 1) : () } @{$loc2G->{$loc1}};
115 :     my @poss = grep { $hypoH{$_} } keys(%other_genes1);
116 :     foreach my $peg (@poss)
117 :     {
118 :     if (! defined($best_sc{$peg})) { $best_sc{$peg} = 0 }
119 :     }
120 :     foreach my $peg (sort { $best_sc{$b} <=> $best_sc{$a} } keys(%best_sc))
121 :     {
122 :     print join("\t",(@$line,$best_sc{$peg},$peg)),"\n";
123 :     }
124 :     }
125 :     }
126 :    
127 :     sub loc_of_region_containing {
128 :     my($cluster,$locH) = @_;
129 :    
130 :     my @pegs = split(/,/,$cluster);
131 :     my @pegs_with_locs = sort { ($a->[1] cmp $b->[1]) or ($a->[2] <=> $b->[2]) }
132 :     map { my $peg = $_; my $loc = $locH->{$peg}->[0];
133 :     ($loc =~ /^(\d+\.\d+):(\S+)_(\d+)([-+])(\d+)/) ? [$peg,$2,$3,$4,$5] : () } @pegs;
134 :    
135 :     my($peg1,$contig1,$start1,$strand1,$length1) = @{$pegs_with_locs[0]};
136 :     my($peg2,$contig2,$start2,$strand2,$length2) = @{$pegs_with_locs[1]};
137 :    
138 :     if ($contig1 ne $contig2) { die "contig1=$contig1 contig2=$contig2" }
139 :     my $min = &SeedUtils::min( $start1, ($strand1 eq "+") ? ($start1+($length1-1)) : ($start1-($length1-1)),
140 :     $start2, ($strand2 eq "+") ? ($start2+($length2-1)) : ($start2-($length2-1)));
141 :     my $max = &SeedUtils::max( $start1, ($strand1 eq "+") ? ($start1+($length1-1)) : ($start1-($length1-1)),
142 :     $start2, ($strand2 eq "+") ? ($start2+($length2-1)) : ($start2-($length2-1)));
143 :     my $genome = &SeedUtils::genome_of($peg1);
144 :     my $loc = "$genome:$contig1" . "_" . $min . "_" . $max;
145 :     return $loc;
146 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3