[Bio] / FigKernelScripts / filter_and_merge_gene_sets.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/filter_and_merge_gene_sets.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : overbeek 1.2 ########################################################################
2 : olson 1.1 use strict;
3 :     use ExpressionDir;
4 :     use Data::Dumper;
5 : overbeek 1.2 use SeedEnv;
6 : olson 1.1
7 :     #
8 :     # Single-script replacement for this pipeline:
9 :     #
10 :     # &run("cat $exp_ed/coregulated.clusters $exp_ed/coregulated.subsys | cut -f1 | $FIG_Config::bin/merge_gene_sets | filter_on_known $exp_ed/raw_data.tab > $exp_ed/merged.clusters");
11 :    
12 :     @ARGV > 1 or die "Usage: filter_and_merge_gene_sets expr-dir set-file [set-file ...]\n";
13 :    
14 :     my $expr_dir = shift;
15 :    
16 :     my @files = @ARGV;
17 :    
18 :     my $exprO = ExpressionDir->new($expr_dir);
19 :    
20 : overbeek 1.2 my (%genes_to_sets);
21 :     my @sets;
22 : olson 1.1
23 : overbeek 1.2 my $nxt = 0;
24 : olson 1.1 for my $file (@files)
25 :     {
26 :     open(my $fh, "<", $file) or die "Cannot open $file: $!";
27 :     while (<$fh>)
28 :     {
29 :     my($peg_list) = /^(\S+)/;
30 :     my @genes = split ",", $peg_list;
31 : overbeek 1.2 $sets[$nxt] = \@genes;
32 : olson 1.1 foreach my $gene (@genes)
33 :     {
34 : overbeek 1.2 push(@{$genes_to_sets{$gene}},$nxt);
35 : olson 1.1 }
36 : overbeek 1.2 $nxt++;
37 : olson 1.1 }
38 :     close($fh);
39 :     }
40 :    
41 : overbeek 1.2 foreach my $gene (sort { &SeedUtils::by_fig_id($a,$b) } keys %genes_to_sets)
42 : olson 1.1 {
43 : overbeek 1.2 my $in = $genes_to_sets{$gene};
44 :     if ((! defined($in)) || (@$in ==0))
45 :     {
46 :     print STDERR &Dumper($gene,$genes_to_sets{$gene},$in); die "HERE";
47 :     }
48 :    
49 :     if (@$in > 1)
50 :     {
51 :     my @to_merge = sort { $a <=> $b } @$in;
52 :     my $i;
53 :     for ($i=1; ($i < @to_merge); $i++)
54 :     {
55 :     my $from = $to_merge[$i];
56 :     my $to = $to_merge[0];
57 :     if (&can_merge($sets[$from],$sets[$to]))
58 :     {
59 :     &report($gene,\@sets,$from,$to);
60 :     foreach my $gene1 (@{$sets[$from]})
61 :     {
62 :     my $x = $genes_to_sets{$gene1};
63 :     my $y = [$to,grep { ($_ != $from) && ($_ != $to) } @$x];
64 :     $genes_to_sets{$gene1} = $y;
65 :     }
66 :     my %new_set = map { $_ => 1 } (@{$sets[$from]},@{$sets[$to]});
67 :     my $set1 = [sort { &SeedUtils::by_fig_id($a,$b) } keys(%new_set)];
68 :     $sets[$to] = $set1;
69 :     $sets[$from] = undef;
70 :     }
71 :     }
72 :     }
73 : olson 1.1 }
74 :    
75 :     #
76 :     # start of filter_on_known
77 :     #
78 :    
79 :     my $raw_dataF = $exprO->expr_dir . "/rma_normalized.tab";
80 :     open(R, "<", $raw_dataF) or die "Cannot open $raw_dataF: $!";
81 :     my %known_pegs;
82 :     while (<R>)
83 :     {
84 :     if ($_ =~ /^(fig\|\d+\.\d+\.[^\.]+\.\d+)/)
85 :     {
86 :     $known_pegs{$1} = 1;
87 :     }
88 :     }
89 : overbeek 1.2 close(R);
90 : olson 1.1
91 : overbeek 1.2 my @sets = sort { @$b <=> @$a } grep { defined($_) } @sets;
92 :     foreach my $set (@sets)
93 : olson 1.1 {
94 : overbeek 1.2 my @pegs = sort { &SeedUtils::by_fig_id($a,$b) } grep { $known_pegs{$_} } @$set;
95 : olson 1.1 if (@pegs > 1)
96 :     {
97 :     print join(",",@pegs),"\n";
98 :     }
99 :     }
100 :    
101 : overbeek 1.2 sub report {
102 :     my($gene,$sets,$from,$to) = @_;
103 :    
104 :     print STDERR "merging $from and $to on $gene\n";
105 :     print STDERR "merging ",
106 :     join(",",sort { &SeedUtils::by_fig_id($a,$b) } @{$sets->[$from]})," and ",
107 :     join(",",sort { &SeedUtils::by_fig_id($a,$b) } @{$sets->[$to]}),"\n";
108 :     }
109 :    
110 :     sub can_merge {
111 :     my($set1,$set2) = @_;
112 : olson 1.1
113 : overbeek 1.2 my %h1 = map { $_ => 1 } @$set1;
114 :     my $n = grep { $h1{$_} } @$set2;
115 :     return ($n > 2) && ((@$set1 + @$set2) < 50);
116 : olson 1.1 }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3