[Bio] / FigKernelScripts / p3-find-couples.pl Repository:
ViewVC logotype

Diff of /FigKernelScripts/p3-find-couples.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.4, Fri Oct 19 16:06:14 2018 UTC revision 1.5, Sat Oct 20 01:59:13 2018 UTC
# Line 44  Line 44 
44    
45  If the sequence ID is already present in the input file, the name of the column containing the sequence ID.  If the sequence ID is already present in the input file, the name of the column containing the sequence ID.
46    
47    =item verbose
48    
49    If specified, status messages will be written to STDERR.
50    
51  =back  =back
52    
53  =cut  =cut
# Line 51  Line 55 
55  use strict;  use strict;
56  use P3DataAPI;  use P3DataAPI;
57  use P3Utils;  use P3Utils;
58    use Math::Round;
59    
60  # Get the command-line options.  # Get the command-line options.
61  my $opt = P3Utils::script_opts('catCol', P3Utils::delim_options(), P3Utils::col_options(), P3Utils::ih_options(),  my $opt = P3Utils::script_opts('catCol', P3Utils::delim_options(), P3Utils::col_options(), P3Utils::ih_options(),
62          ['minCount|mincount|min|m=i', 'minimum occurrence count', { default => 5 }],          ['minCount|mincount|min|m=i', 'minimum occurrence count', { default => 5 }],
63          ['maxGap|maxgap|maxG|maxg|g=i', 'maximum feature gap', { default => 2000 }],          ['maxGap|maxgap|maxG|maxg|g=i', 'maximum feature gap', { default => 2000 }],
64          ['location|loc|l=s', 'index (1-based) or name of column containing feature location (if any)'],          ['location|loc|l=s', 'index (1-based) or name of column containing feature location (if any)'],
65          ['sequence|seq|s=s', 'index (1-based) or name of column containing the ID of the contig containing the feature']          ['sequence|seq|s=s', 'index (1-based) or name of column containing the ID of the contig containing the feature'],
66            ['verbose|v', 'display progress messages on STDERR'],
67          );          );
68  # This is keyed on genomeID:sequenceID and will contain the list of features for each sequence, in the form  # This is keyed on genomeID:sequenceID and will contain the list of features for each sequence, in the form
69  # [category, start, end].  # [category, start, end].
# Line 68  Line 73 
73  my $seqCol = $opt->sequence;  my $seqCol = $opt->sequence;
74  my $maxGap = $opt->maxgap;  my $maxGap = $opt->maxgap;
75  my $minCount = $opt->mincount;  my $minCount = $opt->mincount;
76    my $debug = $opt->verbose;
77  # Get the category column.  # Get the category column.
78  my ($catCol) = @ARGV;  my ($catCol) = @ARGV;
79  if (! defined $catCol) {  if (! defined $catCol) {
# Line 95  Line 101 
101      push @selects, 'start', 'end';      push @selects, 'start', 'end';
102      $queryNeeded = 1;      $queryNeeded = 1;
103  }  }
104    if ($queryNeeded && $debug) {
105        print STDERR "PATRIC queries will be needed.\n";
106    }
107  # Find the category column.  # Find the category column.
108  $catCol = P3Utils::find_column($catCol, $outHeaders);  $catCol = P3Utils::find_column($catCol, $outHeaders);
109  # Form the full header set and write it out.  # Form the full header set and write it out.
110  if (! $opt->nohead) {  if (! $opt->nohead) {
111      my @headers = ("$outHeaders->[$catCol]1", "$outHeaders->[$catCol]2", 'count');      my @headers = ("$outHeaders->[$catCol]1", "$outHeaders->[$catCol]2", 'count', 'percent');
112      P3Utils::print_cols(\@headers);      P3Utils::print_cols(\@headers);
113  }  }
114    # These are used for status messages.
115    my $count = 0;
116    my $period = ($queryNeeded ? 1000 : 100000);
117    # This counts the category occurrences.
118    my %catCounts;
119  # Loop through the input.  # Loop through the input.
120  while (! eof $ih) {  while (! eof $ih) {
121      my $couplets = P3Utils::get_couplets($ih, $keyCol, $opt);      my $couplets = P3Utils::get_couplets($ih, $keyCol, $opt);
# Line 115  Line 129 
129      for my $couplet (@$couplets) {      for my $couplet (@$couplets) {
130          my ($fid, $line) = @$couplet;          my ($fid, $line) = @$couplet;
131          my $category = $line->[$catCol];          my $category = $line->[$catCol];
132            $count++;
133          if ($category) {          if ($category) {
134              my ($start, $end, $sequence);              my ($start, $end, $sequence);
135              my $fidData = $rows{$fid};              my $fidData = $rows{$fid};
# Line 145  Line 160 
160              push @{$contigs{"$genomeID:$sequence"}}, [$category, $start, $end];              push @{$contigs{"$genomeID:$sequence"}}, [$category, $start, $end];
161          }          }
162      }      }
163        print STDERR "$count features processed.\n" if $debug && $count % $period == 0;
164  }  }
165  # Now we have category and position data for each feature sorted by sequence.  # Now we have category and position data for each feature sorted by sequence.
166  # For each list, we sort by start position and figure out what qualifies as a couple.  # For each list, we sort by start position and figure out what qualifies as a couple.
167  # The couples are counted in this hash, which is keyed by "element1\telement2".  # The couples are counted in this hash, which is keyed by "element1\telement2".
168  my %couples;  my %couples;
169  for my $contig (keys %contigs) {  print STDERR scalar(keys %contigs) . " contigs ready to examine.\n" if $debug;
170    for my $contig (sort keys %contigs) {
171      my @features = sort { $a->[1] <=> $b->[1] } @{$contigs{$contig}};      my @features = sort { $a->[1] <=> $b->[1] } @{$contigs{$contig}};
172        print STDERR "Processing $contig with " . scalar(@features) . " features.\n" if $debug;
173      # We process one feature at a time, and stop when there are none left      # We process one feature at a time, and stop when there are none left
174      # to couple with it.      # to couple with it.
175      my $feat = shift @features;      my $feat = shift @features;
176      while (scalar @features) {      while (scalar @features) {
177          my $cat1 = $feat->[0];          my $cat1 = $feat->[0];
178            $catCounts{$cat1}++;
179          # Compute the latest start position that qualifies as a couple.          # Compute the latest start position that qualifies as a couple.
180          my $limit = $feat->[2] + $maxGap;          my $limit = $feat->[2] + $maxGap;
181          # Loop through the remaining features until we hit the limit.          # Loop through the remaining features until we hit the limit.
# Line 171  Line 190 
190      }      }
191  }  }
192  # Sort the couples and output them.  # Sort the couples and output them.
193    print STDERR "Writing couples.\n" if $debug;
194  my @couples = sort { $couples{$b} <=> $couples{$a} } grep { $couples{$_} >= $minCount } keys %couples;  my @couples = sort { $couples{$b} <=> $couples{$a} } grep { $couples{$_} >= $minCount } keys %couples;
195  for my $couple (@couples) {  for my $couple (@couples) {
196      P3Utils::print_cols([$couple, $couples{$couple}]);      my ($cat1, $cat2) = split /\t/, $couple;
197        my $count = $couples{$couple};
198        my $total = ($catCounts{$cat1} + $catCounts{$cat2}) / 2;
199        P3Utils::print_cols([$couple, $count, Math::Round::nearest(0.01, $count * 100 / $total)]);
200  }  }

Legend:
Removed from v.1.4  
changed lines
  Added in v.1.5

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3