[Bio] / FigKernelScripts / svr_scanN.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/svr_scanN.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : overbeek 1.1 ########################################################################
2 :     use strict;
3 :     use Data::Dumper;
4 :     use Carp;
5 :    
6 :    
7 :     =head1 svr_scanN
8 :    
9 :     Scan genomes for a designated pattern
10 :    
11 :     This tool is based on scan_for_matches, which can be run without this wrapper.
12 :     Scan_for_matches has numerous options that have not been worked into
13 :     this simplified version.
14 :    
15 :     ------
16 :    
17 :     Example:
18 :    
19 :     svr_scanN -g 83333.1 'p1=10...15 4...6 ~p1'
20 :    
21 :     would scan the E.coli genome for instances of hirpin loops (in this
22 :     case, a stem of 10 to 15 characters, and a loop of 4 to 6 characters).
23 :     The output sould be a 3-column table. The first column would contain
24 :     the string in a contig that matched the pattern, the second the
25 :     location (genome:contig_begin_end), and the third the PEG id.
26 :    
27 :     ------
28 :    
29 :     Normally, the genome is given on the command line, and the contigs
30 :     of that genome are scanned. If the -g option is not used, genome
31 :     IDs are taken from standard input.
32 :    
33 :     The standard input should be a tab-separated table (i.e., each line is
34 :     a tab-separated set of fields). Normally, the last field in each line
35 :     would contain the genome IDs. If some other column contains the
36 :     genome IDs, use
37 :    
38 :     -c N
39 :    
40 :     where N is the column (from 1) that contains the PEG in each case.
41 :    
42 :     This is a pipe command. The input is taken from the standard input, and the
43 :     output is to the standard output.
44 :    
45 :     =head2 Command-Line Options
46 :    
47 :     =over 4
48 :    
49 :     =item -c Column
50 :    
51 :     This is used only if the column containing genome IDs is not the last.
52 :    
53 :     =item -s [search just the single strand; default is to search both strands]
54 :    
55 :     =back
56 :    
57 :     =head2 Output Format
58 :    
59 :     The standard output is a tab-delimited file. It consists of the input
60 :     file with three extra columns added (the matched string, the location,
61 :     and the matched genome). Note that when the pattern is made up of
62 :     multiple components, you get embedded blanks within the field giving
63 :     the matched string.
64 :    
65 :     =cut
66 :    
67 :     use SeedUtils;
68 :     use SAPserver;
69 :     my $sapO = SAPserver->new();
70 :     use Getopt::Long;
71 :    
72 :     my $usage = "usage: svr_scanN [-c column] [-s] [-g genome] Pattern ";
73 :    
74 :     my $column;
75 :     my $single_strand;
76 :     my $genome;
77 :     my $rc = GetOptions('c=i' => \$column,
78 :     'g=s' => \$genome,
79 :     's' => \$single_strand);
80 :     if (! $rc) { print STDERR $usage; exit }
81 :     (@ARGV > 0) || die "you need to specifiy a pattern";
82 :    
83 :     my @lines;
84 :     if ($genome)
85 :     {
86 :     @lines = ([$genome]);
87 :     }
88 :     else
89 :     {
90 :     @lines = map { chomp; [split(/\t/,$_)] } <STDIN>;
91 :     }
92 :     (@lines > 0) || exit;
93 :    
94 :     if (! $column) { $column = @{$lines[0]} }
95 :     my @genomes = map { $_->[$column-1] } @lines;
96 :     my $matches = &scan_for_matches($sapO,$ARGV[0],\@genomes,$single_strand);
97 :     foreach $_ (@lines)
98 :     {
99 :     my $hits = $matches->{$_->[$column-1]};
100 :     foreach my $hit (@$hits)
101 :     {
102 :     print join("\t",@$_,@$hit),"\n";
103 :     }
104 :     }
105 :    
106 :     sub scan_for_matches {
107 :     my($sapO,$pat,$all_genomes,$single_strand) = @_;
108 :    
109 :     my $hitsF = "tmp.scanP.hits.$$";
110 :     my $patternF = "tmp.scanP.pattern.$$";
111 :     open(TMP,">",$patternF) || die "could not open $patternF";
112 :     print TMP $pat,"\n";
113 :     close(TMP);
114 :     my $complement = $single_strand ? '' : '-c';
115 :    
116 :     open(HITS,"| scan_for_matches $complement $patternF > $hitsF") || die "could not run scan_for_matches";
117 :    
118 :     my @genomes = @$all_genomes;
119 :     my $hitsH;
120 :    
121 :     while (@genomes > 0)
122 :     {
123 :     $_ = (@genomes >= 100) ? 100 : @genomes;
124 :     my @next_set = splice(@genomes,0,$_);
125 :     my $genomeH = $sapO->genome_contigs( -ids => \@next_set );
126 :     my @contigs = map { @{$genomeH->{$_}} } keys(%$genomeH);
127 :     my $seqH = $sapO->contig_sequences( -ids => \@contigs );
128 :     foreach my $genome (@next_set)
129 :     {
130 :     my $contigs = $genomeH->{$genome};
131 :     foreach my $contig (@$contigs)
132 :     {
133 :     my $seq = $seqH->{$contig};
134 :     if ($seq)
135 :     {
136 :     print HITS ">$contig\n$seq\n";
137 :     }
138 :     }
139 :     }
140 :     }
141 :     close(HITS);
142 :     open(HITS,"<",$hitsF) || die "could not open $hitsF";
143 :     while (defined($_ = <HITS>) && ($_ =~ /^>(\d+\.\d+):(\S+)\:\[(\d+),(\d+)\]/))
144 :     {
145 :     my $genome = $1;
146 :     my $contig = $2;
147 :     my $beg = $3;
148 :     my $end = $4;
149 :     my $str = <HITS>; chomp $str;
150 :     push(@{$hitsH->{$genome}},[join("\t",($contig,$beg,$end)),$str]);
151 :     }
152 :     unlink($hitsF,$patternF);
153 :     return $hitsH;
154 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3