[Bio] / FigMetagenomeTools / trim_repeats.pl Repository:
ViewVC logotype

Annotation of /FigMetagenomeTools/trim_repeats.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1.1.1 - (view) (download) (as text)

1 : olson 1.1 #!/usr/bin/perl -w
2 :    
3 :     # trim repeats. Save only those that start or stop between given coordinates or those longer than a given length
4 :    
5 :     # Copyright Rob Edwards
6 :     #
7 :     # You may distribute this script under the GPL. See below.
8 :    
9 :     =head1 NAME
10 :    
11 :     trim_repeats.pl - remove repeats not in a specific region or shorter than a defined length
12 :    
13 :     =head1 SYNOPSIS
14 :    
15 :     You can run this script from the command line with the command
16 :    
17 :     perl -w trim_repeats.pl -f <filename> -b <beginning> -e <end> -m <minimum length>
18 :    
19 :     =head1 DESCRIPTION
20 :    
21 :     This script will read in a genbank file and remove any repeats that are outside of
22 :     a given range. I use it to cut down the number of repeats in a file.
23 :    
24 :     =head1 AUTHOR - Rob Edwards
25 :    
26 :     Email redwards@utmem.edu
27 :    
28 :     The author gratefully acknowledges all the help from the members of the BioPerl team.
29 :    
30 :     =cut
31 :    
32 :    
33 :    
34 :    
35 :     use strict;
36 :    
37 :     my ($file, $start, $stop, $min)=('', 0,0,0);
38 :     my ($rangeb, $rangee);
39 :    
40 :     while (@ARGV) {
41 :     my $t=shift @ARGV;
42 :     if ($t eq "-f") {$file=shift @ARGV}
43 :     elsif ($t eq "-b") {$start=shift @ARGV}
44 :     elsif ($t eq "-e") {$stop =shift @ARGV}
45 :     elsif ($t eq "-m") {$min =shift @ARGV}
46 :     elsif ($t eq "-r") {$t = shift; ($rangeb, $rangee)=split /\.\./, $t}
47 :     }
48 :    
49 :     unless ($file) {die "$0\n-f file name\n-b beginning\n-e end\n-r range (must be something like 200.300) to cover\n-m minimum\nIf beginning or end are not supplied length of sequence is used"}
50 :    
51 :     unless ($stop) {
52 :     # quickly see how long the sequence is. Could get it from the first line, but ech.
53 :     open(IN, $file) || die "Can't open $file";
54 :     my ($keep, $seq);
55 :     while (<IN>) {
56 :     # Example first line:
57 :     #LOCUS Phage_1 79274 bp dna linear UNK
58 :     if (/^LOCUS.*?(\d+)\s+bp/) {$stop=$1;last}
59 :     if (/ORIGIN/) {$keep=1; next}
60 :     next unless ($keep);
61 :     chomp; s/\s//g; s/\d//g;
62 :     $seq .= $_;
63 :     }
64 :     close IN;
65 :     if (!$stop && $seq) {$stop=length($seq)}
66 :     unless ($stop) {die "Can't parse the length from $file??"}
67 :     }
68 :    
69 :     if ($start>$stop) {($start, $stop)=($stop, $start)}
70 :    
71 :     print STDERR "Looking for repeats from $start to $stop with minimum length $min\n";
72 :    
73 :     open (IN, $file) || die "Can't open $file";
74 :    
75 :     my $out=$file;
76 :     $out =~ s/.gbk$/.trimmed.gbk/;
77 :     $out =~ s/.gb$/.trimmed.gbk/;
78 :     if ($out eq $file) {$out .= ".trimmed.gbk"}
79 :    
80 :     my $skip;
81 :     open (OUT, ">$out") || die "Can't open $out for writing";
82 :     while (my $l=<IN>) {
83 :     if ($skip) {
84 :     next if ($l =~ /^\s+\//);
85 :     undef $skip;
86 :     }
87 :     unless ($l =~ /^\s+repeat/) {print OUT $l; next}
88 :     if ($l =~ /repeat_region/) {print OUT $l; next}
89 :     # repeat has the format join(411..423,13244..13256)
90 :     $l=~m/(\d+)\.\.(\d+)\,(\d+)\.\.(\d+)/;
91 :     my ($one, $two, $three, $four)=($1, $2, $3, $4);
92 :     unless ($one && $two && $three && $four) {die "Can't parse numbers from $l\n"}
93 :     my $keep;
94 :     foreach my $pos ($one, $two, $three, $four) {
95 :     if ((($two-$one+1) > $min) && $pos > $start && $pos < $stop) {$keep=1}
96 :     }
97 :     if ($rangeb && $rangee) {
98 :     if (($one < $rangeb || $two < $rangeb) && ($three > $rangee || $four > $rangee)) {$keep=1}
99 :     else {undef $keep}
100 :     }
101 :     if ($keep) {print OUT $l; next}
102 :     else {$skip=1}
103 :     }
104 :    
105 :     # Copyright 2002-2004 Rob Edwards
106 :     # For updates, more information, or to discuss the scripts
107 :     # please contact Rob Edwards at redwards@utmem.edu or via http://www.salmonella.org/
108 :     #
109 :     # This file is part of Rob's Scripts developed by Rob Edwards.
110 :     #
111 :     # These scripts are free software; you can redistribute and/or modify
112 :     # them under the terms of the GNU General Public License as published by
113 :     # the Free Software Foundation; either version 2 of the License, or
114 :     # (at your option) any later version.
115 :     #
116 :     # They are distributed in the hope that they will be useful,
117 :     # but WITHOUT ANY WARRANTY; without even the implied warranty of
118 :     # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
119 :     # GNU General Public License for more details.
120 :     #
121 :     # You should have received a copy of the GNU General Public License
122 :     # in the file (COPYING) along with these scripts; if not, write to the Free Software
123 :     # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
124 :     #
125 :     #

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3