[Bio] / FigKernelScripts / create_ff_covering_skeleton.pl Repository:
ViewVC logotype

View of /FigKernelScripts/create_ff_covering_skeleton.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Mon Jan 26 16:37:38 2009 UTC (11 years ago) by arodri7
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2009_0925, rast_rel_2010_0526, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2009_07_09, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011
Changes since 1.1: +4 -2 lines
fixed script that creates a directory structure for a FIGfam for its sequence coverage

use strict;
use FIG;
use FigFams;
use FigFam;
use FIG_Config;


=head
 usage: create_ff_covering_skeleton FIGfam outputPath [FIGfam_directory]

create_ff_covering_skeleton creates the basic directory structure and files for a FIGfam
in order to create the assertions for FIGfams.

The inputs to the script are:

FIGfam - any figfam in the FIGfam release
outputPath - the path in which the direcotry structure will be built on
FIGfam_directory - this is optional. You may give the path to the FIGfam release you want to use.
                   If none is given, it will use the latest FIGfam release.


The output is the directory structure with all the files needed.

=cut

my $fig = new FIG;
my $tmp = "$FIG_Config::temp/$$";
&FIG::run("mkdir $tmp");
my $usage = "usage: create_ff_covering_skeleton FIGfam outputPath [FIGfam_directory]\n";

my ($fam, $path);
if (scalar @ARGV > 1){ 
    ($fam) = shift @ARGV;
    ($path) = shift @ARGV;
}
else{
    die $usage;
}

die $usage if ($fam !~ /FIG\d+/);

my ($figfam_path);
if (scalar @ARGV > 0){
    $figfam_path = shift @ARGV;
}
else{
    $figfam_path = $fig->get_figfams_data();
}

my $figfam = new FigFam($fig, $fam, $figfam_path);
my $ff_path = $figfam->{dir};

# make top directory
my $work_dir = "$path/$fam";
&FIG::run("mkdir $work_dir");

# copy pegs and seqs files from figfam directory to covering directory
&FIG::run("cp $ff_path/PEGs $work_dir/$fam.pegs");
&FIG::run("cp $ff_path/PEGs.fasta $work_dir/$fam.seqs");
&FIG::run("mkdir $work_dir/Top");
$work_dir .= "/Top";

# fill in this directory by copying the seqs and running representative_sequences
&FIG::run("cp $ff_path/PEGs.fasta $work_dir/seqs");

my $reps_qty = `grep -c '>' "$work_dir/seqs"`;
chomp $reps_qty;

print STDERR "SEQ QTY: $reps_qty\n";
&get_reps("$work_dir", "$work_dir/seqs", $tmp) if ($reps_qty > 50);
&FIG::run("rm -rf $tmp");

sub get_reps{
    my ($dir, $fasta, $tmp) = @_;
    
    my $max = 50;
    my $min = 25;
    my $reps_qty = `grep -c '>' "$work_dir/seqs"`;
    chomp $reps_qty;

    my $cutoff = 0.50;
    my $i = 0;
    my $range = $cutoff;
    while (($reps_qty > $max) || ($reps_qty < $min) || ($cutoff >= 1)) {
	&FIG::run("rm -rf $tmp");
	&FIG::run("mkdir $tmp");
	&FIG::run("representative_sequences -s $cutoff -b -d $tmp/Groups < $fasta > $tmp/reps");
	$reps_qty = `grep -c '>' "$tmp/reps"`;
	chomp $reps_qty;
	print STDERR "$fasta, QTY: $reps_qty, $cutoff\n";
	($cutoff, $range) = &get_cutoff($max, $min, $reps_qty, $cutoff, $range);
	$i++;
    }

    &FIG::run("mv  $tmp/Groups $dir/.");
    &FIG::run("mv $tmp/reps $dir/.");
    
    &make_GenomesDir("$dir/GenomesDir", "$dir/reps");
    &make_Children("$dir/Children", "$dir/Groups");
}

sub make_Children{
    my ($dir, $groupsdir) = @_;

    my @groups = glob ("$groupsdir/*");
    &FIG::run("mkdir $dir");

    foreach my $group (@groups){
	my @values = split(/\//, $group);
	my $name = pop(@values);
	
	# get the count for each file
	my $reps_qty = `grep -c '>' "$group"`;
        chomp $reps_qty;

	if ($reps_qty > 50){
	    # create group directory
	    &FIG::run("mkdir $dir/$name");

	    # copy the seqs file
	    &FIG::run("cp $group $dir/$name/seqs");

	    # create the reps
	    print STDERR "SEQ QTY: $reps_qty\n";
	    &get_reps("$dir/$name", "$dir/$name/seqs", "$FIG_Config::temp/$$");
	}
	elsif (($reps_qty > 1) && ($reps_qty < 50)){
	    # create group directory
	    &FIG::run("mkdir $dir/$name");

	    # copy the seqs file
	    &FIG::run("cp $group $dir/$name/seqs");

	    # make the GenomesDir directory
	    &make_GenomesDir("$dir/$name/GenomesDir", "$group");
	}
    }
}

sub make_GenomesDir{
    my ($dir, $repsfile) = @_;

    &FIG::run("mkdir $dir");
    &FIG::run("mkdir $dir/Sims");
    &FIG::run("mkdir $dir/Seqs");

    open (FH, $repsfile);
    while (my $line = <FH>){
	if ($line =~ m/>fig\|(\d+\.\d+)\./){
	    my $genome = $1;
	    
	    # get the pegs for the genome
	    &FIG::run("pegs $genome > $dir/Seqs/$genome.ids");
	    
	    # get the fasta sequences
	    &FIG::run("get_translations < $dir/Seqs/$genome.ids > $dir/Seqs/$genome");

	    # make the blast dbs
	    &FIG::run("formatdb -i $dir/Seqs/$genome -p T");

	    # clear directory
	    &FIG::run("rm $dir/Seqs/$genome.ids");
	}
    }
}

sub get_cutoff {
    my ($max, $min, $qty, $cutoff, $range) = @_;
    
    my $new_range = $range/2;
    if ($qty > $max){
	$cutoff = $cutoff - ($new_range);
    }
    elsif ($qty < $min) {
	$cutoff = $cutoff + ($new_range);
    }
    else {
	$cutoff = $cutoff;
    }

    return ($cutoff,$new_range);
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3