[Bio] / FigKernelScripts / build_figfam_pssm_db.pl Repository:
ViewVC logotype

View of /FigKernelScripts/build_figfam_pssm_db.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Thu May 17 12:50:39 2007 UTC (12 years, 9 months ago) by gdpusch
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.1: +91 -27 lines
Major modifications to &load_family_ids()

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################

$SIG{HUP} = 'IGNORE';

use strict;
use warnings;

use FIG;
use FIG_Config;
use File::Basename;

my $self  = basename($0);
my $usage = "$self  FIGfams_dir Output_DB_Path  [FamID_1 FamID_2 FamID_3 ...]";

my ($figfams_dir, $output_db_path);
(($figfams_dir = shift) && ($output_db_path = shift))
    || die "\n   usage: $usage\n\n";

die "Nonexistent FIGfams dir"  if (!-d $figfams_dir);
my ($output_db, $output_dir) = fileparse($output_db_path);

my @fam_ids;
if (@ARGV) {
    @fam_ids = @ARGV;
}
else {
    (@fam_ids = &load_family_ids($figfams_dir))
	|| die "No family IDs could be read from $figfams_dir";
}

my $tmp_dir = "$FIG_Config::temp/tmp$$";
if (-d $tmp_dir) { system("/bin/rm -fR $tmp_dir"); }
&FIG::verify_dir($tmp_dir);

my $profile_filenames    = "$tmp_dir/$output_db.pn";
open(PROFILE_NAMES, ">$profile_filenames")
    || die "Could not write-open $profile_filenames";

my $master_seq_filenames = "$tmp_dir/$output_db.sn";
open(SEQUENCE_NAMES, ">$master_seq_filenames")
    || die "Could not write-open $master_seq_filenames";

my ($seqfile_name, $seqfile_path, $fam_align, $tmp_align);
foreach my $fam_id (@fam_ids) {
    print STDERR "Processing $fam_id\n" if $ENV{VERBOSE};
    my ($full_name, $fam_path) = &full_name($figfams_dir, $fam_id);
    if (!-e $full_name) {
	warn "Missing $full_name --- rebuilding family\n";
	unlink("$fam_path/built");
	system("echo 'set $fam_id' | figfam $figfams_dir");
	if (!-e $full_name) {
	    warn "Problem rebuilding $fam_id --- skipping\n\n";
	    next;
	}
    }
    
    my $num_pegs = 0;
    if (($_ = `grep -c "^>" $full_name`) && ($_ =~ m/^(\d+)/)) {
	$num_pegs = $1;
	if ($num_pegs < 2) {
	    my $s = ($num_pegs == 1) ? qq(s) : qq();
	    warn "Skipping small family $fam_id ($num_pegs member$s)\n\n";
	    next;
	}
    }
    
    open( TMP, "<$full_name") 
	|| die "Could not read-open $full_name";
    my ($seq_id, $seqP) = &FIG::read_fasta_record(\*TMP);
    close(TMP) || die "Could not close $full_name";
    
    $seqfile_name = "$fam_id.csq";
    $seqfile_path = "$tmp_dir/$seqfile_name";
    print SEQUENCE_NAMES "$seqfile_name\n";
    open( TMP, ">$seqfile_path")
	|| die "Could not write-open $seqfile_path";
    print TMP ">$seq_id\n$$seqP\n";
    close(TMP) || die "Could not close $seqfile_path";
    
    if ((!-e "$full_name.psq")
	|| ((-M "$full_name.psq") > (-M "$full_name"))
	) {
	&FIG::run("formatdb -i $full_name -p T");
    }
    
    $fam_align = "$full_name.aln";
    if ((!-e "$fam_align")
	|| ((-M "$fam_align") > (-M "$full_name"))
	) {
	warn "Rebuilding $full_name.aln\n" if $ENV{VERBOSE};
	&FIG::run("runclustalw $full_name");
    }
    
    $tmp_align = "$tmp_dir/tmp.aln";
    my $num_lines = 0;
    if (($_ = `wc -l $fam_align`) && ($_ =~ m/^(\d+)/)
	&& ($num_lines = $1 - 3) && ($num_lines > 0)) {
	&FIG::run("tail -$num_lines $fam_align > $tmp_align");
    }
    else {
	die "Could not count lines of $fam_align";
    }
    
    my $profile_name = "$fam_id.chk";
    print PROFILE_NAMES "$profile_name\n";
    
    &FIG::run("cd $tmp_dir;  blastpgp -i $seqfile_path -d $full_name -j 1 -B $tmp_align -C $profile_name > /dev/null 2> /dev/null");
    
    print STDERR "\n" if $ENV{FIG_VERBOSE};
}

&FIG::run("cd $tmp_dir;  makemat  -P $output_db -d $FIG_Config::global/nr");

&FIG::run("cd $tmp_dir;  copymat  -P $output_db -r T");

&FIG::run("cd $tmp_dir;  formatdb -i $output_db -o T");

&FIG::run("mv $tmp_dir/$output_db* $output_dir");

system("/bin/rm -fR $tmp_dir");

exit(0);



sub load_family_ids {
    my ($figfams_dir) = @_;
    my @figfam_ids;
    
    opendir( PAGES, "$figfams_dir/FIGFAMS")
	|| die "Could not opendir $figfams_dir/FIGFAMS/";
    (my @pages = grep { m/^\d{3}$/ } readdir(PAGES))
	|| die "No pages read from $figfams_dir/FIGFAMS/";
    closedir(PAGES)
	|| die "Could not closedir $figfams_dir/FIGFAMS/";
#   print STDERR ("Read ", (scalar @pages), " pages from $figfams_dir/FIGFAMS/\n")
#	if $ENV{VERBOSE};

    my $tock = 0;
    foreach my $page (@pages) {
	print STDERR "." if $ENV{VERBOSE};
	if (++$tock >= 100)  { print STDERR "\n"; $tock = 0; }
	
	opendir( FAMILIES, "$figfams_dir/FIGFAMS/$page")
	    || die "Could not opendir $figfams_dir/FIGFAMS/$page/";
	(my @families = grep { m/^FIG\d+$/ } readdir(FAMILIES))
	    || die "No pages read from $figfams_dir/FIGFAMS/$page/";
	closedir(FAMILIES)
	    || die "Could not closedir $figfams_dir/FIGFAMS/$page/";
#	print STDERR ("Read ", (scalar @families), " families from $figfams_dir/FIGFAMS/$page/\n")
#	    if $ENV{VERBOSE};
	
	foreach my $family (@families) {
	    if (-d "$figfams_dir/FIGFAMS/$page/$family/Split") {
		opendir( SPLIT, "$figfams_dir/FIGFAMS/$page/$family/Split")
		    || die "Could not opendir $figfams_dir/FIGFAMS/$page/$family/Split";
		(my @subfams = map { "$family.$_" } grep { m/^\d+$/ } readdir(SPLIT));
		closedir(SPLIT)
		    || die "Could not closedir $figfams_dir/FIGFAMS/$page/$family/Split";
#		print STDERR ("Read ", (scalar @subfams), " subfamilies from $figfams_dir/FIGFAMS/$page/$family/Split/\n")
#		    if $ENV{VERBOSE};
		
		if (@subfams > 0) {
		    push @figfam_ids, @subfams;
		}
		else {
		    push @figfam_ids, $family;
		}
	    }
	    else {
		push @figfam_ids, $family;
	    }
	}
    }
    print STDERR "\n" if  $ENV{VERBOSE};
    
    return @figfam_ids;
}
 
sub full_name {
    my ($figfams_dir, $fam_id) = @_;
    my 	($fam, $page, $full_name, $fam_path);
    
    if ($fam_id =~ m/^(FIG\d+(\d{3}))/) {
	($fam, $page) = ($1, $2);
	
	$fam_path = "$figfams_dir/FIGFAMS/$page/$fam";
	if ($fam_id =~ m/^FIG\d+\.(\d+)/) {
	    $full_name = "$fam_path/Split/$1";
	}
	else {
	    $full_name = "$fam_path/PEGs.fasta";
	}
    }
    else {
	die "Invalid FIGfam ID: $fam_id";
    }
    
    return ($full_name, $fam_path);
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3