[Bio] / FigKernelScripts / add_phage_from_rast.sh Repository:
ViewVC logotype

View of /FigKernelScripts/add_phage_from_rast.sh

Parent Directory | Revision Log


Revision 1.1 - (download) (as text) (annotate)
Fri May 3 17:29:19 2013 UTC (6 years, 6 months ago) by redwards
Branch: MAIN
CVS Tags: rast_rel_2014_0729, rast_rel_2014_0912, HEAD
 a set of scripts for adding phage genomes to the seed

#!/bin/bash

## A shell script to add a phage directly to phantome from rast. Written by Rob, March 18th, 2012
## Use and edit at will!

## Much of this is going to use existing scripts and code, so they may need editing too!!

## Start with a RAST ID

# Both an email address and at least one RAST job id are needed, so anything
# short of two arguments gets the usage text (on stderr) and a non-zero status.
# E_BADARGS is not defined anywhere in this script, which made the old
# "exit $E_BADARGS" exit with status 0; fall back to 65 when it is unset.
if [ $# -lt 2 ]; then
	echo "
Usage: $(basename -- "$0") \"<email address>\" <list of rast job ids>

For example $(basename -- "$0") \"raedwards@gmail.com\" 1234 23145

You will get an email when the genome(s) are added, and when the sims are complete and loading.

The number we need is the RAST ID number, which is an integer. The genome ID (which is something like 66666666.1234) will not work.

" >&2
	exit "${E_BADARGS:-65}"
fi

# The first argument is the address to notify. Rob's address is always appended
# so that he is cc'd on every email this script sends -- please leave it in, he
# likes to know what is happening! Once the address has been captured it is
# shifted away, leaving only the RAST job ids in the positional parameters.
EMAIL="${1},raedwards@gmail.com"
shift


## If we get this far, we're going to continue. Let's set up our perl environment to include fig stuff.
## The variable must be exported: unless the caller had already exported PERL5LIB,
## a plain assignment stays private to this shell and the perl programs started
## further down would never see these library directories.

PERL5LIB=/home/fig/FIGdisk/config:/home/fig/FIGdisk/dist/current/linux-debian-x86_64/lib:/home/fig/FIGdisk/dist/current/linux-debian-x86_64/lib/FigKernelPackages:/home/fig/FIGdisk/dist/releases/current/linux-debian-x86_64/lib:/home/fig/FIGdisk/dist/releases/current/linux-debian-x86_64/lib/FigKernelPackages:/home/fig/FIGdisk/dist/releases/current/linux-debian-x86_64/lib/FortyEight:/home/fig/FIGdisk/dist/releases/current/linux-debian-x86_64/lib/MGRAST:/home/fig/FIGdisk/dist/releases/current/linux-debian-x86_64/lib/ModelSEED:/home/fig/FIGdisk/dist/releases/current/linux-debian-x86_64/lib/PPO:/home/fig/FIGdisk/dist/releases/current/linux-debian-x86_64/lib/RAST:/home/fig/FIGdisk/dist/releases/current/linux-debian-x86_64/lib/SeedViewer:/home/fig/FIGdisk/dist/releases/current/linux-debian-x86_64/lib/WebApplication:/home/fig/FIGdisk/env/2008-0612/linux-debian-x86_64/lib/perl5:/home/fig/FIGdisk/env/2008-0612/linux-debian-x86_64/lib/perl5/5.10.0:/home/fig/FIGdisk/env/2008-0612/linux-debian-x86_64/lib/perl5/5.10.0/x86_64-linux:/home/fig/FIGdisk/env/2008-0612/linux-debian-x86_64/lib/perl5/site_perl/5.10.0:/home/fig/FIGdisk/env/2008-0612/linux-debian-x86_64/lib/perl5/site_perl/5.10.0/:/home/fig/FIGdisk/env/2008-0612/linux-debian-x86_64/lib/perl5/site_perl/5.10.0/x86_64-linux:/home/fig/FIGdisk/config:$PERL5LIB
export PERL5LIB

## Each run works in its own scratch directory. The name is today's date with
## this shell's process id appended, so two runs on the same day do not collide.
## $DATE is reused below as the job id in file names and in the generated scripts.
DATE=$(date "+%Y%m%d").$$

# Stop right here if the scratch directory cannot be created or entered;
# everything that follows writes into, and globs over, the current directory.
if ! mkdir "phage_$DATE"; then
	echo "Could not create phage_$DATE in $PWD" >&2
	exit 1
fi
if ! cd "phage_$DATE"; then
	echo "Could not change into phage_$DATE in $PWD" >&2
	exit 1
fi

echo "Our job id is $DATE and we are processing data in $PWD"
echo "Retrieving the job from rast"

# "$@" is quoted so that every job id reaches the retrieval script as exactly one argument.
/home/redwards/bioinformatics/bin/RAST-retrieve-jobs.pl "$@"  # note, this uses Rob's account which is a superuser account :)

## untar the new archive(s)
# Unpacks every *.tar file in the current directory.
# The shell glob is used directly rather than the output of ls, and the name is
# quoted, so archive names containing whitespace are handled. When nothing
# matches, the glob stays as the literal "*.tar"; the -e test skips that case.
untar_archives() {
	local tarfile
	for tarfile in *.tar; do
		[ -e "$tarfile" ] || continue
		echo "Untarrting $tarfile"
		tar xf "$tarfile"
	done
}
untar_archives

## figure out which genomes we have
# Appends to GENOMELIST every entry of the current directory that contains
# something called GENOME. The directory name is quoted inside the test so that
# names with whitespace do not break it.
collect_genome_dirs() {
	local dir
	for dir in *; do
		if [ -e "$dir/GENOME" ]; then
			GENOMELIST+=("$dir")
		fi
	done
}

# Start from an empty array so that a GENOMELIST inherited from the calling
# environment cannot sneak in as a bogus first genome.
GENOMELIST=()
collect_genome_dirs


# Nothing to add if no genome directory was found. This is a failure, so report
# it on stderr and leave with a non-zero status; a bare "exit" here would hand
# back the status of the echo, i.e. success.
if [ "${#GENOMELIST[@]}" -eq 0 ]; then
	echo "We couldn't find any genome directorys in $PWD" >&2
	exit 1
fi

# Register every genome that was found. The array expansion and $GENOME are
# quoted so each directory name is passed on as a single, unmodified argument.
for GENOME in "${GENOMELIST[@]}"; do
	echo "Adding genome $GENOME"
	# adding to the database
	/home/fig/FIGdisk/FIG/bin/fig add_genome RobE "$GENOME"
	# marking as phage makes it appear in phantome
	/home/fig/FIGdisk/FIG/bin/mark_genome_as_phage "$GENOME"
done


echo "We successfully added ${#GENOMELIST[@]} genomes, but the sims are out of date, We will add them now"


## Identify proteins that are already there, and those for which you need sims.
## This will make two files, a peg.synonyms.new file and a missing_sims.fasta file.
## This will also add the new pegs to peg_synonyms if they are not there

echo "Correcting the sims"
# Quoted so that each genome is handed over as exactly one argument.
/home/fig/FIGdisk/FIG/bin/sims_after_adding_genome "${GENOMELIST[@]}"

## Rewrite the peg.synonyms to include the new data
echo "Making new peg synonyms file"
/home/fig/FIGdisk/FIG/bin/rewrite_peg_syns -n peg.synonyms.new -o /home/fig/FIGdisk/FIG/Data/Global/ -d .

# The swap below moves the live peg.synonyms aside and copies ./peg.synonyms in
# its place. If the rewrite did not leave a non-empty ./peg.synonyms here, doing
# the swap would leave the Global directory without any peg.synonyms at all, so
# stop before touching the live file.
if [ ! -s ./peg.synonyms ]; then
	echo "No new peg.synonyms was produced in $PWD; leaving /home/fig/FIGdisk/FIG/Data/Global/peg.synonyms untouched" >&2
	exit 1
fi
# Keep the previous file under a job-specific name so the change can be rolled back.
mv /home/fig/FIGdisk/FIG/Data/Global/peg.synonyms "/home/fig/FIGdisk/FIG/Data/Global/peg.synonyms.$DATE"
cp ./peg.synonyms /home/fig/FIGdisk/FIG/Data/Global/
## and load the new peg synonyms in the background. This takes a while!
echo "Loading the peg synonyms file"
/home/fig/FIGdisk/FIG/bin/load_peg_mapping &


## Remake the non-redundant database to include the new proteins. In fact, what we have is a bunch of symlinks
## and so we can roll back at any time. We append our new proteins to the end of the existing proteins
## and set a new symlink.

nr_link=/home/fig/FIGdisk/FIG/Data/Global/nr
nr_new=/home/fig/FIGdisk/FIG/Data/nr_v024_extended_by_rob/nr.$DATE

echo "Updating the non-redundant database into $nr_new"
# Build the extended database first; the live link is only replaced once that
# has worked, so a failure here leaves the current nr in place.
if ! cat "$nr_link" missing_sims.fasta > "$nr_new"; then
	echo "Could not build $nr_new; leaving $nr_link untouched" >&2
	rm -f -- "$nr_new"
	exit 1
fi
rm "$nr_link"
ln -s "$nr_new" "$nr_link"
# This used to run between the rm and the ln, when the path did not exist, so it
# could only fail. Run after the link is back, chmod follows the symlink and
# opens up the new nr.$DATE file.
# NOTE(review): the stated purpose was "so the next user can unlink it" -- confirm
# that file permissions (rather than directory permissions) are what is needed.
chmod a+rwx "$nr_link"


echo "Done with most of the adding. Now we need to run blast";


######### Running the BLAST
#
# The way that this works is that we copy the data to anthill, our cluster, and run the blastp searches over there
# then we concatenate the results, copy them back, and put them in the database
#
# Of course, we don't know how long the blast searches will take to run (it may be a while, or a few minutes)
# and so we have to create a script that runs when they are complete and copies the data back. This is also put on anthill.

##
## It also requires that I have your public key in my anthill authorized_keys file ... which of course
## means you can log into anthill as me.
	

##  The script to copy data from anthill to edwards.

# Write the call-back script that is copied to anthill. The heredoc delimiter is
# unquoted on purpose: $DATE is expanded now, so the generated script carries this
# job's id. The generated script stops if it cannot enter its job directory
# (otherwise its "rm -f nr*" would run wherever it happened to start) and does
# not trigger the updater when the results could not be copied back.
echo "Writing the anthill sending code to $PWD/copy_from_anthill.sh"
cat << EOF > copy_from_anthill.sh
#!/bin/bash

## Autogenerated code from add_phage_from_rast.sh
## This code is automatically created and copied to anthill, and is run when the blasts are complete

## these are the commands to get the data back again. 
## We switch to be user fig on the return
## note that all of these commands will be run from anthill
cd /home/redwards/phage/update_$DATE || exit 1
cat blasts/*blastp > missing_sims.fasta.$DATE.blastp;
rm -f nr*
scp missing_sims.fasta.$DATE.blastp fig@edwards.sdsu.edu:/home/fig/FIGdisk/FIG/Tmp || exit 1
ssh fig@edwards.sdsu.edu "cd /home/fig/FIGdisk/FIG/Tmp/; ./update_sims.$DATE.sh"
EOF


### Once the sims have come back they still have to be reformatted, flipped,
### moved into place and indexed. Those final steps are written out here as a
### separate script in FIG/Tmp; copy_from_anthill.sh invokes it over ssh as its
### last command. $DATE and $EMAIL are expanded while the script is being
### written, so the generated file is specific to this job.

update_script=/home/fig/FIGdisk/FIG/Tmp/update_sims.$DATE.sh

echo "Writing the fig updating code to $update_script"
cat > "$update_script" << EOS
#!/bin/bash

## Reformat the sims to add the lengths of the proteins, and flip the sims.
## The flipping means that if we have searched a -> b we get b -> a as well.

/home/fig/FIGdisk/FIG/bin/reformat_sims /home/fig/FIGdisk/FIG/Data/Global/nr < missing_sims.fasta.$DATE.blastp > phage_sims.blastp.$DATE
/home/fig/FIGdisk/FIG/bin/flip_sims phage_sims.blastp.$DATE phage_sims.blastp.flipped.$DATE
	
## put the new sims in place and index the sims. Note that we have to do this from /home/fig/FIGdisk (that is a bug in index_sims)
## but we need to make sure that the database gets the right relative file location (thanks to JP for finding this!)
## so we pushd that location and then popd back out of it
mv phage_sims.blastp*$DATE /home/fig/FIGdisk/FIG/Data/NewSims/
pushd /home/fig/FIGdisk/
/home/fig/FIGdisk/FIG/bin/index_sims FIG/Data/NewSims/phage_sims.blastp*$DATE &
popd

## email the user and let them know we have had success!!
echo -e "We have added the sims for your new phage genomes.\nWe are updating the databases now" |  /usr/bin/mail -s "Sims added and indexing running" "$EMAIL"

EOS

# The script is executed directly (./update_sims.<id>.sh), so it must be executable.
chmod a+x "$update_script"




## Ship the call-back script, the query proteins and the nr database to anthill,
## then format the database there and hand the blastp searches to the cluster.
## copy_from_anthill.sh is passed to the job splitter with -h.
## NOTE(review): presumably -h names the script to run once all blast jobs have
## finished -- confirm against split_blast_queries_hold_jobs.pl.
## Each step is checked: without the files on anthill no sims will ever come
## back, so carrying on to the "Success!" email would be misleading.
echo "Copying the files onto anthill into phage/update_$DATE"
if ! ssh redwards@anthill.sdsu.edu mkdir -p "phage/update_$DATE"; then
	echo "Could not create phage/update_$DATE on anthill" >&2
	exit 1
fi
if ! scp copy_from_anthill.sh missing_sims.fasta /home/fig/FIGdisk/FIG/Data/Global/nr "redwards@anthill.sdsu.edu:phage/update_$DATE"; then
	echo "Could not copy the blast input files to anthill" >&2
	exit 1
fi
echo "Running formatdb and submitting blast this will take 10-15 minutes"
# The remote shell gives up immediately if the job directory is missing, rather
# than running formatdb and the submission from the wrong place.
if ! ssh redwards@anthill.sdsu.edu "cd /home/redwards/phage/update_$DATE || exit 1;
chmod +x copy_from_anthill.sh;
/usr/local/blast-2.2.18/bin/formatdb -i nr -pT; 
/home/redwards/bioinformatics/cluster/split_blast_queries_hold_jobs.pl -f missing_sims.fasta -n 60 -d blasts -db nr -p blastp -m 8 -e 0.01 -h /home/redwards/phage/update_$DATE/copy_from_anthill.sh"; then
	echo "Submitting the blast jobs on anthill failed" >&2
	exit 1
fi


## send an email with the success part of the story
# Prints the body of the success email: a fixed introduction followed by each
# argument (one genome per argument) on its own line, exactly as given.
# The text is piped straight into mail, so no file with a predictable name is
# left behind in /tmp.
success_message() {
	cat << 'EOM'
Success!

Your phages were added to the database. We are waiting for the sims, but they will be done soon, and you should get an email when they are complete.

Check out http://edwards.sdsu.edu/anthillganglia to see how the cluster is doing.

This is a list of the genomes that we added for you:

EOM
	local genome
	for genome in "$@"; do
		printf '%s\n' "$genome"
	done
}

success_message "${GENOMELIST[@]}" | /usr/bin/mail -s "Phage submissions success" "$EMAIL"


## We're done
# Explicit zero status, so the script's exit code does not depend on the status
# of the mail command that ran last.
exit 0




MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3