[Bio] / FigKernelScripts / correct_gff_missing_contigs.pl Repository:
ViewVC logotype

View of /FigKernelScripts/correct_gff_missing_contigs.pl

Parent Directory Parent Directory | Revision Log Revision Log

Revision 1.5 - (download) (as text) (annotate)
Thu Dec 6 20:44:09 2007 UTC (12 years, 4 months ago) by redwards
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.4: +2 -2 lines
correcting a weird error


# there is a problem where the contigs are not being copied correctly from the GFF3 files. This script will take a fasta contigs file, and a genome directory, and do a very simple error check, and then copy the data across
# this should be run as a complement to seed2gff during the install in the unlikely case that you have some sequences that don't have features on them. This is almost exclusively a problem for the 454 metagenomes, so I haven't consituted a wide-eyed fix that could involve altering the gff3 spec.

use raelib;
use strict;

my $usage="$0 [-orgdir organism-dir] <fasta file> <genome ID>";

my $orgdir;

my $rae = new raelib;

while (@ARGV)
    my $arg = $ARGV[0];
    if ($arg =~ /^-(.*)/)
	my $flag = $1;
	if ($flag eq 'orgdir')
	    $orgdir = shift;
	    die $usage;

my ($faf, $gid)=@ARGV;
die $usage unless ($faf && $gid);

# first open the fasta file and get a list of all ids
my $fasta=$rae->read_fasta($faf);

# figure out the contigs for the genome id
# we're not going to use the database in case it's a new genome and we haven't loaded the data yet

my $cf;
if ($orgdir)
    $cf = "$orgdir/contigs";
my $cfa=$rae->read_fasta($cf);
my @contigs=keys %$cfa;

my $ncontigs=scalar(@contigs);
my $nfasta=scalar(keys %$fasta);

# make sure that the contigs for the genome id are a subset of the contigs in the fasta file

my @notfoundcontigs;
map {push @notfoundcontigs, $_ if ($_ && !$fasta->{$_})} @contigs;
if (@notfoundcontigs)
    print STDERR "There was a problem, because the following ", scalar(@notfoundcontigs), " contigs out of $ncontigs total were not found in the fasta file that you have provided. We can not overwrite these data, and you'll have to do it manually: ", join (", ", @contigs), "; and : ", join (", ", @notfoundcontigs), "\n";

# now check that we have less contigs in the current contigs than in the fasta file. If there is more there is an error since they should have been found above. If there is the same number there is no point copying the data across.

if ($ncontigs > $nfasta)
    die "There are more contigs in the contigs file than in the fasta file, but we didn't find them missing. Huh?";
elsif ($ncontigs == $nfasta)
    warn "There are the same number of contigs in the genome contigs file and in the fasta file, so there is no point copying\n";
    exit 0;

# if we get here we want to copy the data

open(OUT, ">$cf") || die "Can't write to $cf";
map {print OUT ">$_\n", $fasta->{$_}, "\n"} sort keys %$fasta;
close OUT;

print STDERR "There were $ncontigs contigs in the genome, and $nfasta sequences in the fasta file, so we rewrote them\n";
print STDERR "Wrote the sequences to the contigs file. You will probably have to reload everything for it all to work, eh?\n";

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3