[Bio] / FigKernelScripts / extract_halo_genomes.pl Repository:
ViewVC logotype

View of /FigKernelScripts/extract_halo_genomes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (download) (as text) (annotate)
Sun Feb 24 15:21:29 2013 UTC (6 years, 8 months ago) by overbeek
Branch: MAIN
CVS Tags: rast_rel_2014_0729, rast_rel_2014_0912, HEAD
Changes since 1.6: +24 -12 lines
compute correspondences for a pan genome

# The process of generating a usable halo goes like this:
# 
# 	1. Get a file with genome IDs in the first "column"
# 
# 	2. Run 
# 		extract_halo_genomes -d Halo -f < genome.IDs
# 
# 	   This builds a directory representing the halo.  There is
# 	   a separate subdirectory for each entity-type or relationship type.
# 
# 	3.  Then run 
# 
# 		generate_prolog_facts Halo
# 
# 	    This builds a file for each entity or relationship type.
# 	    The file will be "facts.pl" placed in the appropriate
# 	    subdirectory.
# 
# 	4.  Then run 
# 
# 		make_supporting_predicates Halo > halo_support.pl
# 
#             This generates extraction/access predicates for the prolog
# 	    facts.
# 
# Then, move it all to the "prolog machine".  There, you run
# 
# 	perl compile_into_prolog.pl Halo saved_state
# 	sicstus
# 	| ?- restore(saved_state),compile(halo_support), compile(cs_predicates).
# 
# Then, you should have access to the entire framework.
# 

use strict;
use Data::Dumper;
use SeedEnv;
use Getopt::Long;

my $usage = "usage:  perl extract_halo_genomes -d Dir [-f] < genome.ids";
my($dir);
my $full = 0;

my $rc  = GetOptions('d=s' => \$dir, 
		     'f' => \$full);
if ((! $rc) || (! $dir)) { print STDERR $usage; exit }


# acceptable entities; we pick up implied relationships automatically
my %halo = (
	('Genome' => 1),
	('TxonomicGrouping' => 2),
	('OTU' => 2),
	('SSRow' => 2),
	('Model' => 2),
	('Feature' => 2),
	('Contig' => 3),
#	('Annotation' => 3),
	('Variant' => 3),
	('LocationInstance' => 3),
	('BioMass' => 3),
	('CompoundInstance' => 3),
	('ReactionInstance' => 3),
	('Pairing' => 3),
	('Subsystem' => 3),
	('Role' => 99),
	('SubsystemClass' => 4),
	('Location' => 4),
	('PairSet' => 4),
	('ProteinSequence' => 4),
	('AtomicRegulon' => 4),
	('Family' => 5),
	('Source' => 99),
	('SSCell' => 5),
	('Publication' => 5),
	('LocalizedCompound' => 5),
	('Compound' => 99),
	('Reaction' => 99),
	('Media' => 7),
	('Complex' => 99)
    );

my %er_field_types;
my %keep_rel;

if (! -d $dir) 
{ 
    mkdir($dir,0777) 
}
else
{
    system "rm -r $dir/*";              # else delete previous versions
}

my %paired_relationship;
$/ = "\n//\n";
foreach $_ (`ermodel_to_text < ~parrello/CdmiData/Published/KSaplingDBD.xml`)
{
    $/ = "\n";
    if ($_ =~ /^entity\s+(\S+)\n(.*)\n\/\/\n/s)
    {
	my $entity = $1;
	next if (! $halo{$entity});
	my $field_data = $2;
	my $typed_fields = { map { (! $_->[2]) ? ($_->[0],$_->[1]) : () } 
			     map { [split(/\t/,$_)] } 
			     split(/\n/,$field_data) 
			     };
	if ($entity eq 'Subsystem') 
	{ 
	    delete($typed_fields->{'notes'});
	    delete($typed_fields->{'description'});
	}
	elsif ($entity eq 'Variant')
	{
	    delete($typed_fields->{'comment'});
	    delete($typed_fields->{'role_rule'});
	}
	$er_field_types{$entity} = $typed_fields;
    }
    elsif ($_ =~ /^relationship\s+(\S+)\t(\S+)\t(\S+)\t\S+\t(\S+)\n(.*)\n\/\/\n/s)
    {
	my $rel1 = $1;
	my $e1 = $2;
	my $e2 = $3;
	my $rel2 = $4;
	my $field_data = $5;
	$paired_relationship{$rel1} = $rel2;
	$paired_relationship{$rel2} = $rel1;

	next if ((! $halo{$e1}) || (! $halo{$e2}));
	my $typed_fields = { map { (! $_->[2]) ? ($_->[0],$_->[1]) : () } 
			     map { [split(/\t/,$_)] } 
			     split(/\n/,$field_data) 
			     };
	$er_field_types{$rel1} = $typed_fields;

	my($from_link,$to_link) = ($typed_fields->{from_link},$typed_fields->{to_link});
	# now flip the from and to link types
	$typed_fields->{from_link} = $to_link;
	$typed_fields->{to_link}   = $from_link;

	$er_field_types{$rel2} = $typed_fields;
	$keep_rel{$e1}->{$rel1} = $e2;
	$keep_rel{$e2}->{$rel2} = $e1;
    }
    $/ = "\n//\n";
}

my %keep_these_entities;
my %genomes = map { ($_ =~ /^(\S+)/) ? ($1 => 1) : () } <STDIN>;
foreach my $g (keys(%genomes))
{
    $keep_these_entities{'Genome'}->{$g} = 1;
}

my @stack = ('Genome');               # Initially, only 'Genome' remains to be expanded
my %seen = ( Genome => 1 );

while (my $nxt_entity = shift @stack)
{
    print STDERR "expanding $nxt_entity\n";
    my @from_instances = sort keys(%{$keep_these_entities{$nxt_entity}});  # sorted instances of entity
    &dump_entity($dir,$nxt_entity,\%er_field_types,\%keep_these_entities,$full);
    foreach my $rel (keys(%{$keep_rel{$nxt_entity}}))
    {
	my $e2 = $keep_rel{$nxt_entity}->{$rel};
	if (! $seen{$e2})
	{
	    print STDERR "    $rel: $nxt_entity => $e2\n";
	    &dump_relationship($dir,
			       $nxt_entity,
			       $e2,
			       $rel,
			       $paired_relationship{$rel},
			       \%er_field_types,
			       \@from_instances,
			       \%keep_these_entities,
			       $full);
	}
    }
    $seen{$nxt_entity} = 1;
    print STDERR "expanded $nxt_entity\n";
    @stack = sort { $halo{$a} <=> $halo{$b} } grep { ! $seen{$_} } keys(%keep_these_entities);
}

sub dump_entity {
    my($dir,$entity,$er_field_types,$keep_these_entities,$full) = @_;
    print STDERR "Dumping $entity\n";

    my $typesH = $er_field_types->{$entity};
    my @fields = &order(keys(%$typesH));
    if (! $full)
    {
	@fields = grep { $_ eq "id" } @fields;
    }
    mkdir("$dir/$entity",0777);
    open(TYPE,">$dir/$entity/types") || die "could not open $dir/$entity/types";
    my $typed_fields = join("\t",map { $_ . "," . $typesH->{$_} } @fields);
    print TYPE $typed_fields, "\n";
    close(TYPE);

    open(E,">$dir/$entity/entity") || die "could not open $dir/$entity/entity";
    my $values = &get_entity_values($entity,\@fields,$keep_these_entities->{$entity},$full);
    foreach my $tuple (@$values)
    {
	print E join("\t",(@$tuple)),"\n";
    }
    close(E);
    print STDERR "\tdumped $entity\n";
}

sub get_entity_values {
    my($entity,$fields,$instances,$full) = @_; 

    if (keys(%$instances) == 0) { return [] }
    if (! $full) { return [map { [$_] } keys(%$instances)] }

    my @values = ();
    my @flds = grep { $_ ne 'id' } @$fields;
    if (@flds == 0)
    {
	@values = map { [$_] } keys(%$instances);
    }
    else
    {
	my $cmd = "| get_entity_$entity -f " . "'" . join(",",@flds) . "' > tmp$$";
	print STDERR "\tRunning: $cmd\n";
	if (open(CMD,$cmd))
	{
	    foreach $_ (keys(%$instances)) { print CMD "$_\n" }
	    close(CMD);
	    @values = map { chop; [split(/\t/,$_)] } `cat tmp$$`;
	    unlink("tmp$$");
	}
	else
	{
	    print STDERR "FAILED TO EXECUTE $cmd\n";
	}
    }
    return \@values;
}

sub dump_relationship {
    my($dir,$e1,$e2,$rel1,$rel2,$er_field_types,$from_instances,$keep_these_entities,$full) = @_;

    print STDERR "Dumping relationship: $e1 => $rel1 / $e2 =>$rel2\n";
    my $typesH1 = $er_field_types->{$rel1};
    my $typesH2 = $er_field_types->{$rel2};
    my @fields1 = &order(keys(%$typesH1));
    if (! $full)
    {
	@fields1 = grep { ($_ eq "from_link") || ($_ eq "to_link") } @fields1;
    }
    my @fields2 = @fields1;
    my $values = &get_relationship_values($rel1,$e2,\@fields1,$from_instances,$full);
    foreach my $tuple (@$values) 
    { 
	$keep_these_entities->{$e2}->{$tuple->[1]} = 1;
    }
    &write_rel_dir($dir,$rel1,\@fields1,$typesH1,$values);
    my @flipped_vals = map { my $tmp = $_->[0]; $_->[0] = $_->[1]; $_->[1] = $tmp; $_ } @$values;
    &write_rel_dir($dir,$rel2,\@fields2,$typesH2,\@flipped_vals);
}

sub write_rel_dir {
    my($dir,$rel,$fields,$typesH,$values) = @_;

    mkdir("$dir/$rel",0777);
    open(TYPE,">$dir/$rel/types") || die "could not open $dir/$rel/types";
    my $typed_fields = join("\t",map { $_ . "," . $typesH->{$_} } @$fields);
    print TYPE $typed_fields, "\n";
    close(TYPE);

    open(R,">$dir/$rel/relationship") || die "could not open $dir/$rel/relationship";
    foreach my $tuple (@$values)
    {
	print R join("\t",(@$tuple)),"\n";
    }
    close(R);
    print STDERR "\tdone dumping $rel\n";
}

sub get_relationship_values {
    my($rel,$e2,$fields,$from_instances,$full) = @_; 

    if (@$from_instances == 0) { return [] }
    my @flds = grep { ($_ ne 'from_link') } @$fields;

    my @values = ();
    my $cmd = "| get_relationship_$rel -rel " . "'" . join(",",@flds) . "' > tmp$$";
    print STDERR "\trunning command (",scalar @$from_instances,"): $cmd\n";
    if (open(CMD,$cmd))
    {
	foreach $_ (@$from_instances) { print CMD "$_\n" }
	close(CMD);
	@values = map { chop; [split(/\t/,$_)] } `cat tmp$$`;
	unlink("tmp$$");
    }
    else
    {
	print STDERR "FAILED TO EXECUTE $cmd\n";
    }
    return \@values;
}

sub order {
    my(@fldsI) = @_;

    my @flds = sort @fldsI;
    my $i;
    for ($i=0; ($i < @flds); $i++)
    {
	if ($flds[$i] eq 'id')
	{
	    my $tmp = $flds[$i];
	    splice(@flds,$i,1);
	    splice(@flds,0,0,$tmp);
	}
	elsif ($flds[$i] eq 'from_link')
	{
	    my $tmp = $flds[$i];
	    splice(@flds,$i,1);
	    splice(@flds,0,0,$tmp);
	}
	elsif ($flds[$i] eq 'to_link')
	{
	    my $tmp = $flds[$i];
	    splice(@flds,$i,1);
	    splice(@flds,1,0,$tmp);
	}
    }
    return @flds;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3