[Bio] / FigKernelScripts / fastasize.c Repository:
ViewVC logotype

View of /FigKernelScripts/fastasize.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Mon Dec 5 18:59:41 2005 UTC (13 years, 11 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, caBIG-05Apr06-00, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, caBIG-13Feb06-00, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.2: +17 -16 lines
Doh, put proper comments in for C code.

/*
 * Copyright (c) 2003-2006 University of Chicago and Fellowship
 * for Interpretations of Genomes. All Rights Reserved.
 *
 * This file is part of the SEED Toolkit.
 * 
 * The SEED Toolkit is free software. You can redistribute
 * it and/or modify it under the terms of the SEED Toolkit
 * Public License. 
 *
 * You should have received a copy of the SEED Toolkit Public License
 * along with this program; if not write to the University of Chicago
 * at info@ci.uchicago.edu or the Fellowship for Interpretation of
 * Genomes at veronika@thefig.info or download a copy from
 * http://www.theseed.org/LICENSE.TXT.
 */


/*  fastasize.c
 *
 *  Usage:  fastasize -t < fasta_file  > nseq nresidues
 *  or      fastasize    < fasta_file  > id nresidues ...
 */

/*  These include files are appropriate for Macintosh OS X  */

#include <stdio.h>
#include <ctype.h>   /*  isspace() */
#include <stdlib.h>  /*  exit()    */
#include <unistd.h>  /*  read()    */

#define  BUFLEN    (256*1024)
#define  INPLEN    ( 64*1024)
#define  IDLEN     ( 16*1024)
#define  DFLT_INDEX_INTERVAL  10000

#define  fillbuf(buf, len)  read( (int) 0, (void *) buf, (size_t) len )

/*  Function prototypes:  */

void report_seq( char * id, unsigned long seqlen );

void report_ttl( int n_seq, unsigned long long ttllen );

void usage( char *prog );


unsigned char  buffer[BUFLEN];
char           idbuf[IDLEN+1];


/*  Input state shared by next_byte() and main():  */

static unsigned char *in_ptr;         /* next unread byte in buffer[]        */
static int            in_left   = 0;  /* unread bytes remaining in buffer[]  */
static int            in_status = 0;  /* result of the read() that ended the
                                         input: 0 at end of file, negative
                                         on a read error                     */


/*  next_byte
 *
 *  Return the next byte of standard input as a value in 0..255, refilling
 *  buffer[] through fillbuf() whenever it has been used up.
 *
 *  Returns EOF once a read yields no data.  The result of that read is
 *  saved in in_status, and a failed read is reported on stderr.  main()
 *  stops reading as soon as EOF is returned, so no further read() is
 *  attempted after end of input.
 */
static int next_byte( void ) {
    if ( in_left <= 0 ) {
        in_left = fillbuf( buffer, BUFLEN );
        if ( in_left <= 0 ) {
            in_status = in_left;
            if ( in_left < 0 ) perror( "read" );  /* report before errno is lost */
            in_left = 0;
            return EOF;
        }
        in_ptr = buffer;
    }
    in_left--;
    return *in_ptr++;
}


/*  main
 *
 *  Read FASTA text from standard input one line at a time.  A line that
 *  starts with '>' begins a new sequence: its id is the run of
 *  non-whitespace characters after the '>' (at most IDLEN of them; a
 *  warning is written to stderr when an id is cut short).  On every other
 *  line each non-whitespace character counts as one residue.
 *
 *  Without arguments, "id <tab> length" is printed for every sequence
 *  that has a non-empty id.  With -t, only "nseq <tab> total" is printed
 *  at end of input.  Any other first argument prints the usage message.
 *
 *  A header line that is ended by end of input rather than by a newline
 *  is handled exactly like one ended by a newline: the sequence is
 *  counted and reported with whatever length follows (zero).
 *
 *  The exit status is the result of the final read(): 0 at end of file,
 *  or the negative read() result after a read error.
 */
int main ( int argc, char **argv ) {
    unsigned long long  ttllen    = 0;  /* residues in all finished sequences */
    unsigned long       seqlen    = 0;  /* residues in the current sequence   */
    int                 totalonly = 0;  /* set by -t                          */
    int                 n_seq     = 0;  /* number of header lines seen        */
    int                 idlen;
    int                 c;

    /* -t flag returns only the total */

    if ( argc > 1 ) {
        if (    ( argv[1][0] == '-'  )
             && ( argv[1][1] == 't'  )
             && ( argv[1][2] == '\0' ) ) {
            totalonly = 1;
        }
        else {
            usage( argv[0] );
        }
    }

    idbuf[0] = '\0';  /* no sequence id seen yet */

    /*  Each pass through this loop consumes one line of input.  On exit
     *  from either branch c is the '\n' that ended the line, or EOF.
     */

    do {
        c = next_byte();

        if ( c == '>' ) {

            /*  New sequence.  Report the previous one, if there was one  */

            if ( ! totalonly ) report_seq( idbuf, seqlen );

            /*  Adjust cumulative values and reset sequence values  */

            ttllen += seqlen;
            seqlen  = 0;

            /*  Copy the new id: everything up to the first whitespace  */

            idlen = 0;
            c = next_byte();
            while ( ( c != EOF ) && ( ! isspace( c ) ) && ( idlen < IDLEN ) ) {
                idbuf[ idlen++ ] = (char) c;
                c = next_byte();
            }
            idbuf[ idlen ] = '\0';

            /*  The copy loop stopped on a non-space character only if the
             *  id did not fit in IDLEN characters  */

            if ( ( c != EOF ) && ( ! isspace( c ) ) ) {
                fprintf( stderr, "Sequence id truncated to %d characters:\n", (int) IDLEN );
                fprintf( stderr, ">%s\n", idbuf );
            }

            /*  Flush the rest of the header line  */

            while ( ( c != EOF ) && ( c != '\n' ) ) {
                c = next_byte();
            }

            n_seq++;
        }

        /*  Not an id line, so it's data (this also covers an empty line
         *  and end of input, for which the loop body never runs)  */

        else {
            while ( ( c != EOF ) && ( c != '\n' ) ) {
                if ( ! isspace( c ) ) {
                    seqlen++;
                }
                c = next_byte();
            }
        }
    } while ( c != EOF );

    /*  End of input: account for the sequence still in progress  */

    if ( totalonly ) {
        ttllen += seqlen;
        report_ttl( n_seq, ttllen );
    }
    else {
        report_seq( idbuf, seqlen );
    }

    exit( in_status );
}


/*  report_seq
 *
 *  Write one "id <tab> length <newline>" record to stdout.
 *  Nothing is written when id is NULL or the empty string.
 */
void report_seq( char * id, unsigned long seqlen ) {
    if ( id == NULL || *id == '\0' ) {
        return;
    }
    fprintf( stdout, "%s\t%lu\n", id, seqlen );
}


/*  report_ttl
 *
 *  Write the summary line "n_seq <tab> ttllen <newline>" to stdout.
 */
void report_ttl( int n_seq, unsigned long long ttllen ) {
    fprintf( stdout, "%d\t%llu\n", n_seq, ttllen );
}


/*  usage
 *
 *  Write the two-line usage summary to stderr, with prog substituted as
 *  the program name on each line, then terminate the process.
 *
 *  NOTE(review): the exit status is 0 although main() calls this for an
 *  unrecognized argument -- confirm whether callers rely on that before
 *  changing it to a failure status.
 */
void usage( char *prog ) {
    fprintf( stderr, "Usage:  %s -t < fasta_file  > nseq \\t nresidues\n", prog );
    fprintf( stderr, "or      %s    < fasta_file  > id \\t nresidues\\n ...\n", prog );
    exit(0);
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3