[Bio] / FigKernelScripts / index_contig_files.c Repository:
ViewVC logotype

View of /FigKernelScripts/index_contig_files.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (download) (as text) (annotate)
Tue Jan 3 21:00:55 2006 UTC (14 years, 1 month ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, caBIG-05Apr06-00, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, caBIG-13Feb06-00, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.4: +8 -4 lines
fix to work on 64-bit linux

/*
 * Copyright (c) 2003-2006 University of Chicago and Fellowship
 * for Interpretations of Genomes. All Rights Reserved.
 *
 * This file is part of the SEED Toolkit.
 * 
 * The SEED Toolkit is free software. You can redistribute
 * it and/or modify it under the terms of the SEED Toolkit
 * Public License. 
 *
 * You should have received a copy of the SEED Toolkit Public License
 * along with this program; if not write to the University of Chicago
 * at info@ci.uchicago.edu or the Fellowship for Interpretation of
 * Genomes at veronika@thefig.info or download a copy from
 * http://www.theseed.org/LICENSE.TXT.
 */


/*  index_contig_files.c
 *
 *  Usage:  index_contig_files [ index_interval ] < file_list  > seeks_and_lengths
 *  or      index_contig_files -v   (to return version number on standard output)
 *
 *  contigs_file_list contains one or more lines of form:
 *
 *      OrgID \t FileNumber \t FileName \n
 *
 *  Seek records are of form (nucs are 0 base numbering):
 *
 *      OrgID \t ContigId \t FirstNucOnLine \t DesiredNuc \t FileNumber \t LineSeek \n
 *
 *  index_contig_files always provides the seek to the exact nucleotide, so
 *  FirstNucOnLine == DesiredNuc.  The default indexing interval is 10000,
 *  but the value should be passed in on the command line.
 *
 *  Length records are of the form:
 *
 *      OrgID \t ContigId \t ContigLength \t CheckSum \t MD5 CheckSUm\n
 *
 *  Compile with:
 *
 *      cc -O index_contig_files.c md5.c -o index_contig_files
 *
 *  Version History:
 *
 *      1.01: Added MD5 checksum, requiring md5.o.
 */

#define  VERSION  "1.01"

/*  These include files are appropriate for Machintosh OS X  */

#include <stdio.h>
#include <ctype.h>   /*  isspace() */
#include <stdlib.h>  /*  exit()    */
#include <fcntl.h>   /*  O_RDONLY  */
#include <unistd.h>  /*  read(), close() */


#include <stdint.h>  /* int32_t  */

/* From the MD5 code */

#include "EXTERN.h"
#include "perl.h"
typedef struct {
  U32 signature;   /* safer cast in get_md5_ctx() */
  U32 A, B, C, D;  /* current digest */
  U32 bytes_low;   /* counts bytes in message */
  U32 bytes_high;  /* turn it into a 64-bit counter */
  U8 buffer[128];  /* collect complete 64 byte blocks */
} MD5_CTX;

extern void MD5Update(MD5_CTX* ctx, const U8* buf, STRLEN len);
extern void MD5Init(MD5_CTX *ctx);
extern void MD5Final(U8* digest, MD5_CTX *ctx);
extern char* hex_16(const unsigned char* from, char* to);
extern char* base64_16(const unsigned char* from, char* to);

/*  This seems to be defined somewhere above, I cannot figure
 *  out where.  It might need to be uncommented, or the
 *  include files changed on other systems.
 */
/*  int open( char * name, int mode, int perms );  */

#define  MAXERROR  10
#define  BUFLEN    (256*1024)
#define  INPLEN    ( 64*1024)
#define  IDLEN     ( 16*1024)
#define  DFLT_INDEX_INTERVAL  10000

#define  isnuc(c)  isnuc_array[ c ]

static int isnuc_array[256] = {
/*  0  1  2  3  4  5  6  7  8  9  a  b  e  d  e  f        */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 1 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 2 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 3 */
    0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,  /* 4 */
    0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,  /* 5 */
    0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,  /* 6 */
    0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,  /* 7 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 8 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 9 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* a */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* b */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* c */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* d */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* e */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   /* f */
/*  0  1  2  3  4  5  6  7  8  9  a  b  e  d  e  f        */
};


/*  From the discussion of cksum:
 *
 *                The Open Group Base Specifications Issue 6
 *                       IEEE Std 1003.1, 2004 Edition
 *  Copyright  2001-2004 The IEEE and The Open Group, All Rights reserved.
 *
 *  http://www.opengroup.org/onlinepubs/009695399/utilities/cksum.html
 */

static unsigned long crctab[] = {
0x00000000,
0x04c11db7, 0x09823b6e, 0x0d4326d9, 0x130476dc, 0x17c56b6b,
0x1a864db2, 0x1e475005, 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6,
0x2b4bcb61, 0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd,
0x4c11db70, 0x48d0c6c7, 0x4593e01e, 0x4152fda9, 0x5f15adac,
0x5bd4b01b, 0x569796c2, 0x52568b75, 0x6a1936c8, 0x6ed82b7f,
0x639b0da6, 0x675a1011, 0x791d4014, 0x7ddc5da3, 0x709f7b7a,
0x745e66cd, 0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039,
0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5, 0xbe2b5b58,
0xbaea46ef, 0xb7a96036, 0xb3687d81, 0xad2f2d84, 0xa9ee3033,
0xa4ad16ea, 0xa06c0b5d, 0xd4326d90, 0xd0f37027, 0xddb056fe,
0xd9714b49, 0xc7361b4c, 0xc3f706fb, 0xceb42022, 0xca753d95,
0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1, 0xe13ef6f4,
0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d, 0x34867077, 0x30476dc0,
0x3d044b19, 0x39c556ae, 0x278206ab, 0x23431b1c, 0x2e003dc5,
0x2ac12072, 0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16,
0x018aeb13, 0x054bf6a4, 0x0808d07d, 0x0cc9cdca, 0x7897ab07,
0x7c56b6b0, 0x71159069, 0x75d48dde, 0x6b93dddb, 0x6f52c06c,
0x6211e6b5, 0x66d0fb02, 0x5e9f46bf, 0x5a5e5b08, 0x571d7dd1,
0x53dc6066, 0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba,
0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e, 0xbfa1b04b,
0xbb60adfc, 0xb6238b25, 0xb2e29692, 0x8aad2b2f, 0x8e6c3698,
0x832f1041, 0x87ee0df6, 0x99a95df3, 0x9d684044, 0x902b669d,
0x94ea7b2a, 0xe0b41de7, 0xe4750050, 0xe9362689, 0xedf73b3e,
0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2, 0xc6bcf05f,
0xc27dede8, 0xcf3ecb31, 0xcbffd686, 0xd5b88683, 0xd1799b34,
0xdc3abded, 0xd8fba05a, 0x690ce0ee, 0x6dcdfd59, 0x608edb80,
0x644fc637, 0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb,
0x4f040d56, 0x4bc510e1, 0x46863638, 0x42472b8f, 0x5c007b8a,
0x58c1663d, 0x558240e4, 0x51435d53, 0x251d3b9e, 0x21dc2629,
0x2c9f00f0, 0x285e1d47, 0x36194d42, 0x32d850f5, 0x3f9b762c,
0x3b5a6b9b, 0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff,
0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623, 0xf12f560e,
0xf5ee4bb9, 0xf8ad6d60, 0xfc6c70d7, 0xe22b20d2, 0xe6ea3d65,
0xeba91bbc, 0xef68060b, 0xd727bbb6, 0xd3e6a601, 0xdea580d8,
0xda649d6f, 0xc423cd6a, 0xc0e2d0dd, 0xcda1f604, 0xc960ebb3,
0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7, 0xae3afba2,
0xaafbe615, 0xa7b8c0cc, 0xa379dd7b, 0x9b3660c6, 0x9ff77d71,
0x92b45ba8, 0x9675461f, 0x8832161a, 0x8cf30bad, 0x81b02d74,
0x857130c3, 0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640,
0x4e8ee645, 0x4a4ffbf2, 0x470cdd2b, 0x43cdc09c, 0x7b827d21,
0x7f436096, 0x7200464f, 0x76c15bf8, 0x68860bfd, 0x6c47164a,
0x61043093, 0x65c52d24, 0x119b4be9, 0x155a565e, 0x18197087,
0x1cd86d30, 0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec,
0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088, 0x2497d08d,
0x2056cd3a, 0x2d15ebe3, 0x29d4f654, 0xc5a92679, 0xc1683bce,
0xcc2b1d17, 0xc8ea00a0, 0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb,
0xdbee767c, 0xe3a1cbc1, 0xe760d676, 0xea23f0af, 0xeee2ed18,
0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4, 0x89b8fd09,
0x8d79e0be, 0x803ac667, 0x84fbdbd0, 0x9abc8bd5, 0x9e7d9662,
0x933eb0bb, 0x97ffad0c, 0xafb010b1, 0xab710d06, 0xa6322bdf,
0xa2f33668, 0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4
};


/*  Function prototypes:  */

typedef uint32_t crc_value_t;

void report_len( char * org, char * id, unsigned long seqlen, crc_value_t crc, MD5_CTX * );

int  index_one ( char * org_id, char * file_num, int fd, int index_interval );

void usage(char *prog);


char           inpbuf[INPLEN];
unsigned char  buffer[BUFLEN];
char           idbuf[IDLEN+1];


int main ( int argc, char **argv ) {
    int    infile, index_interval;
    char  *bptr, *org_id, *file_num, *file_name;
    unsigned int  c;

    /* -v flag returns version */

    if ( ( argc == 2 ) && ( argv[1][0] == '-'  )
                       && ( argv[1][1] == 'v'  )
                       && ( argv[1][2] == '\0' )
       ) {
        printf( "%s\n", VERSION );
        return 0;
    }

    if ( ( argc == 2 ) && ( sscanf( argv[1], "%d", &index_interval ) == 1 ) ) {
        if ( index_interval < 100 ) {
            fprintf( stderr, "index_interval (%d) must be >= 100\n", index_interval );
            usage( argv[0] );
        }
    }
    else if ( argc == 1 ) {
        index_interval = DFLT_INDEX_INTERVAL;   /* default index interval */
    }
    else {
        usage( argv[0] );
    }

    org_id = inpbuf;
    while ( fgets( inpbuf, INPLEN,  stdin ) ) {
	bptr = inpbuf;

	/*  Find the end of the organism id */

	while ( ( c = *bptr ) && ( c != '\t' ) ) bptr++;
	if ( ! c ) continue;
	*bptr++ = '\0';   /* convert tab to end-of-string */
	file_num = bptr;  /* next character is start of file number */

	/*  Find the end of the file number */

	while ( ( c = *bptr ) && ( c != '\t' ) ) bptr++;
	if ( ! c ) continue;
	*bptr++ = '\0';    /* convert tab to end-of-string */
	file_name = bptr;  /* next character is start of file name */

	/*  Find the end of the file name (strip the newline) */

	while ( ( c = *bptr ) && ( c != '\n' ) && ( c != '\r' ) ) bptr++;
	*bptr++ = '\0';    /* convert newline to end-of-string */

	/*  Open the file and pass the descriptor pointer to the reader */

	if ( ( infile = open( file_name, O_RDONLY, 0 ) ) < 0 ) {
	    fprintf( stderr, "Failed to open contigs file: %s\n", file_name );
	    continue;
	}
	(void) index_one( org_id, file_num, infile, index_interval );
	(void) close( infile );
    }

    return 0;
}


int index_one ( char * org_id, char * file_num, int infile, int index_interval ) {
    unsigned char  *bptr;
    /* Maintain an unsigned char version of the current input char, for passing to MD5Update */
    char   chr;
    unsigned long   c;
    unsigned long   seqlen;
    crc_value_t     crc;
    MD5_CTX	    ctx;
    long long       seek;
    int             idlen, ntogo, nfills, index_point, nerror;

    idbuf[0] = '\0';  /* initialize to empty string */
    index_point = 0;
    bptr   = buffer;
    ntogo  = 0;
    nfills = 0;
    seqlen = 0;
    nerror = 0;
    crc    = 0;
    MD5Init(&ctx);

    /* Next line in file */

    while ( 1 ) {
        if ( ntogo <= 0 ) {
            ntogo = read( infile, (void *) buffer, (size_t) BUFLEN );
            if ( ntogo <= 0 ) {
                report_len( org_id, idbuf, seqlen, crc, &ctx );
                return ntogo;
            }
            nfills++; bptr= buffer;
        }
        c = *bptr++; ntogo--;
	chr = tolower(c);

        /* could be > or sequence */

        if ( c == '>' ) {   /*  identifier line: copy the id to idbuf */

            /*  Is there a length to report?  */

            report_len( org_id, idbuf, seqlen, crc, &ctx );
            seqlen = 0;
	    MD5Init(&ctx);
            crc = 0;

            /*  Make a copy of the new id  */

            idlen = 0;
	    if ( ntogo <= 0 ) {
		ntogo = read( infile, (void *) buffer, (size_t) BUFLEN );
		if ( ntogo <= 0 ) {
		    report_len( org_id, idbuf, seqlen, crc, &ctx );
		    return ntogo;
		}
		nfills++; bptr = buffer;
	    }
	    c = *bptr++; ntogo--;
	    chr = tolower(c);
            while ( ( ! isspace(c) ) && ( idlen < IDLEN ) ) {
                idbuf[ idlen++ ] = c;
		if ( ntogo <= 0 ) {
		    ntogo = read( infile, (void *) buffer, (size_t) BUFLEN );
		    if ( ntogo <= 0 ) {
		        report_len( org_id, idbuf, seqlen, crc, &ctx );
		        return ntogo;
		    }
		    nfills++; bptr = buffer;
		}
		c = *bptr++; ntogo--;
		chr = tolower(c);
            }
            idbuf[ idlen ] = '\0';

            /*  report truncated id  */

            if ( ! isspace(c) ) {
                fprintf( stderr, "For organism %s, contig id truncated to %d characters:\n", org_id, (int) IDLEN );
                fprintf( stderr, ">%s\n", idbuf );
            }

            /*  Flush the rest of the input line  */

            while ( c != '\n' ) {
		if ( ntogo <= 0 ) {
		    ntogo = read( infile, (void *) buffer, (size_t) BUFLEN );
		    if ( ntogo <= 0 ) {
		        report_len( org_id, idbuf, seqlen, crc, &ctx );
		        return ntogo;
		    }
		    nfills++; bptr = buffer;
		}
		c = *bptr++; ntogo--;
		chr = tolower(c);
            }

            index_point = 0;  /*  next sequence milestone to report  */
        }

        /*  Not an id line, so it's data:  */

        else {
            while ( c != '\n' ) {
                if ( isnuc( c ) ) {

                    /*  If we are at the next index point, record the seek location  */

                    if ( ( seqlen >= index_point ) && idbuf[0] ) {
                        seek = ( nfills - 1 ) * (long long) BUFLEN
                             + ( bptr - buffer - 1 );
                        printf( "%s\t%s\t%d\t%d\t%s\t%lld\n", org_id, idbuf,
                                 index_point, index_point, file_num, seek
                              );
                        index_point += index_interval;
                    }
                    seqlen++;
                    crc = (crc << 8) ^ crctab[ (crc >> 24) ^ c ];
		    MD5Update(&ctx, &chr, 1);
                }

                /*  All non-nucleotides should be white space */

                else if ( ! isspace( c ) ) {

                    /*  Current perl code counts illegal characters as nucleotides;
                     *  so (for now) we do the same
                     */

                    if ( ( seqlen >= index_point ) && idbuf[0] ) {
                        seek = ( nfills - 1 ) * (long long) BUFLEN
                             + ( bptr - buffer - 1 );
                        printf( "%s\t%s\t%d\t%d\t%s\t%lld\n", org_id, idbuf,
                                index_point, index_point, file_num, seek
                              );
                        index_point += index_interval;
                    }
                    seqlen++;
                    crc = (crc << 8) ^ crctab[ (crc >> 24) ^ c ];
		    MD5Update(&ctx, &chr, 1);

                    /*  But let's add an error message: */

                    if ( nerror++ <= MAXERROR ) {
                        if ( nerror <= MAXERROR ) {
                            fprintf( stderr, "Invalid nucleotide (%c) in %s contig %s\n",
                                              c, org_id, idbuf
                                   );
                        }
                        else {
                            fprintf( stderr, "Etc.\n");
                        }
                    }
                }

                /*  Next character  */

		if ( ntogo <= 0 ) {
		    ntogo = read( infile, (void *) buffer, (size_t) BUFLEN );
		    if ( ntogo <= 0 ) {
		        report_len( org_id, idbuf, seqlen, crc, &ctx );
		        return ntogo;
		    }
		    nfills++; bptr = buffer;
		}
		c = *bptr++; ntogo--;
		chr = tolower(c);
            }
        }
    }

    return( 0 );   /*  never get here  */
}


void report_len( char * org, char * id, unsigned long seqlen, crc_value_t crc, MD5_CTX *ctx ) {
    if ( seqlen && id && id[0] ) {

	unsigned char digest[16];
	char result[33];
	
        /*  Finish the crc calculation with length bytes, ...  */

        crc_value_t n;
        n = seqlen;
        while (n != 0) {
            crc = (crc << 8) ^ crctab[ (crc >> 24) ^ (n & 0xFF) ];
            n >>= 8;
        }

        /*  ... and bitwise complement  */

	MD5Final(digest, ctx);
	hex_16(digest, result);
        printf( "%s\t%s\t%lu\t%lu\t%s\n", org, id, seqlen, ~crc, result  );
    }
}


void usage(char *prog) {
    fprintf( stderr,
             "Usage:  %s [ index_interval ] < file_list  > seeks_and_lengths\n"
             "or      %s -v   (to return version number on standard output)\n",
             prog, prog
           );
    exit(0);
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3