/* nrdb - quasi-nonredundant database generator Copyright (C) 1997 by Warren R. Gish. All Rights Reserved. Usage: nrdb [options] file1 [file2 [file3 [file4] ...]] Usage: nrdb [options] file1 id1 [file2 id2 [file3 id3 [file4 id4] ...]] where each file# argument is the name of an input file in FASTA format; and each id# (identifier) argument is an arbitrary (possibly zero-length) character string to be prepended to each sequence name read from the corresponding input file. By default, the nonredundant database is sent to standard output, but only after all input files have been read in the order specified. Statistics about the number of redundant sequence records and residues are reported to standard error at the end of a successful execution. See the call to the getopt() function and its associated switch-case statement below for a brief description of the available command line options. Example: nrdb -o outfile pir swissprot genpept gpupdate Updated: May 1997 with compression for ambiguous DNA sequences Author: W. Gish, NCBI, February 1991 */ #include #include #include #include #include #define EXTERN #include "nrdb.h" static int nfiles; NRFilePtr *files; static int maxcatfiles = INT_MAX; SeqStr ss; SeqName sn; size_t minlen = 1; int prefix_flag = 0; void usage(); main(argc, argv) int argc; char **argv; { struct rlimit rl; int i, j, c; SeqStrPtr new, old; SeqNamePtr snp; module = str_dup(misc_basename(argv[0], NULL)); while ((c = getopt(argc, argv, "pa:o:h:l:c:C:d:n:iL")) != -1) { switch (c) { case 'p': prefix_flag = 1; break; case 'a': /* append output to the specified file */ ofp = fopen(optarg, "a+"); if (ofp == NULL) fatal(1, "Unable to open specified output file"); break; case 'o': /* write output to the specified file (overwriting) */ ofp = fopen(optarg, "w"); if (ofp == NULL) fatal(1, "Unable to open specified output file"); break; case 'h': fprintf(stderr, "\nThe -h option is no longer valid and will be ignored for now.\n"); break; case 'l': /* minimum required sequence length */ if (sscanf(optarg, "%u", &minlen) != 1) fatal(1, "Invalid -l# minimum length"); break; case 'c': /* no. of input files for which definitions will be concatenated to any existing definition(s) */ if (sscanf(optarg, "%d", &maxcatfiles) != 1 || maxcatfiles < 0) fatal(1, "Invalid -c# max. number of concatenating files"); break; case 'n': /* max. no. of sequence names reported per sequence */ if (sscanf(optarg, "%d", &desc_max) != 1 || desc_max < 0) fatal(1, "invalid -n#: max. number of sequences names to report per sequence: %s", optarg); break; case 'i': /* skip comparison of first database against itself */ skip1 = TRUE; break; case 'C': if (chdir(optarg) == -1) fatal(1, "Couldn't change to directory: %s", optarg); break; case 'L': /* lock data in memory */ #ifdef DATLOCK if (plock(DATLOCK) == 0) plocked = 1; else break; #ifdef RLIMIT_RSS if (getrlimit(RLIMIT_RSS, &rl) != 0) break; rl.rlim_cur *= 0.67; setrlimit(RLIMIT_RSS, &rl); #endif /* RLIMIT_RSS */ #else /* !DATLOCK */ fprintf(stderr, "\nWARNING: the -L option is ignored on this computing platform.\n"); #endif /* !DATLOCK */ break; case 'd': /* delimiter between multiple descriptions */ if (strlen(optarg) == 1 && !isdigit(*optarg)) { delim = *optarg; break; } if (strncasecmp(optarg, "0x", 2) == 0) { if (sscanf(optarg+2, "%lx", &i) != 1) fatal(1, "Hex conversion error on delimiter %s", optarg); } else if (sscanf(optarg, "%d", &i) != 1) fatal(1, "Decimal conversion error on delimiter %s", optarg); if (i >= (1< \"%s\"\n", optarg); default: usage(); break; } } if (optind == argc) usage(); if (prefix_flag && (argc - optind)%2 != 0) fatal(1, "a filename was specified with no matching identifier"); initalpha(); if (ofp == NULL) { ofp = stdout; if (ofp == NULL) fatal(1, "could not open output file"); } #ifdef _IOFBF setvbuf(ofp, NULL, _IOFBF, 256*KBYTE); #endif nfiles = (argc - optind) / (1 + prefix_flag); files = (NRFilePtr *)ckalloc0(sizeof(NRFilePtr)*nfiles); SeqStr_InitBase(); for (i=optind, j = 0; iseqlen == 0) { curfp->numnull++; continue; } if (new->seqlen < minlen) { curfp->lencnt++; curfp->lenres += new->seqlen; continue; } if ((i != optind || !skip1) && (old = SeqStr_AlreadyFound(new)) != NULL) { curfp->dupres += old->seqlen; curfp->nummatches++; if (j < maxcatfiles || old->name1->nrfp->filenum == curfp->filenum) SeqStr_AppendName(old, new->name1); continue; } else { SeqStr_Append(new); continue; } } NRFile_Close(curfp); } Report(listhead); fflush(stdout); fprintf(stderr, "\n\nProgressive Statistics:\n\n"); fprintf(stderr, "%11s", ""); fprintf(stderr, "--------- Records --------- "); fprintf(stderr, "-------------- Residues -----------\n"); fprintf(stderr, "Database "); fprintf(stderr, " Read Duplicate Written "); fprintf(stderr, " Read Duplicate Written\n"); for (i=optind; inumseqs; nummatches += curfp->nummatches; totres += curfp->totres; dupres += curfp->dupres; lencnt += curfp->lencnt; lenres += curfp->lenres; numnull += curfp->numnull; fprintf(stderr, "%-10s %7s %7s %7s ", misc_basename(curfp->filename, NULL), Ultostr(curfp->numseqs,1), Ultostr(curfp->nummatches,1), Ultostr(curfp->numseqs - curfp->nummatches - curfp->lencnt - curfp->numnull,1) ); fprintf(stderr, "%11s %11s %11s\n", Ultostr(curfp->totres,1), Ultostr(curfp->dupres,1), Ultostr(curfp->totres - curfp->dupres - curfp->lenres,1) ); } fprintf(stderr, "\n%-10s %7s %7s %7s ", "Totals:", Ultostr(numseqs,1), Ultostr(nummatches,1), Ultostr(numseqs - nummatches - numnull - lencnt,1) ); fprintf(stderr, "%11s %11s %11s\n", Ultostr(totres,1), Ultostr(dupres,1), Ultostr(totres - dupres - lenres,1) ); if (numnull != 0) fprintf(stderr, "\nTotal no. of zero-length sequences encountered: %d\n", numnull); fprintf(stderr, "\nNo. of base word hits: %s (%s total)\n", Ultostr(numhits,1), Ultostr(numtothits,1)); fprintf(stderr, "No. of 32-bit hash hits: %s\n", Ultostr(numhashhits,1)); fprintf(stderr, "Total memory allocated: %0.3lf MB\n", (double)totalloced() / (KBYTE*KBYTE)); fprintf(stderr, "Longest comment line read: %s\n", Ultostr(maxnamelen,1)); fprintf(stderr, "Longest comment line written: %s\n", Ultostr(maxwnamelen,1)); fprintf(stderr, "Longest sequence read: %s\n", Ultostr(maxseqlen,1)); if (minlen > 1) fprintf(stderr, "Sequences less than %d residues in length: %s\n", minlen, Ultostr(lencnt,1)); exit(0); } void usage() { fprintf(stderr, "nrdb 2.0.1 -- quasi-nonredundant database generator\n\n"); fprintf(stderr, "Copyright (C) 1997,1998 by Warren R. Gish. All Rights Reserved.\n"); fprintf(stderr, "\nUsage:\n\n %s [options] file1 [file2 [file3 ...]]\n", module); fprintf(stderr, "\nUsage:\n\n %s -p [options] file1 id1 [file2 id2 [file3 id3 ...]]\n", module); fprintf(stderr, "\nwhere options are:\n"); fprintf(stderr, "\t-o filename\t-- name of file in which to save output\n"); fprintf(stderr, "\t-a filename\t-- name of file to which output should be appended\n"); fprintf(stderr, "\t-l#\t\t-- min. required sequence length\n"); fprintf(stderr, "\t-i\t\t-- do not check first file for duplicates\n"); fprintf(stderr, "\t-c#\t\t-- no. of input files for concatenating descriptions\n"); fprintf(stderr, "\t-C directory\t-- change directory\n"); #ifdef DATLOCK fprintf(stderr, "\t-L\t\t-- lock data pages in memory (super-user only)\n"); #endif fprintf(stderr, "\t-n#\t\t-- max. no. of seq. descriptions to report per sequence\n"); fprintf(stderr, "\t-d#\t\t-- delimiter between consecutive descriptions\n"); fprintf(stderr, "\t-p\t\t-- use id prefixes on command line\n"); exit(1); }