/************************************************************************/ /* orf.c Jean-Michel Claverie march 92 1.0 */ /* module to extract all orfs from a nucleotide sequence */ /* within a given range, default is 60 nt to 10000 nt */ /* note: only the given strand is studied. */ /* **********************************************************************/ /* USAGE: orf seq_name */ /************************************************************************/ /* output is a collection of protein sequences in fasta format */ /* sorted by decreasing size. */ /************************************************************************/ /* WARNING: Standard eukaryotic genetic code is used */ /************************************************************************/ #include #include /* default values for orf size range */ #define ORFMINSIZE 60 #define ORFMAXSIZE 10000 /* max original seq size */ #define LIMIT 1000000 /* max number of results */ #define RLIMIT 5000 char *CODGEN="xflvispatcrgsyhdnflvispatcrgsyhdnllvmspatwrgr*qekllvispat*rgr*qek"; main (argc, argv) int argc; char *argv[]; { int fileto_char(), cl_strg(), LOWER(); char lower(); char *SEQ_END,*last_codon,*valid[RLIMIT][2],*current_start,*end_orf,*current; char seq[LIMIT]; char orf[ORFMAXSIZE+1], *ps; int i, j, size , lorf, mxorf, nvalid, phase; int orf_min=ORFMINSIZE, orf_max=ORFMAXSIZE; int value[RLIMIT], indx[RLIMIT] ; /* -------------------- reading arguments ----------------------------- */ if (argc <2 || argc > 4) /* usage */ { fprintf (stderr, "\n USAGE: %s seq_name \n",argv[0]); exit(0); } if (argc >2) /* read ORF min size */ if (1!=sscanf(argv[2], "%d", &orf_min)) {fprintf (stderr, "\n ERROR: minimal size? %s\n", argv[2]);exit(0); } if (argc >3) /* read ORF max size */ if (1!=sscanf(argv[3], "%d", &orf_max)) {fprintf (stderr, "\n ERROR: maximal size? %s\n", argv[3]);exit(0); } if (orf_max < orf_min || orf_min <6) {fprintf (stderr, "\n ERROR: orf min or max?\n");exit(0); } /* -------------------------- get the sequence ------------------------ */ if (1> fileto_char(argv[1],seq, LIMIT)) { fprintf(stderr,"\n ERROR: sequence file %s\n",argv[1]); exit(0); } size=cl_strg(seq); LOWER(seq); SEQ_END=seq+size-1 ; /* -1 for size */ last_codon= SEQ_END -2; for (nvalid=0, phase=0; phase<3; phase++) { current_start= seq + phase; while (SEQ_END-current_start > 3 && nvalid < RLIMIT) { mxorf = is_orf(current_start,last_codon); end_orf= current_start + mxorf -1 ; lorf= end_orf - current_start + 1; if ( lorf >= orf_min && lorf <= orf_max) { valid[nvalid][0] = current_start; valid[nvalid][1] = end_orf; nvalid++; } current_start = end_orf + 4; } /* end one phase */ if (nvalid ==RLIMIT) { fprintf (stderr,"\n WARNING: orf number limited to %d first\n", nvalid); } } if (!nvalid) { fprintf(stderr,"\n No orfs within the requested size range\n"); exit (0); } fprintf(stderr,"\n %d orfs within the requested size range\n", nvalid); /****************************************************************************/ /* insert here the computation of any predictive index */ /****************************************************************************/ /* size of ORF in nucleotides */ for (i=0; i%s %d %d aa size: %d \n", argv[1],(current_start-seq+1),(end_orf -seq+1),value[i]/3); for (j=69, ps=orf; *ps != '\0'; ps++, j--) { putchar((int)*ps); if (!j) { putchar((int)'\n'); j=70; } } putchar((int)'\n'); } } /********************************************************************** */ /* int intsort_db() jean-Michel Claverie april 91 SGI */ /* module to sort a int array of known size */ /* input: fval[size] int array to be sorted */ /* order int : 1 for ascending, o for descending order */ /* output: isort[size] int array of sorted indices */ /* return: size */ /* If no related index array isort[size] is to be co-sorted, */ /* a NULL pointer must be passed for isort. */ /********************************************************************** */ int sort_db (fval, size, order, isort) int *isort, size, order; int *fval; { int *f1 , *f0, f; int *i1, *i0, c, i; if (size<2) return size; if (isort==NULL) { i=1; while (i *f0) { f= (*f1); *f1 = (*f0); *f0 =f; /* swap values */ if ( i >1) i--; else i++; } else i++; } if (order) { rev_i (fval,size);} } else { for (i=0, i0=isort; i *f0) { f= (*f1); *f1 = (*f0); *f0 =f; /* swap values */ c= (*i1); *i1 = (*i0); *i0 =c; /* swat index */ if ( i >1) i--; else i++; } else i++; } if (order) { rev_i (fval,size); rev_i (isort,size); } } return size; } /************************************************************************/ /* is_orf() Jean-Michel Claverie April 1991 SGI */ /* module to determine where is the next stop codon */ /* return the int position in the sequence where first STOP codon */ /* occured. */ /* Input: genetic code: string */ /* seq string pointer, end string pointer last usable position */ /************************************************************************/ int is_orf(seq, end) char *seq, *end ; { char *ps; for (ps=seq; ps<=end; ps+=3) if (*(CODGEN+cod_n(ps,3))=='*') break; return (ps-seq); } /************************************************************************/ /* cod_nx.c JMC dec 1990 SGI */ /* VERSION 2 of cod_n MARCH 1991: reacts better on unknown codons */ /* This module compute the non-zero code of a (DNA) subsequence */ /* arguments : a *char pointer to the current position of the seq */ /************************************************************************/ /* unknown codons returns zero, and a warning message on stderr */ /* true codons are coded in [1-64] */ /************************************************************************/ int cod_n(seq,ktuple) char *seq; int ktuple; { static int val[11] = {1,4,16,64,256,1024,4096,16384,65536,262144,1048576}; char *n = "tcga"; /* allowed nucleotides */ char *a; int i; int code = 0; int *v; if(ktuple>11 || ktuple<1){printf("\n ERROR ktuple value? cod_n\n"); exit(0);} for(i=0,v=val; ipmin ; pmax--, pmin++) { i=(*pmin); *pmin= (*pmax); *pmax=i; } return size; } /* ------------------------------------------------------------------ */ /* fileto_char JMC nov 1990 SGI */ /* #include */ /* This module read a text file into a character array (closed string) */ /* and return the number of usable characters (not including '\0') */ /* argument IN : */ /* fname: valid input file name. */ /* limit: char array [limit], the number of usable chars is (limit-1) */ /* argument OUT: */ /* *array : pointer to the char array to be filled */ /* return: */ /* 0 for existing empty file */ /* size= number of usable char (not including '\0') */ /* -1 for error upon file opening */ /* limit in case of overflow (limit-1) chars are usable */ /* all read chars including '\n' are included in char array */ /* file is closed upon return */ /* ------------------------------------------------------------------ */ int fileto_char(fname,array,limit) char *array, *fname; int limit; { FILE *Fin; int c,i; /* c is needed to detect EOF */ if((Fin=fopen(fname,"r"))==NULL) /* test for successful opening */ { fprintf(stderr,"\n ERROR: unable to open file %s \n",fname); return (-1); } for(i=0;(c=fgetc(Fin))!=EOF && i< (limit-1); i++,array++) *array=c ; *array='\0'; /* close the string and replace EOF */ fclose(Fin); /* close the file for easy re-entry */ if (i==(limit-1) && c != EOF) /* test for overflow */ { fprintf(stderr,"\n WARNING truncated reading of file: %s",fname); return (limit); /* flag for overflow */ } return(i); /* return the number of usable chars */ } /* ---------------------------------------------------------------------*/ /* #include */ /* cl_strg.c JMC nov 1990 SGI */ /* for space char cleaning and compacting a string */ /* this module gets a string with a \0 delimiter in and */ /* 1) remove all "blank characters", */ /* 2) write a '\0' right after the last non-blank character, */ /* 3) return the size=number of usable non blank characters. */ /* Use this module to provide a bona fide file name */ /* argument IN: */ /* inarray, a string with a least a End-Of-String delimiter */ /* argument OUT: */ /* outarray,the same string with no leading, trailing or intervening */ /* blanks, ended by '\0'). */ /* return: */ /* the number of non blank characters, excluding the final '\0'. */ int cl_strg(inarray) char *inarray; { char *outarray; int size=0; outarray=inarray; /* we will work within the same string */ while(isspace((int)*inarray)) inarray++; /* skeep all leading blanks */ while(*inarray != '\0') /* to the end of the input string */ { if(!isspace((int)*inarray)) {*outarray= *inarray; outarray++; size++;} /* copy it */ inarray++; /* move forward, don't copy */ } *outarray='\0'; /* close the string */ return(size); /* return size */ } /* -------------------------------------------------------------------- */ /* LOWER() Jean-Michel Claverie SGI Jan 91 */ /* module to convert a closed STRING from upper (A) to lower (a) case */ /* argument: input : a closed string */ /* return: number of char in string, excluding the ending '\0' */ /* -------------------------------------------------------------------- */ int LOWER(array) char *array; { char *pc; pc=array; while (*pc !='\0'){ *pc=lower(*pc); pc++;} return(pc-array); } /* -------------------------------------------------------------------- */ /* lower() Jean-Michel Claverie SGI Jan 91 */ /* module to convert a character from upper (A) to lower (a) case */ /* argument: input : a char */ /* return: the converted char */ /* -------------------------------------------------------------------- */ char lower(c) char c; { if (c>='A' && c<='Z') return (c + 'a'-'A'); else return(c); }