/* ** C program to read in sequences in Fasta format ** test them for internal repeats ** and print them out with the internal repeats flagged ** ** David J. States and Jean Michel Claverie ** May 12, 1992 , REV 3.0 October 28, 1993 ** ** Please cite: Jean Michel Claverie & David J. States (1993) ** Computers and Chemistry 17: 191-201. ** */ #include #include #include #include #include #include #include "datadefs.h" typedef struct { char *title; char *seq; int len; int max; } Sequence; #define TMPSIZE 32000 #define STRINGSIZE 32000 int ascend=1; int descend=1; int verbose=0; int ncut=4; int mcut=1; double pcut=0.01; int scut=0; char subchar='X'; int repeats=0; int *iseq,*hit; int* Allocate(nbytes) int nbytes; { int* ptr; ptr = (int*)malloc(nbytes); if (ptr==NULL) { perror("Allocate memory failure"); exit(1); } return(ptr); } void Free(ptr) int *ptr; { if (ptr != NULL) free(ptr); } char Translate(ch) char ch; { if (isupper(ch)) ch = tolower(ch); return(ch); } int AlphaToN2(ch,alphabet) int ch; char *alphabet; { int i; char *p; if (isupper(alphabet[0]) && islower(ch)) ch = toupper(ch); if (islower(alphabet[0]) && isupper(ch)) ch = tolower(ch); p = strchr(alphabet,ch); if (p != NULL) i = (p - alphabet); else i = 22; return(i); } void ReadSequence(F,s) FILE *F; Sequence *s; { char ch; int l,ich; s->len = l = 0; if (s->title == NULL) { s->title = (char*)Allocate(STRINGSIZE*sizeof(char)); } fgets(s->title,STRINGSIZE,F); if (s->title[strlen(s->title)-1] == '\n') s->title[strlen(s->title)-1] = '\0'; if (feof(F)) return; if (s->seq == NULL) { s->seq = (char*)Allocate(TMPSIZE*sizeof(int)); s->max = TMPSIZE; iseq = (int*)Allocate(TMPSIZE*sizeof(int)); hit = (int*)Allocate(TMPSIZE*sizeof(int)); } for (ich=getc(F); (ich!=EOF && ich!=(int)'>'); ich=getc(F) ) { ch = (char)ich; if (isprint(ch) && (ch != ' ')) s->seq[l++]=Translate(ch); if (l>=s->max) { s->seq = (char*)realloc(s->seq, (s->max+TMPSIZE)*sizeof(char)); s->max += TMPSIZE; iseq = (int*)realloc(iseq,(s->max+TMPSIZE)*sizeof(int)); hit = (int*)realloc(hit, (s->max+TMPSIZE)*sizeof(int)); } } s->seq[l] = '\0'; s->len = l; if (ich==(int)'>') ungetc(ich,F); } /* ** Calculate the expected information content, in nats, ** for an alignment described by the scoring matrix m ** given the sequence composition f */ double ExpectedInformation(m,lambda,f) SCOREMATRIX m; double lambda,f[20]; { int i,j; double sum,tot,fij,eij; sum = tot = 0.0; for (i=0; i<20; i++) for (j=0; j<20; j++) { fij = f[i]*f[j]; tot += fij; eij = m[i][j]*fij*exp(lambda*m[i][j]); sum += eij; } return(lambda*sum/tot); } /* ** Subroutine to actually process the sequence for repeats ** and write out the output */ void ProcessSeq(s,m,lambda,K,H) Sequence *s; SCOREMATRIX m; double lambda,K,H; { int i,j,k,off,sum,beg,end,top,noff; int topcut,fallcut; double s0; if (s->len == 0) return; for (i=0; ilen; i++) iseq[i] = AlphaToN2(s->seq[i],alphabet); for (i=0; ilen; i++) hit[i]=0; noff = s->len-1; if (ncut>0) noff=ncut; /* ** Determine the score cutoff so that pcut will be the fraction ** of random sequence eliminated assuming lambda, K, and H are ** characteristic of the database as a whole */ if (scut!=0) { topcut = scut; } else { s0 = - log( pcut*H / (noff*K) ) / lambda; if (s0>0) topcut = floor(s0 + log(s0)/lambda + 0.5); else topcut = 0; } fallcut = (int)log(K/0.001)/lambda; if (verbose) { printf("# Score cutoff = %d, Search from offsets %d to %d\n", topcut,mcut,noff); if (ascend && descend) printf("# both members of each repeat flagged\n"); else if (ascend) printf("# last member of each repeat flagged\n"); else if (descend) printf("# first member of each repeat flagged\n"); printf("# lambda = %.3f, K = %.3f, H = %.3f\n", lambda,K,H); } for (off=mcut; off<=noff; off++) { sum=top=0; beg=off; end=0; for (i=off; ilen; i++) { sum += m[iseq[i]][iseq[i-off]]; if (sum>top) { top=sum; end=i; } if (top>=topcut && top-sum>fallcut) { for (k=beg; k<=end; k++) { if (ascend) hit[k] = 1; if (descend) hit[k-off] = 1; } sum=top=0; beg=end=i+1; } else if (top-sum>fallcut) { sum=top=0; beg=end=i+1; } if (sum<0) { beg=end=i+1; sum=top=0; } } if (top>=topcut) { for (k=beg; k<=end; k++) { if (ascend) hit[k] = 1; if (descend) hit[k-off] = 1; } } } for (i=0; ilen; i++) { s->seq[i] = toupper(s->seq[i]); if (hit[i] ^ repeats) { if (subchar==0) { s->seq[i] = tolower(s->seq[i]); } else { s->seq[i] = subchar; } } } printf("%s\n",s->title); for (i=0; ilen; i+=60) { for (j=i; jlen; j++) putchar((int)s->seq[j]); putchar((int)'\n'); } } main (argc,argv) int argc; char *argv[]; { int i,ich; int *m; char ch,*filename; double lambda,K,H; Sequence s; FILE *F; K = 0.2; m = (int*)pam120; lambda = lambda120; s.title=s.seq=NULL; filename = NULL; for (i=1; iall diagonals\n"); printf(" [-m min-search-offset] default 1\n"); printf(" [-a] [-d] ascending or descending\n"); printf(" [-.] [-x] [-o] output options\n"); printf(" [-r] print repeats instead of unique\n"); printf(" [-v] verbose output\n"); exit(0); } if (strcmp(filename,"-")==0) F = stdin; else F = fopen(filename,"r"); if (F==NULL) { perror("Unable to open sequence file"); exit(1); } H = ExpectedInformation(m,lambda,fdayhoff); s.len=0; for (ich=getc(F); ich!=EOF; ich=getc(F)) { ch = (char)ich; if (ch=='>') { ungetc(ich,F); ProcessSeq(&s,m,lambda,K,H); ReadSequence(F,&s); } } ProcessSeq(&s,m,lambda,K,H); exit(0); /* to return 0 */ }