#include #include /****************************************************************************/ /* xblast.c 2.0 Jean-Michel Claverie Dec 92 */ /* module to read a BLAST output and to generate a filtered query */ /* This version includes the -rd option */ /* Default option: " X " the matching part */ /* -r: keep the matching part, "X" the rest */ /* -d: build a db of each individual contiguous matching segment */ /* each defline is then annotated with the [positions] */ /****************************************************************************/ #define MAXLINE 200 #define DEFLIMIT 500 #define SEQLIMIT 500000 #define MASK 'x' main (argc, argv) int argc; char *argv[]; { int fgt_line(), gt_1stword(), gt_fasta(); FILE *FIN, *QRY; char line[MAXLINE], *pline, word[MAXLINE], X= MASK; char defline[DEFLIMIT],defline2[DEFLIMIT],seq[SEQLIMIT], newseq[SEQLIMIT], *s, *ns; int c, i, pos1, pos2, size, OptR=0 , OptD=0, Opt=0, match; /*********************** test arguments and usages *************************/ if(argc<3 || argc >5) {fprintf(stderr,"\n USAGE: %s [-r] blast.output query \n",argv[0]); fprintf(stderr,"\n pipe | %s [-r] + query \n",argv[0]); fprintf(stderr," %s [-d] blast.output query \n",argv[0]); fprintf(stderr," pipe | %s [-d] + query \n",argv[0]); exit(0);} /************************ fopen and tests ********************************/ if (!strcmp(argv[1],"-r")) OptR=1; /* reverse option set */ if (!strcmp(argv[1],"-d")) OptD=1; /* reverse and divide options set */ Opt = OptR + OptD; /* Option offset */ if (*argv[1+Opt]=='+') FIN=stdin; else if (NULL==(FIN=fopen(argv[1+Opt],"r"))) { fprintf(stderr," ERROR: opening %s ?\n",argv[1+OptR]); exit(0);} if (NULL==(QRY=fopen(argv[2+Opt],"r"))) { fprintf(stderr," ERROR: opening %s ?\n",argv[2+Opt]); exit(0);} if (argc==4+Opt) X= (*argv[3+Opt]); /**************************** get fasta sequence in *************************/ if (2>=(size=gt_1stfasta(QRY, defline, seq))) { fprintf(stderr," ERROR: query %s ?\n",argv[2+Opt]); exit(0);} /****************************************************************************/ while (1) /* skipping until Sequences */ { c=fgt_line(FIN,MAXLINE,line); if (c==EOF) { fprintf(stderr," ERROR: bad BLAST output %s ?\n",argv[1]); exit(0); } gt_1stword (line,word); if(!strcmp("Sequences",word)) break; /* found Sequences */ } if (!OptR && !OptD) /* default: X matching segments */ { while (1) /* Looking for Query line */ { c=fgt_line(FIN,MAXLINE,line); if (c==EOF) break; c=gt_1stword (line,word); if(!strcmp("Query:",word)) /* found Query line */ { pline=line+c; c=gt_1stword (pline,word); sscanf(word,"%d",&pos1); pline += c; c=gt_1stword (pline,word); /* skip sequence */ pline += c; c=gt_1stword (pline,word); sscanf(word,"%d",&pos2); if (pos1> pos2) { c= pos1; pos1= pos2; pos2= c;} if (pos1 < 1 || pos2 > size) {fprintf(stderr," ERROR: query/ output inconsistency ?\n"); exit(0);} for (c=pos1-1; c pos2) { c= pos1; pos1= pos2; pos2= c;} if (pos1 < 1 || pos2 > size) {fprintf(stderr," ERROR: query/ output inconsistency ?\n"); exit(0);} for (c=pos1-1; c pos2) { c= pos1; pos1= pos2; pos2= c;} if (pos1 < 1 || pos2 > size) {fprintf(stderr," ERROR: query/ output inconsistency ?\n"); exit(0);} for (c=pos1-1; c */ /* function to input a line from an opened steam file into a char array,*/ /* with checking on the max length < max */ /* file "records" must be delimited by '\n' */ /* string is filled out until \n is encountered, or max-1 chars read */ /* or EOF occurs */ /* arguments: */ /* Fin: FILE pointer to input stream */ /* max: array dimension [], max length including the final \0 */ /* line: string or char * */ /* return: the number of usable chars in the string (\0 not included) */ /* : or max if capacity was exceeded, line contains */ /* max-1 usable characters. */ /* : 0 if NEW LINE was encountered first, line starts with '\0' */ /* :-1 if EOF was encountered, line starts with '\0' */ /* -------------------------------------------------------------------- */ fgt_line (Fin,max,line) int max; char line[]; FILE *Fin; { int c, i; for (i=0; (c=fgetc(Fin))!= EOF && c!='\n' && i<(max-1);i++) line[i]=c; line[i]='\0'; /* replace NEW LINE or last by '\0', don't count */ if (c==EOF) return(-1); /* EOF encountered */ if (i==(max-1) && c!='\n') { while (fgetc(Fin)!='\n'); /* clean buffer */ return(max); } /* flag overflow */ return (i); } /* sgt_line do the same things from a string */ /* ------------------------------------------------------------------- */ /* int gt_1stword() Jean-Michel Claverie feb 1991 SGI */ /* module to read the first word of a closed string */ /* ARGUMENTS */ /* IN: line, a closed string */ /* OUT: word, a closed string, WITH sufficient space allocation */ /* RETURN: next: the relative position of the first characters NOT */ /* belonging to the first word in the string */ /* thus, no more word in the string returns 0 (false) */ /* Note : the next call can then use : line + next, as a pointer to */ /* initiate the search for the next 1st word in the string */ /* -------------------------------------------------------------------- */ int gt_1stword (line,word) char *line, *word; { char *p; for ( p=line; isspace(*p) ; p++); /* skeep leading blanks */ if (*p == '\0') return (0) ; /* no word in line */ while (!isspace(*p) && *p!='\0'){ *word =(*p) ; word++; p++;} *word='\0'; return (p-line); } /********************************************************************** */ /* int gt_1stfasta () jean-Michel Claverie feb 92 SGI */ /* module to load the 1st defline and sequence from a fasta format file */ /* (DEFLIMIT-1) chars of the first line are stored */ /* Definition line must start with a " > " */ /* Sequence size input limited at SEQLIMIT-1 */ /* in: */ /* *Fin : "r" opened file (fasta db) */ /* char *defline, char *seq : char arrays for defline, sequence */ /* out: */ /* defline as a closed string, */ /* seq as a closed string, no blank, lower or upper */ /* return: sequence size or EOF (-1) if no more entry */ /********************************************************************** */ int gt_1stfasta(Fin, defline, seq) FILE *Fin; char *defline, *seq; { int i, c, size; char *in; /* loading all definition line up to deflimit */ if (EOF==(c=fgetc(Fin))) return c; /* end of fasta file */ if (c!='>') { fprintf(stderr,"\n ERROR: missing '>' in def line\n"); exit(0); } *defline='>'; for (i=DEFLIMIT-2, in=defline+1 ; i; i--, in++) { c=fgetc(Fin); if (c==EOF) {fprintf(stderr,"\n ERROR: unexpected EOF in def line\n"); exit(0); } *in = c; if (*in=='\n') break; /* end of def line reached */ } if (!i) /* DEFLIMIT was reached, go find end of line */ { while (EOF !=(c=fgetc(Fin))) if (c=='\n') break; if (c==EOF) {fprintf(stderr,"\n ERROR: unexpected EOF in def line %s\n", defline); exit(0); } } *in= '\0'; /* defline closure */ /* loading sequence up to EOF, next '>' or SEQLIMIT */ i=SEQLIMIT-1; size=0; while (EOF != (c=fgetc(Fin)) && c !='>') { if (!isspace (c)) { *seq =c; seq++; i--; size++; } if (!i) { fprintf (stderr,"\n ERROR: seq overflow %s %d \n", defline, size); exit(0); } } *seq='\0'; ungetc(c,Fin); return size; }