/* mutseqer.c * Michael E Sparks (mespar1@iastate.edu) * Last modified: 16 July 2007 * * Program to read in a file of FASTA-formatted nucleotide sequences, * e.g., cDNA sequences, of not more than MAXLENGTH-1 residues apiece, * and introduce a command line-specified, length-based percentage * (perc2mut) of * 1) random insertions or deletions of up to a specified maximum * length (maxindellen) or fixed percentage of the sequence length * (MAXPERCOFLEN), whichever is smaller, and/or * 2) base substitutions, * for the purpose of simulating random sequencing errors. * * Copyright (c) 2004,2007 Michael E Sparks * All rights reserved. * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include #include #include #include #include #define ALFSIZE 4 /* Cardinality of nt alphabet */ #define DELSYM '%' /* not element of nuctype[]! */ #define INSSYM '^' /* not element of nuctype[]! */ #define LINELEN 60 /* for pretty-printing results */ #define MAXLENGTH ((int)10E+6) /* Maximum length of sequence * * to mutate+1 */ #define MAXPERCOFLEN 0.10 /* Bound on indel length */ #define RESULTSTREAM stdout /* Stream to report results to */ #define USAGE "\a\nUSAGE: %s seqs2mut.fas perc2mut maxindellen\n\n" /* prototypes */ char *mutTheSeq(char *sequence,int seqlen,int perc2mut,int maxindel); void prettyprintseq(char *seq,int linelen,FILE *stream); /* main app */ int main(int argc,char *argv[]) { char *linein=NULL, /* buffer to slurp input into */ *sequence=NULL, /* buffer to store pristine seq */ *mutseqed=NULL, /* buffer to store mutated seq */ *commcheck=NULL, /* verify fasta format */ *progname=NULL, /* basename of executable */ *infilename=NULL; /* input stream name */ FILE *infile=NULL; /* connect to input file */ int seqlen, /* length of sequence */ perc2mut, /* user-specified mutation %'age */ maxindel, /* user-specified max indel len */ c, /* stores parsed characters */ i,j; /* iterator variables */ /* Parse command line parameters */ if(argc<3) { fprintf(stderr,USAGE,argv[0]); exit(EXIT_FAILURE); } /* find executable basename, assuming UNIX-like pathnames */ if((progname=strrchr(argv[0],(int)'/'))!=NULL) ++progname; else progname=argv[0]; /* is perc2mut a valid percentage? */ perc2mut=atoi(argv[2]); if(perc2mut<0||perc2mut>100) { fprintf(stderr,USAGE,argv[0]); fprintf(stderr," (%i not a valid percentage)\n",perc2mut); exit(EXIT_FAILURE); } infilename=argv[1]; if((infile=fopen(infilename,"rt"))==NULL) { fprintf(stderr,"Can't open %s!\n",infilename); exit(EXIT_FAILURE); } maxindel=atoi(argv[3]); /* Allocate static buffers */ if((linein=(char*)malloc(sizeof(char)*MAXLENGTH))==NULL|| (sequence=(char*)malloc(sizeof(char)*MAXLENGTH))==NULL) { fprintf(stderr,"Insufficient memory.\n"); exit(EXIT_FAILURE); } else linein[0]=sequence[0]='\0'; /* Process input sequences */ srand(time(NULL)); (void)fgets(linein,MAXLENGTH,infile); linein[strlen(linein)-1]='\0'; if((commcheck=(char*)memchr(linein,'>',1))==NULL) { free(linein); free(sequence); fclose(infile); fprintf(stderr,"Expected Fasta-formatted input!\n"); exit(EXIT_FAILURE); } else fprintf(RESULTSTREAM,"%s (Mutated at %d%% level by %s)\n", linein,perc2mut,progname); while(fgets(linein,MAXLENGTH,infile)!=NULL) { if((commcheck=(char*)memchr(linein,'>',1))!=NULL) { /* Print stored mutant */ seqlen=strlen(sequence); if((mutseqed=mutTheSeq(sequence,seqlen,perc2mut,maxindel))==NULL) { fprintf(stderr,"mutTheSeq returned no mutant!\n"); exit(EXIT_FAILURE); } else { prettyprintseq(mutseqed,LINELEN,RESULTSTREAM); free(mutseqed); } /* Report description line */ linein[strlen(linein)-1]='\0'; fprintf(RESULTSTREAM,"%s (Mutated at %d%% level by %s)\n", linein,perc2mut,progname); sequence[0]='\0'; } else { for(i=0,j=strlen(sequence);i (indelmaxlen=(int)(MAXPERCOFLEN*seqlen))) indellen=rand()%indelmaxlen+1; for(j=0;j=printseqlen&& \ (printseq=(char*)realloc(printseq, \ sizeof(char)*(printseqlen+=5000)))==NULL) { \ fprintf(stderr,"Err (mutTheSeq): Insufficient memory.\n"); \ exit(EXIT_FAILURE); \ } \ } /* Decode modified sequence signal into the printseq array * * i tracks position in pristine sequence, j in the mutated * * sequence, and k tracks bases just prior to insertions. */ for(i=j=k=0;i (indelmaxlen=(int)(MAXPERCOFLEN*seqlen))) indellen=rand()%indelmaxlen+1; GROW_PRINTSEQ_IF_NEEDED /* Write nt prior to insertion event... */ printseq[j++]=nucATinsert[k++]; /* ...and the insert itself. */ for(l=0;l