#!/usr/bin/env python '''Allows loading gthxml PGS data into xGDB (http://sourceforge.net/projects/xgdb) This module allows one to load GeneSeqer or GenomeThreader spliced alignment results, for nucleotide- and/or protein-type query sequences, into an xGDB backend. It parses results stored in the gthxml format, as defined at http://www.public.iastate.edu/~mespar1/gthxml/GenomeThreader.rng.txt . (Note that GenomeThreader can generate this output directly using the -xmlout option, whereas GeneSeqer/GeneSeqer2 plain text output can be retrofitted to this standard via the GSQ2XML.pl tool, available at http://www.public.iastate.edu/~mespar1/gthxml/GSQ2XML.pl .) Our local xGDB administrators have historically loaded only the transcript-based hqPGS results from GeneSeqer output; I refer to these data as ``autocorrelated spliced alignments". There is (currently) no such equivalent in the GenomeThreader world, though. Therefore, the user should specify if his/her input data is based on GeneSeqer results .AND. if they wish to use hqPGS data rather than raw spliced alignments (set --geneseqer); the default for this is GenomeThreader-style. Note that hqPGS data are not available for protein spliced alignments. Usage: python xGDBload_PgsFromgthxml.py -h --help # prints this message and exits -x --xreffile file # cross reference table file for genomic template indices # per-line format is either # '(\S+)\s+(\S+)\s+chr(\d+)' .OR. # '(\S+)\s+(\S+)\s+(\d+)' # depending on whether a chromosome was used as the genomic # reference type or not, respectively. The third token will # be used as a unique numeric key for a given genomic template, # while the identifier for that template, as used in the # spliced alignment gthxml output, must be indicated in # either field one or two of the record. -c --cxreffile file # cross reference file for unique identifiers, e.g., GI # numbers, of full-length cDNA sequences. Optional. These are # not required to be numeric, but cannot contain whitespace. -p --pxreffile file # cross reference file for unique identifiers, e.g., GI # numbers, of protein sequences. These are not required to # be numeric, but cannot contain whitespace. If the gthxml # data contains protein spliced alignment data, this option # is mandatory. -i --infile file # Input file identifier. Input stream can be gthxml data # piped by stdin (default), a file on the resident file # system, or the URL for a (strictly!) gzip'ed gthxml file # from the Web -o --outfile file # Output file identifier. Can be stdout (default) or a # local file, which will be overwritten if it exists -t --table name # Handle for xGDB tables to be populated (default is 'good_pgs') -u --cdnatable name # If query sequences contain full-length cDNAs, you can # specify a table name for storing cDNA-based results, separate # from those of ESTs and/or proteins. Use in conjunction with # the -c option, if desired (default table name is # 'cdna_good_pgs', enacted only if -c is user-defined). -v --prottable name # If query sequences contain proteins, you can specify a # table name for storing protein-based results, separate # from those of ESTs/cDNAs. Use in conjunction with # the -p option, if desired (default table name is # 'prot_good_pgs', enacted only if -p is user-defined). --chromosome # Were alignments made with fully assembled pseudomolecules? --geneseqer # Load hqPGS data (autocorrelated spliced alignments) from # GeneSeqer results, rather than PGS? Note that hqPGS data # are not available for protein alignments. --nuclloscore float # minimum overall alignment score of EST/cDNA alignments (default 0.8) --nuclhiscore float # maximum overall alignment score of EST/cDNA alignments (default 2.0) --protloscore float # minimum overall alignment score of protein alignments (default 0.3) --prothiscore float # maximum overall alignment score of protein alignments (default 2.0) --nucllocov float # minimum coverage of max(template,EST/cDNA) coverage (default 0.8) --nuclhicov float # maximum coverage of max(template,EST/cDNA) coverage (default 2.0) --protlocov float # minimum coverage of max(template,protein) coverage (default 0.5) --prothicov float # maximum coverage of max(template,protein) coverage (default 2.0) ''' __author__ = 'Michael E Sparks (mespar1@iastate.edu)' __version__ = '1.2.1' __copyright__ = 'Copyright (c) 2006 Michael E Sparks, all rights reserved' __license__ = 'GNU General Public License (GPL)' # Version history # 1.2.1 - Records 'PGS_line' data regardless, overwriting # with hqPGS data, if desired # Modified some comments # # 1.2 - Added support for parsing protein alignment data # cDNA xref file now only requires a single column # Modified some comments # # 1.1 - Minor modifications to comments # Enclosed gi column in single quotes # # 1.0 - Initial release # Parsed EST/cDNA-based PGS and hqPGS results import getopt, re, sys from xml.sax.handler import ContentHandler ABS_MIN_SCORE = 0.00 ABS_MAX_SCORE = 2.00 # I don't like this, but it works...mes ABS_MIN_COV = 0.00 ABS_MAX_COV = 2.00 # I don't like this, but it works...mes class xGDBgthxmlPGSError(Exception): '''Exception subclass for addressing fatal errors in xGDBgthxmlPGS objects''' pass class xGDBgthxmlPGS(ContentHandler): '''A handler to populate xGDB with spliced alignment results in gthxml format. Allows one to parse a gthxml v1.0 file from a local filesystem or over the Web and print executable SQL statements to populate appropriate xGDB backend tables. ''' def __init__(self,argv): '''Class "constructor" method''' # Set default values prior to parsing argument list # Note that these may trigger exceptions later in the class # methods, but this is the intended effect! In short, don't # monkey with this part of the code. self.filexref = '/dev/null' # cross ref table file for genomic template indices self.cfilexref = '/dev/null' # cross ref table file for cDNA identification # strings, such as GI numbers (optional) self.cfilexrefp = False # Did user specify a cDNA cross ref file? self.pfilexref = '/dev/null' # cross ref table file for prot identification # strings, such as GI numbers (optional) self.pfilexrefp = False # Did user specify a protein cross ref file? self.table = 'good_pgs' # handle for the xGDB tables we want to populate self.ctable = 'cdna_good_pgs' # handle for xGDB tables we want to populate with cDNA info self.ptable = 'prot_good_pgs' # handle for xGDB tables we want to populate with prot info self.chrp = False # Were alignments made with fully assembled pseudomolecules? self.gsqp = False # Input is hqPGS from GeneSeqer, or no? (Not possible if # source program was GenomeThreader, nor for protein # alignments!) self.infile = sys.stdin # Input file identifier. Input stream can be gthxml data # piped by stdin, a file on the resident file system, or a # (strictly!) gzip'ed gthxml file from the Web self.instreamp = False # Have we opened the input stream yet? self.webp = False # If self.infile contains an 'http://' prefix, this will be # be set to True in the code below. User will not directly # modify this data member self.outfile = sys.stdout # Output file identifier. Can be stdout or a local file, # which will be overwritten if it exists self.outstreamp = False # Have we opened the output stream yet? self.nuclloscore = 0.80 # minimum overall alignment score of EST/cDNA alignments self.nuclhiscore = ABS_MAX_SCORE # maximum overall alignment score of EST/cDNA alignments self.protloscore = 0.30 # minimum overall alignment score of protein alignments self.prothiscore = ABS_MAX_SCORE # maximum overall alignment score of protein alignments self.nucllocov = 0.80 # minimum coverage of max(template,EST/cDNA) coverage self.nuclhicov = ABS_MAX_COV # maximum coverage of max(template,EST/cDNA) coverage self.protlocov = 0.50 # minimum coverage of max(template,protein) coverage self.prothicov = ABS_MAX_COV # maximum coverage of max(template,protein) coverage self.scoreDNE = '-1.0' # Stub value for protein donor/acceptor scores, # which do not exist. self.phase_symbols = ('a', 'b', 'c') # Used to correlate gap phases to symbols # in protein alignments # Parse option list try: opts, args = getopt.getopt(argv, 'hx:c:p:t:u:v:i:o:', ['help', 'xreffile=', 'cxreffile=', 'pxreffile=', 'table=', 'cdnatable=', 'prottable=', 'infile=', 'outfile=', 'chromosome', 'geneseqer', 'nuclloscore=', 'nuclhiscore=', 'protloscore=', 'prothiscore=', 'nucllocov=', 'nuclhicov=', 'protlocov=', 'prothicov=']) except getopt.GetoptError: print __doc__ sys.exit(2) for o, a in opts: if o in ('-h', '--help'): print __doc__ sys.exit(None) elif o in ('-x', '--xreffile'): self.filexref = a elif o in ('-c', '--cxreffile'): self.cfilexref = a elif o in ('-p', '--pxreffile'): self.pfilexref = a elif o in ('-t', '--table'): self.table = a elif o in ('-u', '--cdnatable'): self.ctable = a elif o in ('-v', '--prottable'): self.ptable = a elif o in ('-i', '--infile'): self.infile = a elif o in ('-o', '--outfile'): self.outfile = a elif o == '--chromosome': self.chrp = True elif o == '--geneseqer': self.gsqp = True elif o == '--nuclloscore': self.nuclloscore = float(a) elif o == '--nuclhiscore': self.nuclhiscore = float(a) elif o == '--protloscore': self.protloscore = float(a) elif o == '--prothiscore': self.prothiscore = float(a) elif o == '--nucllocov': self.nucllocov = float(a) elif o == '--nuclhicov': self.nuclhicov = float(a) elif o == '--protlocov': self.protlocov = float(a) elif o == '--prothicov': self.prothicov = float(a) # Verify nucleotide/protein score and coverage values are reasonable. self.__validateparms() # Populate cross-reference dictionaries self.__xrefLookup(self.filexref,self.chrp) self.__cpxrefLookup(self.cfilexref,'cDNA') self.__cpxrefLookup(self.pfilexref,'prot') # Attempt to open input stream if self.infile is not sys.stdin: if re.search(r'^http://',self.infile): # This flag will come in handy when we need to close compressed_gthxml_stream self.webp = True # I only support parsing gzip'ed gthxml over the web, # to enforce lower overhead on network traffic. import gzip, StringIO, urllib websocket = urllib.urlopen(self.infile) compressed_gthxml = websocket.read() websocket.close() self.compressed_gthxml_stream = StringIO.StringIO(compressed_gthxml) self.instream = gzip.GzipFile(fileobj=self.compressed_gthxml_stream) else: self.instream = open(self.infile,"r") else: self.instream = self.infile self.instreamp = True # Attempt to open output stream if self.outfile is not sys.stdout: self.outstream = open(self.outfile,"w") else: self.outstream = self.outfile self.outstreamp = True # initialize various data members self.__reset() def __validateparms(self): '''Verify nucleotide/protein score and coverage values are reasonable.''' # Verify nucleotide score values were legitimate if self.nuclloscore > self.nuclhiscore: sys.stderr.write('Warning: nuclloscore > nuclhiscore, swapping instead!??\n') garbage = self.nuclloscore self.nuclloscore = self.nuclhiscore self.nuclhiscore = garbage if self.nuclloscore < ABS_MIN_SCORE: sys.stderr.write('Warning: Bad --nuclloscore, using %s instead\n' \ % str(ABS_MIN_SCORE)) self.nuclloscore = ABS_MIN_SCORE if self.nuclhiscore > ABS_MAX_SCORE: sys.stderr.write('Warning: Bad --nuclhiscore, using %s instead\n' \ % str(ABS_MAX_SCORE)) self.nuclhiscore = ABS_MAX_SCORE # Verify protein score values were legitimate if self.protloscore > self.prothiscore: sys.stderr.write('Warning: protloscore > prothiscore, swapping instead!??\n') garbage = self.protloscore self.protloscore = self.prothiscore self.prothiscore = garbage if self.protloscore < ABS_MIN_SCORE: sys.stderr.write('Warning: Bad --protloscore, using %s instead\n' \ % str(ABS_MIN_SCORE)) self.protloscore = ABS_MIN_SCORE if self.prothiscore > ABS_MAX_SCORE: sys.stderr.write('Warning: Bad --prothiscore, using %s instead\n' \ % str(ABS_MAX_SCORE)) self.prothiscore = ABS_MAX_SCORE # Verify nucleotide coverage values were legitimate if self.nucllocov > self.nuclhicov: sys.stderr.write('Warning: nucllocov > nuclhicov, swapping instead!??\n') garbage = self.nucllocov self.nucllocov = self.nuclhicov self.nuclhicov = garbage if self.nucllocov < ABS_MIN_COV: sys.stderr.write('Warning: Bad --nucllocov, using %s instead\n' % str(ABS_MIN_COV)) self.nucllocov = ABS_MIN_COV if self.nuclhicov > ABS_MAX_COV: sys.stderr.write('Warning: Bad --nuclhicov, using %s instead\n' % str(ABS_MAX_COV)) self.nuclhicov = ABS_MAX_COV # Verify protein coverage values were legitimate if self.protlocov > self.prothicov: sys.stderr.write('Warning: protlocov > prothicov, swapping instead!??\n') garbage = self.protlocov self.protlocov = self.prothicov self.prothicov = garbage if self.protlocov < ABS_MIN_COV: sys.stderr.write('Warning: Bad --protlocov, using %s instead\n' % str(ABS_MIN_COV)) self.protlocov = ABS_MIN_COV if self.prothicov > ABS_MAX_COV: sys.stderr.write('Warning: Bad --prothicov, using %s instead\n' % str(ABS_MAX_COV)) self.prothicov = ABS_MAX_COV def __xrefLookup(self,indfilename,chrp): '''Populate dictionary for storing numeric indices for genomic templates This function consults a lookup table (indfilename, a plain text file in the resident file system) and populates dictionaries for storing numeric indices based on Locus or Accession identifiers. The agreed-upon format for this text file is Alias {\t} Alias {\t} ID {\n} Where this is generally Locus \t Accession \t GI. The third field will be used as the indexed key. At least one of fields one and two must refer to the genomic template *as it is indexed in the gthxml output!* ''' if indfilename == '/dev/null': raise xGDBgthxmlPGSError, \ 'Error: Must specify a cross reference file for genomic templates!' # We must distinguish alternate formats for the xref index file # based on whether a chromosome was used as the genomic reference # type or not. if chrp: regex = re.compile('^(\S+)\s+(\S+)\s+chr(\S+)$') else: regex = re.compile('^(\S+)\s+(\S+)\s+(\S+)$') self.xref_dictLOC2GI = {} self.xref_dictACC2GI = {} xref = open(indfilename,'r') for xrefline in xref: match = regex.match(xrefline) if match and not re.search(r'\D',match.group(3)): # populate the xref dictionaries self.xref_dictLOC2GI[match.group(1)] = int(match.group(3)) self.xref_dictACC2GI[match.group(2)] = int(match.group(3)) else: xref.close() raise xGDBgthxmlPGSError, \ 'Error: Blatantly non-conforming crud in xref file!' xref.close() def __cpxrefLookup(self,cpindfilename,qtype): '''Populate cDNA or protein dictionary Populate dictionary for storing identifiers of full-length cDNA or protein sequences in the input file. This data structure is useful for appropriate segregation of results based on mixtures of query types, e.g., an EST+cDNA mixture, an EST+cDNA+protein mixture, etc. The agreed-upon format for this text file is '^(\S+)$' where each record corresponds to the identification string (possibly GI, but not necessarily) of a cDNA or protein. ''' if cpindfilename == '/dev/null': # This is a legitimate condition; self.cfilexrefp or # self.pfilexrefp are set to False by default. pass else: if qtype not in ('cDNA', 'prot'): raise xGDBgthxmlPGSError, \ 'Error: Internal misuse of __cpxrefLookup method!' elif qtype == 'cDNA': self.cfilexrefp = True self.xref_cdnas = {} else: # qtype == 'prot' self.pfilexrefp = True self.xref_prots = {} cpxref = open(cpindfilename,'r') cpregex = re.compile('^(\S+)$') for cpxrefline in cpxref: match = cpregex.match(cpxrefline) if match: if qtype == 'cDNA': # populate the xref_cdnas dictionary self.xref_cdnas[match.group(1)] = True else: # qtype == 'prot' # populate the xref_prots dictionary self.xref_prots[match.group(1)] = True else: cpxref.close() if qtype == 'cDNA': raise xGDBgthxmlPGSError, \ 'Error: Blatantly non-conforming crud in cdna xref file!' else: # qtype == 'prot' raise xGDBgthxmlPGSError, \ 'Error: Blatantly non-conforming crud in prot xref file!' cpxref.close() def __reset(self): # Bools for triggering SAX events self.in_alnmodule = False self.in_exintinfo = False self.in_matchline = False self.in_totalnscore = False self.in_cumlen = False self.in_PGS = False self.in_alignment = False self.in_genome_strand = False self.in_mrna_strand = False self.in_prot_strand = False self.in_hqPGS = False # Does gene structure contain >= 1 intron? self.intronp = False # strings used to process alignments self.genome_strand = '' self.mrna_strand = '' self.prot_strand = '' # Stores query type: cDNA or Protein self.querytype = '' # Lists for storing exon and intron data self.exon_list = [] self.intron_list = [] # 15 dictionary fields for main # xGDB alignment table # # Okay, so you might notice that there are # actually 16 fields. This is because xGDB # arbitrarily uses a field named "chr" in # tables storing alignments involving # assembled pseudomolecules, and "gseg_gi" # in those involving BAC/GSS assembly/etc.-type # genomic templates. They are the same data # type in the backend, just named different. self.mainspaln = { 'uid' : 'NULL', \ 'gi' : '', \ 'E_O' : '', \ 'sim' : '', \ 'mlength' : '', \ 'cov' : '', \ 'gseg_gi' : '', \ 'chr' : '', \ 'G_O' : '', \ 'l_pos' : '', \ 'r_pos' : '', \ 'pgs' : '', \ 'pgs_lpos' : '', \ 'pgs_rpos' : '', \ 'gseg_gaps' : '', \ 'pgs_gaps' : '' \ } # 7 dictionary fields for exon xGDB alignment table self.exonspaln = { 'pgs_uid' : 'LAST_INSERT_ID()', \ 'num' : '', \ 'gseg_start' : '', \ 'gseg_stop' : '', \ 'pgs_start' : '', \ 'pgs_stop' : '', \ 'score' : '' \ } # 8 dictionary fields for intron xGDB alignment table self.intronspaln = { 'pgs_uid' : 'LAST_INSERT_ID()', \ 'num' : '', \ 'gseg_start' : '', \ 'gseg_stop' : '', \ 'Dscore' : '', \ 'Dsim' : '', \ 'Ascore' : '', \ 'Asim' : '' \ } # Override startElement method from ContentHandler def startElement(self,name,attrs): if name == 'alignment_module': self.in_alnmodule = True if self.in_alnmodule: if name == 'exon-intron_info': self.in_exintinfo = True if self.in_exintinfo: if name == 'exon': self.currserial = int(attrs['e_serial']) self.exon_list.append([]) # Note: self.exon_list is set to [] # in the __reset method if name == 'gDNA_exon_boundary': self.exon_list[self.currserial - 1].extend(( \ attrs['g_start'], \ attrs['g_stop'])) if name == 'reference_exon_boundary': self.querytype = attrs['r_type'] self.exon_list[self.currserial - 1].extend(( \ attrs['r_start'], \ attrs['r_stop'], \ attrs['r_score'])) if name == 'intron': self.intronp = True self.currserial = int(attrs['i_serial']) self.intron_list.append([]) # Note: self.intron_list is set to [] # in the __reset method if name == 'gDNA_intron_boundary': self.intron_list[self.currserial - 1].extend(( \ attrs['i_start'], \ attrs['i_stop'])) if name == 'donor': # One exon must come before any introns, so this test is safe if self.querytype == 'cDNA': self.intron_list[self.currserial - 1].extend(( \ attrs['d_prob'], \ attrs['d_score'])) else: # The Dsim field, as used in the xGDB table structure, is # not defined for protein alignments--I set it to a default, # ``impossible" value of -1.0 (see self.scoreDNE). self.intron_list[self.currserial - 1].extend(( \ attrs['d_prob'], \ self.scoreDNE)) if name == 'acceptor': # One exon must come before any introns, so this test is safe if self.querytype == 'cDNA': self.intron_list[self.currserial - 1].extend(( \ attrs['a_prob'], \ attrs['a_score'])) else: # The Asim field, as used in the xGDB table structure, is # not defined for protein alignments--I set it to a default, # ``impossible" value of -1.0 (see self.scoreDNE). self.intron_list[self.currserial - 1].extend(( \ attrs['a_prob'], \ self.scoreDNE)) if name == 'MATCH_line': self.mainspaln['gi'] = attrs['ref_id'] self.in_matchline = True if self.in_matchline: if name == 'total_alignment_score': self.in_totalnscore = True if name == 'cumulative_length_of_scored_exons': self.in_cumlen = True if name == 'coverage': self.mainspaln['cov'] = attrs['percentage'] if name == 'PGS_line': self.in_PGS = True if self.in_PGS: if name == 'gDNA': if self.chrp: # access pseudomolecule xGDB table if self.xref_dictLOC2GI.has_key(attrs['gen_id']): self.mainspaln['chr'] = self.xref_dictLOC2GI[attrs['gen_id']] elif self.xref_dictACC2GI.has_key(attrs['gen_id']): self.mainspaln['chr'] = self.xref_dictACC2GI[attrs['gen_id']] else: raise xGDBgthxmlPGSError, \ 'Error: xreffile does not index a given gDNA!' else: # other genomic template types if self.xref_dictLOC2GI.has_key(attrs['gen_id']): self.mainspaln['gseg_gi'] = self.xref_dictLOC2GI[attrs['gen_id']] elif self.xref_dictACC2GI.has_key(attrs['gen_id']): self.mainspaln['gseg_gi'] = self.xref_dictACC2GI[attrs['gen_id']] else: raise xGDBgthxmlPGSError, \ 'Error: xreffile does not index a given gDNA!' self.mainspaln['G_O'] = attrs['gen_strand'] if name == 'rDNA': self.mainspaln['eseg_gi'] = attrs['rDNA_id'] self.mainspaln['E_O'] = attrs['rDNA_strand'] if name == 'rProt': self.mainspaln['eseg_gi'] = attrs['rProt_id'] self.mainspaln['E_O'] = '+' # Reasonable default! :) if name == 'exon': self.mainspaln['pgs'] = \ self.mainspaln['pgs'] + \ attrs['e_start'] + \ ' ' + \ attrs['e_stop'] + \ ',' if name == 'alignment': self.in_alignment = True if self.in_alignment: if name == 'genome_strand': self.in_genome_strand = True if name == 'mrna_strand': self.in_mrna_strand = True if name == 'queryProt': self.in_prot_strand = True if name == 'hqPGS_line' and self.gsqp: # autocorrelated spliced alignments # Processing these elements will overwrite the raw PGS data stored # in the mainspaln dictionary. Support for hqPGS data from protein # alignments (not available in GeneSeqer or GenomeThreader, as of # version 1.2.1 of this script) is implemented to support this feature # if it should become available. self.in_hqPGS = True if self.in_hqPGS: if name == 'hqgDNA': if self.chrp: # access pseudomolecule xGDB table if self.xref_dictLOC2GI.has_key(attrs['gen_id']): self.mainspaln['chr'] = self.xref_dictLOC2GI[attrs['gen_id']] elif self.xref_dictACC2GI.has_key(attrs['gen_id']): self.mainspaln['chr'] = self.xref_dictACC2GI[attrs['gen_id']] else: raise xGDBgthxmlPGSError, \ 'Error: xreffile does not index a given gDNA!' else: # other genomic template types if self.xref_dictLOC2GI.has_key(attrs['gen_id']): self.mainspaln['gseg_gi'] = self.xref_dictLOC2GI[attrs['gen_id']] elif self.xref_dictACC2GI.has_key(attrs['gen_id']): self.mainspaln['gseg_gi'] = self.xref_dictACC2GI[attrs['gen_id']] else: raise xGDBgthxmlPGSError, \ 'Error: xreffile does not index a given gDNA!' self.mainspaln['G_O'] = attrs['gen_strand'] if name == 'hqrDNA': self.mainspaln['eseg_gi'] = attrs['rDNA_id'] self.mainspaln['E_O'] = attrs['rDNA_strand'] if name == 'hqrProt': self.mainspaln['eseg_gi'] = attrs['rProt_id'] self.mainspaln['E_O'] = '+' # Reasonable default! :) if name == 'hqexon': self.mainspaln['pgs'] = \ self.mainspaln['pgs'] + \ attrs['e_start'] + \ ' ' + \ attrs['e_stop'] + \ ',' # Override characters method from ContentHandler # Note that the parser will break content up into chunks of 1024 bytes, # so it is safest (well, it's completely necessary!) to iterate over each # chunk (hence, the append (+) operator). def characters(self,characters): if self.in_totalnscore: self.mainspaln['sim'] = self.mainspaln['sim'] + str(characters) if self.in_cumlen: self.mainspaln['mlength'] = self.mainspaln['mlength'] + str(characters) if self.in_genome_strand: self.genome_strand = self.genome_strand + str(characters) if self.in_mrna_strand: self.mrna_strand = self.mrna_strand + str(characters) if self.in_prot_strand: self.prot_strand = self.prot_strand + str(characters) # Override endElement method from ContentHandler def endElement(self,name): if self.in_alnmodule: if name == 'exon-intron_info': # set a value or two self.mainspaln['l_pos'] = min( \ int(self.exon_list[0][0]), \ int(self.exon_list[len(self.exon_list) - 1][1])) self.mainspaln['r_pos'] = max( \ int(self.exon_list[0][0]), \ int(self.exon_list[len(self.exon_list) - 1][1])) self.mainspaln['pgs_lpos'] = min( \ int(self.exon_list[0][2]), \ int(self.exon_list[len(self.exon_list) - 1][3])) self.mainspaln['pgs_rpos'] = max( \ int(self.exon_list[0][2]), \ int(self.exon_list[len(self.exon_list) - 1][3])) self.in_exintinfo = False if name == 'total_alignment_score': self.in_totalnscore = False if name == 'cumulative_length_of_scored_exons': self.in_cumlen = False if name == 'MATCH_line': self.in_matchline = False if name == 'PGS_line' and self.in_PGS: self.mainspaln['pgs'] = self.mainspaln['pgs'][:-1] # chop trailing ',' self.in_PGS = False if name == 'genome_strand': self.in_genome_strand = False if name == 'mrna_strand': self.in_mrna_strand = False if name == 'queryProt': self.in_prot_strand = False if name == 'alignment': self.in_alignment = False if name == 'hqPGS_line' and self.in_hqPGS: self.mainspaln['pgs'] = self.mainspaln['pgs'][:-1] # chop trailing ',' self.in_hqPGS = False if name == 'predicted_gene_structure' and \ self.in_alnmodule: # Triggers match recording # First, let's verify we have the query type set reasonably; # we will need this information in the methods called below. if self.querytype not in ('cDNA', 'Protein'): raise xGDBgthxmlPGSError, \ 'Error: Invalid query type! (You did not validate your gthxml?!!)' self.__compress_alignment() self.__recordmatch() self.__reset() self.in_alnmodule = True # not necessarily finished... if name == 'alignment_module': # Prevent processing data in PGL_module of gthxml self.in_alnmodule = False def __compress_alignment(self): '''Compress the alignment information using the xGDB routine.''' if self.querytype == 'cDNA' and \ (len(self.genome_strand) != len(self.mrna_strand)): raise xGDBgthxmlPGSError, \ 'Error: Genomic and EST/cDNA alignment strings incompatible!' elif self.querytype == 'Protein' and \ (len(self.genome_strand) != len(self.prot_strand)): raise xGDBgthxmlPGSError, \ 'Error: Genomic and Protein alignment strings incompatible!' else: pass # Process genomic template first self.genomic_Watson = \ (int(self.exon_list[0][0]) < \ int(self.exon_list[len(self.exon_list) - 1][1])) and True or False in_gap = False genomicposition = int(self.exon_list[0][0]) for i in range(len(self.genome_strand)): if self.genome_strand[i] == '-': if in_gap: len_gap += 1 else: in_gap = True len_gap = 1 start_gap = genomicposition elif re.search(r'\w',self.genome_strand[i]): if in_gap: if self.genomic_Watson: self.mainspaln['gseg_gaps'] = \ self.mainspaln['gseg_gaps'] + \ ':' + str(genomicposition - 1) + \ ':' + str(len_gap) else: self.mainspaln['gseg_gaps'] = \ self.mainspaln['gseg_gaps'] + \ ':' + str(genomicposition) + \ ':' + str(len_gap) in_gap = False # update genomicposition if self.genomic_Watson: genomicposition += 1 else: genomicposition -= 1 else: raise xGDBgthxmlPGSError, \ 'Error: Encountered invalid symbol in genomic template.' self.mainspaln['gseg_gaps'] = self.mainspaln['gseg_gaps'][1:] # chop leading ':' # Now, process the query sequence # Code to handle cDNA alignments if self.querytype == 'cDNA': in_acc_gap = in_gap = in_intron = False queryposition = int(self.exon_list[0][2]) self.query_Watson = \ (int(self.exon_list[0][2]) < \ int(self.exon_list[len(self.exon_list) - 1][3])) and True or False for i in range(len(self.mrna_strand)): if self.mrna_strand[i] == '-': if in_intron: # gap adjacent to acceptor if in_acc_gap: len_acc_gap += 1 else: in_acc_gap = True len_acc_gap = 1 start_acc_gap = queryposition + 1 else: if in_gap: len_gap += 1 else: in_gap = True len_gap = 1 start_gap = queryposition elif self.mrna_strand[i] == '.': if in_intron: len_intron += 1 else: in_intron = True len_intron = 1 start_intron = queryposition elif re.search(r'\w',self.mrna_strand[i]): if in_gap: if self.query_Watson: self.mainspaln['pgs_gaps'] = \ self.mainspaln['pgs_gaps'] + \ ':' + str(start_gap - 1) + \ ':' + str(len_gap) else: self.mainspaln['pgs_gaps'] = \ self.mainspaln['pgs_gaps'] + \ ':' + str(start_gap) + \ ':' + str(len_gap) if in_acc_gap: if self.query_Watson: self.mainspaln['pgs_gaps'] = \ self.mainspaln['pgs_gaps'] + \ ':' + str(start_acc_gap - 1) + \ ':' + str(-(len_acc_gap)) else: self.mainspaln['pgs_gaps'] = \ self.mainspaln['pgs_gaps'] + \ ':' + str(start_acc_gap) + \ ':' + str(-(len_acc_gap)) if in_intron: if self.query_Watson: self.mainspaln['pgs_gaps'] = \ self.mainspaln['pgs_gaps'] + \ ':' + str(-(start_intron - 1)) + \ ':' + str(len_intron) else: self.mainspaln['pgs_gaps'] = \ self.mainspaln['pgs_gaps'] + \ ':' + str(-(start_intron)) + \ ':' + str(len_intron) in_acc_gap = in_gap = in_intron = False # update queryposition if self.query_Watson: queryposition += 1 else: queryposition -= 1 else: raise xGDBgthxmlPGSError, \ 'Error: Encountered invalid symbol in query sequence: %s' % \ self.mrna_strand[i] # Code to handle protein alignments # Protein alignments are similar to transcript ones, but must # take phase relative to the genomic template into account. # Also, proteins are exclusively in forward orientation. # # gap/intron phase conventions: # a -> C O D - C O D # b -> C - O D C O D # c -> C O - D C O D # # Gap/intron start positions are given in terms of the query # sequence (amino acids), whereas lengths of the same are # given in terms of the genomic template (nucleotides). elif self.querytype == 'Protein': in_acc_gap = in_gap = in_intron = was_in_intron = False queryposition = int(self.exon_list[0][2]) start_qprot = True # Are we beginning the alignment parse? dist_from_Psite = 1 # Just the distance from most recent # AA symbol in prot_strand, i.e., # 01201 # GCAACC etc. # A T # 'Psite' comes from ribosomal EPA nomenclature. # This is used to assist gap phase classification; # it is always given in terms of where point is # at a given moment, never where it's going or # where it's been. safety_space_ct = 0 # Protect against gap symbols that may # fail to print in acceptor gaps for i in range(len(self.prot_strand)): if self.prot_strand[i] == ' ': if in_gap or in_acc_gap: pass elif in_intron: # Identify the intron's phase gap_phase = self.phase_symbols[ (dist_from_Psite + 1) % 3 ] self.mainspaln['pgs_gaps'] = \ self.mainspaln['pgs_gaps'] + \ ':' + str(-(start_intron - 1)) + gap_phase + \ ':' + str(len_intron) in_intron = False was_in_intron = True safety_space_ct = 1 elif was_in_intron: safety_space_ct += 1 else: dist_from_Psite += 1 elif self.prot_strand[i] == '-': # I assume that '.-' never occurs in the query alignment string.(?) if was_in_intron and not in_acc_gap: # gap adjacent to acceptor in_acc_gap = True len_acc_gap = 3 start_acc_gap = queryposition + 1 was_in_intron = False elif in_acc_gap: # I assume that all gaps in the protein query incur # gap lengths of multiples of 3.(?) len_acc_gap += 3 else: # I assume that all gaps in the protein query incur # gap lengths of multiples of 3.(?) if in_gap: len_gap += 3 else: in_gap = True len_gap = 3 start_gap = queryposition elif self.prot_strand[i] == '.': if start_qprot: # The example below, corresponding to this case, # is a special one, where the first codon position # is in an upstream exon of sufficiently poor quality # as not to be included in the final alignment. # # If the residue A in the query protein below were # at position 114, this intron would be represented # as '-113b:1'. # # GCAACCCTTG genome # A T L translated genome # | | . # .A T F query protein # # Note that I assume also that such ``introductory # introns" do not occur in phases a or c, but only b.(?) in_intron = True len_intron = 1 start_intron = queryposition dist_from_Psite = 2 # Special trick to get the phase (b) correct. elif in_intron: len_intron += 1 else: in_intron = True len_intron = 1 start_intron = queryposition if not in_gap: dist_from_Psite += 1 dist_from_Psite %= 3 elif re.search(r'[\w\*]',self.prot_strand[i]): # '*' is Stop codon if in_gap: # Identify the gap's phase gap_phase = self.phase_symbols[ (dist_from_Psite + 1) % 3 ] self.mainspaln['pgs_gaps'] = \ self.mainspaln['pgs_gaps'] + \ ':' + str(start_gap - 1) + gap_phase + \ ':' + str(len_gap) if in_acc_gap: # Identify the gap's phase gap_phase = self.phase_symbols[ (dist_from_Psite + 1) % 3 ] self.mainspaln['pgs_gaps'] = \ self.mainspaln['pgs_gaps'] + \ ':' + str(start_acc_gap - 1) + gap_phase + \ ':' + str(-(len_acc_gap)) if in_intron: # Identify the intron's phase gap_phase = self.phase_symbols[ (dist_from_Psite + 1) % 3 ] self.mainspaln['pgs_gaps'] = \ self.mainspaln['pgs_gaps'] + \ ':' + str(-(start_intron - 1)) + gap_phase + \ ':' + str(len_intron) in_acc_gap = in_gap = in_intron = was_in_intron = False dist_from_Psite = safety_space_ct = 0 queryposition += 1 else: raise xGDBgthxmlPGSError, \ 'Error: Encountered invalid symbol in query sequence: %s' % \ self.prot_strand[i] if dist_from_Psite > 2 or safety_space_ct > 2: raise xGDBgthxmlPGSError, \ 'Error: Bad query alignment string (too many contiguous spaces!): \n\ See position %i of \n%s' % \ (i+1, self.prot_strand) start_qprot = False else: raise xGDBgthxmlPGSError, \ 'Error: (Internal) Invalid query type after verification?!!' self.mainspaln['pgs_gaps'] = self.mainspaln['pgs_gaps'][1:] # chop leading ':' def __recordmatch(self): '''Record SQL statements for populating xGDB backend.''' if (self.querytype == 'cDNA' and \ float(self.mainspaln['sim']) >= self.nuclloscore and \ float(self.mainspaln['sim']) <= self.nuclhiscore and \ float(self.mainspaln['cov']) >= self.nucllocov and \ float(self.mainspaln['cov']) <= self.nuclhicov) \ or \ (self.querytype == 'Protein' and \ float(self.mainspaln['sim']) >= self.protloscore and \ float(self.mainspaln['sim']) <= self.prothiscore and \ float(self.mainspaln['cov']) >= self.protlocov and \ float(self.mainspaln['cov']) <= self.prothicov): # Populate the main spliced alignment table if self.querytype == 'Protein': if not (self.pfilexrefp and \ self.xref_prots.has_key(self.mainspaln['gi'])): raise xGDBgthxmlPGSError, \ 'Error: Protein %s not documented in xref file!' % \ self.mainspaln['gi'] else: sqlstring = 'INSERT INTO ' + self.ptable elif self.querytype == 'cDNA' and \ self.cfilexrefp and \ self.xref_cdnas.has_key(self.mainspaln['gi']): sqlstring = 'INSERT INTO ' + self.ctable else: sqlstring = 'INSERT INTO ' + self.table if self.chrp: sqlstring = \ sqlstring + \ ' (uid,gi,E_O,sim,mlength,cov,chr,G_O,l_pos,r_pos,' + \ 'pgs,pgs_lpos,pgs_rpos,gseg_gaps,pgs_gaps) VALUES (' + \ self.mainspaln['uid'] + ',' + \ '\'' + self.mainspaln['gi'] + '\',' + \ '\'' + self.mainspaln['E_O'] + '\',' + \ self.mainspaln['sim'] + ',' + \ self.mainspaln['mlength'] + ',' + \ self.mainspaln['cov'] + ',' + \ str(self.mainspaln['chr']) + ',' else: sqlstring = \ sqlstring + \ ' (uid,gi,E_O,sim,mlength,cov,gseg_gi,G_O,l_pos,r_pos,' + \ 'pgs,pgs_lpos,pgs_rpos,gseg_gaps,pgs_gaps) VALUES (' + \ self.mainspaln['uid'] + ',' + \ '\'' + self.mainspaln['gi'] + '\',' + \ '\'' + self.mainspaln['E_O'] + '\',' + \ self.mainspaln['sim'] + ',' + \ self.mainspaln['mlength'] + ',' + \ self.mainspaln['cov'] + ',' + \ str(self.mainspaln['gseg_gi']) + ',' sqlstring = \ sqlstring + \ '\'' + self.mainspaln['G_O'] + '\',' + \ str(self.mainspaln['l_pos']) + ',' + \ str(self.mainspaln['r_pos']) + ',' + \ '\'' + self.mainspaln['pgs'] + '\',' + \ str(self.mainspaln['pgs_lpos']) + ',' + \ str(self.mainspaln['pgs_rpos']) + ',' + \ '\'' + self.mainspaln['gseg_gaps'] + '\',' + \ '\'' + self.mainspaln['pgs_gaps'] + '\');' self.outstream.write(sqlstring + '\n') # Now, for the exons table if self.querytype == 'Protein': if not (self.pfilexrefp and \ self.xref_prots.has_key(self.mainspaln['gi'])): raise xGDBgthxmlPGSError, \ 'Error: Protein %s not documented in xref file!' % \ self.mainspaln['gi'] else: sqlstring = 'INSERT INTO ' + self.ptable elif self.querytype == 'cDNA' and \ self.cfilexrefp and \ self.xref_cdnas.has_key(self.mainspaln['gi']): sqlstring = 'INSERT INTO ' + self.ctable else: sqlstring = 'INSERT INTO ' + self.table sqlstring = sqlstring + \ '_exons (pgs_uid,num,gseg_start,gseg_stop,pgs_start,pgs_stop,score) VALUES ' for i in range(len(self.exon_list)): self.exonspaln['num'] = str(i + 1) self.exonspaln['gseg_start'] = self.exon_list[i][0] self.exonspaln['gseg_stop'] = self.exon_list[i][1] self.exonspaln['pgs_start'] = self.exon_list[i][2] self.exonspaln['pgs_stop'] = self.exon_list[i][3] self.exonspaln['pgs_score'] = self.exon_list[i][4] sqlstring = \ sqlstring + '(' + \ self.exonspaln['pgs_uid'] + ',' + \ self.exonspaln['num'] + ',' + \ self.exonspaln['gseg_start'] + ',' + \ self.exonspaln['gseg_stop'] + ',' + \ self.exonspaln['pgs_start'] + ',' + \ self.exonspaln['pgs_stop'] + ',' + \ self.exonspaln['pgs_score'] + '),' sqlstring = sqlstring[:-1] + ';' # replace trailing ',' with ';' self.outstream.write(sqlstring + '\n') # Now, for the introns table, if applicable if self.intronp: if self.querytype == 'Protein': if not (self.pfilexrefp and \ self.xref_prots.has_key(self.mainspaln['gi'])): raise xGDBgthxmlPGSError, \ 'Error: Protein %s not documented in xref file!' % \ self.mainspaln['gi'] else: sqlstring = 'INSERT INTO ' + self.ptable elif self.querytype == 'cDNA' and \ self.cfilexrefp and \ self.xref_cdnas.has_key(self.mainspaln['gi']): sqlstring = 'INSERT INTO ' + self.ctable else: sqlstring = 'INSERT INTO ' + self.table sqlstring = sqlstring + \ '_introns (pgs_uid,num,gseg_start,gseg_stop,Dscore,Dsim,Ascore,Asim) VALUES ' for i in range(len(self.intron_list)): self.intronspaln['num'] = str(i + 1) self.intronspaln['gseg_start'] = self.intron_list[i][0] self.intronspaln['gseg_stop'] = self.intron_list[i][1] self.intronspaln['Dscore'] = self.intron_list[i][2] self.intronspaln['Dsim'] = self.intron_list[i][3] self.intronspaln['Ascore'] = self.intron_list[i][4] self.intronspaln['Asim'] = self.intron_list[i][5] sqlstring = \ sqlstring + '(' + \ self.intronspaln['pgs_uid'] + ',' + \ self.intronspaln['num'] + ',' + \ self.intronspaln['gseg_start'] + ',' + \ self.intronspaln['gseg_stop'] + ',' + \ self.intronspaln['Dscore'] + ',' + \ self.intronspaln['Dsim'] + ',' + \ self.intronspaln['Ascore'] + ',' + \ self.intronspaln['Asim'] + '),' sqlstring = sqlstring[:-1] + ';' # replace trailing ',' with ';' self.outstream.write(sqlstring + '\n') def __del__(self): '''Class "destructor" method''' # close input stream if self.instreamp: self.instream.close() if self.webp: self.compressed_gthxml_stream.close() # close output stream if self.outstreamp: self.outstream.close() if __name__ == '__main__': gthxml_handler = xGDBgthxmlPGS(sys.argv[1:]) from xml.sax import make_parser from xml.sax.handler import ContentHandler genericsax = make_parser() genericsax.setContentHandler(gthxml_handler) try: genericsax.parse(gthxml_handler.instream) except xGDBgthxmlPGSError, inst: print inst.args[0]