#!/usr/bin/env python
'''Convert spliced alignments in gthxml format to GFF

This module contains the gthxmlToGFF class definition.
It parses results stored in the gthxml format, as
defined at http://www.public.iastate.edu/~mespar1/gthxml/GenomeThreader.rng.txt .
(Note that GenomeThreader can generate this output directly using the
-xmlout option, whereas GeneSeqer plain text output can be
retrofitted to this standard via the GSQ2XML.pl tool, available
at http://www.public.iastate.edu/~mespar1/gthxml/GSQ2XML.pl .)

Usage:
    python gthxmlToGFF.py (aka XML2GFF.py)
        -h --help           # prints this message and exits
        -t --type qtype     # one of "EST", "cDNA", "prot", or "mixed"
        -i --infile file    # Input file identifier. Input stream can be gthxml data
                            # piped by stdin (default) or a file on the resident file
                            # system
        -o --outfile file   # Output file identifier. Can be stdout (default) or a
                            # local file, which will be overwritten if it exists
        --loscore float     # minimum overall alignment score
        --hiscore float     # maximum overall alignment score
        --locov float       # minimum coverage of G,C,or P
        --hicov float       # maximum coverage of G,C,or P
'''

__author__ = 'Michael E Sparks (mespar1@iastate.edu)'
__version__ = '1.3'
__copyright__ = 'Copyright (c) 2004, 2007 Michael E Sparks, all rights reserved'
__license__ = 'BSD-style license'

# MES - 19 June 2007
# Version 1.3 updates this script to generate validating GFFv3 output:
# http://song.sourceforge.net/gff3.shtml

import getopt, sys
from xml.sax.handler import ContentHandler

ABS_MIN_SCORE = 0.00
ABS_MAX_SCORE = 2.00 # I don't like this, but it works...mes
ABS_MIN_COV = 0.00
ABS_MAX_COV = 2.00 # I don't like this, but it works...mes
QTYPES = ('mixed', 'EST', 'cDNA', 'prot') # If you do change this, please
                                          # leave 'mixed' at the front!

class gthxmlToGFFError(Exception):
    '''Exception subclass for addressing fatal errors in xGDBgthxmlPGS objects'''
    pass
                                                                                                                             
class gthxmlToGFF(ContentHandler):
    '''A handler to convert spliced alignments in gthxml to GFF

    Allows one to parse a gthxml v1.0 file from a local filesystem or over the
    Web and print spliced alignments, per minimum/maximum score and coverage 
    criteria, in GFF formats for use in, e.g., gbrowse
    '''
    def __init__(self,argv):
        '''Class "constructor" method'''
                                                                                                                             
        # Set default values prior to parsing argument list
        # Note that these may trigger exceptions later in the class
        # methods, but this is the intended effect!  In short, don't
        # monkey with this part of the code.
        self.infile = sys.stdin      # Input file identifier. Input stream can be gthxml data
                                     # piped by stdin or a file on the resident file system.
        self.instreamp = False       # Have we opened the input stream yet?
        self.outfile = sys.stdout    # Output file identifier. Can be stdout or a local file,
                                     # which will be overwritten if it exists
        self.outstreamp = False      # Have we opened the output stream yet?
        self.loscore = 0.80          # minimum alignment score
        self.hiscore = ABS_MAX_SCORE # maximum alignment score
        self.locov = 0.80            # min coverage of G,C,or P
        self.hicov = ABS_MAX_COV     # max coverage of G,C,or P
        self.qtype = 'mixed'         # type of query used for spliced alignment
                                                                                                                             
        # Parse option list
        try:
            opts, args = getopt.getopt(argv, 'ht:i:o:',
                                            ['help',
                                             'type=',
                                             'infile=',
                                             'outfile=',
                                             'loscore=', 'hiscore=',
                                             'locov=', 'hicov='])

        except getopt.GetoptError:
            print __doc__
            sys.exit(2)
                                                                                                                             
        for o, a in opts:
            if o in ('-h', '--help'):
                print __doc__
                sys.exit(None)
            elif o in ('-t', '--type'):
                self.qtype = a
            elif o in ('-i', '--infile'):
                self.infile = a
            elif o in ('-o', '--outfile'):
                self.outfile = a
            elif o == '--loscore':
                self.loscore = float(a)
            elif o == '--hiscore':
                self.hiscore = float(a)
            elif o == '--locov':
                self.locov = float(a)
            elif o == '--hicov':
                self.hicov = float(a)

        # Verify score values were legitimate
        if self.loscore > self.hiscore:
            sys.stderr.write('Warning: loscore > hiscore, swapping instead!??\n')
            garbage = self.loscore
            self.loscore = self.hiscore
            self.hiscore = garbage
        if self.loscore < ABS_MIN_SCORE:
            sys.stderr.write('Warning: Bad --loscore, using %s instead\n' % str(ABS_MIN_SCORE))
            self.loscore = ABS_MIN_SCORE
        if self.hiscore > ABS_MAX_SCORE:
            sys.stderr.write('Warning: Bad --hiscore, using %s instead\n' % str(ABS_MAX_SCORE))
            self.hiscore = ABS_MAX_SCORE
                                                                                                                             
        # Verify coverage values were legitimate
        if self.locov > self.hicov:
            sys.stderr.write('Warning: locov > hicov, swapping instead!??\n')
            self.garbage = self.locov
            self.locov = self.hicov
            self.hicov = self.garbage
        if self.locov < ABS_MIN_COV:
            sys.stderr.write('Warning: Bad --locov, using %s instead\n' % str(ABS_MIN_COV))
            self.locov = ABS_MIN_COV
        if self.hicov > ABS_MAX_COV:
            sys.stderr.write('Warning: Bad --hicov, using %s instead\n' % str(ABS_MAX_COV))
            self.hicov = ABS_MAX_COV

        # Verify query type is legitimate
        if self.qtype not in QTYPES:
            sys.stderr.write('Warning: Incorrect qtype, using %s instead\n' % QTYPES[0])
            self.qtype = QTYPES[0]

        # Attempt to open input stream
        if self.infile is not sys.stdin:
            self.instream = open(self.infile,'r')
        else:
            self.instream = self.infile
        self.instreamp = True

        # Attempt to open output stream
        if self.outfile is not sys.stdout:
            self.outstream = open(self.outfile,'w')
        else:
            self.outstream = self.outfile
        self.outstreamp = True

        # Write the GFF 3 pragma
        self.outstream.write('##gff-version 3\n')

        # Bools for triggering SAX events
        self.in_header = False
        self.in_splaln = False
        self.in_totscor = False
        self.is_start = False
        self.qtypeset = False

        # 9 fields for GFF dictionary
        self.GFF = { 'template': '', \
                     'source'  : '', \
                     'type'    : '', \
                     'start'   : '', \
                     'stop'    : '', \
                     'score'   : '', \
                     'strand'  : '', \
                     'phase'   : '.', \
                     'group'   : '' \
                   }

        # Store current (though temporary!) query type designations
        # if processing gthxml with mixed query type
        self.tempqtype = ''

        # Direction components
        self.dir_exon = ''

        # Store critical values for whether we print the alignment or not
        self.score = ''
        self.coverage = ''

        # Count exons
        self.exon_ct = 1

        # Start and stop of aligned regions
        self.gene_start = 0
        self.gene_stop = 0

        # String to enqueue
        self.pre_queue = ''

        # List of exon lines, possibly a PPA line
        self.exon_queue = []

    def startElement(self, name, attrs):
        if name == 'header':
            self.in_header = True
        if name == 'spliced_alignment':
            self.exon_queue = []
            self.in_splaln = True
        if self.in_header and name == 'source':
            if self.qtype != 'mixed':
                self.GFF['source'] = attrs['program'] + self.qtype
                self.qtypeset = True
            else:
                self.GFF['source'] = attrs['program']
        if self.in_splaln:
            if name == 'reference':
                self.GFF['group'] = 'transcript=' + attrs['ref_id'] + \
                                    ';Note=' + attrs['ref_id']
            if name == 'template':
                self.GFF['template'] = attrs['temp_id']
                self.GFF['strand'] = attrs['temp_strand']
            if name == 'predicted_gene_structure':
                self.is_start = True
                self.exon_ct = 1
            if name == 'gDNA_exon_boundary':
                # Record exon start, stop
                self.GFF['start'] = str(min(int(attrs['g_start']), \
                                            int(attrs['g_stop'])))
                self.GFF['stop'] = str(max(int(attrs['g_start']), \
                                           int(attrs['g_stop'])))

                # Record overall gene start
                if self.is_start:
                    if self.GFF['strand'] == '+':
                        self.gene_start = self.GFF['start']
                    else:
                        self.gene_start = self.GFF['stop']
                    self.is_start = False

                # Record overall gene stop for each exon...
                # eventually it will reach its correct value.
                if self.GFF['strand'] == '+':
                    self.gene_stop = self.GFF['stop']
                else:
                    self.gene_stop = self.GFF['start']
            if name == 'reference_exon_boundary':
                # Determine query type, if necessary
                if not self.qtypeset:
                    if attrs['r_type'] == 'Protein':
                        self.tempqtype = 'prot'
                    else:
                        self.tempqtype = 'EST' # Reasonable default!
                    self.qtypeset = True
                self.GFF['score'] = attrs['r_score']
                # Ready to enqueue an exon line!
                self.pre_queue = self.GFF['template'] + '\t' + \
                                 self.GFF['source'] + \
                                 self.tempqtype + '\t' + \
                                 'exon\t' + \
                                 self.GFF['start'] + '\t' + \
                                 self.GFF['stop'] + '\t' + \
                                 self.GFF['score'] + '\t' + \
                                 self.GFF['strand'] + '\t' + \
                                 self.GFF['phase'] + '\t' + \
                                 self.GFF['group'] + '.exon' + \
                                 str(self.exon_ct) + '\n'
                self.exon_ct += 1
                self.exon_queue.append(self.pre_queue)
            if name == 'total_alignment_score':
                self.in_totscor = True
            if name == 'coverage':
                self.coverage = attrs['percentage']

    def characters(self, characters):
        if self.in_totscor:
            self.score = self.score + str(characters)
            self.in_totscor = False

    def endElement(self, name):
        if name == 'header':
            self.in_header = False
        if name == 'spliced_alignment':
            self.__recordmatch()
            self.in_splaln = False
            self.score = ''
            self.coverage = ''
            if self.qtype == 'mixed':
                self.qtypeset = False
                self.tempqtype = ''

    def __recordmatch(self):
        if float(self.score) >= self.loscore and \
           float(self.score) <= self.hiscore and \
           float(self.coverage) >= self.locov and \
           float(self.coverage) <= self.hicov:

            if self.GFF['strand'] == '+':
                self.dir_exon = self.gene_start + '\t' + \
                                self.gene_stop + '\t'
            else:
                self.dir_exon = self.gene_stop + '\t' + \
                                self.gene_start + '\t'
            # Print the transcript line
            self.outstream.write(self.GFF['template'] + '\t' + \
                                 self.GFF['source'] + \
                                 self.tempqtype + '\t' + \
                                 'transcript\t' + \
                                 self.dir_exon + \
                                 self.score + '\t' + \
                                 self.GFF['strand'] + '\t' + \
                                 self.GFF['phase'] + '\t' + \
                                 self.GFF['group'] + '\n')
            # Print the exon queue
            for x in self.exon_queue:
                self.outstream.write(x)

    def __del__(self):
        '''Class *destructor* method'''
                                                                                                                             
        # close input stream
        if self.instreamp:
            self.instream.close()

        # close output stream
        if self.outstreamp:
            self.outstream.close()

if __name__ == '__main__':
    gthxml_handler = gthxmlToGFF(sys.argv[1:])
                                                                                                                             
    from xml.sax import make_parser

    genericsax = make_parser()
    genericsax.setContentHandler(gthxml_handler)
    try:
        genericsax.parse(gthxml_handler.instream)
    except gthxmlToGFFError, inst:
        print inst.args[0]