# extract_splice_sites.py

#!/usr/bin/env python

#
# Copyright 2014, Daehwan Kim <infphilo@gmail.com>
#
# This file is part of HISAT.
#
# HISAT is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# HISAT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with HISAT.  If not, see <http://www.gnu.org/licenses/>.
#

from __future__ import print_function

from sys import stderr, exit
from collections import defaultdict as dd, Counter
from argparse import ArgumentParser, FileType


def _floor_mean(total, count):
    """Return the integer (floored) mean total/count, or 0 if count is 0."""
    return total // count if count else 0


def extract_splice_sites(annotation_file, verbose=False, ftype='gtf'):
    """Print the unique splice junctions found in an annotation file.

    Arguments:
        annotation_file: iterable of text lines (e.g. an open file) in
            tab-separated, nine-column GTF/GFF layout.
        verbose: when true, also print summary statistics to stderr.
        ftype: 'gtf' or 'gff'; forwarded to get_values() to parse the
            attribute column.

    Only 'exon' features with left < right are used.  Exons are grouped by
    transcript_id, sorted, and neighbouring exons whose start/end distance
    is <= 5 are merged.  For every pair of consecutive (merged) exons one
    line 'chrom<TAB>left<TAB>right<TAB>strand' is printed to stdout, where
    left is the end of the upstream exon and right the start of the
    downstream exon, both converted to zero-based offsets.

    Lines that are blank, start with '#', do not have exactly nine
    tab-separated fields, or whose attributes lack the required ids are
    skipped.  A non-integer start/end coordinate raises ValueError.
    """
    genes = dd(list)
    trans = {}

    # Parse valid exon lines from the annotation file into a dict by
    # transcript_id
    for line in annotation_file:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        if '#' in line:
            # Drop a trailing comment
            line = line.split('#')[0].strip()

        try:
            chrom, source, feature, left, right, score, \
                strand, frame, values = line.split('\t')
        except ValueError:
            # Not a nine-column record
            continue
        left, right = int(left), int(right)

        if feature != 'exon' or left >= right:
            continue

        values_dict = get_values(values, ftype)
        if values_dict is None:
            continue

        transcript_id = values_dict['transcript_id']
        if transcript_id not in trans:
            trans[transcript_id] = [chrom, strand, [[left, right]]]
            genes[values_dict['gene_id']].append(transcript_id)
        else:
            trans[transcript_id][2].append([left, right])

    # Sort exons and merge where separating introns are <=5 bps
    for tran, (chrom, strand, exons) in trans.items():
        exons.sort()
        merged = [exons[0]]
        for exon in exons[1:]:
            if exon[0] - merged[-1][1] <= 5:
                # Use max() so an exon nested inside the previous one can
                # never shrink the merged exon.
                merged[-1][1] = max(merged[-1][1], exon[1])
            else:
                merged.append(exon)
        trans[tran] = [chrom, strand, merged]

    # Calculate and print the unique junctions
    junctions = set()
    for chrom, strand, exons in trans.values():
        for i in range(1, len(exons)):
            junctions.add((chrom, exons[i-1][1], exons[i][0], strand))
    for chrom, left, right, strand in sorted(junctions):
        # Zero-based offset
        print('{}\t{}\t{}\t{}'.format(chrom, left-1, right-1, strand))

    # Print some stats if asked
    if verbose:
        exon_count = exon_total = 0
        intron_count = intron_total = 0
        trans_total = 0
        for chrom, strand, exons in trans.values():
            for i, exon in enumerate(exons):
                exon_len = exon[1] - exon[0] + 1
                exon_count += 1
                exon_total += exon_len
                trans_total += exon_len
                if i > 0:
                    # Distance from the previous exon's end to this
                    # exon's start.
                    intron_count += 1
                    intron_total += exon[0] - exons[i-1][1]

        # Averages use floor division so '{:d}' receives an int on both
        # Python 2 and Python 3, and fall back to 0 when there is nothing
        # to average instead of raising ZeroDivisionError.
        print('genes: {}, genes with multiple isoforms: {}'.format(
                len(genes), sum(len(v) > 1 for v in genes.values())),
              file=stderr)
        print('transcripts: {}, transcript avg. length: {:d}'.format(
                len(trans), _floor_mean(trans_total, len(trans))),
              file=stderr)
        print('exons: {}, exon avg. length: {:d}'.format(
                exon_count, _floor_mean(exon_total, exon_count)),
              file=stderr)
        print('introns: {}, intron avg. length: {:d}'.format(
                intron_count, _floor_mean(intron_total, intron_count)),
              file=stderr)
        print('average number of exons per transcript: {:d}'.format(
                _floor_mean(exon_count, len(trans))),
              file=stderr)


def get_values(values, ftype):
    """Parse the attribute column of one annotation line.

    Arguments:
        values: the raw attribute string (ninth tab-separated field).
        ftype: 'gtf' for 'key "value"; ...' attributes, or 'gff' for
            'key=value;...' attributes.

    Returns:
        For 'gtf': a dict of every attribute (surrounding double quotes
        removed), or None if 'gene_id' or 'transcript_id' is missing.
        For 'gff': a dict with exactly 'gene_id' (taken from the 'GeneID'
        entry, typically found inside Dbxref) and 'transcript_id', or None
        if either is missing.
    """
    if ftype == 'gtf':
        values_dict = {}
        # Iterate over every ';'-separated piece and skip empty ones, so
        # the final attribute is kept even without a trailing ';'.
        for attr in values.split(';'):
            attr = attr.strip()
            if not attr:
                continue
            key, _, val = attr.partition(' ')
            values_dict[key] = val.strip().strip('"')

        if 'gene_id' not in values_dict or \
                'transcript_id' not in values_dict:
            return None
        return values_dict

    assert ftype == 'gff', "ftype must be 'gtf' or 'gff'"
    temp_dict = {}
    for attr in values.split(';'):
        attr = attr.strip()
        if not attr:
            # Tolerate a trailing or doubled ';'
            continue
        # partition() splits on the first separator only, so values that
        # themselves contain '=' or ':' (e.g. 'HGNC:HGNC:5') do not crash.
        key, _, val = attr.partition('=')
        if key == 'Dbxref':
            for xref in val.split(','):
                db, _, accession = xref.strip().partition(':')
                temp_dict[db] = accession
        else:
            temp_dict[key] = val

    if 'transcript_id' not in temp_dict or \
            'GeneID' not in temp_dict:
        return None
    return dict(gene_id=temp_dict['GeneID'],
                transcript_id=temp_dict['transcript_id'])


if __name__ == '__main__':
    # Command-line entry point: parse the options, then stream the
    # annotation file through extract_splice_sites().
    parser = ArgumentParser(
        description='Extract splice junctions from a GTF or GFF3 annotation file')
    parser.add_argument(
        'annotation_file', nargs='?', type=FileType('r'),
        help='input annotation file (use "-" for stdin)')
    parser.add_argument(
        '-v', '--verbose', dest='verbose', action='store_true',
        help='also print some statistics to stderr')

    # Both format flags write to the same destination, 'gtf'.
    # NOTE(review): with neither flag given, argparse appears to take the
    # default of the first action registered for a dest (store_false ->
    # True), so the input is treated as GTF; keep '--gff' registered first.
    format_flags = parser.add_mutually_exclusive_group(required=False)
    format_flags.add_argument(
        '-f', '--gff', dest='gtf', action='store_false',
        help='input file is GFF')
    format_flags.add_argument(
        '-t', '--gtf', dest='gtf', action='store_true',
        help='input file is GTF')

    args = parser.parse_args()

    # The positional argument is optional; without it just show the usage.
    if not args.annotation_file:
        parser.print_help()
        exit(1)

    ftype = 'gtf' if args.gtf else 'gff'
    extract_splice_sites(args.annotation_file, args.verbose, ftype)