https://github.com/pysam-developers/pysam
Raw File
Tip revision: 9218de6bb523f43667520fde740067124c48745f authored by Andreas Heger on 09 April 2023, 21:52:23 UTC
move release to tag section
Tip revision: 9218de6
libctabix.pyx
# cython: language_level=3
# cython: embedsignature=True
# cython: profile=True
###############################################################################
###############################################################################
# Cython wrapper for access to tabix indexed files in bgzf format
###############################################################################
# The principal classes and functions defined in this module are:
#
# class TabixFile  class wrapping tabix indexed files in bgzf format
#
# class asTuple  Parser class for tuples
# class asGTF    Parser class for GTF formatted rows
# class asGFF3   Parser class for GFF3 formatted rows
# class asBed    Parser class for Bed formatted rows
# class asVCF    Parser class for VCF formatted rows
#
# class tabix_generic_iterator  Streamed iterator of bgzf formatted files
#
# Additionally this module defines several additional classes that are part
# of the internal API. These are:
#
# class Parser                base class for parsers of tab-separated rows
# class tabix_file_iterator
# class TabixIterator         iterator class over rows in bgzf file
# class EmptyIterator
#
# For backwards compatibility, the following classes are also defined:
#
# class Tabixfile   equivalent to TabixFile
#
###############################################################################
#
# The MIT License
#
# Copyright (c) 2015 Andreas Heger
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
###############################################################################
import os
import sys

from libc.stdio cimport printf, fprintf, stderr
from libc.string cimport strerror
from libc.errno cimport errno
from posix.unistd cimport dup

from cpython cimport PyErr_SetString, PyBytes_Check, \
    PyUnicode_Check, PyBytes_FromStringAndSize, \
    PyObject_AsFileDescriptor

cimport pysam.libctabixproxies as ctabixproxies

from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
    BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
    tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \
    tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
    tbx_destroy, hisremote, region_list, hts_getline, \
    TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, hts_get_format, htsFormat, \
    no_compression, bcf, bcf_index_build2

from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
from pysam.libcutils cimport encode_filename, from_string_and_size

cdef class Parser:

    def __init__(self, encoding="ascii"):
        self.encoding = encoding

    def set_encoding(self, encoding):
        self.encoding = encoding

    def get_encoding(self):
        return self.encoding

    cdef parse(self, char * buffer, int length):
        raise NotImplementedError(
            'parse method of %s not implemented' % str(self))

    def __call__(self, char * buffer, int length):
        return self.parse(buffer, length)


cdef class asTuple(Parser):
    '''converts a :term:`tabix row` into a python tuple.

    A field in a row is accessed by numeric index.
    ''' 
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.TupleProxy r
        r = ctabixproxies.TupleProxy(self.encoding)
        # need to copy - there were some
        # persistence issues with "present"
        r.copy(buffer, len)
        return r


cdef class asGFF3(Parser):
    '''converts a :term:`tabix row` into a GFF record with the following
    fields:
   
    +----------+----------+-------------------------------+
    |*Column*  |*Name*    |*Content*                      |
    +----------+----------+-------------------------------+
    |1         |contig    |the chromosome name            |
    +----------+----------+-------------------------------+
    |2         |feature   |The feature type               |
    +----------+----------+-------------------------------+
    |3         |source    |The feature source             |
    +----------+----------+-------------------------------+
    |4         |start     |genomic start coordinate       |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |5         |end       |genomic end coordinate         |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |6         |score     |feature score                  |
    +----------+----------+-------------------------------+
    |7         |strand    |strand                         |
    +----------+----------+-------------------------------+
    |8         |frame     |frame                          |
    +----------+----------+-------------------------------+
    |9         |attributes|the attribute field            |
    +----------+----------+-------------------------------+

    ''' 
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.GFF3Proxy r
        r = ctabixproxies.GFF3Proxy(self.encoding)
        r.copy(buffer, len)
        return r


cdef class asGTF(Parser):
    '''converts a :term:`tabix row` into a GTF record with the following
    fields:
   
    +----------+----------+-------------------------------+
    |*Column*  |*Name*    |*Content*                      |
    +----------+----------+-------------------------------+
    |1         |contig    |the chromosome name            |
    +----------+----------+-------------------------------+
    |2         |feature   |The feature type               |
    +----------+----------+-------------------------------+
    |3         |source    |The feature source             |
    +----------+----------+-------------------------------+
    |4         |start     |genomic start coordinate       |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |5         |end       |genomic end coordinate         |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |6         |score     |feature score                  |
    +----------+----------+-------------------------------+
    |7         |strand    |strand                         |
    +----------+----------+-------------------------------+
    |8         |frame     |frame                          |
    +----------+----------+-------------------------------+
    |9         |attributes|the attribute field            |
    +----------+----------+-------------------------------+

    GTF formatted entries also define the following fields that
    are derived from the attributes field:

    +--------------------+------------------------------+
    |*Name*              |*Content*                     |
    +--------------------+------------------------------+
    |gene_id             |the gene identifier           |
    +--------------------+------------------------------+
    |transcript_id       |the transcript identifier     |
    +--------------------+------------------------------+

    ''' 
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.GTFProxy r
        r = ctabixproxies.GTFProxy(self.encoding)
        r.copy(buffer, len)
        return r
    

cdef class asBed(Parser):
    '''converts a :term:`tabix row` into a bed record
    with the following fields:

    +-----------+-----------+------------------------------------------+
    |*Column*   |*Field*    |*Contents*                                |
    |           |           |                                          |
    +-----------+-----------+------------------------------------------+
    |1          |contig     |contig                                    |
    |           |           |                                          |
    +-----------+-----------+------------------------------------------+
    |2          |start      |genomic start coordinate (zero-based)     |
    +-----------+-----------+------------------------------------------+
    |3          |end        |genomic end coordinate plus one           |
    |           |           |(zero-based)                              |
    +-----------+-----------+------------------------------------------+
    |4          |name       |name of feature.                          |
    +-----------+-----------+------------------------------------------+
    |5          |score      |score of feature                          |
    +-----------+-----------+------------------------------------------+
    |6          |strand     |strand of feature                         |
    +-----------+-----------+------------------------------------------+
    |7          |thickStart |thickStart                                |
    +-----------+-----------+------------------------------------------+
    |8          |thickEnd   |thickEnd                                  |
    +-----------+-----------+------------------------------------------+
    |9          |itemRGB    |itemRGB                                   |
    +-----------+-----------+------------------------------------------+
    |10         |blockCount |number of bocks                           |
    +-----------+-----------+------------------------------------------+
    |11         |blockSizes |',' separated string of block sizes       |
    +-----------+-----------+------------------------------------------+
    |12         |blockStarts|',' separated string of block genomic     |
    |           |           |start positions                           |
    +-----------+-----------+------------------------------------------+

    Only the first three fields are required. Additional
    fields are optional, but if one is defined, all the preceding
    need to be defined as well.

    ''' 
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.BedProxy r
        r = ctabixproxies.BedProxy(self.encoding)
        r.copy(buffer, len)
        return r


cdef class asVCF(Parser): 
    '''converts a :term:`tabix row` into a VCF record with
    the following fields:
    
    +----------+---------+------------------------------------+
    |*Column*  |*Field*  |*Contents*                          |
    |          |         |                                    |
    +----------+---------+------------------------------------+
    |1         |contig   |chromosome                          |
    +----------+---------+------------------------------------+
    |2         |pos      |chromosomal position, zero-based    |
    +----------+---------+------------------------------------+
    |3         |id       |id                                  |
    +----------+---------+------------------------------------+
    |4         |ref      |reference allele                    |
    +----------+---------+------------------------------------+
    |5         |alt      |alternate alleles                   |
    +----------+---------+------------------------------------+
    |6         |qual     |quality                             |
    +----------+---------+------------------------------------+
    |7         |filter   |filter                              |
    +----------+---------+------------------------------------+
    |8         |info     |info                                |
    +----------+---------+------------------------------------+
    |9         |format   |format specifier.                   |
    +----------+---------+------------------------------------+

    Access to genotypes is via index::

        contig = vcf.contig
        first_sample_genotype = vcf[0]
        second_sample_genotype = vcf[1]

    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.VCFProxy r
        r = ctabixproxies.VCFProxy(self.encoding)
        r.copy(buffer, len)
        return r


cdef class TabixFile:
    """Random access to bgzf formatted files that
    have been indexed by :term:`tabix`.

    The file is automatically opened. The index file of file
    ``<filename>`` is expected to be called ``<filename>.tbi``
    by default (see parameter `index`).
    
    Parameters
    ----------
    
    filename : string
        Filename of bgzf file to be opened.

    index : string
        The filename of the index. If not set, the default is to
        assume that the index is called ``filename.tbi``

    mode : char
        The file opening mode. Currently, only ``r`` is permitted.
        
    parser : :class:`pysam.Parser`
    
        sets the default parser for this tabix file. If `parser`
        is None, the results are returned as an unparsed string.
        Otherwise, `parser` is assumed to be a functor that will return
        parsed data (see for example :class:`~pysam.asTuple` and
        :class:`~pysam.asGTF`).

    encoding : string

        The encoding passed to the parser

    threads: integer
        Number of threads to use for decompressing Tabix files.
        (Default=1)


    Raises
    ------
    
    ValueError
        if index file is missing.

    IOError
        if file could not be opened
    """
    def __cinit__(self,
                  filename,
                  mode='r',
                  parser=None,
                  index=None,
                  encoding="ascii",
                  threads=1,
                  *args,
                  **kwargs ):

        self.htsfile = NULL
        self.is_remote = False
        self.is_stream = False
        self.parser = parser
        self.threads = threads
        self._open(filename, mode, index, *args, **kwargs)
        self.encoding = encoding

    def _open( self,
               filename,
               mode='r',
               index=None,
               threads=1,
              ):
        '''open a :term:`tabix file` for reading.'''

        if mode != 'r':
            raise ValueError("invalid file opening mode `%s`" % mode)

        if self.htsfile != NULL:
            self.close()
        self.htsfile = NULL
        self.threads=threads

        filename_index = index or (filename + ".tbi")
        # encode all the strings to pass to tabix
        self.filename = encode_filename(filename)
        self.filename_index = encode_filename(filename_index)

        self.is_stream = self.filename == b'-'
        self.is_remote = hisremote(self.filename)

        if not self.is_remote:
            if not os.path.exists(filename):
                raise IOError("file `%s` not found" % filename)

            if not os.path.exists(filename_index):
                raise IOError("index `%s` not found" % filename_index)

        # open file
        cdef char *cfilename = self.filename
        cdef char *cfilename_index = self.filename_index
        with nogil:
            self.htsfile = hts_open(cfilename, 'r')

        if self.htsfile == NULL:
            raise IOError("could not open file `%s`" % filename)
        
        #if self.htsfile.format.category != region_list:
        #    raise ValueError("file does not contain region data")

        with nogil:
            self.index = tbx_index_load2(cfilename, cfilename_index)

        if self.index == NULL:
            raise IOError("could not open index for `%s`" % filename)

        if not self.is_stream:
            self.start_offset = self.tell()

    def _dup(self):
        '''return a copy of this tabix file.
        
        The file is being re-opened.
        '''
        return TabixFile(self.filename,
                         mode="r",
                         threads=self.threads,
                         parser=self.parser,
                         index=self.filename_index,
                         encoding=self.encoding)

    def fetch(self, 
              reference=None,
              start=None, 
              end=None, 
              region=None,
              parser=None,
              multiple_iterators=False):
        '''fetch one or more rows in a :term:`region` using 0-based
        indexing. The region is specified by :term:`reference`,
        *start* and *end*. Alternatively, a samtools :term:`region`
        string can be supplied.

        Without *reference* or *region* all entries will be fetched. 
        
        If only *reference* is set, all reads matching on *reference*
        will be fetched.

        If *parser* is None, the default parser will be used for
        parsing.
        
        Set *multiple_iterators* to true if you will be using multiple
        iterators on the same file at the same time. The iterator
        returned will receive its own copy of a filehandle to the file
        effectively re-opening the file. Re-opening a file creates
        some overhead, so beware.

        '''
        if not self.is_open():
            raise ValueError("I/O operation on closed file")

        # convert coordinates to region string, which is one-based
        if reference:
            if end is not None:
                if end < 0:
                    raise ValueError("end out of range (%i)" % end)
                if start is None:
                    start = 0
                    
                if start < 0:
                    raise ValueError("start out of range (%i)" % end)
                elif start > end:
                    raise ValueError(
                        'start (%i) >= end (%i)' % (start, end))
                elif start == end:
                    return EmptyIterator()
                else:
                    region = '%s:%i-%i' % (reference, start + 1, end)
            elif start is not None:
                if start < 0:
                    raise ValueError("start out of range (%i)" % end)
                region = '%s:%i' % (reference, start + 1)
            else:
                region = reference

        # get iterator
        cdef hts_itr_t * itr
        cdef char *cstr
        cdef TabixFile fileobj

        # reopen the same file if necessary
        if multiple_iterators:
            fileobj = self._dup()
        else:
            fileobj = self

        if region is None:
            # without region or reference - iterate from start
            with nogil:
                itr = tbx_itr_queryi(fileobj.index,
                                     HTS_IDX_START,
                                     0,
                                     0)
        else:
            s = force_bytes(region, encoding=fileobj.encoding)
            cstr = s
            with nogil:
                itr = tbx_itr_querys(fileobj.index, cstr)

        if itr == NULL:
            if region is None:
                if len(self.contigs) > 0:
                    # when accessing a tabix file created prior tabix 1.0
                    # the full-file iterator is empty.
                    raise ValueError(
                        "could not create iterator, possible "
                        "tabix version mismatch")
                else:
                    # possible reason is that the file is empty -
                    # return an empty iterator
                    return EmptyIterator()
            else:
                raise ValueError(
                    "could not create iterator for region '%s'" %
                    region)
            
        # use default parser if no parser is specified
        if parser is None:
            parser = fileobj.parser

        cdef TabixIterator a
        if parser is None: 
            a = TabixIterator(encoding=fileobj.encoding)
        else:
            parser.set_encoding(fileobj.encoding)
            a = TabixIteratorParsed(parser)

        a.tabixfile = fileobj
        a.iterator = itr

        return a

    ###############################################################
    ###############################################################
    ###############################################################
    ## properties
    ###############################################################
    property header:
        '''the file header.

        The file header consists of the lines at the beginning of a
        file that are prefixed by the comment character ``#``.
       
        .. note::
            The header is returned as an iterator presenting lines
            without the newline character.
        '''
        
        def __get__(self):

            cdef char *cfilename = self.filename
            cdef char *cfilename_index = self.filename_index
            
            cdef kstring_t buffer
            buffer.l = buffer.m = 0
            buffer.s = NULL
            
            cdef htsFile * fp = NULL
            cdef int KS_SEP_LINE = 2
            cdef tbx_t * tbx = NULL
            lines = []
            with nogil:
                fp = hts_open(cfilename, 'r')
                
            if fp == NULL:
                raise OSError("could not open {} for reading header".format(self.filename))

            with nogil:
                tbx = tbx_index_load2(cfilename, cfilename_index)
                
            if tbx == NULL:
                raise OSError("could not load .tbi/.csi index of {}".format(self.filename))

            while hts_getline(fp, KS_SEP_LINE, &buffer) >= 0:
                if not buffer.l or buffer.s[0] != tbx.conf.meta_char:
                    break
                lines.append(force_str(buffer.s, self.encoding))

            with nogil:
                hts_close(fp)
                free(buffer.s)

            return lines

    property contigs:
        '''list of chromosome names'''
        def __get__(self):
            cdef const char ** sequences
            cdef int nsequences
            
            with nogil:
                sequences = tbx_seqnames(self.index, &nsequences)
            cdef int x
            result = []
            for x from 0 <= x < nsequences:
                result.append(force_str(sequences[x]))
            
            # htslib instructions:
            # only free container, not the sequences themselves
            free(sequences)

            return result
            
    def close(self):
        '''
        closes the :class:`pysam.TabixFile`.'''
        if self.htsfile != NULL:
            hts_close(self.htsfile)
            self.htsfile = NULL
        if self.index != NULL:
            tbx_destroy(self.index)
            self.index = NULL

    def __dealloc__( self ):
        # remember: dealloc cannot call other python methods
        # note: no doc string
        # note: __del__ is not called.
        if self.htsfile != NULL:
            hts_close(self.htsfile)
            self.htsfile = NULL
        if self.index != NULL:
            tbx_destroy(self.index)


cdef class TabixIterator:
    """iterates over rows in *tabixfile* in region
    given by *tid*, *start* and *end*.
    """

    def __init__(self, encoding="ascii"):
        self.encoding = encoding
    
    def __iter__(self):
        self.buffer.s = NULL
        self.buffer.l = 0
        self.buffer.m = 0

        return self 

    cdef int __cnext__(self):
        '''iterate to next element.
        
        Return -5 if file has been closed when this function
        was called.
        '''
        if self.tabixfile.htsfile == NULL:
            return -5

        cdef int retval

        while 1:
            with nogil:
                retval = tbx_itr_next(
                    self.tabixfile.htsfile,
                    self.tabixfile.index,
                    self.iterator,
                    &self.buffer)

            if retval < 0:
                break

            if self.buffer.s[0] != b'#':
                break

        return retval

    def __next__(self): 
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """
        
        cdef int retval = self.__cnext__()
        if retval == -5:
            raise IOError("iteration on closed file")
        elif retval < 0:
            raise StopIteration

        return charptr_to_str(self.buffer.s, self.encoding)

    def __dealloc__(self):
        if <void*>self.iterator != NULL:
            tbx_itr_destroy(self.iterator)
        if self.buffer.s != NULL:
            free(self.buffer.s)


class EmptyIterator:
    '''empty iterator'''

    def __iter__(self):
        return self

    def __next__(self):
        raise StopIteration()


cdef class TabixIteratorParsed(TabixIterator):
    """iterates over mapped reads in a region.

    The *parser* determines the encoding.

    Returns parsed data.
    """

    def __init__(self, 
                 Parser parser):
        
        TabixIterator.__init__(self)
        self.parser = parser

    def __next__(self): 
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """
        
        cdef int retval = self.__cnext__()
        if retval == -5:
            raise IOError("iteration on closed file")
        elif retval < 0:
            raise StopIteration

        return self.parser.parse(self.buffer.s,
                                 self.buffer.l)


cdef class GZIterator:
    def __init__(self, filename, int buffer_size=65536, encoding="ascii"):
        '''iterate line-by-line through gzip (or bgzip)
        compressed file.
        '''
        if not os.path.exists(filename):
            raise IOError("No such file or directory: %s" % filename)

        filename = encode_filename(filename)
        cdef char *cfilename = filename
        with nogil:
            self.gzipfile = bgzf_open(cfilename, "r")
        self._filename = filename
        self.kstream = ks_init(self.gzipfile)
        self.encoding = encoding

        self.buffer.l = 0
        self.buffer.m = 0
        self.buffer.s = <char*>malloc(buffer_size)

    def __dealloc__(self):
        '''close file.'''
        if self.gzipfile != NULL:
            bgzf_close(self.gzipfile)
            self.gzipfile = NULL
        if self.buffer.s != NULL:
            free(self.buffer.s)
        if self.kstream != NULL:
            ks_destroy(self.kstream)

    def __iter__(self):
        return self

    cdef int __cnext__(self):
        cdef int dret = 0
        cdef int retval = 0
        while 1:
            with nogil:
                retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)
            
            if retval < 0: 
                break

            return dret
        return -1

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration
        return force_str(self.buffer.s, self.encoding)


cdef class GZIteratorHead(GZIterator):
    '''iterate line-by-line through gzip (or bgzip)
    compressed file returning comments at top of file.
    '''

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration
        if self.buffer.s[0] == b'#':
            return self.buffer.s
        else:
            raise StopIteration


cdef class GZIteratorParsed(GZIterator):
    '''iterate line-by-line through gzip (or bgzip)
    compressed file returning comments at top of file.
    '''

    def __init__(self, parser):
        self.parser = parser

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration

        return self.parser.parse(self.buffer.s,
                                 self.buffer.l)


def tabix_compress(filename_in, 
                   filename_out,
                   force=False):
    '''compress *filename_in* writing the output to *filename_out*.
    
    Raise an IOError if *filename_out* already exists, unless *force*
    is set.
    '''

    if not force and os.path.exists(filename_out):
        raise IOError(
            "Filename '%s' already exists, use *force* to "
            "overwrite" % filename_out)

    cdef int WINDOW_SIZE
    cdef int c, r
    cdef void * buffer
    cdef BGZF * fp
    cdef int fd_src
    cdef bint is_empty = True
    cdef int O_RDONLY
    O_RDONLY = os.O_RDONLY

    WINDOW_SIZE = 64 * 1024

    fn = encode_filename(filename_out)
    cdef char *cfn = fn
    with nogil:
        fp = bgzf_open(cfn, "w")
    if fp == NULL:
        raise IOError("could not open '%s' for writing" % filename_out)

    fn = encode_filename(filename_in)
    fd_src = open(fn, O_RDONLY)
    if fd_src == 0:
        raise IOError("could not open '%s' for reading" % filename_in)

    buffer = malloc(WINDOW_SIZE)
    c = 1
    
    while c > 0:
        with nogil:
            c = read(fd_src, buffer, WINDOW_SIZE)
            if c > 0:
                is_empty = False
            r = bgzf_write(fp, buffer, c)
        if r < 0:
            free(buffer)
            raise IOError("writing failed")
        
    free(buffer)
    r = bgzf_close(fp)
    if r < 0:
        raise IOError("error %i when writing to file %s" % (r, filename_out))

    r = close(fd_src)
    # an empty file will return with -1, thus ignore this.
    if r < 0:
        if not (r == -1 and is_empty):
            raise IOError("error %i when closing file %s" % (r, filename_in))


def tabix_index(filename,
                force=False,
                seq_col=None,
                start_col=None,
                end_col=None,
                preset=None,
                meta_char="#",
                int line_skip=0,
                zerobased=False,
                int min_shift=-1,
                index=None,
                keep_original=False,
                csi=False,
                ):
    '''index tab-separated *filename* using tabix.

    An existing index will not be overwritten unless *force* is set.

    The index will be built from coordinates in columns *seq_col*,
    *start_col* and *end_col*.

    The contents of *filename* have to be sorted by contig and
    position - the method does not check if the file is sorted.

    Column indices are 0-based. Note that this is different from the
    tabix command line utility where column indices start at 1.
    
    Coordinates in the file are assumed to be 1-based unless
    *zerobased* is set.

    If *preset* is provided, the column coordinates are taken from a
    preset. Valid values for preset are "gff", "bed", "sam", "vcf",
    psltbl", "pileup".
    
    Lines beginning with *meta_char* and the first *line_skip* lines
    will be skipped.

    If *filename* is not detected as a gzip file it will be automatically
    compressed. The original file will be removed and only the compressed
    file will be retained.

    By default or when *min_shift* is 0, creates a TBI index. If *min_shift*
    is greater than zero and/or *csi* is True, creates a CSI index with a
    minimal interval size of 1<<*min_shift* (1<<14 if only *csi* is set).

    *index* controls the filename which should be used for creating the index.
    If not set, the default is to append ``.tbi`` to *filename*.

    When automatically compressing files, if *keep_original* is set the
    uncompressed file will not be deleted.

    returns the filename of the compressed data

    '''
    
    if preset is None and \
       (seq_col is None or start_col is None or end_col is None):
        raise ValueError(
            "neither preset nor seq_col,start_col and end_col given")

    fn = encode_filename(filename)
    cdef char *cfn = fn

    cdef htsFile *fp = hts_open(cfn, "r")
    if fp == NULL:
        raise IOError("Could not open file '%s': %s" % (filename, force_str(strerror(errno))))

    cdef htsFormat fmt = hts_get_format(fp)[0]
    hts_close(fp)

    if fmt.compression == no_compression:
        tabix_compress(filename, filename + ".gz", force=force)
        if not keep_original:
            os.unlink(filename)
        filename += ".gz"
        fn = encode_filename(filename)
        cfn = fn

    # columns (1-based):
    #   preset-code, contig, start, end, metachar for
    #     comments, lines to ignore at beginning
    # 0 is a missing column
    preset2conf = {
        'gff' : (TBX_GENERIC, 1, 4, 5, ord('#'), 0),
        'bed' : (TBX_UCSC, 1, 2, 3, ord('#'), 0),
        'psltbl' : (TBX_UCSC, 15, 17, 18, ord('#'), 0),
        'sam' : (TBX_SAM, 3, 4, 0, ord('@'), 0),
        'vcf' : (TBX_VCF, 1, 2, 0, ord('#'), 0),
        }
    
    conf_data = None
    if preset == "bcf" or fmt.format == bcf:
        csi = True
    elif preset:
        try:
            conf_data = preset2conf[preset]
        except KeyError:
            raise KeyError(
                "unknown preset '%s', valid presets are '%s'" %
                (preset, ",".join(preset2conf.keys())))
    else:
        if end_col is None:
            end_col = -1
            
        preset = 0
        # tabix internally works with 0-based coordinates and
        # open/closed intervals.  When using a preset, conversion is
        # automatically taken care of.  Otherwise, the coordinates are
        # assumed to be 1-based closed intervals and -1 is subtracted
        # from the start coordinate. To avoid doing this, set the
        # TI_FLAG_UCSC=0x10000 flag:
        if zerobased:
            preset = preset | TBX_UCSC

        conf_data = (preset, seq_col + 1, start_col + 1, end_col + 1, ord(meta_char), line_skip)

    cdef tbx_conf_t conf
    if conf_data:
        conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data

    if csi or min_shift > 0:
        suffix = ".csi"
        if min_shift <= 0: min_shift = 14
    else:
        suffix = ".tbi"
        min_shift = 0

    index = index or filename + suffix    
    fn_index = encode_filename(index)

    if not force and os.path.exists(index):
        raise IOError(
            "filename '%s' already exists, use *force* to overwrite" % index)
    
    cdef char *fnidx = fn_index
    cdef int retval = 0

    if csi and fmt.format == bcf:
        with nogil:
            retval = bcf_index_build2(cfn, fnidx, min_shift)
    else:
        with nogil:
            retval = tbx_index_build2(cfn, fnidx, min_shift, &conf)
            
    if retval != 0:
        raise OSError("building of index for {} failed".format(filename))
    
    return filename

# #########################################################
# cdef class tabix_file_iterator_old:
#     '''iterate over ``infile``.

#     This iterator is not safe. If the :meth:`__next__()` method is called 
#     after ``infile`` is closed, the result is undefined (see ``fclose()``).

#     The iterator might either raise a StopIteration or segfault.
#     '''


#     def __cinit__(self, 
#                   infile, 
#                   Parser parser,
#                   int buffer_size = 65536 ):

#         cdef int fd = PyObject_AsFileDescriptor( infile )
#         if fd == -1: raise ValueError( "I/O operation on closed file." )
#         self.infile = fdopen( fd, 'r')

#         if self.infile == NULL: raise ValueError( "I/O operation on closed file." )

#         self.buffer = <char*>malloc( buffer_size )        
#         self.size = buffer_size
#         self.parser = parser

#     def __iter__(self):
#         return self

#     cdef __cnext__(self):

#         cdef char * b
#         cdef size_t nbytes
#         b = self.buffer

#         while not feof( self.infile ):
#             nbytes = getline( &b, &self.size, self.infile)

#             # stop at first error or eof
#             if (nbytes == -1): break
#             # skip comments
#             if (b[0] == '#'): continue

#             # skip empty lines
#             if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue

#             # make sure that entry is complete
#             if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
#                 result = b
#                 raise ValueError( "incomplete line at %s" % result )

#             # make sure that this goes fully through C
#             # otherwise buffer is copied to/from a
#             # Python object causing segfaults as
#             # the wrong memory is freed
#             return self.parser.parse( b, nbytes )

#         raise StopIteration

#     def __dealloc__(self):
#         free(self.buffer)

#     def __next__(self):
#         return self.__cnext__()

#########################################################
#########################################################
#########################################################
## Iterators for parsing through unindexed files.
#########################################################
# cdef buildGzipError(void *gzfp):
#     cdef int errnum = 0
#     cdef char *s = gzerror(gzfp, &errnum)
#     return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s)


cdef class tabix_file_iterator:
    '''iterate over a compressed or uncompressed ``infile``.
    '''

    def __cinit__(self, 
                  infile, 
                  Parser parser,
                  int buffer_size=65536):

        if infile.closed:
            raise ValueError("I/O operation on closed file.")

        self.infile = infile

        cdef int fd = PyObject_AsFileDescriptor(infile)
        if fd == -1:
            raise ValueError("I/O operation on closed file.")

        self.duplicated_fd = dup(fd)

        # From the manual:
        # gzopen can be used to read a file which is not in gzip format; 
        # in this case gzread will directly read from the file without decompression. 
        # When reading, this will be detected automatically by looking 
        # for the magic two-byte gzip header. 
        self.fh = bgzf_dopen(self.duplicated_fd, 'r')

        if self.fh == NULL: 
            raise IOError('%s' % strerror(errno))

        self.kstream = ks_init(self.fh) 
        
        self.buffer.s = <char*>malloc(buffer_size)
        #if self.buffer == NULL:
        #    raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size)
        #self.size = buffer_size
        self.parser = parser

    def __iter__(self):
        return self

    cdef __cnext__(self):

        cdef char * b
        cdef int dret = 0
        cdef int retval = 0
        while 1:
            with nogil:
                retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)
            
            if retval < 0: 
                break
                #raise IOError('gzip error: %s' % buildGzipError( self.fh ))

            b = self.buffer.s
            
            # skip comments
            if (b[0] == b'#'):
                continue

            # skip empty lines
            if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
                continue

            # gzgets terminates at \n, no need to test

            # parser creates a copy
            return self.parser.parse(b, self.buffer.l)

        raise StopIteration

    def __dealloc__(self):
        free(self.buffer.s)
        ks_destroy(self.kstream)
        bgzf_close(self.fh)
        
    def __next__(self):
        return self.__cnext__()


class tabix_generic_iterator:
    '''iterate over ``infile``.
    
    Permits the use of file-like objects for example from the gzip module.
    '''
    def __init__(self, infile, parser):

        self.infile = infile
        if self.infile.closed:
            raise ValueError("I/O operation on closed file.")
        self.parser = parser

    def __iter__(self):
        return self

    # cython version - required for python 3
    def __next__(self):
        
        cdef char * b
        cdef char * cpy
        cdef size_t nbytes

        encoding = self.parser.get_encoding()

        # note that GzipFile.close() does not close the file
        # reading is still possible.
        if self.infile.closed:
            raise ValueError("I/O operation on closed file.")

        while 1:

            line = self.infile.readline()
            if not line:
                break
            
            s = force_bytes(line, encoding)
            b = s
            nbytes = len(line)
            assert b[nbytes] == b'\0'

            # skip comments
            if b[0] == b'#':
                continue

            # skip empty lines
            if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
                continue
            
            # make sure that entry is complete
            if b[nbytes-1] != b'\n' and b[nbytes-1] != b'\r':
                raise ValueError("incomplete line at %s" % line)
            
            bytes_cpy = <bytes> b
            cpy = <char *> bytes_cpy

            return self.parser(cpy, nbytes)            

        raise StopIteration


def tabix_iterator(infile, parser):
    """return an iterator over all entries in a file.
    
    Results are returned parsed as specified by the *parser*. If
    *parser* is None, the results are returned as an unparsed string.
    Otherwise, *parser* is assumed to be a functor that will return
    parsed data (see for example :class:`~pysam.asTuple` and
    :class:`~pysam.asGTF`).

    """
    return tabix_generic_iterator(infile, parser)
        
    
cdef class Tabixfile(TabixFile):
    """Tabixfile is deprecated: use TabixFile instead"""
    pass


__all__ = [
    "tabix_index", 
    "tabix_compress",
    "TabixFile",
    "Tabixfile",
    "asTuple",
    "asGTF",
    "asGFF3",
    "asVCF",
    "asBed",
    "GZIterator",
    "GZIteratorHead",
    "tabix_iterator", 
    "tabix_generic_iterator", 
    "tabix_file_iterator", 
]
back to top