Source code for bionumpy.io.indexed_files

from pathlib import PurePath
import os
from .indexed_fasta import IndexedFasta, create_index
from .files import bnp_open
from .delimited_buffers import DelimitedBuffer
from .multiline_buffer import FastaIdx


class IndexBuffer(DelimitedBuffer):
    """Delimited buffer configuration for fasta index (``.fai``) entries.

    Passed as ``buffer_type`` when the index file is written: fields are
    tab-separated and each record is described by ``FastaIdx``.
    """

    # Record type of the entries held in the buffer.
    dataclass = FastaIdx
    # Field separator: a single tab character.
    sep = "\t"



[docs]
def open_indexed(filename: str) -> IndexedFasta:
    """Open an indexed fasta (for now) file with random access

    If an index is not already present for the file, create it
    (written next to the fasta file as ``<filename>.fai``).

    Parameters
    ----------
    filename : str
        The filename of the file

    Returns
    -------
    IndexedFasta
        An Indexed fasta object that supports random access on
        chromosome or intervals

    Raises
    ------
    AssertionError
        If the filename does not end in ``.fa`` or ``.fasta``
        (including filenames that have no suffix at all)

    Examples
    --------
    >>> from bionumpy import open_indexed
    >>> reference = open_indexed("example_data/small_genome.fa")
    >>> reference
    Indexed Fasta File with chromosome sizes: {'0': 80, '1': 80, '2': 80, '3': 80}
    >>> reference["1"]
    encoded_array('gcttggtatgaaaacccatc...')
    >>> from bionumpy.datatypes import Interval
    >>> intervals = Interval.from_entry_tuples([("1", 10, 20), ("2", 20, 30)])
    >>> reference.get_interval_sequences(intervals)
    encoded_ragged_array(['aaaacccatc',
                          'ggccgttttt'])
    """

    path = PurePath(filename)
    # ``path.suffix`` is "" for a suffix-less name, so such input reaches the
    # intended error below instead of failing with IndexError on
    # ``path.suffixes[-1]``.
    suffix = path.suffix
    # Raised explicitly (rather than via ``assert``) so the check still runs
    # under ``python -O``; the exception type and message are unchanged.
    if suffix not in (".fa", ".fasta"):
        raise AssertionError("Only fasta supported for indexed read")

    index_file_name = path.with_suffix(suffix + ".fai")
    if not os.path.isfile(index_file_name):
        index = create_index(path)
        # Close the writer deterministically so the index is fully on disk
        # before IndexedFasta is constructed below.
        with bnp_open(index_file_name, "w", buffer_type=IndexBuffer) as index_file:
            index_file.write(index)
    return IndexedFasta(filename)