Source code for bionumpy.genomic_data.genome

import os
import numpy as np
from typing import Dict
from pathlib import PurePath
from ..io import bnp_open, Bed6Buffer, BedBuffer, open_indexed
from ..io.files import buffer_types, BamBuffer, BamIntervalBuffer
from ..io.indexed_files import IndexBuffer, create_index
from .genomic_track import GenomicArray
from .genomic_intervals import GenomicIntervals, GenomicLocation, LocationEntry
from .genomic_sequence import GenomicSequence
from .annotation import GenomicAnnotation
from .genome_context import GenomeContext, keep_all, ignore_underscores
from ..encoded_array import as_encoded_array
from ..bnpdataclass import replace, BNPDataClass
from ..datatypes import BedGraph, Interval
from ..util.formating import table



[docs]
class Genome:
    '''
    The entry point for working with genomic data. Supports reading data directly from
    file and into GenomicData objects, or creating GenomicData objects from `BNPDataClass` objects, 
    such `Interval`, `BedGraph` or `LocationEntry`.

    Supports both in memory or streamed/lazy data, controlled with the `stream` keyword. For streamed
    data it requires that the data in the file uses the same chromosome ordering as the genome. This
    will usually be the same as either what is provided in the `chrom.sizes` file, or a simple alphabetic
    ordering. If sort order errors occur, try using `sort_names=True` when creating the `Genome` object.
    '''
    def __init__(self, chrom_sizes: Dict[str, int], fasta_filename: str = None, sort_names=False, filter_function=lambda x: True):
        if isinstance(chrom_sizes, GenomeContext):
            self._genome_context = chrom_sizes
        else:
            if sort_names:
                chrom_sizes = {key: chrom_sizes[key] for key in sorted(chrom_sizes.keys())}
            self._genome_context = GenomeContext.from_dict(chrom_sizes, filter_function)
        self._fasta_filename = fasta_filename

    def with_ignored_added(self, ignored):
        return self.__class__(self._genome_context.with_ignored_added(ignored), self._fasta_filename)

    @classmethod
    def from_dict(cls, chrom_sizes: Dict[str, int], *args, **kwargs):
        return cls(chrom_sizes, *args, **kwargs)


[docs]
    @classmethod
    def from_file(cls, filename: str, sort_names: bool=False, filter_function=ignore_underscores) -> 'Genome':
        """Read genome information from a 'chrom.sizes' or 'fa.fai' file

        File should contain rows of names and lengths of chromosomes.
        If a fasta file is provided, a fasta index will be written to 'fa.fai'
        if not already theree. The sequence will then be available through, 'genome.read_sequence()'
        without any 'filename' parameter.

        Parameters
        ----------
        cls :
        filename : str
        sort_names : bool Whether or not to sort the chromosome names

        Returns
        -------
        'Genome'
            A `Genome` object created from the read chromosome sizes

        """
        path = PurePath(filename)
        suffix = path.suffixes[-1]
        index_file_name = path.with_suffix(path.suffix + ".fai")
        fasta_filename = None
        if suffix in (".fa", ".fasta"):
            if not os.path.isfile(index_file_name):
                bnp_open(index_file_name, "w", buffer_type=IndexBuffer).write(create_index(path))
            fasta_filename = filename
            filename = index_file_name

        split_lines = (line.split()[:2] for line in open(filename))
        return cls({name: int(length) for name, length in split_lines}, fasta_filename=fasta_filename, sort_names=sort_names, filter_function=filter_function)


    @staticmethod
    def _open(filename, stream, buffer_type=None):
        f = bnp_open(filename, buffer_type=buffer_type)
        if stream:
            content = f.read_chunks()
        else:
            content = f.read()
        return content


[docs]
    def get_track(self, bedgraph: BedGraph) -> GenomicArray:
        """Create a `GenomicArray` for the data in the bedgraph

        Parameters
        ----------
        bedgraph : BedGraph

        Returns
        -------
        GenomicArray
        """
        bedgraph = self._mask_data_on_extra_chromosomes(bedgraph)
        return GenomicArray.from_bedgraph(bedgraph, self._genome_context)



[docs]
    def read_track(self, filename: str, stream: bool = False) -> GenomicArray:
        """Read a bedgraph from file and convert it to a `GenomicArray`

        If `stream` is `True` then read the bedgraph in chunks and get
        a lazy evaluated GenomicArray

        Parameters
        ----------
        filename : str
            Filename for the bedgraph file
        stream : bool
            Whether or not to read as stream

        """
        content = self._open(filename, stream)
        return self.get_track(content)



[docs]
    def get_intervals(self, intervals: Interval, stranded: bool = False) -> GenomicIntervals:
        """Get genomic intervals from interval data. 

        Parameters
        ----------
        intervals : Interval
            Interval's or stream of Intervals's
        stranded : bool
            Wheter or not the intervals are stranded

        Returns
        -------
        GenomicIntervals
        """
        return GenomicIntervals.from_intervals(intervals, self._genome_context, is_stranded=stranded)



[docs]
    def read_intervals(self, filename: str, stranded: bool = False, stream: bool = False, buffer_type=None) -> GenomicIntervals:
        """Read a bed file and represent it as `GenomicIntervals`

        If `stream` is `True` then read the bedgraph in chunks and get
        a lazy evaluated GenomicTrack

        Parameters
        ----------
        filename : str
            Filename for the bed file
        stranded : bool
            Whether or not to treat the intervals as stranded
        stream : bool
            Wheter to read as a stream

        Returns
        -------
        GenomicIntervals
        """
        path = PurePath(filename)
        suffix = path.suffixes[-1]
        if suffix == ".gz":
            suffix = path.suffixes[-2]
        if buffer_type is None:
            buffer_type = buffer_types[suffix]
            if buffer_type == BedBuffer and stranded: 
                buffer_type = Bed6Buffer
            if buffer_type == BamBuffer:
                buffer_type = BamIntervalBuffer
        content = self._open(filename, stream, buffer_type=buffer_type)
        return self.get_intervals(content, stranded)

    # return GenomicIntervals.from_intervals(content, self._chrom_sizes)


[docs]
    def read_locations(self, filename: str, stranded: bool = False, stream: bool = False, has_numeric_chromosomes=False, buffer_type=None) -> GenomicLocation:
        """Read a set of locations from file and convert them to `GenomicLocation`        

        Locations can be treated as stranded or not stranded. If `has_numeric_chromosomes` then 'chr'
        will be prepended to all the chromosome names in order to make them compatible with the
        genome's chromosome names

        Parameters
        ----------
        filename : str
            Filename of the locations
        stranded : bool
            Whether or not the locations are stranded
        stream : bool
            Whether the data should be read as a stream
        has_numeric_chromosomes : bool
            Whether the file has the chromosomes as numbers

        Returns
        -------
        GenomicLocation
            (Stranded) GenomicLocation

        """
        assert not (stream and has_numeric_chromosomes)
        f = bnp_open(filename, buffer_type=buffer_type)
        data = f.read_chunks()
        if not stream:
            data_list = list(data)
            if len(data_list) == 0:
                data = LocationEntry.empty()
            else:
                data = np.concatenate(data_list)

        return self.get_locations(data, has_numeric_chromosomes=has_numeric_chromosomes)


    def _mask_data_on_extra_chromosomes(self, data, chromosome_field_name='chromosome'):
        if (not isinstance(data, BNPDataClass)) or len(data)==0:
            return data
        encoded_chromosomes = self._genome_context.encoding.encode(getattr(data, chromosome_field_name))
        data = replace(data, **{chromosome_field_name: encoded_chromosomes})
        mask = self._genome_context.is_included(encoded_chromosomes)
        return data[mask]


[docs]
    def get_locations(self, data: LocationEntry, has_numeric_chromosomes=False) -> GenomicLocation:
        """Create `GenomicLocation`s from location entries

        Parameters
        ----------
        data : LocationEntry
            BNPDataClass of location entries

        Returns
        -------
        GenomicLocation

        """
        if has_numeric_chromosomes:
            data = replace(data, chromosome=as_encoded_array(['chr'+chromosome.to_string() for chromosome in data.chromosome]))
        data = self._mask_data_on_extra_chromosomes(data)
        return GenomicLocation.from_data(data, self._genome_context)



[docs]
    def read_sequence(self, filename: str = None) -> GenomicSequence:
        """Read the genomic sequence from file.

        If a `fasta` file was used to create the Genome object, `filename` can be `None`
        in which case the sequence will be read from that fasta file

        Parameters
        ----------
        filename : str

        Returns
        -------
        GenomicSequence

        """
        
        if filename is None:
            assert self._fasta_filename is not None
            filename = self._fasta_filename
        return GenomicSequence.from_indexed_fasta(open_indexed(filename), genome_context=self._genome_context)



[docs]
    def read_annotation(self, filename: str) -> GenomicAnnotation:
        """Read genomic annotation from a 'gtf' or 'gff' file

        The annotation file should have 'gene', 'transcript' and 'exon' entries.

        Parameters
        ----------
        filename : str
            Filename of the annotation file


        Returns
        -------
        GenomicAnnotation
            `GenomicAnnotation` containing the genes, transcripts and exons

        """
        
        gtf_entries = self._open(filename, stream=False)
        return GenomicAnnotation.from_gtf_entries(gtf_entries, self._genome_context)


    def __repr__(self):
        return f"{self.__class__.__name__}(" + repr(self._genome_context) + ")"

    def __str__(self):
        return table(
            ((key, value) for key, value in self._genome_context.chrom_sizes.items() if '_' not in key),
            headers=["Chromosome", "Size"])

    def get_genome_context(self):
        return self._genome_context

    @property
    def size(self):
        '''The size of the genome'''
        return self._genome_context.size