Source code for bionumpy.genomic_data.genome

import os
import numpy as np
from typing import Dict
from pathlib import PurePath
from ..io import bnp_open, Bed6Buffer, BedBuffer, open_indexed
from ..io.files import buffer_types, BamBuffer, BamIntervalBuffer
from ..io.indexed_files import IndexBuffer, create_index
from .genomic_track import GenomicArray
from .genomic_intervals import GenomicIntervals, GenomicLocation, LocationEntry
from .genomic_sequence import GenomicSequence
from .annotation import GenomicAnnotation
from .genome_context import GenomeContext, keep_all, ignore_underscores
from ..encoded_array import as_encoded_array
from ..bnpdataclass import replace, BNPDataClass
from ..datatypes import BedGraph, Interval
from ..util.formating import table


[docs] class Genome: ''' The entry point for working with genomic data. Supports reading data directly from file and into GenomicData objects, or creating GenomicData objects from `BNPDataClass` objects, such `Interval`, `BedGraph` or `LocationEntry`. Supports both in memory or streamed/lazy data, controlled with the `stream` keyword. For streamed data it requires that the data in the file uses the same chromosome ordering as the genome. This will usually be the same as either what is provided in the `chrom.sizes` file, or a simple alphabetic ordering. If sort order errors occur, try using `sort_names=True` when creating the `Genome` object. ''' def __init__(self, chrom_sizes: Dict[str, int], fasta_filename: str = None, sort_names=False, filter_function=lambda x: True): if isinstance(chrom_sizes, GenomeContext): self._genome_context = chrom_sizes else: if sort_names: chrom_sizes = {key: chrom_sizes[key] for key in sorted(chrom_sizes.keys())} self._genome_context = GenomeContext.from_dict(chrom_sizes, filter_function) self._fasta_filename = fasta_filename def with_ignored_added(self, ignored): return self.__class__(self._genome_context.with_ignored_added(ignored), self._fasta_filename) @classmethod def from_dict(cls, chrom_sizes: Dict[str, int], *args, **kwargs): return cls(chrom_sizes, *args, **kwargs)
[docs] @classmethod def from_file(cls, filename: str, sort_names: bool=False, filter_function=ignore_underscores) -> 'Genome': """Read genome information from a 'chrom.sizes' or 'fa.fai' file File should contain rows of names and lengths of chromosomes. If a fasta file is provided, a fasta index will be written to 'fa.fai' if not already theree. The sequence will then be available through, 'genome.read_sequence()' without any 'filename' parameter. Parameters ---------- cls : filename : str sort_names : bool Whether or not to sort the chromosome names Returns ------- 'Genome' A `Genome` object created from the read chromosome sizes """ path = PurePath(filename) suffix = path.suffixes[-1] index_file_name = path.with_suffix(path.suffix + ".fai") fasta_filename = None if suffix in (".fa", ".fasta"): if not os.path.isfile(index_file_name): bnp_open(index_file_name, "w", buffer_type=IndexBuffer).write(create_index(path)) fasta_filename = filename filename = index_file_name split_lines = (line.split()[:2] for line in open(filename)) return cls({name: int(length) for name, length in split_lines}, fasta_filename=fasta_filename, sort_names=sort_names, filter_function=filter_function)
@staticmethod def _open(filename, stream, buffer_type=None): f = bnp_open(filename, buffer_type=buffer_type) if stream: content = f.read_chunks() else: content = f.read() return content
[docs] def get_track(self, bedgraph: BedGraph) -> GenomicArray: """Create a `GenomicArray` for the data in the bedgraph Parameters ---------- bedgraph : BedGraph Returns ------- GenomicArray """ bedgraph = self._mask_data_on_extra_chromosomes(bedgraph) return GenomicArray.from_bedgraph(bedgraph, self._genome_context)
[docs] def read_track(self, filename: str, stream: bool = False) -> GenomicArray: """Read a bedgraph from file and convert it to a `GenomicArray` If `stream` is `True` then read the bedgraph in chunks and get a lazy evaluated GenomicArray Parameters ---------- filename : str Filename for the bedgraph file stream : bool Whether or not to read as stream """ content = self._open(filename, stream) return self.get_track(content)
[docs] def get_intervals(self, intervals: Interval, stranded: bool = False) -> GenomicIntervals: """Get genomic intervals from interval data. Parameters ---------- intervals : Interval Interval's or stream of Intervals's stranded : bool Wheter or not the intervals are stranded Returns ------- GenomicIntervals """ return GenomicIntervals.from_intervals(intervals, self._genome_context, is_stranded=stranded)
[docs] def read_intervals(self, filename: str, stranded: bool = False, stream: bool = False, buffer_type=None) -> GenomicIntervals: """Read a bed file and represent it as `GenomicIntervals` If `stream` is `True` then read the bedgraph in chunks and get a lazy evaluated GenomicTrack Parameters ---------- filename : str Filename for the bed file stranded : bool Whether or not to treat the intervals as stranded stream : bool Wheter to read as a stream Returns ------- GenomicIntervals """ path = PurePath(filename) suffix = path.suffixes[-1] if suffix == ".gz": suffix = path.suffixes[-2] if buffer_type is None: buffer_type = buffer_types[suffix] if buffer_type == BedBuffer and stranded: buffer_type = Bed6Buffer if buffer_type == BamBuffer: buffer_type = BamIntervalBuffer content = self._open(filename, stream, buffer_type=buffer_type) return self.get_intervals(content, stranded)
# return GenomicIntervals.from_intervals(content, self._chrom_sizes)
[docs] def read_locations(self, filename: str, stranded: bool = False, stream: bool = False, has_numeric_chromosomes=False, buffer_type=None) -> GenomicLocation: """Read a set of locations from file and convert them to `GenomicLocation` Locations can be treated as stranded or not stranded. If `has_numeric_chromosomes` then 'chr' will be prepended to all the chromosome names in order to make them compatible with the genome's chromosome names Parameters ---------- filename : str Filename of the locations stranded : bool Whether or not the locations are stranded stream : bool Whether the data should be read as a stream has_numeric_chromosomes : bool Whether the file has the chromosomes as numbers Returns ------- GenomicLocation (Stranded) GenomicLocation """ assert not (stream and has_numeric_chromosomes) f = bnp_open(filename, buffer_type=buffer_type) data = f.read_chunks() if not stream: data_list = list(data) if len(data_list) == 0: data = LocationEntry.empty() else: data = np.concatenate(data_list) return self.get_locations(data, has_numeric_chromosomes=has_numeric_chromosomes)
def _mask_data_on_extra_chromosomes(self, data, chromosome_field_name='chromosome'): if (not isinstance(data, BNPDataClass)) or len(data)==0: return data encoded_chromosomes = self._genome_context.encoding.encode(getattr(data, chromosome_field_name)) data = replace(data, **{chromosome_field_name: encoded_chromosomes}) mask = self._genome_context.is_included(encoded_chromosomes) return data[mask]
[docs] def get_locations(self, data: LocationEntry, has_numeric_chromosomes=False) -> GenomicLocation: """Create `GenomicLocation`s from location entries Parameters ---------- data : LocationEntry BNPDataClass of location entries Returns ------- GenomicLocation """ if has_numeric_chromosomes: data = replace(data, chromosome=as_encoded_array(['chr'+chromosome.to_string() for chromosome in data.chromosome])) data = self._mask_data_on_extra_chromosomes(data) return GenomicLocation.from_data(data, self._genome_context)
[docs] def read_sequence(self, filename: str = None) -> GenomicSequence: """Read the genomic sequence from file. If a `fasta` file was used to create the Genome object, `filename` can be `None` in which case the sequence will be read from that fasta file Parameters ---------- filename : str Returns ------- GenomicSequence """ if filename is None: assert self._fasta_filename is not None filename = self._fasta_filename return GenomicSequence.from_indexed_fasta(open_indexed(filename), genome_context=self._genome_context)
[docs] def read_annotation(self, filename: str) -> GenomicAnnotation: """Read genomic annotation from a 'gtf' or 'gff' file The annotation file should have 'gene', 'transcript' and 'exon' entries. Parameters ---------- filename : str Filename of the annotation file Returns ------- GenomicAnnotation `GenomicAnnotation` containing the genes, transcripts and exons """ gtf_entries = self._open(filename, stream=False) return GenomicAnnotation.from_gtf_entries(gtf_entries, self._genome_context)
def __repr__(self): return f"{self.__class__.__name__}(" + repr(self._genome_context) + ")" def __str__(self): return table( ((key, value) for key, value in self._genome_context.chrom_sizes.items() if '_' not in key), headers=["Chromosome", "Size"]) def get_genome_context(self): return self._genome_context @property def size(self): '''The size of the genome''' return self._genome_context.size