Source code for bionumpy.sequence.count_encoded

import numpy as np
from numpy.typing import ArrayLike
from numbers import Number
from ..io.matrix_dump import Matrix
from ..util.typing import EncodedArrayLike
from ..encoded_array import EncodedArray


class EncodedCounts:
    """Counts for every symbol of an alphabet.

    The last axis of ``counts`` is indexed by position in ``alphabet``
    (see ``__getitem__``); any leading axes are kept as they are.

    Attributes
    ----------
    alphabet : list
        Labels, one per entry along the last axis of ``counts``.
    counts : np.ndarray
        The counts themselves.
    row_names : list, optional
        Optional names attached to the counts; not used in comparisons.
    """
    alphabet: list
    counts: np.ndarray
    row_names: list = None

    def __init__(self, alphabet, counts, row_names=None):
        self.counts = counts
        self.alphabet = alphabet
        self.row_names = row_names

    def __str__(self):
        """Return one ``label: counts`` line per alphabet entry."""
        return "\n".join(f"{c}: {n}" for c, n in zip(self.alphabet, self.counts.T))

    def __repr__(self):
        return (f"EncodedCounts(alphabet={self.alphabet!r}, "
                f"counts={self.counts!r}, row_names={self.row_names!r})")

    def __eq__(self, other):
        """Compare alphabet and counts; ``row_names`` is not considered."""
        if not isinstance(other, EncodedCounts):
            # Let Python try the reflected comparison / fall back to identity
            # instead of failing on a missing ``alphabet`` attribute.
            return NotImplemented
        if self.alphabet != other.alphabet:
            return False
        # array_equal also checks the shapes, so mismatching shapes compare
        # unequal instead of raising or being broadcast against each other.
        return bool(np.array_equal(self.counts, other.counts))

    def __getitem__(self, idx):
        """Return the counts for the label ``idx`` (raises ValueError if unknown)."""
        return self.counts[..., self.alphabet.index(idx)]

    def __add__(self, other):
        """Add a number or another ``EncodedCounts`` over the same alphabet.

        The result keeps the alphabet and ``row_names`` of ``self``.
        """
        if isinstance(other, Number):
            o_counts = other
        else:
            assert self.alphabet == other.alphabet, "Cannot add counts over different alphabets"
            o_counts = other.counts
        ret = self.__class__(self.alphabet, self.counts + o_counts)
        ret.row_names = self.row_names
        return ret

    def __radd__(self, other):
        """Reflected addition; makes e.g. builtin ``sum`` (which starts at 0) work."""
        return self.__add__(other)

    def _compute_proportions(self):
        # Compute counts / row-total, with 0 wherever the total is not positive.
        totals = self.counts.sum(axis=-1, keepdims=True)
        # The division is evaluated for every entry, including those with a
        # zero total; silence the resulting warnings since np.where below
        # replaces those entries with 0 anyway.
        with np.errstate(divide="ignore", invalid="ignore"):
            ratios = self.counts / totals
        return np.where(totals > 0, ratios, 0)

    @property
    def proportions(self):
        """Counts divided by their sum over the last axis (0 where that sum is not positive)."""
        return self._compute_proportions()

    @property
    def proportion_matrix(self):
        """The proportions wrapped in a ``Matrix`` with the alphabet as column names."""
        return Matrix(self._compute_proportions(), col_names=self.alphabet)

    def get_count_for_label(self, label):
        """Return the summed counts of every entry in the iterable ``label``.

        Note that a plain string is iterated character by character.
        """
        # Builtin sum: calling np.sum on a generator is deprecated in NumPy.
        return sum(self.counts[..., self.alphabet.index(l)] for l in label)

    @property
    def labels(self):
        """Alias for ``alphabet``."""
        return self.alphabet

    @classmethod
    def vstack(cls, counts):
        """Stack several counts over the same alphabet along a new first axis.

        ``row_names`` of the result is the list of the inputs' ``row_names``,
        or left unset when the first input has none. The stacked counts are
        converted to dtype ``"int"``.
        """
        alphabet = counts[0].alphabet
        row_names = counts[0].row_names
        assert all(count.alphabet == alphabet for count in counts)
        ret = cls(alphabet, np.array([count.counts for count in counts], dtype="int"))
        if row_names is not None:
            ret.row_names = [count.row_names for count in counts]
        return ret

    def most_common(self, n=None):
        """Return the entries ordered by decreasing count, optionally only the first ``n``.

        NOTE(review): the indexing here looks written for 1D ``counts`` only.
        """
        args = np.argsort(self.counts)[::-1]
        if n is not None:
            args = args[:n]
        return self.__class__(
            [self.alphabet[i] for i in args],
            self.counts[args])

    def as_dict(self):
        """Return a dict mapping each label to its counts."""
        return dict(zip(self.alphabet, self.counts.T))



[docs]
def count_encoded(values: EncodedArrayLike, weights: ArrayLike = None, axis: int = -1) -> EncodedCounts:
    """Count the occurrences of encoded entries. Works on any encoding with finite alphabet

    Parameters
    ----------
    values : EncodedArrayLike
        Encoded values to count. Its ``encoding`` must provide either
        ``get_alphabet()`` or ``get_labels()``; the result of that call
        becomes the alphabet of the returned counts.
    weights : ArrayLike
        Optional weights passed on to ``np.bincount``. When the weights are
        two-dimensional, one row of counts is produced per row of weights.
    axis : int
        ``None`` flattens ``values`` before counting. ``-1`` counts each row
        of ``values`` separately when ``values`` is not a one-dimensional
        ``EncodedArray``.

    Returns
    -------
    EncodedCounts

    Raises
    ------
    ValueError
        If ``values`` is not a one-dimensional ``EncodedArray`` (or the
        weights are two-dimensional) and ``axis`` is not ``-1``.

    """
    # Convert once so that list-like weights also work for the ndim/dtype checks.
    weight_array = None if weights is None else np.asanyarray(weights)
    weights2d = weight_array is not None and weight_array.ndim == 2
    if axis is None:
        values = values.ravel()
    if hasattr(values.encoding, "get_alphabet"):
        alphabet = values.encoding.get_alphabet()
    else:
        alphabet = values.encoding.get_labels()
    n_labels = len(alphabet)
    if isinstance(values, EncodedArray) and len(values.shape) == 1 and not weights2d:
        # Unweighted long inputs are counted in fixed-size chunks that are
        # summed afterwards (presumably to limit temporary memory use).
        max_size = 1000000
        if len(values) > max_size and weights is None:
            counts = sum(np.bincount(values[i * max_size:(i + 1) * max_size], minlength=n_labels)
                         for i in range(len(values) // max_size + 1))
        else:
            counts = np.bincount(values, weights=weights, minlength=n_labels)
    elif axis == -1:
        if not weights2d:
            # One row of counts per row of values; the same weights are used for every row.
            counts = np.array([np.bincount(row, weights=weights, minlength=n_labels) for row in values])
        else:
            # One row of counts per row of weights, all over the same values.
            counts = np.array([np.bincount(values, weights=row, minlength=n_labels) for row in weight_array])
            # Weighted bincount yields floats; go back to integers unless the
            # weights themselves were floating point.
            if not np.issubdtype(counts.dtype, np.integer) and not np.issubdtype(weight_array.dtype, np.floating):
                counts = counts.astype(int)
    else:
        # Previously this fell through and failed with an UnboundLocalError.
        raise ValueError(
            f"count_encoded does not support axis={axis!r} for values of shape {values.shape}")

    return EncodedCounts(alphabet, counts)