Source code for bionumpy.sequence.position_weight_matrix

import numpy as np
from typing import Dict
from numpy.typing import ArrayLike
import typing
# from ..io.motifs import Motif
from .rollable import RollableFunction
from ..encoded_array import EncodedArray, EncodedRaggedArray, as_encoded_array
from ..util.typing import EncodedArrayLike
from ..encodings import AlphabetEncoding
from npstructures import RaggedArray


class PositionWeightMatrix(RollableFunction):
    def __init__(self, pwm):
        self._pwm = pwm
        self._encoding = pwm._encoding
        self.window_size = pwm.window_size

    def __call__(self, sequence: EncodedArray) -> float:
        return self._pwm.calculate_score(sequence)

        if self._encoding is not None:
            sequence = as_encoded_array(sequence, self._encoding).raw()
        scores = self._matrix[sequence, self._indices]
        return scores.sum(axis=-1)


def _pwm_from_counts(count_matrix):
    with_pseudo = count_matrix+1
    return np.log(with_pseudo/with_pseudo.sum(axis=0, keepdims=True))



[docs]
class PWM:
    """
    Class representing a Position Weight Matrix. Calculates scores based on
    the log likelihood ratio between the motif and a background probability
    """
    def __init__(self, matrix, alphabet):
        self._matrix = matrix
        self._alphabet = alphabet
        self._encoding = AlphabetEncoding(alphabet)
        self._indices = np.arange(self.window_size)

    def as_valid_encoded_array(self, sequence):
        if isinstance(sequence, (EncodedArray, EncodedRaggedArray)):
            
            if isinstance(sequence.encoding, AlphabetEncoding):
                alphabet = list((sequence.encoding.get_alphabet()))
                s_alphabet = list(self._alphabet)
                if not alphabet[:len(self._alphabet)] == s_alphabet or np.max(sequence.raw()) >= len(self._alphabet):
                    raise Exception(f'Could not calculate pwm for alphabet {s_alphabet} on {alphabet} encoded array')
                else:
                    return sequence
        return as_encoded_array(sequence, self._encoding)

        

    @property
    def alphabet(self):
        return self._alphabet

    def __str__(self):
        return "\n".join(["\t".join(self._alphabet), str(self._matrix)])

    @property
    def window_size(self):
        return self._matrix.shape[-1]


[docs]
    def calculate_score(self, sequence: EncodedArrayLike) -> float:
        """Calculates the pwm score for a sequence of the same length as the motif

        Parameters
        ----------
        sequence : EncodedArrayLike

        """
        sequence = self.as_valid_encoded_array(sequence)
        # sequence = as_encoded_array(sequence, self._encoding)
        # assert sequence.encoding == self._encoding
        assert sequence.shape[-1] == self.window_size
        scores = self._matrix[sequence.raw(), self._indices]
        return scores.sum(axis=-1)



[docs]
    def calculate_scores(self, sequence: EncodedArrayLike) -> ArrayLike:
        """Calculate motif scores for an entire sequence

        Parameters
        ----------
        sequence : EncodedArrayLike

        Returns
        -------
        ArrayLike
            Motif scores for all valid and invalid windows
        """
        sequence = self.as_valid_encoded_array(sequence)
        # sequence = as_encoded_array(sequence, self._encoding)
        # assert sequence.encoding == self._encoding
        scores = np.zeros(sequence.size, dtype=float)
        m = self._matrix.T.copy()
        for offset, row in enumerate(m):
            scores[:scores.size-offset] += row[sequence[offset:].raw()]
        return scores



[docs]
    @classmethod
    def from_dict(cls, dictionary: Dict[str, ArrayLike], background: Dict[str, float]=None) -> "PWM":
        """Create a PWM object from a dict of letters to position probabilities

        This takes raw probabilities as input. Not log
        likelihood(ratios)

        Parameters
        ----------
        cls :
        dictionary : Dict[str, ArrayLike]
            Mapping of alphabet letters to position probability scores
        background : Dict[str, float]
            Background probabilities. By default assume uniform
            probabilities

        Returns
        -------
        "PWM"
            Position Weight Matrix object with log-likelihood ratios
        """
        if background is None:
            background = {key: 1/len(dictionary) for key in dictionary}
        alphabet = "".join(dictionary.keys())
        with np.errstate(divide="ignore"):
            matrix = np.log(np.array(list(dictionary.values())))-np.log([background[key] for key in dictionary])[:, np.newaxis]
        return cls(matrix, alphabet)


    # @classmethod
    # def from_motif(cls, motif: Motif):
    #     return cls(motif.matrix, motif.alphabet)

    @classmethod
    def from_counts(cls, counts: typing.Union[dict]):
        # if isinstance(counts, Motif):
        #     return cls(_pwm_from_counts(counts.matrix), counts.alphabet)
        # else:
        return cls(_pwm_from_counts(np.array(list(counts.values()))), 
                   "".join(counts.keys()))

    def __str__(self):
        matrix = self._matrix.transpose()
        return "PWM with alphabet " + self._alphabet + "\n" + \
            '\n'.join([' '.join([str(round(c, 2)) for c in row]) for row in matrix])




def get_motif_scores_old(sequence: EncodedRaggedArray, pwm: PWM) -> RaggedArray:
    """Computes motif scores for a motif on a sequence.
    Returns a RaggedArray with the score at each position in every read.

    Parameters
    ----------
    sequence: EncodedRaggedArray
    motif: PositionWeightMatrix

    Returns
    -------
    RaggedArray
        A numeric RaggedArray. Contains one row for every read
        with the scores for every position of that read.

    Examples
    --------

    """
    pwm = PositionWeightMatrix(pwm)
    return pwm.rolling_window(sequence)



[docs]
def get_motif_scores(sequence: EncodedRaggedArray, pwm: PWM) -> RaggedArray:
    """Computes motif scores for a motif on a sequence.
    Returns a RaggedArray with the score at each position in every read.

    Parameters
    ----------
    sequence: EncodedRaggedArray
    motif: PositionWeightMatrix

    Returns
    -------
    RaggedArray
        A numeric RaggedArray. Contains one row for every read
        with the scores for every position of that read.

    Examples
    --------
    >>> import bionumpy as bnp
    >>> pwm = bnp.sequence.position_weight_matrix.PWM.from_dict({"A": [5, 1], "C": [1, 5], "G": [0, 0], "T": [0, 0]})
    >>> sequences = bnp.as_encoded_array(["ACTGAC", "CA", "GG"])
    >>> bnp.get_motif_scores(sequences, pwm)
    ragged_array([5.99146455       -inf       -inf       -inf 5.99146455]
    [2.77258872]
    [-inf])
    """
    sequence = as_encoded_array(sequence)
    flat_sequence, shape = (sequence.ravel(), sequence.shape)
    scores = pwm.calculate_scores(flat_sequence)
    if isinstance(sequence, EncodedRaggedArray):
        scores = RaggedArray(scores, shape[-1])
    return scores[..., :(-pwm.window_size+1)]