# Source code for bionumpy.sequence.position_weight_matrix

import numpy as np
from typing import Dict
from numpy.typing import ArrayLike
import typing
# from import Motif
from .rollable import RollableFunction
from ..encoded_array import EncodedArray, EncodedRaggedArray, as_encoded_array
from ..util.typing import EncodedArrayLike
from ..encodings import AlphabetEncoding
from npstructures import RaggedArray

class PositionWeightMatrix(RollableFunction):
    """Adapter that exposes a PWM-like object as a ``RollableFunction``.

    The wrapped object does the actual scoring; this class only copies the
    attributes it reads from it (``_encoding`` and ``window_size``) and
    forwards calls to its ``calculate_score`` method.

    Parameters
    ----------
    pwm :
        Object providing ``_encoding``, ``window_size`` and
        ``calculate_score(sequence)`` (for instance a ``PWM``).
    """

    def __init__(self, pwm):
        self._pwm = pwm
        self._encoding = pwm._encoding
        self.window_size = pwm.window_size

    def __call__(self, sequence: EncodedArray) -> float:
        """Score ``sequence`` by delegating to the wrapped pwm.

        Parameters
        ----------
        sequence : EncodedArray
            Passed unchanged to ``calculate_score`` of the wrapped object.

        Returns
        -------
        float
            Whatever ``calculate_score`` returns for ``sequence``.
        """
        # The statements that used to follow this return were unreachable and
        # referenced attributes (`_matrix`, `_indices`) this class never sets.
        return self._pwm.calculate_score(sequence)

def _pwm_from_counts(count_matrix):
    """Turn a count matrix into log-probabilities with a pseudo count of 1.

    Parameters
    ----------
    count_matrix :
        Array of counts. One is added to every entry before normalising.

    Returns
    -------
    Array of the same shape holding ``log(p)``, where ``p`` is each smoothed
    count divided by the sum of its column (sum taken along axis 0).
    """
    smoothed = count_matrix + 1
    column_totals = smoothed.sum(axis=0, keepdims=True)
    return np.log(smoothed / column_totals)

class PWM:
    """
    Class representing a Position Weight Matrix. Calculates scores based on
    the log likelihood ratio between the motif and a background probability
    """

    def __init__(self, matrix, alphabet):
        """
        Parameters
        ----------
        matrix :
            Score matrix, indexed as ``matrix[letter_code, position]``. The
            length of its last axis is the window size of the motif.
        alphabet :
            Letters of the motif alphabet; row ``i`` of ``matrix`` is expected
            to hold the scores for ``alphabet[i]``.
        """
        self._matrix = matrix
        self._alphabet = alphabet
        self._encoding = AlphabetEncoding(alphabet)
        # Column index for every motif position, used for fancy indexing in
        # `calculate_score`.
        self._indices = np.arange(self.window_size)

    def as_valid_encoded_array(self, sequence):
        """Return ``sequence`` as an encoded array usable with this matrix.

        An already encoded array with an ``AlphabetEncoding`` is returned
        unchanged when its alphabet starts with this PWM's alphabet and none
        of its raw codes fall outside it. Anything else is passed to
        ``as_encoded_array`` together with this PWM's encoding.

        Raises
        ------
        Exception
            If the array is alphabet-encoded but its alphabet does not start
            with this PWM's alphabet, or it contains a code that has no row
            in the matrix.
        """
        if isinstance(sequence, (EncodedArray, EncodedRaggedArray)):
            if isinstance(sequence.encoding, AlphabetEncoding):
                alphabet = list(sequence.encoding.get_alphabet())
                s_alphabet = list(self._alphabet)
                if not alphabet[:len(self._alphabet)] == s_alphabet or np.max(sequence.raw()) >= len(self._alphabet):
                    raise Exception(f'Could not calculate pwm for alphabet {s_alphabet} on {alphabet} encoded array')
                else:
                    return sequence
        return as_encoded_array(sequence, self._encoding)

    @property
    def alphabet(self):
        """The alphabet this PWM was created with."""
        return self._alphabet

    @property
    def window_size(self):
        """Number of positions in the motif (length of the matrix' last axis)."""
        return self._matrix.shape[-1]

    def calculate_score(self, sequence: EncodedArrayLike) -> float:
        """Calculates the pwm score for a sequence of the same length as the motif

        Parameters
        ----------
        sequence : EncodedArrayLike
            Sequence(s) whose last axis has length ``window_size``.

        Returns
        -------
        float
            Sum over the last axis of the matrix entries selected by
            (letter code, position).
        """
        sequence = self.as_valid_encoded_array(sequence)
        assert sequence.shape[-1] == self.window_size
        scores = self._matrix[sequence.raw(), self._indices]
        return scores.sum(axis=-1)

    def calculate_scores(self, sequence: EncodedArrayLike) -> ArrayLike:
        """Calculate motif scores for an entire sequence

        Parameters
        ----------
        sequence : EncodedArrayLike

        Returns
        -------
        ArrayLike
            Motif scores for all valid and invalid windows. Entry ``i`` is the
            sum of the scores of the letters at ``i + offset`` for every motif
            offset that still lies inside the sequence, so the last
            ``window_size - 1`` entries cover incomplete windows.
        """
        sequence = self.as_valid_encoded_array(sequence)
        scores = np.zeros(sequence.size, dtype=float)
        # One row per motif position, each indexable by letter code.
        m = self._matrix.T.copy()
        for offset, row in enumerate(m):
            if offset >= scores.size:
                # The sequence is shorter than the motif: no letter is left at
                # this offset. Without this guard `scores.size - offset` turns
                # negative and the in-place add fails to broadcast.
                break
            scores[:scores.size - offset] += row[sequence[offset:].raw()]
        return scores

    @classmethod
    def from_dict(cls, dictionary: Dict[str, ArrayLike],
                  background: typing.Optional[Dict[str, float]] = None) -> "PWM":
        """Create a PWM object from a dict of letters to position probabilities

        This takes raw probabilities as input. Not log likelihood(ratios)

        Parameters
        ----------
        dictionary : Dict[str, ArrayLike]
            Mapping of alphabet letters to position probability scores
        background : Dict[str, float]
            Background probabilities. By default assume uniform probabilities

        Returns
        -------
        "PWM"
            Position Weight Matrix object with log-likelihood ratios
        """
        if background is None:
            background = {key: 1/len(dictionary) for key in dictionary}
        alphabet = "".join(dictionary.keys())
        # Zero probabilities are allowed: log(0) gives -inf, so silence the
        # divide-by-zero warning numpy would otherwise emit.
        with np.errstate(divide="ignore"):
            matrix = (np.log(np.array(list(dictionary.values())))
                      - np.log([background[key] for key in dictionary])[:, np.newaxis])
        return cls(matrix, alphabet)

    @classmethod
    def from_counts(cls, counts: Dict[str, ArrayLike]) -> "PWM":
        """Create a PWM from a dict of letters to per-position counts.

        The counts are converted with ``_pwm_from_counts`` and the alphabet is
        the concatenation of the dict keys, in dict order.

        Parameters
        ----------
        counts : Dict[str, ArrayLike]
            Mapping of alphabet letters to the counts at every position.
        """
        return cls(_pwm_from_counts(np.array(list(counts.values()))),
                   "".join(counts.keys()))

    def __str__(self):
        # This is the only __str__ now. An earlier, tab-separated variant in
        # the class body was shadowed by this definition and never ran.
        matrix = self._matrix.transpose()
        return "PWM with alphabet " + self._alphabet + "\n" + \
            '\n'.join([' '.join([str(round(c, 2)) for c in row]) for row in matrix])
def get_motif_scores_old(sequence: EncodedRaggedArray, pwm: PWM) -> RaggedArray:
    """Computes motif scores for a motif on a sequence (older implementation).

    Wraps ``pwm`` in a ``PositionWeightMatrix`` and applies its
    ``rolling_window`` method to ``sequence``.

    Parameters
    ----------
    sequence : EncodedRaggedArray
        The reads to score.
    pwm : PWM
        The motif to score with.

    Returns
    -------
    RaggedArray
        The result of ``rolling_window``; presumably one row per read with a
        score for every window position of that read.
    """
    return PositionWeightMatrix(pwm).rolling_window(sequence)
def get_motif_scores(sequence: EncodedRaggedArray, pwm: PWM) -> RaggedArray:
    """Computes motif scores for a motif on a sequence.
    Returns a RaggedArray with the score at each position in every read.

    Parameters
    ----------
    sequence : EncodedRaggedArray
        The reads to score; converted with ``as_encoded_array`` first.
    pwm : PWM
        The motif; its ``calculate_scores`` and ``window_size`` are used.

    Returns
    -------
    RaggedArray
        A numeric RaggedArray. Contains one row for every read with the scores
        for every position of that read. The last ``window_size - 1``
        positions of every row are dropped.

    Examples
    --------
    >>> import bionumpy as bnp
    >>> pwm = bnp.sequence.position_weight_matrix.PWM.from_dict({"A": [5, 1], "C": [1, 5], "G": [0, 0], "T": [0, 0]})
    >>> sequences = bnp.as_encoded_array(["ACTGAC", "CA", "GG"])
    >>> bnp.get_motif_scores(sequences, pwm)
    ragged_array([5.99146455 -inf -inf -inf 5.99146455]
    [2.77258872]
    [-inf])
    """
    sequence = as_encoded_array(sequence)
    flat_sequence, shape = (sequence.ravel(), sequence.shape)
    # Score the flattened data in one pass, then restore the row structure.
    scores = pwm.calculate_scores(flat_sequence)
    if isinstance(sequence, EncodedRaggedArray):
        scores = RaggedArray(scores, shape[-1])
    n_incomplete = pwm.window_size - 1
    if n_incomplete == 0:
        # With a single-column motif nothing has to be trimmed. Slicing with
        # an end of -0 == 0 would wrongly return an empty result.
        return scores
    return scores[..., :-n_incomplete]