Computing GC content inside genes

This example shows how a classic bioinformatics task like computing GC content inside genes can be performed intuitively and efficiently using BioNumPy.

It combines two main workhorses of BioNumPy: Sequence arrays for DNA, and Interval arrays for genomic locations.

Sequence arrays are read in from fasta files. Interval arrays are read in from bed files.

The crux of the example is that the gene parts of a DNA sequence are extracted by a single, intuitive indexing operation, simply: selected_seq = sequence[intervals].

The count of a given nulceotide nn inside these gene regions is then computed simply by (selected_seq == nn).sum().

Finally, to compute GC content outside genes, one first computes the regions outside genes by inverting a mask of gene regions: ~get_boolean_mask(genes, len(chr1_sequence))

A first version of the example reads in the full sequence and interval data from fasta and bed files that only contain data for a single chromosome: bnp.open(seq_fn).read():

from typing import Tuple

import bionumpy as bnp
from bionumpy.arithmetics import get_boolean_mask


def analyze_within_chromosome(seq_filename: str, genes_filename: str):
    chr1 = bnp.open(seq_filename).read()[0]
    genes = bnp.open(genes_filename).read()
    print("Gene-regions: ", genes)
    print("Fasta: ", chr1)
    gc_inside, gc_outside = gc_inside_and_outside(chr1.sequence, genes)
    print(f"GC inside: {gc_inside:.2f}, GC outside: {gc_outside:.2f}")
    return gc_inside, gc_outside


def gc_inside_and_outside(chr1_sequence: bnp.EncodedArray, genes: bnp.Interval)-> Tuple[int, int]:
    gc_inside = get_gc_content(chr1_sequence, genes)
    gc_outside = get_gc_content(chr1_sequence, ~get_boolean_mask(genes, len(chr1_sequence)))
    return gc_inside, gc_outside


def get_gc_content(sequence, intervals):
    selected_seq = sequence[intervals]
    nn_counts = {nn: (selected_seq == nn).sum() for nn in "ACTG"}
    gc_count = sum([nn_counts[nn] for nn in "GC"])
    return gc_count / sum(nn_counts.values())

if __name__ == '__main__':
    result = analyze_within_chromosome("example_data/gc_test_onechr.fa", "example_data/gc_bedtest_onechr.bed")
    assert result == (0.6, 0.2)

A second version still reads in the full data by the read function, but adds in how to handle fasta and bed files containing data for multiple chromosomes by splitting the interval data based on the groupby function, and accessing corresponding entries from the read in sequence data:

import bionumpy as bnp


def analyze_across_chromosome(seq_fn, genes_fn):
    all_sequence = bnp.open(seq_fn).read()
    all_genes = bnp.open(genes_fn).read()
    genes_by_chr = bnp.groupby(all_genes, "chromosome")
    nn_counts = {nn:0 for nn in "ACTG"}
    for i,(chr, genes) in enumerate(genes_by_chr):
        selected_seq = all_sequence[i].sequence[genes]
        assert str(all_sequence[i].name) == chr
        for nn in "ACTG":
            nn_counts[nn] += (selected_seq == nn).sum()
    gc_content = sum([nn_counts[nn] for nn in "GC"]) / sum(nn_counts.values())
    print(f"GC-content inside genes: {gc_content:.2f}", )

if __name__ == '__main__':
    analyze_across_chromosome("example_data/gc_test_multichr.fa", "example_data/gc_bedtest_multichr.bed")

A third version shows a more scalable version of the same code, which uses the read_chunks function (see Reading files) with the fasta file to get a generator that reads in a single chromosome sequence at a time, allowing it to handle full-genome data with limited memory footprint:

import bionumpy as bnp


def analyze_across_chromosome(seq_fn, genes_fn):
    all_sequence = bnp.open(seq_fn).read_chunks(50)
    all_genes = bnp.open(genes_fn).read()
    genes_by_chr = bnp.groupby(all_genes, "chromosome")
    nn_counts = {nn:0 for nn in "ACTG"}
    for seq, (chr, genes) in zip(all_sequence, genes_by_chr):
        assert str(seq.name[0]) == chr, (seq, seq.name, chr)
        selected_seq = seq.sequence[0][genes]
        for nn in "ACTG":
            nn_counts[nn] += (selected_seq == nn).sum()
    gc_content = sum([nn_counts[nn] for nn in "GC"]) / sum(nn_counts.values())
    print(f"GC-content inside genes: {gc_content:.2f}", )


if __name__ == '__main__':
    analyze_across_chromosome("example_data/gc_test_multichr.fa", "example_data/gc_bedtest_multichr.bed")

The fourth version shows how to use the Genome functionality to conveniently calculate the GC content:

import bionumpy as bnp


def gc_genomic(sequence_filename: str, genes_filename: str) -> float:
    genome = bnp.Genome.from_file(sequence_filename)
    gene_mask = genome.read_intervals(genes_filename).get_mask()
    sequence = genome.read_sequence()
    gene_sequences = sequence[gene_mask]
    counts = bnp.count_encoded(gene_sequences, axis=None)
    gc_content = (counts["G"] + counts["C"]) / (gene_sequences.size - counts["N"])
    return gc_content


def test():
    assert gc_genomic("example_data/gc_test_multichr.fa", "example_data/gc_bedtest_multichr.bed") == 1 / 3


if __name__ == '__main__':
    import sys

    print(f'GC content {gc_genomic(*sys.argv[1:])}')