Functions that use Biopython objects together with tests.

Download original file: 6_functions_with_tests.ipynb

View original file in nbviewer: 6_functions_with_tests.ipynb

# These are the objects our code uses.
# This cell must be executed first, before the others.
from Bio import SeqIO
from Bio.Alphabet import IUPAC, Alphabet
from Bio.Data import IUPACData
from Bio.Seq import Seq

import random


IUPAC.ambiguous_rna.letters
IUPACData.ambiguous_rna_values
IUPAC.unambiguous_dna.letters




'GATC'




my_seq = Seq('AAAAGG', IUPAC.unambiguous_dna)
my_seq.translate()




Seq('KR', IUPACProtein())




IUPAC.ambiguous_rna.letters
IUPACData.unambiguous_rna_weights




{'G': 379.0, 'A': 363.0, 'C': 339.0, 'U': 340.0}




# Note that this function needed to be changed 
# to accomodate the creation of proper Seq objects 
# in the functions below.  It now returns a proper 
# Alphabet object from Biopython and our code needs 
# to use .letters explicitly

def select_alphabet(alphabet):
    ''' Returns a default alphabet.
    '''
    a = alphabet

    if a is None:
        a = IUPAC.unambiguous_dna

    if not hasattr(a, 'letters'):
        a = Alphabet()
        a.letters = alphabet

    return a

def validate_sequence(sequence, alphabet=None, method='A'):
    """Return True if the string sequence contains 
    only letters from the supplied alphabet, otherwise False. 
    Default alphabet is IUPAC.unambiguous_dna"""
    alphabet = select_alphabet(alphabet)

    if method == 'A':
        for i in sequence:
            if i not in alphabet.letters:
                return False
        return True

    elif method == 'B':
        return set(sequence.upper()) - set(alphabet.letters.upper()) == set()

def test_validate_sequence():
    vs = validate_sequence
    for method in ['A', 'B']:
        assert vs('AAA', 'ATG', method)
        assert vs('AAA', 'T', method) == False
        assert vs('ATCG', IUPAC.unambiguous_dna.letters, method)
        assert vs('AT', 'A', method) == False
        assert vs('', 'A')
    print('Test OK.')

test_validate_sequence()

Test OK.



import random
def random_base(alphabet=None, method='A'):
    """Returns a random base from the supplied alphabet. 
    Default alphabet is IUPAC.unambiguous_dna"""
    alphabet = select_alphabet(alphabet)

    if method == 'A':
        # This method is better because it returns a proper 
        # Biopython sequence object. 
        return Seq(random.choice(alphabet.letters), alphabet)

    if method == 'B':
        return random.choice(alphabet.letters)

def test_random_base():
    for i in range(6000):
        for method in ['A', 'B']:
            rb = random_base(IUPAC.unambiguous_rna)
            assert validate_sequence(rb, IUPAC.unambiguous_rna)
    print('Test OK.')
test_random_base()

Test OK.



def random_codon(alphabet=None, method='A'):
    """Returns a random codon from the supplied alphabet. 
    Default alphabet is IUPAC.unambiguous_dna"""

    if method == 'A':
        return random_base(alphabet) + \
               random_base(alphabet) + \
               random_base(alphabet)

    if method == 'B':
        return ''.join([random_base(alphabet, method) for x in range(3)])

def test_random_codon():
    for i in range(6000):
        for method in ['A', 'B']:
            rc = random_codon(IUPAC.unambiguous_rna, method)
            assert validate_sequence(rc, IUPAC.unambiguous_rna)

            if method == 'A':
                assert type(rc) == type(Seq('A'))
    print('Test OK.')

test_random_codon()

Test OK.



def replace_base_randomly(sequence, alphabet=None, method='A'):
    """Returns sequence with one base randomly mutated 
    to one of the letters from the supplied alphabet. 
    Default alphabet is IUPAC.unambiguous_dna"""
    alphabet = select_alphabet(alphabet)

    if len(sequence) == 0:
        raise Exception ("Error: No sequence detected!", sequence)

    elif len(set(alphabet.letters.upper())) == 1 and \
         len(set(sequence.upper())) == 1:
        raise Exception ("Error: Mutation not possible!", sequence, alphabet.letters)

    pos = random.randrange(0, len(sequence))
    removed_base = sequence[pos]

    if method == 'A':
        letters_subset = ''
        for base in alphabet.letters:
            if base != removed_base.upper() and base != removed_base.lower():
                letters_subset += base

    elif method == 'B':
        letters_subset = ''.join((b for b in alphabet.letters 
                                  if base != removed_base.upper() and 
                                     base != removed_base.lower()))

    if method in ['A', 'B']:
        return sequence[:pos] + random.choice(letters_subset) + sequence[pos+1:]

    if method == 'C':
        seq_list = [r_base if i == r_index else b for i, b in enumerate(sequence)]


def replace_base_randomly(sequence, alphabet=None, method='A'):
    """Returns sequence with one base randomly mutated 
    to one of the letters from the supplied alphabet. 
    Default alphabet is IUPAC.unambiguous_dna"""
    alphabet = select_alphabet(alphabet)

    if len(sequence) == 0:
        raise Exception ("Error: No sequence detected!", sequence)

    elif len(set(alphabet.letters.upper())) == 1 and \
         len(set(sequence.upper())) == 1:
        raise Exception ("Error: Mutation not possible!", sequence, alphabet.letters)

    pos = random.randrange(0, len(sequence))
    removed_base = sequence[pos]

    if method == 'A':
        letters_subset = ''
        for base in alphabet.letters:
            if base != removed_base.upper() and base != removed_base.lower():
                letters_subset += base

    elif method == 'B':
        letters_subset = ''.join((b for b in alphabet.letters 
                                  if b != removed_base.upper() and 
                                     b != removed_base.lower()))  
    elif method == 'C':  
        letters_subset = ''.join(alphabet.letters.split(removed_base))

    return sequence[:pos] + random.choice(letters_subset) + sequence[pos+1:]

def test_replace_base_randomly():

    for method in ['A', 'B', 'C']:
        try:
            replace_base_randomly("aAaAaAaA", "Aa", method=method)
        except Exception as e:
            assert e.args == ('Error: Mutation not possible!',
                              'aAaAaAaA', 'Aa')

        for i in range(6000):
            random_sequence = random_codon('atcg') + \
                              random_codon('ATCG') + \
                              random_codon('atcg')
            new_seq = replace_base_randomly(random_sequence, method=method)

            #print('Old: {}  New: {}'.format(random_sequence, new_seq))   
            assert new_seq.upper() != random_sequence.upper()

            N_differences = 0
            for index, base in enumerate(new_seq):
                if base != random_sequence[index]:
                    N_differences += 1
            assert N_differences == 1, N_differences

    print('Test OK.')

test_replace_base_randomly()

Test OK.



def sequence_content_fraction(sequence, letters, alphabet=None, method='A'):
    """Return the percentage of characters in sequence that are 
    part of letters. Checks if both the sequence and letters are 
    part of the supplied alphabet. Default alphabet is IUPAC.unambiguous_dna"""

    if not validate_sequence(sequence, alphabet):
        raise Exception('The sequence you supplied must only contain '
                        'letters from the alphabet supplied.', alphabet)

    if not validate_sequence(letters, alphabet):
        raise Exception('The letters you supplied must only contain '
                        'characters from the alphabet supplied.', 
                        alphabet, letters)
    if method == 'A':
        n = 0
        for letter in set(letters):
            for char in sequence:
                if letter == char:
                    n += 1
        return 100* n / float(len(sequence))

    if method == 'B':
        return_value = 0
        for letter in set(letters):
            return_value += 100*list(sequence).count(letter) / len(sequence)
        return return_value

    if method == 'C':
        from collections import Counter as CC
        return 100 * sum(bc/len(sequence) for b, bc in CC(sequence).items() if b in letters)

def test_sequence_content_fraction():
    f_A = sequence_content_fraction('AAABBBCCC', 'BC', 'ABC', method='A')
    f_B = sequence_content_fraction('AAABBBCCC', 'BC', 'ABC', method='B')
    f_C = sequence_content_fraction('AAABBBCCC', 'BC', 'ABC', method='C')

    # This is how you should check if two floats have the same 
    # (the values must be close, but not exactly the same)
    assert abs(f_A - f_B) < 1e-12
    assert abs(f_A - f_C) < 1e-12
    print('Test OK.')

test_sequence_content_fraction()

Test OK.