Functions that use Biopython objects together with tests.
Download original file: 6_functions_with_tests.ipynb
View original file in nbviewer: 6_functions_with_tests.ipynb
# These are the objects our code uses.
# This cell must be executed first, before the others.
from Bio import SeqIO
from Bio.Alphabet import IUPAC, Alphabet
from Bio.Data import IUPACData
from Bio.Seq import Seq
import random
IUPAC.ambiguous_rna.letters
IUPACData.ambiguous_rna_values
IUPAC.unambiguous_dna.letters
'GATC'
my_seq = Seq('AAAAGG', IUPAC.unambiguous_dna)
my_seq.translate()
Seq('KR', IUPACProtein())
IUPAC.ambiguous_rna.letters
IUPACData.unambiguous_rna_weights
{'G': 379.0, 'A': 363.0, 'C': 339.0, 'U': 340.0}
# Note that this function needed to be changed
# to accomodate the creation of proper Seq objects
# in the functions below. It now returns a proper
# Alphabet object from Biopython and our code needs
# to use .letters explicitly
def select_alphabet(alphabet):
''' Returns a default alphabet.
'''
a = alphabet
if a is None:
a = IUPAC.unambiguous_dna
if not hasattr(a, 'letters'):
a = Alphabet()
a.letters = alphabet
return a
def validate_sequence(sequence, alphabet=None, method='A'):
"""Return True if the string sequence contains
only letters from the supplied alphabet, otherwise False.
Default alphabet is IUPAC.unambiguous_dna"""
alphabet = select_alphabet(alphabet)
if method == 'A':
for i in sequence:
if i not in alphabet.letters:
return False
return True
elif method == 'B':
return set(sequence.upper()) - set(alphabet.letters.upper()) == set()
def test_validate_sequence():
vs = validate_sequence
for method in ['A', 'B']:
assert vs('AAA', 'ATG', method)
assert vs('AAA', 'T', method) == False
assert vs('ATCG', IUPAC.unambiguous_dna.letters, method)
assert vs('AT', 'A', method) == False
assert vs('', 'A')
print('Test OK.')
test_validate_sequence()
Test OK.
import random
def random_base(alphabet=None, method='A'):
"""Returns a random base from the supplied alphabet.
Default alphabet is IUPAC.unambiguous_dna"""
alphabet = select_alphabet(alphabet)
if method == 'A':
# This method is better because it returns a proper
# Biopython sequence object.
return Seq(random.choice(alphabet.letters), alphabet)
if method == 'B':
return random.choice(alphabet.letters)
def test_random_base():
for i in range(6000):
for method in ['A', 'B']:
rb = random_base(IUPAC.unambiguous_rna)
assert validate_sequence(rb, IUPAC.unambiguous_rna)
print('Test OK.')
test_random_base()
Test OK.
def random_codon(alphabet=None, method='A'):
"""Returns a random codon from the supplied alphabet.
Default alphabet is IUPAC.unambiguous_dna"""
if method == 'A':
return random_base(alphabet) + \
random_base(alphabet) + \
random_base(alphabet)
if method == 'B':
return ''.join([random_base(alphabet, method) for x in range(3)])
def test_random_codon():
for i in range(6000):
for method in ['A', 'B']:
rc = random_codon(IUPAC.unambiguous_rna, method)
assert validate_sequence(rc, IUPAC.unambiguous_rna)
if method == 'A':
assert type(rc) == type(Seq('A'))
print('Test OK.')
test_random_codon()
Test OK.
def replace_base_randomly(sequence, alphabet=None, method='A'):
"""Returns sequence with one base randomly mutated
to one of the letters from the supplied alphabet.
Default alphabet is IUPAC.unambiguous_dna"""
alphabet = select_alphabet(alphabet)
if len(sequence) == 0:
raise Exception ("Error: No sequence detected!", sequence)
elif len(set(alphabet.letters.upper())) == 1 and \
len(set(sequence.upper())) == 1:
raise Exception ("Error: Mutation not possible!", sequence, alphabet.letters)
pos = random.randrange(0, len(sequence))
removed_base = sequence[pos]
if method == 'A':
letters_subset = ''
for base in alphabet.letters:
if base != removed_base.upper() and base != removed_base.lower():
letters_subset += base
elif method == 'B':
letters_subset = ''.join((b for b in alphabet.letters
if base != removed_base.upper() and
base != removed_base.lower()))
if method in ['A', 'B']:
return sequence[:pos] + random.choice(letters_subset) + sequence[pos+1:]
if method == 'C':
seq_list = [r_base if i == r_index else b for i, b in enumerate(sequence)]
def replace_base_randomly(sequence, alphabet=None, method='A'):
"""Returns sequence with one base randomly mutated
to one of the letters from the supplied alphabet.
Default alphabet is IUPAC.unambiguous_dna"""
alphabet = select_alphabet(alphabet)
if len(sequence) == 0:
raise Exception ("Error: No sequence detected!", sequence)
elif len(set(alphabet.letters.upper())) == 1 and \
len(set(sequence.upper())) == 1:
raise Exception ("Error: Mutation not possible!", sequence, alphabet.letters)
pos = random.randrange(0, len(sequence))
removed_base = sequence[pos]
if method == 'A':
letters_subset = ''
for base in alphabet.letters:
if base != removed_base.upper() and base != removed_base.lower():
letters_subset += base
elif method == 'B':
letters_subset = ''.join((b for b in alphabet.letters
if b != removed_base.upper() and
b != removed_base.lower()))
elif method == 'C':
letters_subset = ''.join(alphabet.letters.split(removed_base))
return sequence[:pos] + random.choice(letters_subset) + sequence[pos+1:]
def test_replace_base_randomly():
for method in ['A', 'B', 'C']:
try:
replace_base_randomly("aAaAaAaA", "Aa", method=method)
except Exception as e:
assert e.args == ('Error: Mutation not possible!',
'aAaAaAaA', 'Aa')
for i in range(6000):
random_sequence = random_codon('atcg') + \
random_codon('ATCG') + \
random_codon('atcg')
new_seq = replace_base_randomly(random_sequence, method=method)
#print('Old: {} New: {}'.format(random_sequence, new_seq))
assert new_seq.upper() != random_sequence.upper()
N_differences = 0
for index, base in enumerate(new_seq):
if base != random_sequence[index]:
N_differences += 1
assert N_differences == 1, N_differences
print('Test OK.')
test_replace_base_randomly()
Test OK.
def sequence_content_fraction(sequence, letters, alphabet=None, method='A'):
"""Return the percentage of characters in sequence that are
part of letters. Checks if both the sequence and letters are
part of the supplied alphabet. Default alphabet is IUPAC.unambiguous_dna"""
if not validate_sequence(sequence, alphabet):
raise Exception('The sequence you supplied must only contain '
'letters from the alphabet supplied.', alphabet)
if not validate_sequence(letters, alphabet):
raise Exception('The letters you supplied must only contain '
'characters from the alphabet supplied.',
alphabet, letters)
if method == 'A':
n = 0
for letter in set(letters):
for char in sequence:
if letter == char:
n += 1
return 100* n / float(len(sequence))
if method == 'B':
return_value = 0
for letter in set(letters):
return_value += 100*list(sequence).count(letter) / len(sequence)
return return_value
if method == 'C':
from collections import Counter as CC
return 100 * sum(bc/len(sequence) for b, bc in CC(sequence).items() if b in letters)
def test_sequence_content_fraction():
f_A = sequence_content_fraction('AAABBBCCC', 'BC', 'ABC', method='A')
f_B = sequence_content_fraction('AAABBBCCC', 'BC', 'ABC', method='B')
f_C = sequence_content_fraction('AAABBBCCC', 'BC', 'ABC', method='C')
# This is how you should check if two floats have the same
# (the values must be close, but not exactly the same)
assert abs(f_A - f_B) < 1e-12
assert abs(f_A - f_C) < 1e-12
print('Test OK.')
test_sequence_content_fraction()
Test OK.