[med-svn] [Git][med-team/fastaq][upstream] New upstream version 3.17.0

Sascha Steinbiss gitlab at salsa.debian.org
Thu Feb 22 10:06:08 UTC 2018


Sascha Steinbiss pushed to branch upstream at Debian Med / fastaq


Commits:
e4dd7b7f by Sascha Steinbiss at 2018-02-22T10:22:16+01:00
New upstream version 3.17.0
- - - - -


3 changed files:

- pyfastaq/sequences.py
- pyfastaq/tests/sequences_test.py
- setup.py


Changes:

=====================================
pyfastaq/sequences.py
=====================================
--- a/pyfastaq/sequences.py
+++ b/pyfastaq/sequences.py
@@ -1,9 +1,8 @@
 import copy
 import re
-import string
 import random
 import itertools
-
+from collections import Counter
 from pyfastaq import utils, intervals, genetic_codes
 
 class Error (Exception): pass
@@ -465,6 +464,47 @@ class Fasta:
         '''Returns a Fasta sequence, translated into amino acids. Starts translating from 'frame', where frame expected to be 0,1 or 2'''
         return Fasta(self.id, ''.join([genetic_codes.codes[genetic_code].get(self.seq[x:x+3].upper(), 'X') for x in range(frame, len(self)-1-frame, 3)]))
 
+    def gc_content(self, as_decimal=True):
+        """Returns the GC content for the sequence.
+        Notes:
+            This method ignores N when calculating the length of the sequence.
+            It does not, however, ignore other ambiguous bases, and the only
+            ambiguous base counted as GC is S (G or C). In this sense the
+            method is conservative with its calculation.
+
+        Args:
+            as_decimal (bool): Return the result as a decimal fraction. Setting
+            it to False returns a percentage instead, e.g. for the sequence
+            GCAT it returns 0.5 by default and 50.0 if set to False.
+
+        Returns:
+            float: GC content calculated as the number of G, C, and S divided
+            by the number of (non-N) bases (length).
+
+        """
+        gc_total = 0.0
+        num_bases = 0.0
+        n_tuple = tuple('nN')
+        accepted_bases = tuple('cCgGsS')
+
+        # Counter tallies each distinct character; both cases are listed in the tuples above.
+        for base, count in Counter(self.seq).items():
+
+            # don't count N in the number of bases
+            if base not in n_tuple:
+                num_bases += count
+
+                if base in accepted_bases:  # S is a G or C
+                    gc_total += count
+
+        gc_content = gc_total / num_bases
+
+        if not as_decimal:  # return as percentage
+            gc_content *= 100
+
+        return gc_content
+
+
 
 class Embl(Fasta):
     '''Exactly the same as Fasta, but reading seqs from a file works differently'''


=====================================
pyfastaq/tests/sequences_test.py
=====================================
--- a/pyfastaq/tests/sequences_test.py
+++ b/pyfastaq/tests/sequences_test.py
@@ -520,6 +520,19 @@ class TestFasta(unittest.TestCase):
             fa = sequences.Fasta('name', 'A')
             fa.split_capillary_id()
 
+    def test_gc_content(self):
+        """Test GC content calculation works as expected"""
+        tests = [
+            (sequences.Fasta('ID', 'cgCG'), 1.0),
+            (sequences.Fasta('ID', 'tTaA'), 0.0),
+            (sequences.Fasta('ID', 'GCAT'), 0.5),
+            (sequences.Fasta('ID', 'GCATNN'), 0.5),
+            (sequences.Fasta('ID', 'GCATNNS'), 0.6),
+            (sequences.Fasta('ID', 'GCATNNSK'), 0.5)
+        ]
+        for test, answer in tests:
+            self.assertAlmostEqual(test.gc_content(), answer)
+            self.assertAlmostEqual(test.gc_content(as_decimal=False), answer * 100)
 
 class TestEmbl(unittest.TestCase):
     def test_get_id_from_header_line(self):


=====================================
setup.py
=====================================
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@ from setuptools import setup, find_packages
 
 setup(
     name='pyfastaq',
-    version='3.16.0',
+    version='3.17.0',
     description='Script to manipulate FASTA and FASTQ files, plus API for developers',
     packages = find_packages(),
     author='Martin Hunt',



View it on GitLab: https://salsa.debian.org/med-team/fastaq/commit/e4dd7b7f6d4f01fc0b193e609338be560d7e3bb8

---
View it on GitLab: https://salsa.debian.org/med-team/fastaq/commit/e4dd7b7f6d4f01fc0b193e609338be560d7e3bb8
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.alioth.debian.org/pipermail/debian-med-commit/attachments/20180222/e17ff1f1/attachment-0001.html>


More information about the debian-med-commit mailing list