[med-svn] [pbgenomicconsensus] 01/03: Imported Upstream version 1.1.0

Wed Nov 25 21:07:38 UTC 2015

This is an automated email from the git hooks/post-receive script.

afif-guest pushed a commit to branch master
in repository pbgenomicconsensus.

commit 2e74dad9f58be74db8e92ad6935331b845bb2af5
Author: Afif Elghraoui <afif at ghraoui.name>
Date:   Tue Nov 24 23:44:29 2015 -0800

    Imported Upstream version 1.1.0
---
 CHANGELOG                           |  3 +++
 GenomicConsensus/ResultCollector.py |  2 +-
 GenomicConsensus/__init__.py        |  2 +-
 GenomicConsensus/io/utils.py        | 17 +++++------------
 GenomicConsensus/main.py            | 27 ++++++++++++++-------------
 GenomicConsensus/options.py         |  9 +++++++++
 GenomicConsensus/quiver/model.py    |  2 +-
 GenomicConsensus/quiver/quiver.py   |  2 +-
 GenomicConsensus/reference.py       | 30 +++++++++++++++++++++---------
 9 files changed, 56 insertions(+), 38 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 8ee9baa..b6eed56 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,6 @@
+Version 1.1.0
+  * Working support for DataSet read and reference files
+
 Version 1.0.0
   * Working support for BAM files adhering to our BAM spec (version 3.0b6)
 
diff --git a/GenomicConsensus/ResultCollector.py b/GenomicConsensus/ResultCollector.py
index e6d18e9..41f26ba 100644
--- a/GenomicConsensus/ResultCollector.py
+++ b/GenomicConsensus/ResultCollector.py
@@ -139,7 +139,7 @@ class ResultCollector(object):
                 if (s == 0) and (e == refEntry.length):
                     spanName = refName
                 else:
-                    spanName = refName + ":%d-%d" % (s, e)
+                    spanName = refName + "_%d_%d" % (s, e)
                 cssName = consensus.consensusContigName(spanName,
                                                         options.algorithm)
                 # Gather just the chunks pertaining to this span
diff --git a/GenomicConsensus/__init__.py b/GenomicConsensus/__init__.py
index 92a44d5..abed5b6 100644
--- a/GenomicConsensus/__init__.py
+++ b/GenomicConsensus/__init__.py
@@ -30,4 +30,4 @@
 
 # Author: David Alexander
 
-__VERSION__ = "1.0.0"
+__VERSION__ = "1.1.0"
diff --git a/GenomicConsensus/io/utils.py b/GenomicConsensus/io/utils.py
index 10b5d85..8e9928e 100644
--- a/GenomicConsensus/io/utils.py
+++ b/GenomicConsensus/io/utils.py
@@ -33,7 +33,7 @@
 __all__ = ["loadCmpH5", "loadBam"]
 
 import h5py, os.path
-from pbcore.io import CmpH5Reader, BamReader
+from pbcore.io import AlignmentSet
 
 
 def loadCmpH5(filename, referenceFname, disableChunkCache=False):
@@ -41,17 +41,10 @@ def loadCmpH5(filename, referenceFname, disableChunkCache=False):
     Get a CmpH5Reader object, disabling the chunk cache if requested.
     """
     filename = os.path.abspath(os.path.expanduser(filename))
-    if not disableChunkCache:
-        file = h5py.File(filename, "r")
-    else:
-        propfaid = h5py.h5p.create(h5py.h5p.FILE_ACCESS)
-        propfaid.set_cache(0, 0, 0, 0)
-        fid = h5py.h5f.open(filename,
-                            flags=h5py.h5f.ACC_RDONLY,
-                            fapl=propfaid)
-        file = h5py.File(fid)
-    return CmpH5Reader(file)
+    return AlignmentSet(filename)
 
 def loadBam(filename, referenceFname):
     filename = os.path.abspath(os.path.expanduser(filename))
-    return BamReader(filename, referenceFname)
+    aln = AlignmentSet(filename)
+    aln.addReference(referenceFname)
+    return aln
diff --git a/GenomicConsensus/main.py b/GenomicConsensus/main.py
index 8e2432f..09b2d8b 100644
--- a/GenomicConsensus/main.py
+++ b/GenomicConsensus/main.py
@@ -36,7 +36,8 @@ from __future__ import absolute_import
 import argparse, atexit, cProfile, gc, glob, h5py, logging, multiprocessing
 import os, pstats, random, shutil, tempfile, time, threading, Queue, traceback
 
-from pbcore.io import CmpH5Reader, BamReader
+from pbcore.io import AlignmentSet
+
 from GenomicConsensus import reference
 from GenomicConsensus.options import (options,
                                       parseOptions,
@@ -139,11 +140,7 @@ class ToolRunner(object):
         store it as self._inCmpH5.
         """
         fname = options.inputFilename
-        if options.usingBam:
-            self._inCmpH5 = BamReader(fname, options.referenceFilename)
-        else:
-            logging.debug("Before open on main process, # hdf5 objects open: %d" % h5py.h5f.get_obj_count())
-            self._inCmpH5 = CmpH5Reader(fname)
+        self._inCmpH5 = AlignmentSet(fname)
 
     def _loadReference(self, cmpH5):
         logging.info("Loading reference")
@@ -165,6 +162,8 @@ class ToolRunner(object):
         else:
             options.referenceWindows = map(reference.stringToWindow,
                                            options.referenceWindowsAsString.split(","))
+        if options.referenceWindowsFromAlignment:
+            options.referenceWindows = cmpH5.refWindows
 
     def _checkFileCompatibility(self, cmpH5):
         if not cmpH5.isSorted:
@@ -173,11 +172,13 @@ class ToolRunner(object):
             die("Input CmpH5 file must be nonempty.")
 
     def _shouldDisableChunkCache(self, cmpH5):
-        if isinstance(cmpH5, CmpH5Reader):
-            threshold = options.autoDisableHdf5ChunkCache
-            return datasetCountExceedsThreshold(cmpH5, threshold)
-        else:
-            return False
+        #if isinstance(cmpH5, CmpH5Reader):
+        #if cmpH5.isCmpH5:
+        #    threshold = options.autoDisableHdf5ChunkCache
+        #    return datasetCountExceedsThreshold(cmpH5, threshold)
+        #else:
+        #    return False
+        return False
 
     def _configureAlgorithm(self, options, cmpH5):
         assert self._algorithm != None
@@ -276,7 +277,7 @@ class ToolRunner(object):
             options.fancyChunking = False
 
             # Peek at the bam file to build tables
-            with BamReader(options.inputFilename) as peekCmpH5:
+            with AlignmentSet(options.inputFilename) as peekCmpH5:
                 logging.info("Peeking at BAM file %s" % options.inputFilename)
                 logging.info("Input BAM data: numAlnHits=%d" % len(peekCmpH5))
                 resolveOptions(peekCmpH5)
@@ -289,7 +290,7 @@ class ToolRunner(object):
             # whether the selected algorithm parameters (Quiver) are
             # compatible with the data.  But we then have to close the
             # file, and let the "real" open happen after the fork.
-            with CmpH5Reader(options.inputFilename) as peekCmpH5:
+            with AlignmentSet(options.inputFilename) as peekCmpH5:
                 logging.info("Peeking at CmpH5 file %s" % options.inputFilename)
                 logging.info("Input CmpH5 data: numAlnHits=%d" % len(peekCmpH5))
                 resolveOptions(peekCmpH5)
diff --git a/GenomicConsensus/options.py b/GenomicConsensus/options.py
index 7e71607..e063575 100644
--- a/GenomicConsensus/options.py
+++ b/GenomicConsensus/options.py
@@ -167,6 +167,15 @@ def parseOptions():
              "(default: entire reference).",
         default=None)
 
+    readSelection.add_argument(
+        "--alignmentSetRefWindows",
+        action="store_true",
+        dest="referenceWindowsFromAlignment",
+        help="The window (or multiple comma-delimited windows) of the reference to " + \
+             "be processed, in the format refGroup:refStart-refEnd "                 + \
+             "will be pulled from the alignment file.",
+        default=False)
+
     def slurpWindowFile(fname):
         return ",".join(map(str.strip, open(fname).readlines()))
 
diff --git a/GenomicConsensus/quiver/model.py b/GenomicConsensus/quiver/model.py
index 7eedfdd..0c9c234 100644
--- a/GenomicConsensus/quiver/model.py
+++ b/GenomicConsensus/quiver/model.py
@@ -136,7 +136,7 @@ class Model(object):
         MappedRead.
         """
         assert aln.referenceSpan > 0
-        name = str(aln.rowNumber)
+        name = aln.readName
         chemistry = chemOrUnknown(aln)
         read = cc.Read(cls.extractFeatures(aln), name, chemistry)
         return cc.MappedRead(read,
diff --git a/GenomicConsensus/quiver/quiver.py b/GenomicConsensus/quiver/quiver.py
index 6e2f674..2aca757 100644
--- a/GenomicConsensus/quiver/quiver.py
+++ b/GenomicConsensus/quiver/quiver.py
@@ -108,7 +108,7 @@ def consensusAndVariantsForWindow(cmpH5, refWindow, referenceContig,
 
             logging.debug("%s: Reads being used: %s" %
                           (reference.windowToString(subWin),
-                           " ".join([str(hit.rowNumber) for hit in alns])))
+                           " ".join([str(hit.readName) for hit in alns])))
 
             css = U.consensusForAlignments(subWin,
                                            intRefSeq,
diff --git a/GenomicConsensus/reference.py b/GenomicConsensus/reference.py
index 38e3e74..62bc2ef 100644
--- a/GenomicConsensus/reference.py
+++ b/GenomicConsensus/reference.py
@@ -34,7 +34,7 @@ from __future__ import absolute_import
 
 import logging, re, numpy as np
 from collections import OrderedDict
-from pbcore.io import FastaTable, splitFastaHeader
+from pbcore.io import ReferenceSet
 
 from .windows import holes, kCoveredIntervals, enumerateIntervals
 from .utils import die, nub
@@ -122,13 +122,14 @@ def loadFromFile(filename_, cmpH5):
     # Load contigs
     assert not isLoaded()
     try:
-        f = FastaTable(filename_)
+        f = ReferenceSet(filename_)
+        f.assertIndexed()
     except IOError as e:
         die(e)
 
-    cmpContigNames = set(splitFastaHeader(x)[0] for x in cmpH5.referenceInfoTable.FullName)
+    cmpContigNames = set(cmpH5.refNames)
 
-    for fastaRecord in f:
+    for fastaRecord in f.contigs:
         refName = fastaRecord.id
         if refName in cmpContigNames:
             cmpH5RefEntry   = cmpH5.referenceInfo(refName)
@@ -209,11 +210,22 @@ def fancyEnumerateChunks(cmpH5, refId, referenceStride,
     Enumerate chunks, creating chunks with hasCoverage=False for
     coverage cutouts.
     """
-    startRow = cmpH5.referenceInfo(refId).StartRow
-    endRow   = cmpH5.referenceInfo(refId).EndRow
-    goodMapQVs = (cmpH5.MapQV[startRow:endRow] >= minMapQV)
-    tStart = cmpH5.tStart[startRow:endRow][goodMapQVs].view(np.int32)
-    tEnd   = cmpH5.tEnd  [startRow:endRow][goodMapQVs].view(np.int32)
+
+    # The abstraction frays here, unless we want to push all of this into
+    # pbdataset. With the fairly specific nature of this query, a composition
+    # of standard accessors is probably ok:
+    tStart = []
+    tEnd = []
+    for reader in cmpH5.resourceReaders(refId):
+        startRow = reader.referenceInfo(refId).StartRow
+        endRow = reader.referenceInfo(refId).EndRow
+        goodMapQVs = (reader.MapQV[startRow:endRow] >= minMapQV)
+        tStart.extend(
+            reader.tStart[startRow:endRow][goodMapQVs].view(np.int32))
+        tEnd.extend(reader.tEnd[startRow:endRow][goodMapQVs].view(np.int32))
+    # Sort the intervals (not sure if it matters, but might as well be
+    # consistent with the inputs:
+    tStart, tEnd = map(list, zip(*sorted(zip(tStart, tEnd))))
 
     for span in enumerateSpans(refId, referenceWindows):
         _, spanStart, spanEnd = span

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/pbgenomicconsensus.git