[med-svn] [nanopolish] 04/09: New upstream version 0.8.5

Afif Elghraoui afif at moszumanska.debian.org
Sun Feb 4 21:19:30 UTC 2018


This is an automated email from the git hooks/post-receive script.

afif pushed a commit to branch master
in repository nanopolish.

commit 436609ff75aa5294791d7fd8433376eb27965f20
Author: Afif Elghraoui <afif at debian.org>
Date:   Sun Feb 4 16:05:33 2018 -0500

    New upstream version 0.8.5
---
 Makefile                                           |  26 +-
 README.md                                          |  11 +-
 docs/Makefile                                      |  20 +
 docs/source/_static/nanopolish-workflow.png        | Bin 0 -> 318184 bytes
 .../_static/quickstart_methylation_results.png     | Bin 0 -> 50765 bytes
 docs/source/conf.py                                | 168 +++++++++
 docs/source/debug.rst                              | 170 +++++++++
 docs/source/index.rst                              |  30 ++
 docs/source/installation.rst                       |  45 +++
 docs/source/manual.rst                             | 413 +++++++++++++++++++++
 docs/source/quickstart_call_methylation.rst        | 125 +++++++
 docs/source/quickstart_consensus.rst               | 115 ++++++
 docs/source/quickstart_eventalign.rst              | 103 +++++
 scripts/calculate_methylation_frequency.py         |  11 +-
 scripts/extract_reads_aligned_to_region.py         | 309 +++++++++++++++
 scripts/nanopolish_merge.py                        |  17 +-
 src/alignment/nanopolish_alignment_db.cpp          |   1 +
 src/alignment/nanopolish_eventalign.cpp            | 134 ++-----
 src/common/nanopolish_bam_processor.cpp            |   3 +-
 src/common/nanopolish_common.h                     |   2 +-
 src/main/nanopolish.cpp                            |   9 +-
 src/nanopolish_call_variants.cpp                   |   6 +-
 src/nanopolish_extract.cpp                         |  70 ++--
 src/nanopolish_index.cpp                           |  18 +-
 src/nanopolish_raw_loader.cpp                      |  12 +-
 src/nanopolish_read_db.cpp                         |  19 +-
 src/nanopolish_squiggle_read.cpp                   |  46 ++-
 27 files changed, 1703 insertions(+), 180 deletions(-)

diff --git a/Makefile b/Makefile
index 5efffe6..3cab7a6 100644
--- a/Makefile
+++ b/Makefile
@@ -10,14 +10,15 @@ SUBDIRS := src src/hmm src/thirdparty src/thirdparty/scrappie src/common src/ali
 #Basic flags every build needs
 LIBS=-lz
 CXXFLAGS ?= -g -O3
-CXXFLAGS += -std=c++11 -fopenmp
+CXXFLAGS += -std=c++11 -fopenmp -fsigned-char
 CFLAGS ?= -O3 -std=c99
 CXX ?= g++
 CC ?= gcc
 
-# Change the value of HDF5 or EIGEN below to any value to disable compilation of bundled HDF5 code
-HDF5=install
-EIGEN=install
+# Change the value of HDF5, EIGEN, or HTS below to any value to disable compilation of bundled code
+HDF5?=install
+EIGEN?=install
+HTS?=install
 
 # Check operating system, OSX doesn't have -lrt
 UNAME_S := $(shell uname -s)
@@ -45,9 +46,14 @@ else
     EIGEN_CHECK=
 endif
 
-# Build and link the libhts submodule
-HTS_LIB=./htslib/libhts.a
-HTS_INCLUDE=-I./htslib
+# Default to build and link the libhts submodule
+ifeq ($(HTS), install)
+    HTS_LIB=./htslib/libhts.a
+    HTS_INCLUDE=-I./htslib
+else
+    # Use system-wide htslib
+    HTS_LIB=-lhts
+endif
 
 # Include the header-only fast5 library
 FAST5_INCLUDE=-I./fast5/include
@@ -116,15 +122,15 @@ include .depend
 	$(CXX) -o $@ -c $(CXXFLAGS) $(CPPFLAGS) -fPIC $<
 
 .c.o:
-	$(CC) -o $@ -c $(CFLAGS) $(H5_INCLUDE) -fPIC $<
+	$(CC) -o $@ -c $(CFLAGS) $(CPPFLAGS) $(H5_INCLUDE) -fPIC $<
 
 # Link main executable
 $(PROGRAM): src/main/nanopolish.o $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(EIGEN_CHECK)
-	$(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS)
+	$(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS) $(LDFLAGS)
 
 # Link test executable
 $(TEST_PROGRAM): src/test/nanopolish_test.o $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB)
-	$(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS)
+	$(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $< $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(LIBS) $(LDFLAGS)
 
 test: $(TEST_PROGRAM)
 	./$(TEST_PROGRAM)
diff --git a/README.md b/README.md
index 0564f32..b3433c4 100644
--- a/README.md
+++ b/README.md
@@ -6,15 +6,16 @@ Software package for signal-level analysis of Oxford Nanopore sequencing data. N
 
 ## Dependencies
 
-[libhdf5](http://www.hdfgroup.org/HDF5/release/obtain5.html) is automatically downloaded and compiled when running `make` but this can be disabled with: `HDF5=nofetch make`. The nanopolish binary will link libhdf5.a statically.
+A compiler that supports C++11 is needed to build nanopolish. Development of the code is performed using [gcc-4.8](https://gcc.gnu.org/gcc-4.8/).
 
-[eigen](http://eigen.tuxfamily.org) is also automatically downloaded and included when compiling with `make`.
+By default, nanopolish will download and compile all of its required dependencies. Some users however may want to use system-wide versions of the libraries. To turn off the automatic installation of dependencies set `HDF5=noinstall`, `EIGEN=noinstall` or `HTS=noinstall` parameters when running `make` as appropriate. The current versions and compile options for the dependencies are:
 
-[biopython](http://www.biopython.org) is required to run the helpers in `scripts/`.
+* [libhdf5-1.8.14](http://www.hdfgroup.org/HDF5/release/obtain5.html) compiled with multi-threading support `--enable-threadsafe`
+* [eigen-3.2.5](http://eigen.tuxfamily.org)
+* [htslib-1.4](http://github.com/samtools/htslib) 
 
-[htslib](http://github.com/samtools/htslib) is included as a submodule and compiled automatically.
+Additionally the helper `scripts` require [biopython](http://www.biopython.org) and [pysam](http://pysam.readthedocs.io/en/latest/installation.html).
 
-A compiler that supports C++11 is needed to build the sources. Development of the code is performed using [gcc-4.8](https://gcc.gnu.org/gcc-4.8/).
 
 ## Installation instructions
 
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..4f4c5de
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SPHINXPROJ    = Nanopolish
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/docs/source/_static/nanopolish-workflow.png b/docs/source/_static/nanopolish-workflow.png
new file mode 100644
index 0000000..34c8a6d
Binary files /dev/null and b/docs/source/_static/nanopolish-workflow.png differ
diff --git a/docs/source/_static/quickstart_methylation_results.png b/docs/source/_static/quickstart_methylation_results.png
new file mode 100644
index 0000000..05dc733
Binary files /dev/null and b/docs/source/_static/quickstart_methylation_results.png differ
diff --git a/docs/source/conf.py b/docs/source/conf.py
new file mode 100644
index 0000000..d4d41d5
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,168 @@
+# -*- coding: utf-8 -*-
+#
+# Nanopolish documentation build configuration file, created by
+# sphinx-quickstart on Tue Nov 14 14:59:25 2017.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+import sphinx_rtd_theme
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ['sphinx.ext.autodoc']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'Nanopolish'
+copyright = u'2017, Simpson Lab'
+author = u'Simpson Lab'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = u'0.8.4'
+# The full version, including alpha/beta/rc tags.
+release = u'0.8.4'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This patterns also effect to html_static_path and html_extra_path
+exclude_patterns = []
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = "sphinx_rtd_theme"
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#
+#html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Custom sidebar templates, must be a dictionary that maps document names
+# to template names.
+#
+# This is required for the alabaster theme
+# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
+html_sidebars = {
+    '**': [
+        'relations.html',  # needs 'show_related': True theme option to display
+        'searchbox.html',
+    ]
+}
+
+
+# -- Options for HTMLHelp output ------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'Nanopolishdoc'
+
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    #
+    # 'papersize': 'letterpaper',
+
+    # The font size ('10pt', '11pt' or '12pt').
+    #
+    # 'pointsize': '10pt',
+
+    # Additional stuff for the LaTeX preamble.
+    #
+    # 'preamble': '',
+
+    # Latex figure (float) alignment
+    #
+    # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, 'Nanopolish.tex', u'Nanopolish Documentation',
+     u'Simpson Lab', 'manual'),
+]
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'nanopolish', u'Nanopolish Documentation',
+     [author], 1)
+]
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'Nanopolish', u'Nanopolish Documentation',
+     author, 'Nanopolish', 'One line description of project.',
+     'Miscellaneous'),
+]
+
+
+
diff --git a/docs/source/debug.rst b/docs/source/debug.rst
new file mode 100644
index 0000000..005f763
--- /dev/null
+++ b/docs/source/debug.rst
@@ -0,0 +1,170 @@
+.. _help_us_debug:
+
+Helping us debug nanopolish
+===============================
+
+Overview
+"""""""""""""""""""""""
+
+Running into errors with nanopolish? To help us debug, we need to be able to reproduce the errors. We can do this by packaging a subset of the files that were used by nanopolish. We have provided ``scripts/extract_reads_aligned_to_region.py`` and this tutorial to help you do exactly this.
+
+Briefly, this script will:
+
+* extract reads that align to a given region in the draft genome assembly
+* rewrite a new BAM, BAI, FASTA file with these reads
+* extract the FAST5 files associated with these reads
+* save all these files into a tar.gz file
+
+Workflow
+"""""""""""""
+
+#. Narrow down a problematic region by running ``nanopolish variants --consensus [...]`` and changing the ``-w`` parameter.
+#. Run the ``scripts/extract_reads_aligned_to_region.py``.
+#. Send the resulting ``tar.gz`` file to us by hosting either a dropbox or google drive.
+
+.. _creating_example_dataset:
+
+Tutorial - using extraction helper script to create example datasets
+""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+We extracted a subset of reads for a 2kb region to create the example dataset for the eventalign and consensus tutorial using ``scripts/extract_reads_aligned_to_region.py``. Here is how:
+
+|
+
+Generated basecalled ``--reads`` file:
+
+#. Basecalled reads with albacore: ::
+
+    read_fast5_basecaller.py -c r94_450bps_linear.cfg -t 8 -i /path/to/raw/fast5/files -s /path/to/albacore-2.0.1/output/directory -o fastq 
+
+#. Merged the different albacore fastq outputs: ::
+
+    cat /path/to/albacore-2.0.1/output/directory/workspace/pass/*.fastq > albacore-2.0.1-merged.fastq
+
+#. Converted the merged fastq to fasta format: ::
+
+    paste - - - - < albacore-2.0.1-merged.fastq | cut -f 1,2 | sed 's/^@/>/' | tr "\t" "\n" > reads.fasta
+
+|
+
+Generated ``--bam`` file with the draft genome assembly (``-g``):
+
+#. Ran canu to create draft genome assembly: ::
+
+    canu \
+        -p ecoli -d outdir genomeSize=4.6m \
+        -nanopore-raw reads.fasta
+
+#. Index draft assembly: ::
+
+    bwa index ecoli.contigs.fasta
+    samtools faidx ecoli.contigs.fasta
+
+#. Aligned reads to draft genome assembly with bwa (v0.7.12): ::
+
+    bwa mem -x ont2d ecoli.contigs.fasta reads.fasta | samtools sort -o reads.sorted.bam -T reads.tmp
+    samtools index reads.sorted.bam
+
+|
+
+Selected a ``--window``:
+
+#. Identified the first contig name and chose a random start position: ::
+
+    head -3 ecoli.contigs.fasta
+
+Output: ::
+
+    >tig00000001 len=4376233 reads=23096 covStat=7751.73 gappedBases=no class=contig suggestRepeat=no suggestCircular=no
+    AGATGCTTTGAAAGAAACGCAGAATAGATCTCTATGTAATGATATGGAATACTCTGGTATTGTCTGTAAAGATACTAATGGAAAATATTTTGCATCTAAG
+    GCAGAAACTGATAATTTAAGAAAGGAGTCATATCCTCTGAAAAGAAAATGTCCCACAGGTACAGATAGAGTTGCTGCTTATCATACTCACGGTGCAGATA
+ 
+As we wanted a 2kb region, we selected a random start position (200000) so our end position was 202000. Therefore our ``--window`` was "tig00000001:200000-202000".
+
+|
+
+Using the files we created, we ran ``scripts/extract_reads_aligned_to_region.py``, please see usage example below.
+
+.. note:: Make sure nanopolish still reproduces the same error on this subset before sending it to us.
+
+Usage example
+"""""""""""""""""""""""
+::
+
+    python extract_reads_aligned_to_region.py \
+        --reads reads.fasta \
+        --genome ecoli.contigs.fasta \
+        --bam reads.sorted.bam \
+        --window "tig00000001:200000-202000" \
+        --output_prefix ecoli_2kb_region --verbose
+
+.. list-table:: 
+   :widths: 25 5 20 50
+   :header-rows: 1
+
+   * - Argument name(s)
+     - Req.
+     - Default value
+     - Description
+
+   * - ``-b``, ``--bam``
+     - Y
+     - NA
+     - Sorted bam file created by aligning reads to the draft genome.
+
+   * - ``-g``, ``--genome``
+     - Y
+     - NA
+     - Draft genome assembly
+
+   * - ``-r``, ``--reads``
+     - Y
+     - NA
+     - Fasta, fastq, fasta.gz, or fastq.gz file containing basecalled reads.
+
+   * - ``-w``, ``--window``
+     - Y
+     - NA
+     - Draft genome assembly coordinate string ex. "contig:start-end". It is essential that you wrap the coordinates in quotation marks (\").
+
+   * - ``-o``, ``--output_prefix``
+     - N
+     - reads_subset
+     - Prefix of output tar.gz and log file.
+
+   * - ``-v``, ``--verbose``
+     - N
+     - False
+     - Use for verbose output with info on progress.
+
+Script overview
+"""""""""""""""""""""
+
+#. Parse input files
+#. Assumes readdb file name from input reads file
+#. Validates input
+    - checks that input bam, readdb, fasta/q, draft genome assembly, draft genome assembly index file exist, are not empty, and are readable
+#. With user input draft genome assembly coordinates, extracts all reads that aligned within these coordinates and stores the read_ids. This information can be found from the input BAM.
+    - uses pysam.AlignmentFile
+    - uses samfile.fetch(region=draft_ga_coords) to get all reads aligned to region
+    - if reads map to multiple sections within draft ga it is not added again
+#. Parses through the input readdb file to find the FAST5 files associated with each read that aligned to the region
+    - stores in dictionary region_fast5_files; key = read_id, value = path/to/fast5/file
+    - path to fast5 file is currently dependent on the user's directory structure
+#. Make a BAM and BAI file for this specific region
+    - creates a new BAM file called ``region.bam``
+    - with pysam.view we rewrite the new bam with reads that aligned to the region...
+    - with pysam.index we create a new BAI file
+#. Now to make a new FASTA file with this subset of reads
+    - the new fasta file is called ``region.fasta``
+    - this first checks what type of sequences file is given { ``fasta``, ``fastq``, ``fasta.gz``, ``fastq.gz`` }
+    - then handles based on type of seq file using SeqIO.parse
+    - then writes to a new fasta file
+#. Let's get to tarring
+    - creates a ``tar.gz`` file with the output prefix
+    - saves the fast5 files in directory ``output_prefix/fast5_files/``
+#. Adds the new fasta, new bam, and new bai file with the subset of reads
+#. Adds the draft genome assembly and associated fai index file
+#. Performs a check
+    - the number of reads in the new BAM file, new FASTA file, and the number of files in the fast5_files directory should be equal
+#. Outputs a ``tar.gz`` and ``log`` file. FIN!
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 0000000..1bce713
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,30 @@
+.. Nanopolish documentation master file, created by
+   sphinx-quickstart on Tue Nov 14 14:59:25 2017.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+nanopolish
+======================================
+`nanopolish <https://github.com/jts/nanopolish>`_ is a software package for signal-level analysis of Oxford Nanopore sequencing data. Nanopolish can calculate an improved consensus sequence for a draft genome assembly, detect base modifications, call SNPs and indels with respect to a reference genome and more (see Nanopolish modules, below).
+
+
+.. toctree::
+   :hidden:
+
+   installation
+   quickstart_consensus
+   quickstart_eventalign
+   quickstart_call_methylation
+   debug
+   manual
+
+Publications
+====================================
+
+* Loman, Nicholas J., Joshua Quick, and Jared T. Simpson. "A complete bacterial genome assembled de novo using only nanopore sequencing data." Nature methods 12.8 (2015): 733-735.
+* Quick, Joshua, et al. "Real-time, portable genome sequencing for Ebola surveillance." Nature 530.7589 (2016): 228-232.
+* Simpson, Jared T., et al. "Detecting DNA cytosine methylation using nanopore sequencing." nature methods 14.4 (2017): 407-410.
+
+Credits and Thanks
+========================
+The fast table-driven logsum implementation was provided by Sean Eddy as public domain code. This code was originally part of `hmmer3 <http://hmmer.org/>`_ . Nanopolish also includes code from Oxford Nanopore's `scrappie <https://github.com/nanoporetech/scrappie>`_ basecaller. This code is licensed under the MPL.
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
new file mode 100644
index 0000000..36a1de9
--- /dev/null
+++ b/docs/source/installation.rst
@@ -0,0 +1,45 @@
+.. _installation:
+
+Installation
+=======================
+
+Dependencies
+-----------------------
+
+A compiler that supports C++11 is needed to build nanopolish. Development of the code is performed using `gcc-4.8 <https://gcc.gnu.org/gcc-4.8/>`_.
+
+By default, nanopolish will download and compile all of its required dependencies. Some users however may want to use system-wide versions of the libraries. To turn off the automatic installation of dependencies set `HDF5=noinstall`, `EIGEN=noinstall` or `HTS=noinstall` parameters when running `make` as appropriate. The current versions and compile options for the dependencies are:
+
+* `libhdf5-1.8.14 <http://www.hdfgroup.org/HDF5/release/obtain5.html>`_ compiled with multi-threading support ``--enable-threadsafe``
+* `eigen-3.2.5 <http://eigen.tuxfamily.org/>`_ 
+* `htslib-1.4 <http://github.com/samtools/htslib>`_
+
+Additionally the helper `scripts` require `biopython <http://biopython.org/>`_ and `pysam <http://pysam.readthedocs.io/en/latest/installation.html>`_.
+
+Installing the latest code from github (recommended)
+------------------------------------------------------
+You can download and compile the latest code from github as follows ::
+
+    git clone --recursive https://github.com/jts/nanopolish.git
+    cd nanopolish
+    make
+
+Installing a particular release
+------------------------------------------------------
+When major features have been added or bugs fixed, we will tag and release a new version of nanopolish. If you wish to use a particular version, you can checkout the tagged version before compiling ::
+
+    git clone --recursive https://github.com/jts/nanopolish.git
+    cd nanopolish
+    git checkout v0.7.1
+    make
+
+To run using docker
+-------------------
+
+First build the image from the dockerfile: ::
+
+    docker build .
+
+Note the uuid given upon successful build. Then you can run nanopolish from the image: ::
+
+    docker run -v /path/to/local/data/data/:/data/ -it :image_id  ./nanopolish eventalign -r /data/reads.fa -b /data/alignments.sorted.bam -g /data/ref.fa
diff --git a/docs/source/manual.rst b/docs/source/manual.rst
new file mode 100644
index 0000000..844fae7
--- /dev/null
+++ b/docs/source/manual.rst
@@ -0,0 +1,413 @@
+.. _manual:
+
+Manual
+===================
+
+Modules available: ::
+
+    nanopolish extract: extract reads in FASTA or FASTQ format from a directory of FAST5 files
+    nanopolish call-methylation: predict genomic bases that may be methylated
+    nanopolish variants: detect SNPs and indels with respect to a reference genome
+    nanopolish variants --consensus: calculate an improved consensus sequence for a draft genome assembly
+    nanopolish eventalign: align signal-level events to k-mers of a reference genome
+
+|
+
+extract
+--------------------
+
+Overview
+"""""""""""""""""""""""
+
+This module is used to extract reads in FASTA or FASTQ format from a directory of FAST5 files.  
+
+Input
+"""""""""""""""""""""""
+
+    * path to a directory of FAST5 files modified to contain basecall information
+
+Output
+"""""""""""""""""""""""
+
+    * sequences of reads in FASTA or FASTQ format
+
+Usage example
+"""""""""""""""""""""""
+
+::
+
+   nanopolish extract [OPTIONS] <fast5|dir>
+
+.. list-table:: 
+   :widths: 20 10 20 50
+   :header-rows: 1
+
+   * - Argument name(s)
+     - Required
+     - Default value
+     - Description
+
+   * -  <fast5|dir>
+     - Y
+     - NA
+     - FAST5 or path to directory of FAST5 files.
+
+   * - ``-r``, ``--recurse``
+     - N
+     - NA
+     - Recurse into subdirectories
+
+   * - ``-q``, ``--fastq``
+     - N
+     - fasta format
+     - Use when you want to extract to FASTQ format
+
+   * - ``-t``, ``--type=TYPE``
+     - N
+     - 2d-or-template
+     - The type of read either: {template, complement, 2d, 2d-or-template, any}
+
+   * - ``-b``, ``--basecaller=NAME[:VERSION]``
+     - N
+     - NA
+     - consider only data produced by basecaller NAME, optionally with given exact VERSION
+
+   * - ``-o``, ``--output=FILE``
+     - N
+     - stdout
+     - Write output to FILE
+
+|
+
+index
+--------------------
+
+Overview
+"""""""""""""""""""""""
+Build an index mapping from basecalled reads to the signals measured by the sequencer
+
+Input
+""""""""
+    * path to directory of raw nanopore sequencing data in FAST5 format
+    * basecalled reads
+
+Output
+""""""""
+    * gzipped FASTA format of basecalled reads
+    * index files (fai, gzi, readdb)
+
+Readdb file format
+""""""""""""""""""""
+Readdb file is a tab-separated file that contains two columns. One column represents read ids and the other column represents the corresponding path to FAST5 file: ::
+
+    read_id_1   /path/to/fast5/containing/reads_id_1/signals
+    read_id_2   /path/to/fast5/containing/read_id_2/signals
+
+Usage example
+""""""""""""""
+::
+
+    nanopolish index [OPTIONS] -d nanopore_raw_file_directory reads.fastq
+
+.. list-table::
+   :widths: 20 10 20 50
+   :header-rows: 1
+
+   * - Argument name(s)
+     - Required
+     - Default value
+     - Description
+
+   * - ``-d``, ``--directory``
+     - Y
+     - NA
+     - FAST5 or path to directory of FAST5 files containing ONT sequencing raw signal information.
+
+   * - ``-f``, ``--fast5-fofn``
+     - N
+     - NA
+     - file containing the paths to each fast5 for the run
+
+
+
+call-methylation
+--------------------
+
+Overview
+"""""""""""""""""""""""
+
+Classify nucleotides as methylated or not.
+
+Input
+"""""""""""""""""""""""
+
+    * Basecalled ONT reads in FASTA format
+
+Output
+"""""""""""""""""""""""
+
+    * tab-separated file containing per-read log-likelihood ratios
+
+Usage example
+"""""""""""""""""""""""
+
+::
+
+   nanopolish call-methylation [OPTIONS] <fast5|dir>
+
+.. list-table::
+   :widths: 20 10 20 50
+   :header-rows: 1
+
+   * - Argument name(s)
+     - Required
+     - Default value
+     - Description
+
+   * - ``-r``, ``--reads=FILE``
+     - Y
+     - NA
+     - the 2D ONT reads are in fasta FILE
+
+   * - ``-b``, ``--bam=FILE``
+     - Y
+     - NA 
+     - the reads aligned to the genome assembly are in bam FILE
+
+   * - ``-g``, ``--genome=FILE``
+     - Y
+     - NA 
+     - the genome we are computing a consensus for is in FILE
+
+   * - ``-t``, ``--threads=NUM``
+     - N
+     - 1
+     - use NUM threads
+
+   * - ``--progress``
+     - N
+     - NA
+     - print out a progress message
+
+variants
+--------------------
+
+Overview
+"""""""""""""""""""""""
+
+This module is used to call single nucleotide polymorphisms (SNPs) using a signal-level HMM.  
+
+Input
+"""""""""""""""""""""""
+
+    * basecalled reads
+    * alignment info
+    * genome assembly
+
+Output
+"""""""""""""""""""
+
+    * VCF file
+
+Usage example
+"""""""""""""""""""""""
+
+::
+
+   nanopolish variants [OPTIONS] --reads reads.fa --bam alignments.bam --genome genome.fa
+
+.. list-table::
+   :widths: 20 10 20 50
+   :header-rows: 1
+
+   * - Argument name(s)
+     - Required
+     - Default value
+     - Description
+
+   * - ``--snps``
+     - N
+     - NA
+     - use flag to only call SNPs
+
+   * - ``--consensus=FILE``
+     - N
+     - NA
+     - run in consensus calling mode and write polished sequence to FILE
+
+   * - ``--fix-homopolymers``
+     - N
+     - NA
+     - use flag to run the experimental homopolymer caller
+
+   * - ``--faster``
+     - N
+     - NA
+     - minimize compute time while slightly reducing consensus accuracy
+
+   * - ``-w``, ``--window=STR``
+     - N
+     - NA
+     - find variants in window STR (format: <chromosome_name>:<start>-<end>)
+
+   * - ``-r``, ``--reads=FILE``
+     - Y
+     - NA
+     - the 2D ONT reads are in fasta FILE
+
+   * - ``-b``, ``--bam=FILE``
+     - Y
+     - NA
+     - the reads aligned to the reference genome are in bam FILE 
+
+   * - ``-e``, ``--event-bam=FILE``
+     - Y
+     - NA
+     - the events aligned to the reference genome are in bam FILE
+
+   * - ``-g``, ``--genome=FILE``
+     - Y
+     - NA
+     - the reference genome is in FILE
+
+   * - ``-o``, ``--outfile=FILE``
+     - N
+     - stdout
+     - write result to FILE
+
+   * - ``-t``, ``--threads=NUM``
+     - N
+     - 1
+     - use NUM threads
+
+   * - ``-m``, ``--min-candidate-frequency=F``
+     - N
+     - 0.2
+     - extract candidate variants from the aligned reads when the variant frequency is at least F
+
+   * - ``-d``, ``--min-candidate-depth=D``
+     - N
+     - 20
+     - extract candidate variants from the aligned reads when the depth is at least D
+
+   * - ``-x``, ``--max-haplotypes=N``
+     - N
+     - 1000
+     - consider at most N haplotypes combinations
+
+   * - ``--max-rounds=N``
+     - N
+     - 50
+     - perform N rounds of consensus sequence improvement
+
+   * - ``-c``, ``--candidates=VCF``
+     - N
+     - NA
+     - read variants candidates from VCF, rather than discovering them from aligned reads
+
+   * - ``-a``, ``--alternative-basecalls-bam=FILE``
+     - N
+     - NA
+     - if an alternative basecaller was used that does not output event annotations then use basecalled sequences from FILE. The signal-level events will still be taken from the -b bam
+
+   * - ``--calculate-all-support``
+     - N
+     - NA
+     - when making a call, also calculate the support of the 3 other possible bases
+
+   * - ``--models-fofn=FILE``
+     - N
+     - NA
+     - read alternatives k-mer models from FILE
+
+
+event align
+--------------------
+
+Overview
+"""""""""""""""""""""""
+
+Align nanopore events to reference k-mers
+
+Input
+"""""""""""""""""""""""
+
+    * basecalled reads
+    * alignment information
+    * assembled genome
+
+Usage example
+"""""""""""""""""""""""
+
+::
+
+   nanopolish eventalign [OPTIONS] --reads reads.fa --bam alignments.bam --genome genome.fa
+
+.. list-table::
+   :widths: 20 10 20 50
+   :header-rows: 1
+
+   * - Argument name(s)
+     - Required
+     - Default value
+     - Description
+
+   * - ``--sam``
+     - N
+     - NA
+     - use to write output in SAM format
+
+   * - ``-w, --window=STR``
+     - N
+     - NA
+     - Compute the consensus for window STR (format : ctg:start_id-end_id)
+
+   * - ``-r, --reads=FILE``
+     - Y
+     - NA
+     - the 2D ONT reads are in fasta FILE
+
+   * - ``-b, --bam=FILE``
+     - Y
+     - NA
+     - the reads aligned to the genome assembly are in bam FILE
+
+   * - ``-g, --genome=FILE``
+     - Y
+     - NA
+     - the genome we are computing a consensus for is in FILE
+
+   * - ``-t, --threads=NUM``
+     - N
+     - 1
+     - use NUM threads
+
+   * - ``--scale-events``
+     - N
+     - NA
+     - scale events to the model, rather than vice-versa
+
+   * - ``--progress``
+     - N
+     - NA
+     - print out a progress message
+
+   * - ``-n``, ``--print-read-names``
+     - N
+     - NA
+     - print read names instead of indexes
+
+   * - ``--summary=FILE``
+     - N
+     - NA
+     - summarize the alignment of each read/strand in FILE
+
+   * - ``--samples``
+     - N
+     - NA
+     - write the raw samples for the event to the tsv output
+
+   * - ``--models-fofn=FILE``
+     - N
+     - NA
+     - read alternative k-mer models from FILE
diff --git a/docs/source/quickstart_call_methylation.rst b/docs/source/quickstart_call_methylation.rst
new file mode 100644
index 0000000..40316e7
--- /dev/null
+++ b/docs/source/quickstart_call_methylation.rst
@@ -0,0 +1,125 @@
+.. _quickstart_call_methylation:
+
+Quickstart - calling methylation with nanopolish
+=====================================================
+
+Oxford Nanopore sequencers are sensitive to base modifications. Here we provide a step-by-step tutorial to help you get started with detecting base modifications using nanopolish.
+
+**For more information about our approach:**
+
+* Simpson, Jared T., et al. `"Detecting DNA cytosine methylation using nanopore sequencing." <https://www.nature.com/articles/nmeth.4184>`_ Nature Methods (2017). 
+
+**Requirements**:
+
+* `nanopolish v0.8.4 <installation.html>`_
+* `samtools v1.2 <http://samtools.sourceforge.net/>`_
+* `minimap2 <https://github.com/lh3/minimap2>`_
+
+Download example dataset
+------------------------------------
+
+In this tutorial we will use a subset of the `NA12878 WGS Consortium data <https://github.com/nanopore-wgs-consortium/NA12878/blob/master/Genome.md>`_. You can download the example dataset we will use here (warning: the file is about 2GB): ::
+
+    wget http://s3.climb.ac.uk/nanopolish_tutorial/methylation_example.tar.gz
+    tar -xvf methylation_example.tar.gz
+    cd methylation_example
+
+**Details**:
+
+* Sample :	Human cell line (NA12878)
+* Basecaller : Albacore v2.0.2
+* Region: chr20:5,000,000-10,000,000
+
+In the extracted example data you should find the following files:
+
+* ``albacore_output.fastq`` : the subset of the basecalled reads
+* ``reference.fasta`` : the chromosome 20 reference sequence
+* ``fast5_files/`` : a directory containing signal-level FAST5 files
+
+The reads were basecalled using this albacore command: ::
+
+    read_fast5_basecaller.py -c r94_450bps_linear.cfg -t 8 -i fast5_files -s basecalled/ -o fastq
+
+After the basecaller finished, we merged all of the fastq files together into a single file: ::
+
+    cat basecalled/workspace/pass/*.fastq > albacore_output.fastq
+
+Data preprocessing
+------------------------------------
+
+nanopolish needs access to the signal-level data measured by the nanopore sequencer. To begin, we need to create an index file that links read ids with their signal-level data in the FAST5 files: ::
+
+    nanopolish index -d fast5_files/ albacore_output.fastq
+
+We get the following files: ``albacore_output.fastq.index``, ``albacore_output.fastq.index.fai``, ``albacore_output.fastq.index.gzi``, and ``albacore_output.fastq.index.readdb``.
+
+Aligning reads to the reference genome
+--------------------------------------
+
+Next, we need to align the basecalled reads to the reference genome. We use minimap2 as it is fast enough to map reads to the human genome. In this example we'll pipe the output directly into ``samtools sort`` to get a sorted bam file: ::
+
+    minimap2 -a -x map-ont reference.fasta albacore_output.fastq | samtools sort -T tmp -o albacore_output.sorted.bam
+    samtools index albacore_output.sorted.bam
+
+Calling methylation
+-------------------
+
+Now we're ready to use nanopolish to detect methylated bases (in this case 5-methylcytosine in a CpG context). The command is fairly straightforward - we have to tell it what reads to use (``albacore_output.fastq``), where the alignments are (``albacore_output.sorted.bam``), the reference genome (``reference.fasta``) and what region of the genome we're interested in (``chr20:5,000,000-10,000,000``)::
+	
+    nanopolish call-methylation -t 8 -r albacore_output.fastq -b albacore_output.sorted.bam -g reference.fasta -w "chr20:5,000,000-10,000,000" > methylation_calls.tsv
+
+The output file contains a lot of information including the position of the CG dinucleotide on the reference genome, the ID of the read that was used to make the call, and the log-likelihood ratio calculated by our model: ::
+
+	chromosome  start    end      read_name                             log_lik_ratio  log_lik_methylated  log_lik_unmethylated  num_calling_strands  num_cpgs  sequence
+	chr20       4980553  4980553  c1e202f4-e8f9-4eb8-b9a6-d79e6fab1e9a  3.70           -167.47             -171.17               1                    1         TGAGACGGGGT
+	chr20       4980599  4980599  c1e202f4-e8f9-4eb8-b9a6-d79e6fab1e9a  2.64           -98.87              -101.51               1                    1         AATCTCGGCTC
+	chr20       4980616  4980616  c1e202f4-e8f9-4eb8-b9a6-d79e6fab1e9a  -0.61          -95.35              -94.75                1                    1         ACCTCCGCCTC
+	chr20       4980690  4980690  c1e202f4-e8f9-4eb8-b9a6-d79e6fab1e9a  -2.99          -99.58              -96.59                1                    1         ACACCCGGCTA
+	chr20       4980780  4980780  c1e202f4-e8f9-4eb8-b9a6-d79e6fab1e9a  5.27           -135.45             -140.72               1                    1         CACCTCGGCCT
+	chr20       4980807  4980807  c1e202f4-e8f9-4eb8-b9a6-d79e6fab1e9a  -2.95          -89.20              -86.26                1                    1         ATTACCGGTGT
+	chr20       4980820  4980822  c1e202f4-e8f9-4eb8-b9a6-d79e6fab1e9a  7.47           -90.63              -98.10                1                    2         GCCACCGCGCCCA
+	chr20       4980899  4980901  c1e202f4-e8f9-4eb8-b9a6-d79e6fab1e9a  3.17           -96.40              -99.57                1                    2         GTATACGCGTTCC
+	chr20       4980955  4980955  c1e202f4-e8f9-4eb8-b9a6-d79e6fab1e9a  0.33           -92.14              -92.47                1                    1         AGTCCCGATAT
+
+
+A positive value in the ``log_lik_ratio`` column indicates support for methylation. We have provided a helper script that can be used to calculate how often each reference position was methylated: ::
+
+	scripts/calculate_methylation_frequency.py -i methylation_calls.tsv > methylation_frequency.tsv
+
+The output is another tab-separated file, this time summarized by genomic position: ::
+
+	chromosome  start    end      num_cpgs_in_group  called_sites  called_sites_methylated  methylated_frequency  group_sequence
+	chr20       5036763  5036763  1                  21            20                       0.952                 split-group
+	chr20       5036770  5036770  1                  21            20                       0.952                 split-group
+	chr20       5036780  5036780  1                  21            20                       0.952                 split-group
+	chr20       5037173  5037173  1                  13            5                        0.385                 AAGGACGTTAT
+
+In the example data set we have also included bisulfite data from ENCODE for the same region of chromosome 20. We can use the included ``compare_methylation.py`` helper script to do a quick comparison between the nanopolish methylation output and bisulfite: ::
+
+    python compare_methylation.py bisulfite.ENCFF835NTC.example.tsv methylation_frequency.tsv > bisulfite_vs_nanopolish.tsv
+
+We can use R to visualize the results - we observe good correlation between the nanopolish methylation calls and bisulfite: ::
+
+    library(ggplot2)
+    library(RColorBrewer)
+    data <- read.table("bisulfite_vs_nanopolish.tsv", header=T)
+
+    # Set color palette for 2D heatmap
+    rf <- colorRampPalette(rev(brewer.pal(11,'Spectral')))
+    r <- rf(32)
+
+    c <- cor(data$frequency_1, data$frequency_2)
+    title <- sprintf("N = %d r = %.3f", nrow(data), c)
+    ggplot(data, aes(frequency_1, frequency_2)) +
+        geom_bin2d(bins=25) + scale_fill_gradientn(colors=r, trans="log10") +
+        xlab("Bisulfite Methylation Frequency") +
+        ylab("Nanopolish Methylation Frequency") +
+        theme_bw(base_size=20) +
+        ggtitle(title)
+
+Here's what the output should look like:
+
+.. figure:: _static/quickstart_methylation_results.png
+  :scale: 80%
+  :alt: quickstart_methylation_results
+
diff --git a/docs/source/quickstart_consensus.rst b/docs/source/quickstart_consensus.rst
new file mode 100644
index 0000000..59985f0
--- /dev/null
+++ b/docs/source/quickstart_consensus.rst
@@ -0,0 +1,115 @@
+.. _quickstart_consensus:
+
+Quickstart - how to polish the consensus assembly
+===================================================
+
+The original purpose for nanopolish was to improve the consensus assembly accuracy for Oxford Nanopore Technology sequencing reads. Here we provide a step-by-step tutorial to help you get started with our tool.
+
+**Requirements**:
+
+* `nanopolish v0.8.4 <installation.html>`_
+* `samtools v1.2 <http://samtools.sourceforge.net/>`_
+* `bwa v0.7.12 <https://github.com/lh3/bwa>`_
+* `MUMmer <https://github.com/mummer4/mummer>`_
+
+Download example dataset
+------------------------------------
+
+You can download the example dataset we will use here: ::
+
+    wget http://s3.climb.ac.uk/nanopolish_tutorial/ecoli_2kb_region.tar.gz
+    tar -xvf ecoli_2kb_region.tar.gz
+    cd ecoli_2kb_region
+
+**Details**:
+
+* Sample :	E. coli str. K-12 substr. MG1655
+* Instrument : MinION sequencing R9.4 chemistry
+* Basecaller : Albacore v2.0.1
+* Region: "tig00000001:200000-202000"
+* Note: Ligation-mediated PCR amplification performed
+
+This is a subset of reads that aligned to a 2kb region in the E. coli draft assembly. To see how we generated these files please refer to this section:
+:ref:`creating_example_dataset`.
+
+You should find the following files:
+
+* ``reads.fasta`` : subset of basecalled reads
+* ``draft.fa`` : draft genome assembly
+* ``draft.fa.fai`` : draft genome assembly index
+* ``fast5_files/`` : a directory containing FAST5 files
+* ``ecoli_2kb_region.log`` : a log file for how the dataset was created with nanopolish helper script (``scripts/extract_reads_aligned_to_region.py``) 
+
+For the evaluation step you will need the reference genome: ::
+
+    wget -O ref.fa ftp://ftp.ncbi.nih.gov/genomes/archive/old_genbank/Bacteria/Escherichia_coli_K_12_substr__MG1655_uid225/U00096.ffn
+
+Analysis workflow
+-------------------------------
+
+The pipeline below describes the recommended analysis workflow for larger datasets. In this tutorial, we will run through the basic steps of the pipeline for this smaller (2kb) dataset.
+
+.. figure:: _static/nanopolish-workflow.png
+  :scale: 90%
+  :alt: nanopolish-tutorial-workflow
+
+Data preprocessing
+------------------------------------
+
+nanopolish needs access to the signal-level data measured by the nanopore sequencer. To begin, we need to create an index ``readdb`` file that links read ids with their signal-level data in the FAST5 files: ::
+
+    nanopolish index -d fast5_files/ reads.fasta
+
+We get the following files: ``reads.fasta.index``, ``reads.fasta.index.fai``, ``reads.fasta.index.gzi``, and ``reads.fasta.index.readdb``.
+
+Compute the draft genome assembly using canu
+-----------------------------------------------
+
+As computing the draft genome assembly takes a few hours we have included the pre-assembled data for you (``draft.fa``).
+We used the following parameters with `canu <https://canu.readthedocs.io>`_: ::
+
+    canu \
+        -p ecoli -d outdir genomeSize=4.6m \
+        -nanopore-raw albacore-2.0.1-merged.fastq \
+
+Compute a new consensus sequence for a draft assembly
+------------------------------------------------------------------------
+
+Now that we have ``reads.fasta`` indexed with ``nanopolish index``, and have a draft genome assembly ``draft.fa``, we can begin to improve the assembly with nanopolish. Let us get started! 
+
+First step, is to index the draft genome assembly. We can do that with the following command: ::
+
+    bwa index draft.fa
+
+Next, we align the original reads (``reads.fasta``) to the draft assembly (``draft.fa``) and sort alignments: ::
+
+    bwa mem -x ont2d -t 8 draft.fa reads.fasta | samtools sort -o reads.sorted.bam -T reads.tmp
+    samtools index reads.sorted.bam
+
+**Checkpoint**: we can do a quick check to see if this step worked. The bam file should not be empty. ::
+
+    samtools view reads.sorted.bam | head
+
+Then we run the consensus algorithm. For larger datasets we use ``nanopolish_makerange.py`` to split the draft genome assembly into 50kb segments, so that we can run the consensus algorithm on each segment in parallel. The output would be the polished segments in ``fasta`` format. 
+Since our dataset is only covering a 2kb region, we skip this step and use the following command: ::
+
+    nanopolish variants --consensus polished.fa \
+        -w "tig00000001:200000-202000" \
+        -r reads.fasta \
+        -b reads.sorted.bam \
+        -g draft.fa
+
+We are left with our desired output: ``polished.fa``.
+
+Evaluate the assembly
+---------------------------------
+
+To analyze how nanopolish performed improving the accuracy we use `MUMmer <https://github.com/mummer4/mummer>`_. MUMmer contains "dnadiff", a program that enables us to see a report on alignment statistics. With dnadiff we can compare the two different assemblies. ::
+
+    mkdir analysis
+    MUMmer3.23/dnadiff --prefix analysis/draft.dnadiff ref.fa draft.fa
+    MUMmer3.23/dnadiff --prefix analysis/polished.dnadiff ref.fa polished.fa
+
+This generates ``draft.dnadiff.report`` and ``polished.dnadiff.report`` along with other files. The metric we are interested in is ``AvgIdentity`` under ``[ Alignments ] 1-to-1``, which is a measurement of how similar the genome assemblies are to the reference genome. We expect to see a higher value for the polished assembly than the draft ( ``99.90`` vs ``99.53`` ), concluding that the nanopolish consensus algorithm worked successfully.
+
+.. note:: The example dataset was PCR amplified causing a loss of methylation information. We recommend using the ``-q dam,dcm`` with ``nanopolish variants --consensus`` if you have data with methylation information to account for known bacterial methyltransferases.
diff --git a/docs/source/quickstart_eventalign.rst b/docs/source/quickstart_eventalign.rst
new file mode 100644
index 0000000..fe4dc02
--- /dev/null
+++ b/docs/source/quickstart_eventalign.rst
@@ -0,0 +1,103 @@
+.. _quickstart_eventalign:
+
+Quickstart - how to align events to a reference genome
+========================================================
+
+The eventalign module in nanopolish is used to align events or "squiggles" to a reference genome. We (the developers of nanopolish) use this feature extensively when we want to see what the low-level signal information looks like. It helps us model the signal and discover differences in current that might hint at base modifications. Here we provide a step-by-step tutorial to help you get started with the nanopolish eventalign module.
+
+**For more information about eventalign**:
+
+* `Blog post: "Aligning Nanopore Events to a Reference" <http://simpsonlab.github.io/2015/04/08/eventalign/>`_
+* `Paper: "A complete bacterial genome assembled de novo using only nanopore sequencing data" <https://www.nature.com/articles/nmeth.3444>`_
+
+**Requirements**:
+
+* `nanopolish v0.8.4 <installation.html>`_
+* `samtools v1.2 <http://samtools.sourceforge.net/>`_
+* `bwa v0.7.12 <https://github.com/lh3/bwa>`_
+
+Download example dataset
+------------------------------------
+
+You can download the example dataset we will use here: ::
+
+    wget http://s3.climb.ac.uk/nanopolish_tutorial/ecoli_2kb_region.tar.gz
+    tar -xvf ecoli_2kb_region.tar.gz
+    cd ecoli_2kb_region
+
+**Details**:
+
+* Sample :    E. coli str. K-12 substr. MG1655
+* Instrument : MinION sequencing R9.4 chemistry
+* Basecaller : Albacore v2.0.1
+* Region: "tig00000001:200000-202000"
+* Note: Ligation-mediated PCR amplification performed
+
+This is a subset of reads that aligned to a 2kb region in the E. coli draft assembly. To see how we generated these files please refer to this section: :ref:`creating_example_dataset`. 
+
+You should find the following files:
+
+* ``reads.fasta`` : subset of basecalled reads
+* ``fast5_files/`` : a directory containing FAST5 files
+
+You will need the E. coli reference genome: ::
+
+    wget -O ref.fa ftp://ftp.ncbi.nih.gov/genomes/archive/old_genbank/Bacteria/Escherichia_coli_K_12_substr__MG1655_uid225/U00096.ffn
+
+Align the reads with bwa
+--------------------------------
+
+In order to run bwa we first need to index the reference genome: ::
+
+    bwa index ref.fa
+
+**Output files**: ``ref.fa.sa``, ``ref.fa.amb``, ``ref.fa.ann``, ``ref.fa.pac``, and ``ref.fa.bwt``.
+
+We will need to index the reads as well: ::
+
+    nanopolish index -d fast5_files/ reads.fasta
+
+**Output files**: ``reads.fasta.index``, ``reads.fasta.index.fai``, ``reads.fasta.index.gzi``, and ``reads.fasta.index.readdb``.   
+
+Then we can align the reads to the reference: ::
+
+    bwa mem -x ont2d ref.fa reads.fasta | samtools sort -o reads-ref.sorted.bam -T reads.tmp
+    samtools index reads-ref.sorted.bam
+
+**Output files**: ``reads-ref.sorted.bam`` and ``reads-ref.sorted.bam.bai``.
+
+**Checkpoint**: Let's see if the bam file is not truncated. This will check that the beginning of the file contains a valid header, and checks if the EOF is present. This will exit with a non-zero exit code if the conditions were not met: ::
+
+    samtools quickcheck reads-ref.sorted.bam
+ 
+Align the nanopore events to a reference
+-----------------------------------------------
+
+Now we are ready to run nanopolish to align the events to the reference genome: ::
+
+    nanopolish eventalign \
+        --reads reads.fasta \
+        --bam reads-ref.sorted.bam \
+        --genome ref.fa \
+        --scale-events > reads-ref.eventalign.txt
+
+Assess the eventalign output
+-----------------------------------------------
+
+If we take a peek at the first few lines of ``reads-ref.eventalign.txt`` this is what we get: ::
+
+	contig    position  reference_kmer  read_index  strand  event_index  event_level_mean  event_stdv  event_length  model_kmer  model_mean  model_stdv  standardized_level
+	gi|545778205|gb|U00096.3|:c514859-514401  3         ATGGAG          0           t       16538        89.82             3.746       0.00100       CTCCAT      92.53       2.49        -0.88
+	gi|545778205|gb|U00096.3|:c514859-514401  3         ATGGAG          0           t       16537        88.89             2.185       0.00100       CTCCAT      92.53       2.49        -1.18
+	gi|545778205|gb|U00096.3|:c514859-514401  3         ATGGAG          0           t       16536        94.96             2.441       0.00125       CTCCAT      92.53       2.49        0.79
+	gi|545778205|gb|U00096.3|:c514859-514401  3         ATGGAG          0           t       16535        81.63             2.760       0.00150       NNNNNN      0.00        0.00        inf
+	gi|545778205|gb|U00096.3|:c514859-514401  7         AGTTAA          0           t       16534        78.96             2.278       0.00075       TTAACT      75.55       3.52        0.79
+	gi|545778205|gb|U00096.3|:c514859-514401  8         GTTAAT          0           t       16533        98.81             4.001       0.00100       ATTAAC      95.87       3.30        0.72
+	gi|545778205|gb|U00096.3|:c514859-514401  9         TTAATG          0           t       16532        96.92             1.506       0.00150       CATTAA      95.43       3.32        0.36
+	gi|545778205|gb|U00096.3|:c514859-514401  10        TAATGG          0           t       16531        70.86             0.402       0.00100       CCATTA      68.99       3.70        0.41
+	gi|545778205|gb|U00096.3|:c514859-514401  11        AATGGT          0           t       16530        91.24             4.256       0.00175       ACCATT      85.84       2.74        1.60
+
+Example plots
+-------------
+
+In `Figure 1 of our methylation detection paper <https://www.nature.com/articles/nmeth.4184>`_ we show a histogram of ``event_level_mean`` for a selection of k-mers to demonstrate how methylation changes the observed current. The data for these figures was generated by eventalign, which we subsequently plotted in R using ggplot2.
diff --git a/scripts/calculate_methylation_frequency.py b/scripts/calculate_methylation_frequency.py
old mode 100644
new mode 100755
index 0be0870..b6b6371
--- a/scripts/calculate_methylation_frequency.py
+++ b/scripts/calculate_methylation_frequency.py
@@ -9,6 +9,10 @@ from collections import namedtuple
 def make_key(c, s, e):
     return c + ":" + str(s) + ":" + str(e)
 
+def split_key(k):
+    f = k.split(":")
+    return (f[0], int(f[1]), int(f[2]))
+
 class SiteStats:
     def __init__(self, g_size, g_seq):
         self.num_reads = 0
@@ -75,9 +79,10 @@ for record in csv_reader:
 # header
 print("\t".join(["chromosome", "start", "end", "num_cpgs_in_group", "called_sites", "called_sites_methylated", "methylated_frequency", "group_sequence"]))
 
-for key in sites:
+sorted_keys = sorted(sites.keys(), key = lambda x: split_key(x))
+
+for key in sorted_keys:
     if sites[key].called_sites > 0:
         (c, s, e) = key.split(":")
         f = float(sites[key].called_sites_methylated) / sites[key].called_sites
-        print("\t".join([str(x) for x in [c, s, e, sites[key].group_size, sites[key].called_sites, sites[key].called_sites_methylated, f, sites[key].sequence]]))
-
+        print("%s\t%s\t%s\t%d\t%d\t%d\t%.3f\t%s" % (c, s, e, sites[key].group_size, sites[key].called_sites, sites[key].called_sites_methylated, f, sites[key].sequence))
diff --git a/scripts/extract_reads_aligned_to_region.py b/scripts/extract_reads_aligned_to_region.py
new file mode 100755
index 0000000..9a66e56
--- /dev/null
+++ b/scripts/extract_reads_aligned_to_region.py
@@ -0,0 +1,309 @@
+#!/usr/bin/env python
+'''
+========================================================
+Extract info on reads that align to a given region 
+in draft genome assembly.
+========================================================
+'''
+try:
+	from Bio import SeqIO
+	import pysam
+	import argparse
+	import subprocess
+	import tarfile
+	import gzip
+	import sys,os
+except ImportError:
+	print('Missing package(s)')
+	quit()
+
+verbose = False
+log = list()
+
+def main():
+	# --------------------------------------------------------
+	# PART 0: 	Parse input
+	# --------------------------------------------------------
+	parser = argparse.ArgumentParser(description='Extract and package reads within region')
+	parser.add_argument('-v', '--verbose', action="store_true", default=False, required=False, dest="verbose", help="Use for verbose output with info on progress.")
+	parser.add_argument('-b', '--bam', action="store", required=True, dest="bam", help="Sorted bam file created by aligning reads to the draft genome (refer to reads.sorted.bam in Nanopolish README).")
+	parser.add_argument('-r', '--reads', action="store", dest="fa_filename", help="Fasta, fastq, fasta.gz, or fastq.gz file (refer to reads.fa in Nanopolish README)")
+	parser.add_argument('-g', '--genome',  action="store", required=True, dest="draft_ga", help="Draft genome assembly (refer to draft.fa in Nanopolish README).")
+	parser.add_argument('-w', '--window', action="store", required=True, dest="draft_ga_coords", help="Draft genome assembly coordinates wrapped in quotes ex. \"tig000001:10000-20000\".")
+	parser.add_argument('-o', '--output_prefix', action="store", required=False, default="reads_subset", dest="output_prefix", help="Output prefix for tar.gz file and log file.")
+	args = parser.parse_args()
+	
+	# Check to see if user used verbose option
+	global verbose
+	if args.verbose:
+		verbose = True
+
+	# Infer readdb file from fasta/q file
+	readdb = args.fa_filename + ".index.readdb"
+
+	custom_print( "===================================================" )
+	custom_print( "Extract reads that align to given region" )
+	custom_print( "Package all necessary files to reproduce error" )
+	custom_print( "===================================================" )
+
+	# --------------------------------------------------------
+	# PART 1: 	Validate input
+	# --------------------------------------------------------
+	custom_print( "[ Input ]" )
+	custom_print( "[+] Extracting from draft genome assembly coords: " + args.draft_ga_coords )
+	custom_print( "[+] BAM file (reads.fa aligned to draft.fa): " + args.bam )
+	custom_print( "[+] Readdb file: " + readdb )
+	custom_print( "[+] Draft genome assembly (draft.fa): " + args.draft_ga )
+	custom_print( "[+] FASTA/Q file (reads.fa): " + args.fa_filename )
+	custom_print( "[+] Output prefix: " + args.output_prefix ) 
+
+	custom_print( "[ Input check ]" )
+	files = list()
+	files.append(args.bam)
+	files.append(readdb)
+	files.append(args.fa_filename)
+	files.append(args.draft_ga)
+	draft_ga_fai = args.draft_ga + ".fai"
+	files.append(draft_ga_fai)
+
+	for i in files:
+		if not os.path.exists(i) or not os.path.getsize(i) > 0 or not os.access(i, os.R_OK):
+			print( "Expecting " + i + ". But does not exist, is empty or is not readable." )
+			sys.exit(1)
+
+	custom_print( "[ Validated input ] All input files exist, are not-empty, and are readable." )
+
+	# --------------------------------------------------------
+	# PART 2: 	Reassign input argument values	
+	# --------------------------------------------------------
+	# o = old/original, ga = genome assembly, fa = fasta/q file
+	# coords = coordinates, op = output
+	o_bam = args.bam
+	o_readdb = readdb
+	o_fa = args.fa_filename
+	op = args.output_prefix
+	draft_ga_coords = args.draft_ga_coords
+
+	# --------------------------------------------------------
+	# PART 3: 	With user input ref coords, extract all 
+	#		aligned reads within these coordinates, 
+	#		store read_ids, and fast5 files.
+	# --------------------------------------------------------
+	custom_print( "[ Extracting info on reads aligned to region ] \t" + draft_ga_coords )
+	samfile = pysam.AlignmentFile(o_bam, "rb")
+	region_read_ids = list()
+	region_num_reads = 0
+
+	# get all read ids of reads that are aligned to region in draft assembly
+	for read in samfile.fetch(region=draft_ga_coords):
+		id = read.query_name
+		# add to list if not already in list
+		if not id in region_read_ids:
+			# store read id in list
+			region_read_ids.append(id)
+			# count number of reads that were aligned to the given region
+			region_num_reads+=1
+
+	# --------------------------------------------------------
+	# PART 4:   Parse readdb file and find path to fast5 files
+	# 		associated with each read that aligned to region
+	# --------------------------------------------------------
+	# readdb file has 2 columns: one indicating read_id and another indicating the fast5 file the read came from
+	# each row represents a read
+	custom_print( "[ Reading readdb file ]" )
+	region_fast5_files = dict()
+	with open (o_readdb, "r") as file:
+		for line in file:
+			l = line.split("\t")
+			read_id = l.pop(0)
+			if read_id in region_read_ids:
+				fast5_file = l.pop(0)
+				region_fast5_files[str(read_id)] = fast5_file.rstrip()
+
+	# --------------------------------------------------------
+	# PART 5:   Make a region BAM and BAI file
+	# --------------------------------------------------------
+	new_bam = "reads.bam"
+	custom_print( "[ Writing to a new BAM file ] \t" + new_bam )
+	region_reads = pysam.view("-b", o_bam, draft_ga_coords, "-o", new_bam, catch_stdout=False)
+	
+	new_bam_index = new_bam + ".bai"
+	custom_print( "[ Writing to a new BAI file ] \t" + new_bam_index )
+	pysam.index(new_bam, new_bam_index)
+
+	# --------------------------------------------------------
+	# PART 6: 	With user input ref coords, extract all 
+	#		aligned	reads within these coordinates 
+	#		and make new FASTA file
+	# --------------------------------------------------------
+	# detect type of sequences file then handle accordingly
+	file_type = detect_fa_filetype(o_fa)
+	new_fa = "reads.fasta"
+	custom_print( "[ Writing to a new fasta file ]\t" +  new_fa )
+	with open (new_fa, "w") as fout:
+		if ".gz" in file_type:
+			with gzip.open(o_fa, "rt") as fin:
+				if "fasta.gz" in file_type:
+					for record in SeqIO.parse(fin, "fasta"):
+						if record.id in region_read_ids:
+							fout.write(">" + record.id + "\n")
+							fout.write(str(record.seq) + "\n")
+				elif "fastq.gz" in file_type:
+					for record in SeqIO.parse(fin, "fastq"):
+						if record.id in region_read_ids:
+							fout.write(">" + record.id + "\n")
+							fout.write(str(record.seq) + "\n")
+		else:
+			with open(o_fa, "rt") as fin:
+				if "fasta" in file_type:
+					for record in SeqIO.parse(fin, "fasta"):
+						if record.id in region_read_ids:
+							fout.write(">" + record.id + "\n")
+							fout.write(str(record.seq) + "\n")
+				elif "fastq" in file_type:
+					for record in SeqIO.parse(fin, "fastq"):
+						if record.id in region_read_ids:
+							fout.write(">" + record.id + "\n")
+							fout.write(str(record.seq) + "\n")
+
+	# --------------------------------------------------------
+	# PART 7: 	Let's get to tarring
+	# --------------------------------------------------------
+	# While tarring, we need to fix the directory structure
+	# such that the original path to files are not saved.
+	# For each fast5 file we need to extract the basename,
+	# and save it in tar such that we save only the basename,
+	# and not the whole path from the original source.
+	tar_filename = op + ".tar.gz"
+	archive = tarfile.open(tar_filename, "w:gz")
+	custom_print( "[ Creating a tar.gz file ] \t" + tar_filename )
+	custom_print( "[+] FAST5 files: " + op + "/fast5_files/<FAST5 file(s)>" )
+	for r in region_fast5_files.keys():
+		read_id = r
+		f5 = region_fast5_files[r]
+
+		# get basename of fast5 file
+		f5_basename = extract_basename(f5)
+		an = op + "/fast5_files/" + f5_basename
+		archive.add(f5, arcname=an)
+
+	# --------------------------------------------------------
+	# PART 8:	Add new files to tar
+	# 			new fasta, new bam, and new bai with reads 
+	#			in the region given only
+	# --------------------------------------------------------
+	an = op + "/" + new_fa
+	archive.add(new_fa, arcname=an)
+	custom_print( "[+] New FASTA: " + an )
+	
+	an_new_bam = op + "/" + new_bam
+	archive.add(new_bam, arcname=an_new_bam)
+	custom_print( "[+] New BAM: " + an_new_bam )
+
+	an_new_bam_index = op + "/" + new_bam_index
+	archive.add(new_bam_index, arcname=an_new_bam_index)
+	custom_print( "[+] New BAI: " + an_new_bam_index )
+
+	# --------------------------------------------------------
+	# PART 9:	Add original draft genome assembly file
+	#			and the index file
+	# --------------------------------------------------------
+	an_draft_ga = op + "/draft.fa"
+	archive.add(args.draft_ga, arcname=an_draft_ga)
+	custom_print( "[+] Original draft ga: " + an_draft_ga )
+
+	an_draft_ga_fai = op + "/draft.fa.fai"
+	archive.add(draft_ga_fai, arcname=an_draft_ga_fai)
+	custom_print( "[+] Original draft ga index: " + an_draft_ga_fai )
+
+	# --------------------------------------------------------
+	# PART 10: 	Check the number of reads in all new files
+	# --------------------------------------------------------
+	custom_print( "[ Output check ] " )
+	# check the length of bam file
+	num_reads_bam = region_num_reads
+	num_reads_fasta = int(float(file_length(new_fa))/2.0)
+	num_fast5_files = len(region_fast5_files)
+	values = list()
+	values.append(num_reads_bam)
+	values.append(num_reads_fasta)
+	custom_print( "[+] Num reads in new BAM: \t" + str(num_reads_bam) )
+	custom_print( "[+] Num reads in new FASTA: \t" + str(num_reads_fasta) )
+	custom_print( "[+] Num files in fast5_files/: \t" + str(num_fast5_files))
+	if not all( v == num_fast5_files for v in values ):
+		print( "[!] WARNING: The number of reads in the new bam, new fasta, and num of fast5 files tarred are not equal..." )
+	else:
+		custom_print( "[ Validated output ] Number of reads in the new bam, new fasta, and num of fast5 files tarred are equal!" )
+
+	# --------------------------------------------------------
+	# FINAL: 	Output log if verbose flag not used
+	# --------------------------------------------------------
+	global log
+	logfile = op + ".log"
+	with open (logfile, "w") as lfile:
+		for s in log:
+			lfile.write(s + "\n")
+	an_logfile = op + "/" + logfile
+	custom_print( "[ Log file ] " +  an_logfile )
+	custom_print( "[ Tar file ] " + str(tar_filename) )
+	custom_print( "[ Finished ] " )
+	archive.add(logfile, arcname=an_logfile)
+	archive.close()
+
+def file_length(filename):
+	# ========================================================
+	# Returns number of lines in a file
+	# --------------------------------------------------------
+	# Input: 	Filename
+	# Output: 	Number of lines in the file ...
+	# ========================================================
+	with open(filename) as f:
+		for i, l in enumerate(f):
+			pass
+	return int(i) + 1
+
+def extract_basename(filename):
+	# ========================================================
+	# Returns base filename
+	# --------------------------------------------------------
+	# Input: 	Filenames with paths
+	# Output: 	Base filename
+	# ========================================================
+	# remove backslashes at the end of the file names that could return empty basenames..
+	a = filename.rstrip("\\")
+	a = a.rstrip("//")
+	b = os.path.basename(a)
+	return str(b)
+
+def detect_fa_filetype(fa_filename):
+	# ========================================================
+	# Detects filetype of sequences input
+	# --------------------------------------------------------
+	# Input: 	FASTA/Q filename
+	# Output: 	Either ['fa.gz', 'fastq.gz', 'fasta.gz', 
+	# 			'fastq', 'fasta']
+	# ========================================================
+	path = fa_filename
+	if path.endswith('fa.gz'):
+		print("Possibly using the reads file generated by nanopolish index? Use original reads file...")	
+	for ext in ['fastq.gz', 'fasta.gz', 'fastq', 'fasta']:
+		if path.endswith(ext):
+			return ext
+	print( "Must be either fasta, fastq, fasta.gz, fastq.gz" )
+	sys.exit(1)
+
+def custom_print(s):
+	# ========================================================
+	# Depending on verbose flag, will save all prints to 
+	# log list, or will print to stdout
+	# --------------------------------------------------------
+	# Input: 	string to print
+	# ========================================================
+	global verbose
+	global log
+	if verbose:
+		print(s)
+	log.append(s)
+
+if __name__ == "__main__":
+	main()
diff --git a/scripts/nanopolish_merge.py b/scripts/nanopolish_merge.py
index e08bb79..ea059a3 100644
--- a/scripts/nanopolish_merge.py
+++ b/scripts/nanopolish_merge.py
@@ -25,7 +25,7 @@ def merge_into_consensus(consensus, incoming, overlap_length):
 
     assert(len(aln_con) == len(aln_inc))
 
-    for i in xrange(0, len(aln_con) / 2):
+    for i in range(0, len(aln_con) // 2):
         a = aln_con[i]
         b = aln_inc[i]
 
@@ -74,24 +74,29 @@ for fn in sys.argv[1:]:
             segments_by_name[contig] = dict()
 
         segment_start, segment_end = segment_range.split("-")
-
-        sys.stderr.write('Insert %s %s\n' % (contig, segment_start))
         segments_by_name[contig][int(segment_start)] = str(rec.seq)
 
 # Assemble while making sure every segment is present
 for contig_name in sorted(segments_by_name.keys()):
     assembly = ""
     prev_segment = None
+    all_segments_found = True
     for segment_start in sorted(segments_by_name[contig_name]):
 
-        sys.stderr.write('Merging %s %d\n' % (contig_name, segment_start))
         # Ensure the segments overlap
-        assert(prev_segment is None or prev_segment + SEGMENT_LENGTH + OVERLAP_LENGTH > segment_start)
+        if not( prev_segment is None or prev_segment + SEGMENT_LENGTH + OVERLAP_LENGTH > segment_start ):
+            sys.stderr.write("error: segment starting at %s:%d is missing\n" % (contig, prev_segment + SEGMENT_LENGTH + 40))
+            all_segments_found = False
 
         sequence = segments_by_name[contig_name][segment_start]
 
         assembly = merge_into_consensus(assembly, sequence, OVERLAP_LENGTH)
         prev_segment = segment_start
 
+
     # Write final assembly
-    print(">%s\n%s" % (contig_name, assembly))
+    if all_segments_found:
+        print(">%s\n%s" % (contig_name, assembly))
+    else:
+        sys.stderr.write("error: some segments are missing, could not merge contig %s\n" % (contig))
+        sys.exit(1)
diff --git a/src/alignment/nanopolish_alignment_db.cpp b/src/alignment/nanopolish_alignment_db.cpp
index ec8e6ce..357aed1 100644
--- a/src/alignment/nanopolish_alignment_db.cpp
+++ b/src/alignment/nanopolish_alignment_db.cpp
@@ -73,6 +73,7 @@ EventAlignmentRecord::EventAlignmentRecord(SquiggleRead* sr,
         size_t kmer_pos_ref_strand = seq_record.aligned_bases[i].read_pos;
         size_t kmer_pos_read_strand = seq_record.rc ? this->sr->flip_k_strand(kmer_pos_ref_strand, k) : kmer_pos_ref_strand;
         size_t event_idx = this->sr->get_closest_event_to(kmer_pos_read_strand, strand_idx);
+        assert(event_idx != -1);
         this->aligned_events.push_back( { seq_record.aligned_bases[i].ref_pos, (int)event_idx });
     }
     this->rc = strand_idx == 0 ? seq_record.rc : !seq_record.rc;
diff --git a/src/alignment/nanopolish_eventalign.cpp b/src/alignment/nanopolish_eventalign.cpp
index 1d7446d..b7eb14a 100644
--- a/src/alignment/nanopolish_eventalign.cpp
+++ b/src/alignment/nanopolish_eventalign.cpp
@@ -31,11 +31,15 @@
 #include "nanopolish_read_db.h"
 #include "nanopolish_hmm_input_sequence.h"
 #include "nanopolish_pore_model_set.h"
+#include "nanopolish_bam_processor.h"
 #include "H5pubconf.h"
 #include "profiler.h"
 #include "progress.h"
 
 //
+using namespace std::placeholders;
+
+//
 // Getopt
 //
 #define SUBPROGRAM "eventalign"
@@ -508,11 +512,11 @@ EventalignSummary summarize_alignment(const SquiggleRead& sr,
 }
 
 // Realign the read in event space
-void realign_read(EventalignWriter writer,
-                  const ReadDB& read_db, 
-                  const faidx_t* fai, 
-                  const bam_hdr_t* hdr, 
-                  const bam1_t* record, 
+void realign_read(const ReadDB& read_db,
+                  const faidx_t* fai,
+                  const EventalignWriter& writer,
+                  const bam_hdr_t* hdr,
+                  const bam1_t* record,
                   size_t read_idx,
                   int region_start,
                   int region_end)
@@ -524,12 +528,12 @@ void realign_read(EventalignWriter writer,
     SquiggleRead sr(read_name, read_db, opt::write_samples ? SRF_LOAD_RAW_SAMPLES : 0);
 
     if(opt::verbose > 1) {
-        fprintf(stderr, "Realigning %s [%zu %zu]\n", 
+        fprintf(stderr, "Realigning %s [%zu %zu]\n",
                 read_name.c_str(), sr.events[0].size(), sr.events[1].size());
     }
-    
+
     for(int strand_idx = 0; strand_idx < 2; ++strand_idx) {
-        
+
         // Do not align this strand if it was not sequenced
         if(!sr.has_events_for_strand(strand_idx)) {
             continue;
@@ -603,8 +607,10 @@ std::vector<EventAlignment> align_read_to_ref(const EventAlignmentParameters& pa
     ref_seq = pore_model->pmalphabet->disambiguate(ref_seq);
     std::string rc_ref_seq = pore_model->pmalphabet->reverse_complement(ref_seq);
 
-    if(ref_offset == 0)
+    // Skip unmapped
+    if((params.record->core.flag & BAM_FUNMAP) != 0) {
         return alignment_output;
+    }
 
     // Get the read-to-reference aligned segments
     std::vector<AlignedSegment> aligned_segments = get_aligned_segments(params.record);
@@ -620,8 +626,9 @@ std::vector<EventAlignment> align_read_to_ref(const EventAlignmentParameters& pa
         int max_kmer_idx = params.sr->read_sequence.size() - k;
         trim_aligned_pairs_to_kmer(aligned_pairs, max_kmer_idx);
 
-        if(aligned_pairs.empty())
+        if(aligned_pairs.empty()) {
             return alignment_output;
+        }
 
         bool do_base_rc = bam_is_rev(params.record);
         bool rc_flags[2] = { do_base_rc, !do_base_rc }; // indexed by strand
@@ -672,8 +679,9 @@ std::vector<EventAlignment> align_read_to_ref(const EventAlignmentParameters& pa
             HMMInputSequence hmm_sequence(fwd_subseq, rc_subseq, pore_model->pmalphabet);
             
             // Require a minimum amount of sequence to align to
-            if(hmm_sequence.length() < 2 * k)
+            if(hmm_sequence.length() < 2 * k) {
                 break;
+            }
 
             // Set up HMM input
             HMMInputData input;
@@ -862,42 +870,10 @@ int eventalign_main(int argc, char** argv)
 
     ReadDB read_db;
     read_db.load(opt::reads_file);
-    
-    // Open the BAM and iterate over reads
-
-    // load bam file
-    htsFile* bam_fh = sam_open(opt::bam_file.c_str(), "r");
-    assert(bam_fh != NULL);
 
-    // load bam index file
-    std::string index_filename = opt::bam_file + ".bai";
-    hts_idx_t* bam_idx = bam_index_load(index_filename.c_str());
-    if(bam_idx == NULL) {
-        bam_index_error_exit(opt::bam_file);
-    }
-
-    // read the bam header
-    bam_hdr_t* hdr = sam_hdr_read(bam_fh);
-    
     // load reference fai file
     faidx_t *fai = fai_load(opt::genome_file.c_str());
 
-    hts_itr_t* itr;
-
-    // If processing a region of the genome, only emit events aligned to this window
-    int clip_start = -1;
-    int clip_end = -1;
-
-    if(opt::region.empty()) {
-        // TODO: is this valid?
-        itr = sam_itr_queryi(bam_idx, HTS_IDX_START, 0, 0);
-    } else {
-
-        fprintf(stderr, "Region: %s\n", opt::region.c_str());
-        itr = sam_itr_querys(bam_idx, hdr, opt::region.c_str());
-        hts_parse_reg(opt::region.c_str(), &clip_start, &clip_end);
-    }
-
 #ifndef H5_HAVE_THREADSAFE
     if(opt::num_threads > 1) {
         fprintf(stderr, "You enabled multi-threading but you do not have a threadsafe HDF5\n");
@@ -909,72 +885,30 @@ int eventalign_main(int argc, char** argv)
     // Initialize output
     EventalignWriter writer = { NULL, NULL, NULL };
 
-    if(opt::output_sam) {
-        writer.sam_fp = hts_open("-", "w");
-        emit_sam_header(writer.sam_fp, hdr);
-    } else {
-        writer.tsv_fp = stdout;
-        emit_tsv_header(writer.tsv_fp);
-    }
-
     if(!opt::summary_file.empty()) {
         writer.summary_fp = fopen(opt::summary_file.c_str(), "w");
         // header
         fprintf(writer.summary_fp, "read_index\tread_name\tfast5_path\tmodel_name\tstrand\tnum_events\t");
         fprintf(writer.summary_fp, "num_steps\tnum_skips\tnum_stays\ttotal_duration\tshift\tscale\tdrift\tvar\n");
     }
-    
-    // Initialize iteration
-    std::vector<bam1_t*> records(opt::batch_size, NULL);
-    for(size_t i = 0; i < records.size(); ++i) {
-        records[i] = bam_init1();
-    }
 
-    int result;
-    size_t num_reads_realigned = 0;
-    size_t num_records_buffered = 0;
-    Progress progress("[eventalign]");
+    // the BamProcessor framework calls the input function with the
+    // bam record, read index, etc passed as parameters
+    // bind the other parameters the worker function needs here
+    auto f = std::bind(realign_read, std::ref(read_db), std::ref(fai), std::ref(writer), _1, _2, _3, _4, _5);
+    BamProcessor processor(opt::bam_file, opt::region, opt::num_threads);
 
-    do {
-        assert(num_records_buffered < records.size());
-        
-        // read a record into the next slot in the buffer
-        result = sam_itr_next(bam_fh, itr, records[num_records_buffered]);
-        
-        num_records_buffered += result >= 0;
-        // realign if we've hit the max buffer size or reached the end of file
-        if(num_records_buffered == records.size() || result < 0) {
-            #pragma omp parallel for            
-            for(size_t i = 0; i < num_records_buffered; ++i) {
-                bam1_t* record = records[i];
-                size_t read_idx = num_reads_realigned + i;
-                if( (record->core.flag & BAM_FUNMAP) == 0) {
-                    realign_read(writer, read_db, fai, hdr, record, read_idx, clip_start, clip_end);
-                }
-            }
-
-            num_reads_realigned += num_records_buffered;
-            num_records_buffered = 0;
-        }
-
-        if(opt::progress) {
-            fprintf(stderr, "Realigned %zu reads in %.1lfs\r", num_reads_realigned, progress.get_elapsed_seconds());
-        }
-    } while(result >= 0);
- 
-    assert(num_records_buffered == 0);
-
-    // cleanup records
-    for(size_t i = 0; i < records.size(); ++i) {
-        bam_destroy1(records[i]);
+    // Copy the bam header to std
+    if(opt::output_sam) {
+        writer.sam_fp = hts_open("-", "w");
+        emit_sam_header(writer.sam_fp, processor.get_bam_header());
+    } else {
+        writer.tsv_fp = stdout;
+        emit_tsv_header(writer.tsv_fp);
     }
 
-    // cleanup
-    sam_itr_destroy(itr);
-    bam_hdr_destroy(hdr);
-    fai_destroy(fai);
-    sam_close(bam_fh);
-    hts_idx_destroy(bam_idx);
+    // run
+    processor.parallel_run(f);
 
     if(writer.sam_fp != NULL) {
         hts_close(writer.sam_fp);
@@ -983,5 +917,7 @@ int eventalign_main(int argc, char** argv)
     if(writer.summary_fp != NULL) {
         fclose(writer.summary_fp);
     }
+
+    fai_destroy(fai);
     return EXIT_SUCCESS;
 }
diff --git a/src/common/nanopolish_bam_processor.cpp b/src/common/nanopolish_bam_processor.cpp
index cbec00e..d851c8f 100644
--- a/src/common/nanopolish_bam_processor.cpp
+++ b/src/common/nanopolish_bam_processor.cpp
@@ -27,8 +27,7 @@ BamProcessor::BamProcessor(const std::string& bam_file,
     assert(m_bam_fh != NULL);
 
     // load bam index file
-    std::string index_filename = m_bam_file + ".bai";
-    m_bam_idx = bam_index_load(index_filename.c_str());
+    m_bam_idx = sam_index_load(m_bam_fh, bam_file.c_str());
     if(m_bam_idx == NULL) {
         bam_index_error_exit(m_bam_file);
     }
diff --git a/src/common/nanopolish_common.h b/src/common/nanopolish_common.h
index d782230..feb20ed 100644
--- a/src/common/nanopolish_common.h
+++ b/src/common/nanopolish_common.h
@@ -18,7 +18,7 @@
 #include "logsum.h"
 
 #define PACKAGE_NAME "nanopolish"
-#define PACKAGE_VERSION "0.8.4"
+#define PACKAGE_VERSION "0.8.5"
 #define PACKAGE_BUGREPORT "https://github.com/jts/nanopolish/issues"
 
 //
diff --git a/src/main/nanopolish.cpp b/src/main/nanopolish.cpp
index f164d73..5dc9fb9 100644
--- a/src/main/nanopolish.cpp
+++ b/src/main/nanopolish.cpp
@@ -62,6 +62,9 @@ int print_version(int, char **)
 
 int main(int argc, char** argv)
 {
+    // Turn off HDF's exception printing, which is generally unhelpful for users
+    hdf5::H5Eset_auto(0, NULL, NULL);
+
     int ret = 0;
     if(argc <= 1) {
         printf("error: no command provided\n");
@@ -76,15 +79,17 @@ int main(int argc, char** argv)
             ret = print_usage( argc - 1, argv + 1);
     }
 
+
     // Emit a warning when some reads had to be skipped
     extern int g_total_reads;
     extern int g_unparseable_reads;
     extern int g_qc_fail_reads;
     extern int g_failed_calibration_reads;
     extern int g_failed_alignment_reads;
+    extern int g_bad_fast5_file;
     if(g_total_reads > 0) {
-        fprintf(stderr, "[post-run summary] total reads: %d unparseable: %d qc fail: %d could not calibrate: %d no alignment: %d\n", 
-            g_total_reads, g_unparseable_reads, g_qc_fail_reads, g_failed_calibration_reads, g_failed_alignment_reads);
+        fprintf(stderr, "[post-run summary] total reads: %d, unparseable: %d, qc fail: %d, could not calibrate: %d, no alignment: %d, bad fast5: %d\n", 
+            g_total_reads, g_unparseable_reads, g_qc_fail_reads, g_failed_calibration_reads, g_failed_alignment_reads, g_bad_fast5_file);
     }
     return ret;
 }
diff --git a/src/nanopolish_call_variants.cpp b/src/nanopolish_call_variants.cpp
index 763265c..36214f8 100644
--- a/src/nanopolish_call_variants.cpp
+++ b/src/nanopolish_call_variants.cpp
@@ -84,7 +84,7 @@ static const char *CONSENSUS_USAGE_MESSAGE =
 "  -e, --event-bam=FILE                 the events aligned to the reference genome are in bam FILE\n"
 "  -g, --genome=FILE                    the reference genome is in FILE\n"
 "  -p, --ploidy=NUM                     the ploidy level of the sequenced genome\n"
-"  -q  --methylation-aware=STR          turn on methylation aware polishing and test motifs given in STR (example: -m dcm,dam)\n"
+"  -q  --methylation-aware=STR          turn on methylation aware polishing and test motifs given in STR (example: -q dcm,dam)\n"
 "      --genotype=FILE                  call genotypes for the variants in the vcf FILE\n"
 "  -o, --outfile=FILE                   write result to FILE [default: stdout]\n"
 "  -t, --threads=NUM                    use NUM threads (default: 1)\n"
@@ -283,14 +283,14 @@ std::vector<Variant> generate_candidate_single_base_edits(const AlignmentDB& ali
 
     // Add all positively-scoring single-base changes into the candidate set
     for(size_t i = region_start; i < region_end; ++i) {
-        
+
         int calling_start = i - opt::screen_flanking_sequence;
         int calling_end = i + 1 + opt::screen_flanking_sequence;
 
         if(!alignments.are_coordinates_valid(contig, calling_start, calling_end)) {
             continue;
         }
-        
+
         std::vector<Variant> tmp_variants;
         for(size_t j = 0; j < 4; ++j) {
             // Substitutions
diff --git a/src/nanopolish_extract.cpp b/src/nanopolish_extract.cpp
index e565865..2d4b1be 100644
--- a/src/nanopolish_extract.cpp
+++ b/src/nanopolish_extract.cpp
@@ -58,11 +58,18 @@ namespace opt
 }
 static std::ostream* os_p;
 
-std::vector< std::pair< unsigned, std::string > >
+struct ExtractionGroup
+{
+    fast5::Basecall_Group_Description bcd;
+    unsigned int strand_idx;
+    std::string basecall_group;
+};
+
+std::vector<ExtractionGroup>
 get_preferred_basecall_groups(const fast5::File& f)
 {
     bool have_2d = false;
-    std::vector< std::pair< unsigned, std::string > > res;
+    std::vector<ExtractionGroup> res;
     // check 2d
     if (opt::read_type == "any"
         or opt::read_type == "2d"
@@ -92,7 +99,7 @@ get_preferred_basecall_groups(const fast5::File& f)
                 and f.have_basecall_events(1, gr))
             {
                 have_2d = true;
-                res.push_back(std::make_pair(2, gr));
+                res.push_back( { bcd, 2, gr } );
                 if (opt::read_type != "any")
                 {
                     break;
@@ -100,6 +107,7 @@ get_preferred_basecall_groups(const fast5::File& f)
             }
         }
     }
+
     // check 1d
     for (unsigned st = 0; st < 2; ++st)
     {
@@ -131,7 +139,8 @@ get_preferred_basecall_groups(const fast5::File& f)
                 if (f.have_basecall_fastq(st, gr)
                     and f.have_basecall_events(st, gr))
                 {
-                    res.push_back(std::make_pair(st, gr));
+                    res.push_back( {bcd, st, gr} );
+
                     if (opt::read_type != "any")
                     {
                         break;
@@ -140,11 +149,12 @@ get_preferred_basecall_groups(const fast5::File& f)
             }
         }
     }
+
     LOG(debug)
         << "preferred_groups: "
         << alg::os_join(res, ", ", [] (const decltype(res)::value_type& p) {
                 std::ostringstream oss;
-                oss << "(" << p.first << ":" << p.second << ")";
+                oss << "(" << p.strand_idx << ":" << p.basecall_group << ")";
                 return oss.str();
             })
         << "\n";
@@ -175,28 +185,39 @@ void process_file(const std::string& fn)
                 LOG(info) << "file [" << fn << "]: no basecalling data suitable for nanoplish\n";
                 return;
             }
+
             ++opt::total_files_used_count;
             const auto& p = l.front();
             // get and parse fastq
-            auto fq = f.get_basecall_fastq(p.first, p.second);
+            auto fq = f.get_basecall_fastq(p.strand_idx, p.basecall_group);
             auto fq_a = f.split_fq(fq);
+                
+            SemVer basecaller_version = parse_semver_string(p.bcd.version);
+            bool is_albacore_2_read = p.bcd.name == "albacore" && basecaller_version.major == 2;
+
             // construct name
             std::string name;
             std::istringstream(fq_a[0]) >> name;
-            std::replace(name.begin(), name.end(), ':', '_');
-            name += ":" + p.second + ":";
-            if (p.first == 0)
-            {
-                name += "template";
-            }
-            else if (p.first == 1)
-            {
-                name += "complement";
-            }
-            else
-            {
-                name += "2d";
-            }
+            
+            // For older basecalled data append the group information to the read name
+            // For albacore 2 data this isn't needed as nanopolish works directly from the raw
+            if(!is_albacore_2_read) {
+                std::replace(name.begin(), name.end(), ':', '_');
+                name += ":" + p.basecall_group + ":";
+                if (p.strand_idx == 0)
+                {
+                    name += "template";
+                }
+                else if (p.strand_idx == 1)
+                {
+                    name += "complement";
+                }
+                else
+                {
+                    name += "2d";
+                }
+            } 
+
             if (not opt::fastq)
             {
                 (*os_p)
@@ -388,12 +409,15 @@ int extract_main(int argc, char** argv)
     }
 
     // Build the ReadDB from the output file
-    ReadDB read_db;
-    read_db.build(opt::output_file);
-    read_db.save();
 
     std::clog << "[extract] found " << opt::total_files_count
               << " files, extracted " << opt::total_files_used_count
               << " reads\n";
+    
+    if(opt::total_files_used_count > 0) {
+        ReadDB read_db;
+        read_db.build(opt::output_file);
+        read_db.save();
+    } 
     return 0;
 }
diff --git a/src/nanopolish_index.cpp b/src/nanopolish_index.cpp
index 6ddc7ec..fe91dbd 100644
--- a/src/nanopolish_index.cpp
+++ b/src/nanopolish_index.cpp
@@ -54,11 +54,17 @@ static std::ostream* os_p;
 void index_file(ReadDB& read_db, const std::string& fn)
 {
     PROFILE_FUNC("index_file")
-    fast5::File* fp = new fast5::File(fn);
-    if(fp->is_open()) {
-        fast5::Raw_Samples_Params params = fp->get_raw_samples_params();
-        std::string read_id = params.read_id;
-        read_db.add_signal_path(read_id, fn);
+
+    fast5::File* fp = NULL;
+    try {
+        fp = new fast5::File(fn);
+        if(fp->is_open()) {
+            fast5::Raw_Samples_Params params = fp->get_raw_samples_params();
+            std::string read_id = params.read_id;
+            read_db.add_signal_path(read_id, fn);
+        }
+    } catch(hdf5_tools::Exception e) {
+        fprintf(stderr, "skipping invalid fast5 file: %s\n", fn.c_str());
     }
     delete fp;
 } // process_file
@@ -78,7 +84,7 @@ void index_path(ReadDB& read_db, const std::string& path)
                 // recurse
                 index_path(read_db, full_fn);
                 read_db.print_stats();
-            } else if (full_fn.find(".fast5") != -1 && fast5::File::is_valid_file(full_fn)) {
+            } else if (full_fn.find(".fast5") != -1) {
                 index_file(read_db, full_fn);
             }
         }
diff --git a/src/nanopolish_raw_loader.cpp b/src/nanopolish_raw_loader.cpp
index 9ed0076..0461a55 100644
--- a/src/nanopolish_raw_loader.cpp
+++ b/src/nanopolish_raw_loader.cpp
@@ -84,6 +84,7 @@ std::vector<AlignedPair> adaptive_banded_simple_event_align(SquiggleRead& read,
  
     // qc
     double min_average_log_emission = -5.0;
+    int max_gap_threshold = 50;
 
     // banding
     int bandwidth = 100;
@@ -315,6 +316,9 @@ std::vector<AlignedPair> adaptive_banded_simple_event_align(SquiggleRead& read,
 #ifdef DEBUG_ADAPTIVE
     fprintf(stderr, "[adaback] ei: %d ki: %d s: %.2f\n", curr_event_idx, curr_kmer_idx, max_score);
 #endif
+
+    int curr_gap = 0;
+    int max_gap = 0;
     while(curr_kmer_idx >= 0 && curr_event_idx >= 0) {
         
         // emit alignment
@@ -335,10 +339,14 @@ std::vector<AlignedPair> adaptive_banded_simple_event_align(SquiggleRead& read,
         if(from == FROM_D) {
             curr_kmer_idx -= 1;
             curr_event_idx -= 1;
+            curr_gap = 0;
         } else if(from == FROM_U) {
             curr_event_idx -= 1;
+            curr_gap = 0;
         } else {
             curr_kmer_idx -= 1;
+            curr_gap += 1;
+            max_gap = std::max(curr_gap, max_gap);
         }   
     }
     std::reverse(out.begin(), out.end());
@@ -348,12 +356,12 @@ std::vector<AlignedPair> adaptive_banded_simple_event_align(SquiggleRead& read,
     bool spanned = out.front().ref_pos == 0 && out.back().ref_pos == n_kmers - 1;
     
     bool failed = false;
-    if(avg_log_emission < min_average_log_emission || !spanned) {
+    if(avg_log_emission < min_average_log_emission || !spanned || max_gap > max_gap_threshold) {
         failed = true;
         out.clear();
     }
 
-    //fprintf(stderr, "ada\t%s\t%s\t%.2lf\t%zu\t%.2lf\t%d\t%d\n", read.read_name.substr(0, 6).c_str(), failed ? "FAILED" : "OK", events_per_kmer, sequence.size(), avg_log_emission, curr_event_idx, fills);
+    //fprintf(stderr, "ada\t%s\t%s\t%.2lf\t%zu\t%.2lf\t%d\t%d\t%d\n", read.read_name.substr(0, 6).c_str(), failed ? "FAILED" : "OK", events_per_kmer, sequence.size(), avg_log_emission, curr_event_idx, max_gap, fills);
     return out;
 }
 
diff --git a/src/nanopolish_read_db.cpp b/src/nanopolish_read_db.cpp
index a064aee..d085b4d 100644
--- a/src/nanopolish_read_db.cpp
+++ b/src/nanopolish_read_db.cpp
@@ -112,11 +112,22 @@ void ReadDB::import_reads(const std::string& input_filename, const std::string&
 
     // Open writers
     FILE* write_fp = fopen(out_fasta_filename.c_str(), "w");
+    if(write_fp == NULL) {
+        fprintf(stderr, "error: could not open %s for write\n", out_fasta_filename.c_str());
+        exit(EXIT_FAILURE);
+    }
+
     BGZF* bgzf_write_fp = bgzf_dopen(fileno(write_fp), "w");
+    if(bgzf_write_fp == NULL) {
+        fprintf(stderr, "error: could not open %s for bgzipped write\n", out_fasta_filename.c_str());
+        exit(EXIT_FAILURE);
+    }
 
     // read input sequences, add to DB and convert to fasta
+    int ret = 0;
     kseq_t* seq = kseq_init(gz_read_fp);
-    while(kseq_read(seq) >= 0) {
+    while((ret = kseq_read(seq)) >= 0) {
+
         // Check for a path to the fast5 file in the comment of the read
         std::string path = "";
         if(seq->comment.l > 0) {
@@ -157,6 +168,12 @@ void ReadDB::import_reads(const std::string& input_filename, const std::string&
         }
     }
 
+    // check for abnormal exit conditions
+    if(ret <= -2) {
+        fprintf(stderr, "kseq_read returned %d indicating an error with the input file %s\n", ret, input_filename.c_str());
+        exit(EXIT_FAILURE);
+    }
+
     // cleanup
     kseq_destroy(seq);
     
diff --git a/src/nanopolish_squiggle_read.cpp b/src/nanopolish_squiggle_read.cpp
index 9378466..f09cc1d 100644
--- a/src/nanopolish_squiggle_read.cpp
+++ b/src/nanopolish_squiggle_read.cpp
@@ -31,6 +31,7 @@ int g_unparseable_reads = 0;
 int g_qc_fail_reads = 0;
 int g_failed_calibration_reads = 0;
 int g_failed_alignment_reads = 0;
+int g_bad_fast5_file = 0;
 
 const double MIN_CALIBRATION_VAR = 2.5;
 
@@ -73,28 +74,39 @@ SquiggleRead::SquiggleRead(const std::string& name, const ReadDB& read_db, const
     this->events_per_base[0] = events_per_base[1] = 0.0f;
     this->base_model[0] = this->base_model[1] = NULL;
     this->fast5_path = read_db.get_signal_path(this->read_name);
+    if(this->fast5_path == "") {
+        return;
+    }
 
     #pragma omp critical(sr_load_fast5)
     {
-        this->f_p = new fast5::File(fast5_path);
-        assert(this->f_p->is_open());
-
-        // Try to detect whether this read is DNA or RNA
-        this->nucleotide_type = SRNT_DNA;
-        if(this->f_p->have_context_tags_params()) {
-            fast5::Context_Tags_Params context_tags = this->f_p->get_context_tags_params();
-            std::string experiment_type = context_tags["experiment_type"];
-            if(experiment_type == "rna") {
-                this->nucleotide_type = SRNT_RNA;
+        // If for some reason the fast5s become unreadable in between indexing
+        // and executing the fast5::File constructor may throw. For example
+        // see issue #273. We catch here and skip the read in such cases
+        try {
+            this->f_p = new fast5::File(fast5_path);
+            assert(this->f_p->is_open());
+
+            // Try to detect whether this read is DNA or RNA
+            this->nucleotide_type = SRNT_DNA;
+            if(this->f_p->have_context_tags_params()) {
+                fast5::Context_Tags_Params context_tags = this->f_p->get_context_tags_params();
+                std::string experiment_type = context_tags["experiment_type"];
+                if(experiment_type == "rna") {
+                    this->nucleotide_type = SRNT_RNA;
+                }
             }
-        }
 
-        bool is_event_read = is_extract_read_name(this->read_name);
-        if(this->nucleotide_type == SRNT_DNA && is_event_read) {
-            load_from_events(flags);
-        } else {
-            this->read_sequence = read_db.get_read_sequence(read_name);
-            load_from_raw(flags);
+            bool is_event_read = is_extract_read_name(this->read_name);
+            if(this->nucleotide_type == SRNT_DNA && is_event_read) {
+                load_from_events(flags);
+            } else {
+                this->read_sequence = read_db.get_read_sequence(read_name);
+                load_from_raw(flags);
+            }
+        } catch(hdf5_tools::Exception e) {
+            fprintf(stderr, "[warning] fast5 file is unreadable and will be skipped: %s\n", fast5_path.c_str());
+            g_bad_fast5_file += 1;
         }
 
         delete this->f_p;

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/nanopolish.git



More information about the debian-med-commit mailing list