[med-svn] [poretools] 01/02: Imported Upstream version 0.5.1

Afif Elghraoui afif-guest at moszumanska.debian.org
Fri Nov 13 08:36:44 UTC 2015


This is an automated email from the git hooks/post-receive script.

afif-guest pushed a commit to branch master
in repository poretools.

commit b6b68bfffd00b54988fec6aa914fb24bc4111871
Author: Afif Elghraoui <afif at ghraoui.name>
Date:   Thu Nov 12 23:43:07 2015 -0800

    Imported Upstream version 0.5.1
---
 Dockerfile                            |  40 +++
 MANIFEST.in                           |   1 +
 README.md                             |  23 ++
 dist/poretools-0.3.0.win-amd64.exe    | Bin 0 -> 284988 bytes
 dist/poretools-0.3.1.win-amd64.exe    | Bin 0 -> 285930 bytes
 dist/poretools-0.5.0.win-amd64.exe    | Bin 0 -> 288186 bytes
 dist/poretools.reg                    |   8 +
 docs/Makefile                         | 130 +++++++
 docs/conf.py                          | 246 +++++++++++++
 docs/content/_images/foo.fast5.png    | Bin 0 -> 388201 bytes
 docs/content/_images/hist.png         | Bin 0 -> 18385 bytes
 docs/content/_images/occupancy.png    | Bin 0 -> 118395 bytes
 docs/content/_images/yield.bp.png     | Bin 0 -> 212336 bytes
 docs/content/_images/yield.reads.png  | Bin 0 -> 204363 bytes
 docs/content/examples.rst             | 309 ++++++++++++++++
 docs/content/help.rst                 |  36 ++
 docs/content/installation.rst         | 215 +++++++++++
 docs/content/notebook.rst             |   7 +
 docs/index.rst                        |  61 ++++
 poretools/Event.py                    |  32 ++
 poretools/Fast5File.py                | 586 ++++++++++++++++++++++++++++++
 poretools/Fast5File_pytables.py       | 504 ++++++++++++++++++++++++++
 poretools/__init__.py                 |   5 +
 poretools/combine.py                  |  30 ++
 poretools/events.py                   |  20 ++
 poretools/fasta.py                    |  46 +++
 poretools/fastq.py                    |  33 ++
 poretools/formats.py                  |  23 ++
 poretools/hist.py                     |  75 ++++
 poretools/ipynb/test_run_report.ipynb | 650 ++++++++++++++++++++++++++++++++++
 poretools/nucdist.py                  |  19 +
 poretools/occupancy.py                | 169 +++++++++
 poretools/poretools_main.py           | 407 +++++++++++++++++++++
 poretools/qualdist.py                 |  19 +
 poretools/readstats.py                |  27 ++
 poretools/scripts/__init__.py         |   0
 poretools/scripts/poretools           |   5 +
 poretools/scripts/poretools-script.py |   5 +
 poretools/scripts/poretools.bat       |   4 +
 poretools/squiggle.py                 | 107 ++++++
 poretools/statistics.py               |  51 +++
 poretools/stats.py                    |  63 ++++
 poretools/tabular.py                  |  14 +
 poretools/times.py                    |  43 +++
 poretools/version.py                  |   2 +
 poretools/windows.py                  |  19 +
 poretools/winner.py                   |  25 ++
 poretools/yield_plot.py               | 125 +++++++
 requirements.txt                      |   2 +
 setup.py                              |  41 +++
 50 files changed, 4227 insertions(+)

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..6f9ebe0
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,40 @@
+###############################################
+# Dockerfile to build poretools container image
+# Based on Ubuntu 14.04
+# Build with:
+#   sudo docker build -t poretools .
+###############################################
+
+# Use ubuntu 14.04 base image
+FROM ubuntu:14.04
+
+# Non-interactive apt for build steps only; ARG is not persisted into the runtime environment
+ARG DEBIAN_FRONTEND=noninteractive
+
+############# BEGIN INSTALLATION ##############
+
+# Prepare to install R (CRAN signing key and repository), then install dependencies.
+# apt-get update and install share one layer so a cached package index can never go stale.
+RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E084DAB9 \
+ && echo 'deb http://cran.rstudio.com/bin/linux/ubuntu trusty/' >> /etc/apt/sources.list \
+ && apt-get update \
+ && apt-get -y install git python-tables python-setuptools python-pip python-dev cython libhdf5-serial-dev r-base python-rpy2 \
+ && rm -rf /var/lib/apt/lists/*
+
+# Upgrade numexpr
+RUN pip install numexpr --upgrade
+
+# Install R packages
+RUN Rscript -e 'options("repos" = c(CRAN = "http://cran.rstudio.com/")); install.packages("codetools"); install.packages("MASS"); install.packages("ggplot2")'
+
+# Install poretools
+RUN git clone https://github.com/arq5x/poretools /tmp/poretools
+RUN cd /tmp/poretools && python setup.py install
+
+############## INSTALLATION END ##############
+
+# Set entrypoint so container can be used as executable
+ENTRYPOINT ["poretools"]
+
+# File author/maintainer info (LABEL replaces the deprecated MAINTAINER instruction)
+LABEL maintainer="Stephen Turner <lastname at virginia dot edu>"
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..d54bfb5
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1 @@
+requirements.txt
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4bcd327
--- /dev/null
+++ b/README.md
@@ -0,0 +1,23 @@
+### poretools: a toolkit for working with nanopore sequencing data from Oxford Nanopore.
+
+*Nick Loman and Aaron Quinlan*
+
+**Note:** this software is in an alpha state; the code is changing rapidly and the API and CLI may change at any time.
+
+Complete installation instructions and usage examples can be found on the [poretools documentation site](http://poretools.readthedocs.org).
+
+Requirements
+===================
+- HDF5 >= 1.8.7 (http://www.hdfgroup.org/HDF5/)
+- R >= 3.0.0
+- Python >= 2.7
+- rpy2 >= 2.4.2
+- h5py >= 2.0
+
+
+
+
+
+
+[![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/arq5x/poretools/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
+
diff --git a/dist/poretools-0.3.0.win-amd64.exe b/dist/poretools-0.3.0.win-amd64.exe
new file mode 100644
index 0000000..2e23116
Binary files /dev/null and b/dist/poretools-0.3.0.win-amd64.exe differ
diff --git a/dist/poretools-0.3.1.win-amd64.exe b/dist/poretools-0.3.1.win-amd64.exe
new file mode 100644
index 0000000..e8fe847
Binary files /dev/null and b/dist/poretools-0.3.1.win-amd64.exe differ
diff --git a/dist/poretools-0.5.0.win-amd64.exe b/dist/poretools-0.5.0.win-amd64.exe
new file mode 100644
index 0000000..7affd6e
Binary files /dev/null and b/dist/poretools-0.5.0.win-amd64.exe differ
diff --git a/dist/poretools.reg b/dist/poretools.reg
new file mode 100644
index 0000000..88887f1
--- /dev/null
+++ b/dist/poretools.reg
@@ -0,0 +1,8 @@
+Windows Registry Editor Version 5.00
+[HKEY_LOCAL_MACHINE\Software\Python]
+[HKEY_LOCAL_MACHINE\Software\Python\Pythoncore]
+[HKEY_LOCAL_MACHINE\Software\Python\Pythoncore\2.7]
+[HKEY_LOCAL_MACHINE\Software\Python\Pythoncore\2.7\InstallPath]
+ @="C:\\Anaconda"
+[HKEY_LOCAL_MACHINE\Software\Python\Pythoncore\2.7\PythonPath]
+ @="C:\\Anaconda;C:\\Anaconda\\Lib\\;C:\\Anaconda\\DLLs\\"
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..071223b
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,130 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = _build
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest
+
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and a HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+	-rm -rf $(BUILDDIR)/*
+
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:  # build Qt help sources; the hints below use the basename Sphinx derives from the project name
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/poretools.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/poretools.qhc"
+
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/track"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/track"
+	@echo "# devhelp"
+
+epub:
+	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make' in that directory to run these through (pdf)latex" \
+	      "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:  # build LaTeX sources, then run the generated Makefile with the same make program and flags
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through pdflatex..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..54d2d1d
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,246 @@
+# -*- coding: utf-8 -*-
+#
+# This file is execfile()d with the current directory set to its containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+sys.path.insert(0, os.path.abspath('../'))
+
+#from poretools import __version__ as version
+version = '0.5.0'
+# -- General configuration -----------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.intersphinx', 'sphinx.ext.todo',
+              'sphinx.ext.coverage', 'sphinx.ext.pngmath', 'sphinx.ext.ifconfig', 'sphinx.ext.viewcode']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+# source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'poretools'
+copyright = u'2014'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = version
+# The full version, including alpha/beta/rc tags.
+release = version
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+# language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+# today = ''
+# Else, today_fmt is used as the format for a strftime call.
+# today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+# default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+# add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+# add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+# show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+# modindex_common_prefix = []
+
+
+# -- Options for HTML output ---------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+html_theme = 'default'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+# html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+html_theme_path = ["themes"]
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+html_short_title = project + " v" + release
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = 'gemini.png'
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = ''
+#html_style = ''
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+#html_static_path = ['_static']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+# html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {
+#    'index': ['sidebar-intro.html', 'sourcelink.html', 'searchbox.html']
+#}
+
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+# html_additional_pages = {}
+
+# If false, no module index is generated.
+# html_domain_indices = True
+
+# If false, no index is generated.
+# html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+# html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+# html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+html_show_sphinx = False
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+# html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+# html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+# html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'poretools-docs'
+
+# Google analytics
+# googleanalytics_id = "UA-24167610-15"
+
+# -- Options for LaTeX output --------------------------------------------------
+
+# The paper size ('letter' or 'a4').
+# latex_paper_size = 'letter'
+
+# The font size ('10pt', '11pt' or '12pt').
+# latex_font_size = '10pt'
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+    ('index', 'poretools.tex', u'poretools Documentation', u'Nick Loman and Aaron Quinlan', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+# latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+# latex_use_parts = False
+
+# If true, show page references after internal links.
+# latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+# latex_show_urls = False
+
+# Additional stuff for the LaTeX preamble.
+# latex_preamble = ''
+
+# Documents to append as an appendix to all manuals.
+# latex_appendices = []
+
+# If false, no module index is generated.
+# latex_domain_indices = True
+
+
+# -- Options for manual page output --------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section); name is the page's file name.
+man_pages = [
+    ('index', 'poretools', u'poretools Documentation', [u'Nick Loman and Aaron Quinlan'], 1)
+]
+
+
+# Example configuration for intersphinx: refer to the Python standard library.
+intersphinx_mapping = {'http://docs.python.org/': None}
+
+
+class Mock(object):  # placeholder object; the loop after this class registers one in sys.modules per MOCK_MODULES name
+    def __init__(self, *args, **kwargs):  # accepts and ignores any constructor arguments
+        pass
+
+    def __call__(self, *args, **kwargs):  # calling a Mock returns a fresh Mock
+        return Mock()
+
+    @classmethod
+    def __getattr__(cls, name):  # fallback attribute lookup; the result depends only on the attribute name
+        if name in ('__file__', '__path__'):
+            return '/dev/null'
+        elif name[0] == name[0].upper():  # first character equals its upper-case form (also true for '_' and digits)
+            return type(name, (), {})  # a new, empty class named after the attribute
+        else:
+            return Mock()
+
+MOCK_MODULES = []
+for mod_name in MOCK_MODULES:
+    sys.modules[mod_name] = Mock()
diff --git a/docs/content/_images/foo.fast5.png b/docs/content/_images/foo.fast5.png
new file mode 100644
index 0000000..b71fd24
Binary files /dev/null and b/docs/content/_images/foo.fast5.png differ
diff --git a/docs/content/_images/hist.png b/docs/content/_images/hist.png
new file mode 100644
index 0000000..443c3bd
Binary files /dev/null and b/docs/content/_images/hist.png differ
diff --git a/docs/content/_images/occupancy.png b/docs/content/_images/occupancy.png
new file mode 100644
index 0000000..35885cb
Binary files /dev/null and b/docs/content/_images/occupancy.png differ
diff --git a/docs/content/_images/yield.bp.png b/docs/content/_images/yield.bp.png
new file mode 100644
index 0000000..9ec6219
Binary files /dev/null and b/docs/content/_images/yield.bp.png differ
diff --git a/docs/content/_images/yield.reads.png b/docs/content/_images/yield.reads.png
new file mode 100644
index 0000000..9d84cb7
Binary files /dev/null and b/docs/content/_images/yield.reads.png differ
diff --git a/docs/content/examples.rst b/docs/content/examples.rst
new file mode 100644
index 0000000..04e0039
--- /dev/null
+++ b/docs/content/examples.rst
@@ -0,0 +1,309 @@
+###############
+Usage examples
+###############
+
+===================
+poretools ``fastq``
+===================
+Extract sequences in FASTQ format from a set of FAST5 files.
+
+.. code-block:: bash
+
+    poretools fastq fast5/*.fast5
+
+Or, if there are too many files for your OS to do the wildcard expansion, just provide a directory.
+``poretools`` will automatically find all of the FAST5 files in the directory.
+
+.. code-block:: bash
+
+    poretools fastq fast5/
+
+
+Extract sequences in FASTQ format from a set of FAST5 files.
+    
+.. code-block:: bash
+
+    poretools fastq fast5/
+    poretools fastq --min-length 5000 fast5/
+    poretools fastq --type all fast5/
+    poretools fastq --type fwd fast5/
+    poretools fastq --type rev fast5/
+    poretools fastq --type 2D fast5/
+    poretools fastq --type fwd,rev fast5/
+
+
+Only extract sequence with more complement events than template. These are the so-called "high quality 2D reads" and are the most accurate sequences from a 
+given run.
+
+.. code-block:: bash
+
+    poretools fastq --type 2D --high-quality fast5/
+
+===================
+poretools ``fasta``
+===================
+Extract sequences in FASTA format from a set of FAST5 files.
+
+.. code-block:: bash
+
+    poretools fasta fast5/
+    poretools fasta --min-length 5000 fast5/
+    poretools fasta --type all fast5/
+    poretools fasta --type fwd fast5/
+    poretools fasta --type rev fast5/
+    poretools fasta --type 2D fast5/
+    poretools fasta --type fwd,rev fast5/
+
+=====================
+poretools ``combine``
+=====================
+Create a tarball from a set of FAST5 (HDF5) files.
+
+.. code-block:: bash
+
+    # plain tar (recommended for speed)
+    poretools combine -o foo.fast5.tar fast5/*.fast5
+
+    # gzip
+    poretools combine -o foo.fast5.tar.gz fast5/*.fast5
+
+    # bzip2
+    poretools combine -o foo.fast5.tar.bz2 fast5/*.fast5
+
+========================
+poretools ``yield_plot``
+========================
+Create a collector's curve reflecting the sequencing yield over time for a set of reads. There are two types of plots. The first is the yield of reads over time:
+
+.. code-block:: bash
+
+    poretools yield_plot --plot-type reads fast5/
+
+The result should look something like:
+
+.. image:: _images/yield.reads.png
+    :width: 400pt
+    
+The second is the yield of base pairs over time:
+
+.. code-block:: bash
+
+    poretools yield_plot --plot-type basepairs fast5/
+
+The result should look something like:
+    
+.. image:: _images/yield.bp.png
+    :width: 400pt
+
+Of course, you can save to PDF or PNG with `--saveas`:
+
+.. code-block:: bash
+
+    poretools yield_plot \
+              --plot-type basepairs \
+              --saveas foo.pdf\
+              fast5/
+
+    poretools yield_plot \
+              --plot-type basepairs \
+              --saveas foo.png\
+              fast5/
+
+If you don't like the default aesthetics, try `--theme-bw`:
+
+.. code-block:: bash
+
+    poretools yield_plot --theme-bw fast5/
+
+
+======================
+poretools ``squiggle``
+======================
+Make a "squiggle" plot of the signal over time for a given read or set of reads
+
+.. code-block:: bash
+
+    poretools squiggle fast5/foo.fast5
+
+
+The result should look something like:
+
+.. image:: _images/foo.fast5.png
+    :width: 400pt
+
+If you don't like the default aesthetics, try `--theme-bw`:
+
+.. code-block:: bash
+
+    poretools squiggle --theme-bw fast5/
+
+
+Other options:
+
+.. code-block:: bash
+
+    # save as PNG
+    poretools squiggle --saveas png fast5/foo.fast5
+
+    # save as PDF
+    poretools squiggle --saveas pdf fast5/foo.fast5
+
+    # make a PNG for each FAST5 file in a directory
+    poretools squiggle --saveas png fast5/
+
+====================
+poretools ``winner``
+====================
+Report the longest read among a set of FAST5 files.
+
+.. code-block:: bash
+
+    poretools winner fast5/
+    poretools winner --type all fast5/
+    poretools winner --type fwd fast5/
+    poretools winner --type rev fast5/
+    poretools winner --type 2D fast5/
+    poretools winner --type fwd,rev fast5/
+
+===================
+poretools ``stats``
+===================
+Collect read size statistics from a set of FAST5 files.
+
+.. code-block:: bash
+
+    poretools stats fast5/
+    total reads 2286.000000
+    total base pairs    8983574.000000
+    mean    3929.822397
+    median  4011.500000
+    min 13.000000
+    max 6864.000000
+
+===================
+poretools ``hist``
+===================
+Plot a histogram of read sizes from a set of FAST5 files.
+
+.. code-block:: bash
+
+    poretools hist fast5/
+    poretools hist --min-length 1000 --max-length 10000 fast5/
+
+    poretools hist --num-bins 20 --max-length 10000 fast5/
+
+If you don't like the default aesthetics, try `--theme-bw`:
+
+.. code-block:: bash
+
+    poretools hist --theme-bw fast5/
+
+The result should look something like:
+
+.. image:: _images/hist.png
+    :width: 400pt    
+
+=====================
+poretools ``nucdist``
+=====================
+Look at the nucleotide composition of a set of FAST5 files.
+
+.. code-block:: bash
+ 
+    poretools nucdist fast5/
+    A   78287   335291  0.233489714904
+    C   75270   335291  0.224491561062
+    T   92575   335291  0.276103444471
+    G   84754   335291  0.252777438106
+    N   4405    335291  0.0131378414571
+
+======================
+poretools ``qualdist``
+======================
+Look at the quality score composition of a set of FAST5 files.
+
+.. code-block:: bash
+
+    poretools qualdist fast5/
+    !   0   83403   335291  0.248748102395
+    "   1   46151   335291  0.137644613187
+    #   2   47463   335291  0.141557632027
+    $   3   34471   335291  0.102809201559
+    %   4   24879   335291  0.0742012162569
+    &   5   20454   335291  0.0610037251224
+    '   6   16783   335291  0.0500550268274
+    (   7   13699   335291  0.0408570465655
+    )   8   11356   335291  0.0338690868529
+    *   9   9077    335291  0.0270720061081
+    +   10  6492    335291  0.0193622852984
+    ,   11  4891    335291  0.014587328619
+    -   12  3643    335291  0.0108651887465
+    .   13  2585    335291  0.00770972080968
+    /   14  1969    335291  0.0058725107444
+    0   15  1475    335291  0.00439916371152
+    1   16  1146    335291  0.00341792651756
+    2   17  902 335291  0.00269020045274
+    3   18  790 335291  0.00235616225905
+    4   19  619 335291  0.0018461575169
+    5   20  532 335291  0.00158668142002
+    6   21  440 335291  0.00131229290378
+    7   22  397 335291  0.00118404609727
+    8   23  379 335291  0.00113036138757
+    9   24  313 335291  0.000933517452004
+    :   25  327 335291  0.000975272226215
+    ;   26  138 335291  0.000411582774366
+    <   27  121 335291  0.000360880548538
+    =   28  96  335291  0.000286318451733
+    >   29  76  335291  0.000226668774289
+    ?   30  69  335291  0.000205791387183
+    @   31  61  335291  0.000181931516205
+    A   32  48  335291  0.000143159225866
+    B   33  23  335291  6.8597129061e-05
+    C   34  14  335291  4.17547742111e-05
+    D   35  6   335291  1.78949032333e-05
+    F   37  3   335291  8.94745161666e-06
+
+=====================
+poretools ``tabular``
+=====================
+Dump the length, name, seq, and qual of the sequence in one or a set of FAST5 files.
+
+.. code-block:: bash
+
+    poretools tabular foo.fast5 
+    length  name    sequence    quals
+    10    @channel_100_read_14_complement   GTCCCCAACAACAC    $%%'"$"%!)
+
+====================
+poretools ``events``
+====================
+Extract the raw nanopore events from each FAST5 file.
+
+.. code-block:: bash
+
+    poretools events burn-in-run-2 | head -5
+    file    strand  mean    start   stdv    length  model_state model_level move    p_model_state   mp_model_state  p_mp_model_state    p_A p_C p_G p_T raw_index
+    burn-in-run-2/ch100_file15_strand.fast5  template    56.4648513559   6595.744    1.62598948551   0.026   TGCAT   56.064011186    0   0.076552246287  TGCAT   0.076552246287  0.0980897489641 0.46074353628   0.320651683129  1.90528272165e-05   0
+    burn-in-run-2/ch100_file15_strand.fast5  template    53.2614042745   6595.77 1.12361695715   0.0262  GCATA   54.0674114279   1   0.162623875514  GCGAC   0.183337198021  0.437486003645  0.214306730736  0.335497877123  0.0103035924549 1
+    burn-in-run-2/ch100_file15_strand.fast5  template    51.0001271042   6595.7962   1.07380437991   0.1422  CATAG   52.1964606541   1   0.186606921109  CATAG   0.186606921109  0.424764995152  0.205766683286  0.0905615869544 0.277004168889  2
+    burn-in-run-2/ch100_file15_strand.fast5  template    49.6976788934   6595.9384   1.03634357984   0.0364  ATAGC   51.1117557194   1   0.181952967376  ATAGC   0.181952967376  0.296106771209  0.408638426765  0.0754069980523 0.217721405945  3
+    burn-in-run-2/ch100_file15_strand.fast5  template    51.7633085659   6595.9748   1.04743182078   0.0456  TAGCA   52.6955397413   1   0.192582310652  TAGCA   0.192582310652  0.250481934498  0.311756355221  0.311208716953  0.12343821687   4
+
+===================
+poretools ``times``
+===================
+Extract the start time of each detected molecule into tabular format.
+
+=======================
+poretools ``occupancy``
+=======================
+Plot the throughput performance of each pore on the flowcell during a given sequencing run.
+
+.. code-block:: bash
+
+    poretools occupancy fast5/
+
+The result should look something like:
+
+.. image:: _images/occupancy.png
+    :width: 400pt    
diff --git a/docs/content/help.rst b/docs/content/help.rst
new file mode 100644
index 0000000..00b161f
--- /dev/null
+++ b/docs/content/help.rst
@@ -0,0 +1,36 @@
+##########
+Options
+##########
+
+The following demonstrates the options available in ``poretools``.
+
+.. code-block:: bash
+
+    poretools --help
+    usage: poretools [-h] [-v]
+
+                     {combine,fastq,fasta,stats,hist,events,readstats,tabular,nucdist,qualdist,winner,wiggle,times}
+                     ...
+
+    optional arguments:
+      -h, --help            show this help message and exit
+      -v, --version         Installed poretools version
+
+    [sub-commands]:
+      {combine,fastq,fasta,stats,hist,events,readstats,tabular,nucdist,qualdist,winner,wiggle,times}
+        combine             Combine a set of FAST5 files in a TAR achive
+        fastq               Extract FASTQ sequences from a set of FAST5 files
+        fasta               Extract FASTA sequences from a set of FAST5 files
+        stats               Get read size stats for a set of FAST5 files
+        hist                Plot read size histogram for a set of FAST5 files
+        events              Extract each nanopore event for each read
+        readstats           Extract signal information for each read over time.
+        tabular             Extract the lengths and name/seq/quals from a set of
+                            FAST5 files in TAB delimited format
+        nucdist             Get the nucl. composition of a set of FAST5 files
+        qualdist            Get the qual score composition of a set of FAST5 files
+        winner              Get the longest read from a set of FAST5 files
+        squiggle            Plot the observed signals for FAST5 reads
+        times               Return the start times from a set of FAST5 files in
+                            tabular format
+        yield_plot          Plot the yield over time for a set of FAST5 files
\ No newline at end of file
diff --git a/docs/content/installation.rst b/docs/content/installation.rst
new file mode 100644
index 0000000..3970027
--- /dev/null
+++ b/docs/content/installation.rst
@@ -0,0 +1,215 @@
+############
+Installation
+############
+
+
+====================
+Basic Installation
+====================
+.. code-block:: bash
+
+	git clone https://github.com/arq5x/poretools
+	cd poretools
+
+Install as root:
+
+.. code-block:: bash
+
+	python setup.py install
+
+Install as a plain old user who has root access:
+
+.. code-block:: bash
+
+	sudo python setup.py install
+
+Install as a plain old user who lacks ``sudo`` privileges:
+
+.. code-block:: bash
+
+	# details: https://docs.python.org/2/install/index.html#alternate-installation-the-user-scheme
+	python setup.py install --user
+	
+	# now update your PATH such that it includes the directory to which poretools was just copied.
+	# look for a line in the installation log like: Installing poretools script to /home/arq5x/.local/bin
+        # in this case, I would either add that path to the PATH environment variable for the current session:
+        export PATH=$PATH:/home/arq5x/.local/bin
+        
+        # or, better yet add it to your .bashrc file.
+        # at this point you should be able to run the poretools executable from anywhere on your system.
+        poretools --help
+
+============================================
+Installing on Windows with MinKNOW installed
+============================================
+
+MinKNOW installs the Anaconda distribution of Python, which means that h5py is already installed.
+
+However, currently MinKNOW does not update the Windows registry to specify that Anaconda is the default version of Python, which makes installing packages tricky. To address this, some changes need to be made to the registry. This can be fixed by downloading the following file:
+
+	<https://raw.githubusercontent.com/arq5x/poretools/master/dist/poretools.reg>
+
+Ensure it is named 'poretools.reg' and then run it (by double-clicking). Windows will prompt you about making changes to the registry, which you should agree to.
+
+The only additional dependencies required are rpy2 and R.
+
+Download rpy2 from the pre-built binary page at: <http://www.lfd.uci.edu/~gohlke/pythonlibs/>. You want the version for Python 2.7 on 64-bit Windows. Run the installer.
+
+Then, to install poretools, simply download and run the Windows installer:
+
+        <https://github.com/arq5x/poretools/blob/master/dist/poretools-0.3.1.win-amd64.exe?raw=true>
+
+==================================
+Plotting with R on Windows
+==================================
+
+If you wish to use the R plots (experimental, on Windows) you also need to:
+
+Download R for Windows from: <http://cran.r-project.org/bin/windows/base/>
+
+Run the installer, then start up R and install ggplot2:
+
+.. code-block:: R
+
+	install.packages("ggplot2")
+
+You need to set two environment variables to run poretools currently:
+
+.. code-block:: bash
+
+	set R_HOME=c:\Program Files\R\R-3.1.1
+	set R_USER=c:\Users\MY USER\Documents
+
+You may also need to add the following directory to your PATH:
+
+.. code-block:: bash
+
+        C:\Program Files\R\R-3.1.1\bin\x64
+        
+Instructions for updating your PATH on Windows can be found here: http://geekswithblogs.net/renso/archive/2009/10/21/how-to-set-the-windows-path-in-windows-7.aspx
+
+=================================
+Installing on OS X
+=================================
+
+First, you should install a proper package manager for OS X. In our experience, `HomeBrew <http://brew.sh/>`_ works extremely well.
+
+To install HomeBrew, you run the following command (lifted from the HomeBrew site):
+
+.. code-block:: bash
+
+	ruby -e "$(curl -fsSL https://raw.github.com/Homebrew/homebrew/go/install)"
+
+Using HomeBrew, install HDF5 from the HomeBrew Science "tap":
+
+.. code-block:: bash
+	
+	brew tap homebrew/science 
+	brew install hdf5
+
+Now, you will need to install the R statistical analysis software (you may already have this...). The `CRAN <http://cran.r-project.org/bin/macosx/>`_ website houses automatic installation packages for different versions of OS X.  Here are links to such packages for `Snow Leopard and higher <http://cran.r-project.org/bin/macosx/R-3.1.1-snowleopard.pkg>`_ as well as `Mavericks <http://cran.r-project.org/bin/macosx/R-3.1.1-mavericks.pkg>`_.
+
+At this point, you can install poretools.
+
+.. code-block:: bash
+
+	git clone https://github.com/arq5x/poretools
+	cd poretools
+
+Install as an administrator of your machine:
+
+.. code-block:: bash
+
+	sudo python setup.py install
+
+Install as a plain old user who lacks ``sudo`` privileges:
+
+.. code-block:: bash
+
+	# details: https://docs.python.org/2/install/index.html#alternate-installation-the-user-scheme
+	python setup.py install --user
+
+=================================
+Installing dependencies on Ubuntu
+=================================
+
+Package dependencies
+
+.. code-block:: bash
+
+	sudo apt-get install git python-setuptools python-dev cython libhdf5-serial-dev
+
+Then install R 3.0; this requires a bit of hacking. You need to replace 'precise' with the appropriate version if you are on a different Ubuntu version; see <http://cran.r-project.org/bin/linux/ubuntu/README> for more details.
+
+.. code-block:: bash
+
+	sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E084DAB9
+
+Open in a text editor (as sudo) the file ``/etc/apt/sources.list`` and add the following line to the bottom, for Ubuntu 12.04:
+
+.. code-block:: bash
+
+	deb http://www.stats.bris.ac.uk/R/bin/linux/ubuntu precise/
+
+Or, for Ubuntu 14.04:
+
+.. code-block:: bash
+
+	deb http://www.stats.bris.ac.uk/R/bin/linux/ubuntu trusty/ 
+
+Then, run the following commands to install R 3.0:
+
+.. code-block:: bash
+
+	sudo apt-get update
+	sudo apt-get install r-base python-rpy2
+
+Start R
+
+.. code-block:: bash
+
+	R
+
+Then run the following commands within the R programme, and follow any prompts:
+
+.. code-block:: R
+
+	options("repos" = c(CRAN = "http://cran.rstudio.com/"))
+	install.packages("codetools")
+	install.packages("MASS")
+	install.packages("ggplot2")
+
+Then install poretools, finally:
+
+.. code-block:: bash
+
+	git clone https://github.com/arq5x/poretools
+	cd poretools
+	sudo python setup.py install
+	poretools
+
+============
+In the cloud
+============
+
+Amazon Web Services machine image ID: ami-4c0ec424
+
+==========
+Via docker
+==========
+
+Build the docker container yourself (preferred):
+
+.. code-block:: bash
+
+	git clone https://github.com/arq5x/poretools
+	cd poretools
+	docker build -t poretools .
+	docker run poretools --help
+
+Or use the pre-built `image from Docker Hub <https://registry.hub.docker.com/u/stephenturner/poretools/>`_: 
+
+.. code-block:: bash
+
+	docker pull stephenturner/poretools
+	docker run stephenturner/poretools --help
+
+To run the poretools container on data residing on the host machine, run ``docker run --help`` and look at the help for the ``-v`` option.
diff --git a/docs/content/notebook.rst b/docs/content/notebook.rst
new file mode 100644
index 0000000..dbfd4d4
--- /dev/null
+++ b/docs/content/notebook.rst
@@ -0,0 +1,7 @@
+################
+IPython Notebook
+################
+
+An IPython notebook demonstrating the functionality and output of ``poretools`` is available in the repository. 
+Use this link to view it via the ``nbviewer`` service: 
+<http://nbviewer.ipython.org/github/arq5x/poretools/blob/master/poretools/ipynb/test_run_report.ipynb>
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..65f5d19
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,61 @@
+==========================================================================================
+**poretools**: *a toolkit for working with nanopore sequencing data from Oxford Nanopore.*
+==========================================================================================
+The MinION (TM) from Oxford Nanopore Technologies (ONT) is the first nanopore sequencer to be 
+commercialised and is now available to early-access users. The MinION (TM) is a USB-connected, 
+portable nanopore sequencer which permits real-time analysis of streaming event data. 
+Currently, the research community lacks a standardized toolkit for the analysis of nanopore datasets.
+
+We have therefore developed ``poretools``, a flexible toolkit for exploring datasets generated by 
+nanopore sequencing devices from MinION for the purposes of quality control and downstream analysis. 
+``Poretools`` operates directly on the native FAST5 (a variant of the HDF5 standard) file format produced 
+by ONT and provides a wealth of format conversion utilities and data exploration and visualization tools. 
+
+A preprint of the ``poretools`` manuscript is available on bioRxiv: http://biorxiv.org/content/early/2014/07/23/007401
+
+Below are a few examples of common usage.
+
+1. Extract sequences in FASTQ format from a set of FAST5 files.
+    
+.. code-block:: bash
+
+    poretools fastq fast5/
+
+2. Make a collector's curve of the yield from a sequencing run.
+
+.. code-block:: bash
+
+    poretools yield_plot --plot-type reads fast5/
+
+3. Plot a histogram of read sizes from a set of FAST5 files.
+
+.. code-block:: bash
+
+    poretools hist fast5/
+
+=================
+Table of contents
+=================
+
+.. toctree::
+   :maxdepth: 3
+
+   content/installation
+   content/help
+   content/notebook
+   content/examples
+
+=================
+Requirements
+=================
+  - HDF5 >= 1.8.7 (http://www.hdfgroup.org/HDF5/)
+  - R >= 3.0.0
+  - Python >= 2.7
+  - rpy2 >= 2.4.2
+  - h5py >= 2.0.0
+
+.. note::
+    Please note that Anaconda and Python(x,y) already have all these dependencies installed, other than R/Rpy2:
+    Anaconda (Linux, Windows, OS X): https://store.continuum.io/cshop/anaconda/ Python(x,y) (Windows): https://code.google.com/p/pythonxy/
+
+
diff --git a/poretools/Event.py b/poretools/Event.py
new file mode 100644
index 0000000..877ad17
--- /dev/null
+++ b/poretools/Event.py
@@ -0,0 +1,32 @@
+class Event(object):
+	"""
+	Very basic class to represent a nanopore 
+	translocation event for a single pore
+	based upon data in the Events table of 
+	a Oxford Nanopore FAST5 (HDF5) file
+	"""
+	def __init__(self, row):
+		self.row = row
+		self.mean = row['mean']
+		self.start = row['start']
+		self.stdv = row['stdv']
+		self.length = row['length']
+		self.model_state = row['model_state']
+		self.model_level = row['model_level']
+		self.move = row['move']
+		self.p_model_state = row['p_model_state']
+		self.mp_state = row['mp_state']
+		self.p_mp_state = row['p_mp_state']
+		self.p_A = row['p_A']
+		self.p_C = row['p_C']
+		self.p_G = row['p_G']
+		self.p_T = row['p_T']
+
+	def __repr__(self):
+		return '\t'.join([str(s) for s in [self.mean, self.start, self.stdv,
+										   self.length, self.model_state,
+										   self.model_level, self.move,
+										   self.p_model_state, 
+										   self.mp_state, self.p_mp_state,
+										   self.p_A, self.p_C, 
+										   self.p_G, self.p_T]])
diff --git a/poretools/Fast5File.py b/poretools/Fast5File.py
new file mode 100644
index 0000000..72bfa41
--- /dev/null
+++ b/poretools/Fast5File.py
@@ -0,0 +1,586 @@
+import sys
+import os
+import glob
+import tarfile
+import shutil
+import h5py
+
+#logging
+import logging
+logger = logging.getLogger('poretools')
+
+
+# poretools imports
+import formats
+from Event import Event
+
+fastq_paths = {'template' : '/Analyses/Basecall_2D_000/BaseCalled_template',
+               'complement' : '/Analyses/Basecall_2D_000/BaseCalled_complement',
+               'twodirections' : '/Analyses/Basecall_2D_000/BaseCalled_2D'}
+
+FAST5SET_FILELIST = 0
+FAST5SET_DIRECTORY = 1
+FAST5SET_SINGLEFILE = 2
+FAST5SET_TARBALL = 3
+PORETOOLS_TMPDIR = '.poretools_tmp'
+
+class Fast5FileSet(object):
+
+	def __init__(self, fileset):
+		if isinstance(fileset, list):
+			self.fileset = fileset
+		elif isinstance(fileset, str):
+			self.fileset = [fileset]
+		self.set_type = None
+		self.num_files_in_set = None
+		self._extract_fast5_files()
+
+	def get_num_files(self):
+		"""
+		Return the number of files in the FAST5 set.
+		"""
+		if self.num_files_in_set is None and self.set_type == FAST5SET_TARBALL:
+			self.num_files_in_set = len(self.files)
+		return self.num_files_in_set
+
+	def __iter__(self):
+		return self
+
+	def next(self):
+		try:
+			return Fast5File(self.files.next())
+		except Exception as e:
+			# cleanup our mess
+			if self.set_type == FAST5SET_TARBALL:
+				shutil.rmtree(PORETOOLS_TMPDIR)
+			raise StopIteration
+
+	def _extract_fast5_files(self):
+
+		# return as-is if list of files
+		if len(self.fileset) > 1:
+			self.files = iter(self.fileset)
+			self.num_files_in_set = len(self.fileset)
+			self.set_type = FAST5SET_FILELIST
+		elif len(self.fileset) == 1:
+			# e.g. ['/path/to/dir'] or ['/path/to/file']
+			f = self.fileset[0]
+			# is it a directory?
+			if os.path.isdir(f):
+				pattern = f + '/' + '*.fast5'
+				files = glob.glob(pattern)
+				self.files = iter(files)
+				self.num_files_in_set = len(files)
+				self.set_type = FAST5SET_DIRECTORY
+				if not len(files):
+					logger.warning("Directory is empty!")
+
+			# is it a tarball?
+			elif tarfile.is_tarfile(f):
+				if os.path.isdir(PORETOOLS_TMPDIR):
+					shutil.rmtree(PORETOOLS_TMPDIR)
+				os.mkdir(PORETOOLS_TMPDIR)
+
+				self.files = TarballFileIterator(f)
+				# set to None to delay initialisation
+				self.num_files_in_set = None
+				self.set_type = FAST5SET_TARBALL
+
+			# just a single FAST5 file.
+			else:
+				self.files = iter([f])
+				self.num_files_in_set = 1
+				self.set_type = FAST5SET_SINGLEFILE
+		else:
+			logger.error("Directory %s could not be opened. Exiting.\n" % dir)
+			sys.exit()
+
+class TarballFileIterator:
+	def _fast5_filename_filter(self, filename):
+		return os.path.basename(filename).endswith('.fast5') and not os.path.basename(filename).startswith('.')
+
+	def __init__(self, tarball):
+		self._tarball = tarball
+		self._tarfile = tarfile.open(tarball)
+
+	def __del__(self):
+		self._tarfile.close()
+
+	def __iter__(self):
+		return self
+
+	def next(self):
+		while True:
+			tarinfo = self._tarfile.next()
+			if tarinfo is None:
+				raise StopIteration
+			elif self._fast5_filename_filter(tarinfo.name):
+				break
+		self._tarfile.extract(tarinfo, path=PORETOOLS_TMPDIR)
+		return os.path.join(PORETOOLS_TMPDIR, tarinfo.name)
+
+	def __len__(self):
+		with tarfile.open(self._tarball) as tar:
+			return len(tar.getnames())
+
+
+class Fast5File(object):
+
+	def __init__(self, filename):
+		self.filename = filename
+		self.is_open = self.open()
+
+		self.fastas = {}
+		self.fastqs = {}
+		
+		# pre-load the FASTQ data
+		#self._extract_fastqs_from_fast5()
+
+		# booleans for lazy loading (speed)
+		self.have_fastqs = False
+		self.have_fastas = False
+		self.have_templates = False
+		self.have_complements = False
+		self.have_metadata = False
+
+	####################################################################
+	# Public API methods
+	####################################################################
+
+	def open(self):
+		"""
+		Open an ONT Fast5 file, assuming HDF5 format
+		"""
+		try:
+			self.hdf5file = h5py.File(self.filename, 'r')
+			return True
+		except Exception, e:
+			logger.warning("Cannot open file: %s. Perhaps it is corrupt? Moving on.\n" % self.filename)
+			return False
+			
+	def close(self):
+		"""
+		Close an open an ONT Fast5 file, assuming HDF5 format
+		"""
+		if self.is_open:
+			self.hdf5file.close()
+
+	def has_2D(self):
+		"""
+		Return TRUE if the FAST5 has a 2D base-called sequence.
+		Return FALSE otherwise.
+		"""
+		if self.have_fastas is False:
+			self._extract_fastas_from_fast5()
+			self.have_fastas = True
+
+		if self.fastas.get('twodirections') is not None:
+			return True
+		return False
+
+	def get_fastqs(self, choice):
+		"""
+		Return the set of base called sequences in the FAST5
+		in FASTQ format.
+		"""
+		if self.have_fastqs is False:
+			self._extract_fastqs_from_fast5()
+			self.have_fastqs = True
+
+		# TODO "best". What is "best"?
+		fqs = []
+		if choice == "all":
+			for fastq in self.fastqs:
+				fqs.append(self.fastqs[fastq])
+		elif choice == "fwd":
+				fqs.append(self.fastqs.get('template'))
+		elif choice == "rev":
+				fqs.append(self.fastqs.get('complement'))
+		elif choice == "2D":
+				fqs.append(self.fastqs.get('twodirections'))
+		elif choice == "fwd,rev":
+				fqs.append(self.fastqs.get('template'))
+				fqs.append(self.fastqs.get('complement'))
+
+		return fqs
+
+
+	def get_fastas(self, choice):
+		"""
+		Return the set of base called sequences in the FAST5
+		in FASTQ format.
+		"""
+		if self.have_fastas is False:
+			self._extract_fastas_from_fast5()
+			self.have_fastas = True
+
+		# TODO "best". What is "best"?
+		fas = []
+		if choice == "all":
+			for fasta in self.fastas:
+				fas.append(self.fastas[fasta])
+		elif choice == "fwd":
+				fas.append(self.fastas.get('template'))
+		elif choice == "rev":
+				fas.append(self.fastas.get('complement'))
+		elif choice == "2D":
+				fas.append(self.fastas.get('twodirections'))
+		elif choice == "fwd,rev":
+				fas.append(self.fastas.get('template'))
+				fas.append(self.fastas.get('complement'))
+
+		return fas
+
+	def get_fastas_dict(self):
+                """
+                Return the set of base called sequences in the FAST5
+                in FASTQ format.
+                """
+                if self.have_fastas is False:
+                        self._extract_fastas_from_fast5()
+                        self.have_fastas = True
+
+		return self.fastas
+
+	def get_fastq(self):
+		"""
+		Return the base called sequence in the FAST5
+		in FASTQ format. Try 2D then template, then complement.
+		If all fail, return None
+		"""
+		if self.have_fastqs is False:
+			self._extract_fastqs_from_fast5()
+			self.have_fastqs = True
+
+		if not self.fastqs:
+			return None
+		elif self.fastqs.get('twodirections') is not None:
+			return self.fastqs.get('twodirections')
+		elif self.fastqs.get('template') is not None:
+			return self.fastqs.get('template')
+		elif self.fastqs.get('complement') is not None:
+			return self.fastqs.get('complement')
+
+
+	def get_fasta(self):
+		"""
+		Return the base called sequence in the FAST5
+		in FASTA format. Try 2D then template, then complement.
+		If all fail, return None
+		"""
+		if not self.fastas:
+			return None
+		elif self.fastas.get('twodirections') is not None:
+			return self.fastas.get('twodirections')
+		elif self.fastas.get('template') is not None:
+			return self.fastas.get('template')
+		elif self.fastas.get('complement') is not None:
+			return self.fastas.get('complement')
+
+	def get_template_events(self):
+		"""
+		Return the table of event data for the template strand
+		"""
+		if self.have_templates is False:
+			self._extract_template_events()
+			self.have_templates = True
+
+		return self.template_events
+
+	def get_complement_events(self):
+		"""
+		Return the table of event data for the complement strand
+		"""
+		if self.have_complements is False:
+			self._extract_complement_events()
+			self.have_complements = True
+		
+		return self.complement_events
+
+	####################################################################
+	# Flowcell Metadata methods
+	####################################################################
+
+	def get_exp_start_time(self):
+		"""
+		Return the starting time at which signals were collected
+		for the given read.
+		"""
+		if self.have_metadata is False:
+			self._get_metadata()
+			self.have_metadata = True
+
+		try:
+			return self.keyinfo['tracking_id'].attrs['exp_start_time']
+		except:
+			return None
+
+	def get_channel_number(self):
+		"""
+		Return the channel (pore) number at which signals were collected
+		for the given read.
+		"""
+		if self.have_metadata is False:
+			self._get_metadata()
+			self.have_metadata = True
+
+		try:
+			return self.keyinfo['channel_id'].attrs['channel_number']
+		except:
+			pass
+
+		try:
+			return self.keyinfo['read_id'].attrs['channel_number']
+		except:
+			return None
+
+	def find_read_number_block(self):
+		path = "/Analyses/Basecall_2D_000"
+		basecall = self.hdf5file[path]
+		path = basecall.get('InputEvents', getlink=True)
+
+		# the soft link target seems broken?
+		newpath = "/" + "/".join(path.path.split("/")[:-1])
+
+		node = self.hdf5file[newpath]
+
+		return node
+
+	def find_event_timing_block(self):
+		path = "/Analyses/Basecall_2D_000/BaseCalled_template"
+		try:
+			node = self.hdf5file[path]
+			path = node.get('Events')
+#, getlink=True)
+			return path
+		except Exception:
+			return None
+
+	def get_read_number(self):
+		"""
+		Return the read number for the pore representing the given read.
+		"""
+		node = self.find_read_number_block()
+		if node:
+			try:
+				return node.attrs['read_number']
+			except:
+				return None
+		return None
+
+	def get_duration(self):
+		node = self.find_event_timing_block()
+		if node:
+			return int(node.attrs['duration'])
+		return None
+
+	def get_start_time(self):
+		exp_start_time	= self.get_exp_start_time()
+	
+		node = self.find_event_timing_block()
+		if node:
+			return int(exp_start_time) + int(node.attrs['start_time'])
+	
+		return None
+
+	def get_end_time(self):
+		exp_start_time	= self.get_exp_start_time()
+		start_time = self.get_start_time()
+		duration = self.get_duration()
+
+		if start_time and duration:
+			return start_time + duration
+		else:
+			return None
+
+	def get_version_name(self):
+		"""
+		Return the flow cell version name.
+		"""
+		if self.have_metadata is False:
+			self._get_metadata()
+			self.have_metadata = True
+
+		try:
+			return self.keyinfo['tracking_id'].attrs['version_name']
+		except:
+			return None
+
+	def get_run_id(self):
+		"""
+		Return the run id.
+		"""
+		if self.have_metadata is False:
+			self._get_metadata()
+			self.have_metadata = True
+
+		try:
+			return self.keyinfo['tracking_id'].attrs['run_id']
+		except:
+			return None
+
+	def get_heatsink_temp(self):
+		"""
+		Return the heatsink temperature.
+		"""
+		if self.have_metadata is False:
+			self._get_metadata()
+			self.have_metadata = True
+
+		try:
+			return self.keyinfo['tracking_id'].attrs['heatsink_temp']
+		except:
+			return None
+
+	def get_asic_temp(self):
+		"""
+		Return the ASIC temperature.
+		"""
+		if self.have_metadata is False:
+			self._get_metadata()
+			self.have_metadata = True
+
+		try:
+			return self.keyinfo['tracking_id'].attrs['asic_temp']
+		except:
+			return None
+
+	def get_flowcell_id(self):
+		"""
+		Return the flowcell_id.
+		"""
+		if self.have_metadata is False:
+			self._get_metadata()
+			self.have_metadata = True
+
+		try:
+			return self.keyinfo['tracking_id'].attrs['flowcell_id']
+		except:
+			return None
+
+	def get_run_purpose(self):
+		"""
+		Return the exp_script_purpose.
+		"""
+		if self.have_metadata is False:
+			self._get_metadata()
+			self.have_metadata = True
+
+		try:
+			return self.keyinfo['tracking_id'].attrs['exp_script_purpose']
+		except:
+			return None
+
+	def get_asic_id(self):
+		"""
+		Return the flowcell's ASIC id.
+		"""
+		if self.have_metadata is False:
+			self._get_metadata()
+			self.have_metadata = True
+
+		try:
+			return self.keyinfo['tracking_id'].attrs['asic_id']
+		except:
+			return None
+
+		if self.have_metadata is False:
+			self._get_metadata()
+			self.have_metadata = True
+
+	def get_device_id(self):
+		"""
+		Return the flowcell's device id.
+		"""
+		try:
+			return self.keyinfo['tracking_id'].attrs['device_id']
+		except:
+			return None
+
+
+	def get_template_events_count(self):
+		"""
+		Pull out the event count for the template strand
+		"""
+		try:
+			table = self.hdf5file[fastq_paths['template']]
+			return len(table['Events'][()])
+		except Exception, e:
+			return 0
+
+	def get_complement_events_count(self):
+		"""
+		Pull out the event count for the complementary strand
+		"""
+		try:
+			table = self.hdf5file[fastq_paths['complement']]
+			return len(table['Events'][()])
+		except Exception, e:
+			return 0
+
+	def is_high_quality(self):
+		if self.get_complement_events_count() > \
+		   self.get_template_events_count():
+			return True
+		else:
+			return False
+
+	####################################################################
+	# Private API methods
+	####################################################################
+
+	def _extract_fastqs_from_fast5(self):
+		"""
+		Return the sequence in the FAST5 file in FASTQ format
+		"""
+		for id, h5path in fastq_paths.iteritems(): 
+			try:
+				table = self.hdf5file[h5path]
+				fq = formats.Fastq(table['Fastq'][()])
+				fq.name += "_" + id + ":" + self.filename
+				self.fastqs[id] = fq
+			except Exception, e:
+				pass
+
+	def _extract_fastas_from_fast5(self):
+		"""
+		Return the sequence in the FAST5 file in FASTA format
+		"""
+		for id, h5path in fastq_paths.iteritems(): 
+			try:
+				table = self.hdf5file[h5path]
+				fa = formats.Fasta(table['Fastq'][()])
+				fa.name += "_" + id + " " + self.filename
+				self.fastas[id] = fa
+			except Exception, e:
+				pass
+
+	def _extract_template_events(self):
+		"""
+		Pull out the event information for the template strand
+		"""
+		try:
+			table = self.hdf5file[fastq_paths['template']]
+			self.template_events = [Event(x) for x in table['Events'][()]]
+		except Exception, e:
+			self.template_events = []
+
+	def _extract_complement_events(self):
+		"""
+		Pull out the event information for the complementary strand
+		"""
+		try:
+			table = self.hdf5file[fastq_paths['complement']]
+			self.complement_events = [Event(x) for x in table['Events'][()]]
+		except Exception, e:
+			self.complement_events = []
+
+	def _get_metadata(self):
+		try:
+			self.keyinfo = self.hdf5file['/UniqueGlobalKey']
+		except Exception, e:
+			try:
+				self.keyinfo = self.hdf5file['/Key']
+			except Exception, e:
+				self.keyinfo = None
+				logger.warning("Cannot find keyinfo. Exiting.\n")
diff --git a/poretools/Fast5File_pytables.py b/poretools/Fast5File_pytables.py
new file mode 100644
index 0000000..3794bd3
--- /dev/null
+++ b/poretools/Fast5File_pytables.py
@@ -0,0 +1,504 @@
+import sys
+import os
+import glob
+import tarfile
+import shutil
+import tables as pyhdf5
+
+#logging
+import logging
+logger = logging.getLogger('poretools')
+
+
+# poretools imports
+import formats
+from Event import Event
+
+fastq_paths = {'template' : '/Analyses/Basecall_2D_000/BaseCalled_template',
+               'complement' : '/Analyses/Basecall_2D_000/BaseCalled_complement',
+               'twodirections' : '/Analyses/Basecall_2D_000/BaseCalled_2D'}
+
+FAST5SET_FILELIST = 0
+FAST5SET_DIRECTORY = 1
+FAST5SET_SINGLEFILE = 2
+FAST5SET_TARBALL = 3
+PORETOOOLS_TMPDIR = '.poretools_tmp'
+
+class Fast5FileSet(object):
+
+	def __init__(self, fileset):
+		if isinstance(fileset, list):
+			self.fileset = fileset
+		elif isinstance(fileset, str):
+			self.fileset = [fileset]
+		self.set_type = None
+		self.num_files_in_set = None
+		self._extract_fast5_files()
+
+	def get_num_files(self):
+		"""
+		Return the number of files in the FAST5 set.
+		"""
+		return self.num_files_in_set
+
+	def __iter__(self):
+		return self
+
+	def next(self):
+		try:
+			return Fast5File(self.files.next())
+		except Exception as e:
+			# cleanup our mess
+			if self.set_type ==	 FAST5SET_TARBALL:
+				shutil.rmtree(PORETOOOLS_TMPDIR)
+			raise StopIteration
+
+	def _extract_fast5_files(self):
+
+		# return as-is if list of files
+		if len(self.fileset) > 1:
+			self.files = iter(self.fileset)
+			self.num_files_in_set = len(self.fileset)
+			self.set_type = FAST5SET_FILELIST
+		elif len(self.fileset) == 1:
+			# e.g. ['/path/to/dir'] or ['/path/to/file']
+			f = self.fileset[0]
+			# is it a directory?
+			if os.path.isdir(f):
+				pattern = f + '/' + '*.fast5'
+				files = glob.glob(pattern)
+				self.files = iter(files)
+				self.num_files_in_set = len(files)
+				self.set_type = FAST5SET_DIRECTORY
+				if not len(files):
+					logger.warning("Directory is empty!")
+
+			# is it a tarball?
+			elif tarfile.is_tarfile(f):
+				if os.path.isdir(PORETOOOLS_TMPDIR):
+					shutil.rmtree(PORETOOOLS_TMPDIR)
+				os.mkdir(PORETOOOLS_TMPDIR)
+				
+				tar = tarfile.open(f)
+				tar.extractall(PORETOOOLS_TMPDIR)
+				self.files = (PORETOOOLS_TMPDIR + '/' + f for f in tar.getnames())
+				self.num_files_in_set = len(tar.getnames())
+				self.set_type = FAST5SET_TARBALL
+
+			# just a single FAST5 file.
+			else:
+				self.files = iter([f])
+				self.num_files_in_set = 1
+				self.set_type = FAST5SET_SINGLEFILE
+		else:
+			logger.error("Directory %s could not be opened. Exiting.\n" % dir)
+			sys.exit()
+
+
+class Fast5File(object):
+
+	def __init__(self, filename):
+		self.filename = filename
+		self.is_open = self.open()
+
+		self.fastas = {}
+		self.fastqs = {}
+		
+		# pre-load the FASTQ data
+		#self._extract_fastqs_from_fast5()
+
+		# booleans for lazy loading (speed)
+		self.have_fastqs = False
+		self.have_fastas = False
+		self.have_templates = False
+		self.have_complements = False
+		self.have_metadata = False
+
+	####################################################################
+	# Public API methods
+	####################################################################
+
+	def open(self):
+		"""
+		Open an ONT Fast5 file, assuming HDF5 format
+		"""
+		try:
+			self.hdf5file = pyhdf5.open_file(self.filename, 'r')
+			return True
+		except Exception, e:
+			logger.warning("Cannot open file: %s. Perhaps it is corrupt? Moving on.\n" % self.filename)
+			return False
+			
+	def close(self):
+		"""
+		Close an open an ONT Fast5 file, assuming HDF5 format
+		"""
+		if self.is_open:
+			self.hdf5file.close()
+
+
+	def get_fastqs(self, choice):
+		"""
+		Return the set of base called sequences in the FAST5
+		in FASTQ format.
+		"""
+		if self.have_fastqs is False:
+			self._extract_fastqs_from_fast5()
+			self.have_fastqs = True
+
+		# TODO "best". What is "best"?
+		fqs = []
+		if choice == "all":
+			for fastq in self.fastqs:
+				fqs.append(self.fastqs[fastq])
+		elif choice == "fwd":
+				fqs.append(self.fastqs.get('template'))
+		elif choice == "rev":
+				fqs.append(self.fastqs.get('complement'))
+		elif choice == "2D":
+				fqs.append(self.fastqs.get('twodirections'))
+		elif choice == "fwd,rev":
+				fqs.append(self.fastqs.get('template'))
+				fqs.append(self.fastqs.get('complement'))
+
+		return fqs
+
+
+	def get_fastas(self, choice):
+		"""
+		Return the set of base called sequences in the FAST5
+		in FASTQ format.
+		"""
+		if self.have_fastas is False:
+			self._extract_fastas_from_fast5()
+			self.have_fastas = True
+
+		# TODO "best". What is "best"?
+		fas = []
+		if choice == "all":
+			for fasta in self.fastas:
+				fas.append(self.fastas[fasta])
+		elif choice == "fwd":
+				fas.append(self.fastas.get('template'))
+		elif choice == "rev":
+				fas.append(self.fastas.get('complement'))
+		elif choice == "2D":
+				fas.append(self.fastas.get('twodirections'))
+		elif choice == "fwd,rev":
+				fas.append(self.fastas.get('template'))
+				fas.append(self.fastas.get('complement'))
+
+		return fas
+
+
+	def get_fastq(self):
+		"""
+		Return the base called sequence in the FAST5
+		in FASTQ format. Try 2D then template, then complement.
+		If all fail, return None
+		"""
+		if self.have_fastqs is False:
+			self._extract_fastqs_from_fast5()
+			self.have_fastqs = True
+
+		if not self.fastqs:
+			return None
+		elif self.fastqs.get('twodirections') is not None:
+			return self.fastqs.get('twodirections')
+		elif self.fastqs.get('template') is not None:
+			return self.fastqs.get('template')
+		elif self.fastqs.get('complement') is not None:
+			return self.fastqs.get('complement')
+
+
+	def get_fasta(self):
+		"""
+		Return the base called sequence in the FAST5
+		in FASTA format. Try 2D then template, then complement.
+		If all fail, return None
+		"""
+		if self.have_fastas is False:
+			self._extract_fastas_from_fast5()
+			self.have_fastas = True
+
+		for strand in ('twodirections', 'template', 'complement'):
+			if self.fastas.get(strand) is not None:
+				return self.fastas.get(strand)
+		return None
+
+	def get_template_events(self):
+		"""
+		Return the table of event data for the template strand
+		"""
+		if self.have_templates is False:
+			self._extract_template_events()
+			self.have_templates = True
+
+		return self.template_events
+
+	def get_complement_events(self):
+		"""
+		Return the table of event data for the complement strand
+		"""
+		if self.have_complements is False:
+			self._extract_complement_events()
+			self.have_complements = True
+		
+		return self.complement_events
+
+	####################################################################
+	# Flowcell Metadata methods
+	####################################################################
+
+	def get_exp_start_time(self):
+		"""
+		Return the starting time at which signals were collected
+		for the given read.
+		"""
+		if self.have_metadata is False:
+			self._get_metadata()
+			self.have_metadata = True
+
+		try:
+			return self.keyinfo.tracking_id._f_getAttr('exp_start_time')
+		except:
+			return None
+
+	def get_channel_number(self):
+		"""
+		Return the channel (pore) number at which signals were collected
+		for the given read.
+		"""
+		if self.have_metadata is False:
+			self._get_metadata()
+			self.have_metadata = True
+
+		try:
+			return self.keyinfo.channel_id._f_getAttr('channel_number')
+		except:
+			return None
+
+	def find_read_number_block(self):
+		path = "/Analyses/Basecall_2D_000/InputEvents"
+		try:
+			newpath = self.hdf5file.getNode(path)
+
+			# the soft link target seems broken?
+			newpath = "/" + "/".join(newpath.target.split("/")[:-1])
+# + '/Events'
+
+			node = self.hdf5file.getNode(newpath)
+
+			return node
+		except Exception:
+			pass
+
+	def find_event_timing_block(self):
+		path = "/Analyses/Basecall_2D_000/BaseCalled_template/Events"
+		try:
+			return self.hdf5file.getNode(path)
+		except Exception:
+			pass
+		
+		return None
+
+	def get_read_number(self):
+		"""
+		Return the read number for the pore representing the given read.
+		"""
+		node = self.find_read_number_block()
+		if node:
+			try:
+				return node._f_getAttr('read_number')
+			except:
+				return None
+		return None
+
+	def get_duration(self):
+		node = self.find_event_timing_block()
+		if node:
+			return int(node._f_getAttr('duration'))
+		return None
+
+	def get_start_time(self):
+		"""Return exp_start_time + the timing block's 'start_time', or None if either is missing."""
+		exp_start_time	= self.get_exp_start_time()
+		node = self.find_event_timing_block()
+		if node and exp_start_time is not None:
+			return int(exp_start_time) + int(node._f_getAttr('start_time'))
+	
+		return None
+
+	def get_end_time(self):
+		"""Return get_start_time() + get_duration(), or None if either of them is None."""
+		start_time = self.get_start_time()
+		duration = self.get_duration()
+
+		# compare against None so that a legitimate value of 0 is not discarded
+		if start_time is not None and duration is not None:
+			return start_time + duration
+		return None
+
+	def get_version_name(self):
+		"""
+		Return the flow cell version name.
+		"""
+		if self.have_metadata is False:
+			self._get_metadata()
+			self.have_metadata = True
+
+		try:
+			return self.keyinfo.tracking_id._f_getAttr('version_name')
+		except:
+			return None
+
+	def get_run_id(self):
+		"""
+		Return the run id.
+		"""
+		if self.have_metadata is False:
+			self._get_metadata()
+			self.have_metadata = True
+
+		try:
+			return self.keyinfo.tracking_id._f_getAttr('run_id')
+		except:
+			return None
+
+	def get_heatsink_temp(self):
+		"""
+		Return the heatsink temperature.
+		"""
+		if self.have_metadata is False:
+			self._get_metadata()
+			self.have_metadata = True
+
+		try:
+			return self.keyinfo.tracking_id._f_getAttr('heatsink_temp')
+		except:
+			return None
+
+	def get_asic_temp(self):
+		"""
+		Return the ASIC temperature.
+		"""
+		if self.have_metadata is False:
+			self._get_metadata()
+			self.have_metadata = True
+
+		try:
+			return self.keyinfo.tracking_id._f_getAttr('asic_temp')
+		except:
+			return None
+
+	def get_flowcell_id(self):
+		"""
+		Return the flowcell_id.
+		"""
+		if self.have_metadata is False:
+			self._get_metadata()
+			self.have_metadata = True
+
+		try:
+			return self.keyinfo.tracking_id._f_getAttr('flowcell_id')
+		except:
+			return None
+
+	def get_run_purpose(self):
+		"""
+		Return the exp_script_purpose.
+		"""
+		if self.have_metadata is False:
+			self._get_metadata()
+			self.have_metadata = True
+
+		try:
+			return self.keyinfo.tracking_id._f_getAttr('exp_script_purpose')
+		except:
+			return None
+
+	def get_asic_id(self):
+		"""
+		Return the flowcell's ASIC id, or None if it cannot be read.
+		"""
+		if self.have_metadata is False:
+			self._get_metadata()
+			self.have_metadata = True
+
+		try:
+			return self.keyinfo.tracking_id._f_getAttr('asic_id')
+		except:
+			return None
+
+	def get_device_id(self):
+		"""
+		Return the flowcell's device id, or None if it cannot be read.
+		"""
+		if self.have_metadata is False:
+			self._get_metadata()
+			self.have_metadata = True
+
+		try:
+			return self.keyinfo.tracking_id._f_getAttr('device_id')
+		except:
+			return None
+
+	####################################################################
+	# Private API methods
+	####################################################################
+
+	def _extract_fastqs_from_fast5(self):
+		"""
+		Fill self.fastqs with a Fastq record per fastq_paths entry that can be read; returns nothing
+		"""
+		for id, h5path in fastq_paths.iteritems(): 
+			try:
+				table = self.hdf5file.getNode(h5path)
+				fq = formats.Fastq(table.Fastq[()])
+				fq.name += "_" + id + ":" + self.filename  # tag name with path key and source file
+				self.fastqs[id] = fq
+			except Exception, e:
+				pass  # best effort: entries that cannot be read or parsed are skipped
+
+	def _extract_fastas_from_fast5(self):
+		"""
+		Fill self.fastas with a Fasta record per fastq_paths entry that can be read; returns nothing
+		"""
+		for id, h5path in fastq_paths.iteritems(): 
+			try:
+				table = self.hdf5file.getNode(h5path)
+				fa = formats.Fasta(table.Fastq[()])  # built from the stored FASTQ text
+				fa.name += "_" + id + " " + self.filename  # tag name with path key and source file
+				self.fastas[id] = fa
+			except Exception, e:
+				pass  # best effort: entries that cannot be read or parsed are skipped
+
+	def _extract_template_events(self):
+		"""
+		Pull out the event information for the template strand
+		"""
+		try:
+			table = self.hdf5file.getNode(fastq_paths['template'])
+			self.template_events = [Event(x) for x in table.Events]
+		except Exception, e:
+			self.template_events = []
+
+	def _extract_complement_events(self):
+		"""
+		Pull out the event information for the complementary strand
+		"""
+		try:
+			table = self.hdf5file.getNode(fastq_paths['complement'])
+			self.complement_events = [Event(x) for x in table.Events]
+		except Exception, e:
+			self.complement_events = []
+
+	def _get_metadata(self):
+		try:
+			self.keyinfo = self.hdf5file.getNode('/UniqueGlobalKey')
+		except Exception, e:
+			try:
+				self.keyinfo = self.hdf5file.getNode('/Key')
+			except Exception, e:
+				self.keyinfo = None
+				logger.warning("Cannot find keyinfo. Exiting.\n")
diff --git a/poretools/__init__.py b/poretools/__init__.py
new file mode 100644
index 0000000..c5c2412
--- /dev/null
+++ b/poretools/__init__.py
@@ -0,0 +1,5 @@
+import os
+import sys
+import scripts
+from Fast5File import *
+from version import __version__
diff --git a/poretools/combine.py b/poretools/combine.py
new file mode 100644
index 0000000..4d165d3
--- /dev/null
+++ b/poretools/combine.py
@@ -0,0 +1,30 @@
+import tarfile
+import sys
+import Fast5File
+
+#logging
+import logging
+logger = logging.getLogger('poretools')
+
+
+def run(parser, args):
+	"""Add every FAST5 file in args.files to a new tar archive at args.tar_filename."""
+	if args.tar_filename.endswith('.tar'):
+		tar = tarfile.open(args.tar_filename, mode='w')
+	elif args.tar_filename.endswith('.gz'):
+		tar = tarfile.open(args.tar_filename, mode='w:gz')
+	elif args.tar_filename.endswith('.bz2'):
+		tar = tarfile.open(args.tar_filename, mode='w:bz2')
+	else:
+		logger.error("Unrecognized FAST5 archive extension. Exiting.\n")
+		sys.exit(1)  # non-zero status so that callers can detect the failure
+
+	file_count = 0
+	for fast5 in Fast5File.Fast5FileSet(args.files):
+		tar.add(fast5.filename)
+		fast5.close()
+		file_count += 1
+	tar.close()
+
+	logger.info("%s successfully created from %d FAST5 files.\n" % \
+		(args.tar_filename, file_count))
diff --git a/poretools/events.py b/poretools/events.py
new file mode 100644
index 0000000..3391f3e
--- /dev/null
+++ b/poretools/events.py
@@ -0,0 +1,20 @@
+import Fast5File
+
+def run(parser, args):
+
+	# print header.
+	keys = ['file', 'strand', 'mean', 'start', 'stdv', \
+			'length', 'model_state', 'model_level', 'move', \
+			'p_model_state', 'mp_model_state', 'p_mp_model_state', \
+			'p_A', 'p_C', 'p_G', 'p_T', 'raw_index']
+	print "\t".join(keys)
+	
+	for fast5 in Fast5File.Fast5FileSet(args.files):
+
+		for event in fast5.get_template_events():
+			print '\t'.join([fast5.filename, 'template', str(event)]) 
+		for event in fast5.get_complement_events():
+			print '\t'.join([fast5.filename, 'complement', str(event)]) 
+
+		fast5.close()
+
diff --git a/poretools/fasta.py b/poretools/fasta.py
new file mode 100644
index 0000000..a442a23
--- /dev/null
+++ b/poretools/fasta.py
@@ -0,0 +1,46 @@
+import Fast5File
+import sys
+
+def run(parser, args):
+
+	for fast5 in Fast5File.Fast5FileSet(args.files):
+
+		if args.start_time or args.end_time:
+			read_start_time = fast5.get_start_time()
+			read_end_time = fast5.get_end_time()
+			if args.start_time and args.start_time > read_start_time:
+				fast5.close()
+				continue
+			if args.end_time and args.end_time < read_end_time:
+				fast5.close()
+				continue
+
+		fas = fast5.get_fastas(args.type)
+		
+		# high quality 2D: means there are more nanopore events on the 
+		# complement strand than on the template strand. We also
+		# require there to be a 2D base-called sequence from Metrichor.
+		if args.high_quality:
+			if (fast5.get_complement_events_count() <= \
+			   fast5.get_template_events_count()) or not fast5.has_2D():
+				fast5.close()
+				continue
+
+		# normal quality 2D: means there are fewer (or equal) nanopore
+		# events on the complement strand than on the template strand.
+		# We also require there to be a 2D base-called sequence from Metrichor.
+		if args.normal_quality:
+			if (fast5.get_complement_events_count() > \
+			   fast5.get_template_events_count()) or not fast5.has_2D():
+				fast5.close()
+				continue
+
+		for fa in fas:
+			if fa is None or \
+			len(fa.seq) < args.min_length:			
+				continue
+
+			print fa
+
+		fast5.close()
+
diff --git a/poretools/fastq.py b/poretools/fastq.py
new file mode 100644
index 0000000..3f3be43
--- /dev/null
+++ b/poretools/fastq.py
@@ -0,0 +1,33 @@
+import Fast5File
+import sys
+
+def run(parser, args):
+	
+	for fast5 in Fast5File.Fast5FileSet(args.files):
+
+		if args.start_time or args.end_time:
+			read_start_time = fast5.get_start_time()
+			read_end_time = fast5.get_end_time()
+			if args.start_time and args.start_time > read_start_time:
+				fast5.close()
+				continue
+			if args.end_time and args.end_time < read_end_time:
+				fast5.close()
+				continue
+
+		fas = fast5.get_fastqs(args.type)
+		if args.high_quality:
+			if fast5.get_complement_events_count() <= \
+			   fast5.get_template_events_count():
+				fast5.close()
+				continue
+
+		for fa in fas:
+			if fa is None or \
+			len(fa.seq) < args.min_length:			
+				continue
+
+			print fa
+
+		fast5.close()
+
diff --git a/poretools/formats.py b/poretools/formats.py
new file mode 100644
index 0000000..77b5a5c
--- /dev/null
+++ b/poretools/formats.py
@@ -0,0 +1,23 @@
+class Fastq(object):
+	def __init__(self, s):
+		self.s = s
+		self.parse()
+
+	def parse(self):
+		(self.name, self.seq, self.sep, self.qual) = self.s.strip().split('\n')
+
+	def __repr__(self):
+		return '\n'.join([self.name, self.seq, self.sep, self.qual])
+
+
+class Fasta(object):
+	def __init__(self, s):
+		self.s = s
+		self.parse()
+
+	def parse(self):
+		(self.name, self.seq, self.sep, self.qual) = self.s.strip().split('\n')
+		self.name = self.name.lstrip('@')
+
+	def __repr__(self):
+		return '\n'.join(['>'+self.name, self.seq])
\ No newline at end of file
diff --git a/poretools/hist.py b/poretools/hist.py
new file mode 100644
index 0000000..eeaa020
--- /dev/null
+++ b/poretools/hist.py
@@ -0,0 +1,75 @@
+import sys
+import Fast5File
+import rpy2.robjects as robjects
+import rpy2.robjects.lib.ggplot2 as ggplot2
+from rpy2.robjects.packages import importr
+
+#logging
+import logging
+logger = logging.getLogger('poretools')
+logger.setLevel(logging.INFO)
+
+def plot_hist(sizes, args):
+	"""
+	Use rpy2 to plot a histogram of the read sizes that lie strictly
+	between args.min_length and args.max_length. The plot is written to
+	args.saveas (.pdf or .png) when given, otherwise shown until enter is hit.
+	"""
+	r = robjects.r
+	r.library("ggplot2")
+	grdevices = importr('grDevices')
+
+	kept = [s for s in sizes \
+		if s < args.max_length and s > args.min_length]
+	if not kept:
+		# min()/max() below would raise ValueError on an empty selection
+		logger.error("No reads within the requested length range to plot.")
+		sys.exit(1)
+	sizes = robjects.IntVector(kept)
+
+	# float() keeps Python 2 integer division from truncating the bin width
+	# (down to 0 whenever the size range is smaller than args.num_bins)
+	binwidth = float(max(kept) - min(kept)) / args.num_bins
+
+	df = robjects.DataFrame({'sizes' : sizes})
+
+	# plot
+	pp = ggplot2.ggplot(df) + ggplot2.aes_string(x='sizes') \
+		+ ggplot2.geom_histogram(binwidth=binwidth)
+	if args.theme_bw:
+		pp = pp + ggplot2.theme_bw()
+
+	if args.saveas is not None:
+		plot_file = args.saveas
+		if plot_file.endswith(".pdf"):
+			grdevices.pdf(plot_file, width = 8.5, height = 8.5)
+		elif plot_file.endswith(".png"):
+			grdevices.png(plot_file, width = 8.5, height = 8.5,
+				units = "in", res = 300)
+		else:
+			logger.error("Unrecognized extension for %s!" % (plot_file))
+			sys.exit(1)  # non-zero status so that callers can detect the failure
+
+		pp.plot()
+		grdevices.dev_off()
+	else:
+		pp.plot()
+		# keep the plot open until user hits enter
+		print('Type enter to exit.')
+		raw_input()
+
+def run(parser, args):
+	sizes = []
+	files_processed = 0
+	for fast5 in Fast5File.Fast5FileSet(args.files):
+		fq = fast5.get_fastq()
+		if fq is not None:
+			sizes.append(len(fq.seq))
+		files_processed += 1
+		if files_processed % 100 == 0:
+			logger.info("%d files processed." % files_processed)
+		fast5.close()
+
+	plot_hist(sizes, args)
+
+
diff --git a/poretools/ipynb/test_run_report.ipynb b/poretools/ipynb/test_run_report.ipynb
new file mode 100644
index 0000000..8136e81
--- /dev/null
+++ b/poretools/ipynb/test_run_report.ipynb
@@ -0,0 +1,650 @@
+{
+ "metadata": {
+  "name": "",
+  "signature": "sha256:d0b818846b182639011099ca56fd583aa186cae4284de49607ba34231dc100f2"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "This IPython notebook file demonstrates some of the functionality available in __poretools__, and an example run report for a recent R7 chemistry nanopore run. "
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Some of the examples call out to R for plotting, so to make this work in this notebook we need to load the ``rpy2.ipython`` module."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "%load_ext rpy2.ipython"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 53
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "__poretools__ can run either on an individual FAST5 file, a directory containing FAST5 files, or a tar archive of FAST5 files. Here we set up a `$directory` variable for use for the rest of the tutorial. You could change this and run the same commands on your data."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "directory='/mnt/borage/nick/nanopore/data/Flowcell6/downloads'"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 18
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "!find $directory -maxdepth 1 -name \"*.fast5\" | wc -l"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "60196\r\n"
+       ]
+      }
+     ],
+     "prompt_number": 65
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "There are 60,196 FAST5 files in the directory."
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "__poretools__ has a number of different command line options. Running __poretools__ with no parameters gives us a brief list (and complies with [Torsten's first rule](http://www.gigasciencejournal.com/content/2/1/15))"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "!poretools"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "usage: poretools [-h] [-v]\r\n",
+        "                 \r\n",
+        "                 {combine,fastq,fasta,stats,hist,events,readstats,tabular,nucdist,qualdist,winner,squiggle,times,yield_plot}\r\n",
+        "                 ...\r\n",
+        "poretools: error: too few arguments\r\n"
+       ]
+      }
+     ],
+     "prompt_number": 21
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "We can get more information if we run __poretools__ with the -h (help) option."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "!poretools -h"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "usage: poretools [-h] [-v]\r\n",
+        "                 \r\n",
+        "                 {combine,fastq,fasta,stats,hist,events,readstats,tabular,nucdist,qualdist,winner,squiggle,times,yield_plot}\r\n",
+        "                 ...\r\n",
+        "\r\n",
+        "optional arguments:\r\n",
+        "  -h, --help            show this help message and exit\r\n",
+        "  -v, --version         Installed poretools version\r\n",
+        "\r\n",
+        "[sub-commands]:\r\n",
+        "  {combine,fastq,fasta,stats,hist,events,readstats,tabular,nucdist,qualdist,winner,squiggle,times,yield_plot}\r\n",
+        "    combine             Combine a set of FAST5 files in a TAR achive\r\n",
+        "    fastq               Extract FASTQ sequences from a set of FAST5 files\r\n",
+        "    fasta               Extract FASTA sequences from a set of FAST5 files\r\n",
+        "    stats               Get read size stats for a set of FAST5 files\r\n",
+        "    hist                Plot read size histogram for a set of FAST5 files\r\n",
+        "    events              Extract each nanopore event for each read.\r\n",
+        "    readstats           Extract signal information for each read over time.\r\n",
+        "    tabular             Extract the lengths and name/seq/quals from a set of\r\n",
+        "                        FAST5 files in TAB delimited format\r\n",
+        "    nucdist             Get the nucl. composition of a set of FAST5 files\r\n",
+        "    qualdist            Get the qual score composition of a set of FAST5 files\r\n",
+        "    winner              Get the longest read from a set of FAST5 files\r\n",
+        "    squiggle            Plot the observed signals for FAST5 reads.\r\n",
+        "    times               Return the start times from a set of FAST5 files in\r\n",
+        "                        tabular format\r\n",
+        "    yield_plot          Plot the yield over time for a set of FAST5 files\r\n"
+       ]
+      }
+     ],
+     "prompt_number": 22
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Let's start with a simple one, the ``stats`` command, this will give us some basic statistics about our reads.\n",
+      "\n",
+      "The ``-q`` option stops ``poretools`` outputting any warning messages."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "!poretools stats -q $directory"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "total reads\t104969\r\n",
+        "total base pairs\t550200969\r\n",
+        "mean\t5241.56\r\n",
+        "median\t4616\r\n"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "min\t5\r\n",
+        "max\t154417\r\n"
+       ]
+      }
+     ],
+     "prompt_number": 68
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "How do we have 104,969 reads from 60,196 FAST5 files? That's because forward, reverse and two-directional reads are all counted separately."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "!poretools stats -q --type fwd $directory"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "total reads\t53914\r\n",
+        "total base pairs\t290880019\r\n",
+        "mean\t5395.26\r\n",
+        "median\t4441\r\n",
+        "min\t5\r\n",
+        "max\t154417\r\n"
+       ]
+      }
+     ],
+     "prompt_number": 69
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "We have 53,914 forward reads in our total dataset."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "!poretools stats -q --type rev $directory"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "total reads\t29622\r\n",
+        "total base pairs\t124718901\r\n",
+        "mean\t4210.35\r\n",
+        "median\t3765\r\n",
+        "min\t5\r\n",
+        "max\t44835\r\n"
+       ]
+      }
+     ],
+     "prompt_number": 71
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "!poretools stats -q --type 2D $directory"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "total reads\t21433\r\n",
+        "total base pairs\t134602049\r\n",
+        "mean\t6280.13\r\n",
+        "median\t6020\r\n",
+        "min\t211\r\n",
+        "max\t38598\r\n"
+       ]
+      }
+     ],
+     "prompt_number": 70
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "We have 21,433 two-direction reads, which is about 40% of the reads which have been base-called and about 72% of the reads that have a detectable complement strand."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "!poretools readstats -q $directory > readstats.txt"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 73
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "!wc -l readstats.txt"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "``readstats`` gives you a line per every FAST5 file in your dataset. The columns are:\n",
+      "*   start_time (represented as a UNIX timestamp, i.e. seconds since the UNIX epoch)\n",
+      "*   pore number\n",
+      "*   read number (not working ATM, the format changed)\n",
+      "*   length of forward read\n",
+      "*   length of reverse read"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "!head -10 readstats.txt"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "1404984747\t491\tNone\t301\t0\r\n",
+        "1405005921\t325\tNone\t299\t0\r\n",
+        "None\t410\tNone\t0\t0\r\n",
+        "1404940637\t415\tNone\t5271\t5113\r\n",
+        "1405002720\t222\tNone\t3064\t0\r\n",
+        "1404959466\t209\tNone\t5901\t5634\r\n",
+        "1404949019\t12\tNone\t4524\t3576\r\n",
+        "1404930734\t470\tNone\t345\t0\r\n",
+        "1404936756\t491\tNone\t310\t0\r\n",
+        "1404947432\t109\tNone\t697\t0\r\n"
+       ]
+      }
+     ],
+     "prompt_number": 75
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "One useful plot you can easily do with the output of read stats is to plot the number of events in forward reads against reverse reads. Ideally every read would have a similar number, which would indicate the hairpin is correctly attached and the strand translocation rate is controlled by the enzyme."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "%R stats=read.table(\"readstats.txt\", sep=\"\\t\")\n",
+      "%R stats=subset(stats, V4 < 20000 & V5 < 20000)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "html": [
+        "<pre>\n",
+        "&ltclass 'pandas.core.frame.DataFrame'&gt\n",
+        "Int64Index: 59048 entries, 0 to 59047\n",
+        "Data columns (total 5 columns):\n",
+        "V1    59048  non-null values\n",
+        "V2    59048  non-null values\n",
+        "V3    59048  non-null values\n",
+        "V4    59048  non-null values\n",
+        "V5    59048  non-null values\n",
+        "dtypes: int32(2), object(3)\n",
+        "</pre>"
+       ],
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 79,
+       "text": [
+        "<class 'pandas.core.frame.DataFrame'>\n",
+        "Int64Index: 59048 entries, 0 to 59047\n",
+        "Data columns (total 5 columns):\n",
+        "V1    59048  non-null values\n",
+        "V2    59048  non-null values\n",
+        "V3    59048  non-null values\n",
+        "V4    59048  non-null values\n",
+        "V5    59048  non-null values\n",
+        "dtypes: int32(2), object(3)"
+       ]
+      }
+     ],
+     "prompt_number": 79
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "%R smoothScatter(stats$V4,stats$V5)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "display_data",
+       "png": "iVBORw0KGgoAAAANSUhEUgAAAeAAAAHgCAIAAADytinCAAAgAElEQVR4nO2de5AcxX3HZ+/2njq9\nkLCMAkJHJLANshxbYBMUCSNkhKlQgIUQhDi2FWRzpkIBTgx5OcQVymASP8CPqrMsY5LYwQ4+BEFg\n48hSsA0R+JGUhGJkELIk63EguDvdY+92J3/sTk/v/Lp/2z2zMzez9/3UVmnU09PdMzvb9+vv79fd\nOdd1HQAAAOmjabIbAAAAQA06aAAASCnooAEAIKWggwYAgJSCDhoAAFIKOmgAAEgp6KABACCloIMG\nAICUgg4aAABSCjpoAABIKeigAQAgpaCDBgCAlIIOGgAAUgo6aAAASCnooAEAIKWggwYAgJSCDhoA\nAFIKOmgAAEgp6KABACCloIMGAICUgg4aAABSCjpoAABIKeigAQAgpaCDBgCAlIIOGgAAUgo6aAAA\nSCnooAEAI [...]
+      }
+     ],
+     "prompt_number": 77
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "!poretools winner -q --type 2D $directory > winner.fasta"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "This will just use the header data to generate a squiggle plot:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "!head -1 winner.fasta | sed 's/>.* //' | xargs poretools squiggle --saveas png --num-facets 12"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 126
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "squiggle=!head -1 winner.fasta | sed 's/>.* //'\n",
+      "squiggle=squiggle[0] + '.png'\n",
+      "Image(squiggle)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "png": "iVBORw0KGgoAAAANSUhEUgAACfYAAAzkCAMAAAAgP32bAAADAFBMVEUAAAABAQECAgIDAwMEBAQF\nBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUWFhYXFxcY\nGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUmJiYnJycoKCgpKSkqKior\nKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6Ojo7Ozs8PDw9PT0+\nPj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tMTExNTU1OTk5PT09QUFBR\nUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1eXl5fX19gYGBhYWFiYmJjY2Nk\nZGRlZWVmZ [...]
+       "prompt_number": 127,
+       "text": [
+        "<IPython.core.display.Image at 0x68f6dd0>"
+       ]
+      }
+     ],
+     "prompt_number": 127
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "!poretools yield_plot -q --theme-bw --saveas yield_plot.png --plot-type reads $directory "
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 128
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "!poretools yield_plot -q --theme-bw --saveas yield_plot.pdf --plot-type reads $directory "
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "Image(\"yield_plot.png\")"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "png": "iVBORw0KGgoAAAANSUhEUgAACfYAAAn2CAMAAAA8GfGsAAADAFBMVEUAAAABAQECAgIDAwMEBAQF\nBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUWFhYXFxcY\nGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUmJiYnJycoKCgpKSkqKior\nKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6Ojo7Ozs8PDw9PT0+\nPj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tMTExNTU1OTk5PT09QUFBR\nUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1eXl5fX19gYGBhYWFiYmJjY2Nk\nZGRlZWVmZ [...]
+       "prompt_number": 86,
+       "text": [
+        "<IPython.core.display.Image at 0x69066d0>"
+       ]
+      }
+     ],
+     "prompt_number": 86
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "!poretools hist -q --theme-bw --min-length 1000 --max-length 40000 --saveas hist.png $directory"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 87
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "!poretools hist -q --theme-bw --min-length 1000 --max-length 40000 --saveas hist.pdf $directory"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 129
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "Image(\"hist.png\")"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "png": "iVBORw0KGgoAAAANSUhEUgAACfYAAAn2CAMAAAA8GfGsAAAC+lBMVEUAAAABAQECAgIDAwMEBAQF\nBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUWFhYXFxcY\nGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUmJiYnJycoKCgpKSkqKior\nKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6Ojo7Ozs8PDw9PT0+\nPj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tMTExNTU1OTk5PT09QUFBR\nUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1eXl5fX19gYGBhYWFiYmJjY2Nk\nZGRlZWVmZ [...]
+       "prompt_number": 88,
+       "text": [
+        "<IPython.core.display.Image at 0x6906610>"
+       ]
+      }
+     ],
+     "prompt_number": 88
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "!poretools nucdist -q $directory"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "A\t65660764\t296997439\t0.221081919834\r\n",
+        "C\t70989346\t296997439\t0.239023428077\r\n",
+        "T\t70730220\t296997439\t0.23815094244\r\n",
+        "G\t75164353\t296997439\t0.253080811919\r\n",
+        "N\t14452756\t296997439\t0.0486628977296\r\n"
+       ]
+      }
+     ],
+     "prompt_number": 89
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "!poretools qualdist -q $directory"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "!\t0\t13821031\t296997405\t0.0465358645137\r\n",
+        "\"\t1\t25112550\t296997405\t0.0845547791907\r\n",
+        "#\t2\t30275124\t296997405\t0.101937335109\r\n",
+        "$\t3\t28014262\t296997405\t0.0943249386304\r\n",
+        "%\t4\t27335226\t296997405\t0.0920386021555\r\n",
+        "&\t5\t29026969\t296997405\t0.097734756302\r\n",
+        "'\t6\t28000641\t296997405\t0.0942790762768\r\n",
+        "(\t7\t25262940\t296997405\t0.0850611472514\r\n",
+        ")\t8\t21469851\t296997405\t0.0722896922281\r\n",
+        "*\t9\t17021140\t296997405\t0.0573107364356\r\n",
+        "+\t10\t12194629\t296997405\t0.0410597156564\r\n",
+        ",\t11\t9132498\t296997405\t0.0307494201843\r\n",
+        "-\t12\t6955173\t296997405\t0.0234182955235\r\n",
+        ".\t13\t5258815\t296997405\t0.0177066025207\r\n",
+        "/\t14\t3952021\t296997405\t0.0133065842781\r\n",
+        "0\t15\t2980747\t296997405\t0.0100362728759\r\n",
+        "1\t16\t2267230\t296997405\t0.00763383774346\r\n",
+        "2\t17\t1751907\t296997405\t0.00589872830707\r\n",
+        "3\t18\t1375773\t296997405\t0.00463227279713\r\n",
+        "4\t19\t1098801\t296997405\t0.00369969899232\r\n",
+        "5\t20\t888009\t296997405\t0.00298995541729\r\n",
+        "6\t21\t726878\t296997405\t0.00244742205744\r\n",
+        "7\t22\t599176\t296997405\t0.0020174452366\r\n",
+        "8\t23\t497704\t296997405\t0.00167578568574\r\n",
+        "9\t24\t415916\t296997405\t0.00140040280823\r\n",
+        ":\t25\t746873\t296997405\t0.002514745878\r\n",
+        ";\t26\t215364\t296997405\t0.00072513764893\r\n",
+        "<\t27\t176731\t296997405\t0.000595059071307\r\n",
+        "=\t28\t138388\t296997405\t0.000465956933193\r\n",
+        ">\t29\t103529\t296997405\t0.000348585537305\r\n",
+        "?\t30\t73216\t296997405\t0.000246520672462\r\n",
+        "@\t31\t49430\t296997405\t0.000166432430613\r\n",
+        "A\t32\t30164\t296997405\t0.000101563176958\r\n",
+        "B\t33\t16107\t296997405\t5.42327970845e-05\r\n",
+        "C\t34\t8018\t296997405\t2.69968688784e-05\r\n",
+        "D\t35\t3409\t296997405\t1.14782147676e-05\r\n",
+        "E\t36\t996\t296997405\t3.35356465488e-06\r\n",
+        "F\t37\t129\t296997405\t4.34347229398e-07\r\n",
+        "G\t38\t11\t296997405\t3.70373606463e-08\r\n",
+        "I\t40\t2\t296997405\t6.73406557205e-09\r\n",
+        "L\t43\t5\t296997405\t1.68351639301e-08\r\n",
+        "P\t47\t2\t296997405\t6.73406557205e-09\r\n",
+        "V\t53\t3\t296997405\t1.01010983581e-08\r\n",
+        "Z\t57\t5\t296997405\t1.68351639301e-08\r\n",
+        "h\t71\t2\t296997405\t6.73406557205e-09\r\n",
+        "j\t73\t4\t296997405\t1.34681311441e-08\r\n",
+        "m\t76\t3\t296997405\t1.01010983581e-08\r\n",
+        "s\t82\t3\t296997405\t1.01010983581e-08\r\n"
+       ]
+      }
+     ],
+     "prompt_number": 90
+    }
+   ],
+   "metadata": {}
+  }
+ ]
+}
\ No newline at end of file
diff --git a/poretools/nucdist.py b/poretools/nucdist.py
new file mode 100644
index 0000000..94a7fae
--- /dev/null
+++ b/poretools/nucdist.py
@@ -0,0 +1,19 @@
+import Fast5File
+from collections import Counter
+
+def run(parser, args):
+	"""Print a tab-separated row (symbol, count, total, fraction) per symbol seen."""
+	base_counts = Counter()
+	n_bases = 0
+	# Tally every symbol of every read for which a FASTQ record is returned.
+	for f5 in Fast5File.Fast5FileSet(args.files):
+		record = f5.get_fastq()
+		if record is not None:
+			for base in record.seq:
+				base_counts[base] += 1
+				n_bases += 1
+		f5.close()
+	# The fraction is relative to the symbols of all reads combined.
+	for base in base_counts:
+		row = [base, base_counts[base], n_bases, float(base_counts[base]) / float(n_bases)]
+		print '\t'.join(str(field) for field in row)
\ No newline at end of file
diff --git a/poretools/occupancy.py b/poretools/occupancy.py
new file mode 100644
index 0000000..d944c24
--- /dev/null
+++ b/poretools/occupancy.py
@@ -0,0 +1,169 @@
+import Fast5File
+from time import strftime, localtime
+from collections import defaultdict, Counter
+import rpy2.robjects.lib.ggplot2 as gg
+import rpy2.robjects as robjects
+from rpy2.robjects.packages import importr
+import sys
+import string
+import random
+
+#logging
+import logging
+logger = logging.getLogger('poretools')
+
+def minion_flowcell_layout():
+	"""
+	Return the 512 channel numbers in the order the occupancy plots draw them.
+
+	For each seed (125, 121, ..., 1) the list holds, for block 0..3, the four
+	consecutive values starting at seed + 128*block.
+	"""
+	# 125 down to 1 in steps of 4: the same 32 seeds, in the same order
+	seed_channels = range(125, 0, -4)
+	return [first + 128*blk + offset
+	        for first in seed_channels
+	        for blk in range(4) for offset in range(4)]
+
+def plot_read_count(parser, args, tot_reads_per_pore):
+	"""
+	Plot the number of reads per pore on the flowcell layout grid.
+
+	tot_reads_per_pore maps channel number -> read count; channels that are
+	absent are drawn with a count of 0. The plot is written to args.saveas
+	(.pdf or .png) when given; otherwise it is shown until Enter is pressed.
+	Exits with status 1 when args.saveas has an unrecognized extension.
+	"""
+	r = robjects.r
+	r.library("ggplot2")
+	grdevices = importr('grDevices')
+
+	flowcell_layout = minion_flowcell_layout()
+	pore_values = [tot_reads_per_pore[pore] if pore in tot_reads_per_pore else 0
+	               for pore in flowcell_layout]
+
+	# make a data frame of the lists
+	d = {'rownum': robjects.IntVector(range(1,17)*32),
+		 'colnum': robjects.IntVector(sorted(range(1,33)*16)),
+		 'tot_reads': robjects.IntVector(pore_values),
+		 'labels': robjects.IntVector(flowcell_layout)
+		 }
+
+	df = robjects.DataFrame(d)
+	gp = gg.ggplot(df)
+	pp = (gp + gg.aes_string(y = 'factor(rownum, rev(rownum))',
+	                         x = 'factor(colnum)')
+	      + gg.geom_point(gg.aes_string(color='tot_reads'), size = 7)
+	      + gg.geom_text(gg.aes_string(label ='labels'), colour="white", size = 2)
+	      + gg.scale_colour_gradient2(low = "black", mid= "black", high="red")
+	      + gg.coord_fixed(ratio=1.4)
+	      + gg.labs(x=gg.NULL, y=gg.NULL))
+
+	if args.saveas is not None:
+		plot_file = args.saveas
+		if plot_file.endswith(".pdf"):
+			grdevices.pdf(plot_file, width = 11, height = 8.5)
+		elif plot_file.endswith(".png"):
+			grdevices.png(plot_file, width = 11, height = 8.5, 
+				units = "in", res = 300)
+		else:
+			logger.error("Unrecognized extension for %s!" % (plot_file))
+			sys.exit(1)  # non-zero status so callers can detect the failure
+
+		pp.plot()
+		grdevices.dev_off()
+	else:
+		pp.plot()
+		# keep the plot open until user hits enter
+		print('Type enter to exit.')
+		raw_input()
+
+
+def plot_total_bp(parser, args, tot_bp_per_pore):
+	"""
+	Plot log10 of the total base pairs per pore on the flowcell layout grid.
+
+	tot_bp_per_pore maps channel number -> total bp; channels that are absent
+	or have 0 bp are drawn as 0. Output goes to args.saveas (.pdf/.png) if set,
+	else to the screen; exits with status 1 on an unrecognized extension.
+	"""
+	import math
+	r = robjects.r
+	r.library("ggplot2")
+	grdevices = importr('grDevices')
+
+	flowcell_layout = minion_flowcell_layout()
+	# log10(0) is undefined, so only take the log of strictly positive totals
+	pore_values = [math.log10(tot_bp_per_pore[p]) if tot_bp_per_pore.get(p, 0) > 0 else 0
+	               for p in flowcell_layout]
+
+	# make a data frame of the lists
+	d = {'rownum': robjects.IntVector(range(1,17)*32),
+		 'colnum': robjects.IntVector(sorted(range(1,33)*16)),
+		 'log10_tot_bp': robjects.FloatVector(pore_values),  # log10 values are fractional
+		 'labels': robjects.IntVector(flowcell_layout)
+		 }
+
+	df = robjects.DataFrame(d)
+	gp = gg.ggplot(df)
+	pp = (gp + gg.aes_string(y = 'factor(rownum, rev(rownum))',
+	                         x = 'factor(colnum)')
+	      + gg.geom_point(gg.aes_string(color='log10_tot_bp'), size = 7)
+	      + gg.geom_text(gg.aes_string(label ='labels'), colour="white", size = 2)
+	      + gg.scale_colour_gradient2(low = "black", mid= "black", high="red")
+	      + gg.coord_fixed(ratio=1.4)
+	      + gg.labs(x=gg.NULL, y=gg.NULL))
+
+	if args.saveas is not None:
+		plot_file = args.saveas
+		if plot_file.endswith(".pdf"):
+			grdevices.pdf(plot_file, width = 11, height = 8.5)
+		elif plot_file.endswith(".png"):
+			grdevices.png(plot_file, width = 11, height = 8.5, 
+				units = "in", res = 300)
+		else:
+			logger.error("Unrecognized extension for %s!" % (plot_file))
+			sys.exit(1)  # non-zero status so callers can detect the failure
+
+		pp.plot()
+		grdevices.dev_off()
+	else:
+		pp.plot()
+		# keep the plot open until user hits enter
+		print('Type enter to exit.')
+		raw_input()
+
+
+def run(parser, args):
+	"""Print channel, start time and duration per read, then plot pore occupancy."""
+	tot_reads_per_pore = Counter()
+	tot_bp_per_pore = Counter()
+
+	print "\t".join(['channel_number', 'start_time', 'duration'])
+	for fast5 in Fast5File.Fast5FileSet(args.files):
+		if fast5.is_open:
+			fq = fast5.get_fastq()
+			start_time = fast5.get_start_time()
+			if start_time is None:
+				logger.warning("No start time for %s!" % (fast5.filename))
+				fast5.close()
+				continue
+
+			pore_id = fast5.get_channel_number()
+			tot_reads_per_pore[int(pore_id)] += 1
+			# other sub-commands guard against get_fastq() returning None:
+			# such a read is still counted, but contributes no base pairs
+			if fq is not None:
+				tot_bp_per_pore[int(pore_id)] += len(fq.seq)
+
+			print "\t".join([
+				str(pore_id),
+				str(start_time),
+				str(fast5.get_duration())])
+			fast5.close()
+
+	if args.plot_type == 'read_count':
+		plot_read_count(parser, args, tot_reads_per_pore)
+	elif args.plot_type == 'total_bp':
+		plot_total_bp(parser, args, tot_bp_per_pore)
+
+
diff --git a/poretools/poretools_main.py b/poretools/poretools_main.py
new file mode 100755
index 0000000..d6d4c6e
--- /dev/null
+++ b/poretools/poretools_main.py
@@ -0,0 +1,407 @@
+#!/usr/bin/env python
+
+import os.path
+import sys
+import argparse
+
+#logger
+import logging
+logger = logging.getLogger('poretools')
+
+# poretools imports
+import poretools.version
+
+def run_subtool(parser, args):
+    """Import the module matching args.command and call its run(parser, args)."""
+    if args.command == 'combine':
+        import combine as submodule
+    elif args.command == 'events':
+        import events as submodule
+    elif args.command == 'fasta':
+        import fasta as submodule
+    elif args.command == 'fastq':
+        import fastq as submodule
+    elif args.command == 'hist':
+        import hist as submodule
+    elif args.command == 'nucdist':
+        import nucdist as submodule
+    elif args.command == 'occupancy':
+        import occupancy as submodule
+    elif args.command == 'qualdist':
+        import qualdist as submodule
+    elif args.command == 'readstats':
+        import readstats as submodule
+    elif args.command == 'stats':
+        import stats as submodule
+    elif args.command == 'tabular':
+        import tabular as submodule
+    elif args.command == 'times':
+        import times as submodule
+    elif args.command == 'squiggle':
+        import squiggle as submodule
+    elif args.command == 'winner':
+        import winner as submodule
+    elif args.command == 'yield_plot':
+        import yield_plot as submodule
+    # run the chosen submodule.
+    submodule.run(parser, args)
+
+class ArgumentParserWithDefaults(argparse.ArgumentParser):
+    """ArgumentParser that always registers a -q/--quiet flag (stored as args.quiet)."""
+    def __init__(self, *args, **kwargs):
+        super(ArgumentParserWithDefaults, self).__init__(*args, **kwargs)
+        self.add_argument("-q", "--quiet", help="Do not output warnings to stderr",
+                          action="store_true", dest="quiet")
+
+def main():
+    """Build the poretools command-line parser, parse the arguments and run the chosen sub-command."""
+    logging.basicConfig()
+    #########################################
+    # create the top-level parser
+    #########################################
+    parser = argparse.ArgumentParser(prog='poretools', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("-v", "--version", help="Installed poretools version",
+                        action="version",
+                        version="%(prog)s " + str(poretools.version.__version__))
+    subparsers = parser.add_subparsers(title='[sub-commands]', dest='command', parser_class=ArgumentParserWithDefaults)
+
+    #########################################
+    # create the individual tool parsers
+    #########################################
+
+    ##########
+    # combine
+    ##########
+    parser_combine = subparsers.add_parser('combine',
+                                        help='Combine a set of FAST5 files in a TAR archive')
+    parser_combine.add_argument('files', metavar='FILES', nargs='+',
+                             help='The input FAST5 files.')
+    parser_combine.add_argument('-o',
+                              dest='tar_filename',
+                              metavar='STRING',
+                              required=True,
+                              help='The name of the output TAR archive for the set of FAST5 files.')
+    parser_combine.set_defaults(func=run_subtool)
+
+
+    ##########
+    # FASTQ
+    ##########
+    parser_fastq = subparsers.add_parser('fastq',
+                                        help='Extract FASTQ sequences from a set of FAST5 files')
+    parser_fastq.add_argument('files', metavar='FILES', nargs='+',
+                             help='The input FAST5 files.')
+    parser_fastq.add_argument('--type',
+                              dest='type',
+                              metavar='STRING',
+                              choices=['all', 'fwd', 'rev', '2D', 'fwd,rev'],
+                              default='all',
+                              help='Which type of FASTQ entries should be reported? Def.=all')
+    parser_fastq.add_argument('--start',
+                              dest='start_time',
+                              default=None,
+                              type=int,
+                              help='Only report reads from after start timestamp')
+    parser_fastq.add_argument('--end',
+                              dest='end_time',
+                              default=None,
+                              type=int,
+                              help='Only report reads from before end timestamp')
+    parser_fastq.add_argument('--min-length',
+                              dest='min_length',
+                              default=0,
+                              type=int,
+                              help=('Minimum read length for FASTQ entry to be reported.'))
+    parser_fastq.add_argument('--high-quality',
+                              dest='high_quality',
+                              default=False,
+                              action='store_true',
+                              help=('Only report reads with more complement events than template.'))
+    parser_fastq.set_defaults(func=run_subtool)
+
+
+    ##########
+    # FASTA
+    ##########
+    parser_fasta = subparsers.add_parser('fasta',
+                                        help='Extract FASTA sequences from a set of FAST5 files')
+    parser_fasta.add_argument('files', metavar='FILES', nargs='+',
+                             help='The input FAST5 files.')
+    parser_fasta.add_argument('--type',
+                              dest='type',
+                              metavar='STRING',
+                              choices=['all', 'fwd', 'rev', '2D', 'fwd,rev'],
+                              default='all',
+                              help='Which type of FASTA entries should be reported? Def.=all')
+    parser_fasta.add_argument('--start',
+                              dest='start_time',
+                              default=None,
+                              type=int,
+                              help='Only report reads from after start timestamp')
+    parser_fasta.add_argument('--end',
+                              dest='end_time',
+                              default=None,
+                              type=int,
+                              help='Only report reads from before end timestamp')
+    parser_fasta.add_argument('--min-length',
+                              dest='min_length',
+                              default=0,
+                              type=int,
+                              help=('Minimum read length for FASTA entry to be reported.'))
+    parser_fasta.add_argument('--high-quality',
+                              dest='high_quality',
+                              default=False,
+                              action='store_true',
+                              help=('Only report reads with more complement events than template.'))
+    parser_fasta.add_argument('--normal-quality',
+                              dest='normal_quality',
+                              default=False,
+                              action='store_true',
+                              help=('Only report reads with fewer complement events than template.'))
+    parser_fasta.set_defaults(func=run_subtool)
+
+
+    ##########
+    # stats
+    ##########
+    parser_stats = subparsers.add_parser('stats',
+                                        help='Get read size stats for a set of FAST5 files')
+    parser_stats.add_argument('files', metavar='FILES', nargs='+',
+                             help='The input FAST5 files.')
+    parser_stats.add_argument('--type',
+                              dest='type',
+                              metavar='STRING',
+                              choices=['all', 'fwd', 'rev', '2D', 'fwd,rev'],
+                              default='all',
+                              help='Which type of FASTQ entries should be reported? Def.=all')
+    parser_stats.add_argument('--full-tsv',
+                              dest='full_tsv',
+                              default=False,
+                              action='store_true',
+                              help=('Verbose output in tab-separated format.'))
+    parser_stats.set_defaults(func=run_subtool)
+
+
+    ##########
+    # hist
+    ##########
+    parser_hist = subparsers.add_parser('hist',
+                                        help='Plot read size histogram for a set of FAST5 files')
+    parser_hist.add_argument('files', metavar='FILES', nargs='+',
+                             help='The input FAST5 files.')
+    parser_hist.add_argument('--min-length',
+                              dest='min_length',
+                              default=0,
+                              type=int,
+                              help=('Minimum read length to be included in histogram.'))
+    parser_hist.add_argument('--max-length',
+                              dest='max_length',
+                              default=1000000000,
+                              type=int,
+                              help=('Maximum read length to be included in histogram.'))
+    parser_hist.add_argument('--num-bins',
+                              dest='num_bins',
+                              default=50,
+                              type=int,
+                              help=('The number of histogram bins.'))
+    parser_hist.add_argument('--saveas',
+                             dest='saveas',
+                             metavar='STRING',
+                             help='Save the plot to a file.',
+                             default=None)
+    parser_hist.add_argument('--theme-bw',
+                             dest='theme_bw',
+                             default=False,
+                             action='store_true',
+                             help="Use the ggplot2 black and white theme.")
+
+    parser_hist.set_defaults(func=run_subtool)
+
+
+    ###########
+    # events
+    ###########
+    parser_events = subparsers.add_parser('events',
+                                        help='Extract each nanopore event for each read.')
+    parser_events.add_argument('files', metavar='FILES', nargs='+',
+                             help='The input FAST5 files.')
+    parser_events.set_defaults(func=run_subtool)
+
+
+    ###########
+    # readstats
+    ###########
+    parser_readstats = subparsers.add_parser('readstats',
+                                        help='Extract signal information for each read over time.')
+    parser_readstats.add_argument('files', metavar='FILES', nargs='+',
+                             help='The input FAST5 files.')
+    parser_readstats.set_defaults(func=run_subtool)
+
+
+    ##########
+    # tabular
+    ##########
+    parser_tabular = subparsers.add_parser('tabular',
+                                        help='Extract the lengths and name/seq/quals from a set of FAST5 files in TAB delimited format')
+    parser_tabular.add_argument('files', metavar='FILES', nargs='+',
+                             help='The input FAST5 files.')
+    parser_tabular.add_argument('--type',
+                              dest='type',
+                              metavar='STRING',
+                              choices=['all', 'fwd', 'rev', '2D', 'fwd,rev'],
+                              default='all',
+                              help='Which type of FASTA entries should be reported? Def.=all')
+    parser_tabular.set_defaults(func=run_subtool)
+
+
+    #########
+    # nucdist
+    #########
+    parser_nucdist = subparsers.add_parser('nucdist',
+                                        help='Get the nucl. composition of a set of FAST5 files')
+    parser_nucdist.add_argument('files', metavar='FILES', nargs='+',
+                             help='The input FAST5 files.')
+    parser_nucdist.set_defaults(func=run_subtool)
+
+
+    ##########
+    # qualdist
+    ##########
+    parser_qualdist = subparsers.add_parser('qualdist',
+                                        help='Get the qual score composition of a set of FAST5 files')
+    parser_qualdist.add_argument('files', metavar='FILES', nargs='+',
+                             help='The input FAST5 files.')
+    parser_qualdist.set_defaults(func=run_subtool)
+
+
+    ##########
+    # winner
+    ##########
+    parser_winner = subparsers.add_parser('winner',
+                                        help='Get the longest read from a set of FAST5 files')
+    parser_winner.add_argument('files', metavar='FILES', nargs='+',
+                               help='The input FAST5 files.')
+    parser_winner.add_argument('--type',
+                              dest='type',
+                              metavar='STRING',
+                              choices=['all', 'fwd', 'rev', '2D', 'fwd,rev'],
+                              default='all',
+                              help='Which type of FASTA entries should be reported? Def.=all')
+    parser_winner.set_defaults(func=run_subtool)
+
+    ###########
+    # squiggle
+    ###########
+    parser_squiggle = subparsers.add_parser('squiggle',
+                                        help='Plot the observed signals for FAST5 reads.')
+    parser_squiggle.add_argument('files', metavar='FILES', nargs='+',
+                             help='The input FAST5 files.')
+    parser_squiggle.add_argument('--saveas',
+                             dest='saveas',
+                             metavar='STRING',
+                             choices=['pdf', 'png'],
+                             help='Save the squiggle plot to a file.',
+                             default=None)
+    parser_squiggle.add_argument('--num-facets',
+                              dest='num_facets',
+                              metavar='INTEGER',
+                              default=6,
+                              type=int,
+                              help=('The number of plot facets (sub-plots). More is better for long reads. (def=6)'))
+    parser_squiggle.add_argument('--theme-bw',
+                             dest='theme_bw',
+                             default=False,
+                             action='store_true',
+                             help="Use the ggplot2 black and white theme.")
+
+    parser_squiggle.set_defaults(func=run_subtool)
+
+    ##########
+    # times
+    ##########
+    parser_times = subparsers.add_parser('times',
+                                        help='Return the start times from a set of FAST5 files in tabular format')
+    parser_times.add_argument('files', metavar='FILES', nargs='+',
+                               help='The input FAST5 files.')
+    parser_times.set_defaults(func=run_subtool)
+
+    ############
+    # yield_plot
+    ############
+    parser_yield_plot = subparsers.add_parser('yield_plot',
+                                        help='Plot the yield over time for a set of FAST5 files')
+    parser_yield_plot.add_argument('files', metavar='FILES', nargs='+',
+                               help='The input FAST5 files.')
+    parser_yield_plot.add_argument('--saveas',
+                             dest='saveas',
+                             metavar='STRING',
+                             help='Save the plot to a file. Extension (.pdf or .png) drives type.',
+                             default=None)
+    parser_yield_plot.add_argument('--plot-type',
+                             dest='plot_type',
+                             metavar='STRING',
+                             choices=['reads', 'basepairs'],
+                             help='The type of plot to generate (def=reads).',
+                             default='reads')
+    parser_yield_plot.add_argument('--theme-bw',
+                             dest='theme_bw',
+                             default=False,
+                             action='store_true',
+                             help="Use the ggplot2 black and white theme.")
+    parser_yield_plot.add_argument('--extrapolate',
+                             dest='extrapolate',
+                             metavar='INTEGER',
+                             default=0,
+                             help="Fit a curve and extrapolate to n hours")
+    parser_yield_plot.add_argument('--skip',
+                             dest='skip',
+                             metavar='INTEGER',
+                             type=int,
+                             default=1,
+                             help="Only plot every n points to reduce size")
+    parser_yield_plot.add_argument('--savedf',
+                             dest='savedf',
+                             metavar='STRING',
+                             help='Save the data frame used to construct plot to a file.',
+                             default=None)
+
+    parser_yield_plot.set_defaults(func=run_subtool)
+
+    ############
+    # occupancy
+    ############
+    parser_occupancy = subparsers.add_parser('occupancy',
+                                        help='Inspect pore activity over time for a set of FAST5 files')
+    parser_occupancy.add_argument('files', metavar='FILES', nargs='+',
+                               help='The input FAST5 files.')
+    parser_occupancy.add_argument('--saveas',
+                             dest='saveas',
+                             metavar='STRING',
+                             help='Save the plot to a file. Extension (.pdf or .png) drives type.',
+                             default=None)
+    parser_occupancy.add_argument('--plot-type',
+                             dest='plot_type',
+                             metavar='STRING',
+                             choices=['read_count', 'total_bp'],
+                             help='The type of plot to generate',
+                             default='read_count')
+
+
+    parser_occupancy.set_defaults(func=run_subtool)
+
+    #######################################################
+    # parse the args and call the selected function
+    #######################################################
+    args = parser.parse_args()
+
+    # -q/--quiet is registered on every sub-command parser
+    if args.quiet:
+        logger.setLevel(logging.ERROR)
+    try:
+        args.func(parser, args)
+    except IOError as e:
+        if e.errno != 32:  # errno 32 = broken pipe (e.g. output piped to `head`)
+            raise
+
+if __name__ == "__main__":  # run the command-line interface when executed as a script
+    main()
diff --git a/poretools/qualdist.py b/poretools/qualdist.py
new file mode 100644
index 0000000..66fbb26
--- /dev/null
+++ b/poretools/qualdist.py
@@ -0,0 +1,19 @@
+import Fast5File
+from collections import Counter
+
+def run(parser, args):
+	"""Print a tab-separated row per quality score observed across all reads."""
+	score_counts = Counter()
+	n_scores = 0
+	# Each quality character is turned into a score by subtracting 33 from its code.
+	for f5 in Fast5File.Fast5FileSet(args.files):
+		record = f5.get_fastq()
+		if record is not None:
+			for symbol in record.qual:
+				score_counts[ord(symbol)-33] += 1
+				n_scores += 1
+		f5.close()
+	# Columns: quality character, score, count, total, fraction of total.
+	for score in score_counts:
+		row = [chr(score+33), score, score_counts[score], n_scores, float(score_counts[score]) / float(n_scores)]
+		print '\t'.join(str(field) for field in row)
\ No newline at end of file
diff --git a/poretools/readstats.py b/poretools/readstats.py
new file mode 100644
index 0000000..3d7658f
--- /dev/null
+++ b/poretools/readstats.py
@@ -0,0 +1,27 @@
+import Fast5File
+
+def run(parser, args):
+	"""
+	Print one tab-separated line per FAST5 file: start time, channel number,
+	read number, and the number of template and complement events.
+	"""
+	print "start_time\tchannel_number\tread_number\ttemplate_events\tcomplement_events"
+
+	for f5 in Fast5File.Fast5FileSet(args.files):
+		# read the metadata in the same order as the output columns
+		started = f5.get_start_time()
+		channel = f5.get_channel_number()
+		read_no = f5.get_read_number()
+
+		# a strand whose events are missing (None) is reported with 0 events
+		template = f5.get_template_events()
+		n_template = 0 if template is None else len(template)
+
+		complement = f5.get_complement_events()
+		n_complement = 0 if complement is None else len(complement)
+
+		# emit the row before releasing the file
+		columns = (started, channel, read_no, n_template, n_complement)
+		print "%s\t%s\t%s\t%s\t%s" % columns
+
+		f5.close()
diff --git a/poretools/scripts/__init__.py b/poretools/scripts/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/poretools/scripts/poretools b/poretools/scripts/poretools
new file mode 100755
index 0000000..40cd981
--- /dev/null
+++ b/poretools/scripts/poretools
@@ -0,0 +1,5 @@
+#!/usr/bin/python -E
+# Launcher script: imports the poretools package and calls poretools_main.main().
+import poretools.scripts
+import poretools.poretools_main
+poretools.poretools_main.main()
diff --git a/poretools/scripts/poretools-script.py b/poretools/scripts/poretools-script.py
new file mode 100644
index 0000000..7a68453
--- /dev/null
+++ b/poretools/scripts/poretools-script.py
@@ -0,0 +1,5 @@
+#!C:\Anaconda\python.exe
+# EASY-INSTALL-SCRIPT: 'poretools==0.2.0','poretools'
+__requires__ = 'poretools==0.2.0'  # NOTE(review): pins 0.2.0 while this import is upstream 0.5.1 - confirm whether this generated launcher is stale
+import pkg_resources
+pkg_resources.run_script('poretools==0.2.0', 'poretools')
diff --git a/poretools/scripts/poretools.bat b/poretools/scripts/poretools.bat
new file mode 100644
index 0000000..14f59bc
--- /dev/null
+++ b/poretools/scripts/poretools.bat
@@ -0,0 +1,4 @@
+@echo off
+set PYFILE=%~f0
+set PYFILE=%PYFILE:~0,-4%-script.py
+"%~f0\..\..\python.exe" "%PYFILE%" %*
diff --git a/poretools/squiggle.py b/poretools/squiggle.py
new file mode 100644
index 0000000..311964f
--- /dev/null
+++ b/poretools/squiggle.py
@@ -0,0 +1,107 @@
+import os
+import sys
+import rpy2.robjects as robjects
+import rpy2.robjects.lib.ggplot2 as ggplot2
+from rpy2.robjects.packages import importr
+
+#logging
+import logging
+logger = logging.getLogger('poretools')
+
+import Fast5File
+
+def plot_squiggle(args, filename, start_times, mean_signals):
+	"""
+	Use rpy2 to create a squiggle plot of the read
+
+	start_times and mean_signals are parallel, non-empty lists (one entry per
+	event), plotted relative to the first time over at most args.num_facets
+	stacked facets. With args.saveas ("pdf"/"png") the plot is written to
+	<basename(filename)>.<saveas>; otherwise it is shown until Enter is pressed.
+	Raises Exception if that plot file already exists.
+	"""
+	r = robjects.r
+	r.library("ggplot2")
+	grdevices = importr('grDevices')
+
+	# set t_0 as the first measured time for the read.
+	t_0 = start_times[0]
+	total_time = start_times[-1] - start_times[0]
+	# adjust times to be relative to t_0
+	r_start_times = robjects.FloatVector([t - t_0 for t in start_times])
+	r_mean_signals = robjects.FloatVector(mean_signals)
+
+	# infer the appropriate number of events given the number of facets
+	num_events = len(r_mean_signals)
+	events_per_facet = (num_events // args.num_facets) + 1
+	# dummy variable to control faceting
+	facet_category = robjects.FloatVector([(i // events_per_facet) + 1 for i in range(len(start_times))])
+
+	# make a data frame of the start times and mean signals
+	d = {'start': r_start_times, 'mean': r_mean_signals, 'cat': facet_category}
+	df = robjects.DataFrame(d)
+
+	pp = ggplot2.ggplot(df) \
+		+ ggplot2.aes_string(x='start', y='mean') \
+		+ ggplot2.geom_step(size=0.25) \
+		+ ggplot2.facet_wrap(robjects.Formula('~cat'), ncol=1, scales="free_x") \
+		+ ggplot2.scale_x_continuous('Time (seconds)') \
+		+ ggplot2.scale_y_continuous('Mean signal (picoamps)') \
+		+ ggplot2.ggtitle('Squiggle plot for read: ' + filename + "\nTotal time (sec): " + str(total_time))
+	if args.theme_bw:
+		# theme_bw() is a complete theme: add it before the title tweak below,
+		# otherwise it would reset the plot.title size set by theme().
+		pp = pp + ggplot2.theme_bw()
+	pp = pp + ggplot2.theme(**{'plot.title': ggplot2.element_text(size=11)})
+
+	if args.saveas is not None:
+		plot_file = os.path.basename(filename) + "." + args.saveas
+		if os.path.isfile(plot_file):
+			raise Exception('Cannot create plot for %s: plot file %s already exists' % (filename, plot_file))
+		if args.saveas == "pdf":
+			grdevices.pdf(plot_file, width = 8.5, height = 11)
+		elif args.saveas == "png":
+			grdevices.png(plot_file, width = 8.5, height = 11, 
+				units = "in", res = 300)
+		pp.plot()
+		grdevices.dev_off()
+	else:
+		pp.plot()
+		# keep the plot open until user hits enter
+		print('Type enter to exit.')
+		raw_input()
+
+def do_plot_squiggle(args, fast5):
+	"""Plot the template events of one open FAST5 read, then close the file."""
+	start_times = []
+	mean_signals = []
+	# readstats guards this call against None; treat None as "no events" so we warn below
+	events = fast5.get_template_events()
+	for event in (events if events is not None else []):
+		start_times.append(event.start)
+		mean_signals.append(event.mean)
+	if start_times:
+		plot_squiggle(args, fast5.filename, start_times, mean_signals)
+	else:
+		logger.warning("Could not extract template events for read: %s.\n" \
+			% fast5.filename)
+	fast5.close()
+
+
+def run(parser, args):
+	"""Plot a squiggle for each input read; more than one read requires --saveas."""
+	fast5_set = Fast5File.Fast5FileSet(args.files)
+	# Hold the first read back until we know whether a second one follows.
+	first_fast5 = fast5_set.next()
+	for fast5 in fast5_set:
+		# only create a squiggle plot for multiple reads if saving to file.
+		if args.saveas is None:
+			sys.exit("""Please use --saveas when plotting"""
+					 """ multiple FAST5 files as input.\n""")
+		if first_fast5 is not None:
+			do_plot_squiggle(args, first_fast5)
+			first_fast5 = None
+		do_plot_squiggle(args, fast5)
+	# Still pending here only if the loop body never ran (a single read).
+	if first_fast5 is not None:
+		do_plot_squiggle(args, first_fast5)
diff --git a/poretools/statistics.py b/poretools/statistics.py
new file mode 100644
index 0000000..4ab2a34
--- /dev/null
+++ b/poretools/statistics.py
@@ -0,0 +1,51 @@
+def mean(l):
+	"""
+	Arithmetic mean of the numbers in list l, as a float.
+
+	Returns None when l is not a list or is empty.
+	"""
+	if not isinstance(l, list) or not l:
+		return None
+	n_values = float(len(l))
+	# float() on both operands keeps the division from truncating integers
+	return float(sum(l)) / n_values
+
+def median(l):
+	"""
+	Return the median of a list of numbers
+
+	For an even number of values the mean of the two middle values is
+	returned as a float. Returns None when l is not a list or is empty.
+	"""
+	if not isinstance(l, list) or not l:
+		return None
+	ordered = sorted(l)
+	# floor division keeps the index an integer on every Python version
+	mid = len(ordered) // 2
+	if len(ordered) % 2 > 0:
+		return ordered[mid]
+	return float(ordered[mid - 1] + ordered[mid]) / 2.0
+
+def NX(l, x=[25,50,75]):
+	"""
+	Returns NX for all x for a list of numbers l.
+	Default: N25, N50, N75
+	Assumes all values in list x are between 0 and 100.
+	Interpretation: When NX = NX_value, X% of data (in bp) is contained in reads at least NX_value bp long.
+	Returns a dict {X: NX_value}, or None if l or x is not a list.
+	A value stays 0 when no read is needed to reach X% (e.g. l is empty).
+	"""
+	if not (isinstance(l, list) and isinstance(x, list)):
+		return None
+	remaining = sorted(l)  # sorted copy: popping below never touches the caller's list
+	total = sum(remaining)
+	nxsum = 0
+	lastsize = 0  # last read length consumed; 0 until the first read is taken
+	nxvalues = {}
+	for e in sorted(x):  # ascending thresholds share one running sum
+		xpct = total*e/100.0
+		while nxsum < xpct and remaining:
+			lastsize = remaining.pop()  # largest remaining read
+			nxsum += lastsize
+		nxvalues[e] = lastsize
+	return nxvalues
diff --git a/poretools/stats.py b/poretools/stats.py
new file mode 100644
index 0000000..b2b9920
--- /dev/null
+++ b/poretools/stats.py
@@ -0,0 +1,63 @@
+import statistics as stat
+import Fast5File
+import logging
+from collections import defaultdict
+logger = logging.getLogger('poretools')
+
+def _print_summary(sizes, prefix=""):
+	"""Print one tab-separated row per statistic of the non-empty list sizes, each row led by prefix."""
+	nxvalues = stat.NX(sizes, [25,50,75])
+	rows = [("total reads", "%d", len(sizes)),
+		("total base pairs", "%d", sum(sizes)),
+		("mean", "%.2f", stat.mean(sizes)),
+		("median", "%d", stat.median(sizes)),
+		("min", "%d", min(sizes)),
+		("max", "%d", max(sizes)),
+		("N25", "%d", nxvalues[25]),
+		("N50", "%d", nxvalues[50]),
+		("N75", "%d", nxvalues[75])]
+	for label, fmt, value in rows:
+		print prefix + label + "\t" + fmt % value
+
+def run(parser, args):
+	"""
+	Print read-length statistics for the FAST5 files named in args.files.
+	With args.full_tsv, file counts are printed, then one group of rows per
+	sequence category, each row prefixed by the category ('twodirections'
+	reads passing fast5.is_high_quality() are also counted as '2D_hq').
+	Otherwise one group is printed for the reads of type args.type.
+	"""
+	if args.full_tsv:
+		files = 0
+		basecalled_files = 0
+		stats = defaultdict(list)
+		for fast5 in Fast5File.Fast5FileSet(args.files):
+			files += 1
+			fas = fast5.get_fastas_dict()
+			if len(fas) > 0:
+				basecalled_files += 1
+			for category, fa in fas.iteritems():
+				if fa is None:
+					continue
+				stats[category].append(len(fa.seq))
+				if category == 'twodirections' and fast5.is_high_quality():
+					stats['2D_hq'].append(len(fa.seq))
+			fast5.close()
+		print "files\ttotal reads\t%d" % (files)
+		print "files\ttotal base-called reads\t%d" % (basecalled_files)
+		# Categories exist only once a length is stored, so the old
+		# per-category empty check never fired; warn on an empty run here.
+		if not stats:
+			logger.warning("No valid sequences observed.\n")
+		for category in sorted(stats.keys()):
+			_print_summary(stats[category], category + "\t")
+	else:
+		sizes = []
+		for fast5 in Fast5File.Fast5FileSet(args.files):
+			fas = fast5.get_fastas(args.type)
+			sizes.extend([len(fa.seq) for fa in fas if fa is not None])
+			fast5.close()
+		if len(sizes) > 0:
+			_print_summary(sizes)
+		else:
+			logger.warning("No valid sequences observed.\n")
diff --git a/poretools/tabular.py b/poretools/tabular.py
new file mode 100644
index 0000000..24a6a2e
--- /dev/null
+++ b/poretools/tabular.py
@@ -0,0 +1,14 @@
+import Fast5File
+
+def run(parser, args):
+	"""
+	Print a header, then one tab-separated row (length, name, sequence,
+	quals) for each FASTQ record of type args.type in args.files.
+	"""
+	print '\t'.join(['length', 'name', 'sequence', 'quals'])
+	for fast5 in Fast5File.Fast5FileSet(args.files):
+		for fq in fast5.get_fastqs(args.type):
+			# Skip missing records; the file is closed once, after the loop.
+			if fq is not None:
+				print '\t'.join([str(len(fq.seq)), fq.name, fq.seq, fq.qual])
+		fast5.close()
\ No newline at end of file
diff --git a/poretools/times.py b/poretools/times.py
new file mode 100644
index 0000000..577ec39
--- /dev/null
+++ b/poretools/times.py
@@ -0,0 +1,43 @@
+import Fast5File
+from time import strftime, localtime
+import sys
+
+#logging
+import logging
+logger = logging.getLogger('poretools')
+
+def run(parser, args):
+	"""
+	Print a tab-separated table of timing details, one row per FAST5 file.
+
+	Files that are not open are passed over; files without a start time
+	are closed and reported through a logger warning. A file with no
+	FASTQ record gets a read_length of 0.
+	"""
+	header = ['channel', 'filename', 'read_length',
+		'exp_starttime', 'unix_timestamp', 'duration',
+		'unix_timestamp_end', 'iso_timestamp', 'day',
+		'hour', 'minute']
+	print '\t'.join(header)
+
+	for fast5 in Fast5File.Fast5FileSet(args.files):
+		if not fast5.is_open:
+			continue
+
+		fq = fast5.get_fastq()
+		start_time = fast5.get_start_time()
+		if start_time is None:
+			logger.warning("No start time for %s!" % (fast5.filename))
+			fast5.close()
+			continue
+
+		read_length = len(fq.seq) if fq is not None else 0
+		lt = localtime(start_time)
+		row = [fast5.get_channel_number(), fast5.filename, str(read_length),
+			fast5.get_exp_start_time(), str(start_time),
+			str(fast5.get_duration()), str(fast5.get_end_time())]
+
+		# iso_timestamp, day, hour and minute all come from the local time
+		row.extend(strftime(fmt, lt) for fmt in ('%Y-%m-%dT%H:%M:%S%z', '%d', '%H', '%M'))
+		print "\t".join(row)
+		fast5.close()
diff --git a/poretools/version.py b/poretools/version.py
new file mode 100644
index 0000000..b72a06e
--- /dev/null
+++ b/poretools/version.py
@@ -0,0 +1,2 @@
+# poretools release number; setup.py extracts it by splitting this file on the equals sign.
+__version__="0.5.1"
diff --git a/poretools/windows.py b/poretools/windows.py
new file mode 100644
index 0000000..f63e343
--- /dev/null
+++ b/poretools/windows.py
@@ -0,0 +1,19 @@
+import _winreg
+import os
+
+current_version = None
+# Locate R's registry key: per-user hive first, machine-wide hive as fallback.
+try:
+	key = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, "SOFTWARE\\R-core\\R")
+except Exception:
+	key = _winreg.OpenKey(_winreg.HKEY_LOCAL_MACHINE, "SOFTWARE\\R-core\\R")
+# Read the "Current Version" and "InstallPath" values stored under that key.
+version = _winreg.QueryValueEx(key, "Current Version")[0]
+install_path = _winreg.QueryValueEx(key, "InstallPath")[0]
+# Export R_HOME / R_USER into this process's environment when the module runs.
+os.environ['R_HOME'] = install_path
+os.environ['R_USER'] = os.environ['HOMEPATH'] + '\\Documents'
+# NOTE(review): current_version and version are not used in this module.
+print "Setting R_HOME to %s" % (install_path,)
+print "Setting R_USER to %s" % (os.environ['R_USER'])
+
diff --git a/poretools/winner.py b/poretools/winner.py
new file mode 100644
index 0000000..a6671ac
--- /dev/null
+++ b/poretools/winner.py
@@ -0,0 +1,25 @@
+import Fast5File
+import sys
+
+#logging
+import logging
+logger = logging.getLogger('poretools')
+
+
+def run(parser, args):
+	"""Print the longest read of type args.type in args.files; warn if there is none."""
+	longest_size = 0
+	longest_read = None
+	for fast5 in Fast5File.Fast5FileSet(args.files):
+		for fa in fast5.get_fastas(args.type):
+			if fa and len(fa.seq) > longest_size:
+				longest_size = len(fa.seq)
+				longest_read = fa
+		fast5.close()
+	if longest_read is None:
+		# Previously this printed the literal "None" as if it were a read.
+		logger.warning("No valid sequences observed.\n")
+		return
+	logger.info("Wow, it's a whopper: your longest read is %d bases." % (longest_size,))
+	print longest_read
+
diff --git a/poretools/yield_plot.py b/poretools/yield_plot.py
new file mode 100644
index 0000000..26ce09e
--- /dev/null
+++ b/poretools/yield_plot.py
@@ -0,0 +1,125 @@
+import Fast5File
+from time import strftime, localtime
+import rpy2.robjects.lib.ggplot2 as ggplot2
+import rpy2.robjects as robjects
+from rpy2.robjects.packages import importr
+
+#logging
+import logging
+logger = logging.getLogger('poretools')
+logger.setLevel(logging.INFO)
+
+def plot_collectors_curve(args, start_times, read_lengths):
+	"""
+	Use rpy2 to create a collectors curve of the run: cumulative reads or
+	base pairs (args.plot_type) against hours elapsed since start_times[0].
+	Saved to args.saveas (.pdf or .png) if set, else shown until enter is hit.
+	"""
+	# sys was never imported by this module, so sys.exit() below raised NameError.
+	import sys
+	r = robjects.r
+	r.library("ggplot2")
+	grdevices = importr('grDevices')
+
+	# set t_0 as the first measured time for the read.
+	t_0 = start_times[0]
+
+	# adjust times to be relative to t_0
+	r_start_times = robjects.FloatVector([float(t - t_0) / float(3600) + 0.00000001 \
+		for t in start_times])
+	r_read_lengths = robjects.IntVector(read_lengths)
+
+	# compute the cumulative based on reads or total base pairs
+	if args.plot_type == 'reads':
+		y_label = "Total reads"
+		cumulative = r.cumsum(robjects.IntVector([1] * len(start_times)))
+	elif args.plot_type == 'basepairs':
+		y_label = "Total base pairs"
+		cumulative = r.cumsum(r_read_lengths)
+
+	# make a data frame of the lists, keeping only every step-th point
+	step = args.skip
+	d = {'start': robjects.FloatVector([r_start_times[n] for n in xrange(0, len(r_start_times), step)]),
+		'lengths': robjects.IntVector([r_read_lengths[n] for n in xrange(0, len(r_read_lengths), step)]),
+		'cumul': robjects.IntVector([cumulative[n] for n in xrange(0, len(cumulative), step)])}
+	df = robjects.DataFrame(d)
+
+	if args.savedf:
+		robjects.r("write.table")(df, file=args.savedf, sep="\t")
+
+	# title
+	total_reads = len(read_lengths)
+	total_bp = sum(read_lengths)
+	plot_title = "Yield: " + str(total_reads) + " reads and " + str(total_bp) + " base pairs."
+
+	# plot
+	gp = ggplot2.ggplot(df)
+	pp = gp + ggplot2.aes_string(x='start', y='cumul') \
+		+ ggplot2.geom_step(size=2) \
+		+ ggplot2.scale_x_continuous('Time (hours)') \
+		+ ggplot2.scale_y_continuous(y_label) \
+		+ ggplot2.ggtitle(plot_title)
+
+	# extrapolation
+	if args.extrapolate:
+		start = robjects.ListVector({'a': 1, 'b': 1})
+		pp = pp + ggplot2.stat_smooth(fullrange='TRUE', method='nls',
+			formula='y~a*I((x*3600)^b)',
+			se='FALSE', start=start) \
+			+ ggplot2.xlim(0, float(args.extrapolate))
+
+	if args.theme_bw:
+		pp = pp + ggplot2.theme_bw()
+
+	if args.saveas is not None:
+		plot_file = args.saveas
+		if plot_file.endswith(".pdf"):
+			grdevices.pdf(plot_file, width = 8.5, height = 8.5)
+		elif plot_file.endswith(".png"):
+			grdevices.png(plot_file, width = 8.5, height = 8.5,
+				units = "in", res = 300)
+		else:
+			logger.error("Unrecognized extension for %s!" % (plot_file))
+			sys.exit()
+
+		pp.plot()
+		grdevices.dev_off()
+	else:
+		pp.plot()
+		# keep the plot open until user hits enter
+		print('Type enter to exit.')
+		raw_input()
+
+def run(parser, args):
+	"""
+	Collect the start time and read length (0 when there is no FASTQ) of
+	every open FAST5 file in args.files and plot them as a collectors
+	curve. Files lacking a start time are skipped with a warning.
+	"""
+	start_times = []
+	read_lengths = []
+	files_processed = 0
+	for fast5 in Fast5File.Fast5FileSet(args.files):
+		if fast5.is_open:
+			fq = fast5.get_fastq()
+			start_time = fast5.get_start_time()
+			if start_time is None:
+				logger.warning("No start time for %s!" % (fast5.filename))
+				fast5.close()
+				continue
+			start_times.append(start_time)
+			read_lengths.append(len(fq.seq) if fq is not None else 0)
+			fast5.close()
+
+		files_processed += 1
+		if files_processed % 100 == 0:
+			logger.info("%d files processed." % files_processed)
+
+	# zip(*sorted(...)) cannot be unpacked when nothing was collected.
+	if not start_times:
+		logger.error("No reads with a start time; nothing to plot.")
+		return
+	# sort the data by start time
+	start_times, read_lengths = (list(t) for t in zip(*sorted(zip(start_times, read_lengths))))
+	plot_collectors_curve(args, start_times, read_lengths)
+
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5fba58b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+h5py>=2.0.0
+rpy2>=2.4.2
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..89bdd6a
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,41 @@
+import os
+import sys
+from setuptools import setup
+
+# Resolve paths against this script's directory so setup.py runs from any cwd.
+here = os.path.dirname(os.path.abspath(__file__))
+version_py = os.path.join(here, 'poretools', 'version.py')
+with open(version_py) as f:  # close the handle instead of leaking it
+    version = f.read().strip().split('=')[-1].replace('"','')
+long_description = """
+``poretools`` is a toolset for working with nanopore sequencing data'
+"""
+with open(os.path.join(here, "requirements.txt"), "r") as f:
+    # drop blank lines so no empty requirement string reaches setuptools
+    install_requires = [x.strip() for x in f if x.strip()]
+
+setup(
+        name="poretools",
+        version=version,
+        install_requires=install_requires,
+        requires = ['python (>=2.7, <3.0)'],
+        packages=['poretools', 'poretools.scripts'],
+        author="Nick Loman and Aaron Quinlan",
+        description='A toolset for working with nanopore sequencing data',
+        long_description=long_description,
+        url="http://poretools.readthedocs.org",
+        package_dir = {'poretools': "poretools"},
+        package_data = {'poretools': []},
+        zip_safe = False,
+        include_package_data=True,
+        #scripts = ['poretools/scripts/poretools'],
+        entry_points = {
+            'console_scripts' : ['poretools = poretools.poretools_main:main'],
+        },
+        author_email="arq5x at virginia.edu",
+        classifiers=[
+            'Development Status :: 4 - Beta',
+            'Intended Audience :: Science/Research',
+            'License :: OSI Approved :: GNU General Public License (GPL)',
+            'Topic :: Scientific/Engineering :: Bio-Informatics']
+    )

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/poretools.git



More information about the debian-med-commit mailing list