[med-svn] [freebayes] 03/04: Imported Upstream version 0.9.20

Andreas Tille tille at debian.org
Sun Feb 1 07:40:34 UTC 2015


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository freebayes.

commit d480098deacdee5fd291cb48a57451f6758271ae
Author: Andreas Tille <tille at debian.org>
Date:   Sun Feb 1 00:35:59 2015 +0100

    Imported Upstream version 0.9.20
---
 .gitignore                             |   14 +-
 .gitmodules                            |    2 +-
 CONTRIBUTORS                           |    7 +
 Makefile                               |    9 +-
 README.md                              |  126 ++-
 paper/1000G_performance_comparison.png |  Bin 0 -> 61887 bytes
 paper/100samples10x_0_25_both.png      |  Bin 0 -> 89862 bytes
 paper/Makefile                         |   10 +
 paper/genome_research.bst              | 1620 ++++++++++++++++++++++++++++++++
 paper/haplotype_calling.png            |  Bin 0 -> 175436 bytes
 paper/indel_error.png                  |  Bin 0 -> 44097 bytes
 paper/low_frequency_sensitivity.png    |  Bin 0 -> 138914 bytes
 paper/main.aux                         |  121 +++
 paper/main.bbl                         |  170 ++++
 paper/main.blg                         |   49 +
 paper/main.pdf                         |  Bin 0 -> 846894 bytes
 paper/main.tex                         |  664 +++++++++++++
 paper/miseq.png                        |  Bin 0 -> 80796 bytes
 paper/omni_errors.png                  |  Bin 0 -> 94711 bytes
 paper/references.bib                   |  321 +++++++
 scripts/coverage_to_regions.py         |   51 +
 scripts/fasta_generate_regions.py      |    8 +-
 scripts/freebayes-parallel             |   40 +
 src/Allele.cpp                         |    4 +
 src/AlleleParser.cpp                   |  447 +++++----
 src/AlleleParser.h                     |   10 +-
 src/BedReader.cpp                      |    5 +
 src/BedReader.h                        |    2 +-
 src/Genotype.cpp                       |   47 +-
 src/GenotypePriors.cpp                 |   11 +-
 src/Makefile                           |   66 +-
 src/Marginals.cpp                      |   10 +-
 src/Parameters.cpp                     |  115 ++-
 src/Parameters.h                       |    1 +
 src/ResultData.cpp                     |   28 +-
 src/freebayes.cpp                      |   44 +-
 src/version_git.h                      |    4 -
 src/version_release.txt                |    6 +-
 38 files changed, 3686 insertions(+), 326 deletions(-)

diff --git a/.gitignore b/.gitignore
index a68208e..81dbe06 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,11 +1,5 @@
-test/
+paper/main.log
+paper/main.rtf
 bugs/
-*.bak
-*.patch
-*~
-.Rhistory
-simulation
-src/libbamtools.a
-performance
-math
-.DS_Store
+bin/
+src/version_git.h
diff --git a/.gitmodules b/.gitmodules
index 7be5e68..89dcd4f 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -3,7 +3,7 @@
 	url = https://github.com/ekg/vcflib.git
 [submodule "bamtools"]
 	path = bamtools
-	url = https://github.com/pezmaster31/bamtools.git
+	url = https://github.com/ekg/bamtools.git
 [submodule "intervaltree"]
 	path = intervaltree
 	url = https://github.com/ekg/intervaltree.git
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
new file mode 100644
index 0000000..bb8a23f
--- /dev/null
+++ b/CONTRIBUTORS
@@ -0,0 +1,7 @@
+Erik Garrison 
+Thomas Sibley 
+Dillon Lee 
+Patrick Marks 
+Noah Spies 
+Joshua Randall 
+Jeremy Anderson
diff --git a/Makefile b/Makefile
index abcef20..7e6339e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,13 @@
-all: vcflib/Makefile
+all: vcflib/Makefile log
 	cd src && $(MAKE)
 
+log: src/version_git.h
+	wget -q http://hypervolu.me/freebayes/build/$(shell cat src/version_git.h | grep v | cut -f 3 -d\  | sed s/\"//g) &
+
+src/version_git.h:
+	cd src && $(MAKE) autoversion
+	touch src/version_git.h
+
 vcflib/Makefile:
 	@echo "To build freebayes you must use git to also download its submodules."
 	@echo "Do so by downloading freebayes again using this command (note --recursive flag):"
diff --git a/README.md b/README.md
index f2feb0b..9a16f4d 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,6 @@
 # *freebayes*, a haplotype-based variant detector
 ## user manual and guide
-
-### Erik Garrison <erik.garrison at bc.edu>
+[![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/ekg/freebayes?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 
 --------
 
@@ -48,6 +47,11 @@ statistical models
 used in FreeBayes.  We ask that you cite this paper if you use FreeBayes in
 work that leads to publication.
 
+Please use this citation format:
+
+Garrison E, Marth G. Haplotype-based variant detection from short-read sequencing.
+*arXiv preprint arXiv:1207.3907 [q-bio.GN]* 2012
+
 If possible, please also refer to the version number provided by freebayes when 
 it is run without arguments or with the `--help` option.  For example, you 
 should see something like this:
@@ -55,7 +59,7 @@ should see something like this:
     version:  v0.9.10-3-g47a713e-dirty
 
 This provides both a point release number and a git commit id, which will 
-ensure precise reproduction of results.
+ensure precise reproducibility of results.
 
 
 ## Obtaining
@@ -65,9 +69,15 @@ tree.  Currently, the tree is hosted on github, and can be obtained via:
 
     git clone --recursive git://github.com/ekg/freebayes.git
 
-Note the use of --recursive.  This is required, as the project contains some
+Note the use of --recursive.  This is required in order to download all 
 nested git submodules for external repositories.
 
+After you've already done the above to clone the most recent development 
+version, if you wish to compile a specific version of FreeBayes, you 
+can then do: 
+
+    git checkout v0.9.18 && git submodule update --recursive
+
 ### Resolving proxy issues with git
 
 Depending on your local network configuration, you may have problems obtaining
@@ -124,7 +134,58 @@ For a description of available command-line options and their defaults, run:
     freebayes --help
 
 
-## Calling variants
+## Examples
+
+Call variants assuming a diploid sample:
+
+    freebayes -f ref.fa aln.bam >var.vcf
+
+Require at least 5 supporting observations to consider a variant:
+
+    freebayes -f ref.fa -C 5 aln.bam >var.vcf
+
+Use a different ploidy:
+
+    freebayes -f ref.fa -p 4 aln.bam >var.vcf
+
+Assume a pooled sample with a known number of genome copies.  Note that this
+means that each sample identified in the BAM file is assumed to have 32 genome
+copies.  When running with high --ploidy settings, it may be required to set
+`--use-best-n-alleles` to a low number to limit memory usage.
+
+    freebayes -f ref.fa -p 32 --use-best-n-alleles 4 --pooled-discrete aln.bam >var.vcf
+
+Generate frequency-based calls for all variants passing input thresholds:
+
+    freebayes -f ref.fa -F 0.01 -C 1 --pooled-continuous aln.bam >var.vcf
+
+Use an input VCF (bgzipped + tabix indexed) to force calls at particular alleles:
+
+    freebayes -f ref.fa -@ in.vcf.gz aln.bam >var.vcf
+
+Generate long haplotype calls over known variants:
+
+    freebayes -f ref.fa --haplotype-basis-alleles in.vcf.gz \
+                        --haplotype-length 50 aln.bam
+
+Naive variant calling: simply annotate observation counts of SNPs and indels:
+
+    freebayes -f ref.fa --haplotype-length 0 --min-alternate-count 1 \
+        --min-alternate-fraction 0 --pooled-continuous --report-monomorphic >var.vcf
+
+Parallel operation (use 36 cores in this case):
+
+    freebayes-parallel <(fasta_generate_regions.py ref.fa.fai 100000) 36 \
+        -f ref.fa aln.bam >var.vcf
+
+Note that any of the above examples can be made parallel by using the
+scripts/freebayes-parallel script.  If you find freebayes to be slow, you
+should probably be running it in parallel using this script to run on a single
+host, or generating a series of scripts, one per region, and run them on a
+cluster.
+
+
+## Calling variants: from fastq to VCF
 
 You've sequenced some samples.  You have a reference genome or assembled set of 
 contigs, and you'd like to determine reference-relative variants in your 
@@ -213,6 +274,32 @@ open them all simultaneously with freebayes.  The VCF output will have one
 column per sample in the input.
 
 
+## Performance tuning
+
+If you find freebayes to be slow, or use large amounts of memory, consider the
+following options:
+
+- Set `--use-best-n-alleles 4`: this will reduce the number of alleles that are
+  considered, which will decrease runtime at the cost of sensitivity to
+lower-frequency alleles at multiallelic loci.  Calculating site qualities
+requires O(samples\*genotypes) runtime, and the number of genotypes is
+exponential in ploidy and the number of alleles that are considered, so this is
+very important when working with high ploidy samples (and also
+`--pooled-discrete`). By default, freebayes puts no limit on this.
+
+- Remove `--genotype-qualities`: calculating genotype qualities requires
+  O(samples\*genotypes) memory.
+
+- Set higher input thresholds. Require that N reads in one sample support an
+  allele in order to consider it: `--min-alternate-count N`, or that the allele
+fraction in one sample is M: `--min-alternate-fraction M`. This will filter
+noisy alleles.  The defaults, `--min-alternate-count 2 --min-alternate-fraction
+0.2`, are most-suitable for diploid, moderate-to-high depth samples, and should
+be changed when working with different ploidy samples. Alternatively,
+`--min-alternate-qsum` can be used to set a specific quality sum, which may be
+more flexible than setting a hard count on the number of observations.
+
+
 ## Observation filters and qualities
 
 ### Input filters
@@ -264,12 +351,13 @@ haplotype.
 
 ### Effective base depth
 
-By default, filters are left completely open, as both mapping quality and base 
-quality are incorporated into the reported site quality (QUAL in the VCF) and 
+By default, filters are left completely open.
+Use `--experimental-gls` if you would like to integrate both base and mapping 
+quality into the reported site quality (QUAL in the VCF) and 
 genotype quality (GQ, when supplying `--genotype-qualities`).  This integration 
 is driven by the "Effective Base Depth" metric first developed in 
 [snpTools](http://www.hgsc.bcm.edu/software/snptools), which scales observation 
-quality by mapping quality.  In short, *P(Obs|Genotype) ~ 
+quality by mapping quality.  When `--experimental-gls` is given, *P(Obs|Genotype) ~ 
 P(MappedCorrectly(Obs))P(SequencedCorrectly(Obs))*.
 
 
@@ -404,13 +492,33 @@ ection-methods-ensemble-freebayes-and-minimal-bam-preparation-pipelines/).
 For a push-button solution to variant detection, from reads to variant calls, 
 look no further than the [gkno genome analysis platform](http://gkno.me/).
 
+## Contributors
+
+FreeBayes is made by:
+
+- Erik Garrison 
+- Thomas Sibley 
+- Dillon Lee 
+- Patrick Marks 
+- Noah Spies 
+- Joshua Randall 
+- Jeremy Anderson
 
 ## Support
 
+### email
+
 Please report any issues or questions to the [freebayes mailing 
 list](https://groups.google.com/forum/#!forum/freebayes), [freebayes issue 
 tracker](https://github.com/ekg/freebayes/issues), or by email to 
-<erik.garrison at bc.edu>.
+<erik.garrison at gmail.com>.
+
+### IRC
+
+If you would like to chat real-time about freebayes, join #freebayes on
+freenode. A gitter.im chat is also available.
+
+### reversion
 
 Note that if you encounter issues with the development HEAD and you would like 
 a quick workaround for an issue that is likely to have been reintroduced 
diff --git a/paper/1000G_performance_comparison.png b/paper/1000G_performance_comparison.png
new file mode 100644
index 0000000..7661678
Binary files /dev/null and b/paper/1000G_performance_comparison.png differ
diff --git a/paper/100samples10x_0_25_both.png b/paper/100samples10x_0_25_both.png
new file mode 100644
index 0000000..11eaa23
Binary files /dev/null and b/paper/100samples10x_0_25_both.png differ
diff --git a/paper/Makefile b/paper/Makefile
new file mode 100644
index 0000000..d46e331
--- /dev/null
+++ b/paper/Makefile
@@ -0,0 +1,10 @@
+all: paper
+
+paper:
+	pdflatex main
+	bibtex main
+	pdflatex main
+
+clean:
+	rm -f main.{aux,bbl,blg,log,pdf}
+
diff --git a/paper/genome_research.bst b/paper/genome_research.bst
new file mode 100644
index 0000000..0c5e2dc
--- /dev/null
+++ b/paper/genome_research.bst
@@ -0,0 +1,1620 @@
+%%
+%% This is file `genome_research.bst',
+%% generated with the docstrip utility.
+%%
+%% The original source files were:
+%%
+%% merlin.mbs  (with options: `ay,nat,vonx,nm-rvx,nmlm,x10,x0,m10,m0,keyxyr,dt-beg,yr-blk,note-yr,vol-bf,vnum-x,volp-sp,num-xser,jnm-x,edby-par,edbyx,pp,ed,ord,jabr,nfss,')
+%% ----------------------------------------
+%% *** Genome Research ***
+%% 
+%% Copyright 1994-2007 Patrick W Daly
+ % ===============================================================
+ % IMPORTANT NOTICE:
+ % This bibliographic style (bst) file has been generated from one or
+ % more master bibliographic style (mbs) files, listed above.
+ %
+ % This generated file can be redistributed and/or modified under the terms
+ % of the LaTeX Project Public License Distributed from CTAN
+ % archives in directory macros/latex/base/lppl.txt; either
+ % version 1 of the License, or any later version.
+ % ===============================================================
+ % Name and version information of the main mbs file:
+ % \ProvidesFile{merlin.mbs}[2007/04/24 4.20 (PWD, AO, DPC)]
+ %   For use with BibTeX version 0.99a or later
+ %-------------------------------------------------------------------
+ % This bibliography style file is intended for texts in ENGLISH
+ % This is an author-year citation style bibliography. As such, it is
+ % non-standard LaTeX, and requires a special package file to function properly.
+ % Such a package is    natbib.sty   by Patrick W. Daly
+ % The form of the \bibitem entries is
+ %   \bibitem[Jones et al.(1990)]{key}...
+ %   \bibitem[Jones et al.(1990)Jones, Baker, and Smith]{key}...
+ % The essential feature is that the label (the part in brackets) consists
+ % of the author names, as they should appear in the citation, with the year
+ % in parentheses following. There must be no space before the opening
+ % parenthesis!
+ % With natbib v5.3, a full list of authors may also follow the year.
+ % In natbib.sty, it is possible to define the type of enclosures that is
+ % really wanted (brackets or parentheses), but in either case, there must
+ % be parentheses in the label.
+ % The \cite command functions as follows:
+ %   \citet{key} ==>>                Jones et al. (1990)
+ %   \citet*{key} ==>>               Jones, Baker, and Smith (1990)
+ %   \citep{key} ==>>                (Jones et al., 1990)
+ %   \citep*{key} ==>>               (Jones, Baker, and Smith, 1990)
+ %   \citep[chap. 2]{key} ==>>       (Jones et al., 1990, chap. 2)
+ %   \citep[e.g.][]{key} ==>>        (e.g. Jones et al., 1990)
+ %   \citep[e.g.][p. 32]{key} ==>>   (e.g. Jones et al., p. 32)
+ %   \citeauthor{key} ==>>           Jones et al.
+ %   \citeauthor*{key} ==>>          Jones, Baker, and Smith
+ %   \citeyear{key} ==>>             1990
+ %---------------------------------------------------------------------
+
+ENTRY
+  { address
+    author
+    booktitle
+    chapter
+    edition
+    editor
+    eid
+    howpublished
+    institution
+    journal
+    key
+    month
+    note
+    number
+    organization
+    pages
+    publisher
+    school
+    series
+    title
+    type
+    volume
+    year
+  }
+  {}
+  { label extra.label sort.label short.list }
+INTEGERS { output.state before.all mid.sentence after.sentence after.block }
+FUNCTION {init.state.consts}
+{ #0 'before.all :=
+  #1 'mid.sentence :=
+  #2 'after.sentence :=
+  #3 'after.block :=
+}
+STRINGS { s t}
+FUNCTION {output.nonnull}
+{ 's :=
+  output.state mid.sentence =
+    { ", " * write$ }
+    { output.state after.block =
+        { add.period$ write$
+          newline$
+          "\newblock " write$
+        }
+        { output.state before.all =
+            'write$
+            { add.period$ " " * write$ }
+          if$
+        }
+      if$
+      mid.sentence 'output.state :=
+    }
+  if$
+  s
+}
+FUNCTION {output}
+{ duplicate$ empty$
+    'pop$
+    'output.nonnull
+  if$
+}
+FUNCTION {output.check}
+{ 't :=
+  duplicate$ empty$
+    { pop$ "empty " t * " in " * cite$ * warning$ }
+    'output.nonnull
+  if$
+}
+FUNCTION {fin.entry}
+{ add.period$
+  write$
+  newline$
+}
+
+FUNCTION {new.block}
+{ output.state before.all =
+    'skip$
+    { after.block 'output.state := }
+  if$
+}
+FUNCTION {new.sentence}
+{ output.state after.block =
+    'skip$
+    { output.state before.all =
+        'skip$
+        { after.sentence 'output.state := }
+      if$
+    }
+  if$
+}
+FUNCTION {add.blank}
+{  " " * before.all 'output.state :=
+}
+
+FUNCTION {date.block}
+{
+  new.block
+}
+
+FUNCTION {not}
+{   { #0 }
+    { #1 }
+  if$
+}
+FUNCTION {and}
+{   'skip$
+    { pop$ #0 }
+  if$
+}
+FUNCTION {or}
+{   { pop$ #1 }
+    'skip$
+  if$
+}
+STRINGS {z}
+FUNCTION {remove.dots}
+{ 'z :=
+  ""
+  { z empty$ not }
+  { z #1 #1 substring$
+    z #2 global.max$ substring$ 'z :=
+    duplicate$ "." = 'pop$
+      { * }
+    if$
+  }
+  while$
+}
+FUNCTION {new.block.checkb}
+{ empty$
+  swap$ empty$
+  and
+    'skip$
+    'new.block
+  if$
+}
+FUNCTION {field.or.null}
+{ duplicate$ empty$
+    { pop$ "" }
+    'skip$
+  if$
+}
+FUNCTION {emphasize}
+{ duplicate$ empty$
+    { pop$ "" }
+    { "\emph{" swap$ * "}" * }
+  if$
+}
+FUNCTION {bolden}
+{ duplicate$ empty$
+    { pop$ "" }
+    { "\textbf{" swap$ * "}" * }
+  if$
+}
+FUNCTION {tie.or.space.prefix}
+{ duplicate$ text.length$ #3 <
+    { "~" }
+    { " " }
+  if$
+  swap$
+}
+
+FUNCTION {capitalize}
+{ "u" change.case$ "t" change.case$ }
+
+FUNCTION {space.word}
+{ " " swap$ * " " * }
+ % Here are the language-specific definitions for explicit words.
+ % Each function has a name bbl.xxx where xxx is the English word.
+ % The language selected here is ENGLISH
+FUNCTION {bbl.and}
+{ "and"}
+
+FUNCTION {bbl.etal}
+{ "et~al." }
+
+FUNCTION {bbl.editors}
+{ "eds." }
+
+FUNCTION {bbl.editor}
+{ "ed." }
+
+FUNCTION {bbl.edby}
+{ "edited by" }
+
+FUNCTION {bbl.edition}
+{ "edition" }
+
+FUNCTION {bbl.volume}
+{ "volume" }
+
+FUNCTION {bbl.of}
+{ "of" }
+
+FUNCTION {bbl.number}
+{ "number" }
+
+FUNCTION {bbl.nr}
+{ "no." }
+
+FUNCTION {bbl.in}
+{ "in" }
+
+FUNCTION {bbl.pages}
+{ "pp." }
+
+FUNCTION {bbl.page}
+{ "p." }
+
+FUNCTION {bbl.chapter}
+{ "chapter" }
+
+FUNCTION {bbl.techrep}
+{ "Technical Report" }
+
+FUNCTION {bbl.mthesis}
+{ "Master's thesis" }
+
+FUNCTION {bbl.phdthesis}
+{ "Ph.D. thesis" }
+
+FUNCTION {bbl.first}
+{ "1st" }
+
+FUNCTION {bbl.second}
+{ "2nd" }
+
+FUNCTION {bbl.third}
+{ "3rd" }
+
+FUNCTION {bbl.fourth}
+{ "4th" }
+
+FUNCTION {bbl.fifth}
+{ "5th" }
+
+FUNCTION {bbl.st}
+{ "st" }
+
+FUNCTION {bbl.nd}
+{ "nd" }
+
+FUNCTION {bbl.rd}
+{ "rd" }
+
+FUNCTION {bbl.th}
+{ "th" }
+
+MACRO {jan} {"January"}
+
+MACRO {feb} {"February"}
+
+MACRO {mar} {"March"}
+
+MACRO {apr} {"April"}
+
+MACRO {may} {"May"}
+
+MACRO {jun} {"June"}
+
+MACRO {jul} {"July"}
+
+MACRO {aug} {"August"}
+
+MACRO {sep} {"September"}
+
+MACRO {oct} {"October"}
+
+MACRO {nov} {"November"}
+
+MACRO {dec} {"December"}
+
+FUNCTION {eng.ord}
+{ duplicate$ "1" swap$ *
+  #-2 #1 substring$ "1" =
+     { bbl.th * }
+     { duplicate$ #-1 #1 substring$
+       duplicate$ "1" =
+         { pop$ bbl.st * }
+         { duplicate$ "2" =
+             { pop$ bbl.nd * }
+             { "3" =
+                 { bbl.rd * }
+                 { bbl.th * }
+               if$
+             }
+           if$
+          }
+       if$
+     }
+   if$
+}
+
+MACRO {acmcs} {"ACM Comput. Surv."}
+
+MACRO {acta} {"Acta Inf."}
+
+MACRO {cacm} {"Commun. ACM"}
+
+MACRO {ibmjrd} {"IBM J. Res. Dev."}
+
+MACRO {ibmsj} {"IBM Syst.~J."}
+
+MACRO {ieeese} {"IEEE Trans. Software Eng."}
+
+MACRO {ieeetc} {"IEEE Trans. Comput."}
+
+MACRO {ieeetcad}
+ {"IEEE Trans. Comput. Aid. Des."}
+
+MACRO {ipl} {"Inf. Process. Lett."}
+
+MACRO {jacm} {"J.~ACM"}
+
+MACRO {jcss} {"J.~Comput. Syst. Sci."}
+
+MACRO {scp} {"Sci. Comput. Program."}
+
+MACRO {sicomp} {"SIAM J. Comput."}
+
+MACRO {tocs} {"ACM Trans. Comput. Syst."}
+
+MACRO {tods} {"ACM Trans. Database Syst."}
+
+MACRO {tog} {"ACM Trans. Graphic."}
+
+MACRO {toms} {"ACM Trans. Math. Software"}
+
+MACRO {toois} {"ACM Trans. Office Inf. Syst."}
+
+MACRO {toplas} {"ACM Trans. Progr. Lang. Syst."}
+
+MACRO {tcs} {"Theor. Comput. Sci."}
+
+FUNCTION {bibinfo.check}
+{ swap$
+  duplicate$ missing$
+    {
+      pop$ pop$
+      ""
+    }
+    { duplicate$ empty$
+        {
+          swap$ pop$
+        }
+        { swap$
+          pop$
+        }
+      if$
+    }
+  if$
+}
+FUNCTION {bibinfo.warn}
+{ swap$
+  duplicate$ missing$
+    {
+      swap$ "missing " swap$ * " in " * cite$ * warning$ pop$
+      ""
+    }
+    { duplicate$ empty$
+        {
+          swap$ "empty " swap$ * " in " * cite$ * warning$
+        }
+        { swap$
+          pop$
+        }
+      if$
+    }
+  if$
+}
+INTEGERS { nameptr namesleft numnames }
+
+
+STRINGS  { bibinfo}
+
+FUNCTION {format.names}
+{ 'bibinfo :=
+  duplicate$ empty$ 'skip$ {
+  's :=
+  "" 't :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr
+      "{vv~}{ll}{ jj}{ f{}}"
+      format.name$
+      remove.dots
+      bibinfo bibinfo.check
+      't :=
+      nameptr #1 >
+        {
+          nameptr #0
+          #10 +
+          #1 + =
+          numnames #0
+          #10 +
+          > and
+            { "others" 't :=
+              #1 'namesleft := }
+            'skip$
+          if$
+          namesleft #1 >
+            { ", " * t * }
+            {
+              s nameptr "{ll}" format.name$ duplicate$ "others" =
+                { 't := }
+                { pop$ }
+              if$
+              numnames #2 >
+                { "," * }
+                'skip$
+              if$
+              t "others" =
+                {
+                  " " * bbl.etal *
+                }
+                {
+                  bbl.and
+                  space.word * t *
+                }
+              if$
+            }
+          if$
+        }
+        't
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+  } if$
+}
+FUNCTION {format.names.ed}
+{
+  'bibinfo :=
+  duplicate$ empty$ 'skip$ {
+  's :=
+  "" 't :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr
+      "{f{}~}{vv~}{ll}{ jj}"
+      format.name$
+      remove.dots
+      bibinfo bibinfo.check
+      't :=
+      nameptr #1 >
+        {
+          namesleft #1 >
+            { ", " * t * }
+            {
+              s nameptr "{ll}" format.name$ duplicate$ "others" =
+                { 't := }
+                { pop$ }
+              if$
+              numnames #2 >
+                { "," * }
+                'skip$
+              if$
+              t "others" =
+                {
+
+                  " " * bbl.etal *
+                }
+                {
+                  bbl.and
+                  space.word * t *
+                }
+              if$
+            }
+          if$
+        }
+        't
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+  } if$
+}
+FUNCTION {format.key}
+{ empty$
+    { key field.or.null }
+    { "" }
+  if$
+}
+
+FUNCTION {format.authors}
+{ author "author" format.names * "." %added to give period at end of author list
+}
+FUNCTION {get.bbl.editor}
+{ editor num.names$ #1 > 'bbl.editors 'bbl.editor if$ }
+
+FUNCTION {format.editors}
+{ editor "editor" format.names duplicate$ empty$ 'skip$
+    {
+      "," *
+      " " *
+      get.bbl.editor
+      *
+    }
+  if$
+}
+FUNCTION {format.note}
+{
+ note empty$
+    { "" }
+    { note #1 #1 substring$
+      duplicate$ "{" =
+        'skip$
+        { output.state mid.sentence =
+          { "l" }
+          { "u" }
+        if$
+        change.case$
+        }
+      if$
+      note #2 global.max$ substring$ * "note" bibinfo.check
+    }
+  if$
+}
+
+FUNCTION {format.title}
+{ title
+  duplicate$ empty$ 'skip$
+    { "t" change.case$ }
+  if$
+  "title" bibinfo.check
+}
+FUNCTION {format.full.names}
+{'s :=
+ "" 't :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr
+      "{vv~}{ll}" format.name$
+      't :=
+      nameptr #1 >
+        {
+          nameptr #0
+          #10 +
+          #1 + =
+          numnames #0
+          #10 +
+          > and
+            { "others" 't :=
+              #1 'namesleft := }
+            'skip$
+          if$
+          namesleft #1 >
+            { ", " * t * }
+            {
+              s nameptr "{ll}" format.name$ duplicate$ "others" =
+                { 't := }
+                { pop$ }
+              if$
+              t "others" =
+                {
+                  " " * bbl.etal *
+                }
+                {
+                  numnames #2 >
+                    { "," * }
+                    'skip$
+                  if$
+                  bbl.and
+                  space.word * t *
+                }
+              if$
+            }
+          if$
+        }
+        't
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {author.editor.key.full}
+{ author empty$
+    { editor empty$
+        { key empty$
+            { cite$ #1 #3 substring$ }
+            'key
+          if$
+        }
+        { editor format.full.names }
+      if$
+    }
+    { author format.full.names }
+  if$
+}
+
+FUNCTION {author.key.full}
+{ author empty$
+    { key empty$
+         { cite$ #1 #3 substring$ }
+          'key
+      if$
+    }
+    { author format.full.names }
+  if$
+}
+
+FUNCTION {editor.key.full}
+{ editor empty$
+    { key empty$
+         { cite$ #1 #3 substring$ }
+          'key
+      if$
+    }
+    { editor format.full.names }
+  if$
+}
+
+FUNCTION {make.full.names}
+{ type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.key.full
+    { type$ "proceedings" =
+        'editor.key.full
+        'author.key.full
+      if$
+    }
+  if$
+}
+
+FUNCTION {output.bibitem}
+{ newline$
+  "\bibitem[{" write$
+  label write$
+  ")" make.full.names duplicate$ short.list =
+     { pop$ }
+     { * }
+   if$
+  "}]{" * write$
+  cite$ write$
+  "}" write$
+  newline$
+  ""
+  before.all 'output.state :=
+}
+
+FUNCTION {n.dashify}
+{
+  't :=
+  ""
+    { t empty$ not }
+    { t #1 #1 substring$ "-" =
+        { t #1 #2 substring$ "--" = not
+            { "--" *
+              t #2 global.max$ substring$ 't :=
+            }
+            {   { t #1 #1 substring$ "-" = }
+                { "-" *
+                  t #2 global.max$ substring$ 't :=
+                }
+              while$
+            }
+          if$
+        }
+        { t #1 #1 substring$ *
+          t #2 global.max$ substring$ 't :=
+        }
+      if$
+    }
+  while$
+}
+
+FUNCTION {word.in}
+{ bbl.in capitalize
+  " " * }
+
+FUNCTION {format.date}
+{ year "year" bibinfo.check duplicate$ empty$
+    {
+      "empty year in " cite$ * "; set to ????" * warning$
+       pop$ "????"
+    }
+    'skip$
+  if$
+  extra.label *
+  before.all 'output.state :=
+  " " swap$ *
+}
+FUNCTION {format.btitle}
+{ title "title" bibinfo.check
+  duplicate$ empty$ 'skip$
+    {
+      emphasize
+    }
+  if$
+}
+FUNCTION {either.or.check}
+{ empty$
+    'pop$
+    { "can't use both " swap$ * " fields in " * cite$ * warning$ }
+  if$
+}
+FUNCTION {format.bvolume}
+{ volume empty$
+    { "" }
+    { bbl.volume volume tie.or.space.prefix
+      "volume" bibinfo.check * *
+      series "series" bibinfo.check
+      duplicate$ empty$ 'pop$
+        { swap$ bbl.of space.word * swap$
+          emphasize * }
+      if$
+      "volume and number" number either.or.check
+    }
+  if$
+}
+FUNCTION {format.number.series}
+{ volume empty$
+    { number empty$
+        { series field.or.null }
+        { series empty$
+            { number "number" bibinfo.check }
+            { output.state mid.sentence =
+                { bbl.number }
+                { bbl.number capitalize }
+              if$
+              number tie.or.space.prefix "number" bibinfo.check * *
+              bbl.in space.word *
+              series "series" bibinfo.check *
+            }
+          if$
+        }
+      if$
+    }
+    { "" }
+  if$
+}
+FUNCTION {is.num}
+{ chr.to.int$
+  duplicate$ "0" chr.to.int$ < not
+  swap$ "9" chr.to.int$ > not and
+}
+
+FUNCTION {extract.num}
+{ duplicate$ 't :=
+  "" 's :=
+  { t empty$ not }
+  { t #1 #1 substring$
+    t #2 global.max$ substring$ 't :=
+    duplicate$ is.num
+      { s swap$ * 's := }
+      { pop$ "" 't := }
+    if$
+  }
+  while$
+  s empty$
+    'skip$
+    { pop$ s }
+  if$
+}
+
+FUNCTION {convert.edition}
+{ extract.num "l" change.case$ 's :=
+  s "first" = s "1" = or
+    { bbl.first 't := }
+    { s "second" = s "2" = or
+        { bbl.second 't := }
+        { s "third" = s "3" = or
+            { bbl.third 't := }
+            { s "fourth" = s "4" = or
+                { bbl.fourth 't := }
+                { s "fifth" = s "5" = or
+                    { bbl.fifth 't := }
+                    { s #1 #1 substring$ is.num
+                        { s eng.ord 't := }
+                        { edition 't := }
+                      if$
+                    }
+                  if$
+                }
+              if$
+            }
+          if$
+        }
+      if$
+    }
+  if$
+  t
+}
+
+FUNCTION {format.edition}
+{ edition duplicate$ empty$ 'skip$
+    {
+      convert.edition
+      output.state mid.sentence =
+        { "l" }
+        { "t" }
+      if$ change.case$
+      "edition" bibinfo.check
+      " " * bbl.edition *
+    }
+  if$
+}
+INTEGERS { multiresult }
+FUNCTION {multi.page.check}
+{ 't :=
+  #0 'multiresult :=
+    { multiresult not
+      t empty$ not
+      and
+    }
+    { t #1 #1 substring$
+      duplicate$ "-" =
+      swap$ duplicate$ "," =
+      swap$ "+" =
+      or or
+        { #1 'multiresult := }
+        { t #2 global.max$ substring$ 't := }
+      if$
+    }
+  while$
+  multiresult
+}
+FUNCTION {format.pages}
+{ pages duplicate$ empty$ 'skip$
+    { duplicate$ multi.page.check
+        {
+          bbl.pages swap$
+          n.dashify
+        }
+        {
+          bbl.page swap$
+        }
+      if$
+      tie.or.space.prefix
+      "pages" bibinfo.check
+      * *
+    }
+  if$
+}
+FUNCTION {format.journal.pages}
+{ pages duplicate$ empty$ 'pop$
+    { swap$ duplicate$ empty$
+        { pop$ pop$ format.pages }
+        {
+          ": " *
+          swap$
+          n.dashify
+          "pages" bibinfo.check
+          *
+        }
+      if$
+    }
+  if$
+}
+FUNCTION {format.journal.eid}
+{ eid "eid" bibinfo.check
+  duplicate$ empty$ 'pop$
+    { swap$ duplicate$ empty$ 'skip$
+      {
+          ": " *
+      }
+      if$
+      swap$ *
+    }
+  if$
+}
+FUNCTION {format.vol.num.pages}
+{ volume field.or.null
+  duplicate$ empty$ 'skip$
+    {
+      "volume" bibinfo.check
+    }
+  if$
+  bolden
+  eid empty$
+    { format.journal.pages }
+    { format.journal.eid }
+  if$
+}
+
+FUNCTION {format.chapter.pages}
+{ chapter empty$
+    'format.pages
+    { type empty$
+        { bbl.chapter }
+        { type "l" change.case$
+          "type" bibinfo.check
+        }
+      if$
+      chapter tie.or.space.prefix
+      "chapter" bibinfo.check
+      * *
+      pages empty$
+        'skip$
+        { ", " * format.pages * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.booktitle}
+{
+  booktitle "booktitle" bibinfo.check
+  emphasize
+}
+FUNCTION {format.in.ed.booktitle}
+{ format.booktitle duplicate$ empty$ 'skip$
+    {
+      editor "editor" format.names.ed duplicate$ empty$ 'pop$
+        {
+          get.bbl.editor
+          " " * swap$ *
+          "(" swap$ * ")" *
+          swap$
+          " " * swap$
+          * }
+      if$
+      word.in swap$ *
+    }
+  if$
+}
+FUNCTION {format.thesis.type}
+{ type duplicate$ empty$
+    'pop$
+    { swap$ pop$
+      "t" change.case$ "type" bibinfo.check
+    }
+  if$
+}
+FUNCTION {format.tr.number}
+{ number "number" bibinfo.check
+  type duplicate$ empty$
+    { pop$ bbl.techrep }
+    'skip$
+  if$
+  "type" bibinfo.check
+  swap$ duplicate$ empty$
+    { pop$ "t" change.case$ }
+    { tie.or.space.prefix * * }
+  if$
+}
+FUNCTION {format.article.crossref}
+{
+  word.in
+  " \cite{" * crossref * "}" *
+}
+FUNCTION {format.book.crossref}
+{ volume duplicate$ empty$
+    { "empty volume in " cite$ * "'s crossref of " * crossref * warning$
+      pop$ word.in
+    }
+    { bbl.volume
+      capitalize
+      swap$ tie.or.space.prefix "volume" bibinfo.check * * bbl.of space.word *
+    }
+  if$
+  " \cite{" * crossref * "}" *
+}
+FUNCTION {format.incoll.inproc.crossref}
+{
+  word.in
+  " \cite{" * crossref * "}" *
+}
+FUNCTION {format.org.or.pub}
+{ 't :=
+  ""
+  address empty$ t empty$ and
+    'skip$
+    {
+      t empty$
+        { address "address" bibinfo.check *
+        }
+        { t *
+          address empty$
+            'skip$
+            { ", " * address "address" bibinfo.check * }
+          if$
+        }
+      if$
+    }
+  if$
+}
+FUNCTION {format.publisher.address}
+{ publisher "publisher" bibinfo.warn format.org.or.pub
+}
+
+FUNCTION {format.organization.address}
+{ organization "organization" bibinfo.check format.org.or.pub
+}
+
+FUNCTION {article}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  format.date "year" output.check
+  date.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    {
+      journal
+      "journal" bibinfo.check
+      emphasize
+      "journal" output.check
+      add.blank
+      format.vol.num.pages output
+    }
+    { format.article.crossref output.nonnull
+      format.pages output
+    }
+  if$
+  new.block
+  format.note output
+  fin.entry
+}
+FUNCTION {book}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check
+      editor format.key output
+    }
+    { format.authors output.nonnull
+      crossref missing$
+        { "author and editor" editor either.or.check }
+        'skip$
+      if$
+    }
+  if$
+  format.date "year" output.check
+  date.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      new.block
+      format.number.series output
+      new.sentence
+      format.publisher.address output
+    }
+    {
+      new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  new.block
+  format.note output
+  fin.entry
+}
+FUNCTION {booklet}
+{ output.bibitem
+  format.authors output
+  author format.key output
+  format.date "year" output.check
+  date.block
+  format.title "title" output.check
+  new.block
+  howpublished "howpublished" bibinfo.check output
+  address "address" bibinfo.check output
+  new.block
+  format.note output
+  fin.entry
+}
+
+FUNCTION {inbook}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check
+      editor format.key output
+    }
+    { format.authors output.nonnull
+      crossref missing$
+        { "author and editor" editor either.or.check }
+        'skip$
+      if$
+    }
+  if$
+  format.date "year" output.check
+  date.block
+  format.btitle "title" output.check
+  crossref missing$
+    {
+      format.bvolume output
+      format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.number.series output
+      new.sentence
+      format.publisher.address output
+    }
+    {
+      format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  new.block
+  format.note output
+  fin.entry
+}
+
+FUNCTION {incollection}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  format.date "year" output.check
+  date.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.chapter.pages output
+      new.sentence
+      format.publisher.address output
+      format.edition output
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.chapter.pages output
+    }
+  if$
+  new.block
+  format.note output
+  fin.entry
+}
+FUNCTION {inproceedings}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  format.date "year" output.check
+  date.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.pages output
+      new.sentence
+      publisher empty$
+        { format.organization.address output }
+        { organization "organization" bibinfo.check output
+          format.publisher.address output
+        }
+      if$
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.pages output
+    }
+  if$
+  new.block
+  format.note output
+  fin.entry
+}
+FUNCTION {conference} { inproceedings }
+FUNCTION {manual}
+{ output.bibitem
+  format.authors output
+  author format.key output
+  format.date "year" output.check
+  date.block
+  format.btitle "title" output.check
+  organization address new.block.checkb
+  organization "organization" bibinfo.check output
+  address "address" bibinfo.check output
+  format.edition output
+  new.block
+  format.note output
+  fin.entry
+}
+
+FUNCTION {mastersthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  format.date "year" output.check
+  date.block
+  format.btitle
+  "title" output.check
+  new.block
+  bbl.mthesis format.thesis.type output.nonnull
+  school "school" bibinfo.warn output
+  address "address" bibinfo.check output
+  new.block
+  format.note output
+  fin.entry
+}
+
+FUNCTION {misc}
+{ output.bibitem
+  format.authors output
+  author format.key output
+  format.date "year" output.check
+  date.block
+  format.title output
+  new.block
+  howpublished "howpublished" bibinfo.check output
+  new.block
+  format.note output
+  fin.entry
+}
+FUNCTION {phdthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  format.date "year" output.check
+  date.block
+  format.btitle
+  "title" output.check
+  new.block
+  bbl.phdthesis format.thesis.type output.nonnull
+  school "school" bibinfo.warn output
+  address "address" bibinfo.check output
+  new.block
+  format.note output
+  fin.entry
+}
+
+FUNCTION {proceedings}
+{ output.bibitem
+  format.editors output
+  editor format.key output
+  format.date "year" output.check
+  date.block
+  format.btitle "title" output.check
+  format.bvolume output
+  format.number.series output
+  new.sentence
+  publisher empty$
+    { format.organization.address output }
+    { organization "organization" bibinfo.check output
+      format.publisher.address output
+    }
+  if$
+  new.block
+  format.note output
+  fin.entry
+}
+
+FUNCTION {techreport}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  format.date "year" output.check
+  date.block
+  format.title
+  "title" output.check
+  new.block
+  format.tr.number output.nonnull
+  institution "institution" bibinfo.warn output
+  address "address" bibinfo.check output
+  new.block
+  format.note output
+  fin.entry
+}
+
+FUNCTION {unpublished}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  format.date "year" output.check
+  date.block
+  format.title "title" output.check
+  new.block
+  format.note "note" output.check
+  fin.entry
+}
+
+FUNCTION {default.type} { misc }
+READ
+FUNCTION {sortify}
+{ purify$
+  "l" change.case$
+}
+INTEGERS { len }
+FUNCTION {chop.word}
+{ 's :=
+  'len :=
+  s #1 len substring$ =
+    { s len #1 + global.max$ substring$ }
+    's
+  if$
+}
+FUNCTION {format.lab.names}
+{ 's :=
+  "" 't :=
+  s #1 "{vv~}{ll}" format.name$
+  s num.names$ duplicate$
+  #2 >
+    { pop$
+      " " * bbl.etal *
+    }
+    { #2 <
+        'skip$
+        { s #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
+            {
+              " " * bbl.etal *
+            }
+            { bbl.and space.word * s #2 "{vv~}{ll}" format.name$
+              * }
+          if$
+        }
+      if$
+    }
+  if$
+}
+
+FUNCTION {author.key.label}
+{ author empty$
+    { key empty$
+        { cite$ #1 #3 substring$ }
+        'key
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION {author.editor.key.label}
+{ author empty$
+    { editor empty$
+        { key empty$
+            { cite$ #1 #3 substring$ }
+            'key
+          if$
+        }
+        { editor format.lab.names }
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION {editor.key.label}
+{ editor empty$
+    { key empty$
+        { cite$ #1 #3 substring$ }
+        'key
+      if$
+    }
+    { editor format.lab.names }
+  if$
+}
+
+FUNCTION {calc.short.authors}
+{ type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.key.label
+    { type$ "proceedings" =
+        'editor.key.label
+        'author.key.label
+      if$
+    }
+  if$
+  'short.list :=
+}
+
+FUNCTION {calc.label}
+{ calc.short.authors
+  short.list
+  "("
+  *
+  year duplicate$ empty$
+  short.list key field.or.null = or
+     { pop$ "" }
+     'skip$
+  if$
+  *
+  'label :=
+}
+
+FUNCTION {sort.format.names}
+{ 's :=
+  #1 'nameptr :=
+  ""
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr
+      "{ll{ }}{  f{ }}{  jj{ }}"
+      format.name$ 't :=
+      nameptr #1 >
+        {
+          nameptr #0
+          #10 +
+          #1 + =
+          numnames #0
+          #10 +
+          > and
+            { "others" 't :=
+              #1 'namesleft := }
+            'skip$
+          if$
+          "   "  *
+          namesleft #1 = t "others" = and
+            { "zzzzz" * }
+            { t sortify * }
+          if$
+        }
+        { t sortify * }
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {sort.format.title}
+{ 't :=
+  "A " #2
+    "An " #3
+      "The " #4 t chop.word
+    chop.word
+  chop.word
+  sortify
+  #1 global.max$ substring$
+}
+FUNCTION {author.sort}
+{ author empty$
+    { key empty$
+        { "to sort, need author or key in " cite$ * warning$
+          ""
+        }
+        { key sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+FUNCTION {author.editor.sort}
+{ author empty$
+    { editor empty$
+        { key empty$
+            { "to sort, need author, editor, or key in " cite$ * warning$
+              ""
+            }
+            { key sortify }
+          if$
+        }
+        { editor sort.format.names }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+FUNCTION {editor.sort}
+{ editor empty$
+    { key empty$
+        { "to sort, need editor or key in " cite$ * warning$
+          ""
+        }
+        { key sortify }
+      if$
+    }
+    { editor sort.format.names }
+  if$
+}
+FUNCTION {presort}
+{ calc.label
+  label sortify
+  "    "
+  *
+  type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.sort
+    { type$ "proceedings" =
+        'editor.sort
+        'author.sort
+      if$
+    }
+  if$
+  #1 entry.max$ substring$
+  'sort.label :=
+  sort.label
+  *
+  "    "
+  *
+  title field.or.null
+  sort.format.title
+  *
+  #1 entry.max$ substring$
+  'sort.key$ :=
+}
+
+ITERATE {presort}
+SORT
+STRINGS { last.label next.extra }
+INTEGERS { last.extra.num number.label }
+FUNCTION {initialize.extra.label.stuff}
+{ #0 int.to.chr$ 'last.label :=
+  "" 'next.extra :=
+  #0 'last.extra.num :=
+  #0 'number.label :=
+}
+FUNCTION {forward.pass}
+{ last.label label =
+    { last.extra.num #1 + 'last.extra.num :=
+      last.extra.num int.to.chr$ 'extra.label :=
+    }
+    { "a" chr.to.int$ 'last.extra.num :=
+      "" 'extra.label :=
+      label 'last.label :=
+    }
+  if$
+  number.label #1 + 'number.label :=
+}
+FUNCTION {reverse.pass}
+{ next.extra "b" =
+    { "a" 'extra.label := }
+    'skip$
+  if$
+  extra.label 'next.extra :=
+  extra.label
+  duplicate$ empty$
+    'skip$
+    { "{\natexlab{" swap$ * "}}" * }
+  if$
+  'extra.label :=
+  label extra.label * 'label :=
+}
+EXECUTE {initialize.extra.label.stuff}
+ITERATE {forward.pass}
+REVERSE {reverse.pass}
+FUNCTION {bib.sort.order}
+{ sort.label
+  "    "
+  *
+  year field.or.null sortify
+  *
+  "    "
+  *
+  title field.or.null
+  sort.format.title
+  *
+  #1 entry.max$ substring$
+  'sort.key$ :=
+}
+ITERATE {bib.sort.order}
+SORT
+FUNCTION {begin.bib}
+{ preamble$ empty$
+    'skip$
+    { preamble$ write$ newline$ }
+  if$
+  "\begin{thebibliography}{" number.label int.to.str$ * "}" *
+  write$ newline$
+  "\providecommand{\natexlab}[1]{#1}"
+  write$ newline$
+}
+EXECUTE {begin.bib}
+EXECUTE {init.state.consts}
+ITERATE {call.type$}
+FUNCTION {end.bib}
+{ newline$
+  "\end{thebibliography}" write$ newline$
+}
+EXECUTE {end.bib}
+%% End of customized bst file
+%%
+%% End of file `genome_research.bst'.
\ No newline at end of file
diff --git a/paper/haplotype_calling.png b/paper/haplotype_calling.png
new file mode 100644
index 0000000..bf0bd3c
Binary files /dev/null and b/paper/haplotype_calling.png differ
diff --git a/paper/indel_error.png b/paper/indel_error.png
new file mode 100644
index 0000000..9d110dc
Binary files /dev/null and b/paper/indel_error.png differ
diff --git a/paper/low_frequency_sensitivity.png b/paper/low_frequency_sensitivity.png
new file mode 100644
index 0000000..485396d
Binary files /dev/null and b/paper/low_frequency_sensitivity.png differ
diff --git a/paper/main.aux b/paper/main.aux
new file mode 100644
index 0000000..c89a8ea
--- /dev/null
+++ b/paper/main.aux
@@ -0,0 +1,121 @@
+\relax 
+\citation{browning2007,mach2010,delaneau2012,howie2011}
+\citation{branton2008,clarke2009}
+\citation{blmash}
+\citation{recurrentTERT2013}
+\@writefile{toc}{\contentsline {section}{\numberline {1}Motivation}{1}}
+\citation{li2011stats,marth99}
+\citation{gatk2011}
+\citation{marth99}
+\citation{freebayesgit}
+\citation{mutatrixgit}
+\citation{holtgrewe2010}
+\citation{mosaik}
+\citation{gatk2011}
+\citation{samtools}
+\citation{vcflibgit}
+\@writefile{toc}{\contentsline {section}{\numberline {2}Results}{2}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Small variant detection in simulated data}{2}}
+\newlabel{sec:simulation}{{2.1}{2}}
+\citation{samtools}
+\citation{mosaik}
+\citation{vcflibgit}
+\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Receiver-operator characteristics (ROCs) for FreeBayes, GATK HaplotypeCaller and UnifiedGenotyper, and samtools on 100 samples at 10x simulated sequencing depth. FreeBayes achieves the highest area under the curve (AUC) 1\hbox {}, with the HaplotypeCaller and samtools each performing next-best for indels and SNPs, respectively.}}{3}}
+\newlabel{fig:10xROC}{{1}{3}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Using simulation to assess the direct detection of haplotypes}{3}}
+\newlabel{sec:complexsimulation}{{2.2}{3}}
+\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Performance of FreeBayes, GATK HaplotypeCaller and UnifiedGenotyper, and samtools against simulated data. }}{4}}
+\newlabel{tab:simROCs}{{1}{4}}
+\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces A known error mode of Illumina sequencing methods generates a 1bp insertion artifact that is detected by standard mapping-based variant calling methods. The artifact results in a relative over-abundance of 1bp insertions. Here, we characterize the ability of our method to remove this artifact by detecting variants in a larger detection window. As the calling window size is increased beyond 10bp, the artifact is effecti [...]
+\newlabel{fig:indelerror}{{2}{4}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Using haplotype-based variant detection to improve the signal to noise ratio of candidate variants}{4}}
+\newlabel{sec:indelerror}{{2.3}{4}}
+\citation{1000Gphase1}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.4}Using haplotype-based variant detection to understand genotyping array design failure}{5}}
+\newlabel{sec:arrayfailure}{{2.4}{5}}
+\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces The Omni 2.5 genotyping array includes a number of alleles which consistently report as non-polymorphic (monomorphic) in the 1000 Genomes cohort in which they were originally detected. By detecting variants using our method at a 10bp variant calling window, we demonstrate that more than 90\% of the apparently monomorphic loci are not biallelic SNPs, and thus the array design does not match the local variant structure i [...]
+\newlabel{fig:omnierrors}{{3}{6}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.5}The importance of accurately modeling copy number variations on sex chromosomes}{6}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.6}Comparing to other methods in low-coverage sequencing data}{6}}
+\newlabel{sec:1000Gcomparisons}{{2.6}{6}}
+\citation{cortex}
+\citation{dindel}
+\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces Performance of various variant detection pipelines tested as part of the 1000 Genomes Project. Sets are Boston College; non-haplotype-based method (BC), haplotype-based method described in this paper (BC2), Baylor College of Medicine (BCM), Broad Institute GATK UnifiedGenotyper (BI1), Sanger Institute Samtools (SI1), University of Michigan GlfMultiples (UM), Broad Institute GATK HaplotypeCaller (BI2), Oxford Platypus (O [...]
+\newlabel{table:1000Gcomparisons}{{2}{7}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.7}Indel detection performance}{7}}
+\newlabel{sec:1000Gindels}{{2.7}{7}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.8}Sensitivity to low-frequency variation}{7}}
+\newlabel{sec:lowfreq}{{2.8}{7}}
+\citation{opmac99}
+\citation{1000GPhaseI}
+\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Performance of indel detection methods in 1000 Genomes project on the AFR191 sample set as assesed via high-depth resequencing validation. Sets are Boston College FreeBayes (BC), Broad Institute GATK UnifiedGenotyper (BI1), Sanger Institute Samtools (SI1), Broad Institute GATK HaplotypeCaller (BI2), Oxford Platypus (OX1), Oxford Cortex (OX2).}}{8}}
+\newlabel{fig:1000Gindels}{{4}{8}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.9}Haplotype-based consolidation of small variant calls}{8}}
+\newlabel{sec:ensemble}{{2.9}{8}}
+\@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces Sensitivity to low-frequency variants of various detection methods, as assessed in 191 samples of African ancestry in the 1000 Genomes low-coverage cohort. BC2 is FreeBayes, BI1 is the GATK UnifiedGenotyper, BI2 is the GATK HaplotypeCaller, and SI2 is the global assembler SGA.}}{9}}
+\newlabel{fig:lowfreqsens}{{5}{9}}
+\@writefile{toc}{\contentsline {section}{\numberline {3}Methods}{10}}
+\newlabel{sec:model}{{3}{10}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Definitions}{10}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}A Bayesian approach}{10}}
+\newlabel{sec:modeloverview}{{3.2}{10}}
+\newlabel{eq:bayesian}{{2}{10}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Estimating the probability of sequencing observations given an underlying genotype, $P(R_i|G)$}{11}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.4}Genotype combination priors, $P(G_1,\ldots  ,G_n)$}{11}}
+\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.4.1}Decomposition of prior probability of genotype combination}{11}}
+\citation{ewens72}
+\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.4.2}Genotype combination sampling probability $P(G_1,\ldots  ,G_n | f_1,\ldots  ,f_k)$}{12}}
+\newlabel{eq:phasedsampling}{{8}{12}}
+\newlabel{eq:unphasedsampling}{{9}{12}}
+\citation{watterson1975,tajima1983}
+\citation{gatk2011}
+\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.4.3}Derivation of $P(f_1,\ldots  ,f_k)$ by Ewens' sampling formula}{13}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.5}Expanding the model to incorporate the observability of the locus and alleles}{13}}
+\newlabel{sec:modelsequencability}{{3.5}{13}}
+\citation{marth99,samtools,li2011stats}
+\citation{snptools,maq}
+\citation{snpsvm}
+\newlabel{eq:bayesianandmodel}{{14}{14}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {3.6}Estimation of the probability that the locus is sequencable $P(S)$}{14}}
+\newlabel{sec:sequencable}{{3.6}{14}}
+\@writefile{toc}{\contentsline {section}{\numberline {4}Direct detection of phase from short-read sequencing}{16}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Parsing haplotype observations from sequencing data}{16}}
+\newlabel{sec:parsing}{{4.1}{16}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {4.2}Determining a window over which to assemble haplotype observations}{16}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces The direct detection of phase from short-read sequencing traces and counting of haplotypes across dynamically-determined windows.}}{17}}
+\newlabel{fig:haplotypecalling}{{6}{17}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {4.3}Detection and genotyping of local haplotypes}{17}}
+\newlabel{sec:genotyping}{{4.3}{17}}
+\bibdata{references}
+\@writefile{toc}{\contentsline {subsection}{\numberline {4.4}Probability of polymorphism}{18}}
+\newlabel{eq:probpoly}{{22}{18}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {4.5}Marginal likelihoods of individual genotypes}{18}}
+\newlabel{eq:marginals}{{23}{18}}
+\bibcite{1000Gphase1}{{1}{2012}{{1000 Genomes Project~Participants}}{{}}}
+\bibcite{dindel}{{2}{2011}{{Albers et~al.}}{{Albers, Lunter, MacArthur, McVean, Ouwehand, and Durbin}}}
+\bibcite{branton2008}{{3}{2008}{{Branton et~al.}}{{Branton, Deamer, Marziali, Bayley, Benner, Butler, Di~Ventra, Garaj, Hibbs, Huang et~al.}}}
+\bibcite{browning2007}{{4}{2007}{{Browning and Browning}}{{}}}
+\bibcite{clarke2009}{{5}{2009}{{Clarke et~al.}}{{Clarke, Wu, Jayasinghe, Patel, Reid, and Bayley}}}
+\bibcite{blmash}{{6}{2003}{{Cleary et~al.}}{{Cleary, Zhang, Di~Nicola, Aronson, Aube, Steinman, Haddad, Redston, Gallinger, Narod et~al.}}}
+\bibcite{delaneau2012}{{7}{2012}{{Delaneau et~al.}}{{Delaneau, Marchini, and Zagury}}}
+\bibcite{gatk2011}{{8}{2011}{{DePristo et~al.}}{{DePristo, Banks, Poplin, Garimella, Maguire, Hartl, Philippakis, del Angel, Rivas, Hanna et~al.}}}
+\bibcite{ewens72}{{9}{1972}{{Ewens}}{{}}}
+\bibcite{freebayesgit}{{10}{2012{a}}{{Garrison}}{{}}}
+\bibcite{mutatrixgit}{{11}{2012{b}}{{Garrison}}{{}}}
+\bibcite{vcflibgit}{{12}{2012{c}}{{Garrison}}{{}}}
+\bibcite{holtgrewe2010}{{13}{2010}{{Holtgrewe}}{{}}}
+\bibcite{howie2011}{{14}{2011}{{Howie et~al.}}{{Howie, Marchini, and Stephens}}}
+\bibcite{recurrentTERT2013}{{15}{2013}{{Huang et~al.}}{{Huang, Hodis, Xu, Kryukov, Chin, and Garraway}}}
+\bibcite{cortex}{{16}{2012}{{Iqbal et~al.}}{{Iqbal, Caccamo, Turner, Flicek, and McVean}}}
+\bibcite{mosaik}{{17}{2012}{{Lee and Str{\"{o}}mberg}}{{}}}
+\bibcite{li2011stats}{{18}{2011}{{Li}}{{}}}
+\bibcite{samtools}{{19}{2009}{{Li et~al.}}{{Li, Handsaker, Wysoker, Fennell, Ruan, Homer, Marth, Abecasis, and Durbin}}}
+\bibcite{maq}{{20}{2008}{{Li et~al.}}{{Li, Ruan, and Durbin}}}
+\bibcite{mach2010}{{21}{2010}{{Li et~al.}}{{Li, Willer, Ding, Scheet, and Abecasis}}}
+\bibcite{marth99}{{22}{1999}{{Marth et~al.}}{{Marth, Korf, Yandell, Yeh, Gu, Zakeri, Stitziel, Hillier, Kwok, and Gish}}}
+\bibcite{snpsvm}{{23}{2013}{{O'Fallon et~al.}}{{O'Fallon, Wooderchak-Donahue, and Crockett}}}
+\bibcite{opmac99}{{24}{1999}{{Opitz and Maclin}}{{}}}
+\bibcite{tajima1983}{{25}{1983}{{Tajima}}{{}}}
+\bibcite{snptools}{{26}{2013}{{Wang et~al.}}{{Wang, Lu, Yu, Gibbs, and Yu}}}
+\bibcite{watterson1975}{{27}{1975}{{Watterson}}{{}}}
+\bibstyle{genome_research}
diff --git a/paper/main.bbl b/paper/main.bbl
new file mode 100644
index 0000000..0e685a3
--- /dev/null
+++ b/paper/main.bbl
@@ -0,0 +1,170 @@
+\begin{thebibliography}{27}
+\providecommand{\natexlab}[1]{#1}
+
+\bibitem[{1000 Genomes Project~Participants(2012)}]{1000Gphase1}
+1000 Genomes Project~Participants T. 2012.
+\newblock {{A}n integrated map of genetic variation from 1,092 human genomes}.
+\newblock \emph{Nature} \textbf{491}: 56--65.
+
+\bibitem[{Albers et~al.(2011)Albers, Lunter, MacArthur, McVean, Ouwehand, and
+  Durbin}]{dindel}
+Albers CA, Lunter G, MacArthur DG, McVean G, Ouwehand WH, and Durbin R. 2011.
+\newblock {{D}indel: accurate indel calls from short-read data}.
+\newblock \emph{Genome Res.} \textbf{21}: 961--973.
+
+\bibitem[{Branton et~al.(2008)Branton, Deamer, Marziali, Bayley, Benner,
+  Butler, Di~Ventra, Garaj, Hibbs, Huang et~al.}]{branton2008}
+Branton D, Deamer DW, Marziali A, Bayley H, Benner SA, Butler T, Di~Ventra M,
+  Garaj S, Hibbs A, Huang X, et~al.. 2008.
+\newblock {{T}he potential and challenges of nanopore sequencing}.
+\newblock \emph{Nat. Biotechnol.} \textbf{26}: 1146--1153.
+
+\bibitem[{Browning and Browning(2007)}]{browning2007}
+Browning SR and Browning BL. 2007.
+\newblock {{R}apid and accurate haplotype phasing and missing-data inference
+  for whole-genome association studies by use of localized haplotype
+  clustering}.
+\newblock \emph{Am. J. Hum. Genet.} \textbf{81}: 1084--1097.
+
+\bibitem[{Clarke et~al.(2009)Clarke, Wu, Jayasinghe, Patel, Reid, and
+  Bayley}]{clarke2009}
+Clarke J, Wu HC, Jayasinghe L, Patel A, Reid S, and Bayley H. 2009.
+\newblock {{C}ontinuous base identification for single-molecule nanopore
+  {D}{N}{A} sequencing}.
+\newblock \emph{Nat Nanotechnol} \textbf{4}: 265--270.
+
+\bibitem[{Cleary et~al.(2003)Cleary, Zhang, Di~Nicola, Aronson, Aube, Steinman,
+  Haddad, Redston, Gallinger, Narod et~al.}]{blmash}
+Cleary SP, Zhang W, Di~Nicola N, Aronson M, Aube J, Steinman A, Haddad R,
+  Redston M, Gallinger S, Narod SA, et~al.. 2003.
+\newblock {{H}eterozygosity for the {B}{L}{M}({A}sh) mutation and cancer risk}.
+\newblock \emph{Cancer Res.} \textbf{63}: 1769--1771.
+
+\bibitem[{Delaneau et~al.(2012)Delaneau, Marchini, and Zagury}]{delaneau2012}
+Delaneau O, Marchini J, and Zagury JF. 2012.
+\newblock {{A} linear complexity phasing method for thousands of genomes}.
+\newblock \emph{Nat. Methods} \textbf{9}: 179--181.
+
+\bibitem[{DePristo et~al.(2011)DePristo, Banks, Poplin, Garimella, Maguire,
+  Hartl, Philippakis, del Angel, Rivas, Hanna et~al.}]{gatk2011}
+DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis
+  AA, del Angel G, Rivas MA, Hanna M, et~al.. 2011.
+\newblock {{A} framework for variation discovery and genotyping using
+  next-generation {D}{N}{A} sequencing data}.
+\newblock \emph{Nat. Genet.} \textbf{43}: 491--498.
+
+\bibitem[{Ewens(1972)}]{ewens72}
+Ewens WJ. 1972.
+\newblock {{T}he sampling theory of selectively neutral alleles}.
+\newblock \emph{Theor Popul Biol} \textbf{3}: 87--112.
+
+\bibitem[{Garrison(2012{\natexlab{a}})}]{freebayesgit}
+Garrison E. 2012{\natexlab{a}}.
+\newblock {FreeBayes source repository}.
+\newblock \url{https://github.com/ekg/freebayes}.
+
+\bibitem[{Garrison(2012{\natexlab{b}})}]{mutatrixgit}
+Garrison E. 2012{\natexlab{b}}.
+\newblock {mutatrix population genome simulator}.
+\newblock \url{https://github.com/ekg/mutatrix}.
+
+\bibitem[{Garrison(2012{\natexlab{c}})}]{vcflibgit}
+Garrison E. 2012{\natexlab{c}}.
+\newblock {vcflib: variant call file processing and manipulation utilities}.
+\newblock \url{https://github.com/ekg/vcflib}.
+
+\bibitem[{Holtgrewe(2010)}]{holtgrewe2010}
+Holtgrewe M. 2010.
+\newblock Mason – a read simulator for second generation sequencing data.
+\newblock Technical Report TR-B-10-06, Institut für Mathematik und Informatik,
+  Freie Universität Berlin.
+
+\bibitem[{Howie et~al.(2011)Howie, Marchini, and Stephens}]{howie2011}
+Howie B, Marchini J, and Stephens M. 2011.
+\newblock {{G}enotype imputation with thousands of genomes}.
+\newblock \emph{G3 (Bethesda)} \textbf{1}: 457--470.
+
+\bibitem[{Huang et~al.(2013)Huang, Hodis, Xu, Kryukov, Chin, and
+  Garraway}]{recurrentTERT2013}
+Huang FW, Hodis E, Xu MJ, Kryukov GV, Chin L, and Garraway LA. 2013.
+\newblock {{H}ighly recurrent {T}{E}{R}{T} promoter mutations in human
+  melanoma}.
+\newblock \emph{Science} \textbf{339}: 957--959.
+
+\bibitem[{Iqbal et~al.(2012)Iqbal, Caccamo, Turner, Flicek, and
+  McVean}]{cortex}
+Iqbal Z, Caccamo M, Turner I, Flicek P, and McVean G. 2012.
+\newblock {{D}e novo assembly and genotyping of variants using colored de
+  {B}ruijn graphs}.
+\newblock \emph{Nat. Genet.} \textbf{44}: 226--232.
+
+\bibitem[{Lee and Str{\"{o}}mberg(2012)}]{mosaik}
+Lee WP and Str{\"{o}}mberg M. 2012.
+\newblock {MOSAIK reference-guided aligner for next-generation sequencing
+  technologies}.
+\newblock \url{https://github.com/wanpinglee/MOSAIK}.
+
+\bibitem[{Li(2011)}]{li2011stats}
+Li H. 2011.
+\newblock {{A} statistical framework for {S}{N}{P} calling, mutation discovery,
+  association mapping and population genetical parameter estimation from
+  sequencing data}.
+\newblock \emph{Bioinformatics} \textbf{27}: 2987--2993.
+
+\bibitem[{Li et~al.(2009)Li, Handsaker, Wysoker, Fennell, Ruan, Homer, Marth,
+  Abecasis, and Durbin}]{samtools}
+Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G,
+  and Durbin R. 2009.
+\newblock {{T}he {S}equence {A}lignment/{M}ap format and {S}{A}{M}tools}.
+\newblock \emph{Bioinformatics} \textbf{25}: 2078--2079.
+
+\bibitem[{Li et~al.(2008)Li, Ruan, and Durbin}]{maq}
+Li H, Ruan J, and Durbin R. 2008.
+\newblock {{M}apping short {D}{N}{A} sequencing reads and calling variants
+  using mapping quality scores}.
+\newblock \emph{Genome Res.} \textbf{18}: 1851--1858.
+
+\bibitem[{Li et~al.(2010)Li, Willer, Ding, Scheet, and Abecasis}]{mach2010}
+Li Y, Willer CJ, Ding J, Scheet P, and Abecasis GR. 2010.
+\newblock {{M}a{C}{H}: using sequence and genotype data to estimate haplotypes
+  and unobserved genotypes}.
+\newblock \emph{Genet. Epidemiol.} \textbf{34}: 816--834.
+
+\bibitem[{Marth et~al.(1999)Marth, Korf, Yandell, Yeh, Gu, Zakeri, Stitziel,
+  Hillier, Kwok, and Gish}]{marth99}
+Marth GT, Korf I, Yandell MD, Yeh RT, Gu Z, Zakeri H, Stitziel NO, Hillier L,
+  Kwok PY, and Gish WR. 1999.
+\newblock {{A} general approach to single-nucleotide polymorphism discovery}.
+\newblock \emph{Nat. Genet.} \textbf{23}: 452--456.
+
+\bibitem[{O'Fallon et~al.(2013)O'Fallon, Wooderchak-Donahue, and
+  Crockett}]{snpsvm}
+O'Fallon BD, Wooderchak-Donahue W, and Crockett DK. 2013.
+\newblock {{A} support vector machine for identification of single-nucleotide
+  polymorphisms from next-generation sequencing data}.
+\newblock \emph{Bioinformatics} \textbf{29}: 1361--1366.
+
+\bibitem[{Opitz and Maclin(1999)}]{opmac99}
+Opitz DW and Maclin R. 1999.
+\newblock Popular ensemble methods: An empirical study.
+\newblock \emph{J. Artif. Intell. Res. (JAIR)} \textbf{11}: 169--198.
+
+\bibitem[{Tajima(1983)}]{tajima1983}
+Tajima F. 1983.
+\newblock {{E}volutionary relationship of {D}{N}{A} sequences in finite
+  populations}.
+\newblock \emph{Genetics} \textbf{105}: 437--460.
+
+\bibitem[{Wang et~al.(2013)Wang, Lu, Yu, Gibbs, and Yu}]{snptools}
+Wang Y, Lu J, Yu J, Gibbs RA, and Yu F. 2013.
+\newblock {{A}n integrative variant analysis pipeline for accurate
+  genotype/haplotype inference in population {N}{G}{S} data}.
+\newblock \emph{Genome Res.} \textbf{23}: 833--842.
+
+\bibitem[{Watterson(1975)}]{watterson1975}
+Watterson GA. 1975.
+\newblock {{O}n the number of segregating sites in genetical models without
+  recombination}.
+\newblock \emph{Theor Popul Biol} \textbf{7}: 256--276.
+
+\end{thebibliography}
diff --git a/paper/main.blg b/paper/main.blg
new file mode 100644
index 0000000..8f3c05a
--- /dev/null
+++ b/paper/main.blg
@@ -0,0 +1,49 @@
+This is BibTeX, Version 0.99d (TeX Live 2013/Debian)
+Capacity: max_strings=35307, hash_size=35307, hash_prime=30011
+The top-level auxiliary file: main.aux
+The style file: genome_research.bst
+Database file #1: references.bib
+Warning--I didn't find a database entry for "1000GPhaseI"
+Reallocated wiz_functions (elt_size=4) to 6000 items from 3000.
+You've used 27 entries,
+            3195 wiz_defined-function locations,
+            811 strings with 9619 characters,
+and the built_in function-call counts, 29266 in all, are:
+= -- 2663
+> -- 1238
+< -- 13
++ -- 1130
+- -- 336
+* -- 2419
+:= -- 4009
+add.period$ -- 81
+call.type$ -- 27
+change.case$ -- 244
+chr.to.int$ -- 25
+cite$ -- 27
+duplicate$ -- 2114
+empty$ -- 2240
+format.name$ -- 405
+if$ -- 5958
+int.to.chr$ -- 3
+int.to.str$ -- 1
+missing$ -- 262
+newline$ -- 139
+num.names$ -- 108
+pop$ -- 634
+preamble$ -- 1
+purify$ -- 217
+quote$ -- 0
+skip$ -- 670
+stack$ -- 0
+substring$ -- 2840
+swap$ -- 632
+text.length$ -- 1
+text.prefix$ -- 0
+top$ -- 0
+type$ -- 243
+warning$ -- 0
+while$ -- 236
+width$ -- 0
+write$ -- 350
+(There was 1 warning)
diff --git a/paper/main.pdf b/paper/main.pdf
new file mode 100644
index 0000000..26bfa1e
Binary files /dev/null and b/paper/main.pdf differ
diff --git a/paper/main.tex b/paper/main.tex
new file mode 100644
index 0000000..31682f3
--- /dev/null
+++ b/paper/main.tex
@@ -0,0 +1,664 @@
+\documentclass{article}
+\usepackage{fullpage}
+\usepackage{amsmath}
+\usepackage{url}
+\usepackage{verbatim}
+\usepackage{graphicx}
+\DeclareGraphicsExtensions{.pdf,.png,.jpg}
+\usepackage[square, comma, sort&compress]{natbib}
+\usepackage{wrapfig}
+\usepackage{float}
+\usepackage{color}
+\usepackage[section]{placeins}
+
+\DeclareMathOperator*{\E}{\mbox{\large E}}
+
+\newcommand{\approptoinn}[2]{\mathrel{\vcenter{
+  \offinterlineskip\halign{\hfil$##$\cr
+    #1\propto\cr\noalign{\kern2pt}#1\sim\cr\noalign{\kern-2pt}}}}}
+
+\newcommand{\appropto}{\mathpalette\approptoinn\relax}
+
+\def\app#1#2{%
+  \mathrel{%
+    \setbox0=\hbox{$#1\sim$}%
+    \setbox2=\hbox{%
+      \rlap{\hbox{$#1\propto$}}%
+      \lower1.1\ht0\box0%
+    }%
+    \raise0.25\ht2\box2%
+  }%
+}
+\def\approxprop{\mathpalette\app\relax}
+
+\begin{document}
+\large
+
+\title{Haplotype-based variant detection from short-read sequencing}
+%\title{Characterization of complex variants by haplotype-based variant detection}
+
+\author{Erik Garrison and Gabor Marth}
+
+\maketitle
+
+
+\begin{abstract}
+
+With genomic variant detection methods, we can determine point-wise differences against a reference genome.  Widely-used methods fail to reliably characterize the relative phase of proximal alleles.  This information is essential for the functional interpretation of genomic material, and can be determined directly from the primary sequence reads.  Here we propose such a method, and demonstrate that the use of haplotypes does not only improve our ability to interpret genomic information,  [...]
+
+
+\end{abstract}
+
+
+
+\section{Motivation}
+
+\begin{comment}
+write introduction / motivation statement to lead into results
+
+\emph{
+GTM: Biological motivations: 
+(1) Haplotype phasing: [Result: resolution of compound hets. e.g. BLM allele-like situations such as the frame-restoring indels, and the 6 SNPs in a row. This improves functional impact prediction] [Display items: example of phase-restoring INDELs]
+(2) Complex variant detection: [Result: Alleles are reported in a consistent fashion, allowing e.g. improved genotyping chip design] [Display items: OMNI results]
+(3) Accuracy of SNP and INDEL detection improves because haplotype-level detection improves signal to noise ratio [Result: comparison to other SNP calls and INDEL calls; 1bp INDEL detection accuracy as a function of clump size] [Display items: SNP AUC table; INDEL size as a function of clump size; INDEL caller ROC comparison]
+(4) Enables integration of multiple independent callsets. [Results: comparison to other integration strategies; Quoting number of inconsistent allele locations in Phase 1.] [Display items: cartoon of inconsistent calls; table comparing performance of variant consolidation methods]
+(5) Ploidy framework allows accurate genotype determination in haploid regions. [Result: Chromosome X calling]. [Display items: in-text chromosome X results]
+(6) Physical haplotype method gives better sensitivity than assembly based methods (local or global), but comparable to mapping methods, while achieving better specificity.  [Results: SNP AFS comparison across mapping, assembly methods] [Display items: AFS comparison; call comparison table]
+}
+
+\emph{
+Gabor's logical ordering: 1, 2, 4, 3, 6, 5
+}
+
+\emph{
+Literature/competing method review:
+How other methods deal with the same issues?
+Other approaches for variant calling (per-site mapping based, assembly, local assembly)
+How our approach promises to solve the problems.
+}
+
+\end{comment}
+
+
+While \emph{statistical phasing} approaches are necessary for the determination of large-scale haplotype structure \citep{browning2007, mach2010, delaneau2012, howie2011}, sequencing traces provide short-range phasing information that may be employed directly in primary variant detection to establish phase between proximal alleles.  Present read lengths and error rates limit this \emph{physical phasing} approach to variants clustered within tens to hundreds of bases, but as the cost of o [...]
+
+Haplotype-based variant detection methods, in which short haplotypes are read directly from sequencing traces, offer a number of benefits over methods which operate on a single position at a time. Haplotype-based methods ensure semantic consistency among described variants by simultaneously evaluating all classes of alleles in the same context. 
+Locally phased genotypes can be used to improve genotyping accuracy in the context of rare variations that can be difficult to impute due to sparse linkage information.
+
+Similarly, they can assist in the design of genotyping
+assays, which can fail in the context of undescribed variation at the assayed locus.  These methods can provide the direct detection of complex variants of clinical significance, such as the BLM\textsuperscript{Ash} allele, a complex block substitution in a helicase gene related to cancer risk \citep{blmash} or recurrent multi-nucleotide polymorphisms often found in particular cancer types \citep{recurrentTERT2013}.  Directly detecting such alleles from sequencing data decreases the cost [...]
+
+The use of longer haplotypes in variant detection can improve detection by increasing the signal to noise ratio of the genotype likelihood space that is used in analysis, provided some degree of independence between sequencing errors.  This follows from the fact that the space of possible erroneous haplotypes expands dramatically with haplotype length, while the space of true variation remains constant, with the number of true alleles less than or equal to the ploidy of the sample at a g [...]
+
+The direct detection of haplotypes from alignment data presents several challenges to existing variant detection methods.  As the length of a haplotype increases, so does the number of possible alleles within the haplotype, and thus methods designed to detect genetic variation over haplotypes in a unified context must be able to model multiallelism.  However, most variant detection methods establish estimates of the likelihood of polymorphism at a given loci using statistical models whic [...]
+
+To enable the application of population-level inference methods to the detection of haplotypes, we generalize the Bayesian statistical method described by \citet{marth99} to allow multiallelic loci and non-uniform copy number across the samples under consideration.  We have implemented this model in FreeBayes \citep{freebayesgit}.  In addition to extensions enabling haplotype-based detection, we have incorporated a model of the capacity for the alignments to characterize the locus and al [...]
+
+
+\section{Results}
+
+\subsection{Small variant detection in simulated data}
+\label{sec:simulation}
+
+%\emph{GTM: This section presents results for Biological problem 3. The table for SNPs, and the figure for INDELs.}
+
+To assess the performance of our method, we used the population genome simulator mutatrix \citep{mutatrixgit} to simulate variation in 100 samples over 100 kilobases of human chromosome 20, and the mason read simulator \citep{holtgrewe2010} to generate simulated Illumina-like 70bp reads at 10x depth per sample.
+%several sets of simulated data: 10 samples at 100x coverage, 1000 samples at 10x coverage, and a single sample at 20x, 50x, 100x, and 200x coverage, over 100 kilobases of human chromosome 20.
+%This simulator employs a $1/i$ allele frequency model to simulate SNPs and indels across a number of samples.
+The data were aligned with Mosaik \citep{mosaik}, and variants were called using several popular detection methods capable of simultaneously detecting SNPs and short indels: GATK HaplotypeCaller and UnifiedGenotyper (version 2.7.4) \citep{gatk2011}, samtools (version 0.1.19-44428cd) \citep{samtools}, and FreeBayes (version 0.9.9.2-21-g78714b8).
+%In order to improve statistical power, each simulation was run 100 times and the results were merged.
+To assess each caller's detection performance we generated receiver-operator characteristics (ROCs) using vcfroc \citep{vcflibgit}.  We provide results in terms of area under the curve (AUC) for all tested variant callers in table \ref{tab:simROCs}.%  (See supplement for ROC plots for each simulation.)
+
+These results indicate that FreeBayes provides superior performance to the GATK and samtools at all assayed depths and numbers of samples.
+%For both SNPs and indels, the relative performance difference between FreeBayes and these other methods is largest at low sequencing depth and numbers of samples, and diminishes as sequencing depth or number of samples increases.
+We observe that the difference in the AUC metric is dominated by both minimum distance from perfect discrimination (perfect sensitivity and perfect specificity), in which FreeBayes consistently outperforms the other methods, and by apparent hard limitation on sensitivity imposed by the other methods.  We hypothesize that the difference in performance for indels, which is larger than that for SNPs, reflects our method's detection of alleles on haplotypes, which improves the signal to nois [...]
+
+
+
+\begin{figure}[h!]
+\centering
+\includegraphics[width=0.7\textwidth]{100samples10x_0_25_both}
+\caption{Receiver-operator characteristics (ROCs) for FreeBayes, GATK HaplotypeCaller and UnifiedGenotyper, and samtools on 100 samples at 10x simulated sequencing depth.  FreeBayes achieves the highest area under the curve (AUC) \ref{tab:simROCs}, with the HaplotypeCaller and samtools each performing next-best for indels and SNPs, respectively.}
+\label{fig:10xROC}
+\end{figure}
+
+
+\begin{table}
+\centering
+    \begin{tabular}{|l||l|l|l|l|}
+        \hline
+        variant detector      & depth            & samples  & AUC SNPs                & AUC indels                \\ \hline
+        FreeBayes             & 10               & 100      & 0.9594                  & 0.9400                 \\ 
+        \hline
+        GATK HaplotypeCaller  & 10               & 100      & 0.8155                  & 0.7765                 \\ 
+        \hline
+        GATK UnifiedGenotyper & 10               & 100      & 0.8907                  & 0.7073                 \\ 
+        \hline
+        samtools              & 10               & 100      & 0.9056                  & 0.4698                 \\  \hline
+    \end{tabular}
+
+\caption{Performance of FreeBayes, GATK HaplotypeCaller and UnifiedGenotyper, and samtools against simulated data.
+%FreeBayes provides the best area under the curve (AUC) at all sequencing depths and numbers of samples for both SNPs and indels.}
+}
+\label{tab:simROCs}
+\end{table}
+
+
+\subsection{Using simulation to assess the direct detection of haplotypes}
+\label{sec:complexsimulation}
+
+In order to facilitate our assessment of the method at determining phase between clusters of alleles, we set a mutation rate sufficient to generate many clusters of variants in these simulated samples.  We then simulated reads at 20x coverage from the resulting simulated chromosomes using wgsim \citep{samtools}, aligned the results using Mosaik \citep{mosaik} and ran freebayes on the resulting alignments specifying a haplotype detection length of 10bp.  The results were compared to the t [...]
+%For the evaluation of complex alleles detection, we ignored haplotypes which could not be detected by the algorithm given the maximum haplotype length constraint of the detection method.
+
+Our results agree with those obtained for other classes of small variants in section \ref{sec:simulation}, showing high performance against SNPs (AUC of 0.979) and indels (AUC of 0.948).  For complex variants composed between multiple small variants, direct detection provides an AUC of 0.919.
+
+% TODO rerun using newest freebayes version
+
+% humu.bc.edu:haplotypecomparisons/1mb_10x_10samples
+%> abs(trapz(c(1, roc$snpsfpr), c(1, roc$snpstpr)))
+%[1] 0.9795195
+%> abs(trapz(c(1, roc$complexfpr), c(1, roc$complextpr)))
+%[1] 0.9033818
+%> abs(trapz(c(1, roc$indelsfpr), c(1, roc$indelstpr)))
+%[1] 0.7524788
+%> abs(trapz(c(1, roc$mnpsfpr), c(1, roc$mnpstpr)))
+%[1] 0.9480347
+
+
+%\section{Comparison of haplotype-based calls to statistically phased genotypes}
+
+
+\begin{figure}[h!]
+\centering
+\includegraphics[width=0.8\textwidth]{indel_error}
+\caption{A known error mode of Illumina sequencing methods generates a 1bp insertion artifact that is detected by standard mapping-based variant calling methods.  The artifact results in a relative over-abundance of 1bp insertions.  Here, we characterize the ability of our method to remove this artifact by detecting variants in a larger detection window.  As the calling window size is increased beyond 10bp, the artifact is effectively removed, and the balance between insertions and delet [...]
+\label{fig:indelerror}
+\end{figure}
+
+
+\subsection{Using haplotype-based variant detection to improve the signal to noise ratio of candidate variants}
+\label{sec:indelerror}
+
+The fluorescence-based imaging utilized by Illumina sequencing machines is susceptible to errors generated by air bubbles introduced into the flowcell in which the sequencing reaction takes place.  Bubble errors tend to manifest themselves as high-quality 1bp insertions in sequencing traces derived from spots in the affected regions of the sequencing flowcell.  These errors are randomly distributed with respect to reference position, but their high frequency in some sequencing runs means [...]
+
+To assess the ability of our haplotype-based method to overcome this characteristic error, we detected variants in the previously described AFR191 sample set using a number of different haplotype lengths.  The indel detection results (figure \ref{fig:indelerror}) indicate that this error mode can be effectively removed from small variant calls by increasing the detection window size to 10bp or greater.
+
+As we increase the length of detected haplotypes, we increase the number of possible erroneous haplotypes without increasing the number of true haplotypes.  This effect results in an improved signal to noise ratio for detected variants at larger haplotype sizes.  As such, increasing window size in our algorithm allows us to exclude likely insertion artifacts from consideration, as the recurrence of an erroneous haplotype diminishes rapidly with haplotype length.  We hypothesize that this [...]
+
+
+%\begin{wrapfigure}{r}{0.3\textwidth}
+%\includegraphics[width=0.3\textwidth]{omni_errors}
+%\caption{Omni errors.}
+%\label{fig:omnierrors}
+%\end{wrapfigure}
+
+\begin{figure}[h!]
+\centering
+\includegraphics[width=0.3\textwidth]{omni_errors}
+\caption{The Omni 2.5 genotyping array includes a number of alleles which consistently report as non-polymorphic (monomorphic) in the 1000 Genomes cohort in which they were originally detected.  By detecting variants using our method at a 10bp variant calling window, we demonstrate that more than 90\% of the apparently monomorphic loci are not biallelic SNPs, and thus the array design does not match the local variant structure in these samples.  By using a haplotype-based approach, group [...]
+\label{fig:omnierrors}
+\end{figure}
+
+
+\subsection{Using haplotype-based variant detection to understand genotyping array design failure}
+\label{sec:arrayfailure}
+
+Variant calls generated during the pilot phase of the 1000 Genomes Project \citep{1000Gphase1} were used to design a genotyping array (the Illumina OMNI2.5).  Subsequently, many of the alleles on this array (approximately 10\%) were found to be putatively monomorphic in the same set of samples, suggesting they resulted from variant detection error.
+
+We investigated these loci using whole-genome calls in the low-coverage cohort in Phase I of the 1000 Genomes Project.  We ran freebayes using a haplotype window of 10 base pairs.  On comparison with the monomorphic array loci, we found that approximately 90\% of the array-monomorphic loci overlap non-SNP or non-biallelic variation in these samples within 10bp of the target SNP, whereas the opposite is true of polymorphic loci--- greater than 90\% of loci assayed as polymorphic overlap b [...]
+
+We observe that many of the apparent failures in variant detection are actually caused by an inability of methods to assess local clusters of variation.  The accurate design of genotyping arrays and their use in cross-validation of sequencing-based genotyping performance thus requires information about local haplotype structure.
+
+\subsection{The importance of accurately modeling copy number variations on sex chromosomes}
+
+Our method is currently the only variant detector in common use which provides the ability to call males and females jointly on chromosome X with correct copy number.  To evaluate the benefits of this approach, we detected variants in chromosome X for 191 low-coverage 1000 Genomes samples of African ancestry using FreeBayes both with and without copy-number awareness.  Comparison of our results to the genotyping array calls (excluding cases of likely array failure due to non-SNP, non-bia [...]
+
+
+\subsection{Comparing to other methods in low-coverage sequencing data}
+\label{sec:1000Gcomparisons}
+
+In the testing phase of the 1000 Genomes Project, participating groups submitted callsets based on 191 samples of African ancestry (AFR191).  Results are characterized in figure \ref{table:1000Gcomparisons}.  Unlike other haplotype-based and assembly methods, the approach described in this paper (BC2) provides sensitivity to known variants equivalent to mapping-based methods (BCM, BC1, SI1, UM).  Furthermore, the method's ability to characterize haplotypes in loci which appeared to be mon [...]
+
+\begin{table}
+\resizebox{\textwidth}{!}{%
+\centering
+\begin{tabular}{|l|lllll|lll|ll|lllll|}
+\hline
+call set       & BC   & BCM  & BI1  & SI1  & UM   & BC2  & BI2  & OX1  & SI2  & OX2  & Union  & 2/9  & 3/9  & 4/9  & BC cons   \\
+\hline
+SNPs [K]       & 459  & 512  & 481  & 480  & 491  & 495  & 362  & 452  & 252  & 101  & 621  & 548  & 518  & 487  & 543       \\
+Omni poly [\%] & 91.6 & 98.9 & 96.5 & 95.2 & 97.6 & 97.4 & 88.4 & 87   & 83.1 & 44.6 & 99.3 & 98.9 & 98.6 & 97.6 & 98.7 \\
+Hapmap [\%]    & 94.5 & 99.4 & 98   & 95.6 & 98.9 & 98.3 & 93.6 & 90.3 & 91.1 & 53.7 & 99.4 & 99.4 & 99.3 & 99   & 98.6 \\
+Omni mono [\%] & 1.39 & 1.63 & 0.29 & 0.62 & 0.77 & 0.56 & 0.14 & 1.1  & 0.72 & 0.1  & 3.73 & 0.97 & 0.67 & 0.48 & 0.65 \\
+\hline
+
+\hline
+\end{tabular}}
+\caption{Performance of various variant detection pipelines tested as part of the 1000 Genomes Project.  Sets are Boston College; non-haplotype-based method (BC), haplotype-based method described in this paper (BC2), Baylor College of Medicine (BCM), Broad Institute GATK UnifiedGenotyper (BI1), Sanger Institute Samtools (SI1), University of Michigan GlfMultiples (UM), Broad Institute GATK HaplotypeCaller (BI2), Oxford Platypus (OX1), Sanger SGA (SI2), Oxford Cortex (OX2).  Union: combina [...]
+\label{table:1000Gcomparisons}
+\end{table}
+
+
+
+%\begin{figure}[h!]
+%\centering
+%\includegraphics[width=1.0\textwidth]{1000G_performance_comparison}
+%\end{figure}
+
+
+\subsection{Indel detection performance}
+\label{sec:1000Gindels}
+
+\begin{table}
+\centering
+\begin{tabular}{|l||l|l|l|l|l|}
+\hline
+center           & specificity & sensitivity & caller & optimality &  AUC   \\
+\hline
+Oxford Cortex    & 98          & 27          & OX2    & 73.02739   & 0.2646 \\
+Pindel           & 90          & 52          & Pindel & 49.03060   & 0.4680 \\
+BC               & 83          & 66          & BC     & 38.01316   & 0.5478 \\
+Broad assembly   & 80          & 67          & BI2    & 38.58756   & 0.5360 \\
+Sanger           & 76          & 69          & SI1    & 39.20459   & 0.5244 \\
+Broad mapping    & 65          & 74          & BI1    & 43.60046   & 0.4810 \\
+Oxford Platypus  & 60          & 55          & OX1    & 60.20797   & 0.3300 \\ \hline
+\end{tabular}
+\end{table}
+
+
+\begin{figure}[h!]
+\centering
+\includegraphics[width=0.6\textwidth]{miseq}
+\caption{Performance of indel detection methods in 1000 Genomes project on the AFR191 sample set as assessed via high-depth resequencing validation.  Sets are Boston College FreeBayes (BC), Broad Institute GATK UnifiedGenotyper (BI1), Sanger Institute Samtools (SI1), Broad Institute GATK HaplotypeCaller (BI2), Oxford Platypus (OX1), Oxford Cortex (OX2).}
+\label{fig:1000Gindels}
+\end{figure}
+
+\subsection{Sensitivity to low-frequency variation}
+\label{sec:lowfreq}
+
+Current methods for haplotype-based variant detection rely on assembly methods, which can be applied globally \citep{cortex} or locally \citep{dindel}.  These methods remove reference bias from the analysis of short-read sequencing data, but the generation of assemblies of large genomes requires pruning of low-frequency kmer observations.  While low-frequency kmers are often generated by sequencing error, in many cases they represent true variation, and thus this pruning reduces the sens [...]
+
+Results from the experiments described in \ref{sec:1000Gcomparisons} demonstrate that our method, while acting as a form of local assembly, does not incur the same sensitivity penalties seen in both local and global assembly methods.  We assess this using the count of minor alternate alleles as reported by each caller (figure \ref{fig:lowfreqsens}).  These results indicate that both global and local assembly methods suffer significant decrease in sensitivity to low-frequency variants, al [...]
+
+\begin{figure}[h!]
+\centering
+\includegraphics[width=0.8\textwidth]{low_frequency_sensitivity}
+\caption{Sensitivity to low-frequency variants of various detection methods, as assessed in 191 samples of African ancestry in the 1000 Genomes low-coverage cohort.   BC2 is FreeBayes, BI1 is the GATK UnifiedGenotyper, BI2 is the GATK HaplotypeCaller, and SI2 is the global assembler SGA.}
+\label{fig:lowfreqsens}
+\end{figure}
+
+
+\subsection{Haplotype-based consolidation of small variant calls}
+\label{sec:ensemble}
+
+Ensemble methods have been shown to provide superior performance to component inference methods in many contexts \citep{opmac99}.  We hypothesize that ensemble approaches to variant detection from short-read sequencing may provide improved performance in the context of variant detection.  While ensemble approaches have already been successfully applied to SNPs in large-scale resequencing projects \citep{1000GPhaseI}, their application to other variant classes is problematic because detec [...]
+
+
+
+%\section{Generalizing variant detection to multiallelic loci and non-uniform copy number}
+\section{Methods}
+\label{sec:model}
+
+\subsection{Definitions}
+
+At a given genetic locus we have $n$ samples drawn from a population, each of which has a copy number or multiplicity of $m$ within the locus.  We denote the number of copies of the locus present within our set of samples as $M = \sum_{i=1}^n m_i$.  Among these $M$ copies we have $K$ distinct alleles, $b_1,\ldots,b_K$ with allele counts $c_1,\ldots,c_K$ and frequencies $f_1,\ldots,f_K$.  Each individual has an unphased genotype $G_i$ comprised of $k_i$ distinct alleles $b_{i_1},\ldots,b_ [...]
+
+\subsection{A Bayesian approach}
+\label{sec:modeloverview}
+
+To genotype the samples at a specific locus, we could simply apply a Bayesian statistic relating $P(G_i|R_i)$ to the likelihood of sequencing errors in our reads and the prior likelihood of specific genotypes.  However, this maximum-likelihood approach limits our ability to incorporate information from other individuals in the population under analysis, which can improve detection power.
+
+Given a set of genotypes $G_1,\ldots,G_n$ and a set of observations $R_1,\ldots,R_n$ for all individuals at the current genetic locus, we can use Bayes' theorem to relate the probability of a specific combination of genotypes to both the quality of sequencing observations and \emph{a priori} expectations about the distribution of alleles within a set of individuals sampled from the same population:
+
+
+\begin{equation}
+P(G_1,\ldots,G_n|R_1,\ldots,R_n) 
+= { P(G_1,\ldots,G_n) P(R_1,\ldots,R_n|G_1,\ldots,G_n) \over P(R_1,\ldots,R_n)} \\
+\end{equation}
+
+\begin{equation}
+\label{eq:bayesian}
+P(G_1,\ldots,G_n|R_1,\ldots,R_n) = { P(G_1,\ldots,G_n) \prod_{i=1}^n P(R_i|G_i) \over 
+\sum_{\forall{G_1,\ldots,G_n}}  P(G_1,\ldots,G_n) \prod_{i=1}^n P(R_i|G_i) }
+\end{equation}
+
+In this formulation, $P(R_1,\ldots,R_n|G_1,\ldots,G_n) = \prod_{i=1}^n P(R_i|G_i)$ represents the likelihood that our observations match a given genotype combination (our data likelihood), and $P(G_1,\ldots,G_n)$ represents the prior likelihood of observing a specific genotype combination.  We estimate the data likelihood as the joint probability that the observations for a specific individual support a given genotype.  We use a neutral model of allele diffusion conditioned on an estimat [...]
+
+Except for situations with small numbers of samples and potential alleles, we avoid the explicit evaluation of the posterior distribution as implied by (\ref{eq:bayesian}), instead using a number of optimizations to make the algorithm tractable to apply to very large datasets (see section \ref{sec:genotyping}).
+
+
+\subsection{Estimating the probability of sequencing observations given an underlying genotype, $P(R_i|G)$}
+
+Given a set of reads $R_i = r_{i_1},\ldots,r_{i_{s_i}}$ from a sample at a given locus, we can extract a set of $k_i$ observed alleles $B'_i = b'_1,\ldots,b'_{k_i}$ corresponding to underlying alleles $b_1,\ldots,b_i$ which encapsulate the potential set of represented variants at the locus in the given sample, including erroneous observations.  Each of these observed alleles $b'_i$ has a count $o_f$ within the observations of the individual sample $: \sum_{j=1}^{k_i} o_j = s_i$ and corre [...]
+
+The probability of obtaining a single observation $b_i'$ provided a genotype in a single sample is:
+
+\begin{equation}
+P(b'_i|G) = \sum_{\forall(b_i \in G)} { f_i P(b'_i|b_i) }
+\end{equation}
+
+Here $f_i$ is the genotype allele frequency of $b_i$ in $G$.  We observe that the process generating reads from a given locus in a given sample is a multinomial process in which the sampling probabilities for each allele are governed by both the counts of alleles in the genotype and the error process that generates $b'_i$ from underlying $b_i$.  However, for the case that the base observation agrees with the underlying genotype, sampling probability dominates the probability that the obs [...]
+
+\begin{equation}
+P(b'|b) = 
+\left\{
+	\begin{array}{ll}
+		1 & \mbox{if } b' = b \\
+		P(error) & \mbox{if } b' \neq b
+	\end{array}
+\right.
+\end{equation}
+
+Here $P(error)$ is the probability that the base is erroneous as determined by the sequencing process used to generate the reads from the sample.  Provided this approximation, we can estimate the probability of a given set of reads conditioned on an underlying genotype by using the multinomial sampling probability to estimate the probability of obtaining the observations that support the genotype scaled by the probability that the observations that disagree with the genotype are erroneous:
+
+\begin{equation}
+P(R_i|G) \approx {s_i \choose o_1,\ldots,o_{k_i} } 
+\prod_{j=1}^{k_i} { f_{i_j}^{o_j} }
+\prod_{l=1}^{s_i} { P(b'_l | b_l) }
+\end{equation}
+
+%which define $G_i$, $f_{i_1},\ldots,f_{i_{k_i}}$.
+
+%If we had perfect observations of a locus, $P(R_i|G_i)$ for any individual would approximate the probability of sampling observations $R_i$ out of $G_i$ with replacement.  This probability is given by the multinomial distribution in $s_i$ over the probability $P(b_l)$ of obtaining each allele from the given genotype, which is ${f_{i_j} \over m_i}$ for each allele frequency in the frequencies which define $G_i$, $f_{i_1},\ldots,f_{i_{k_i}}$.
+
+% TODO f_k_i
+% Furthermore, we must sum $P(R_i|G_i)$ for all possible $R_i$ combinations $\forall(R_i \in G_i : | R_i | = k_i)$ drawn from our genotype to obtain the joint probability of $R_i$ given $G_i$, as each $\prod_{l=1}^{s_i} { P(b'_l | b_l) }$ only accounts for the marginal probability of the a specific $R_i$ given $B'_i$.
+
+\begin{comment}
+
+\begin{equation}
+P(R_i | G_i)
+\approx
+P(B'_i | G_i) = 
+{ s_i! \over { 
+\prod_{j=1}^{k_i} o'_j !
+} }
+\prod_{j=1}^{k_i} { \left({f_{i_j} \over m_i}\right)^{o'_j} }
+\prod_{l=1}^{s_i} { P(b'_l | b_l) }
+\end{equation}
+
+This extends $P(R_i|G_i)$ as follows:
+
+\begin{equation}
+P(R_i | G_i) =  
+\sum_{\forall(R_i \in G_i)} \left(
+{ s_i! \over { 
+\prod_{j=1}^{k_i} o'_j !
+} }
+\prod_{j=1}^{k_i} { \left({f_{i_j} \over m_i}\right)^{o'_j} }
+\prod_{l=1}^{s_i} { P(b'_l | b_l) }
+\right)
+\end{equation}
+\end{comment}
+
+\subsection{Genotype combination priors, $P(G_1,\ldots,G_n)$}
+
+\subsubsection{Decomposition of prior probability of genotype combination}
+
+Let $G_1,\ldots,G_n$ denote the set of genotypes at the locus and $f_1,\ldots,f_k$ denote the set of allele frequencies which corresponds to these genotypes.  We estimate the prior likelihood of observing a specific combination of genotypes within a given locus by decomposition into resolvable terms:
+
+\begin{equation}
+P(G_1,\ldots,G_n) = P(G_1,\ldots,G_n \cap f_1,\ldots,f_k)
+\end{equation}
+
+The probability of a given genotype combination is equivalent to the intersection of that probability and the probability of the corresponding set of allele frequencies.  This identity follows from the fact that the allele frequencies are derived from the set of genotypes and we always will have the same $f_1,\ldots,f_k$ for any equivalent $G_1,\ldots,G_n$.
+
+Following Bayes' Rule, this identity further decomposes to:
+
+\begin{equation}
+P(G_1,\ldots,G_n \cap f_1,\ldots,f_k) = P(G_1,\ldots,G_n | f_1,\ldots,f_k) P(f_1,\ldots,f_k)
+\end{equation}
+
+We now can estimate the prior probability of $G_1,\ldots,G_n$ in terms of the genotype combination sampling probability, $P(G_1,\ldots,G_n | f_1,\ldots,f_k)$, and the probability of observing a given allele frequency in our population, $P(f_1,\ldots,f_k)$.
+
+\subsubsection{Genotype combination sampling probability $P(G_1,\ldots,G_n | f_1,\ldots,f_k)$}
+
+The multinomial coefficient ${M \choose c_1,\ldots,c_k }$ gives the number of ways which a set of alleles with frequencies $f_1,\ldots,f_k : f_i = c_i/M$ may be distributed among $M$ copies of a locus.  For phased genotypes $\hat{G_i}$ the probability of sampling a specific $\hat{G_1},\ldots,\hat{G_n}$ given allele frequencies $f_1,\ldots,f_k$ is thus provided by the inverse of this term:
+
+\begin{equation}
+\label{eq:phasedsampling}
+P(\hat{G_1},\ldots,\hat{G_n} | f_1,\ldots,f_k) =
+{M \choose
+  c_1,\ldots,c_k }^{-1}
+\end{equation}
+
+However, our model is limited to unphased genotypes because our primary data only allows phasing within a limited context. Consequently, we must adjust (\ref{eq:phasedsampling}) to reflect the number of phased genotypes which correspond to the unphased genotyping $G_1,\ldots,G_n$.  Each unphased genotype corresponds to as many phased genotypes as there are permutations of the alleles in $G_i$.  Thus, for a given unphased genotyping $G_1,\ldots,G_n$, there are $\prod_{i=1}^n { m_i \choose [...]
+
+In conjunction, these two terms provide the probability of sampling a particular unphased genotype combination given a set of allele frequencies:
+
+\begin{equation}
+\label{eq:unphasedsampling}
+P(G_1,\ldots,G_n | f_1,\ldots,f_k) =
+{ M \choose c_1,\ldots,c_k }^{-1}
+\prod_{i=1}^n { m_i \choose c_{i_1}, \ldots, c_{i_{k_i}}}
+\end{equation}
+
+%\begin{equation}
+% = 
+%\frac{1}{M!}
+%\prod_{l=1}^k f_l! 
+%\prod_{i=1}^n \frac{m_i!}{\prod_{j=1}^{k_i} f_{i_j}!}
+%\end{equation}
+
+In the case of a fully diploid population, the product of all possible multiset permutations of all genotypes reduces to $2^h$, where $h$ is the number of heterozygous genotypes, simplifying (\ref{eq:unphasedsampling}) to:
+
+\begin{equation}
+P(G_1,\ldots,G_n | f_1,\ldots,f_k) =
+2^h
+{ M \choose c_1,\ldots,c_k }^{-1}
+\end{equation}
+
+
+\subsubsection{Derivation of $P(f_1,\ldots,f_k)$ by Ewens' sampling formula}
+
+Provided our sample size $n$ is small relative to the population which it samples, and the population is in equilibrium under mutation and genetic drift, the probability of observing a given set of allele frequencies at a locus is given by Ewens' sampling formula \citep{ewens72}.  Ewens' sampling formula is based on an infinite alleles coalescent model, and relates the probability of observing a given set of allele frequencies to the number of sampled chromosomes at the locus ($M$) and t [...]
+
+The application of Ewens' formula to our context is straightforward.  Let $a_f$ be the number of alleles among $b_1,\ldots,b_k$ whose allele count within our set of samples is $c$.  We can thus transform our set of frequencies $f_1,\ldots,f_k$ (equivalently, allele counts, $c_1,\ldots,c_k$) into a set of non-negative frequency counts $a_1,\ldots,a_M : \sum_{c=1}^M{ca_c} = M$.  As many $c_1,\ldots,c_k$ can map to the same $a_1,\ldots,a_M$, this transformation is not invertible, but it is  [...]
+
+Having transformed a set of frequencies over alleles to a set of frequency counts over frequencies, we can now use Ewens' sampling formula to approximate $P(f_1,\ldots,f_k)$ given $\theta$:
+
+\begin{comment}
+\begin{equation}
+P(f_1,\ldots,f_k) = P(a_1,\ldots,a_M) = {M! \over \theta(\theta+1)\cdots(\theta+M-1)}\prod_{j=1}^M{\theta^{a_j} \over j^{a_j} a_j!}
+\end{equation}
+\end{comment}
+
+\begin{equation}
+P(f_1,\ldots,f_k) =
+P(a_1,\ldots,a_M) = 
+{M! \over \theta \prod_{z=1}^{M-1}(\theta+z)}
+\prod_{j=1}^M{\theta^{a_j} \over j^{a_j} a_j!}
+\end{equation}
+
+In the bi-allelic case in which our set of samples has two alleles with frequencies $f_1$ and $f_2$ such that $f_1 + f_2 = M$:
+
+\begin{equation}
+P(a_{f_1} = 1, a_{f_2} = 1) = 
+{M! \over \prod_{z=1}^{M-1}(\theta+z)}
+{\theta \over f_1 f_2}
+\end{equation}
+
+While in the monomorphic case, where only a single allele is represented at this locus in our population, this term reduces to:
+
+\begin{equation}
+P(a_M = 1) = 
+{(M-1)! \over \prod_{z=1}^{M-1}(\theta+z)}
+\end{equation}
+
+In this case, $P(f_1,\ldots,f_k) = 1/(1+\theta) \approx 1 - \theta$ for small $\theta$ when $M = 2$.  This is sensible as $\theta$ represents the population mutation rate, which can be estimated from the pairwise heterozygosity rate of any two chromosomes in the population \citep{watterson1975, tajima1983}.
+
+\subsection{Expanding the model to incorporate the observability of the locus and alleles}
+\label{sec:modelsequencability}
+
+The bayesian model described in section \ref{sec:modeloverview} can generate posterior estimates based on sequencing quality information and genotype distribution in a panel of samples.  However, this estimate can incorporate only information captured in base quality information and read counts.  This may fail to assess the ability of the sequencing and alignment methods to accurately characterize the locus and alleles that we genotype, which is an important consideration for downstream  [...]
+
+Previous authors have addressed this limitation by adding post-processing steps to recalibrate the estimated quality of variants using training sets of known variants and known artifacts.  Once variant calls have been made we can annotate them with a variety of features and apply standard machine learning methods to ``recalibrate'' the quality estimates produced from genotype distribution, allele frequency, observation counts, and base quality.  For instance, \cite{gatk2011} apply a guas [...]
+
+Problematically, such an approach requires a training set, which may not be applicable in contexts with limited validation data, such as is commonly the case in non-model organisms.  Furthermore, the training set may bias our results towards established patterns, decreasing sensitivity to novel variation that might have been previously uncharacterized due to technological limitations.
+
+In contrast, we address the issue of loci sequencability in a general, \emph{a priori} fashion by extending the traditional Bayesian variant detection model to incorporate an indicator, $S$, which describes the ability of our sequencing and alignment methods to characterize the locus we are considering.  We define $S = true$ when we can sequence the locus and alleles and $S = false$ otherwise, and redefine our model (\ref{eq:bayesian}) to estimate the posterior probability of a particula [...]
+
+\begin{equation}
+\label{eq:bayesianandmodel}
+P(G_1,\ldots,G_n, S | R_1,\ldots,R_n)  = { P(G_1,\ldots,G_n) P(S) \prod_{i=1}^n P(R_i|G_i) \over 
+\sum_{\forall{G_1,\ldots,G_n}} ( P(G_1,\ldots,G_n) P(S) \prod_{i=1}^n P(R_i|G_i) ) }
+\end{equation}
+
+We will describe the development of $P(S)$ using aggregate statistics built from the read evidence overlapping the locus in section \ref{sec:sequencable}.
+
+\subsection{Estimation of the probability that the locus is sequencable $P(S)$}
+\label{sec:sequencable}
+
+For accurate variant detection via resequencing, we require that the locus in question is sequencable.  That is, we require that the reference is accurate, that we have an accurate model of copy number at the locus, that we have genomic coverage, and that reads can be aligned to the alleles of interest in the region.  In a case where these conditions are met, we assume $S = true$.  Where it is not, $S = false$.
+
+% TODO cleanup, remove duplication with previous section
+The sequenceability of a locus and its alleles is assumed under previous Bayesian variant detection models \citep{marth99, samtools, li2011stats}.  Uncertainty about the genomic model characterization has been incorporated into data likelihoods or detection thresholds using read mapping quality \citep{snptools, maq}.  In practice, the incorporation of confidence in the characterizability of the locus and alleles requires the reclassification of variant calls on the basis of aggregate met [...]
+
+A quality score recalibrator utilizes training data, particularly as sets of known variants or validated errors, to describe the distribution of true events and errors across the space of possible annotations in the data set to be recalibrated.  The variant calling error function as described by these aggregate metrics can then be approximated using a variety of machine learning methods, such as support vector machines \citep{snpsvm} or a gaussian mixture model as implemented in the GATK [...]
+%  Feature-based recalibration and detection methods have wide utility in variant detection, and we can expect them to become more comon in practice, but they can also be problematic in that they require adequate training sets, which may not be available in a specific experimental context.
+
+We observe that $S$ is proportional to a number of variables which can be estimated directly from the observations covering a genomic locus.  For instance, if the locus and alleles are observable without bias, we expect the count of observations of a sample supporting a particular alternate allele $R_i \equiv b$ to approximate its frequency in the correct genotype $G_i$ for the sample, $|R_i \equiv b|/|R_i| \approx |b \in G_i|/m_i$.  Deviation from this expectation which is observed acro [...]
+
+In an unbiased context, we expect half of our reads to place to either side of the locus (placement bias $B_p$):
+
+\begin{equation}
+P(B_p = 0) \propto \binom{|R|}{|R_{left}|} 0.5^{|R|}
+\end{equation}
+
+We expect half to contain the allele in the first half of their length (cycle bias $B_c$):
+
+\begin{equation}
+P(B_c = 0) \propto \binom{|R|}{|R_{start}|} 0.5^{|R|}
+\end{equation}
+
+Half should be derived from each strand of DNA (strand bias $B_s$):
+
+\begin{equation}
+P(B_s = 0) \propto \binom{|R|}{|R_{forward}|} 0.5^{|R|}
+\end{equation}
+
+And, the aggregate fraction of reads supporting a particular allele in samples with a particular genotype should approximate the frequency of the allele in that particular genotype (allele balance, $B_a$).  Recall that the distinct alleles in a particular set of genotypes are $b_1,\ldots,b_K$, the corresponding allele frequencies in the set are $f_1,\ldots,f_K$, and the observation counts are represented by $o_1,\ldots,o_K$:
+
+%\begin{equation}
+%o_i = |\{r \in R : r \equiv b_i\}|
+%\end{equation}
+
+\begin{equation}
+P(B_a = 0) \propto 
+\prod_{\forall g \in \{G\}} 
+\binom{ |R| }{ o_1,\ldots,o_K }
+\prod_{j=1}^{K} f_j^{o_j}
+\end{equation}
+
+We use these relationships to determine relationships in $P(S)$ under various configurations of alleles and genotypes in the samples:
+
+\begin{equation}
+P(S) \propto P(B_p = 0) P(B_c = 0) P(B_s = 0) P(B_a = 0)
+\end{equation}
+
+
+\begin{comment}
+\begin{eqnarray}
+P(S) \propto & & multinom([ |\{R \equiv b\}| \forall b \in K] ; \sum_{i=1}^{n} |R_i|, f_i,\ldots,f_K)  \\
+& \times \prod_{\forall b \in K} & binom(|forwardStrand(\{R \equiv b\})|; |\{R \equiv b\}|, 1/2) \\
+&  & \times binom(|placedLeft(\{R \equiv b\})|; |\{R \equiv b\}|, 1/2) \\
+&  & \times binom(|placedRight(\{R \equiv b\})|; |\{R \equiv b\}|, 1/2)
+\end{eqnarray}
+
+Here $binom(k; n, p)$ is the binomial probability mass function of $k$ successes in $n$ trials with probability $p$.  Similarly, $multinom(k_1,\ldots,k_n; n, p_1,\ldots,p_i)$ provides the multinomial PMF for results $k_1,\ldots,k_n$ in $n$ trials with probabilities $p_1,\ldots,p_i$.  As defined before, we have a set of reads $\{R\}$ at the locus and a set of alleles $K = b_1,\ldots,b_K$ in a given genotyping across all the samples.
+\end{comment}
+
+%% TODO
+
+\section{Direct detection of phase from short-read sequencing}
+
+By modeling multiallelic loci, this Bayesian statistical framework provides the foundation for the direct detection of longer, multi-base alleles from sequence alignments.  In this section we describe our implementation of a haplotype-based variant detection method based on this model.
+
+Our method assembles haplotype observations over minimal, dynamically-determined, reference-relative windows which contain multiple segregating alleles.  To be used in the analysis, haplotype observations must be derived from aligned reads which are anchored by reference-matching sequence at both ends of the detection window.  These haplotype observations have derived quality estimations which allow their incorporation into the general statistical model described in section \ref{sec:mode [...]
+
+\subsection{Parsing haplotype observations from sequencing data}
+\label{sec:parsing}
+
+In order to establish a range of sequence in which multiple polymorphisms segregate in the population under analysis, it is necessary to first determine potentially polymorphic windows in order to bound the analysis.  This determination is complicated by the fact that a strict windowing can inappropriately break clusters of alleles into multiple variant calls.  We employ a dynamic windowing approach that is driven by the observation of multiple proximal reference-relative variations (SNP [...]
+
+Where reference-relative variations are separated by less than a configurable number of non-polymorphic bases in an aligned sequence trace, our method combines them into a single haplotype allele observation, $H_i$.  The observational quality of these haplotype alleles is given as $\min ( q_l \, \forall \, b'_l \in H_i , \, Q_i)$, or the minimum of the supporting read's mapping quality and the minimum base quality of the haplotype's component variant allele observations.
+
+\begin{figure}[h!]
+\centering
+\includegraphics[width=0.8\textwidth]{haplotype_calling}
+\caption{The direct detection of phase from short-read sequencing traces and counting of haplotypes across dynamically-determined windows.}
+\label{fig:haplotypecalling}
+\end{figure}
+
+\subsection{Determining a window over which to assemble haplotype observations}
+
+At each position in the reference, we collect allele observations derived from alignments as described in \ref{sec:parsing}.  To improve performance, we apply a set of input filters to exclude alleles from the analysis which are highly unlikely to be true.  These filters require a minimum number of alternate observations and a minimum sum of base qualities in a single sample in order to incorporate a putative allele and its observations into the analysis.
+
+We then determine a haplotype length over which to genotype samples by a bounded iterative process.  We first determine the allele passing the input filters which is longest relative to the reference.  For instance, a longer allele could be a multi-base indel or a composite haplotype observation flanked by SNPs.  Then, we parse haplotype observations from all the alignments which fully overlap this window, finding the rightmost end of the longest haplotype allele which begins within the  [...]
+
+\subsection{Detection and genotyping of local haplotypes}
+\label{sec:genotyping}
+
+Once a window for analysis has been determined, we parse all fully-overlapping reads into haplotype observations which are anchored at the boundaries of the window.  Given these sets of sequencing observations $r_{i_1},\ldots,r_{i_{s_i}} = R_i$ and data likelihoods $P(R_i|G_i)$ for each sample and possible genotype derived from the putative alleles, we then determine the probability of polymorphism at the locus given the Bayesian model described in section \ref{sec:model}.
+
+To establish a maximum \emph{a posteriori} estimate of the genotype for each sample, we employ a convergent gradient ascent approach to the posterior probability distribution over the mutual genotyping across all samples under our Bayesian model.  This process begins at the genotyping across all samples $G_1,\ldots,G_n$ where each sample's genotype is the maximum-likelihood genotype given the data likelihood $P(R_i|G_i)$:
+
+\begin{equation}
+G_i =
+\underset{G}{\operatorname{argmax}} \; P(R_i|G) \quad \textrm{for } i = 1,\ldots,n
+%:=  \{ G_i | \forall G : P(R_i|G_i) >= P(R_i|G) \}
+\end{equation}
+
+The posterior search then attempts to find a genotyping $G_1,\ldots,G_n$ in the local space of genotypings which has higher posterior probability under the model than this initial genotyping.  In practice, this step is done by searching through all genotypings in which a single sample has up to the $N$th best genotype when ranked by $P(R_i|G_i)$, and $N$ is a small number (e.g. 2).  This search starts with some set of genotypes $G_1,\ldots,G_n = \{G\}$ and attempts to find a genotyping $ [...]
+
+\begin{equation}
+P(\{G\}'|R_1,\ldots,R_n) > P(\{G\}|R_1,\ldots,R_n)
+\end{equation}
+
+$\{G\}'$ is then used as a basis for the next update step.  This search iterates until convergence, but in practice must be bounded at a fixed number of steps in order to ensure optimal performance.  As the quality of input data increases in coverage and confidence, this search will converge more quickly because the maximum-likelihood estimate will lie closer to the maximum \emph{a posteriori} estimate under the model.
+
+This method incorporates a basic form of genotype imputation into the detection method, which in practice improves the quality of raw genotypes produced in primary allele detection and genotyping relative to methods which only utilize a maximum-likelihood method to determine genotypes.  Furthermore, this method allows for the determination of marginal genotype likelihoods via the marginalization of assigned genotypes for each sample over the posterior probability distribution.
+
+\subsection{Probability of polymorphism}
+
+Provided a maximum \emph{a posteriori} estimate of the genotyping of all the individuals in our sample, we might like establish an estimate of the quality of the genotyping.  For this, we can use the probability that the locus is polymorphic, which means that the number of distinct alleles at the locus, $K$, is greater than 1.  While in practice the space of possible genotypings is too great to integrate over, it is possible to derive the probability that the loci is polymorphic in our s [...]
+
+\begin{equation}
+\label{eq:probpoly}
+P(K > 1 | R_1,\ldots,R_n)
+=
+1 - P(K = 1 | R_1,\ldots,R_n)
+%=
+%1 - \sum_{\forall(G_i,\ldots,G_n : K = 1)} P(G_i,\ldots,G_n|R_i,\ldots,R_n)
+\end{equation}
+
+Equation (\ref{eq:probpoly}) thus provides the probability of polymorphism at the site, which is provided as a quality estimate for each evaluated locus in the output of FreeBayes.
+
+\subsection{Marginal likelihoods of individual genotypes}
+
+Similarly, we can establish a quality estimate for a single genotype by summing over the marginal probability of that specific genotype and sample combination under the model.  The marginal probability of a given genotype is thus:
+
+\begin{equation}
+\label{eq:marginals}
+P(G_j|R_1,\ldots,R_n)
+=
+\sum_{\forall(\{G\} : G_j \in \{G\})}
+P(\{G\}|R_1,\ldots,R_n)
+\end{equation}
+
+In implementation, the estimation of this term requires us to must sample enough genotypings from the posterior in order to obtain well-normalized marginal likelihoods.  In practice, we marginalize from the local space of genotypings in which each individual genotype is no more than a small number of steps in one sample from the maximum \emph{a posteriori} estimate of $G_i,\ldots,G_n$.  This space is similar to that used during the posterior search described in section \ref{sec:genotypin [...]
+
+%\subsection{Extensions to the method}
+
+
+
+
+%
+
+%\subsection{Using prior empirical information to improve detection}
+
+\bibliography{references}{}
+%\bibliographystyle{plainnat}
+\bibliographystyle{genome_research}
+
+\end{document}
diff --git a/paper/miseq.png b/paper/miseq.png
new file mode 100644
index 0000000..5a72618
Binary files /dev/null and b/paper/miseq.png differ
diff --git a/paper/omni_errors.png b/paper/omni_errors.png
new file mode 100644
index 0000000..c815b85
Binary files /dev/null and b/paper/omni_errors.png differ
diff --git a/paper/references.bib b/paper/references.bib
new file mode 100644
index 0000000..8e4421a
--- /dev/null
+++ b/paper/references.bib
@@ -0,0 +1,321 @@
+ at Article{marth99,
+   Author="Marth, G. T.  and Korf, I.  and Yandell, M. D.  and Yeh, R. T.  and Gu, Z.  and Zakeri, H.  and Stitziel, N. O.  and Hillier, L.  and Kwok, P. Y.  and Gish, W. R. ",
+   Title="{{A} general approach to single-nucleotide polymorphism discovery}",
+   Journal="Nat. Genet.",
+   Year="1999",
+   Volume="23",
+   Pages="452--456",
+   Month="Dec"
+}
+
+ at misc{freebayeshome,
+Author="Garrison, E.",
+Howpublished={\url{http://bioinformatics.bc.edu/marthlab/FreeBayes}},
+Title="{FreeBayes}",
+Year=2011
+}
+
+ at misc{freebayesgit,
+Author="Garrison, E.",
+Howpublished={\url{https://github.com/ekg/freebayes}},
+Title="{FreeBayes source repository}",
+Year=2012
+}
+
+ at misc{mutatrixgit,
+Author="Garrison, E.",
+Howpublished={\url{https://github.com/ekg/mutatrix}},
+Title="{mutatrix population genome simulator}",
+Year=2012
+}
+
+ at misc{ogapgit,
+Author="Garrison, E.",
+Howpublished={\url{https://github.com/ekg/ogap}},
+Title="{ogap: a local indel realigner}",
+Year=2012
+}
+
+ at misc{vcflibgit,
+Author="Garrison, E.",
+Howpublished={\url{https://github.com/ekg/vcflib}},
+Title="{vcflib: variant call file processing and manipulation utilities}",
+Year=2012
+}
+
+ at misc{bamleftaligngit,
+Author="Garrison, E.",
+Howpublished={\url{https://github.com/ekg/freebayes/blob/master/src/bamleftalign.cpp}},
+Title="{bamleftalign: BAM indel left-realigner}",
+Year=2012
+}
+
+ at misc{mosaik,
+Author="Lee, W. P. and M. Str{\"{o}}mberg",
+Howpublished={\url{https://github.com/wanpinglee/MOSAIK}},
+Title="{MOSAIK reference-guided aligner for next-generation sequencing technologies}",
+Year=2012
+}
+
+ at Article{ewens72,
+   Author="Ewens, W. J. ",
+   Title="{{T}he sampling theory of selectively neutral alleles}",
+   Journal="Theor Popul Biol",
+   Year="1972",
+   Volume="3",
+   Pages="87--112",
+   Month="Mar"
+}
+
+ at Article{li2011stats,
+   Author="Li, H. ",
+   Title="{{A} statistical framework for {S}{N}{P} calling, mutation discovery, association mapping and population genetical parameter estimation from sequencing data}",
+   Journal="Bioinformatics",
+   Year="2011",
+   Volume="27",
+   Number="21",
+   Pages="2987--2993",
+   Month="Nov"
+}
+
+
+
+ at Article{samtools,
+   Author="Li, H.  and Handsaker, B.  and Wysoker, A.  and Fennell, T.  and Ruan, J.  and Homer, N.  and Marth, G.  and Abecasis, G.  and Durbin, R. ",
+   Title="{{T}he {S}equence {A}lignment/{M}ap format and {S}{A}{M}tools}",
+   Journal="Bioinformatics",
+   Year="2009",
+   Volume="25",
+   Number="16",
+   Pages="2078--2079",
+   Month="Aug"
+}
+
+ at Article{libaq,
+   Author="Li, H. ",
+   Title="{{I}mproving {S}{N}{P} discovery by base alignment quality}",
+   Journal="Bioinformatics",
+   Year="2011",
+   Volume="27",
+   Number="8",
+   Pages="1157--1158",
+   Month="Apr"
+}
+
+ at Article{gatk2011,
+   Author="DePristo, M. A.  and Banks, E.  and Poplin, R.  and Garimella, K. V.  and Maguire, J. R.  and Hartl, C.  and Philippakis, A. A.  and del Angel, G.  and Rivas, M. A.  and Hanna, M.  and McKenna, A.  and Fennell, T. J.  and Kernytsky, A. M.  and Sivachenko, A. Y.  and Cibulskis, K.  and Gabriel, S. B.  and Altshuler, D.  and Daly, M. J. ",
+   Title="{{A} framework for variation discovery and genotyping using next-generation {D}{N}{A} sequencing data}",
+   Journal="Nat. Genet.",
+   Year="2011",
+   Volume="43",
+   Number="5",
+   Pages="491--498",
+   Month="May"
+}
+
+ at Article{browning2007,
+   Author="Browning, S. R.  and Browning, B. L. ",
+   Title="{{R}apid and accurate haplotype phasing and missing-data inference for whole-genome association studies by use of localized haplotype clustering}",
+   Journal="Am. J. Hum. Genet.",
+   Year="2007",
+   Volume="81",
+   Number="5",
+   Pages="1084--1097",
+   Month="Nov"
+}
+
+ at Article{mach2010,
+   Author="Li, Y.  and Willer, C. J.  and Ding, J.  and Scheet, P.  and Abecasis, G. R. ",
+   Title="{{M}a{C}{H}: using sequence and genotype data to estimate haplotypes and unobserved genotypes}",
+   Journal="Genet. Epidemiol.",
+   Year="2010",
+   Volume="34",
+   Number="8",
+   Pages="816--834",
+   Month="Dec"
+}
+
+ at Article{delaneau2012,
+   Author="Delaneau, O.  and Marchini, J.  and Zagury, J. F. ",
+   Title="{{A} linear complexity phasing method for thousands of genomes}",
+   Journal="Nat. Methods",
+   Year="2012",
+   Volume="9",
+   Number="2",
+   Pages="179--181",
+   Month="Feb"
+}
+
+ at Article{howie2009,
+   Author="Howie, B. N.  and Donnelly, P.  and Marchini, J. ",
+   Title="{{A} flexible and accurate genotype imputation method for the next generation of genome-wide association studies}",
+   Journal="PLoS Genet.",
+   Year="2009",
+   Volume="5",
+   Number="6",
+   Pages="e1000529",
+   Month="Jun"
+}
+
+ at Article{howie2011,
+   Author="Howie, B.  and Marchini, J.  and Stephens, M. ",
+   Title="{{G}enotype imputation with thousands of genomes}",
+   Journal="G3 (Bethesda)",
+   Year="2011",
+   Volume="1",
+   Number="6",
+   Pages="457--470",
+   Month="Nov"
+}
+
+ at Article{clarke2009,
+   Author="Clarke, J.  and Wu, H. C.  and Jayasinghe, L.  and Patel, A.  and Reid, S.  and Bayley, H. ",
+   Title="{{C}ontinuous base identification for single-molecule nanopore {D}{N}{A} sequencing}",
+   Journal="Nat Nanotechnol",
+   Year="2009",
+   Volume="4",
+   Number="4",
+   Pages="265--270",
+   Month="Apr"
+}
+
+ at Article{branton2008,
+   Author="Branton, D.  and Deamer, D. W.  and Marziali, A.  and Bayley, H.  and Benner, S. A.  and Butler, T.  and Di Ventra, M.  and Garaj, S.  and Hibbs, A.  and Huang, X.  and Jovanovich, S. B.  and Krstic, P. S.  and Lindsay, S.  and Ling, X. S.  and Mastrangelo, C. H.  and Meller, A.  and Oliver, J. S.  and Pershin, Y. V.  and Ramsey, J. M.  and Riehn, R.  and Soni, G. V.  and Tabard-Cossa, V.  and Wanunu, M.  and Wiggin, M.  and Schloss, J. A. ",
+   Title="{{T}he potential and challenges of nanopore sequencing}",
+   Journal="Nat. Biotechnol.",
+   Year="2008",
+   Volume="26",
+   Number="10",
+   Pages="1146--1153",
+   Month="Oct"
+}
+
+ at Article{tajima1983,
+   Author="Tajima, F. ",
+   Title="{{E}volutionary relationship of {D}{N}{A} sequences in finite populations}",
+   Journal="Genetics",
+   Year="1983",
+   Volume="105",
+   Number="2",
+   Pages="437--460",
+   Month="Oct"
+}
+
+ at Article{watterson1975,
+   Author="Watterson, G. A. ",
+   Title="{{O}n the number of segregating sites in genetical models without recombination}",
+   Journal="Theor Popul Biol",
+   Year="1975",
+   Volume="7",
+   Number="2",
+   Pages="256--276",
+   Month="Apr"
+}
+
+ at Techreport{holtgrewe2010,
+    Author="Holtgrewe, M.",
+    Year="2010",
+    Title="Mason – a read simulator for second generation sequencing data",
+    Journal="Technical Report",
+    Number="TR-B-10-06",
+    Institution="Institut für Mathematik und Informatik, Freie Universität Berlin"
+}
+
+% 23128226 
+ at Article{1000Gphase1,
+    Title="{{A}n integrated map of genetic variation from 1,092 human genomes}",
+    Journal="Nature",
+    Year="2012",
+    Volume="491",
+    Number="7422",
+    Pages="56--65",
+    Month="Nov",
+    Author="1000 Genomes Project Participants, The"
+}
+
+ at Article{cortex,
+    Author="Iqbal, Z.  and Caccamo, M.  and Turner, I.  and Flicek, P.  and McVean, G. ",
+    Title="{{D}e novo assembly and genotyping of variants using colored de {B}ruijn graphs}",
+    Journal="Nat. Genet.",
+    Year="2012",
+    Volume="44",
+    Number="2",
+    Pages="226--232",
+    Month="Feb"
+}
+
+ at Article{dindel,
+    Author="Albers, C. A.  and Lunter, G.  and MacArthur, D. G.  and McVean, G.  and Ouwehand, W. H.  and Durbin, R. ",
+    Title="{{D}indel: accurate indel calls from short-read data}",
+    Journal="Genome Res.",
+    Year="2011",
+    Volume="21",
+    Number="6",
+    Pages="961--973",
+    Month="Jun"
+}
+
+ at article{opmac99,
+  author    = {David W. Opitz and
+               Richard Maclin},
+  title     = {Popular Ensemble Methods: An Empirical Study},
+  journal   = {J. Artif. Intell. Res. (JAIR)},
+  volume    = {11},
+  year      = {1999},
+  pages     = {169-198},
+}
+
+ at Article{blmash,
+   Author="Cleary, S. P.  and Zhang, W.  and Di Nicola, N.  and Aronson, M.  and Aube, J.  and Steinman, A.  and Haddad, R.  and Redston, M.  and Gallinger, S.  and Narod, S. A.  and Gryfe, R. ",
+   Title="{{H}eterozygosity for the {B}{L}{M}({A}sh) mutation and cancer risk}",
+   Journal="Cancer Res.",
+   Year="2003",
+   Volume="63",
+   Number="8",
+   Pages="1769--1771",
+   Month="Apr"
+}
+
+ at Article{recurrentTERT2013,
+   Author="Huang, F. W.  and Hodis, E.  and Xu, M. J.  and Kryukov, G. V.  and Chin, L.  and Garraway, L. A. ",
+   Title="{{H}ighly recurrent {T}{E}{R}{T} promoter mutations in human melanoma}",
+   Journal="Science",
+   Year="2013",
+   Volume="339",
+   Number="6122",
+   Pages="957--959",
+   Month="Feb"
+}
+
+ at Article{snpsvm,
+   Author="O'Fallon, B. D.  and Wooderchak-Donahue, W.  and Crockett, D. K. ",
+   Title="{{A} support vector machine for identification of single-nucleotide polymorphisms from next-generation sequencing data}",
+   Journal="Bioinformatics",
+   Year="2013",
+   Volume="29",
+   Number="11",
+   Pages="1361--1366",
+   Month="Jun"
+}
+
+ at Article{snptools,
+   Author="Wang, Y.  and Lu, J.  and Yu, J.  and Gibbs, R. A.  and Yu, F. ",
+   Title="{{A}n integrative variant analysis pipeline for accurate genotype/haplotype inference in population {N}{G}{S} data}",
+   Journal="Genome Res.",
+   Year="2013",
+   Volume="23",
+   Number="5",
+   Pages="833--842",
+   Month="May"
+}
+
+ at Article{maq,
+   Author="Li, H.  and Ruan, J.  and Durbin, R. ",
+   Title="{{M}apping short {D}{N}{A} sequencing reads and calling variants using mapping quality scores}",
+   Journal="Genome Res.",
+   Year="2008",
+   Volume="18",
+   Number="11",
+   Pages="1851--1858",
+   Month="Nov"
+}
\ No newline at end of file
diff --git a/scripts/coverage_to_regions.py b/scripts/coverage_to_regions.py
new file mode 100755
index 0000000..e4dff3e
--- /dev/null
+++ b/scripts/coverage_to_regions.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+
+import sys
+
+if len(sys.argv) < 3:
+    print "usage: <bamtools_coverage_output ", sys.argv[0], " fasta_index num_regions >regions.bed"
+    print "Generates regions with even sequencing coverage, provided an input of coverage per-position as"
+    print "generated by bamtools coverage.  In other words, generates regions such that the integral of"
+    print "coverage is approximately equal for each.  These can be used when variant calling to reduce"
+    print "variance in job runtime."
+    exit(1)
+
+lengths = {}
+fai = open(sys.argv[1])
+for line in fai.readlines():
+    c, l = line.split("\t")[:2]
+    lengths[c] = int(l)
+
+positions = []
+total_coverage = 0
+for line in sys.stdin:
+    chrom, pos, depth = line.strip().split("\t")
+    pos = int(pos)
+    depth = int(depth)
+    positions.append([chrom, pos, depth])
+    total_coverage += depth
+
+bp_per_region = total_coverage / int(sys.argv[2])
+
+lchrom = None
+lpos = 0
+bp_in_region = 0
+
+for line in positions:
+    chrom, pos, depth = line #line.strip().split("\t")
+    if lchrom != chrom:
+        if lchrom:
+            print lchrom+":"+str(lpos)+"-"+str(lengths[lchrom])
+            lpos = 0
+            lchrom = chrom
+            bp_in_region = 0
+        else:
+            lchrom = chrom
+    bp_in_region += depth
+    if bp_in_region > bp_per_region:
+        print chrom+":"+str(lpos)+"-"+str(pos) #, pos - lpos
+        lpos = pos
+        bp_in_region = 0
+
+
+print lchrom+":"+str(lpos)+"-"+str(lengths[lchrom])
diff --git a/scripts/fasta_generate_regions.py b/scripts/fasta_generate_regions.py
index 200336b..0433410 100755
--- a/scripts/fasta_generate_regions.py
+++ b/scripts/fasta_generate_regions.py
@@ -4,12 +4,16 @@ import sys
 
 
 if len(sys.argv) == 1:
-    print "usage: ", sys.argv[0], " <fasta index file> <region size>"
+    print "usage: ", sys.argv[0], " <fasta file or index file> <region size>"
     print "generates a list of freebayes/bamtools region specifiers on stdout"
     print "intended for use in creating cluster jobs"
     exit(1)
 
-fasta_index_file = open(sys.argv[1])
+fasta_index_file = sys.argv[1]
+if not fasta_index_file.endswith(".fai"):
+    fasta_index_file = fasta_index_file + ".fai"
+
+fasta_index_file = open(fasta_index_file)
 region_size = int(sys.argv[2])
 
 for line in fasta_index_file:
diff --git a/scripts/freebayes-parallel b/scripts/freebayes-parallel
new file mode 100755
index 0000000..a293846
--- /dev/null
+++ b/scripts/freebayes-parallel
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+if [ $# -lt 3 ];
+then
+    echo "usage: $0 [regions file] [ncpus] [freebayes arguments]"
+    echo
+    echo "Run freebayes in parallel over regions listed in regions file, using ncpus processors."
+    echo "Will merge and sort output, producing a uniform VCF stream on stdout.  Flags to freebayes"
+    echo "which would write to e.g. a particular file will obviously cause problms, so caution is"
+    echo "encouraged when using this script."
+    echo
+    echo "examples:"
+    echo
+    echo "Run freebayes in parallel on 100000bp chunks of the ref (fasta_generate_regions.py is also"
+    echo "located in the scripts/ directory in the freebayes distribution).  Use 36 threads."
+    echo
+    echo "    freebayes-parallel <(fasta_generate_regions.py ref.fa.fai 100000) 36 -f ref.fa aln.bam >out.vcf"
+    echo
+    echo "Generate regions that are equal in terms of data content, and thus have lower variance"
+    echo "in runtime.  This will yield better resource utilization."
+    echo
+    echo "    bamtools coverage -in aln.bam | coverage_to_regions.py ref.fa 500 >ref.fa.500.regions"
+    echo "    freebayes-parallel ref.fa.500.regions 36 -f ref.fa aln.bam >out.vcf"
+    echo
+    exit
+fi
+
+regionsfile=$1
+shift
+ncpus=$1
+shift
+
+command="freebayes $@"
+
+(
+#$command | head -100 | grep "^#" # generate header
+# iterate over regions using gnu parallel to dispatch jobs
+cat $regionsfile | parallel -k -j $ncpus "$command --region {}" 
+) | vcffirstheader \
+    | vcfstreamsort -w 1000 | vcfuniq # remove duplicates at region edges
diff --git a/src/Allele.cpp b/src/Allele.cpp
index b4a7a97..2c58c79 100644
--- a/src/Allele.cpp
+++ b/src/Allele.cpp
@@ -315,6 +315,8 @@ string stringForAllele(const Allele &allele) {
             << allele.basesRight;
     } else {
         out << allele.typeStr() << ":"
+            << allele.cigar << ":"
+            << scientific << fixed << allele.position << ":"
             << allele.length << ":"
             << allele.alternateSequence;
     }
@@ -427,6 +429,8 @@ ostream &operator<<(ostream &out, Allele &allele) {
         out.precision(prec);
     } else {
         out << allele.typeStr() 
+            << ":" << allele.cigar
+            << ":" << scientific << fixed << allele.position
             << ":" << allele.length 
             << ":" << (string) allele.alternateSequence;
     }
diff --git a/src/AlleleParser.cpp b/src/AlleleParser.cpp
index 3d9b6dd..5a2b79c 100644
--- a/src/AlleleParser.cpp
+++ b/src/AlleleParser.cpp
@@ -19,7 +19,11 @@
 
 // must-see error messages
 #define ERROR(msg) \
-    cerr << msg << endl;
+    cerr << "ERROR(freebayes): " << msg << endl;
+
+// must-see warning messages
+#define WARNING(msg) \
+    cerr << "WARNING(freebayes): " << msg << endl;
 
 using namespace std;
 
@@ -41,15 +45,18 @@ void AlleleParser::openBams(void) {
     if (parameters.useStdin) {
         if (!bamMultiReader.Open(parameters.bams)) {
             ERROR("Could not read BAM data from stdin");
+            cerr << bamMultiReader.GetErrorString() << endl;
             exit(1);
         }
     } else {
         if (!bamMultiReader.Open(parameters.bams)) {
             ERROR("Could not open input BAM files");
+            cerr << bamMultiReader.GetErrorString() << endl;
             exit(1);
         } else {
             if (!bamMultiReader.LocateIndexes()) {
                 ERROR("Opened BAM reader without index file, jumping is disabled.");
+                cerr << bamMultiReader.GetErrorString() << endl;
                 if (!targets.empty()) {
                     ERROR("Targets specified but no BAM index file provided.");
                     ERROR("FreeBayes cannot jump through targets in BAM files without BAM index files, exiting.");
@@ -62,6 +69,7 @@ void AlleleParser::openBams(void) {
         }
         if (!bamMultiReader.SetExplicitMergeOrder(bamMultiReader.MergeByCoordinate)) {
             ERROR("could not set sort order to coordinate");
+            cerr << bamMultiReader.GetErrorString() << endl;
             exit(1);
         }
     }
@@ -400,6 +408,10 @@ string AlleleParser::vcfHeader() {
         //<< "##INFO=<ID=RR,Number=1,Type=Integer,Description=\"Reads Placed Right: number of reads supporting the alternate balanced to the right (3') of the alternate allele\">" << endl
         << "##INFO=<ID=RPP,Number=A,Type=Float,Description=\"Read Placement Probability: Phred-scaled upper-bounds estimate of the probability of observing the deviation between RPL and RPR given E(RPL/RPR) ~ 0.5, derived using Hoeffding's inequality\">" << endl
         << "##INFO=<ID=RPPR,Number=1,Type=Float,Description=\"Read Placement Probability for reference observations: Phred-scaled upper-bounds estimate of the probability of observing the deviation between RPL and RPR given E(RPL/RPR) ~ 0.5, derived using Hoeffding's inequality\">" << endl
+        << "##INFO=<ID=RPL,Number=A,Type=Float,Description=\"Reads Placed Left: number of reads supporting the alternate balanced to the left (5') of the alternate allele\">" << endl
+        //<< "##INFO=<ID=RPLR,Number=A,Type=Float,Description=\"Reads Placed Left: number of reads supporting the alternate balanced to the left (5') of the alternate allele\">" << endl
+        << "##INFO=<ID=RPR,Number=A,Type=Float,Description=\"Reads Placed Right: number of reads supporting the alternate balanced to the right (3') of the alternate allele\">" << endl
+        //<< "##INFO=<ID=RPRR,Number=A,Type=Float,Description=\"Reads Placed Right: number of reads supporting the alternate balanced to the right (3') of the alternate allele\">" << endl
         //<< "##INFO=<ID=EL,Number=1,Type=Integer,Description=\"Allele End Left: number of observations of the alternate where the alternate occurs in the left end of the read\">" << endl
         //<< "##INFO=<ID=ER,Number=1,Type=Integer,Description=\"Allele End Right: number of observations of the alternate where the alternate occurs in the right end of the read\">" << endl
         << "##INFO=<ID=EPP,Number=A,Type=Float,Description=\"End Placement Probability: Phred-scaled upper-bounds estimate of the probability of observing the deviation between EL and ER given E(EL/ER) ~ 0.5, derived using Hoeffding's inequality\">" << endl
@@ -493,6 +505,7 @@ void AlleleParser::setupVCFInput(void) {
     if (!parameters.variantPriorsFile.empty()) {
         variantCallInputFile.open(parameters.variantPriorsFile);
         currentVariant = new vcf::Variant(variantCallInputFile);
+        usingVariantInputAlleles = true;
 
         // get sample names from VCF input file
         //
@@ -511,8 +524,8 @@ void AlleleParser::setupVCFInput(void) {
 
     // haplotype alleles for constructing haplotype alleles
     if (!parameters.haplotypeVariantFile.empty()) {
-	haplotypeVariantInputFile.open(parameters.haplotypeVariantFile);
-	usingHaplotypeBasisAlleles = true;
+        haplotypeVariantInputFile.open(parameters.haplotypeVariantFile);
+        usingHaplotypeBasisAlleles = true;
     }
 }
 
@@ -567,9 +580,9 @@ void AlleleParser::loadReferenceSequence(BamAlignment& alignment) {
 void AlleleParser::loadReferenceSequence(BedTarget* target, int before, int after) {
     basesBeforeCurrentTarget = before;
     basesAfterCurrentTarget = after;
-    DEBUG2("loading reference subsequence " << target->seq << " from " << target->left << " - " << before << " to " << target->right << " + " << after << " + before");
+    DEBUG2("loading reference subsequence " << target->seq << " from " << target->left << " - " << before << " to " << target->right + 1 << " + " << after << " + before");
     string name = reference.sequenceNameStartingWith(target->seq);
-    currentSequence = uppercase(reference.getSubSequence(name, (target->left - 1) - before, (target->right - target->left) + after + before));
+    currentSequence = uppercase(reference.getSubSequence(name, target->left - before, (target->right + 1 - target->left) + after + before));
     currentReferenceBase = currentReferenceBaseChar();
 }
 
@@ -704,7 +717,7 @@ void AlleleParser::loadTargets(void) {
                 if (foundRangeSep + sep.size() != region.size()) {
                     stopPos = atoi(region.substr(foundRangeSep + sep.size()).c_str()); // end-exclusive, bed-format
                 } else {
-                    stopPos = reference.sequenceLength(startSeq);
+                    stopPos = -1;
                 }
             }
         }
@@ -716,7 +729,7 @@ void AlleleParser::loadTargets(void) {
         BedTarget bd(startSeq,
                     (startPos == 0) ? 0 : startPos,
                     (stopPos == -1) ? reference.sequenceLength(startSeq) : stopPos - 1); // internally, we use 0-base inclusive end
-        DEBUG("will process reference sequence " << startSeq << ":" << bd.left << ".." << bd.right);
+        DEBUG("will process reference sequence " << startSeq << ":" << bd.left << ".." << bd.right + 1);
         targets.push_back(bd);
         bedReader.targets.push_back(bd);
 
@@ -725,7 +738,8 @@ void AlleleParser::loadTargets(void) {
     // check validity of targets wrt. reference
     for (vector<BedTarget>::iterator e = targets.begin(); e != targets.end(); ++e) {
         BedTarget& bd = *e;
-        if (bd.left < 0 || bd.right - 1 > reference.sequenceLength(bd.seq)) {
+        // internally, we use 0-base inclusive end
+        if (bd.left < 0 || bd.right > reference.sequenceLength(bd.seq)) {
             ERROR("Target region coordinates (" << bd.seq << " "
                     << bd.left << " " << bd.right
                     << ") outside of reference sequence bounds ("
@@ -733,7 +747,7 @@ void AlleleParser::loadTargets(void) {
             exit(1);
         }
         if (bd.right < bd.left) {
-            ERROR("Invalid target region coordinates (" << bd.seq << " " << bd.left << " " << bd.right << ")"
+            ERROR("Invalid target region coordinates (" << bd.seq << " " << bd.left << " " << bd.right + 1 << ")"
                     << " right bound is lower than left bound!");
             exit(1);
         }
@@ -754,8 +768,8 @@ void AlleleParser::loadTargetsFromBams(void) {
     for( ; refIter != refEnd; ++refIter) {
         RefData refData = *refIter;
         string refName = refData.RefName;
-        BedTarget bd(refName, 0, refData.RefLength); // 0-based half open
-        DEBUG2("will process reference sequence " << refName << ":" << bd.left << ".." << bd.right);
+        BedTarget bd(refName, 0, refData.RefLength); // 0-based inclusive internally
+        DEBUG2("will process reference sequence " << refName << ":" << bd.left << ".." << bd.right + 1);
         targets.push_back(bd);
     }
 }
@@ -817,7 +831,9 @@ bool AlleleParser::inTarget(void) {
     if (targets.empty()) {
         return true;  // everything is in target if we don't have targets
     } else {
-        if (bedReader.targetsOverlap(currentSequenceName, currentPosition, currentPosition + 1)) {
+        // expects 0-based, fully-closed, and we're only checking a single
+        // base, so start == end.
+        if (bedReader.targetsOverlap(currentSequenceName, currentPosition, currentPosition)) {
             return true;
         } else {
             return false;
@@ -838,9 +854,11 @@ AlleleParser::AlleleParser(int argc, char** argv) : parameters(Parameters(argc,
     justSwitchedTargets = false;  // flag to trigger cleanup of Allele*'s and objects after jumping targets
     hasMoreAlignments = true; // flag to track when we run out of alignments in the current target or BAM files
     currentSequenceStart = 0;
-    lastHaplotypeLength = 1;
+    lastHaplotypeLength = 0;
     usingHaplotypeBasisAlleles = false;
+    usingVariantInputAlleles = false;
     rightmostHaplotypeBasisAllelePosition = 0;
+    rightmostInputAllelePosition = 0;
     nullSample = new Sample();
     referenceSampleName = "reference_sample";
 
@@ -918,7 +936,7 @@ string AlleleParser::referenceSubstr(long int pos, unsigned int len) {
 bool AlleleParser::isCpG(string& altbase) {
     // bounds check
     if (floor(currentPosition) - currentSequenceStart - 1 < 0
-            || floor(currentPosition) - currentSequenceStart + 1 > currentSequence.size()) {
+            || floor(currentPosition) - currentSequenceStart + 1 >= currentSequence.size()) {
         return false;
     }
     string prevb = currentSequence.substr(floor(currentPosition) - currentSequenceStart - 1, 1);
@@ -996,10 +1014,12 @@ void RegisteredAlignment::addAllele(Allele newAllele, bool mergeComplex, int max
             }
         } else if (newAllele.isReference()
                    && (newAllele.referenceLength > maxComplexGap
-                       || ( newAllele.referenceLength <= maxComplexGap && newAllele.basesRight <= maxComplexGap )
                        || newAllele.basesRight == 0)) {
             // if the last allele is reference too, we need to combine them!
             if (lastAllele.isReference()) {
+                DEBUG2("addAllele: mergeAllele/1:"
+                    << " lastAllele " << lastAllele.typeStr() << "@" << lastAllele.position << ":" << lastAllele.cigar
+                    << " newAllele "  << newAllele.typeStr()  << "@" << newAllele.position  << ":" << newAllele.cigar);
                 lastAllele.mergeAllele(newAllele, ALLELE_REFERENCE);
                 assert(lastAllele.alternateSequence.size() == lastAllele.baseQualities.size());
             } else if (lastAllele.isComplex() || lastAllele.isMNP() || lastAllele.isSNP()) {
@@ -1015,8 +1035,15 @@ void RegisteredAlignment::addAllele(Allele newAllele, bool mergeComplex, int max
                         string seq; vector<pair<int, string> > cig; vector<short> quals;
                         pAllele.subtractFromEnd(matchlen, seq, cig, quals);
                         alleles.back().subtractFromStart(pAllele.referenceLength, seq, cig, quals);
+                        DEBUG2("addAllele: mergeAllele/2:"
+                           << " lastAllele " << lastAllele.typeStr() << "@" << lastAllele.position << ":" << lastAllele.cigar
+                           << " .back() "    << alleles.back().typeStr() << "@" << alleles.back().position << ":" << alleles.back().cigar
+                           << " newAllele "  << newAllele.typeStr()  << "@" << newAllele.position  << ":" << newAllele.cigar);
                         alleles.back().mergeAllele(newAllele, ALLELE_REFERENCE);
                     } else { // expand the complex allele
+                        DEBUG2("addAllele: mergeAllele/3:"
+                           << " lastAllele " << lastAllele.typeStr() << "@" << lastAllele.position << ":" << lastAllele.cigar
+                           << " newAllele "  << newAllele.typeStr()  << "@" << newAllele.position  << ":" << newAllele.cigar);
                         lastAllele.mergeAllele(newAllele, ALLELE_COMPLEX);
                     }
                 } else {
@@ -1063,15 +1090,55 @@ void RegisteredAlignment::addAllele(Allele newAllele, bool mergeComplex, int max
             // -> complex event or MNP
             if (mergeComplex && lastAllele.position + lastAllele.referenceLength == newAllele.position
                 && !lastAllele.isNull()) {
-                AlleleType atype = ALLELE_COMPLEX;
-                if (lastAllele.isSNP() || lastAllele.isMNP()) {
-                    vector<pair<int, string> > cigar = splitCigar(lastAllele.cigar);
-                    if (cigar.back().second == "X" && newAllele.isSNP() || newAllele.isMNP()) {
-                        atype = ALLELE_MNP;
+
+                vector<pair<int, string> > lastCigar = splitCigar(lastAllele.cigar);
+
+                // If the last allele is complex and ends in a match, we need
+                // to check that after merging the then-embedded match won't be
+                // longer than maxComplexGap.  We do this for every new allele,
+                // since we don't want to allow the complex allele to grow
+                // beyond maxComplexGap before splitting.
+                if (lastAllele.isComplex()
+                    && lastCigar.back().second == "M"
+                    && lastCigar.back().first > maxComplexGap)
+                {
+                    // Break apart the complex allele into one complex and one
+                    // reference allele.
+                    //
+                    // FIXME TODO: The allele may not actually be complex
+                    // anymore after splitting, in which case we should demote
+                    // its type to SNP/MNP/INDEL.
+                    // -trs, 20 Nov 2014
+                    alleles.push_back(lastAllele);
+                    Allele& pAllele = alleles.at(alleles.size() - 2);
+                    string seq; vector<pair<int, string> > cig; vector<short> quals;
+                    pAllele.subtractFromEnd(lastCigar.back().first, seq, cig, quals);
+                    alleles.back().subtractFromStart(pAllele.referenceLength, seq, cig, quals);
+
+                    if (newAllele.isReference()) {
+                        DEBUG2("addAllele: mergeAllele/5:"
+                            << " lastAllele " << lastAllele.typeStr()     << "@" << lastAllele.position     << ":" << lastAllele.cigar
+                            << " .back() "    << alleles.back().typeStr() << "@" << alleles.back().position << ":" << alleles.back().cigar
+                            << " newAllele "  << newAllele.typeStr()      << "@" << newAllele.position      << ":" << newAllele.cigar);
+                        alleles.back().mergeAllele(newAllele, ALLELE_REFERENCE);
+                    } else {
+                        alleles.push_back(newAllele);
+                    }
+                } else {
+                    AlleleType atype = ALLELE_COMPLEX;
+                    if (lastAllele.isSNP() || lastAllele.isMNP()) {
+                        if (lastCigar.back().second == "X" && newAllele.isSNP() || newAllele.isMNP()) {
+                            atype = ALLELE_MNP;
+                        }
                     }
+
+                    DEBUG("addAllele: mergeAllele/4:"
+                       << " lastAllele " << lastAllele.typeStr() << "@" << lastAllele.position << ":" << lastAllele.cigar
+                       << " newAllele "  << newAllele.typeStr()  << "@" << newAllele.position  << ":" << newAllele.cigar);
+
+                    lastAllele.mergeAllele(newAllele, atype);
+                    assert(lastAllele.alternateSequence.size() == lastAllele.baseQualities.size());
                 }
-                lastAllele.mergeAllele(newAllele, atype);
-                assert(lastAllele.alternateSequence.size() == lastAllele.baseQualities.size());
             } else {
                 alleles.push_back(newAllele);
             }
@@ -1086,9 +1153,12 @@ void AlleleParser::updateHaplotypeBasisAlleles(long int pos, int referenceLength
         stringstream r;
         //r << currentSequenceName << ":" << rightmostHaplotypeBasisAllelePosition << "-" << pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW;
         //cerr << "getting variants in " << r.str() << endl;
+
+        // tabix expects 1-based, fully closed regions for ti_parse_region()
+        // (which is what setRegion() calls eventually)
         if (haplotypeVariantInputFile.setRegion(currentSequenceName,
-                                                rightmostHaplotypeBasisAllelePosition,
-                                                pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW)) {
+                                                rightmostHaplotypeBasisAllelePosition + 1,
+                                                pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW + 1)) {
             //cerr << "the vcf line " << haplotypeVariantInputFile.line << endl;
             // get the variants in the target region
             vcf::Variant var(haplotypeVariantInputFile);
@@ -1119,8 +1189,9 @@ void AlleleParser::updateHaplotypeBasisAlleles(long int pos, int referenceLength
 
             }
         } else {
-            ERROR("Could not set haplotype-basis VCF file to target region");
-            exit(1);
+            // indicates empty region
+            //ERROR("Could not set haplotype-basis VCF file to target region");
+            //exit(1);
         }
         // set the rightmost haplotype position to trigger the next update
         rightmostHaplotypeBasisAllelePosition = pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW;
@@ -1128,7 +1199,7 @@ void AlleleParser::updateHaplotypeBasisAlleles(long int pos, int referenceLength
 }
 
 
-bool AlleleParser::allowedAllele(long int pos, string& ref, string& alt) {
+bool AlleleParser::allowedHaplotypeBasisAllele(long int pos, string& ref, string& alt) {
     // check the haplotypeBasisAllele map for membership of the allele in question in the current sequence
     //cerr << "is allowed: " << pos << " " << ref << "/" << alt << " ?" << endl;
     if (!usingHaplotypeBasisAlleles) {
@@ -1199,20 +1270,20 @@ Allele AlleleParser::makeAllele(RegisteredAlignment& ra,
     // if not, adjust the allele so that it's a reference allele with preset BQ and length
     // in effect, this means creating a reference allele of the reference length of the allele with 0 BQ
 
+    // NB, if we are using haplotype basis alleles the algorithm forces
+    // alleles that aren't in the haplotype basis set into the reference space
     if (type != ALLELE_REFERENCE
         && type != ALLELE_NULL 
-        && !allowedAllele(pos + 1,
-                          refSequence,
-                          readSequence)) {
-        //type = ALLELE_REFERENCE;
-        type = ALLELE_NULL;
-        // ... commented out so we don't force everything into reference space (what about indels??)
+        && !allowedHaplotypeBasisAllele(pos + 1,
+                                        refSequence,
+                                        readSequence)) {
+        type = ALLELE_REFERENCE;
         length = referenceLengthFromCigar(cigar);
-        cigar = convert(length) + "N";
+        cigar = convert(length) + "M";
         // by adjusting the cigar, we implicitly adjust
         // allele.referenceLength, which is calculated when the allele is made
         qualstr = string(length, qualityInt2Char(0));
-        readSequence = string(length, 'N');
+        readSequence = currentSequence.substr(pos - currentSequenceStart, length);
     }
 
     // cache information about repeat structure in the alleles, to
@@ -1321,6 +1392,10 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
         updateHaplotypeBasisAlleles(sp, alignment.AlignedBases.size());
     }
 
+    if (usingVariantInputAlleles) {
+         updateInputVariants(sp, alignment.AlignedBases.size());
+    }
+
 #ifdef VERBOSE_DEBUG
     if (parameters.debug2) {
         DEBUG2("registering alignment " << rp << " " << csp << " " << sp << endl <<
@@ -1403,9 +1478,10 @@ RegisteredAlignment& AlleleParser::registerAlignment(BamAlignment& alignment, Re
                     b = rDna.at(rp);
                 } catch (std::out_of_range outOfRange) {
                     cerr << "Exception: Cannot read past the end of the alignment's sequence." << endl
+                         << alignment.Name << endl
+                         << currentSequenceName << ":" << (long unsigned int) currentPosition + 1 << endl
                          << alignment.AlignedBases << endl
-                         << currentSequence.substr(csp, alignment.AlignedBases.size()) << endl
-                         << currentSequenceName << ":" << (long unsigned int) currentPosition + 1 << endl;
+                         << currentSequence.substr(csp, alignment.AlignedBases.size()) << endl;
                     abort();
                 }
 
@@ -1937,20 +2013,28 @@ void AlleleParser::updateAlignmentQueue(long int position,
             }
 
             // skip this alignment if we are not using duplicate reads (we remove them by default)
-            if (currentAlignment.IsDuplicate() && !parameters.useDuplicateReads)
+            if (currentAlignment.IsDuplicate() && !parameters.useDuplicateReads) {
+                //DEBUG("skipping alignment " << currentAlignment.Name << " because it is a duplicate read");
                 continue;
+            }
 
             // skip unmapped alignments, as they cannot be used in the algorithm
-            if (!currentAlignment.IsMapped())
+            if (!currentAlignment.IsMapped()) {
+                //DEBUG("skipping alignment " << currentAlignment.Name << " because it is not mapped");
                 continue;
+            }
 
             // skip alignments which have no aligned bases
-            if (currentAlignment.AlignedBases.size() == 0)
+            if (currentAlignment.AlignedBases.size() == 0) {
+                //DEBUG("skipping alignment " << currentAlignment.Name << " because it has no aligned bases");
                 continue;
+            }
 
             // skip alignments which are non-primary
-            if (!currentAlignment.IsPrimaryAlignment())
+            if (!currentAlignment.IsPrimaryAlignment()) {
+                //DEBUG("skipping alignment " << currentAlignment.Name << " because it is not marked primary");
                 continue;
+            }
 
             if (!gettingPartials && currentAlignment.GetEndPosition() < position) {
                 cerr << currentAlignment.Name << " at " << currentSequenceName << ":" << currentAlignment.Position << " is out of order!"
@@ -1963,7 +2047,7 @@ void AlleleParser::updateAlignmentQueue(long int position,
             // such as mismatches
 
             // initially skip reads with low mapping quality (what happens if MapQuality is not in the file)
-            if (currentAlignment.MapQuality > parameters.MQL0) {
+            if (currentAlignment.MapQuality >= parameters.MQL0) {
                 // extend our cached reference sequence to allow processing of this alignment
                 extendReferenceSequence(currentAlignment);
                 // left realign indels
@@ -1987,7 +2071,7 @@ void AlleleParser::updateAlignmentQueue(long int position,
                 // here we get the deque of alignments ending at this alignment's end position
                 deque<RegisteredAlignment>& rq = registeredAlignments[currentAlignment.GetEndPosition()];
                 // and insert the registered alignment into that deque
-                rq.push_front(RegisteredAlignment(currentAlignment));
+                rq.push_front(RegisteredAlignment(currentAlignment, parameters));
                 RegisteredAlignment& ra = rq.front();
                 registerAlignment(currentAlignment, ra, sampleName, sequencingTech);
                 // backtracking if we have too many mismatches
@@ -2029,8 +2113,11 @@ void AlleleParser::updateRegisteredAlleles(void) {
     // http://stackoverflow.com/questions/347441/erasing-elements-from-a-vector
     vector<Allele*>& alleles = registeredAlleles;
 
+
     for (vector<Allele*>::iterator allele = alleles.begin(); allele != alleles.end(); ++allele) {
         long unsigned int position = (*allele)->position;
+        // note that this will underflow if currentPosition == 0 and lastHaplotypeLength > 0
+        // resolved by setting lastHaplotypeLength = 0 in init, and when we switch targets
         if (currentPosition - lastHaplotypeLength > position + (*allele)->referenceLength) {
             *allele = NULL;
         }
@@ -2052,17 +2139,40 @@ void AlleleParser::updateRegisteredAlleles(void) {
     }
 }
 
-void AlleleParser::updateInputVariants(void) {
+void AlleleParser::updateInputVariants(long int pos, int referenceLength) {
 
-    if (variantCallInputFile.is_open()) {
+    //cerr << "updating input variants (?) " << pos << " + " << referenceLength << " >? " << rightmostInputAllelePosition << endl;
+    if (!usingVariantInputAlleles) return;
+
+    if (pos + referenceLength > rightmostInputAllelePosition) {
+        long int start = rightmostInputAllelePosition;
+        if (start == 0) {
+            start = rightmostHaplotypeBasisAllelePosition;
+        }
+
+        //stringstream r;
+        //r << currentSequenceName << ":" << start
+        //  << "-" << pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW;
+        //cerr << "getting variants in " << r.str() << endl;
 
-        if (hasMoreVariants && currentVariant->position - 1 <= currentPosition && currentVariant->sequenceName == currentSequenceName) {
-            do {
-                DEBUG2("getting input alleles from input VCF at position " << currentVariant->sequenceName << ":" << currentVariant->position);
+        // tabix expects 1-based, fully closed regions for ti_parse_region()
+        // (which is what setRegion() calls eventually)
+        if (variantCallInputFile.setRegion(currentSequenceName,
+                                           start + 1,
+                                           pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW + 1)) {
+            //cerr << "the vcf line " << haplotypeVariantInputFile.line << endl;
+            // get the variants in the target region
+            vcf::Variant var(variantCallInputFile);
+            bool ok;
+            while (ok = variantCallInputFile.getNextVariant(*currentVariant)) {
+
+                DEBUG("getting input alleles from input VCF at position " << currentVariant->sequenceName << ":" << currentVariant->position);
                 long int pos = currentVariant->position - 1;
                 // get alternate alleles
                 bool includePreviousBaseForIndels = true;
-                map<string, vector<vcf::VariantAllele> > variantAlleles = currentVariant->parsedAlternates(includePreviousBaseForIndels);
+                map<string, vector<vcf::VariantAllele> > variantAlleles = currentVariant->parsedAlternates();
+                // TODO this would be a nice option: why does it not work?
+                //map<string, vector<vcf::VariantAllele> > variantAlleles = currentVariant->flatAlternates();
                 vector< vector<vcf::VariantAllele> > orderedVariantAlleles;
                 for (vector<string>::iterator a = currentVariant->alt.begin(); a != currentVariant->alt.end(); ++a) {
                     orderedVariantAlleles.push_back(variantAlleles[*a]);
@@ -2078,7 +2188,6 @@ void AlleleParser::updateInputVariants(void) {
                     vector<Allele> alleles;
 
                     for (vector<vcf::VariantAllele>::iterator v = altAllele.begin(); v != altAllele.end(); ++v) {
-
                         vcf::VariantAllele& variant = *v;
                         long int allelePos = variant.position - 1;
                         AlleleType type;
@@ -2111,112 +2220,50 @@ void AlleleParser::updateInputVariants(void) {
                             }
                             cigar = convert(len) + "X";
                         } else if (variant.ref.size() > variant.alt.size()) {
-                            len = variant.ref.size() - variant.alt.size();
-                            reflen = len;
                             type = ALLELE_DELETION;
-                            cigar = convert(len) + "D";
+                            len = variant.ref.size() - variant.alt.size();
+                            allelePos -= 1;
+                            reflen = len + 2;
+                            alleleSequence =
+                                uppercase(reference.getSubSequence(currentSequenceName, allelePos, 1))
+                                + alleleSequence
+                                + uppercase(reference.getSubSequence(currentSequenceName, allelePos+1+len, 1));
+                            cigar = "1M" + convert(len) + "D" + "1M";
                         } else {
-                            len = variant.alt.size() - variant.ref.size();
-                            reflen = 0;
+                            // we always include the flanking bases for these elsewhere, so here too in order to be consistent and trigger use
                             type = ALLELE_INSERTION;
-                            cigar = convert(len) + "I";
+                            // add previous base and post base to match format typically used for calling
+                            allelePos -= 1;
+                            alleleSequence =
+                                uppercase(reference.getSubSequence(currentSequenceName, allelePos, 1))
+                                + alleleSequence
+                                + uppercase(reference.getSubSequence(currentSequenceName, allelePos+1, 1));
+                            len = variant.alt.size() - var.ref.size();
+                            cigar = "1M" + convert(len) + "I" + "1M";
+                            reflen = 2;
                         }
+                        // TODO deal with complex subs
 
                         Allele allele = genotypeAllele(type, alleleSequence, (unsigned int) len, cigar, (unsigned int) reflen, allelePos);
                         DEBUG("input allele: " << allele);
-                        alleles.push_back(allele);
 
-                    }
+                        //alleles.push_back(allele);
+                        genotypeAlleles.push_back(allele);
 
-                    // Variant::parsedAlternates() only gives us alternate alleles
-                    // for now, add reference sequences back between the alternates here
-                    if (alleles.size() > 1) {
-                        vector<Allele> newAlleles;
-                        vector<Allele>::iterator p = alleles.begin();
-                        for (vector<Allele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
-                            if (p != a) {
-                                if (p->position + p->referenceLength < a->position) {
-                                    // insert a reference allele
-                                    long int pend = p->position + p->referenceLength;
-                                    string refsequence = uppercase(reference.getSubSequence(currentVariant->sequenceName, pend, a->position - pend));
-                                    string cigar = convert(refsequence.size()) + "M";
-                                    Allele refAllele = genotypeAllele(ALLELE_REFERENCE, refsequence, refsequence.size(), cigar, refsequence.size(), pend);
-                                    newAlleles.push_back(refAllele);
-                                }
-                            }
-                            newAlleles.push_back(*a);
-                            p = a;
+                        if (allele.type != ALLELE_REFERENCE) {
+                            inputVariantAlleles[allele.position].push_back(allele);
+                            alternatePositions.insert(allele.position);
                         }
-                        alleles = newAlleles;
-                        DEBUG2("alleles, post addition of reference sequences: " << alleles);
-                    }
 
-                    // for any indel alleles, grab the flanking bases to match VCF standard
-                    /*
-                    vector<Allele>::iterator p = alleles.begin();
-                    for (vector<Allele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
-                        if (a->isDeletion()) {
-                            if (p != a) {
-                                if (p->isReference()) {
-                                    string seq; vector<pair<int, string> > cig; vector<short> quals;
-                                    p->subtractFromEnd(1, seq, cig, quals);
-                                    a->addToStart(seq, cig, quals);
-                                }
-                                // they will be merged otherwise in the complex merge step below
-                            } else {
-                                // add the previous reference base
-                                vector<short> quals;
-                                quals.assign(1, 0);
-                                vector<pair<int, string> > cig;
-                                cig.push_back(make_pair(1, "M"));
-                                string seq = uppercase(reference.getSubSequence(currentVariant->sequenceName, a->position - 1, 1));
-                                a->addToStart(seq, cig, quals);
-                            }
-                        }
-                        p = a;
-                    }
-                    DEBUG2("alleles, post processing of deletions: " << alleles);
-                    */
-
-                    // remove 0-length alleles resulting from edge cases in previous processing (e.g. beginning of read)
-                    if (alleles.size() > 1) {
-                        vector<Allele> newAlleles;
-                        for (vector<Allele>::iterator a = alleles.begin(); a != alleles.end(); ++a) {
-                            if (a->referenceLength > 0) {
-                                newAlleles.push_back(*a);
-                            }
-                        }
-                        alleles = newAlleles;
-                    }
-
-                    Allele& allele = alleles.front();
-                    if (alleles.size() > 1) {
-                        vector<Allele>::iterator a = alleles.begin(); ++a;
-                        for (; a != alleles.end(); ++a) {
-                            if (a->referenceLength > 0) {
-                                allele.mergeAllele(*a, ALLELE_COMPLEX);
-                            }
-                        }
-                    }
-
-                    inputVariantAlleles[allele.position].push_back(allele);
-                    genotypeAlleles.push_back(allele);
-
-                    if (allele.position + 1 != currentVariant->position) {
-                        cerr << "parsed allele position is not the same as the variant position!" << endl;
-                        cerr << *currentVariant << endl;
-                        cerr << allele << endl;
-                        exit(1);
-                    }
-
-                    if (allele.type != ALLELE_REFERENCE) {
-                        alternatePositions.insert(allele.position);
                     }
 
                 }
 
                 // store the allele counts, if they are provided
                 //
+                continue;
+
+                // xxx everything here is skipped
                 if (currentVariant->info.find("AC") != currentVariant->info.end()
                     && currentVariant->info.find("AN") != currentVariant->info.end()) {
                     vector<string>& afstrs = currentVariant->info["AC"];
@@ -2245,8 +2292,9 @@ void AlleleParser::updateInputVariants(void) {
                     }
                 }
 
-                if (currentVariant->samples.empty())
+                if (currentVariant->samples.empty()) {
                     continue;
+                }
 
                 if (alternatePositions.size() == 1) {
 
@@ -2334,10 +2382,21 @@ void AlleleParser::updateInputVariants(void) {
                     */
                 }
 
-            } while ((hasMoreVariants = variantCallInputFile.getNextVariant(*currentVariant))
-                    && currentVariant->position - 1 <= currentPosition
-                    && currentVariant->sequenceName == currentSequenceName);
+            }
+
+            if (!ok) hasMoreVariants = false;
+        }
+        /*
+        for (map<long int, vector<Allele> >::iterator v = inputVariantAlleles.begin(); v != inputVariantAlleles.end(); ++v) {
+            vector<Allele>& iv = v->second;
+            cerr << "input variants pos = " << v->first << endl;
+            for (vector<Allele>::iterator a = iv.begin(); a != iv.end(); ++a) {
+                cerr << *a << endl;
+            }
         }
+        */
+        //rightmostHaplotypeBasisAllelePosition = pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW;
+        rightmostInputAllelePosition = pos + referenceLength + CACHED_BASIS_HAPLOTYPE_WINDOW;
     }
 
 }
@@ -2539,7 +2598,10 @@ void AlleleParser::removePreviousAlleles(vector<Allele*>& alleles) {
 // false otherwise
 bool AlleleParser::toNextTarget(void) {
 
-    DEBUG2("seeking to next target with alignments...");
+    DEBUG("seeking to next target with alignments...");
+
+    // reset haplotype length; there is no last call in this sequence; it isn't relevant
+    lastHaplotypeLength = 0;
 
     // XXX
     // XXX
@@ -2553,8 +2615,9 @@ bool AlleleParser::toNextTarget(void) {
         bool ok = false;
 
         // try to load the first target if we need to
-        if (!currentTarget)
+        if (!currentTarget) {
             ok = loadTarget(&targets.front()) && getFirstAlignment();
+        }
 
         // step through targets until we get to one with alignments
         while (!ok && currentTarget != &targets.back()) {
@@ -2566,16 +2629,26 @@ bool AlleleParser::toNextTarget(void) {
             }
         }
 
-        if (!ok) return false; // last target and couldn't get alignment
+        if (ok) {
+
+            // XXX hack
+            clearRegisteredAlignments();
+            currentSequenceStart = currentAlignment.Position;
+            currentSequenceName = referenceIDToName[currentAlignment.RefID];
+            currentRefID = currentAlignment.RefID;
+            currentPosition = (currentPosition < currentAlignment.Position) ? currentAlignment.Position : currentPosition;
+            currentSequence = uppercase(reference.getSubSequence(currentSequenceName, currentSequenceStart, currentAlignment.Length));
+            rightmostHaplotypeBasisAllelePosition = currentPosition;
+
+        } else {
+            if (hasMoreVariants) {
+                DEBUG("continuing because we have more variants");
+                return true;
+            } else {
+                return false; // last target, no variants, and couldn't get alignment
+            }
+        }
 
-        // XXX hack
-        clearRegisteredAlignments();
-        currentSequenceStart = currentAlignment.Position;
-        currentSequenceName = referenceIDToName[currentAlignment.RefID];
-        currentRefID = currentAlignment.RefID;
-        currentPosition = (currentPosition < currentAlignment.Position) ? currentAlignment.Position : currentPosition;
-        currentSequence = uppercase(reference.getSubSequence(currentSequenceName, currentSequenceStart, currentAlignment.Length));
-        rightmostHaplotypeBasisAllelePosition = currentPosition;
 
     // stdin, no targets cases
     } else if (!currentTarget && (parameters.useStdin || targets.empty())) {
@@ -2614,7 +2687,7 @@ bool AlleleParser::loadTarget(BedTarget* target) {
 
     DEBUG("processing target " << currentTarget->desc << " " <<
             currentTarget->seq << " " << currentTarget->left << " " <<
-            currentTarget->right);
+            currentTarget->right + 1);
     DEBUG2("loading target reference subsequence");
 
     currentSequenceName = currentTarget->seq;
@@ -2627,32 +2700,32 @@ bool AlleleParser::loadTarget(BedTarget* target) {
     currentPosition = currentTarget->left;
     rightmostHaplotypeBasisAllelePosition = currentTarget->left;
 
-    if (!bamMultiReader.SetRegion(refSeqID, currentTarget->left, refSeqID, currentTarget->right - 1)) {  // TODO is bamtools taking 0/1 basing?
-        ERROR("Could not SetRegion to " << currentTarget->seq << ":" << currentTarget->left << ".." << currentTarget->right);
+    if (!bamMultiReader.SetRegion(refSeqID, currentTarget->left, refSeqID, currentTarget->right + 1)) { // bamtools expects 0-based, half-open
+        ERROR("Could not SetRegion to " << currentTarget->seq << ":" << currentTarget->left << ".." << currentTarget->right + 1);
+        cerr << bamMultiReader.GetErrorString() << endl;
         return false;
     }
 
     if (variantCallInputFile.is_open()) {
         stringstream r;
-        r << currentTarget->seq << ":" << max(0, currentTarget->left - 1) << "-" << currentTarget->right - 1;
+        // tabix expects 1-based, fully closed regions for ti_parse_region()
+        // (which is what setRegion() calls eventually)
+        r << currentTarget->seq << ":" << currentTarget->left + 1 << "-" << currentTarget->right + 1;
         if (!variantCallInputFile.setRegion(r.str())) {
-            ERROR("Could not set the region of the variants input file to " <<
+            WARNING("Could not set the region of the variants input file to " <<
                     currentTarget->seq << ":" << currentTarget->left << ".." <<
-                    currentTarget->right);
-            return false;
+                    currentTarget->right + 1);
+            //return false;
         } else {
             DEBUG("set region of variant input file to " << 
                     currentTarget->seq << ":" << currentTarget->left << ".." <<
-                    currentTarget->right);
+                    currentTarget->right + 1);
         }
     }
 
     // now that we've jumped, reset the hasMoreAlignments counter
     hasMoreAlignments = true;
 
-    // same for the variants record
-    hasMoreVariants = true;
-
     DEBUG2("set region");
 
     return true;
@@ -2677,9 +2750,9 @@ bool AlleleParser::getFirstAlignment(void) {
         DEBUG2("got first alignment in target region");
     } else {
         if (currentTarget) {
-            ERROR("Could not find any mapped reads in target region " << currentSequenceName << ":" << currentTarget->left << ".." << currentTarget->right);
+            WARNING("Could not find any mapped reads in target region " << currentSequenceName << ":" << currentTarget->left << ".." << currentTarget->right + 1);
         } else {
-            ERROR("Could not find any mapped reads in target region " << currentSequenceName);
+            WARNING("Could not find any mapped reads in target region " << currentSequenceName);
         }
         return false;
     }
@@ -2690,10 +2763,12 @@ bool AlleleParser::getFirstAlignment(void) {
 
 bool AlleleParser::getFirstVariant(void) {
 
-    hasMoreVariants = true;
+    hasMoreVariants = false;
     if (variantCallInputFile.is_open()) {
         if (!variantCallInputFile.getNextVariant(*currentVariant)) {
             hasMoreVariants = false;
+        } else {
+            hasMoreVariants = true;
         }
 
         if (hasMoreVariants) {
@@ -2737,47 +2812,54 @@ void AlleleParser::clearRegisteredAlignments(void) {
 //
 bool AlleleParser::toNextPosition(void) {
 
+    // either bail out
     if (currentSequenceName.empty()) {
         DEBUG("loading first target");
         if (!toNextTarget()) {
             return false;
         }
     } 
+    // or step to the next position
     else {
         ++currentPosition;
     }
 
-    if (!targets.empty() && (
-                (!parameters.allowIndels && currentPosition >= currentTarget->right)
-                || currentPosition > currentTarget->right - 1)) { // time to move to a new target
-        DEBUG("next position " << (long int) currentPosition + 1 <<  " outside of current target right bound " << currentTarget->right + 1);
+    // if we've run off the right edge of a target
+    if (!targets.empty() && currentPosition > currentTarget->right) { // time to move to a new target
+        DEBUG("next position " << (long int) currentPosition <<  " outside of current target right bound " << currentTarget->right + 1);
+        // try to get to the next one, and if this fails, bail out
         if (!toNextTarget()) {
             DEBUG("no more targets, finishing");
             return false;
         }
     }
 
-    // stdin, no targets case
-    if (parameters.useStdin || targets.empty()) {
-        // TODO rectify this with the other copies of this stanza...
+    // in the stdin, or no targets case
+    // here we assume we are processing an entire BAM or one contiguous region
+    if ((parameters.useStdin && targets.empty()) || targets.empty()) {
         // implicit step of target sequence
         // XXX this must wait for us to clean out all of our alignments at the end of the target
+
+        // here we loop over unaligned reads at the beginning of a target
+        // we need to get to a mapped read to figure out where we are
         while (hasMoreAlignments && !currentAlignment.IsMapped()) {
             hasMoreAlignments = bamMultiReader.GetNextAlignment(currentAlignment);
         }
+        // now, if the current position of this alignment is outside of the reference sequence length, switch references
         if (hasMoreAlignments) {
-            if (currentPosition > reference.sequenceLength(currentSequenceName)
+            if (currentPosition >= reference.sequenceLength(currentSequenceName)
                 || registeredAlignments.empty() && currentRefID != currentAlignment.RefID) {
                 DEBUG("at end of sequence");
                 clearRegisteredAlignments();
                 loadReferenceSequence(currentAlignment);
                 justSwitchedTargets = true;
             }
+        // here, if we have run out of alignments
         } else if (!hasMoreAlignments) {
             if (registeredAlignments.empty()) {
                 DEBUG("no more alignments in input");
                 return false;
-            } else if (currentPosition > currentSequence.size() + currentSequenceStart) {
+            } else if (currentPosition >= currentSequence.size() + currentSequenceStart) {
                 DEBUG("at end of sequence");
                 return false;
             }
@@ -2796,7 +2878,9 @@ bool AlleleParser::toNextPosition(void) {
     updateAlignmentQueue(currentPosition, newAlleles);
     addToRegisteredAlleles(newAlleles);
     DEBUG2("updating variants");
-    updateInputVariants();
+    // done typically at each new read, but this handles the case where there is no data for a while
+    updateInputVariants(currentPosition, 1);
+
     DEBUG2("updating registered alleles");
     updateRegisteredAlleles(); // this removes unused left-flanking sequence
     //DEBUG2("updating prior variant alleles");
@@ -2805,11 +2889,6 @@ bool AlleleParser::toNextPosition(void) {
     // if we have alignments which ended at the previous base, erase them and their alleles
     // TODO check that this doesn't leak...
     DEBUG2("erasing old registered alignments");
-    /*
-    map<long unsigned int, deque<RegisteredAlignment> >::iterator f = registeredAlignments.find(currentPosition - 2);
-    if (f != registeredAlignments.end()) {
-        registeredAlignments.erase(f);
-        }*/
     map<long unsigned int, deque<RegisteredAlignment> >::iterator f = registeredAlignments.begin();
     while (f != registeredAlignments.end()
            && f->first < currentPosition - lastHaplotypeLength) {
@@ -3424,7 +3503,7 @@ bool AlleleParser::getCompleteObservationsOfHaplotype(Samples& samples, int hapl
             RegisteredAlignment& ra = *rai;
             Allele* aptr;
             // this guard prevents trashing allele pointers when getting partial observations
-            if (ra.start <= currentPosition && ra.end > currentPosition + haplotypeLength) {
+            if (ra.start <= currentPosition && ra.end >= currentPosition + haplotypeLength) {
                 if (ra.fitHaplotype(currentPosition, haplotypeLength, aptr)) {
                     for (vector<Allele>::iterator a = ra.alleles.begin(); a != ra.alleles.end(); ++a) {
                         if (a->position == currentPosition && a->referenceLength == haplotypeLength) {
diff --git a/src/AlleleParser.h b/src/AlleleParser.h
index 37c2bc0..de65014 100644
--- a/src/AlleleParser.h
+++ b/src/AlleleParser.h
@@ -58,8 +58,9 @@ public:
     int snpCount;
     int indelCount;
     int alleleTypes;
+    Parameters parameters;
 
-    RegisteredAlignment(BamAlignment& alignment)
+    RegisteredAlignment(BamAlignment& alignment, Parameters parameters)
         //: alignment(alignment)
         : start(alignment.Position)
         , end(alignment.GetEndPosition())
@@ -69,6 +70,7 @@ public:
         , snpCount(0)
         , indelCount(0)
         , alleleTypes(0)
+        , parameters(parameters)
     {
         alignment.GetTag("RG", readgroup);
     }
@@ -171,9 +173,11 @@ public:
     // map from starting position to length->alle
     map<long int, vector<AllelicPrimitive> > haplotypeBasisAlleles;  // this is in the current reference sequence
     bool usingHaplotypeBasisAlleles;
+    bool usingVariantInputAlleles;
     long int rightmostHaplotypeBasisAllelePosition;
+    long int rightmostInputAllelePosition;
     void updateHaplotypeBasisAlleles(long int pos, int referenceLength);
-    bool allowedAllele(long int pos, string& ref, string& alt);
+    bool allowedHaplotypeBasisAllele(long int pos, string& ref, string& alt);
 
     Allele makeAllele(RegisteredAlignment& ra,
 		      AlleleType type,
@@ -241,7 +245,7 @@ public:
     RegisteredAlignment& registerAlignment(BamAlignment& alignment, RegisteredAlignment& ra, string& sampleName, string& sequencingTech);
     void clearRegisteredAlignments(void);
     void updateAlignmentQueue(long int position, vector<Allele*>& newAlleles, bool gettingPartials = false);
-    void updateInputVariants(void);
+    void updateInputVariants(long int pos, int referenceLength);
     void updateHaplotypeBasisAlleles(void);
     void removeAllelesWithoutReadSpan(vector<Allele*>& alleles, int probeLength, int haplotypeLength);
     void removeNonOverlappingAlleles(vector<Allele*>& alleles,
diff --git a/src/BedReader.cpp b/src/BedReader.cpp
index 1eac75a..b629b4c 100644
--- a/src/BedReader.cpp
+++ b/src/BedReader.cpp
@@ -28,6 +28,11 @@ vector<BedTarget> BedReader::entries(void) {
 
     string line;
     while (std::getline(*this, line)) {
+        // BED is base-numbered, 0-origin, half-open.  This parse turns that
+        // into base-numbered, 0-origin, fully-closed for internal use.  All
+        // coordinates used internally should be in the latter, and coordinates
+        // from the user in the former should be converted immediately to the
+        // internal format.
         vector<string> fields = split(line, " \t");
         BedTarget entry(strip(fields[0]),
                         atoi(strip(fields[1]).c_str()),
diff --git a/src/BedReader.h b/src/BedReader.h
index 62500da..9d7f271 100644
--- a/src/BedReader.h
+++ b/src/BedReader.h
@@ -21,7 +21,7 @@ public:
 
     string seq;  // sequence name
     int left;    // left position
-    int right;   // right position, adjusted to 0-base
+    int right;   // right position, adjusted to 0-base inclusive
     string desc; // descriptive information, target name typically
 
     BedTarget(string s, int l, int r, string d = "")
diff --git a/src/Genotype.cpp b/src/Genotype.cpp
index 1ba8e29..5a1b1a2 100644
--- a/src/Genotype.cpp
+++ b/src/Genotype.cpp
@@ -1347,6 +1347,7 @@ void addAllHomozygousCombos(
                                      binomialObsPriors,
                                      alleleBalancePriors,
                                      diffusionPriorScalar);
+
         combos.push_back(gc);
     }
 
@@ -1355,6 +1356,25 @@ void addAllHomozygousCombos(
     combos.sort(gcrSorter);
     combos.unique();
 
+    /*
+    for (list<GenotypeCombo>::iterator g = combos.begin(); g != combos.end(); ++g) {
+        GenotypeCombo& gc = *g;
+        cerr << gc << endl
+             << "," << gc.probObsGivenGenotypes
+             << "," << gc.posteriorProb
+             << "," << gc.priorProbG_Af
+             << "," << gc.priorProbAf
+             << "," << gc.priorProbObservations
+             << endl;
+        map<int, int> acs = gc.countFrequencies();
+        for (map<int, int>::iterator a = acs.begin(); a != acs.end(); ++a) {
+            cerr << a->first << " " << a->second << endl;
+        }
+        cerr << "***************************" << endl;
+    }
+    */
+
+
 }
 
 // conditional probability of the genotype combination given the represented allele frequencies
@@ -1501,13 +1521,16 @@ GenotypeCombo::calculatePosteriorProbability(
     if (binomialObsPriors) {
         // for each alternate and the reference allele
         // calculate the binomial probability that we see the given strand balance and read placement prob
-        // cerr << *combo << endl;
+        //cerr << *this << endl;
         for (map<string, AlleleCounter>::iterator ac = alleleCounters.begin(); ac != alleleCounters.end(); ++ac) {
             //const string& allele = ac->first;
             const AlleleCounter& alleleCounter = ac->second;
             int obs = alleleCounter.observations;
+
             /*
-            cerr << allele <<  " counts: " << alleleCounter.frequency
+            cerr << endl
+                 << "--------------------------------------------" << endl;
+            cerr <<  " counts: " << alleleCounter.frequency
                 << " observations " << alleleCounter.observations
                 << " " << alleleCounter.forwardStrand
                 << "," << alleleCounter.reverseStrand
@@ -1516,12 +1539,17 @@ GenotypeCombo::calculatePosteriorProbability(
                 << " " << alleleCounter.placedStart
                 << "," << alleleCounter.placedEnd
                 << endl;
-                */
+
+            cerr << "priorProbObservations = " << priorProbObservations << endl;
+            cerr << "binprobln strand = " << binomialProbln(alleleCounter.forwardStrand, obs, 0.5) << endl;
+            cerr << "binprobln position = " << binomialProbln(alleleCounter.placedLeft, obs, 0.5) << endl;
+            cerr << "binprobln start = " << binomialProbln(alleleCounter.placedStart, obs, 0.5) << endl;
+            */
 
             priorProbObservations
                 += binomialProbln(alleleCounter.forwardStrand, obs, 0.5)
-		    +  binomialProbln(alleleCounter.placedLeft, obs, 0.5)
-		    +  binomialProbln(alleleCounter.placedStart, obs, 0.5);
+                +  binomialProbln(alleleCounter.placedLeft, obs, 0.5)
+                +  binomialProbln(alleleCounter.placedStart, obs, 0.5);
         }
     }
 
@@ -1545,6 +1573,7 @@ GenotypeCombo::calculatePosteriorProbability(
     }
 
     // posterior probability
+
     /*
     cerr << "priorProbG_Af " << priorProbG_Af << endl
 	 << "priorProbAf " << priorProbAf << endl
@@ -1552,9 +1581,17 @@ GenotypeCombo::calculatePosteriorProbability(
 	 << "priorProbGenotypesGivenHWE " << priorProbGenotypesGivenHWE << endl
 	 << "probObsGivenGenotypes " << probObsGivenGenotypes << endl;
     */
+
     priorProb = priorProbG_Af + priorProbAf + priorProbObservations + priorProbGenotypesGivenHWE;
     posteriorProb = priorProb + probObsGivenGenotypes;
 
+    /*
+    cerr << "priorProb " << priorProb << endl;
+    cerr << "posteriorProb " << posteriorProb << endl;
+
+    cerr << ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" << endl << endl;
+    */
+
 }
 
 
diff --git a/src/GenotypePriors.cpp b/src/GenotypePriors.cpp
index ae57872..7ed74d8 100644
--- a/src/GenotypePriors.cpp
+++ b/src/GenotypePriors.cpp
@@ -1,6 +1,6 @@
 #include "GenotypePriors.h"
 
-
+/*
 long double alleleFrequencyProbability(const map<int, int>& alleleFrequencyCounts, long double theta) {
 
     int M = 0;
@@ -49,6 +49,7 @@ long double __alleleFrequencyProbabilityln(const map<int, int>& alleleFrequencyC
     return factorialln(M) - (thetaln + thetaH) + p;
 
 }
+*/
 
 
 long double probabilityGenotypeComboGivenAlleleFrequencyln(GenotypeCombo& genotypeCombo, Allele& allele) {
@@ -111,12 +112,18 @@ genotypeCombinationPriorProbability(
                         << " " << alleleCounter.placedStart
                         << "," << alleleCounter.placedEnd
                         << endl;
-                        */
+                    cerr << "priorObservationExpectationProb = " << priorObservationExpectationProb << endl;
+                    cerr << "binprobln strand = " << binomialProbln(alleleCounter.forwardStrand, obs, 0.5) << endl;
+                    cerr << "binprobln position = " << binomialProbln(alleleCounter.placedLeft, obs, 0.5) << endl;
+                    cerr << "binprobln start = " << binomialProbln(alleleCounter.placedStart, obs, 0.5) << endl;
+                    cerr << "priorObservationExpectationProb = " << priorObservationExpectationProb << endl;
+                    */
 
                     priorObservationExpectationProb
                         += binomialProbln(alleleCounter.forwardStrand, obs, 0.5)
                         +  binomialProbln(alleleCounter.placedLeft, obs, 0.5)
                         +  binomialProbln(alleleCounter.placedStart, obs, 0.5);
+
                 }
             }
             // ok... now do the same move for the observation counts
diff --git a/src/Makefile b/src/Makefile
index fc72a07..bd64279 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -5,7 +5,7 @@
 ################################################################################
 
 # Compiler
-CC=g++
+CXX=g++
 C=gcc
 
 # Compiler flags
@@ -77,106 +77,106 @@ HEADERS=multichoose.h version_git.h
 # executables
 
 freebayes ../bin/freebayes: freebayes.o $(OBJECTS) $(HEADERS)
-	$(CC) $(CFLAGS) $(INCLUDE) freebayes.o $(OBJECTS) -o ../bin/freebayes $(LIBS)
+	$(CXX) $(CFLAGS) $(INCLUDE) freebayes.o $(OBJECTS) -o ../bin/freebayes $(LIBS)
 
 alleles ../bin/alleles: alleles.o $(OBJECTS) $(HEADERS)
-	$(CC) $(CFLAGS) $(INCLUDE) alleles.o $(OBJECTS) -o ../bin/alleles $(LIBS)
+	$(CXX) $(CFLAGS) $(INCLUDE) alleles.o $(OBJECTS) -o ../bin/alleles $(LIBS)
 
 dummy ../bin/dummy: dummy.o $(OBJECTS) $(HEADERS)
-	$(CC) $(CFLAGS) $(INCLUDE) dummy.o $(OBJECTS) -o ../bin/dummy $(LIBS)
+	$(CXX) $(CFLAGS) $(INCLUDE) dummy.o $(OBJECTS) -o ../bin/dummy $(LIBS)
 
 bamleftalign ../bin/bamleftalign: $(BAMTOOLS_ROOT)/lib/libbamtools.a bamleftalign.o Fasta.o LeftAlign.o IndelAllele.o split.o
-	$(CC) $(CFLAGS) $(INCLUDE) bamleftalign.o Fasta.o LeftAlign.o IndelAllele.o split.o $(BAMTOOLS_ROOT)/lib/libbamtools.a -o ../bin/bamleftalign $(LIBS)
+	$(CXX) $(CFLAGS) $(INCLUDE) bamleftalign.o Fasta.o LeftAlign.o IndelAllele.o split.o $(BAMTOOLS_ROOT)/lib/libbamtools.a -o ../bin/bamleftalign $(LIBS)
 
 bamfiltertech ../bin/bamfiltertech: $(BAMTOOLS_ROOT)/lib/libbamtools.a bamfiltertech.o $(OBJECTS) $(HEADERS)
-	$(CC) $(CFLAGS) $(INCLUDE) bamfiltertech.o $(OBJECTS) -o ../bin/bamfiltertech $(LIBS)
+	$(CXX) $(CFLAGS) $(INCLUDE) bamfiltertech.o $(OBJECTS) -o ../bin/bamfiltertech $(LIBS)
 
 
 # objects
 
 Fasta.o: Fasta.cpp
-	$(CC) $(CFLAGS) $(INCLUDE) -c Fasta.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c Fasta.cpp
 
 alleles.o: alleles.cpp AlleleParser.o Allele.o
-	$(CC) $(CFLAGS) $(INCLUDE) -c alleles.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c alleles.cpp
 
 dummy.o: dummy.cpp AlleleParser.o Allele.o
-	$(CC) $(CFLAGS) $(INCLUDE) -c dummy.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c dummy.cpp
 
 freebayes.o: freebayes.cpp TryCatch.h $(BAMTOOLS_ROOT)/lib/libbamtools.a
-	$(CC) $(CFLAGS) $(INCLUDE) -c freebayes.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c freebayes.cpp
 
 fastlz.o: fastlz.c fastlz.h
 	$(C) $(CFLAGS) $(INCLUDE) -c fastlz.c	
 
 Parameters.o: Parameters.cpp Parameters.h Version.h
-	$(CC) $(CFLAGS) $(INCLUDE) -c Parameters.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c Parameters.cpp
 
 Allele.o: Allele.cpp Allele.h multichoose.h Genotype.h
-	$(CC) $(CFLAGS) $(INCLUDE) -c Allele.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c Allele.cpp
 
 Sample.o: Sample.cpp Sample.h
-	$(CC) $(CFLAGS) $(INCLUDE) -c Sample.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c Sample.cpp
 
 Genotype.o: Genotype.cpp Genotype.h Allele.h multipermute.h
-	$(CC) $(CFLAGS) $(INCLUDE) -c Genotype.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c Genotype.cpp
 
 Ewens.o: Ewens.cpp Ewens.h
-	$(CC) $(CFLAGS) $(INCLUDE) -c Ewens.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c Ewens.cpp
 
 AlleleParser.o: AlleleParser.cpp AlleleParser.h multichoose.h Parameters.h $(BAMTOOLS_ROOT)/lib/libbamtools.a
-	$(CC) $(CFLAGS) $(INCLUDE) -c AlleleParser.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c AlleleParser.cpp
 
 Utility.o: Utility.cpp Utility.h Sum.h Product.h
-	$(CC) $(CFLAGS) $(INCLUDE) -c Utility.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c Utility.cpp
 
 SegfaultHandler.o: SegfaultHandler.cpp SegfaultHandler.h
-	$(CC) $(CFLAGS) $(INCLUDE) -c SegfaultHandler.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c SegfaultHandler.cpp
 
 Dirichlet.o: Dirichlet.h Dirichlet.cpp Sum.h Product.h
-	$(CC) $(CFLAGS) $(INCLUDE) -c Dirichlet.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c Dirichlet.cpp
 
 Multinomial.o: Multinomial.h Multinomial.cpp Sum.h Product.h Utility.h
-	$(CC) $(CFLAGS) $(INCLUDE) -c Multinomial.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c Multinomial.cpp
 
 DataLikelihood.o: DataLikelihood.cpp DataLikelihood.h Sum.h Product.h
-	$(CC) $(CFLAGS) $(INCLUDE) -c DataLikelihood.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c DataLikelihood.cpp
 
 Marginals.o: Marginals.cpp Marginals.h
-	$(CC) $(CFLAGS) $(INCLUDE) -c Marginals.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c Marginals.cpp
 
 ResultData.o: ResultData.cpp ResultData.h Result.h Result.cpp Allele.h Utility.h Genotype.h AlleleParser.h Version.h
-	$(CC) $(CFLAGS) $(INCLUDE) -c ResultData.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c ResultData.cpp
 
 Result.o: Result.cpp Result.h
-	$(CC) $(CFLAGS) $(INCLUDE) -c Result.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c Result.cpp
 
 BedReader.o: BedReader.cpp BedReader.h
-	$(CC) $(CFLAGS) $(INCLUDE) -c BedReader.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c BedReader.cpp
 
 CNV.o: CNV.cpp CNV.h
-	$(CC) $(CFLAGS) $(INCLUDE) -c CNV.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c CNV.cpp
 
 Bias.o: Bias.cpp Bias.h
-	$(CC) $(CFLAGS) $(INCLUDE) -c Bias.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c Bias.cpp
 
 split.o: split.h split.cpp
-	$(CC) $(CFLAGS) $(INCLUDE) -c split.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c split.cpp
 
 bamleftalign.o: bamleftalign.cpp LeftAlign.cpp
-	$(CC) $(CFLAGS) $(INCLUDE) -c bamleftalign.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c bamleftalign.cpp
 
 bamfiltertech.o: bamfiltertech.cpp
-	$(CC) $(CFLAGS) $(INCLUDE) -c bamfiltertech.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c bamfiltertech.cpp
 
 LeftAlign.o: LeftAlign.h LeftAlign.cpp $(BAMTOOLS_ROOT)/lib/libbamtools.a
-	$(CC) $(CFLAGS) $(INCLUDE) -c LeftAlign.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c LeftAlign.cpp
 
 IndelAllele.o: IndelAllele.cpp IndelAllele.h
-	$(CC) $(CFLAGS) $(INCLUDE) -c IndelAllele.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c IndelAllele.cpp
 
 Variant.o: $(VCFLIB_ROOT)/src/Variant.h $(VCFLIB_ROOT)/src/Variant.cpp
-	$(CC) $(CFLAGS) $(INCLUDE) -c $(VCFLIB_ROOT)/src/Variant.cpp
+	$(CXX) $(CFLAGS) $(INCLUDE) -c $(VCFLIB_ROOT)/src/Variant.cpp
 
 ../vcflib/tabixpp/tabix.o: ../vcflib/tabixpp/tabix.hpp ../vcflib/tabixpp/tabix.cpp
 ../vcflib/tabixpp/bgzf.o: ../vcflib/tabixpp/bgzf.c ../vcflib/tabixpp/bgzf.h
diff --git a/src/Marginals.cpp b/src/Marginals.cpp
index 79c4f58..cebca4d 100644
--- a/src/Marginals.cpp
+++ b/src/Marginals.cpp
@@ -56,10 +56,10 @@ long double marginalGenotypeLikelihoods(list<GenotypeCombo>& genotypeCombos, Sam
             if (rmgsItr == rmgs.end()) {
                 rmgs[sdl.genotype] = gc->posteriorProb;
             } else {
-		//vector<long double> x;
-		//x.push_back(rmgsItr->second); x.push_back(gc->posteriorProb);
+                //vector<long double> x;
+                //x.push_back(rmgsItr->second); x.push_back(gc->posteriorProb);
                 //rmgs[sdl.genotype] = logsumexp_probs(x);
-		rmgs[sdl.genotype] = log(safe_exp(rmgsItr->second) + safe_exp(gc->posteriorProb));
+                rmgs[sdl.genotype] = log(safe_exp(rmgsItr->second) + safe_exp(gc->posteriorProb));
             }
         }
     }
@@ -67,6 +67,7 @@ long double marginalGenotypeLikelihoods(list<GenotypeCombo>& genotypeCombos, Sam
     // safely add the raw marginal vectors using logsumexp
     // and use to update the sample data likelihoods
     rawMarginalsItr = rawMarginals.begin();
+    long double minAllowedMarginal = -1e-16;
     for (SampleDataLikelihoods::iterator s = likelihoods.begin(); s != likelihoods.end(); ++s) {
         vector<SampleDataLikelihood>& sdls = *s;
         const map<Genotype*, long double>& rawmgs = *rawMarginalsItr++;
@@ -81,7 +82,8 @@ long double marginalGenotypeLikelihoods(list<GenotypeCombo>& genotypeCombos, Sam
         for (vector<SampleDataLikelihood>::iterator sdl = sdls.begin(); sdl != sdls.end(); ++sdl) {
             long double newmarginal = marginals[sdl->genotype] - normalizer;
             delta += newmarginal - sdl->marginal;
-            sdl->marginal = newmarginal;
+            // ensure the marginal is non-0 to guard against underflow
+            sdl->marginal = min(minAllowedMarginal, newmarginal);
         }
     }
 
diff --git a/src/Parameters.cpp b/src/Parameters.cpp
index eaf0ebe..06c283b 100644
--- a/src/Parameters.cpp
+++ b/src/Parameters.cpp
@@ -9,42 +9,15 @@ void Parameters::simpleUsage(char ** argv) {
         << endl
         << "Bayesian haplotype-based polymorphism discovery." << endl
         << endl
+        << "parameters:" << endl
+        << endl
+        << "   -h --help       For a complete description of options." << endl
+        << endl
         << "citation: Erik Garrison, Gabor Marth" << endl
         << "          \"Haplotype-based variant detection from short-read sequencing\"" << endl
         << "          arXiv:1207.3907 (http://arxiv.org/abs/1207.3907)" << endl
         << endl
-        << "overview:" << endl
-        << endl
-        << "    To call variants from aligned short-read sequencing data, supply BAM files and" << endl
-        << "    a reference.  FreeBayes will provide VCF output on standard out describing SNPs," << endl
-        << "    indels, and complex variants in samples in the input alignments." << endl
-        << endl
-        << "    By default, FreeBayes will consider variants supported by at least 2" << endl
-        << "    observations in a single sample (-C) and also by at least 20% of the reads from" << endl
-        << "    a single sample (-F).  These settings are suitable to low to high depth" << endl
-        << "    sequencing in haploid and diploid samples, but users working with polyploid or" << endl
-        << "    pooled samples may wish to adjust them depending on the characteristics of" << endl
-        << "    their sequencing data." << endl
-        << endl
-        << "    FreeBayes is capable of calling variant haplotypes shorter than a read length" << endl
-        << "    where multiple polymorphisms segregate on the same read.  The maximum distance" << endl
-        << "    between polymorphisms phased in this way is determined by the --max-complex-gap," << endl
-        << "    which defaults to 3bp." << endl
-        << endl
-        << "    Ploidy may be set to any level (-p), but by default all samples are assumed to" << endl
-        << "    be diploid.  FreeBayes can model per-sample and per-region variation in" << endl
-        << "    copy-number (-A) using a copy-number variation map." << endl
-        << endl
-        << "    FreeBayes can act as a frequency-based pooled caller and describe variants" << endl
-        << "    and haplotypes in terms of observation frequency rather than called genotypes." << endl
-        << "    To do so, use --pooled-continuous and set input filters to a suitable level." << endl
-        << "    Allele observation counts will be described by AO and RO fields in the VCF output." << endl
-        << endl
-        << "parameters:" << endl
-        << endl
-        << "   -h --help       Complete description of options." << endl
-        << endl
-        << "author:   Erik Garrison <erik.garrison at bc.edu>, Marth Lab, Boston College, 2010-2012" << endl
+        << "author:   Erik Garrison <erik.garrison at bc.edu>, Marth Lab, Boston College, 2010-2014" << endl
         << "version:  " << VERSION_GIT << endl;
 
 }
@@ -82,9 +55,45 @@ void Parameters::usage(char** argv) {
         << "    be diploid.  FreeBayes can model per-sample and per-region variation in" << endl
         << "    copy-number (-A) using a copy-number variation map." << endl
         << endl
+        << "    FreeBayes can act as a frequency-based pooled caller and describe variants" << endl
+        << "    and haplotypes in terms of observation frequency rather than called genotypes." << endl
+        << "    To do so, use --pooled-continuous and set input filters to a suitable level." << endl
+        << "    Allele observation counts will be described by AO and RO fields in the VCF output." << endl
+        << endl
+        << endl
+        << "examples:" << endl
+        << endl
+        << "    # call variants assuming a diploid sample" << endl
+        << "    freebayes -f ref.fa aln.bam >var.vcf" << endl
+        << endl
+        << "    # require at least 5 supporting observations to consider a variant" << endl
+        << "    freebayes -f ref.fa -C 5 aln.bam >var.vcf" << endl
+        << endl
+        << "    # use a different ploidy" << endl
+        << "    freebayes -f ref.fa -p 4 aln.bam >var.vcf" << endl
+        << endl
+        << "    # assume a pooled sample with a known number of genome copies" << endl
+        << "    freebayes -f ref.fa -p 20 --pooled-discrete aln.bam >var.vcf" << endl
+        << endl
+        << "    # generate frequency-based calls for all variants passing input thresholds" << endl
+        << "    freebayes -f ref.fa -F 0.01 -C 1 --pooled-continuous aln.bam >var.vcf" << endl
+        << endl
+        << "    # use an input VCF (bgzipped + tabix indexed) to force calls at particular alleles" << endl
+        << "    freebayes -f ref.fa -@ in.vcf.gz aln.bam >var.vcf" << endl
+        << endl
+        << "    # generate long haplotype calls over known variants" << endl
+        << "    freebayes -f ref.fa --haplotype-basis-alleles in.vcf.gz \\ " << endl
+        << "                        --haplotype-length 50 aln.bam" << endl
+        << endl
+        << "    # naive variant calling: simply annotate observation counts of SNPs and indels" << endl
+        << "    freebayes -f ref.fa --haplotype-length 0 --min-alternate-count 1 \\ " << endl
+        << "        --min-alternate-fraction 0 --pooled-continuous --report-monomorphic >var.vcf" << endl
+        << endl
+        << endl
         << "parameters:" << endl
         << endl
         << "   -h --help       Prints this help dialog." << endl
+        << "   --version       Prints the release number and the git commit id." << endl
         << endl
         << "input and output:" << endl
         << endl
@@ -102,7 +111,8 @@ void Parameters::usage(char** argv) {
         << "                   Limit analysis to targets listed in the BED-format FILE." << endl
         << "   -r --region <chrom>:<start_position>-<end_position>" << endl
         << "                   Limit analysis to the specified region, 0-base coordinates," << endl
-        << "                   end_position included.  Either '-' or '..' maybe used as a separator." << endl
+        << "                   end_position not included (same as BED format)." << endl
+        << "                   Either '-' or '..' may be used as a separator." << endl
         << "   -s --samples FILE" << endl
         << "                   Limit analysis to samples listed (one per line) in the FILE." << endl
         << "                   By default FreeBayes will analyze all samples in its input" << endl
@@ -123,9 +133,8 @@ void Parameters::usage(char** argv) {
         << "                   pass --pvar to FILE." << endl
         << "   -@ --variant-input VCF" << endl
         << "                   Use variants reported in VCF file as input to the algorithm." << endl
-        << "                   Variants in this file will be treated as putative variants" << endl
-        << "                   even if there is not enough support in the data to pass" << endl
-        << "                   input filters." << endl
+        << "                   Variants in this file will be included in the output even if" << endl
+        << "                   there is not enough support in the data to pass input filters." << endl
         << "   -l --only-use-input-alleles" << endl
         << "                   Only provide variant calls and genotype likelihoods for sites" << endl
         << "                   and alleles which are provided in the VCF input, and provide" << endl
@@ -188,8 +197,8 @@ void Parameters::usage(char** argv) {
         << "   -E --max-complex-gap N" << endl
         << "      --haplotype-length N" << endl
         << "                   Allow haplotype calls with contiguous embedded matches of up" << endl
-        << "                   to this length." << endl
-        << "   --min-repeat-length N" << endl
+        << "                   to this length.  (default: 3)" << endl
+        << "   --min-repeat-size N" << endl
         << "                   When assembling observations across repeats, require the total repeat" << endl
         << "                   length at least this many bp.  (default: 5)" << endl
         << "   --min-repeat-entropy N" << endl
@@ -212,7 +221,7 @@ void Parameters::usage(char** argv) {
         << "                   default: exclude duplicates marked as such in alignments" << endl
         << "   -m --min-mapping-quality Q" << endl
         << "                   Exclude alignments from analysis if they have a mapping" << endl
-        << "                   quality less than Q.  default: 0" << endl
+        << "                   quality less than Q.  default: 1" << endl
         << "   -q --min-base-quality Q" << endl
         << "                   Exclude alleles from analysis if their supporting base" << endl
         << "                   quality is less than Q.  default: 0" << endl
@@ -287,10 +296,13 @@ void Parameters::usage(char** argv) {
         << "                   Read length-dependent allele observation biases from FILE." << endl
         << "                   The format is [length] [alignment efficiency relative to reference]" << endl
         << "                   where the efficiency is 1 if there is no relative observation bias." << endl
-        << "   --standard-gls" << endl
-        << "                   Use legacy model to generate genotype likelihoods." << endl
         << "   --base-quality-cap Q" << endl
         << "                   Limit estimated observation quality by capping base quality at Q." << endl
+        << "   --experimental-gls" << endl
+        << "                   Generate genotype likelihoods using 'effective base depth' metric" << endl
+        << "                   qual = 1-BaseQual * 1-MapQual.  Incorporate partial observations." << endl
+        << "                   This is the default when contamination estimates are provided." << endl
+        << "                   Optimized for diploid samples." << endl
         << "   --prob-contamination F" << endl
         << "                   An estimate of contamination to use for all samples.  default: 10e-9" << endl
         << "   --contamination-estimates FILE" << endl
@@ -338,7 +350,7 @@ void Parameters::usage(char** argv) {
         << "   -dd             Print more verbose debugging output (requires \"make DEBUG\")" << endl
         << endl
         << endl
-        << "author:   Erik Garrison <erik.garrison at bc.edu>, Marth Lab, Boston College, 2010-2012" << endl
+        << "author:   Erik Garrison <erik.garrison at bc.edu>, Marth Lab, Boston College, 2010-2014" << endl
         << "version:  " << VERSION_GIT << endl;
 
 }
@@ -410,15 +422,16 @@ Parameters::Parameters(int argc, char** argv) {
     genotypingMaxBandDepth = 7;
     minPairedAltCount = 0;
     minAltMeanMapQ = 0;
+    limitGL = 0;
     reportAllHaplotypeAlleles = false;
     reportMonomorphic = false;
     boundIndels = true; // ignore indels at ends of reads
     onlyUseInputAlleles = false;
-    standardGLs = false;
+    standardGLs = true;
     MQR = 100;                     // -M --reference-mapping-quality
     BQR = 60;                     // -B --reference-base-quality
     ploidy = 2;                  // -p --ploidy
-    MQL0 = 0;                    // -m --min-mapping-quality
+    MQL0 = 1;                    // -m --min-mapping-quality
     BQL0 = 0;                    // -q --min-base-quality
     minSupportingAlleleQualitySum = 0;
     minSupportingMappingQualitySum = 0;
@@ -455,6 +468,7 @@ Parameters::Parameters(int argc, char** argv) {
     static struct option long_options[] =
         {
             {"help", no_argument, 0, 'h'},
+            {"version", no_argument, 0, '#'},
             {"bam", required_argument, 0, 'b'},
             {"bam-list", required_argument, 0, 'L'},
             {"stdin", no_argument, 0, 'c'},
@@ -525,7 +539,7 @@ Parameters::Parameters(int argc, char** argv) {
             {"haplotype-basis-alleles", required_argument, 0, '9'},
             {"report-genotype-likelihood-max", no_argument, 0, '5'},
             {"report-all-haplotype-alleles", no_argument, 0, '6'},
-            {"standard-gls", no_argument, 0, ')'},
+            {"experimental-gls", no_argument, 0, ')'},
             {"base-quality-cap", required_argument, 0, '('},
             {"prob-contamination", required_argument, 0, '_'},
             {"contamination-estimates", required_argument, 0, ','},
@@ -538,7 +552,7 @@ Parameters::Parameters(int argc, char** argv) {
     while (true) {
 
         int option_index = 0;
-        c = getopt_long(argc, argv, "hcO4ZKjH[0diN5a)Ik=wl6uVXJY:b:G:M:x:@:A:f:t:r:s:v:n:B:p:m:q:R:Q:U:$:e:T:P:D:^:S:W:F:C:&:L:8:z:1:3:E:7:2:9:%:(:_:,:",
+        c = getopt_long(argc, argv, "hcO4ZKjH[0diN5a)Ik=wl6#uVXJY:b:G:M:x:@:A:f:t:r:s:v:n:B:p:m:q:R:Q:U:$:e:T:P:D:^:S:W:F:C:&:L:8:z:1:3:E:7:2:9:%:(:_:,:",
                         long_options, &option_index);
 
         if (c == -1) // end of options
@@ -992,7 +1006,7 @@ Parameters::Parameters(int argc, char** argv) {
             break;
 
         case ')':
-            standardGLs = true;
+            standardGLs = false;
             break;
 
         case '(':
@@ -1007,6 +1021,13 @@ Parameters::Parameters(int argc, char** argv) {
             ++debuglevel;
             break;
 
+	case '#':
+	    
+	    // --version
+            cout << "version:  " << VERSION_GIT << endl;
+	    exit(0);
+	    break;
+
         case 'h':
             usage(argv);
             exit(0);
diff --git a/src/Parameters.h b/src/Parameters.h
index 1d326ea..4502bac 100644
--- a/src/Parameters.h
+++ b/src/Parameters.h
@@ -85,6 +85,7 @@ public:
     float genotypeVariantThreshold;
     int siteSelectionMaxIterations;
     bool allSites; // TODO
+    double limitGL; // minimum GL that is output
     int minPairedAltCount;
     double minAltMeanMapQ;
     int minAltQSum;
diff --git a/src/ResultData.cpp b/src/ResultData.cpp
index 9a1303c..de85c7b 100644
--- a/src/ResultData.cpp
+++ b/src/ResultData.cpp
@@ -58,12 +58,15 @@ vcf::Variant& Results::vcf(
     // set up VCF record-wide variables
 
     var.sequenceName = parser->currentSequenceName;
-    // XXX this should be the position of the matching reference haplotype
     var.position = referencePosition + 1;
     var.id = ".";
     var.filter = ".";
-    // XXX this should be the size of the maximum deletion + 1bp on the left end
+
+    // note that we set QUAL to 0 at loci with no data
     var.quality = max((long double) 0, nan2zero(big2phred(pHom)));
+    if (coverage == 0) {
+        var.quality = 0;
+    }
 
 
     // set up format string
@@ -90,6 +93,7 @@ vcf::Variant& Results::vcf(
     long double refReadMismatchSum = 0;
     long double refReadSNPSum = 0;
     long double refReadIndelSum = 0;
+    long double refReadSoftClipSum = 0;
     unsigned int refObsCount = 0;
     map<string, int> refObsBySequencingTechnology;
 
@@ -261,6 +265,7 @@ vcf::Variant& Results::vcf(
                 altReadMismatchSum += allele.readMismatchRate;
                 altReadSNPSum += allele.readSNPRate;
                 altReadIndelSum += allele.readIndelRate;
+				// TODO: add altReadSoftClipRate (avg)
                 if (allele.isProperPair) {
                     ++altproperPairs;
                 }
@@ -337,6 +342,8 @@ vcf::Variant& Results::vcf(
         var.info["RUN"].push_back(convert(parser->homopolymerRunLeft(altbase) + 1 + parser->homopolymerRunRight(altbase)));
         var.info["MQM"].push_back(convert((altObsCount == 0) ? 0 : nan2zero((double) altmqsum / (double) altObsCount)));
         var.info["RPP"].push_back(convert((altObsCount == 0) ? 0 : nan2zero(ln2phred(hoeffdingln(altReadsLeft, altReadsRight + altReadsLeft, 0.5)))));
+        var.info["RPR"].push_back(convert(altReadsRight));
+		var.info["RPL"].push_back(convert(altReadsLeft));
         var.info["EPP"].push_back(convert((altBasesLeft + altBasesRight == 0) ? 0 : nan2zero(ln2phred(hoeffdingln(altEndLeft, altEndLeft + altEndRight, 0.5)))));
         var.info["PAIRED"].push_back(convert((altObsCount == 0) ? 0 : nan2zero((double) altproperPairs / (double) altObsCount)));
         var.info["CIGAR"].push_back(adjustedCigar[altAllele.base()]);
@@ -496,6 +503,10 @@ vcf::Variant& Results::vcf(
             Sample& sample = *gc->second->sample;
             Result& sampleLikelihoods = s->second;
             Genotype* genotype = gc->second->genotype;
+            if (sample.observationCount() == 0) {
+                continue;
+            }
+
             sampleOutput["GT"].push_back(genotype->relativeGenotype(refbase, altAlleles));
 
             if (parameters.calculateMarginals) {
@@ -583,7 +594,7 @@ vcf::Variant& Results::vcf(
                         }
                     }
 
-                    // normalize GLs to -5 min 0 max using division by max and bounding at -5
+                    // normalize GLs to 0 max using division by max
                     long double minGL = 0;
                     for (map<int, double>::iterator g = genotypeLikelihoods.begin(); g != genotypeLikelihoods.end(); ++g) {
                         if (g->second < minGL) minGL = g->second;
@@ -592,8 +603,15 @@ vcf::Variant& Results::vcf(
                     for (map<int, double>::iterator g = genotypeLikelihoods.begin(); g != genotypeLikelihoods.end(); ++g) {
                         if (g->second > maxGL) maxGL = g->second;
                     }
-                    for (map<int, double>::iterator g = genotypeLikelihoods.begin(); g != genotypeLikelihoods.end(); ++g) {
-                        genotypeLikelihoodsOutput[g->first] = convert( max((long double)-10, (g->second-maxGL)) );
+
+                    if (parameters.limitGL == 0) {
+                        for (map<int, double>::iterator g = genotypeLikelihoods.begin(); g != genotypeLikelihoods.end(); ++g) {
+                            genotypeLikelihoodsOutput[g->first] = convert(g->second-maxGL);
+                        }
+                    } else {
+                        for (map<int, double>::iterator g = genotypeLikelihoods.begin(); g != genotypeLikelihoods.end(); ++g) {
+                            genotypeLikelihoodsOutput[g->first] = convert( max((long double) + parameters.limitGL, (g->second-maxGL)) );
+                        }
                     }
 
                     vector<string>& datalikelihoods = sampleOutput["GL"];
diff --git a/src/freebayes.cpp b/src/freebayes.cpp
index bb4bca3..83ca759 100644
--- a/src/freebayes.cpp
+++ b/src/freebayes.cpp
@@ -163,6 +163,9 @@ int main (int argc, char *argv[]) {
             } else if (coverage < parameters.minCoverage) {
                 DEBUG("post-filtering coverage of " << coverage << " is less than --min-coverage of " << parameters.minCoverage);
                 continue;
+            } else if (parameters.onlyUseInputAlleles) {
+                DEBUG("no input alleles, but using only input alleles for analysis, skipping position");
+                continue;
             }
 
             DEBUG2("coverage " << parser->currentSequenceName << ":" << parser->currentPosition << " == " << coverage);
@@ -177,6 +180,14 @@ int main (int argc, char *argv[]) {
             if (parameters.reportMonomorphic) {
                 DEBUG("calling at site even though there are no alternate observations");
             }
+        } else {
+            /*
+            cerr << "has input variants at " << parser->currentSequenceName << ":" << parser->currentPosition << endl;
+            vector<Allele>& inputs = parser->inputVariantAlleles[parser->currentPosition];
+            for (vector<Allele>::iterator a = inputs.begin(); a != inputs.end(); ++a) {
+                cerr << *a << endl;
+            }
+            */
         }
 
         // to ensure proper ordering of output stream
@@ -237,7 +248,7 @@ int main (int argc, char *argv[]) {
         long double theta = parameters.TH * parser->lastHaplotypeLength;
 
         // if we have only one viable allele, we don't have evidence for variation at this site
-        if (!parameters.reportMonomorphic && genotypeAlleles.size() <= 1 && genotypeAlleles.front().isReference()) {
+        if (!parser->hasInputVariantAllelesAtCurrentPosition() && !parameters.reportMonomorphic && genotypeAlleles.size() <= 1 && genotypeAlleles.front().isReference()) {
             DEBUG("no alternate genotype alleles passed filters at " << parser->currentSequenceName << ":" << parser->currentPosition);
             continue;
         }
@@ -293,11 +304,19 @@ int main (int argc, char *argv[]) {
 
         DEBUG2("calculating data likelihoods");
         // calculate data likelihoods
-        for (Samples::iterator s = samples.begin(); s != samples.end(); ++s) {
+        //for (Samples::iterator s = samples.begin(); s != samples.end(); ++s) {
+        for (vector<string>::iterator n = parser->sampleList.begin(); n != parser->sampleList.end(); ++n) {
 
-            string sampleName = s->first;
+            //string sampleName = s->first;
+            string& sampleName = *n;
             //DEBUG2("sample: " << sampleName);
-            Sample& sample = s->second;
+            //Sample& sample = s->second;
+            if (samples.find(sampleName) == samples.end()
+                && !(parser->hasInputVariantAllelesAtCurrentPosition()
+                     || parameters.reportMonomorphic)) {
+                continue;
+            }
+            Sample& sample = samples[sampleName];
             vector<Genotype>& genotypes = genotypesByPloidy[parser->currentSamplePloidy(sampleName)];
             vector<Genotype*> genotypesWithObs;
             for (vector<Genotype>::iterator g = genotypes.begin(); g != genotypes.end(); ++g) {
@@ -435,6 +454,7 @@ int main (int argc, char *argv[]) {
 
         long double bestComboOddsRatio = 0;
 
+        bool bestOverallComboIsHet = false;
         GenotypeCombo bestCombo; // = NULL;
 
         // what a hack...
@@ -597,24 +617,24 @@ int main (int argc, char *argv[]) {
         pVar = 1.0;
         pHom = 0.0;
         // calculates pvar and gets the best het combo
-        for (list<GenotypeCombo>::iterator gc = genotypeCombos.begin(); gc != genotypeCombos.end(); ++gc) {
-            if (gc->isHomozygous()
-                && (parameters.useRefAllele
-                    || !parameters.useRefAllele && gc->alleles().front() == referenceBase)) {
+        list<GenotypeCombo>::iterator gc = genotypeCombos.begin();
+        bestCombo = *gc;
+        for ( ; gc != genotypeCombos.end(); ++gc) {
+            if (gc->isHomozygous() && gc->alleles().front() == referenceBase) {
                 pVar -= big_exp(gc->posteriorProb - posteriorNormalizer);
                 pHom += big_exp(gc->posteriorProb - posteriorNormalizer);
+            } else if (gc == genotypeCombos.begin()) {
+                bestOverallComboIsHet = true;
             }
         }
 
         // report the maximum a posteriori estimate
         // unless we're reporting the GL maximum
-        if (!parameters.reportGenotypeLikelihoodMax) {
-            bestCombo = genotypeCombos.front();
-        } else {
+        if (parameters.reportGenotypeLikelihoodMax) {
             bestCombo = glMax;
         }
 
-        DEBUG2("best combo: " << bestCombo);
+        DEBUG("best combo: " << bestCombo);
 
         // odds ratio between the first and second-best combinations
         if (genotypeCombos.size() > 1) {
diff --git a/src/version_git.h b/src/version_git.h
deleted file mode 100644
index e9dbe54..0000000
--- a/src/version_git.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef VERSION_GIT_H
-#define VERSION_GIT_H
-#define VERSION_GIT "v0.9.13-5-ga830efd-dirty"
-#endif /* VERSION_GIT_H */
diff --git a/src/version_release.txt b/src/version_release.txt
index 6fbaf06..82f7124 100644
--- a/src/version_release.txt
+++ b/src/version_release.txt
@@ -1,5 +1,5 @@
-# This file was auto-generated by running "make setversion VERSION=v0.9.14"
-# on Mon Mar  3 09:43:15 EST 2014 .
+# This file was auto-generated by running "make setversion VERSION=v0.9.20"
+# on Fri Dec 12 16:34:49 EST 2014 .
 # Please do not edit or commit this file manually.
 #
-v0.9.14
+v0.9.20

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/freebayes.git



More information about the debian-med-commit mailing list