[med-svn] [bwa] 02/04: Imported Upstream version 0.7.12

Mon Feb 9 08:06:52 UTC 2015

This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository bwa.

commit 41430b90dca9b49a359f5b3070a20b32a118d7f0
Author: Andreas Tille <tille at debian.org>
Date:   Tue Jan 27 08:47:32 2015 +0100

    Imported Upstream version 0.7.12
---
 Makefile                 |  10 +-
 NEWS.md                  |  72 +++++
 README-alt.md            | 178 ++++++++++++
 README.md                |  72 +++--
 bntseq.c                 |  48 +++-
 bntseq.h                 |   1 +
 bwa-helper.js            | 706 -----------------------------------------------
 bwa.1                    |  58 +++-
 bwa.c                    | 146 +++++++++-
 bwa.h                    |  16 +-
 bwakit/README.md         | 115 ++++++++
 bwakit/bwa-postalt.js    | 524 +++++++++++++++++++++++++++++++++++
 bwakit/run-HLA           |  20 ++
 bwakit/run-bwamem        | 186 +++++++++++++
 bwakit/run-gen-ref       |  39 +++
 bwakit/typeHLA-selctg.js |  62 +++++
 bwakit/typeHLA.js        | 496 +++++++++++++++++++++++++++++++++
 bwakit/typeHLA.sh        |  49 ++++
 bwamem.c                 | 159 ++++++++---
 bwamem.h                 |  15 +-
 bwamem_extra.c           |  68 +++--
 bwamem_pair.c            | 102 ++++---
 bwashm.c                 | 213 ++++++++++++++
 bwt.c                    |  42 ++-
 bwt.h                    |   4 +-
 bwt_gen.c                |   9 +-
 bwtindex.c               |  18 +-
 example.c                |  12 +-
 fastmap.c                | 251 ++++++++++++-----
 kseq.h                   |  18 +-
 kthread.c                | 104 ++++++-
 main.c                   |   8 +-
 utils.c                  |  11 +
 utils.h                  |   2 +-
 34 files changed, 2846 insertions(+), 988 deletions(-)

diff --git a/Makefile b/Makefile
index 60c2104..1d6ae44 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
 AR=			ar
 DFLAGS=		-DHAVE_PTHREAD $(WRAP_MALLOC)
 LOBJS=		utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o
-AOBJS=		QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
+AOBJS=		QSufSort.o bwt_gen.o bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
 			is.o bwtindex.o bwape.o kopen.o pemerge.o \
 			bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
 			bwtsw2_chain.o fastmap.o bwtsw2_pair.o
@@ -14,6 +14,10 @@ INCLUDES=
 LIBS=		-lm -lz -lpthread
 SUBDIRS=	.
 
+ifeq ($(shell uname -s),Linux)
+	LIBS += -lrt
+endif
+
 .SUFFIXES:.c .o .cc
 
 .c.o:
@@ -40,7 +44,7 @@ depend:
 
 QSufSort.o: QSufSort.h
 bamlite.o: bamlite.h malloc_wrap.h
-bntseq.o: bntseq.h utils.h kseq.h malloc_wrap.h
+bntseq.o: bntseq.h utils.h kseq.h malloc_wrap.h khash.h
 bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kstring.h malloc_wrap.h kseq.h
 bwamem.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h
 bwamem.o: ksort.h utils.h kbtree.h
@@ -52,6 +56,7 @@ bwape.o: ksw.h khash.h
 bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h
 bwase.o: bwa.h ksw.h
 bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h
+bwashm.o: bwa.h bntseq.h bwt.h
 bwt.o: utils.h bwt.h kvec.h malloc_wrap.h
 bwt_gen.o: QSufSort.h malloc_wrap.h
 bwt_lite.o: bwt_lite.h malloc_wrap.h
@@ -75,5 +80,4 @@ ksw.o: ksw.h malloc_wrap.h
 main.o: kstring.h malloc_wrap.h utils.h
 malloc_wrap.o: malloc_wrap.h
 pemerge.o: ksw.h kseq.h malloc_wrap.h kstring.h bwa.h bntseq.h bwt.h utils.h
-test-extend.o: ksw.h
 utils.o: utils.h ksort.h malloc_wrap.h kseq.h
diff --git a/NEWS.md b/NEWS.md
index 33a4760..4692889 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,75 @@
+Release 0.7.12 (28 December 2014)
+---------------------------------
+
+This release fixed a bug in the pair-end mode when ALT contigs are present. It
+leads to undercalling in regions overlapping ALT contigs.
+
+(0.7.12: 28 December 2014, r1039)
+
+
+
+Release 0.7.11 (23 December, 2014)
+----------------------------------
+
+A major change to BWA-MEM is the support of mapping to ALT contigs in addition
+to the primary assembly. Part of the ALT mapping strategy is implemented in
+BWA-MEM and the rest in a postprocessing script for now. Due to the extra
+layer of complexity on generating the reference genome and on the two-step
+mapping, we start to provide a wrapper script and precompiled binaries since
+this release. The package may be more convenient to some specific use cases.
+For general uses, the single BWA binary still works like the old way.
+
+Another major addition to BWA-MEM is HLA typing, which made possible with the
+new ALT mapping strategy. Necessary data and programs are included in the
+binary release. The wrapper script also optionally performs HLA typing when HLA
+genes are included in the reference genome as additional ALT contigs.
+
+Other notable changes to BWA-MEM:
+
+ * Added option `-b` to `bwa index`. This option tunes the batch size used in
+   the construction of BWT. It is advised to use large `-b` for huge reference
+   sequences such as the BLAST *nt* database.
+
+ * Optimized for PacBio data. This includes a change to scoring based on a
+   study done by Aaron Quinlan and a heuristic speedup. Further speedup is
+   possible, but needs more careful investigation.
+
+ * Dropped PacBio read-to-read alignment for now. BWA-MEM is good for finding
+   the best hit, but is not very sensitive to suboptimal hits. Option `-x pbread`
+   is still available, but hidden on the command line. This may be removed in
+   future releases.
+
+ * Added a new pre-setting for Oxford Nanopore 2D reads. LAST is still a little
+   more sensitive on older bacterial data, but bwa-mem is as good on more
+   recent data and is times faster for mapping against mammalian genomes.
+
+ * Added LAST-like seeding. This improves the accuracy for longer reads.
+
+ * Added option `-H` to insert arbitrary header lines.
+
+ * Smarter option `-p`. Given an interleaved FASTQ stream, old bwa-mem identifies
+   the 2i-th and (2i+1)-th reads as a read pair. The new verion identifies
+   adjacent reads with the same read name as a read pair. It is possible to mix
+   single-end and paired-end reads in one FASTQ.
+
+ * Improved parallelization. Old bwa-mem waits for I/O. The new version puts
+   I/O on a separate thread. It performs mapping while reading FASTQ and
+   writing SAM. This saves significant wall-clock time when reading from
+   or writing to a slow Unix pipe.
+
+With the new release, the recommended way to map Illumina reads to GRCh38 is to
+use the bwakit binary package:
+
+    bwa.kit/run-gen-ref hs38DH
+    bwa.kit/bwa index hs38DH.fa
+    bwa.kit/run-bwamem -t8 -H -o out-prefix hs38DH.fa read1.fq.gz read2.fq.gz | sh
+
+Please check bwa.kit/README.md for details and command line options.
+
+(0.7.11: 23 December 2014, r1034)
+
+
+
 Release 0.7.10 (13 July, 2014)
 ------------------------------
 
diff --git a/README-alt.md b/README-alt.md
new file mode 100644
index 0000000..4f72042
--- /dev/null
+++ b/README-alt.md
@@ -0,0 +1,178 @@
+## For the Impatient
+
+```sh
+# Download bwakit (or from <http://sourceforge.net/projects/bio-bwa/files/bwakit/> manually)
+wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.11_x64-linux.tar.bz2/download \
+  | gzip -dc | tar xf -
+# Generate the GRCh38+ALT+decoy+HLA and create the BWA index
+bwa.kit/run-gen-ref hs38DH   # download GRCh38 and write hs38DH.fa
+bwa.kit/bwa index hs38DH.fa  # create BWA index
+# mapping
+bwa.kit/run-bwamem -o out -H hs38DH.fa read1.fq read2.fq | sh  # skip "|sh" to show command lines
+```
+
+This generates `out.aln.bam` as the final alignment, `out.hla.top` for best HLA
+genotypes on each gene and `out.hla.all` for other possible HLA genotypes.
+Please check out [bwa/bwakit/README.md][kithelp] for details.
+
+## Background
+
+GRCh38 consists of several components: chromosomal assembly, unlocalized contigs
+(chromosome known but location unknown), unplaced contigs (chromosome unknown)
+and ALT contigs (long clustered variations). The combination of the first three
+components is called the *primary assembly*. It is recommended to use the
+complete primary assembly for all analyses. Using ALT contigs in read mapping is
+tricky.
+
+GRCh38 ALT contigs are totaled 109Mb in length, spanning 60Mbp of the primary
+assembly. However, sequences that are highly diverged from the primary assembly
+only contribute a few million bp. Most subsequences of ALT contigs are nearly
+identical to the primary assembly. If we align sequence reads to GRCh38+ALT
+blindly, we will get many additional reads with zero mapping quality and miss
+variants on them. It is crucial to make mappers aware of ALTs.
+
+BWA-MEM is ALT-aware. It essentially computes mapping quality across the
+non-redundant content of the primary assembly plus the ALT contigs and is free
+of the problem above.
+
+## Methods
+
+### Sequence alignment
+
+As of now, ALT mapping is done in two separate steps: BWA-MEM mapping and
+postprocessing. The `bwa.kit/run-bwamem` script performs the two steps when ALT
+contigs are present. The following picture shows an example about how BWA-MEM
+infers mapping quality and reports alignment after step 2:
+
+![](http://lh3lh3.users.sourceforge.net/images/alt-demo.png)
+
+#### Step 1: BWA-MEM mapping
+
+At this step, BWA-MEM reads the ALT contig names from "*idxbase*.alt", ignoring
+the ALT-to-ref alignment, and labels a potential hit as *ALT* or *non-ALT*,
+depending on whether the hit lands on an ALT contig or not. BWA-MEM then reports
+alignments and assigns mapQ following these two rules:
+
+1. The mapQ of a non-ALT hit is computed across non-ALT hits only. The mapQ of
+   an ALT hit is computed across all hits.
+
+2. If there are no non-ALT hits, the best ALT hit is outputted as the primary
+   alignment. If there are both ALT and non-ALT hits, non-ALT hits will be
+   primary and ALT hits be supplementary (SAM flag 0x800).
+
+In theory, non-ALT alignments from step 1 should be identical to alignments
+against the reference genome with ALT contigs. In practice, the two types of
+alignments may differ in rare cases due to seeding heuristics. When an ALT hit
+is significantly better than non-ALT hits, BWA-MEM may miss seeds on the
+non-ALT hits.
+
+If we don't care about ALT hits, we may skip postprocessing (step 2).
+Nonetheless, postprocessing is recommended as it improves mapQ and gives more
+information about ALT hits.
+
+#### Step 2: Postprocessing
+
+Postprocessing is done with a separate script `bwa-postalt.js`. It reads all
+potential hits reported in the XA tag, lifts ALT hits to the chromosomal
+positions using the ALT-to-ref alignment, groups them based on overlaps between
+their lifted positions, and then re-estimates mapQ across the best scoring hit
+in each group. Being aware of the ALT-to-ref alignment, this script can greatly
+improve mapQ of ALT hits and occasionally improve mapQ of non-ALT hits. It also
+writes each hit overlapping the reported hit into a separate SAM line. This
+enables variant calling on each ALT contig independent of others.
+
+### On the completeness of GRCh38+ALT
+
+While GRCh38 is much more complete than GRCh37, it is still missing some true
+human sequences. To make sure every piece of sequence in the reference assembly
+is correct, the [Genome Reference Consortium][grc] (GRC) require each ALT contig
+to have enough support from multiple sources before considering to add it to the
+reference assembly. This careful and sophisticated procedure has left out some
+sequences, one of which is [this example][novel], a 10kb contig assembled from
+CHM1 short reads and present also in NA12878. You can try [BLAT][blat] or
+[BLAST][blast] to see where it maps.
+
+For a more complete reference genome, we compiled a new set of decoy sequences
+from GenBank clones and the de novo assembly of 254 public [SGDP][sgdp] samples.
+The sequences are included in `hs38DH-extra.fa` from the [BWA binary
+package][res].
+
+In addition to decoy, we also put multiple alleles of HLA genes in
+`hs38DH-extra.fa`. These genomic sequences were acquired from [IMGT/HLA][hladb],
+version 3.18.0 and are used to collect reads sequenced from these genes.
+
+### HLA typing
+
+HLA genes are known to be associated with many autoimmune diseases, infectious
+diseases and drug responses. They are among the most important genes but are
+rarely studied by WGS projects due to the high sequence divergence between
+HLA genes and the reference genome in these regions.
+
+By including the HLA gene regions in the reference assembly as ALT contigs, we
+are able to effectively identify reads coming from these genes. We also provide
+a pipeline, which is included in the [BWA binary package][res], to type the
+several classic HLA genes. The pipeline is conceptually simple. It de novo
+assembles sequence reads mapped to each gene, aligns exon sequences of each
+allele to the assembled contigs and then finds the pairs of alleles that best
+explain the contigs. In practice, however, the completeness of IMGT/HLA and
+copy-number changes related to these genes are not so straightforward to
+resolve. HLA typing may not always be successful. Users may also consider to use
+other programs for typing such as [Warren et al (2012)][hla4], [Liu et al
+(2013)][hla2], [Bai et al (2014)][hla3] and [Dilthey et al (2014)][hla1], though
+most of them are distributed under restrictive licenses.
+
+## Preliminary Evaluation
+
+To check whether GRCh38 is better than GRCh37, we mapped the CHM1 and NA12878
+unitigs to GRCh37 primary (hs37), GRCh38 primary (hs38) and GRCh38+ALT+decoy
+(hs38DH), and called small variants from the alignment. CHM1 is haploid.
+Ideally, heterozygous calls are false positives (FP). NA12878 is diploid. The
+true positive (TP) heterozygous calls from NA12878 are approximately equal
+to the difference between NA12878 and CHM1 heterozygous calls. A better assembly
+should yield higher TP and lower FP. The following table shows the numbers for
+these assemblies:
+
+|Assembly|hs37   |hs38   |hs38DH|CHM1_1.1|  huref|
+|:------:|------:|------:|------:|------:|------:|
+|FP      | 255706| 168068| 142516|307172 | 575634|
+|TP      |2142260|2163113|2150844|2167235|2137053|
+
+With this measurement, hs38 is clearly better than hs37. Genome hs38DH reduces
+FP by ~25k but also reduces TP by ~12k. We manually inspected variants called
+from hs38 only and found the majority of them are associated with excessive read
+depth, clustered variants or weak alignment. We believe most hs38-only calls are
+problematic. In addition, if we compare two NA12878 replicates from HiSeq X10
+with nearly identical library construction, the difference is ~140k, an order
+of magnitude higher than the difference between hs38 and hs38DH. ALT contigs,
+decoy and HLA genes in hs38DH improve variant calling and enable the analyses of
+ALT contigs and HLA typing at little cost.
+
+## Problems and Future Development
+
+There are some uncertainties about ALT mappings - we are not sure whether they
+help biological discovery and don't know the best way to analyze them. Without
+clear demand from downstream analyses, it is very difficult to design the
+optimal mapping strategy. The current BWA-MEM method is just a start. If it
+turns out to be useful in research, we will probably rewrite bwa-postalt.js in C
+for performance; if not, we may make changes. It is also possible that we might
+make breakthrough on the representation of multiple genomes, in which case, we
+can even get rid of ALT contigs for good.
+
+
+
+[res]: https://sourceforge.net/projects/bio-bwa/files/bwakit
+[sb]: https://github.com/GregoryFaust/samblaster
+[grc]: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/
+[novel]: https://gist.github.com/lh3/9935148b71f04ba1a8cc
+[blat]: https://genome.ucsc.edu/cgi-bin/hgBlat
+[blast]: http://blast.st-va.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome
+[sgdp]: http://www.simonsfoundation.org/life-sciences/simons-genome-diversity-project/
+[hladb]: http://www.ebi.ac.uk/ipd/imgt/hla/
+[grcdef]: http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/info/definitions.shtml
+[hla1]: http://biorxiv.org/content/early/2014/07/08/006973
+[hlalink]: http://www.hladiseaseassociations.com
+[hlatools]: https://www.biostars.org/p/93245/
+[hla2]: http://nar.oxfordjournals.org/content/41/14/e142.full.pdf+html
+[hla3]: http://www.biomedcentral.com/1471-2164/15/325
+[hla4]: http://genomemedicine.com/content/4/12/95
+[kithelp]: https://github.com/lh3/bwa/tree/master/bwakit
diff --git a/README.md b/README.md
index dff245e..9ac49bb 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
+[![Build Status](https://travis-ci.org/lh3/bwa.svg?branch=dev)](https://travis-ci.org/lh3/bwa)
+[![Build Status](https://drone.io/github.com/lh3/bwa/status.png)](https://drone.io/github.com/lh3/bwa/latest)
 ##Getting started
 
 	git clone https://github.com/lh3/bwa.git
@@ -30,6 +32,11 @@ SourceForge. After you acquire the source code, simply use `make` to compile
 and copy the single executable `bwa` to the destination you want. The only
 dependency required to build BWA is [zlib][14].
 
+Since 0.7.11, precompiled binary for x86\_64-linux is available in [bwakit][17].
+In addition to BWA, this self-consistent package also comes with bwa-associated
+and 3rd-party tools for proper BAM-to-FASTQ conversion, mapping to ALT contigs,
+adapter triming, duplicate marking, HLA typing and associated data files.
+
 ##Seeking helps
 
 The detailed usage is described in the man page available together with the
@@ -63,7 +70,8 @@ do not have plan to submit it to a peer-reviewed journal in the near future.
 3. [Does BWA work on reference sequences longer than 4GB in total?](#4gb)
 4. [Why can one read in a pair has high mapping quality but the other has zero?](#pe0)
 5. [How can a BWA-backtrack alignment stands out of the end of a chromosome?](#endref)
-6. [How to map sequences to GRCh38 with ALT contigs?](#h38)
+6. [Does BWA work with ALT contigs in the GRCh38 release?](#altctg)
+7. [Can I just run BWA-MEM against GRCh38+ALT without post-processing?](#postalt)
 
 ####<a name="type"></a>1. What types of data does BWA work with?
 
@@ -72,11 +80,11 @@ algorithm and setting may vary. The following list gives the recommended
 settings:
 
 * Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly
-  contigs up to a few megabases mapped to a close related reference genome:
+  contigs up to a few megabases mapped to a closely related reference genome:
 
 		bwa mem ref.fa reads.fq > aln.sam
 
-* Illumina single-end reads no longer than ~70bp:
+* Illumina single-end reads shorter than ~70bp:
 
 		bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam
 
@@ -84,24 +92,21 @@ settings:
 
 		bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
 
-* Illumina paired-end reads no longer than ~70bp:
+* Illumina paired-end reads shorter than ~70bp:
 
 		bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai
-		bwa samse ref.fa reads.sai reads.fq > aln-pe.sam
+		bwa sampe ref.fa read1.sai read2.sai read1.fq read2.fq > aln-pe.sam
 
-* PacBio subreads to a reference genome:
+* PacBio subreads or Oxford Nanopore reads to a reference genome:
 
 		bwa mem -x pacbio ref.fa reads.fq > aln.sam
-
-* PacBio subreads to themselves (the output is not SAM):
-
-		bwa mem -x pbread reads.fq reads.fq > overlap.pas
+		bwa mem -x ont2d ref.fa reads.fq > aln.sam
 
 BWA-MEM is recommended for query sequences longer than ~70bp for a variety of
 error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with
 errors given longer query sequences as the chance of missing all seeds is small.
-As is shown above, with non-default settings, BWA-MEM works with PacBio subreads
-with a sequencing error rate as high as ~15%.
+As is shown above, with non-default settings, BWA-MEM works with Oxford Nanopore
+reads with a sequencing error rate over 20%.
 
 ####<a name="multihit"></a>2. Why does a read appear multiple times in the output SAM?
 
@@ -130,37 +135,22 @@ case, BWA-backtrack will flag the read as unmapped (0x4), but you will see
 position, CIGAR and all the tags. A similar issue may occur to BWA-SW alignment
 as well. BWA-MEM does not have this problem.
 
-####<a name="h38"></a>6. How to map sequences to GRCh38 with ALT contigs?
-
-BWA-backtrack and BWA-MEM partially support mapping to a reference containing
-ALT contigs that represent alternative alleles highly divergent from the
-reference genome.
-
-	# download the K8 executable required by bwa-helper.js
-	wget http://sourceforge.net/projects/lh3/files/k8/k8-0.2.1.tar.bz2/download
-	tar -jxf k8-0.2.1.tar.bz2
-
-	# download the ALT-to-GRCh38 alignment in the SAM format
-	wget http://sourceforge.net/projects/bio-bwa/files/hs38.alt.sam.gz/download
-
-	# download the GRCh38 sequences with ALT contigs
-	wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
+####<a name="altctg"></a>6. Does BWA work with ALT contigs in the GRCh38 release?
 
-	# index and mapping
-	bwa index -p hs38a GCA_000001405.15_GRCh38_full_analysis_set.fna.gz
-	bwa mem -h50 hs38a reads.fq | ./k8-linux bwa-helper.js genalt hs38.alt.sam.gz > out.sam
+Yes, since 0.7.11, BWA-MEM officially supports mapping to GRCh38+ALT.
+BWA-backtrack and BWA-SW don't properly support ALT mapping as of now. Please
+see [README-alt.md][18] for details. Briefly, it is recommended to use
+[bwakit][17], the binary release of BWA, for generating the reference genome
+and for mapping.
 
-Here, option `-h50` asks bwa-mem to output multiple hits in the XA tag if the
-read has 50 or fewer hits. For each SAM line containing the XA tag,
-`bwa-helper.js genalt` decodes the alignments in the XA tag, groups hits lifted
-to the same chromosomal region, adjusts mapping quality and outputs all the
-hits overlapping the reported hit. A read may be mapped to both the primary
-assembly and one or more ALT contigs all with high mapping quality.
+####<a name="postalt"></a>7. Can I just run BWA-MEM against GRCh38+ALT without post-processing?
 
-Note that this procedure assumes reads are single-end and may miss hits to
-highly repetitive regions as these hits will not be reported with option
-`-h50`. `bwa-helper.js` is a prototype implementation not recommended for
-production uses.
+If you are not interested in hits to ALT contigs, it is okay to run BWA-MEM
+without post-processing. The alignments produced this way are very close to
+alignments against GRCh38 without ALT contigs. Nonetheless, applying
+post-processing helps to reduce false mappings caused by reads from the
+diverged part of ALT contigs and also enables HLA typing. It is recommended to
+run the post-processing script.
 
 
 
@@ -180,3 +170,5 @@ production uses.
 [14]: http://zlib.net/
 [15]: https://github.com/lh3/bwa/tree/mem
 [16]: ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/
+[17]: http://sourceforge.net/projects/bio-bwa/files/bwakit/
+[18]: https://github.com/lh3/bwa/blob/master/README-alt.md
diff --git a/bntseq.c b/bntseq.c
index eddae84..465e383 100644
--- a/bntseq.c
+++ b/bntseq.c
@@ -37,6 +37,9 @@
 #include "kseq.h"
 KSEQ_DECLARE(gzFile)
 
+#include "khash.h"
+KHASH_MAP_INIT_STR(str, int)
+
 #ifdef USE_MALLOC_WRAPPERS
 #  include "malloc_wrap.h"
 #endif
@@ -94,7 +97,7 @@ void bns_dump(const bntseq_t *bns, const char *prefix)
 
 bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename)
 {
-	char str[1024];
+	char str[8192];
 	FILE *fp;
 	const char *fname;
 	bntseq_t *bns;
@@ -117,14 +120,14 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c
 			if (scanres != 2) goto badread;
 			p->name = strdup(str);
 			// read fasta comments 
-			while (str - q < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
+			while (q - str < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c;
 			while (c != '\n' && c != EOF) c = fgetc(fp);
 			if (c == EOF) {
 				scanres = EOF;
 				goto badread;
 			}
 			*q = 0;
-			if (q - str > 1) p->anno = strdup(str + 1); // skip leading space
+			if (q - str > 1 && strcmp(str, " (null)") != 0) p->anno = strdup(str + 1); // skip leading space
 			else p->anno = strdup("");
 			// read the rest
 			scanres = fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs);
@@ -165,11 +168,41 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c
 
 bntseq_t *bns_restore(const char *prefix)
 {  
-	char ann_filename[1024], amb_filename[1024], pac_filename[1024];
+	char ann_filename[1024], amb_filename[1024], pac_filename[1024], alt_filename[1024];
+	FILE *fp;
+	bntseq_t *bns;
 	strcat(strcpy(ann_filename, prefix), ".ann");
 	strcat(strcpy(amb_filename, prefix), ".amb");
 	strcat(strcpy(pac_filename, prefix), ".pac");
-	return bns_restore_core(ann_filename, amb_filename, pac_filename);
+	bns = bns_restore_core(ann_filename, amb_filename, pac_filename);
+	if (bns == 0) return 0;
+	if ((fp = fopen(strcat(strcpy(alt_filename, prefix), ".alt"), "r")) != 0) { // read .alt file if present
+		char str[1024];
+		khash_t(str) *h;
+		int c, i, absent;
+		khint_t k;
+		h = kh_init(str);
+		for (i = 0; i < bns->n_seqs; ++i) {
+			k = kh_put(str, h, bns->anns[i].name, &absent);
+			kh_val(h, k) = i;
+		}
+		i = 0;
+		while ((c = fgetc(fp)) != EOF) {
+			if (c == '\t' || c == '\n' || c == '\r') {
+				str[i] = 0;
+				if (str[0] != '@') {
+					k = kh_get(str, h, str);
+					if (k != kh_end(h))
+						bns->anns[kh_val(h, k)].is_alt = 1;
+				}
+				while (c != '\n' && c != EOF) c = fgetc(fp);
+				i = 0;
+			} else str[i++] = c; // FIXME: potential segfault here
+		}
+		kh_destroy(str, h);
+		fclose(fp);
+	}
+	return bns;
 }
 
 void bns_destroy(bntseq_t *bns)
@@ -201,7 +234,7 @@ static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_
 	}
 	p = bns->anns + bns->n_seqs;
 	p->name = strdup((char*)seq->name.s);
-	p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)");
+	p->anno = seq->comment.l > 0? strdup((char*)seq->comment.s) : strdup("(null)");
 	p->gi = 0; p->len = seq->seq.l;
 	p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len;
 	p->n_ambs = 0;
@@ -333,8 +366,9 @@ int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re)
 {
 	int is_rev, rid_b, rid_e;
 	if (rb < bns->l_pac && re > bns->l_pac) return -2;
+	assert(rb <= re);
 	rid_b = bns_pos2rid(bns, bns_depos(bns, rb, &is_rev));
-	rid_e = bns_pos2rid(bns, bns_depos(bns, re, &is_rev) - 1);
+	rid_e = rb < re? bns_pos2rid(bns, bns_depos(bns, re - 1, &is_rev)) : rid_b;
 	return rid_b == rid_e? rid_b : -1;
 }
 
diff --git a/bntseq.h b/bntseq.h
index 6437cf6..03671d6 100644
--- a/bntseq.h
+++ b/bntseq.h
@@ -43,6 +43,7 @@ typedef struct {
 	int32_t len;
 	int32_t n_ambs;
 	uint32_t gi;
+	int32_t is_alt;
 	char *name, *anno;
 } bntann1_t;
 
diff --git a/bwa-helper.js b/bwa-helper.js
deleted file mode 100644
index 8a234b8..0000000
--- a/bwa-helper.js
+++ /dev/null
@@ -1,706 +0,0 @@
-/*****************************************************************
- * The K8 Javascript interpreter is required to run this script. *
- *                                                               *
- *   Source code: https://github.com/attractivechaos/k8          *
- *   Binary: http://sourceforge.net/projects/lh3/files/k8/       *
- *                                                               *
- * Data file used for generating GRCh38 ALT alignments:          *
- *                                                               *
- *   http://sourceforge.net/projects/bio-bwa/files/              *
- *****************************************************************/
-
-/******************
- *** From k8.js ***
- ******************/
-
-var getopt = function(args, ostr) {
-	var oli; // option letter list index
-	if (typeof(getopt.place) == 'undefined')
-		getopt.ind = 0, getopt.arg = null, getopt.place = -1;
-	if (getopt.place == -1) { // update scanning pointer
-		if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
-			getopt.place = -1;
-			return null;
-		}
-		if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
-			++getopt.ind;
-			getopt.place = -1;
-			return null;
-		}
-	}
-	var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
-	if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
-		if (optopt == '-') return null; //  if the user didn't specify '-' as an option, assume it means null.
-		if (getopt.place < 0) ++getopt.ind;
-		return '?';
-	}
-	if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
-		getopt.arg = null;
-		if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
-	} else { // need an argument
-		if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
-			getopt.arg = args[getopt.ind].substr(getopt.place);
-		else if (args.length <= ++getopt.ind) { // no arg
-			getopt.place = -1;
-			if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
-			return '?';
-		} else getopt.arg = args[getopt.ind]; // white space
-		getopt.place = -1;
-		++getopt.ind;
-	}
-	return optopt;
-}
-
-function obj2str(o)
-{
-	if (typeof(o) != 'object') {
-		return o.toString();
-	} else if (o == null) {
-		return "null";
-	} else if (Array.isArray(o)) {
-		var s = "[";
-		for (var i = 0; i < o.length; ++i) {
-			if (i) s += ',';
-			s += obj2str(o[i]);
-		}
-		return s + "]";
-	} else {
-		var i = 0, s = "{";
-		for (var key in o) {
-			if (i++) s += ',';
-			s += key + ":";
-			s += obj2str(o[key]);
-		}
-		return s + "}";
-	}
-}
-
-Bytes.prototype.reverse = function()
-{
-	for (var i = 0; i < this.length>>1; ++i) {
-		var tmp = this[i];
-		this[i] = this[this.length - i - 1];
-		this[this.length - i - 1] = tmp;
-	}
-}
-
-Bytes.prototype.revcomp = function()
-{
-	if (Bytes.rctab == null) {
-		var s1 = 'WSATUGCYRKMBDHVNwsatugcyrkmbdhvn';
-		var s2 = 'WSTAACGRYMKVHDBNwstaacgrymkvhdbn';
-		Bytes.rctab = [];
-		for (var i = 0; i < 256; ++i) Bytes.rctab[i] = 0;
-		for (var i = 0; i < s1.length; ++i)
-			Bytes.rctab[s1.charCodeAt(i)] = s2.charCodeAt(i);
-	}
-	for (var i = 0; i < this.length>>1; ++i) {
-		var tmp = this[this.length - i - 1];
-		this[this.length - i - 1] = Bytes.rctab[this[i]];
-		this[i] = Bytes.rctab[tmp];
-	}
-	if (this.length>>1)
-		this[this.length>>1] = Bytes.rctab[this[this.length>>1]];
-}
-
-/************************
- *** command markovlp ***
- ************************/
-
-function bwa_markOvlp(args)
-{
-	var c, min_aln_ratio = .9, min_ext = 50;
-	while ((c = getopt(args, "r:e:")) != null) {
-		if (c == 'r') min_aln_ratio = parseFloat(getopt.arg);
-		else if (c == 'e') min_ext = parseInt(getopt.arg);
-	}
-
-	var file = args.length == getopt.ind? new File() : new File(args[getopt.ind]);
-	var buf = new Bytes();
-	var dir4 = ['>>', '><', '<>', '<<'];
-
-	while (file.readline(buf) >= 0) {
-		var t = buf.toString().split("\t")
-			for (var i = 0; i < t.length; ++i)
-				if (i != 0 && i != 4)
-					t[i] = parseInt(t[i]);
-		var el, a1, a2, e1, e2, r1, r2; // a: aligned length; e: extended length; r: remaining length
-		e2 = a2 = t[7] - t[6];
-		if (t[2] < t[3]) { // forward-forward match
-			e1 = a1 = t[3] - t[2];
-			r1 = t[2] - t[6];
-			r2 = (t[5] - t[7]) - (t[1] - t[3]);
-			el = r1 > 0? t[6] : t[2];
-			el += r2 > 0? t[1] - t[3] : t[5] - t[7];
-		} else { // reverse-forward match
-			e1 = a1 = t[2] - t[3];
-			r1 = (t[1] - t[2]) - t[6];
-			r2 = (t[5] - t[7]) - t[3];
-			el = r1 > 0? t[6] : t[1] - t[2];
-			el += r2 > 0? t[3] : t[5] - t[7];
-		}
-		e1 += el; e2 += el;
-		var type;
-		if (a1 / e1 >= min_aln_ratio && a2 / e2 >= min_aln_ratio) {
-			if ((r1 >= min_ext && r2 >= min_ext) || (r1 <= -min_ext && r2 <= -min_ext)) { // suffix-prefix match
-				var d = t[2] < t[3]? 0 : 2;
-				if (r1 < 0) d ^= 3; // reverse the direction
-				type = 'O' + dir4[d];
-			} else type = 'C' + (e1 / t[1] > e2 / t[5]? 1 : 2);
-		} else type = 'I'; // internal local match; not a suffix-prefix match
-		//print(t[1], e1, a1, t[5], e2, a2);
-		print(type, buf);
-	}
-
-	buf.destroy();
-	file.close();
-}
-
-/***********************
- *** command pas2bed ***
- ***********************/
-
-function bwa_pas2reg(args)
-{
-	var file = args.length? new File(args[0]) : new File();
-	var buf = new Bytes();
-
-	while (file.readline(buf) >= 0) {
-		var t = buf.toString().split("\t");
-		if (t[0] == t[4]) continue;
-		if (parseInt(t[2]) < parseInt(t[3])) print(t[0], t[1], t[2], t[3], t[8]);
-		else print(t[0], t[1], t[3], t[2], t[8]);
-		print(t[4], t[5], t[6], t[7], t[8]);
-	}
-
-	buf.destroy();
-	file.close();
-}
-
-/*******************
- * command sam2pas *
- *******************/
-
-function bwa_sam2pas(args)
-{
-	var file = args.length == 0? new File() : new File(args[0]);
-	var buf = new Bytes();
-	var seq_dict = {};
-
-	while (file.readline(buf) >= 0) {
-		var line = buf.toString();
-		var m;
-		if (/^@SQ/.test(line)) {
-			var name = null, len = null;
-			if ((m = /\tSN:(\S+)/.exec(line)) != null) name = m[1];
-			if ((m = /\tLN:(\S+)/.exec(line)) != null) len = parseInt(m[1]);
-			if (name != null && len != null) seq_dict[name] = len;
-		}
-		if (/^@/.test(line)) continue;
-		var t = line.split("\t");
-		var pos = parseInt(t[3]) - 1;
-		var x = 0, y = 0, i = 0, clip = [0, 0], n_ins = 0, n_del = 0, o_ins = 0, o_del = 0, n_M = 0;
-		var re = /(\d+)([MIDSH])/g;
-		while ((m = re.exec(t[5])) != null) {
-			var l = parseInt(m[1]);
-			if (m[2] == 'M') x += l, y += l, n_M += l;
-			else if (m[2] == 'I') y += l, n_ins += l, ++o_ins;
-			else if (m[2] == 'D') x += l, n_del += l, ++o_del;
-			else if (m[2] == 'S' || m[2] == 'H')
-				clip[i == 0? 0 : 1] = l;
-			++i;
-		}
-		var is_rev = (parseInt(t[1]) & 16)? true : false;
-		var misc = 'mapQ=' + t[4] + ';';
-		var usc = 1;
-		if ((m = /\tNM:i:(\d+)/.exec(line)) != null) {
-			var NM = parseInt(m[1]);
-			var diff = (NM / (n_M + n_ins + n_del)).toFixed(3);
-			misc += 'diff=' + diff + ';n_mis=' + (NM - n_del - n_ins) + ';';
-		}
-		misc += 'n_del='+n_del+';n_ins='+n_ins+';o_del='+o_del+';o_ins='+o_ins + ';';
-		if ((m = /\tAS:i:(\d+)/.exec(line)) != null) {
-			misc += 'AS='+m[1] + ';';
-			usc = (parseInt(m[1]) / (x > y? x : y)).toFixed(3);
-		}
-		if ((m = /\tXS:i:(\d+)/.exec(line)) != null) misc += 'XS='+m[1] + ';';
-		var len = y + clip[0] + clip[1];
-		var z = [t[0], len, clip[0], clip[0] + y, t[2], seq_dict[t[2]], pos, pos + x, usc, misc];
-		if (parseInt(t[1]) & 16) z[2] = clip[1] + y, z[3] = clip[1];
-		print(z.join("\t"));
-	}
-
-	buf.destroy();
-	file.close();
-}
-
-/***********************
- *** command reg2cut ***
- ***********************/
-
-function bwa_reg2cut(args)
-{
-	var c, min_usc = 0.5, min_ext = 100, min_len = 5000, cut = 250;
-	while ((c = getopt(args, "s:e:l:c:")) != null) {
-		if (c == 's') min_usc = parseFloat(getopt.arg);
-		else if (c == 'e') min_ext = parseInt(getopt.arg);
-		else if (c == 'l') min_len = parseInt(getopt.arg);
-		else if (c == 'c') cut = parseInt(getopt.arg);
-	}
-
-	var file = args.length == getopt.ind? new File () : new File(args[getopt.ind]);
-	var buf = new Bytes();
-
-	function print_bed() {
-		for (var i = 0; i < a.length; ++i) {
-			var start = a[i][0] - cut > 0? a[i][0] : 0;
-			var end   = a[i][1] + cut < last_len? a[i][1] : last_len;
-			if (end - start >= min_len) print(last_chr, start, end);
-		}
-	}
-
-	var last_chr = null, last_len = null, max_c_usc = 0, start = 0, end = 0;
-	var a = [];
-	while (file.readline(buf) >= 0) {
-		var t = buf.toString().split("\t");
-		t[1] = parseInt(t[1]);
-		t[2] = parseInt(t[2]);
-		t[3] = parseInt(t[3]);
-		t[4] = parseFloat(t[4]);
-		var is_contained = t[2] < min_ext && t[1] - t[3] < min_ext? true : false;
-		if (t[3] - t[2] < cut<<1) continue;
-		t[2] += cut; t[3] -= cut;
-		if (t[0] != last_chr) {
-			a.push([start, end]);
-			if (last_chr != null && max_c_usc < min_usc) print_bed();
-			last_chr = t[0]; last_len = t[1]; start = t[2]; end = t[3];
-			max_c_usc = is_contained? t[4] : 0;
-			a.length = 0;
-		} else {
-			if (is_contained)
-				max_c_usc = max_c_usc > t[4]? max_c_usc : t[4];
-			if (t[4] < min_usc) continue;
-			if (t[2] > end) {
-				a.push([start, end]);
-				start = t[2];
-				end = end > t[3]? end : t[3];
-			} else end = end > t[3]? end : t[3];
-		}
-	}
-	a.push([start, end]);
-	if (max_c_usc < min_usc) print_bed(); // the last sequence
-
-	buf.destroy();
-	file.close();
-}
-
-function bwa_shortname(args)
-{
-	var file = args.length? new File(args[0]) : new File();
-	var buf = new Bytes();
-
-	var re = /(\S+)\/(\d+)_(\d+)((:\d+-\d+)+)/g;
-	var re2 = /:(\d+)-(\d+)/g;
-	while (file.readline(buf) >= 0) {
-		var match, match2;
-		var line = buf.toString();
-		var x = [];
-		while ((match = re.exec(line)) != null) {
-			var start = parseInt(match[2]), len = parseInt(match[3]) - start;
-			while ((match2 = re2.exec(match[4])) != null) {
-				var a = parseInt(match2[1]) - 1;
-				var b = parseInt(match2[2]);
-				start += a; len = b - a;
-			}
-			x.push([match[0], match[1] + '/' + start.toString() + '_' + (start+len).toString()]);
-		}
-		for (var i = 0; i < x.length; ++i)
-			line = line.replace(x[i][0], x[i][1]);
-		print(line);
-	}
-
-	buf.destroy();
-	file.close();
-}
-
-/*******************
- * Command gff2sam *
- *******************/
-
-function bwa_gff2sam(args)
-{
-	if (args.length < 2) {
-		print("Usage: k8 bwa-helper.js <aln.gff> <query-length.txt>");
-		exit(1);
-	}
-
-	var file = new File(args[1]);
-	var buf = new Bytes();
-	var len = {};
-
-	while (file.readline(buf) >= 0) {
-		var t = buf.toString().split(/\s+/);
-		len[t[0]] = parseInt(t[1]);
-	}
-	file.close();
-
-	file = new File(args[0]);
-	var re_cigar = /([MID])(\d+)/g;
-	var lineno = 0;
-	while (file.readline(buf) >= 0) {
-		++lineno;
-		var t = buf.toString().split("\t");
-		var m = /Target=(\S+)\s+(\d+)\s+(\d+)\s+([+-])/.exec(t[8]);
-		if (m == null) {
-			warn("WARNING: skipped line "+lineno+" due to the lack of Target.");
-			continue;
-		}
-		var qname = m[1];
-		var flag = t[6] == m[4]? 0 : 16;
-		var qb = parseInt(m[2]) - 1, qe = parseInt(m[3]), qlen = len[qname];
-		if (qlen == null)
-			throw Error("Sequence "+qname+" is not present in the query-length.txt");
-		var clip5 = qb, clip3 = qlen - qe;
-		if (flag&16) clip5 ^= clip3, clip3 ^= clip5, clip5 ^= clip3; // swap
-
-		m = /Gap\s*=\s*(([MID]\d+\s*)+)/.exec(t[8]);
-		var cigar = clip5? clip5 + 'S' : '';
-		var n_ins = 0, n_del = 0, n_match = 0, NM = null;
-		if (m) {
-			var mc;
-			while ((mc = re_cigar.exec(m[1])) != null) {
-				var l = parseInt(mc[2]);
-				cigar += mc[2] + mc[1];
-				if (mc[1] == 'I') n_ins += l;
-				else if (mc[1] == 'D') n_del += l;
-				else if (mc[1] == 'M') n_match += l;
-			}
-			if (n_ins + n_match != qe - qb || n_del + n_match != parseInt(t[4]) - parseInt(t[3]) + 1)
-				throw Error("Inconsistent CIGAR at line "+lineno);
-		} else { // ungapped alignment
-			var tb = parseInt(t[3]) - 1, te = parseInt(t[4]);
-			if (te - tb != qe - qb) {
-				warn("WARNING: line "+lineno+" should contain gaps, but lacks Gap. Skipped.\n");
-			} else cigar = (qe - qb) + 'M';
-		}
-		if (clip3) cigar += clip3 + 'S';
-		if ((m = /num_mismatch=(\d+)/.exec(t[8])) != null)
-			NM = parseInt(m[1]) + n_ins + n_del;
-		var out = [qname, flag, t[0], t[3], 255, cigar, '*', 0, 0, '*', '*'];
-		if (NM != null) out.push('NM:i:' + NM);
-		print(out.join("\t"));
-	}
-	buf.destroy();
-	file.close();
-}
-
-
-/******************************
- *** Generate ALT alignment ***
- ******************************/
-
-function intv_ovlp(intv, bits) // interval index
-{
-	if (typeof bits == "undefined") bits = 13;
-	intv.sort(function(a,b) {return a[0]-b[0];});
-	// create the index
-	var idx = [], max = 0;
-	for (var i = 0; i < intv.length; ++i) {
-		var b = intv[i][0]>>bits;
-		var e = (intv[i][1]-1)>>bits;
-		if (b != e) {
-			for (var j = b; j <= e; ++j)
-				if (idx[j] == null) idx[j] = i;
-		} else if (idx[b] == null) idx[b] = i;
-		max = max > e? max : e;
-	}
-	// closure
-	return function(_b, _e) {
-		var x = _b >> bits;
-		if (x > max) return [];
-		var off = idx[x];
-		if (off == null) {
-			var i;
-			for (i = ((_e - 1) >> bits) - 1; i >= 0; --i)
-				if (idx[i] != null) break;
-			off = i < 0? 0 : idx[i];
-		}
-		var ovlp = [];
-		for (var i = off; i < intv.length && intv[i][0] < _e; ++i)
-			if (intv[i][1] > _b) ovlp.push(intv[i]);
-		return ovlp;
-	}
-}
-
-function bwa_genalt(args)
-{
-	var re_cigar = /(\d+)([MIDSHN])/g;
-
-	function cigar2pos(cigar, pos) // given a pos on ALT and the ALT-to-REF CIGAR, find the pos on REF
-	{
-		var x = 0, y = 0;
-		for (var i = 0; i < cigar.length; ++i) {
-			var op = cigar[i][0], len = cigar[i][1];
-			if (op == 'M') {
-				if (y <= pos && pos < y + len)
-					return x + (pos - y);
-				x += len, y += len;
-			} else if (op == 'D') {
-				x += len;
-			} else if (op == 'I') {
-				if (y <= pos && pos < y + len)
-					return x;
-				y += len;
-			} else if (op == 'S' || op == 'H') {
-				if (y <= pos && pos < y + len)
-					return -1;
-				y += len;
-			}
-		}
-		return -1;
-	}
-
-	function parse_hit(s, opt) // parse a hit. s looks something like ["chr1", "+12345", "100M", 5]
-	{
-		var h = {};
-		h.ctg = s[0];
-		h.start = parseInt(s[1].substr(1)) - 1;
-		h.rev = (s[1].charAt(0) == '-');
-		h.cigar = s[2];
-		h.NM = parseInt(s[3]);
-		var m, l_ins, n_ins, l_del, n_del, l_match, l_skip, l_clip;
-		l_ins = l_del = n_ins = n_del = l_match = l_skip = l_clip = 0;
-		while ((m = re_cigar.exec(h.cigar)) != null) {
-			var l = parseInt(m[1]);
-			if (m[2] == 'M') l_match += l;
-			else if (m[2] == 'D') ++n_del, l_del += l;
-			else if (m[2] == 'I') ++n_ins, l_ins += l;
-			else if (m[2] == 'N') l_skip += l;
-			else if (m[2] == 'H' || m[2] == 'S') l_clip += l;
-		}
-		h.end = h.start + l_match + l_del + l_skip;
-		h.NM = h.NM > l_del + l_ins? h.NM : l_del + l_ins;
-		h.score = Math.floor((opt.a * l_match - (opt.a + opt.b) * (h.NM - l_del - l_ins) - opt.o * (n_del + n_ins) - opt.e * (l_del + l_ins)) / opt.a + .499);
-		h.l_query = l_match + l_ins + l_clip;
-		return h;
-	}
-
-	var c, opt = { a:1, b:4, o:6, e:1, verbose:3 };
-
-	while ((c = getopt(args, 'v:')) != null) {
-		if (c == 'v') opt.verbose = parseInt(getopt.arg);
-	}
-
-	if (args.length == getopt.ind) {
-		print("Usage: k8 bwa-helper.js genalt <alt.sam> [aln.sam]");
-		exit(1);
-	}
-
-	var file, buf = new Bytes();
-	var aux = new Bytes(); // used for reverse and reverse complement
-
-	// read the ALT-to-REF alignment and generate the index
-	var intv = {};
-	file = new File(args[getopt.ind]);
-	while (file.readline(buf) >= 0) {
-		var line = buf.toString();
-		var t = line.split("\t");
-		if (line.charAt(0) == '@') continue;
-		var flag = parseInt(t[1]);
-		var m, cigar = [], l_qaln = 0, l_qclip = 0;
-		while ((m = re_cigar.exec(t[5])) != null) {
-			var l = parseInt(m[1]);
-			cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip
-			if (m[2] == 'M' || m[2] == 'I') l_qaln += l;
-			else if (m[2] == 'S' || m[2] == 'H') l_qclip += l;
-		}
-		var j = flag&16? cigar.length-1 : 0;
-		var start = cigar[j][0] == 'S'? cigar[j][1] : 0;
-		if (intv[t[0]] == null) intv[t[0]] = [];
-		intv[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, parseInt(t[3]) - 1, cigar]);
-		//print(start, start + l_qaln, t[2], flag&16? true : false, parseInt(t[3]), cigar);
-	}
-	file.close();
-	// create the interval index
-	var idx = {};
-	for (var ctg in intv)
-		idx[ctg] = intv_ovlp(intv[ctg]);
-
-	// process SAM
-	file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File();
-	while (file.readline(buf) >= 0) {
-		var m, line = buf.toString();
-		if (line.charAt(0) == '@' || (m = /\tXA:Z:(\S+)/.exec(line)) == null) { // TODO: this does not work with PE file
-			if (opt.verbose < 4) print(line);
-			continue;
-		}
-
-		// parse hits
-		var hits = [];
-		var XA_strs = m[1].split(";");
-		var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1];
-		var t = line.split("\t");
-		var flag = parseInt(t[1]);
-		hits.push(parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt));
-		for (var i = 0; i < XA_strs.length; ++i) // hits in the XA tag
-			if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string
-				hits.push(parse_hit(XA_strs[i].split(","), opt));
-
-		// lift mapping positions to coordinates on the primary assembly
-		var n_lifted = 0;
-		for (var i = 0; i < hits.length; ++i) {
-			var h = hits[i];
-
-			if (idx[h.ctg] == null) continue;
-			var a = idx[h.ctg](h.start, h.end);
-			if (a == null || a.length == 0) continue;
-
-			// find the approximate position on the primary assembly
-			var lifted = [];
-			for (var j = 0; j < a.length; ++j) {
-				var s, e;
-				if (!a[j][4]) { // ALT is mapped to the forward strand of the primary assembly
-					s = cigar2pos(a[j][6], h.start);
-					e = cigar2pos(a[j][6], h.end - 1) + 1;
-				} else {
-					s = cigar2pos(a[j][6], a[j][2] - h.end);
-					e = cigar2pos(a[j][6], a[j][2] - h.start - 1) + 1;
-				}
-				if (s < 0 || e < 0) continue; // read is mapped to clippings in the ALT-to-chr alignment
-				s += a[j][5]; e += a[j][5];
-				lifted.push([a[j][3], (h.rev!=a[j][4]), s, e]);
-			}
-			if (lifted.length) ++n_lifted, hits[i].lifted = lifted;
-		}
-		if (n_lifted == 0) {
-			if (opt.verbose < 4) print(line);
-			continue;
-		}
-
-		// group hits
-		for (var i = 0; i < hits.length; ++i) { // set keys for sorting
-			if (hits[i].lifted && hits[i].lifted.length) // TODO: only the first element in lifted[] is used
-				hits[i].pctg = hits[i].lifted[0][0], hits[i].pstart = hits[i].lifted[0][2], hits[i].pend = hits[i].lifted[0][3];
-			else hits[i].pctg = hits[i].ctg, hits[i].pstart = hits[i].start, hits[i].pend = hits[i].end;
-			hits[i].i = i; // keep the original index
-		}
-		hits.sort(function(a,b) { return a.pctg != b.pctg? (a.pctg < b.pctg? -1 : 1) : a.pstart - b.pstart });
-		var last_chr = null, end = 0, g = -1;
-		for (var i = 0; i < hits.length; ++i) {
-			if (last_chr != hits[i].pctg) ++g, last_chr = hits[i].pctg, end = 0;
-			else if (hits[i].pstart >= end) ++g;
-			hits[i].g = g;
-			end = end > hits[i].pend? end : hits[i].pend;
-		}
-		var reported_g = null, reported_i = null;
-		for (var i = 0; i < hits.length; ++i)
-			if (hits[i].i == 0)
-				reported_g = hits[i].g, reported_i = i;
-		var n_group0 = 0; // #hits overlapping the reported hit
-		for (var i = 0; i < hits.length; ++i)
-			if (hits[i].g == reported_g)
-				++n_group0;
-		if (n_group0 == 1) { // then keep the reported alignment and mapQ
-			if (opt.verbose < 4) print(line);
-			continue;
-		}
-
-		// re-estimate mapQ
-		var group_max = [];
-		for (var i = 0; i < hits.length; ++i) {
-			var g = hits[i].g;
-			if (group_max[g] == null || group_max[g][0] < hits[i].score)
-				group_max[g] = [hits[i].score, g];
-		}
-		if (group_max.length > 1)
-			group_max.sort(function(x,y) {return y[0]-x[0]});
-		var mapQ;
-		if (group_max[0][1] == reported_g) { // the best hit is the hit reported in SAM
-			mapQ = group_max.length == 1? 60 : 6 * (group_max[0][0] - group_max[1][0]);
-		} else mapQ = 0;
-		mapQ = mapQ < 60? mapQ : 60;
-		var ori_mapQ = parseInt(t[4]);
-		mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ;
-
-		// generate lifted_str
-		for (var i = 0; i < hits.length; ++i) {
-			if (hits[i].lifted && hits[i].lifted.length) {
-				var lifted = hits[i].lifted;
-				var u = '';
-				for (var j = 0; j < lifted.length; ++j)
-					u += lifted[j][0] + "," + lifted[j][2] + "," + lifted[j][3] + "," + (lifted[j][1]?'-':'+') + ";";
-				hits[i].lifted_str = u;
-			}
-		}
-
-		// generate reversed quality and reverse-complemented sequence if necessary
-		var rs = null, rq = null; // reversed quality and reverse complement sequence
-		var need_rev = false;
-		for (var i = 0; i < hits.length; ++i) {
-			if (hits[i].g != reported_g || i == reported_i) continue;
-			if (hits[i].rev != hits[reported_i].rev)
-				need_rev = true;
-		}
-		if (need_rev) { // reverse and reverse complement
-			aux.set(t[9], 0); aux.revcomp(); rs = aux.toString();
-			aux.set(t[10],0); aux.reverse(); rq = aux.toString();
-		}
-
-		// print
-		t[4] = mapQ;
-		t.push("om:i:"+ori_mapQ);
-		if (hits[reported_i].lifted_str) t.push("lt:Z:" + hits[reported_i].lifted_str);
-		print(t.join("\t"));
-		var cnt = 0;
-		for (var i = 0; i < hits.length; ++i) {
-			if (opt.verbose >= 5) print(obj2str(hits[i]));
-			if (hits[i].g != reported_g || i == reported_i) continue;
-			var s = [t[0], flag&0xf10, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, '*', 0, 0];
-			// update name
-			if (flag&0x40) s[0] += "/1";
-			if (flag&0x80) s[0] += "/2";
-			s[0] += "_" + (++cnt);
-			if (hits[i].rev == hits[reported_i].rev) s.push(t[9], t[10]);
-			else s.push(rs, rq);
-			s.push("NM:i:" + hits[i].NM);
-			if (hits[i].lifted_str) s.push("lt:Z:" + hits[i].lifted_str);
-			print(s.join("\t"));
-		}
-	}
-	file.close();
-
-	aux.destroy();
-	buf.destroy();
-}
-
-/*********************
- *** Main function ***
- *********************/
-
-function main(args)
-{
-	if (args.length == 0) {
-		print("\nUsage:    k8 bwa-helper.js <command> [arguments]\n");
-		print("Commands: genalt       generate ALT alignments");
-		print("          sam2pas      convert SAM to pairwise alignment summary format (PAS)");
-		print("          pas2reg      extract covered regions");
-		print("          reg2cut      regions to extract for the 2nd round bwa-mem");
-		print("          markovlp     identify bi-directional overlaps");
-		print("          gff2sam      convert GFF3 alignment to SAM");
-		print("          shortname    shorten sequence name after subseq (PacBio read names only)");
-		print("");
-		exit(1);
-	}
-
-	var cmd = args.shift();
-	if (cmd == 'sam2pas') bwa_sam2pas(args);
-	else if (cmd == 'gff2sam') bwa_gff2sam(args);
-	else if (cmd == 'markovlp') bwa_markOvlp(args);
-	else if (cmd == 'pas2reg') bwa_pas2reg(args);
-	else if (cmd == 'reg2cut') bwa_reg2cut(args);
-	else if (cmd == 'genalt') bwa_genalt(args);
-	else if (cmd == 'shortname') bwa_shortname(args);
-	else warn("Unrecognized command");
-}
-
-main(arguments);
diff --git a/bwa.1 b/bwa.1
index ba8bc9e..e30f382 100644
--- a/bwa.1
+++ b/bwa.1
@@ -1,4 +1,4 @@
-.TH bwa 1 "19 May 2014" "bwa-0.7.9-r783" "Bioinformatics tools"
+.TH bwa 1 "23 December 2014" "bwa-0.7.12-r1034" "Bioinformatics tools"
 .SH NAME
 .PP
 bwa - Burrows-Wheeler Alignment Tool
@@ -75,7 +75,7 @@ appropriate algorithm will be chosen automatically.
 .TP
 .B mem
 .B bwa mem
-.RB [ -aCHMpP ]
+.RB [ -aCHjMpP ]
 .RB [ -t
 .IR nThreads ]
 .RB [ -k
@@ -88,6 +88,12 @@ appropriate algorithm will be chosen automatically.
 .IR seedSplitRatio ]
 .RB [ -c
 .IR maxOcc ]
+.RB [ -D
+.IR chainShadow ]
+.RB [ -m
+.IR maxMateSW ]
+.RB [ -W
+.IR minSeedMatch ]
 .RB [ -A
 .IR matchScore ]
 .RB [ -B
@@ -102,6 +108,8 @@ appropriate algorithm will be chosen automatically.
 .IR unpairPen ]
 .RB [ -R
 .IR RGline ]
+.RB [ -H
+.IR HDlines ]
 .RB [ -v
 .IR verboseLevel ]
 .I db.prefix
@@ -193,9 +201,28 @@ Discard a MEM if it has more than
 .I INT
 occurence in the genome. This is an insensitive parameter. [500]
 .TP
+.BI -D \ INT
+Drop chains shorter than
+.I FLOAT
+fraction of the longest overlapping chain [0.5]
+.TP
+.BI -m \ INT
+Perform at most
+.I INT
+rounds of mate-SW [50]
+.TP
+.BI -W \ INT
+Drop a chain if the number of bases in seeds is smaller than
+.IR INT .
+This option is primarily used for longer contigs/reads. When positive, it also
+affects seed filtering. [0]
+.TP
 .B -P
 In the paired-end mode, perform SW to rescue missing hits only but do not try to find
 hits that fit a proper pair.
+
+.TP
+.B SCORING OPTIONS:
 .TP
 .BI -A \ INT
 Matching score. [1]
@@ -233,8 +260,9 @@ more aggressive read pair. [17]
 .B INPUT/OUTPUT OPTIONS:
 .TP
 .B -p
-Assume the first input query file is interleaved paired-end FASTA/Q. See the
-command description for details.
+Smart pairing. If two adjacent reads have the same name, they are considered
+to form a read pair. This way, paired-end and single-end reads can be mixed
+in a single FASTA/Q stream.
 .TP
 .BI -R \ STR
 Complete read group header line. '\\t' can be used in
@@ -243,15 +271,30 @@ and will be converted to a TAB in the output SAM. The read group ID will be
 attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'.
 [null]
 .TP
+.BI -H \ ARG
+If ARG starts with @, it is interpreted as a string and gets inserted into the
+output SAM header; otherwise, ARG is interpreted as a file with all lines
+starting with @ in the file inserted into the SAM header. [null]
+.TP
 .BI -T \ INT
 Don't output alignment with score lower than
 .IR INT .
 This option affects output and occasionally SAM flag 2. [30]
 .TP
-.BI -h \ INT
+.BI -j
+Treat ALT contigs as part of the primary assembly (i.e. ignore the
+.I db.prefix.alt
+file).
+.TP
+.BI -h \ INT[,INT2]
 If a query has not more than
 .I INT
-hits with score higher than 80% of the best hit, output them all in the XA tag [5]
+hits with score higher than 80% of the best hit, output them all in the XA tag.
+If
+.I INT2
+is specified, BWA-MEM outputs up to
+.I INT2
+hits if the list contains a hit to an ALT contig. [5,200]
 .TP
 .B -a
 Output all found alignments for single-end or unpaired paired-end reads. These
@@ -571,6 +614,7 @@ R	0x0020	strand of the mate
 s	0x0100	the alignment is not primary
 f	0x0200	QC failure
 d	0x0400	optical or PCR duplicate
+S	0x0800	supplementary alignment
 .TE
 
 .PP
@@ -604,8 +648,6 @@ _
 XS	Suboptimal alignment score
 XF	Support from forward/reverse alignment
 XE	Number of supporting seeds
-_
-XP	Alt primary hits; format: /(chr,pos,CIGAR,mapQ,NM;)+/
 .TE
 
 .PP
diff --git a/bwa.c b/bwa.c
index 43d8a58..f9ce6a1 100644
--- a/bwa.c
+++ b/bwa.c
@@ -7,6 +7,7 @@
 #include "ksw.h"
 #include "utils.h"
 #include "kstring.h"
+#include "kvec.h"
 
 #ifdef USE_MALLOC_WRAPPERS
 #  include "malloc_wrap.h"
@@ -14,6 +15,7 @@
 
 int bwa_verbose = 3;
 char bwa_rg_id[256];
+char *bwa_pg;
 
 /************************
  * Batch FASTA/Q reader *
@@ -54,10 +56,12 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
 		}
 		trim_readno(&ks->name);
 		kseq2bseq1(ks, &seqs[n]);
+		seqs[n].id = n;
 		size += seqs[n++].l_seq;
 		if (ks2) {
 			trim_readno(&ks2->name);
 			kseq2bseq1(ks2, &seqs[n]);
+			seqs[n].id = n;
 			size += seqs[n++].l_seq;
 		}
 		if (size >= chunk_size && (n&1) == 0) break;
@@ -70,6 +74,24 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
 	return seqs;
 }
 
+void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2])
+{
+	int i, has_last;
+	kvec_t(bseq1_t) a[2] = {{0,0,0}, {0,0,0}};
+	for (i = 1, has_last = 1; i < n; ++i) {
+		if (has_last) {
+			if (strcmp(seqs[i].name, seqs[i-1].name) == 0) {
+				kv_push(bseq1_t, a[1], seqs[i-1]);
+				kv_push(bseq1_t, a[1], seqs[i]);
+				has_last = 0;
+			} else kv_push(bseq1_t, a[0], seqs[i-1]);
+		} else has_last = 1;
+	}
+	if (has_last) kv_push(bseq1_t, a[0], seqs[i-1]);
+	sep[0] = a[0].a, m[0] = a[0].n;
+	sep[1] = a[1].a, m[1] = a[1].n;
+}
+
 /*****************
  * CIGAR related *
  *****************/
@@ -227,7 +249,7 @@ bwt_t *bwa_idx_load_bwt(const char *hint)
 	return bwt;
 }
 
-bwaidx_t *bwa_idx_load(const char *hint, int which)
+bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
 {
 	bwaidx_t *idx;
 	char *prefix;
@@ -239,7 +261,12 @@ bwaidx_t *bwa_idx_load(const char *hint, int which)
 	idx = calloc(1, sizeof(bwaidx_t));
 	if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
 	if (which & BWA_IDX_BNS) {
+		int i, c;
 		idx->bns = bns_restore(prefix);
+		for (i = c = 0; i < idx->bns->n_seqs; ++i)
+			if (idx->bns->anns[i].is_alt) ++c;
+		if (bwa_verbose >= 3)
+			fprintf(stderr, "[M::%s] read %d ALT contigs\n", __func__, c);
 		if (which & BWA_IDX_PAC) {
 			idx->pac = calloc(idx->bns->l_pac/4+1, 1);
 			err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
@@ -251,27 +278,113 @@ bwaidx_t *bwa_idx_load(const char *hint, int which)
 	return idx;
 }
 
+bwaidx_t *bwa_idx_load(const char *hint, int which)
+{
+	return bwa_idx_load_from_disk(hint, which);
+}
+
 void bwa_idx_destroy(bwaidx_t *idx)
 {
 	if (idx == 0) return;
-	if (idx->bwt) bwt_destroy(idx->bwt);
-	if (idx->bns) bns_destroy(idx->bns);
-	if (idx->pac) free(idx->pac);
+	if (idx->mem == 0) {
+		if (idx->bwt) bwt_destroy(idx->bwt);
+		if (idx->bns) bns_destroy(idx->bns);
+		if (idx->pac) free(idx->pac);
+	} else {
+		free(idx->bwt); free(idx->bns->anns); free(idx->bns);
+		if (!idx->is_shm) free(idx->mem);
+	}
 	free(idx);
 }
 
+int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx)
+{
+	int64_t k = 0, x;
+	int i;
+
+	// generate idx->bwt
+	x = sizeof(bwt_t); idx->bwt = malloc(x); memcpy(idx->bwt, mem + k, x); k += x;
+	x = idx->bwt->bwt_size * 4; idx->bwt->bwt = (uint32_t*)(mem + k); k += x;
+	x = idx->bwt->n_sa * sizeof(bwtint_t); idx->bwt->sa = (bwtint_t*)(mem + k); k += x;
+
+	// generate idx->bns and idx->pac
+	x = sizeof(bntseq_t); idx->bns = malloc(x); memcpy(idx->bns, mem + k, x); k += x;
+	x = idx->bns->n_holes * sizeof(bntamb1_t); idx->bns->ambs = (bntamb1_t*)(mem + k); k += x;
+	x = idx->bns->n_seqs  * sizeof(bntann1_t); idx->bns->anns = malloc(x); memcpy(idx->bns->anns, mem + k, x); k += x;
+	for (i = 0; i < idx->bns->n_seqs; ++i) {
+		idx->bns->anns[i].name = (char*)(mem + k); k += strlen(idx->bns->anns[i].name) + 1;
+		idx->bns->anns[i].anno = (char*)(mem + k); k += strlen(idx->bns->anns[i].anno) + 1;
+	}
+	idx->pac = (uint8_t*)(mem + k); k += idx->bns->l_pac/4+1;
+	assert(k == l_mem);
+
+	idx->l_mem = k; idx->mem = mem;
+	return 0;
+}
+
+int bwa_idx2mem(bwaidx_t *idx)
+{
+	int i;
+	int64_t k, x, tmp;
+	uint8_t *mem;
+
+	// copy idx->bwt
+	x = idx->bwt->bwt_size * 4;
+	mem = realloc(idx->bwt->bwt, sizeof(bwt_t) + x); idx->bwt->bwt = 0;
+	memmove(mem + sizeof(bwt_t), mem, x);
+	memcpy(mem, idx->bwt, sizeof(bwt_t)); k = sizeof(bwt_t) + x;
+	x = idx->bwt->n_sa * sizeof(bwtint_t); mem = realloc(mem, k + x); memcpy(mem + k, idx->bwt->sa, x); k += x;
+	free(idx->bwt->sa);
+	free(idx->bwt); idx->bwt = 0;
+
+	// copy idx->bns
+	tmp = idx->bns->n_seqs * sizeof(bntann1_t) + idx->bns->n_holes * sizeof(bntamb1_t);
+	for (i = 0; i < idx->bns->n_seqs; ++i) // compute the size of heap-allocated memory
+		tmp += strlen(idx->bns->anns[i].name) + strlen(idx->bns->anns[i].anno) + 2;
+	mem = realloc(mem, k + sizeof(bntseq_t) + tmp);
+	x = sizeof(bntseq_t); memcpy(mem + k, idx->bns, x); k += x;
+	x = idx->bns->n_holes * sizeof(bntamb1_t); memcpy(mem + k, idx->bns->ambs, x); k += x;
+	free(idx->bns->ambs);
+	x = idx->bns->n_seqs * sizeof(bntann1_t); memcpy(mem + k, idx->bns->anns, x); k += x;
+	for (i = 0; i < idx->bns->n_seqs; ++i) {
+		x = strlen(idx->bns->anns[i].name) + 1; memcpy(mem + k, idx->bns->anns[i].name, x); k += x;
+		x = strlen(idx->bns->anns[i].anno) + 1; memcpy(mem + k, idx->bns->anns[i].anno, x); k += x;
+		free(idx->bns->anns[i].name); free(idx->bns->anns[i].anno);
+	}
+	free(idx->bns->anns);
+
+	// copy idx->pac
+	x = idx->bns->l_pac/4+1;
+	mem = realloc(mem, k + x);
+	memcpy(mem + k, idx->pac, x); k += x;
+	free(idx->bns); idx->bns = 0;
+	free(idx->pac); idx->pac = 0;
+
+	return bwa_mem2idx(k, mem, idx);
+}
+
 /***********************
  * SAM header routines *
  ***********************/
 
-void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line)
+void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line)
 {
-	int i;
+	int i, n_SQ = 0;
 	extern char *bwa_pg;
-	for (i = 0; i < bns->n_seqs; ++i)
-		err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
-	if (rg_line) err_printf("%s\n", rg_line);
-	err_printf("%s\n", bwa_pg);
+	if (hdr_line) {
+		const char *p = hdr_line;
+		while ((p = strstr(p, "@SQ\t")) != 0) {
+			if (p == hdr_line || *(p-1) == '\n') ++n_SQ;
+			p += 4;
+		}
+	}
+	if (n_SQ == 0) {
+		for (i = 0; i < bns->n_seqs; ++i)
+			err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
+	} else if (n_SQ != bns->n_seqs && bwa_verbose >= 2)
+		fprintf(stderr, "[W::%s] %d @SQ lines provided with -H; %d sequences in the index. Continue anyway.\n", __func__, n_SQ, bns->n_seqs);
+	if (hdr_line) err_printf("%s\n", hdr_line);
+	if (bwa_pg) err_printf("%s\n", bwa_pg);
 }
 
 static char *bwa_escape(char *s)
@@ -319,3 +432,16 @@ err_set_rg:
 	return 0;
 }
 
+char *bwa_insert_header(const char *s, char *hdr)
+{
+	int len = 0;
+	if (s == 0 || s[0] != '@') return hdr;
+	if (hdr) {
+		len = strlen(hdr);
+		hdr = realloc(hdr, len + strlen(s) + 2);
+		hdr[len++] = '\n';
+		strcpy(hdr + len, s);
+	} else hdr = strdup(s);
+	bwa_escape(hdr + len);
+	return hdr;
+}
diff --git a/bwa.h b/bwa.h
index bbc2525..1541c1c 100644
--- a/bwa.h
+++ b/bwa.h
@@ -10,14 +10,20 @@
 #define BWA_IDX_PAC 0x4
 #define BWA_IDX_ALL 0x7
 
+#define BWA_CTL_SIZE 0x10000
+
 typedef struct {
 	bwt_t    *bwt; // FM-index
 	bntseq_t *bns; // information on the reference sequences
 	uint8_t  *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
+
+	int    is_shm;
+	int64_t l_mem;
+	uint8_t  *mem;
 } bwaidx_t;
 
 typedef struct {
-	int l_seq;
+	int l_seq, id;
 	char *name, *comment, *seq, *qual, *sam;
 } bseq1_t;
 
@@ -29,6 +35,7 @@ extern "C" {
 #endif
 
 	bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
+	void bseq_classify(int n, bseq1_t *seqs, int m[2], bseq1_t *sep[2]);
 
 	void bwa_fill_scmat(int a, int b, int8_t mat[25]);
 	uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
@@ -37,11 +44,16 @@ extern "C" {
 	char *bwa_idx_infer_prefix(const char *hint);
 	bwt_t *bwa_idx_load_bwt(const char *hint);
 
+	bwaidx_t *bwa_idx_load_from_shm(const char *hint);
+	bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which);
 	bwaidx_t *bwa_idx_load(const char *hint, int which);
 	void bwa_idx_destroy(bwaidx_t *idx);
+	int bwa_idx2mem(bwaidx_t *idx);
+	int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx);
 
-	void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line);
+	void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line);
 	char *bwa_set_rg(const char *s);
+	char *bwa_insert_header(const char *s, char *hdr);
 
 #ifdef __cplusplus
 }
diff --git a/bwakit/README.md b/bwakit/README.md
new file mode 100644
index 0000000..b3a4033
--- /dev/null
+++ b/bwakit/README.md
@@ -0,0 +1,115 @@
+## Introduction
+
+Bwakit is a self-consistent installation-free package of scripts and precompiled
+binaries, providing an end-to-end solution to read mapping. In addition to the
+basic mapping functionality implemented in bwa, bwakit is able to generate
+proper human reference genome and to take advantage of ALT contigs, if present,
+to improve read mapping and to perform HLA typing for high-coverage human data.
+It can remap name- or coordinate-sorted BAM with read group and barcode
+information retained. Bwakit also *optionally* trims adapters (via
+[trimadap][ta]), marks duplicates (via [samblaster][sb]) and sorts the final
+alignment (via [samtools][smtl]).
+
+Bwakit has two entry scripts: `run-gen-ref` which downloads and generates human
+reference genomes, and `run-bwamem` which prints mapping command lines on the
+standard output that can be piped to `sh` to execute. The two scripts will call
+other programs or use data in `bwa.kit`. The following shows an example about
+how to use bwakit:
+
+```sh
+# Download the bwa-0.7.11 binary package (download link may change)
+wget -O- http://sourceforge.net/projects/bio-bwa/files/bwakit/bwakit-0.7.11_x64-linux.tar.bz2/download \
+  | gzip -dc | tar xf -
+# Generate the GRCh38+ALT+decoy+HLA and create the BWA index
+bwa.kit/run-gen-ref hs38DH   # download GRCh38 and write hs38DH.fa
+bwa.kit/bwa index hs38DH.fa  # create BWA index
+# mapping
+bwa.kit/run-bwamem -o out -H hs38DH.fa read1.fq read2.fq | sh
+```
+
+The last mapping command line will generate the following files:
+
+* `out.aln.bam`: unsorted alignments with ALT-aware mapping quality. In this
+  file, one read may be placed on multiple overlapping ALT contigs at the same
+  time even if the read is mapped better to some contigs than others. This makes
+  it possible to analyze each contig independent of others.
+
+* `out.hla.top`: best genotypes for HLA-A, -B, -C, -DQA1, -DQB1 and -DRB1 genes.
+
+* `out.hla.all`: other possible genotypes on the six HLA genes.
+
+* `out.log.*`: bwa-mem, samblaster and HLA typing log files.
+
+Bwakit can be [downloaded here][res]. It is only available to x86_64-linux. The
+scripts in the package are available in the [bwa/bwakit][kit] directory.
+Packaging is done manually for now.
+
+## Limitations
+
+* HLA typing only works for high-coverage human data. The typing accuracy can
+  still be improved. We encourage researchers to develop better HLA typing tools
+  based on the intermediate output of bwakit (for each HLA gene included in the
+  index, bwakit writes all reads matching it in a separate file).
+
+* Duplicate marking only works when all reads from a single paired-end library
+  are provided as the input. This limitation is the necessary tradeoff of fast
+  MarkDuplicate provided by samblaster.
+
+* The adapter trimmer is chosen as it is fast, pipe friendly and does not
+  discard reads. However, it is conservative and suboptimal. If this is a
+  concern, it is recommended to preprocess input reads with a more sophisticated
+  adapter trimmer. We also hope existing trimmers can be modified to operate on
+  an interleaved FASTQ stream. We will replace trimadap once a better trimmer
+  meets our needs.
+
+* Bwakit can be memory demanding depends on the functionality invoked. For 30X
+  human data, bwa-mem takes about 11GB RAM with 32 threads, samblaster uses
+  close to 10GB and BAM shuffling (if the input is sorted BAM) uses several GB.
+  In the current setting, sorting uses about 10GB.
+
+
+## Package Contents  
+```
+bwa.kit
+|-- README.md                  This README file.
+|-- run-bwamem                 *Entry script* for the entire mapping pipeline.
+|-- bwa                        *BWA binary*
+|-- k8                         Interpretor for *.js scripts.
+|-- bwa-postalt.js             Post-process alignments to ALT contigs/decoys/HLA genes.
+|-- htsbox                     Used by run-bwamem for shuffling BAMs and BAM=>FASTQ.
+|-- samblaster                 MarkDuplicates for reads from the same library. v0.1.20
+|-- samtools                   SAMtools for sorting and SAM=>BAM conversion. v1.1
+|-- seqtk                      For FASTQ manipulation.
+|-- trimadap                   Trim Illumina PE sequencing adapters.
+|
+|-- run-gen-ref                *Entry script* for generating human reference genomes.
+|-- resource-GRCh38            Resources for generating GRCh38
+|   |-- hs38DH-extra.fa        Decoy and HLA gene sequences. Used by run-gen-ref.
+|   `-- hs38DH.fa.alt          ALT-to-GRCh38 alignment. Used by run-gen-ref.
+|
+|-- run-HLA                    HLA typing for sequences extracted by bwa-postalt.js.
+|-- typeHLA.sh                 Type one HLA-gene. Called by run-HLA.
+|-- typeHLA.js                 HLA typing from exon-to-contig alignment. Used by typeHLA.sh.
+|-- typeHLA-selctg.js          Select contigs overlapping HLA exons. Used by typeHLA.sh.
+|-- fermi2.pl                  Fermi2 wrapper. Used by typeHLA.sh for de novo assembly.
+|-- fermi2                     Fermi2 binary. Used by fermi2.pl.
+|-- ropebwt2                   RopeBWT2 binary. Used by fermi2.pl.
+|-- resource-human-HLA         Resources for HLA typing
+|   |-- HLA-ALT-exons.bed      Exonic regions of HLA ALT contigs. Used by typeHLA.sh.
+|   |-- HLA-CDS.fa             CDS of HLA-{A,B,C,DQA1,DQB1,DRB1} genes from IMGT/HLA-3.18.0.
+|   |-- HLA-ALT-type.txt       HLA types for each HLA ALT contig. Not used.
+|   `-- HLA-ALT-idx            BWA indices of each HLA ALT contig. Used by typeHLA.sh
+|       `-- (...)
+|
+`-- doc                        BWA documentations
+    |-- bwa.1                  Manpage
+    |-- NEWS.md                Release Notes
+    |-- README.md              GitHub README page
+    `-- README-alt.md          Documentation for ALT mapping
+```
+
+[res]: https://sourceforge.net/projects/bio-bwa/files/bwakit
+[sb]: https://github.com/GregoryFaust/samblaster
+[ta]: https://github.com/lh3/seqtk/blob/master/trimadap.c
+[smtl]: http://www.htslib.org
+[kit]: https://github.com/lh3/bwa/tree/master/bwakit
diff --git a/bwakit/bwa-postalt.js b/bwakit/bwa-postalt.js
new file mode 100644
index 0000000..5717045
--- /dev/null
+++ b/bwakit/bwa-postalt.js
@@ -0,0 +1,524 @@
+/*****************************************************************
+ * The K8 Javascript interpreter is required to run this script. *
+ *                                                               *
+ *   Source code: https://github.com/attractivechaos/k8          *
+ *   Binary: http://sourceforge.net/projects/lh3/files/k8/       *
+ *                                                               *
+ * Data file used for generating GRCh38 ALT alignments:          *
+ *                                                               *
+ *   http://sourceforge.net/projects/bio-bwa/files/              *
+ *****************************************************************/
+
+/******************
+ *** From k8.js ***
+ ******************/
+
+// Parse command-line options. A BSD getopt() clone in javascript.
+var getopt = function(args, ostr) {
+	var oli; // option letter list index
+	if (typeof(getopt.place) == 'undefined')
+		getopt.ind = 0, getopt.arg = null, getopt.place = -1;
+	if (getopt.place == -1) { // update scanning pointer
+		if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
+			getopt.place = -1;
+			return null;
+		}
+		if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
+			++getopt.ind;
+			getopt.place = -1;
+			return null;
+		}
+	}
+	var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
+	if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
+		if (optopt == '-') return null; //  if the user didn't specify '-' as an option, assume it means null.
+		if (getopt.place < 0) ++getopt.ind;
+		return '?';
+	}
+	if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
+		getopt.arg = null;
+		if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
+	} else { // need an argument
+		if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
+			getopt.arg = args[getopt.ind].substr(getopt.place);
+		else if (args.length <= ++getopt.ind) { // no arg
+			getopt.place = -1;
+			if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
+			return '?';
+		} else getopt.arg = args[getopt.ind]; // white space
+		getopt.place = -1;
+		++getopt.ind;
+	}
+	return optopt;
+}
+
+// reverse a string
+Bytes.prototype.reverse = function()
+{
+	for (var i = 0; i < this.length>>1; ++i) {
+		var tmp = this[i];
+		this[i] = this[this.length - i - 1];
+		this[this.length - i - 1] = tmp;
+	}
+}
+
+// reverse complement a DNA string
+Bytes.prototype.revcomp = function()
+{
+	if (Bytes.rctab == null) {
+		var s1 = 'WSATUGCYRKMBDHVNwsatugcyrkmbdhvn';
+		var s2 = 'WSTAACGRYMKVHDBNwstaacgrymkvhdbn';
+		Bytes.rctab = [];
+		for (var i = 0; i < 256; ++i) Bytes.rctab[i] = 0;
+		for (var i = 0; i < s1.length; ++i)
+			Bytes.rctab[s1.charCodeAt(i)] = s2.charCodeAt(i);
+	}
+	for (var i = 0; i < this.length>>1; ++i) {
+		var tmp = this[this.length - i - 1];
+		this[this.length - i - 1] = Bytes.rctab[this[i]];
+		this[i] = Bytes.rctab[tmp];
+	}
+	if (this.length>>1)
+		this[this.length>>1] = Bytes.rctab[this[this.length>>1]];
+}
+
+// create index for a list of intervals for fast interval queries; ported from bedidx.c in samtools
+function intv_ovlp(intv, bits)
+{
+	if (typeof bits == "undefined") bits = 13;
+	intv.sort(function(a,b) {return a[0]-b[0];});
+	// create the index
+	var idx = [], max = 0;
+	for (var i = 0; i < intv.length; ++i) {
+		var b = intv[i][0]>>bits;
+		var e = (intv[i][1]-1)>>bits;
+		if (b != e) {
+			for (var j = b; j <= e; ++j)
+				if (idx[j] == null) idx[j] = i;
+		} else if (idx[b] == null) idx[b] = i;
+		max = max > e? max : e;
+	}
+	// closure
+	return function(_b, _e) {
+		var x = _b >> bits;
+		if (x > max) return [];
+		var off = idx[x];
+		if (off == null) {
+			var i;
+			for (i = ((_e - 1) >> bits) - 1; i >= 0; --i)
+				if (idx[i] != null) break;
+			off = i < 0? 0 : idx[i];
+		}
+		var ovlp = [];
+		for (var i = off; i < intv.length && intv[i][0] < _e; ++i)
+			if (intv[i][1] > _b) ovlp.push(intv[i]);
+		return ovlp;
+	}
+}
+
+var re_cigar = /(\d+)([MIDSHN])/g;
+
+/******************************
+ *** Generate ALT alignment ***
+ ******************************/
+
+// given a pos on ALT and the ALT-to-REF CIGAR, find the pos on REF
+function cigar2pos(cigar, pos)
+{
+	var x = 0, y = 0;
+	for (var i = 0; i < cigar.length; ++i) {
+		var op = cigar[i][0], len = cigar[i][1];
+		if (op == 'M') {
+			if (y <= pos && pos < y + len)
+				return x + (pos - y);
+			x += len, y += len;
+		} else if (op == 'D') {
+			x += len;
+		} else if (op == 'I') {
+			if (y <= pos && pos < y + len)
+				return x;
+			y += len;
+		} else if (op == 'S' || op == 'H') {
+			if (y <= pos && pos < y + len)
+				return -1;
+			y += len;
+		}
+	}
+	return -1;
+}
+
+// Parse a hit. $s is an array that looks something like ["chr1", "+12345", "100M", 5]
+// Return an object keeping various information about the alignment.
+function parse_hit(s, opt)
+{
+	var h = {};
+	h.ctg = s[0];
+	h.start = parseInt(s[1].substr(1)) - 1;
+	h.rev = (s[1].charAt(0) == '-');
+	h.cigar = s[2];
+	h.NM = parseInt(s[3]);
+	h.hard = false;
+	var m, l_ins, n_ins, l_del, n_del, l_match, l_skip, l_clip;
+	l_ins = l_del = n_ins = n_del = l_match = l_skip = l_clip = 0;
+	while ((m = re_cigar.exec(h.cigar)) != null) {
+		var l = parseInt(m[1]);
+		if (m[2] == 'M') l_match += l;
+		else if (m[2] == 'D') ++n_del, l_del += l;
+		else if (m[2] == 'I') ++n_ins, l_ins += l;
+		else if (m[2] == 'N') l_skip += l;
+		else if (m[2] == 'H' || m[2] == 'S') {
+			l_clip += l;
+			if (m[2] == 'H') h.hard = true;
+		}
+	}
+	h.end = h.start + l_match + l_del + l_skip;
+	h.NM = h.NM > l_del + l_ins? h.NM : l_del + l_ins;
+	h.score = Math.floor((opt.a * l_match - (opt.a + opt.b) * (h.NM - l_del - l_ins) - opt.o * (n_del + n_ins) - opt.e * (l_del + l_ins)) / opt.a + .499);
+	h.l_query = l_match + l_ins + l_clip;
+	return h;
+}
+
+function print_buffer(buf2, fp_hla, hla) // output alignments
+{
+	if (buf2.length == 0) return;
+	for (var i = 0; i < buf2.length; ++i)
+		print(buf2[i].join("\t"));
+	if (fp_hla != null) {
+		var name = buf2[0][0] + '/' + (buf2[0][1]>>6&3) + ((buf2[0][1]&16)? '-' : '+');
+		for (var x in hla) {
+			if (fp_hla[x] != null);
+				fp_hla[x].write('@' + name + '\n' + buf2[0][9] + '\n+\n' + buf2[0][10] + '\n');
+		}
+	}
+}
+
+function collect_hla_hits(idx, ctg, start, end, hla) // collect reads hit to HLA genes
+{
+	var m, ofunc = idx[ctg];
+	if (ofunc == null) return;
+	var ovlp_alt = ofunc(start, end);
+	for (var i = 0; i < ovlp_alt.length; ++i)
+		if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(ovlp_alt[i][2])) != null)
+			hla[m[1]] = true;
+}
+
+function bwa_postalt(args)
+{
+	var version = "r985";
+	var c, opt = { a:1, b:4, o:6, e:1, min_mapq:10, min_sc:90, max_nm_sc:10, min_pa_ratio:1 };
+
+	while ((c = getopt(args, 'vp:r:')) != null) {
+		if (c == 'p') opt.pre = getopt.arg;
+		else if (c == 'r') opt.min_pa_ratio = parseFloat(getopt.arg);
+		else if (c == 'v') { print(version); exit(0); }
+	}
+	if (opt.min_pa_ratio > 1.) opt.min_pa_ratio = 1.;
+
+	if (args.length == getopt.ind) {
+		print("");
+		print("Usage:   k8 bwa-postalt.js [options] <alt.sam> [aln.sam]\n");
+		print("Options: -p STR     prefix of output files containting sequences matching HLA genes [null]");
+		print("         -r FLOAT   reduce mapQ to 0 if not overlapping lifted best and pa<FLOAT ["+opt.min_pa_ratio+"]");
+		print("         -v         show version number");
+		print("");
+		print("Note: This script extracts the XA tag, lifts the mapping positions of ALT hits to");
+		print("      the primary assembly, groups them and then estimates mapQ across groups. If");
+		print("      a non-ALT hit overlaps a lifted ALT hit, its mapping quality is set to the");
+		print("      smaller between its original mapQ and the adjusted mapQ of the ALT hit. If");
+		print("      multiple ALT hits are lifted to the same position, they will yield new SAM");
+		print("      lines with the same mapQ.");
+		print("");
+		exit(1);
+	}
+
+	var aux = new Bytes(); // used for reverse and reverse complement
+	var buf = new Bytes(); // line reading buffer
+
+	// read ALT-to-REF alignment
+	var intv_alt = {}, intv_pri = {}, hla_ctg = {}, is_alt = {}, hla_chr = null;
+	var file = new File(args[getopt.ind]);
+	while (file.readline(buf) >= 0) {
+		var line = buf.toString();
+		if (line.charAt(0) == '@') continue;
+		var t = line.split("\t");
+		if (t.length < 11) continue; // incomplete lines
+		is_alt[t[0]] = true;
+		var pos = parseInt(t[3]) - 1;
+		var flag = parseInt(t[1]);
+		if ((flag&4) || t[2] == '*') continue;
+		var m, cigar = [], l_qaln = 0, l_tlen = 0, l_qclip = 0;
+		if ((m = /^(HLA-[^\s\*]+)\*\d+/.exec(t[0])) != null) { // read HLA contigs
+			if (hla_ctg[m[1]] == null) hla_ctg[m[1]] = 0;
+			++hla_ctg[m[1]];
+			hla_chr = t[2];
+		}
+		while ((m = re_cigar.exec(t[5])) != null) {
+			var l = parseInt(m[1]);
+			cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip
+			if (m[2] == 'M') l_qaln += l, l_tlen += l;
+			else if (m[2] == 'I') l_qaln += l;
+			else if (m[2] == 'S' || m[2] == 'H') l_qclip += l;
+			else if (m[2] == 'D' || m[2] == 'N') l_tlen += l;
+		}
+		var j = flag&16? cigar.length-1 : 0;
+		var start = cigar[j][0] == 'S'? cigar[j][1] : 0;
+		if (intv_alt[t[0]] == null) intv_alt[t[0]] = [];
+		intv_alt[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, pos - 1, cigar, pos + l_tlen]);
+		if (intv_pri[t[2]] == null) intv_pri[t[2]] = [];
+		intv_pri[t[2]].push([pos, pos + l_tlen, t[0]]);
+	}
+	file.close();
+	var idx_alt = {}, idx_pri = {};
+	for (var ctg in intv_alt) idx_alt[ctg] = intv_ovlp(intv_alt[ctg]);
+	for (var ctg in intv_pri) idx_pri[ctg] = intv_ovlp(intv_pri[ctg]);
+
+	// initialize the list of HLA contigs
+	var fp_hla = null;
+	if (opt.pre) {
+		fp_hla = {};
+		for (var h in hla_ctg)
+			fp_hla[h] = new File(opt.pre + '.' + h + '.fq', "w");
+	}
+
+	// process SAM
+	var buf2 = [], hla = {};
+	file = args.length - getopt.ind >= 2? new File(args[getopt.ind+1]) : new File();
+	while (file.readline(buf) >= 0) {
+		var m, line = buf.toString();
+
+		if (line.charAt(0) == '@') { // print and then skip the header line
+			print(line);
+			continue;
+		}
+
+		var t = line.split("\t");
+		t[1] = parseInt(t[1]); t[3] = parseInt(t[3]); t[4] = parseInt(t[4]);
+
+		// print bufferred reads
+		if (buf2.length && (buf2[0][0] != t[0] || (buf2[0][1]&0xc0) != (t[1]&0xc0))) {
+			print_buffer(buf2, fp_hla, hla);
+			buf2 = [], hla = {};
+		}
+
+		// skip unmapped lines
+		if (t[1]&4) {
+			buf2.push(t);
+			continue;
+		}
+
+		// parse the reported hit
+		var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1];
+		var flag = t[1];
+		var h = parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt);
+		if (t[2] == hla_chr) collect_hla_hits(idx_pri, h.ctg, h.start, h.end, hla);
+
+		if (h.hard) { // the following does not work with hard clipped alignments
+			buf2.push(t);
+			continue;
+		}
+		var hits = [h];
+
+		// parse hits in the XA tag
+		if ((m = /\tXA:Z:(\S+)/.exec(line)) != null) {
+			var XA_strs = m[1].split(";");
+			for (var i = 0; i < XA_strs.length; ++i)
+				if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string
+					hits.push(parse_hit(XA_strs[i].split(","), opt));
+		}
+
+		// check if there are ALT hits
+		var has_alt = false;
+		for (var i = 0; i < hits.length; ++i)
+			if (is_alt[hits[i].ctg] != null) {
+				has_alt = true;
+				break;
+			}
+		if (!has_alt) {
+			buf2.push(t);
+			continue;
+		}
+
+		// lift mapping positions to the primary assembly
+		var n_rpt_lifted = 0, rpt_lifted = null;
+		for (var i = 0; i < hits.length; ++i) {
+			var a, h = hits[i];
+
+			if (idx_alt[h.ctg] == null || (a = idx_alt[h.ctg](h.start, h.end)) == null || a.length == 0)
+				continue;
+
+			// find the approximate position on the primary assembly
+			var lifted = [];
+			for (var j = 0; j < a.length; ++j) {
+				var s, e;
+				if (!a[j][4]) { // ALT is mapped to the forward strand of the primary assembly
+					s = cigar2pos(a[j][6], h.start);
+					e = cigar2pos(a[j][6], h.end - 1) + 1;
+				} else {
+					s = cigar2pos(a[j][6], a[j][2] - h.end);
+					e = cigar2pos(a[j][6], a[j][2] - h.start - 1) + 1;
+				}
+				if (s < 0 || e < 0) continue; // read is mapped to clippings in the ALT-to-chr alignment
+				s += a[j][5]; e += a[j][5];
+				lifted.push([a[j][3], (h.rev!=a[j][4]), s, e]);
+				if (i == 0) ++n_rpt_lifted;
+			}
+			if (i == 0 && n_rpt_lifted == 1) rpt_lifted = lifted[0].slice(0);
+			if (lifted.length) hits[i].lifted = lifted;
+		}
+
+		// prepare for hits grouping
+		for (var i = 0; i < hits.length; ++i) { // set keys for sorting
+			if (hits[i].lifted != null) // TODO: only the first element in lifted[] is used
+				hits[i].pctg = hits[i].lifted[0][0], hits[i].pstart = hits[i].lifted[0][2], hits[i].pend = hits[i].lifted[0][3];
+			else hits[i].pctg = hits[i].ctg, hits[i].pstart = hits[i].start, hits[i].pend = hits[i].end;
+			hits[i].i = i; // keep the original index
+		}
+
+		// group hits based on the lifted positions on non-ALT sequences
+		if (hits.length > 1) {
+			hits.sort(function(a,b) { return a.pctg != b.pctg? (a.pctg < b.pctg? -1 : 1) : a.pstart - b.pstart });
+			var last_chr = null, end = 0, g = -1;
+			for (var i = 0; i < hits.length; ++i) {
+				if (last_chr != hits[i].pctg) ++g, last_chr = hits[i].pctg, end = 0;
+				else if (hits[i].pstart >= end) ++g;
+				hits[i].g = g;
+				end = end > hits[i].pend? end : hits[i].pend;
+			}
+		} else hits[0].g = 0;
+
+		// find the index and group id of the reported hit; find the size of the reported group
+		var reported_g = null, reported_i = null, n_group0 = 0;
+		if (hits.length > 1) {
+			for (var i = 0; i < hits.length; ++i)
+				if (hits[i].i == 0)
+					reported_g = hits[i].g, reported_i = i;
+			for (var i = 0; i < hits.length; ++i)
+				if (hits[i].g == reported_g)
+					++n_group0;
+		} else {
+			if (is_alt[hits[0].ctg] == null) { // no need to go through the following if the single hit is non-ALT
+				buf2.push(t);
+				continue;
+			}
+			reported_g = reported_i = 0, n_group0 = 1;
+		}
+
+		// re-estimate mapping quality if necessary
+		var mapQ, ori_mapQ = t[4];
+		if (n_group0 > 1) {
+			var group_max = [];
+			for (var i = 0; i < hits.length; ++i) {
+				var g = hits[i].g;
+				if (group_max[g] == null || group_max[g][0] < hits[i].score)
+					group_max[g] = [hits[i].score, g];
+			}
+			if (group_max.length > 1)
+				group_max.sort(function(x,y) {return y[0]-x[0]});
+			if (group_max[0][1] == reported_g) { // the best hit is the hit reported in SAM
+				mapQ = group_max.length == 1? 60 : 6 * (group_max[0][0] - group_max[1][0]);
+			} else mapQ = 0;
+			mapQ = mapQ < 60? mapQ : 60;
+			if (idx_alt[t[2]] == null) mapQ = mapQ < ori_mapQ? mapQ : ori_mapQ;
+			else mapQ = mapQ > ori_mapQ? mapQ : ori_mapQ;
+		} else mapQ = t[4];
+
+		// find out whether the read is overlapping HLA genes
+		if (hits[reported_i].pctg == hla_chr) {
+			var rpt_start = 1<<30, rpt_end = 0;
+			for (var i = 0; i < hits.length; ++i) {
+				var h = hits[i];
+				if (h.g == reported_g) {
+					rpt_start = rpt_start < h.pstart? rpt_start : h.pstart;
+					rpt_end   = rpt_end   > h.pend  ? rpt_end   : h.pend;
+				}
+			}
+			collect_hla_hits(idx_pri, hla_chr, rpt_start, rpt_end, hla);
+		}
+
+		// adjust the mapQ of the primary hits
+		if (n_rpt_lifted <= 1) {
+			var l = n_rpt_lifted == 1? rpt_lifted : null;
+			for (var i = 0; i < buf2.length; ++i) {
+				var s = buf2[i], is_ovlp = true;
+				if (l != null) {
+					if (l[0] != s[2]) is_ovlp = false; // different chr
+					else if (((s[1]&16) != 0) != l[1]) is_ovlp = false; // different strand
+					else {
+						var start = s[3] - 1, end = start;
+						while ((m = re_cigar.exec(t[5])) != null)
+							if (m[2] == 'M' || m[2] == 'D' || m[2] == 'N')
+								end += parseInt(m[1]);
+						if (!(start < l[3] && l[2] < end)) is_ovlp = false; // no overlap
+					}
+				} else is_ovlp = false;
+				// get the "pa" tag if present
+				var om = -1, pa = 10.;
+				for (var j = 11; j < s.length; ++j)
+					if ((m = /^om:i:(\d+)/.exec(s[j])) != null)
+						om = parseInt(m[1]);
+					else if ((m = /^pa:f:(\S+)/.exec(s[j])) != null)
+						pa = parseFloat(m[1]);
+				if (is_ovlp) { // overlapping the lifted hit
+					if (om > 0) s[4] = om;
+					s[4] = s[4] < mapQ? s[4] : mapQ;
+				} else if (pa < opt.min_pa_ratio) { // not overlapping; has a small pa
+					if (om < 0) s.push("om:i:" + s[4]);
+					s[4] = 0;
+				}
+			}
+		}
+
+		// generate lifted_str
+		for (var i = 0; i < hits.length; ++i) {
+			if (hits[i].lifted && hits[i].lifted.length) {
+				var u = '', lifted = hits[i].lifted;
+				for (var j = 0; j < lifted.length; ++j)
+					u += lifted[j][0] + "," + lifted[j][2] + "," + lifted[j][3] + "," + (lifted[j][1]?'-':'+') + ";";
+				hits[i].lifted_str = u;
+			}
+		}
+
+		// stage the reported hit
+		t[4] = mapQ;
+		if (n_group0 > 1) t.push("om:i:"+ori_mapQ);
+		if (hits[reported_i].lifted_str) t.push("lt:Z:" + hits[reported_i].lifted_str);
+		buf2.push(t);
+
+		// stage the hits generated from the XA tag
+		var cnt = 0, rs = null, rq = null; // rq: reverse quality; rs: reverse complement sequence
+		var rg = (m = /\t(RG:Z:\S+)/.exec(line)) != null? m[1] : null;
+		for (var i = 0; i < hits.length; ++i) {
+			if (hits[i].g != reported_g || i == reported_i) continue;
+			if (idx_alt[hits[i].ctg] == null) continue;
+			var s = [t[0], 0, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, t[6], t[7], t[8]];
+			if (t[6] == '=' && s[2] != t[2]) s[6] = t[2];
+			// print sequence/quality and set the rev flag
+			if (hits[i].rev == hits[reported_i].rev) {
+				s.push(t[9], t[10]);
+				s[1] = flag | 0x800;
+			} else { // we need to write the reverse sequence
+				if (rs == null || rq == null) {
+					aux.length = 0;
+					aux.set(t[9], 0); aux.revcomp(); rs = aux.toString();
+					aux.set(t[10],0); aux.reverse(); rq = aux.toString();
+				}
+				s.push(rs, rq);
+				s[1] = (flag ^ 0x10) | 0x800;
+			}
+			s.push("NM:i:" + hits[i].NM);
+			if (hits[i].lifted_str) s.push("lt:Z:" + hits[i].lifted_str);
+			if (rg != null) s.push(rg);
+			buf2.push(s);
+		}
+	}
+	print_buffer(buf2, fp_hla, hla);
+	file.close();
+	if (fp_hla != null)
+		for (var h in fp_hla)
+			fp_hla[h].close();
+
+	buf.destroy();
+	aux.destroy();
+}
+
+bwa_postalt(arguments);
diff --git a/bwakit/run-HLA b/bwakit/run-HLA
new file mode 100755
index 0000000..4fee16f
--- /dev/null
+++ b/bwakit/run-HLA
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+ctg_opt=""
+if [ $# -gt 1 ] && [ $1 == '-A' ]; then
+	ctg_opt="-A"
+	shift
+fi
+
+if [ $# -eq 0 ]; then
+	echo "Usage: $0 <prefix>"
+	exit 1
+fi
+
+for f in $1.HLA-*.fq; do
+	gene=`echo $f | perl -pe 's/^.*(HLA-[A-Z]+[0-9]*).*fq$/$1/'`
+	echo -e "\n*** Processing gene $gene...\n" >&2
+	`dirname $0`/typeHLA.sh $ctg_opt $1 $gene
+done
+
+ls $1.HLA-*.gt | xargs -i echo grep ^GT {} \| head -1 | sh | sed "s,^GT,$1,"
diff --git a/bwakit/run-bwamem b/bwakit/run-bwamem
new file mode 100755
index 0000000..165f93e
--- /dev/null
+++ b/bwakit/run-bwamem
@@ -0,0 +1,186 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+my %opts = (t=>1);
+getopts("SadskHo:R:x:t:", \%opts);
+
+die('
+Usage:   run-bwamem [options] <idxbase> <file1> [file2]
+
+Options: -o STR    prefix for output files                       [inferred from input]
+         -R STR    read group header line such as \'@RG\tID:foo\tSM:bar\'         [null]
+         -x STR    read type: pacbio, ont2d or intractg                      [default]
+                   intractg: intra-species contig (kb query, highly similar)
+                   pacbio:   pacbio subreads (~10kb query, high error rate)
+                   ont2d:    Oxford Nanopore reads (~10kb query, higher error rate)
+         -t INT    number of threads                                               [1]
+
+         -H        apply HLA typing
+         -a        trim HiSeq2000/2500 PE resequencing adapters (via trimadap)
+         -d        mark duplicate (via samblaster)
+         -S        for BAM input, don\'t shuffle
+         -s        sort the output alignment (via samtools; requring more RAM)
+         -k        keep temporary files generated by typeHLA
+
+Examples:
+
+ * Map paired-end reads to GRCh38+ALT+decoy+HLA and perform HLA typing:
+
+     run-bwamem -o prefix -t8 -HR"@RG\tID:foo\tSM:bar" hs38DH.fa read1.fq.gz read2.fq.gz
+
+   Note: HLA typing is only effective for high-coverage data. The typing accuracy varies
+   with the quality of input. It is only intended for research purpose, not for diagnostic.
+
+ * Remap coordinate-sorted BAM, transfer read groups tags, trim Illumina PE adapters and
+   sort the output. The BAM may contain single-end or paired-end reads, or a mixture of
+   the two types. Specifying -R stops read group transfer.
+
+     run-bwamem -sao prefix hs38DH.fa old-srt.bam
+
+   Note: the adaptor trimmer included in bwa.kit is chosen because it fits the current
+   mapping pipeline better. It is conservative and suboptimal. A more sophisticated
+   trimmer is recommended if this becomes a concern.
+
+ * Remap name-grouped BAM and mark duplicates:
+ 
+     run-bwamem -Sdo prefix hs38DH.fa old-unsrt.bam
+
+   Note: streamed duplicate marking requires all reads from a single paired-end library
+   to be aligned at the same time.
+
+Output files:
+
+  {-o}.aln.bam - final alignment
+  {-o}.hla.top - best genotypes for the 6 classical HLA genes (if there are HLA-* contigs)
+  {-o}.hla.all - additional HLA genotypes consistent with data
+  {-o}.log.*   - log files
+
+') if @ARGV < 2;
+
+my $idx = $ARGV[0];
+
+my $exepath = $0 =~/^\S+\/[^\/\s]+/? $0 : &which($0);
+my $root = $0 =~/^(\S+)\/[^\/\s]+/? $1 : undef;
+$root = $exepath =~/^(\S+)\/[^\/\s]+/? $1 : undef if !defined($root);
+die "ERROR: failed to locate the 'bwa.kit' directory\n" if !defined($root);
+
+die("ERROR: failed to locate the BWA index. Please run '$root/bwa index -p $idx ref.fa'.\n")
+  unless (-f "$idx.bwt" && -f "$idx.pac" && -f "$idx.sa" && -f "$idx.ann" && -f "$idx.amb");
+
+if (@ARGV >= 3 && $ARGV[1] =~ /\.(bam|sam|sam\.gz)$/) {
+	warn("WARNING: for SAM/BAM input, only the first sequence file is used.\n");
+	@ARGV = 2;
+}
+
+if (defined($opts{p}) && @ARGV >= 3) {
+	warn("WARNING: option -P is ignored as there are two input sequence files.\n");
+	delete $opts{p};
+}
+
+my $prefix;
+if (defined $opts{o}) {
+	$prefix = $opts{o};
+} elsif (@ARGV >= 3) {
+	my $len = length($ARGV[1]) < length($ARGV[2])? length($ARGV[1]) : length($ARGV[2]);
+	my $i;
+	for ($i = 0; $i < $len; ++$i) {
+		last if substr($ARGV[1], $i, 1) ne substr($ARGV[2], $i, 1)
+	}
+	$prefix = substr($ARGV[1], 0, $i) if $i > 0;
+} elsif ($ARGV[1] =~ /^(\S+)\.(fastq|fq|fasta|fa|mag|mag\.gz|fasta\.gz|fa\.gz|fastq\.gz|fq\.gz|bam)$/) {
+	$prefix = $1;
+}
+die("ERROR: failed to identify the prefix for output. Please specify -o.\n") unless defined($prefix);
+
+my $size = 0;
+my $comp_ratio = 3.;
+for my $f (@ARGV[1..$#ARGV]) {
+	my @a = stat($f);
+	my $s = $a[7];
+	die("ERROR: failed to read file $f\n") if !defined($s);
+	$s *= $comp_ratio if $f =~ /\.(gz|bam)$/;
+	$size += int($s) + 1;
+}
+
+my $is_pe = (defined($opts{p}) || @ARGV >= 3)? 1 : 0;
+my $is_bam = $ARGV[1] =~ /\.bam$/? 1 : 0;
+
+if (defined($opts{x})) {
+	delete($opts{d}); delete($opts{a}); delete $opts{p};
+}
+
+# for BAM input, find @RG header lines
+my @RG_lines = ();
+if ($is_bam && !defined($opts{R})) {
+	my $fh;
+	open($fh, "$root/samtools view -H $ARGV[1] |") || die;
+	while (<$fh>) {
+		chomp;
+		if (/^\@RG\t/) {
+			s/\t/\\t/g;
+			push(@RG_lines, "-H'$_'");
+		}
+	}
+	close($fh);
+}
+
+warn("WARNING: many programs require read groups. Please specify with -R if you can.\n") if !defined($opts{R}) && @RG_lines == 0;
+
+my $cmd = '';
+if ($is_bam) {
+	my $cmd_sam2bam = "cat $ARGV[1] \\\n";
+	my $ntmps = int($size / 4e9) + 1;
+	my $cmd_shuf = !defined($opts{S})? "  | $root/htsbox bamshuf -uOn$ntmps - $prefix.shuf \\\n" : "";
+	my $bam2fq_opt = @RG_lines > 0? " -t" : "";
+	my $cmd_bam2fq = "  | $root/htsbox bam2fq -O$bam2fq_opt - \\\n";
+	$cmd = $cmd_sam2bam . $cmd_shuf . $cmd_bam2fq;
+} elsif (@ARGV >= 3) {
+	$cmd = "$root/seqtk mergepe $ARGV[1] $ARGV[2] \\\n";
+} else {
+	$cmd = "cat $ARGV[1] \\\n";
+}
+
+my $bwa_opts = "-p " . ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : "");
+$bwa_opts .= join(" ", @RG_lines) . " -C " if @RG_lines > 0;
+
+$cmd .= "  | $root/trimadap 2> $prefix.log.trim \\\n" if defined($opts{a});
+$cmd .= "  | $root/bwa mem $bwa_opts$ARGV[0] - 2> $prefix.log.bwamem \\\n";
+$cmd .= "  | $root/samblaster 2> $prefix.log.dedup \\\n" if defined($opts{d});
+
+my $has_hla = 0;
+if (-f "$ARGV[0].alt") {
+	my $fh;
+	open($fh, "$ARGV[0].alt") || die;
+	while (<$fh>) {
+		$has_hla = 1 if /^HLA-[^\s\*]+\*\d+/;
+	}
+	close($fh);
+	my $hla_pre = $has_hla? "-p $prefix.hla " : "";
+	$cmd .= "  | $root/k8 $root/bwa-postalt.js $hla_pre$ARGV[0].alt \\\n";
+}
+
+my $t_sort = $opts{t} < 4? $opts{t} : 4;
+$cmd .= defined($opts{s})? "  | $root/samtools sort -@ $t_sort -m1G - $prefix.aln;\n" : "  | $root/samtools view -1 - > $prefix.aln.bam;\n";
+
+if ($has_hla && defined($opts{H}) && (!defined($opts{x}) || $opts{x} eq 'intractg')) {
+	$cmd .= "$root/run-HLA ". (defined($opts{x}) && $opts{x} eq 'intractg'? "-A " : "") . "$prefix.hla > $prefix.hla.top 2> $prefix.log.hla;\n";
+	$cmd .= "touch $prefix.hla.HLA-dummy.gt; cat $prefix.hla.HLA*.gt | grep ^GT | cut -f2- > $prefix.hla.all;\n";
+	$cmd .= "rm -f $prefix.hla.HLA*;\n" unless defined($opts{k});
+}
+
+print $cmd;
+
+sub which
+{
+	my $file = shift;
+	my $path = (@_)? shift : $ENV{PATH};
+	return if (!defined($path));
+	foreach my $x (split(":", $path)) {
+		$x =~ s/\/$//;
+		return "$x/$file" if (-x "$x/$file");
+	}
+	return;
+}
diff --git a/bwakit/run-gen-ref b/bwakit/run-gen-ref
new file mode 100755
index 0000000..e129b62
--- /dev/null
+++ b/bwakit/run-gen-ref
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+root=`dirname $0`
+
+url38="ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz"
+url37d5="ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
+
+if [ $# -eq 0 ]; then
+	echo "Usage: $0 <hs38|hs38a|hs38DH|hs37|hs37d5>"
+	echo "Analysis sets:"
+	echo "  hs38     primary assembly of GRCh38 (incl. chromosomes, unplaced and unlocalized contigs) and EBV"
+	echo "  hs38a    hs38 plus ALT contigs"
+	echo "  hs38DH   hs38a plus decoy contigs and HLA genes (recommended for GRCh38 mapping)"
+	echo "  hs37     primary assembly of GRCh37 (used by 1000g phase 1) plus the EBV genome"
+	echo "  hs37d5   hs37 plus decoy contigs (used by 1000g phase 3)"
+	echo ""
+	echo "Note: This script downloads human reference genomes. For hs38a and hs38DH, it needs additional"
+	echo "      sequences and ALT-to-REF mapping included in the bwa.kit package."
+	exit 1;
+fi
+
+if [ $1 == "hs38DH" ]; then
+	(wget -O- $url38 | gzip -dc; cat $root/resource-GRCh38/hs38DH-extra.fa) > $1.fa
+	[ ! -f $1.fa.alt ] && cp $root/resource-GRCh38/hs38DH.fa.alt $1.fa.alt
+elif [ $1 == "hs38a" ]; then
+	wget -O- $url38 | gzip -dc > $1.fa
+	[ ! -f $1.fa.alt ] && grep _alt $root/resource-GRCh38/hs38DH.fa.alt > $1.fa.alt
+elif [ $1 == "hs38" ]; then
+	wget -O- $url38 | gzip -dc | awk '/^>/{f=/_alt/?0:1}f' > $1.fa
+elif [ $1 == "hs37d5" ]; then
+	wget -O- $url37d5 | gzip -dc > $1.fa 2>/dev/null
+elif [ $1 == "hs37" ]; then
+	wget -O- $url37d5 | gzip -dc 2>/dev/null | awk '/^>/{f=/>hs37d5/?0:1}f' > $1.fa
+else
+	echo "ERROR: unknown genome build"
+fi
+
+[ ! -f $1.fa.bwt ] && echo -e "\nPlease run 'bwa index $1.fa'...\n"
+
diff --git a/bwakit/typeHLA-selctg.js b/bwakit/typeHLA-selctg.js
new file mode 100644
index 0000000..0e02a65
--- /dev/null
+++ b/bwakit/typeHLA-selctg.js
@@ -0,0 +1,62 @@
+var min_ovlp = 30;
+
+if (arguments.length < 3) {
+	print("Usage: k8 selctg.js <HLA-gene> <HLA-ALT-exons.bed> <ctg-to-ALT.sam> [min_ovlp="+min_ovlp+"]");
+	exit(1);
+}
+
+if (arguments.length >= 4) min_ovlp = parseInt(arguments[3]);
+var gene = arguments[0];
+
+var buf = new Bytes();
+
+var h = {};
+var file = new File(arguments[1]);
+while (file.readline(buf) >= 0) {
+	var t = buf.toString().split("\t");
+	if (t[3] != gene) continue;
+	if (h[t[0]] == null) h[t[0]] = [];
+	h[t[0]].push([parseInt(t[1]), parseInt(t[2])]);
+}
+file.close();
+
+var s = {}, re = /(\d+)([MIDSHN])/g;
+file = new File(arguments[2]);
+while (file.readline(buf) >= 0) {
+	var line = buf.toString();
+	var m, t = line.split("\t");
+	var x = h[t[2]];
+	if (x == null) continue;
+
+	var start = parseInt(t[3]) - 1, end = start;
+	while ((m = re.exec(t[5])) != null) // parse CIGAR to get the end position
+		if (m[2] == 'M' || m[2] == 'D')
+			end += parseInt(m[1]);
+
+	var max_ovlp = 0;
+	for (var i = 0; i < x.length; ++i) {
+		var max_left = x[i][0] > start? x[i][0] : start;
+		var min_rght = x[i][1] < end  ? x[i][1] : end;
+		max_ovlp = max_ovlp > min_rght - max_left? max_ovlp : min_rght - max_left;
+	}
+
+	var AS = null, XS = null;
+	if ((m = /AS:i:(\d+)/.exec(line)) != null) AS = parseInt(m[1]);
+	if ((m = /XS:i:(\d+)/.exec(line)) != null) XS = parseInt(m[1]);
+
+	if (s[t[0]] == null) s[t[0]] = [];
+	s[t[0]].push([AS, XS, max_ovlp]);
+}
+file.close();
+
+buf.destroy();
+
+for (var x in s) {
+	var is_rejected = false, y = s[x];
+	y.sort(function(a,b) {return b[0]-a[0]});
+	for (var i = 0; i < y.length && y[i][0] == y[0][0]; ++i)
+		if (y[0][2] < min_ovlp || y[i][0] == y[i][1])
+			is_rejected = true;
+	if (is_rejected) continue;
+	print(x);
+}
diff --git a/bwakit/typeHLA.js b/bwakit/typeHLA.js
new file mode 100644
index 0000000..b265d07
--- /dev/null
+++ b/bwakit/typeHLA.js
@@ -0,0 +1,496 @@
+/*****************************************************************
+ * The K8 Javascript interpreter is required to run this script. *
+ *                                                               *
+ *   Source code: https://github.com/attractivechaos/k8          *
+ *   Binary: http://sourceforge.net/projects/lh3/files/k8/       *
+ *****************************************************************/
+
+var getopt = function(args, ostr) {
+	var oli; // option letter list index
+	if (typeof(getopt.place) == 'undefined')
+		getopt.ind = 0, getopt.arg = null, getopt.place = -1;
+	if (getopt.place == -1) { // update scanning pointer
+		if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
+			getopt.place = -1;
+			return null;
+		}
+		if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
+			++getopt.ind;
+			getopt.place = -1;
+			return null;
+		}
+	}
+	var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
+	if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
+		if (optopt == '-') return null; //  if the user didn't specify '-' as an option, assume it means null.
+		if (getopt.place < 0) ++getopt.ind;
+		return '?';
+	}
+	if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
+		getopt.arg = null;
+		if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
+	} else { // need an argument
+		if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
+			getopt.arg = args[getopt.ind].substr(getopt.place);
+		else if (args.length <= ++getopt.ind) { // no arg
+			getopt.place = -1;
+			if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
+			return '?';
+		} else getopt.arg = args[getopt.ind]; // white space
+		getopt.place = -1;
+		++getopt.ind;
+	}
+	return optopt;
+}
+
+/************************
+ * Command line parsing *
+ ************************/
+
+var ver = "r19";
+var c, thres_len = 50, thres_ratio = .8, thres_nm = 5, thres_frac = .33, dbg = false;
+
+// parse command line options
+while ((c = getopt(arguments, "vdl:n:f:")) != null) {
+	if (c == 'l') thres_len = parseInt(getopt.arg);
+	else if (c == 'n') thres_nm = parseInt(getopt.arg);
+	else if (c == 'd') dbg = true;
+	else if (c == 'f') thres_frac = parseFloat(getopt.arg);
+	else if (c == 'v') { print(ver); exit(0); }
+}
+if (arguments.length == getopt.ind) {
+	print("");
+	print("Usage:   k8 typeHLA.js [options] <exon-to-contig.sam>\n");
+	print("Options: -n INT     drop a contig if the edit distance to the closest gene is >INT ["+thres_nm+"]");
+	print("         -l INT     drop a contig if its match too short ["+thres_len+"]");
+	print("         -f FLOAT   drop inconsistent contigs if their length <FLOAT fraction of total length ["+thres_ratio.toFixed(2)+"]");
+	print("         -d         output extra info for debugging");
+	print("         -v         show version number");
+	print("");
+	print("Note: The output is TAB delimited with each GT line consisting of allele1, allele2,");
+	print("      #mismatches/gaps on primary exons, #mismatches/gaps on other exons and #exons");
+	print("      used in typing. If unusure, use the first GT line as the final genotype.\n");
+	exit(1);
+}
+
+/*********************************
+ * Read gene-to-contig alignment *
+ *********************************/
+
+var file = new File(arguments[getopt.ind]);
+var buf = new Bytes();
+var re_cigar = /(\d+)([MIDSH])/g;
+
+var len = {}, list = [], gcnt = [];
+while (file.readline(buf) >= 0) {
+	var m, mm, line = buf.toString();
+	var t = line.split("\t");
+	var flag = parseInt(t[1]);
+	// SAM header
+	if (t[0].charAt(0) == '@') {
+		if (t[0] == '@SQ' && (m = /LN:(\d+)/.exec(line)) != null && (mm = /SN:(\S+)/.exec(line)) != null)
+			len[mm[1]] = parseInt(m[1]);
+		continue;
+	}
+	// parse gene name and exon number
+	var gene = null, exon = null;
+	if ((m = /^(HLA-[^\s_]+)_(\d+)/.exec(t[0])) != null) {
+		gene = m[1], exon = parseInt(m[2]) - 1;
+		if (gcnt[exon] == null) gcnt[exon] = {};
+		gcnt[exon][gene] = true;
+	}
+	if (gene == null || exon == null || t[2] == '*') continue;
+	// parse clipping and aligned length
+	var x = 0, ts = parseInt(t[3]) - 1, te = ts, clip = [0, 0];
+	while ((m = re_cigar.exec(t[5])) != null) {
+		var l = parseInt(m[1]);
+		if (m[2] == 'M') x += l, te += l;
+		else if (m[2] == 'I') x += l;
+		else if (m[2] == 'D') te += l;
+		else if (m[2] == 'S' || m[2] == 'H') clip[x==0?0:1] = l;
+	}
+	var tl = len[t[2]];
+	var left  = ts < clip[0]? ts : clip[0];
+	var right = tl - te < clip[1]? tl - te : clip[1];
+	var qs, qe, ql = clip[0] + x + clip[1];
+	if (flag & 16) qs = clip[1], qe = ql - clip[0];
+	else qs = clip[0], qe = ql - clip[1];
+	var nm = (m = /\tNM:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : 0;
+	list.push([t[2], gene, exon, ts, te, nm, left + right, qs, qe, ql]); // left+right should be 0 given a prefix-suffix alignment
+}
+
+buf.destroy();
+file.close();
+
+/**************************************
+ * Prepare data structures for typing *
+ **************************************/
+
+// identify the primary exons, the exons associated with most genes
+var pri_exon = [], n_pri_exons;
+{
+	var cnt = [], max = 0;
+	// count the number of genes per exon and track the max
+	for (var e = 0; e < gcnt.length; ++e) {
+		if (gcnt[e] != null) {
+			var c = 0, h = gcnt[e];
+			for (var x in h) ++c;
+			cnt[e] = c;
+			max = max > c? max : c;
+		} else cnt[e] = 0;
+	}
+	warn("- Number of genes for each exon: [" +cnt.join(",") + "]");
+	// find primary exons
+	var pri_list = [];
+	for (var e = 0; e < cnt.length; ++e) {
+		if (cnt[e] == max) pri_list.push(e + 1);
+		pri_exon[e] = cnt[e] == max? 1 : 0;
+	}
+	warn("- List of primary exon(s): ["+pri_list.join(",")+"]");
+	n_pri_exons = pri_list.length;
+}
+
+// convert strings to integers (for performance)
+var ghash = {}, glist = [], chash = {}, clist = [], elist = [];
+for (var i = 0; i < list.length; ++i) {
+	if (ghash[list[i][1]] == null) {
+		ghash[list[i][1]] = glist.length;
+		glist.push(list[i][1]);
+	}
+	if (chash[list[i][0]] == null) {
+		chash[list[i][0]] = clist.length;
+		clist.push(list[i][0]);
+	}
+	var g = ghash[list[i][1]];
+	if (elist[g] == null) elist[g] = {};
+	elist[g][list[i][2]] = true;
+}
+
+// extract the 3rd and 4th digits
+var gsub = [], gsuf = [];
+for (var i = 0; i < glist.length; ++i) {
+	var m = /^HLA-[^*\s]+\*\d+:(\d+).*([A-Z]?)$/.exec(glist[i]);
+	gsub[i] = parseInt(m[1]);
+	gsuf[i] = /[A-Z]$/.test(glist[i])? 1 : 0;
+}
+
+/*************************************************
+ * Collect genes with perfect matches on primary *
+ *************************************************/
+
+// collect exons with fully covered by perfect match(es)
+var perf_exons = [];
+
+function push_perf_exons(matches, last)
+{
+	matches.sort(function(a, b) { return a[0]-b[0]; });
+	var cov = 0, start = 0, end = 0;
+	for (var i = 0; i < matches.length; ++i) {
+		if (matches[i][3] > 0) continue;
+		if (matches[i][0] <= end)
+			end = end > matches[i][1]? end : matches[i][1];
+		else cov += end - start, start = matches[i][0], end = matches[i][1];
+	}
+	cov += end - start;
+	if (matches[0][2] == cov) {
+		if (perf_exons[last[1]] == null) perf_exons[last[1]] = [];
+		//print(last[0], last[1], ghash[last[0]]);
+		perf_exons[last[1]].push(ghash[last[0]]);
+	}
+}
+
+var last = [null, -1], matches = [];
+for (var i = 0; i < list.length; ++i) {
+	var li = list[i];
+	if (last[0] != li[1] || last[1] != li[2]) {
+		if (matches.length) push_perf_exons(matches, last);
+		matches = [];
+		last = [li[1], li[2]];
+	}
+	matches.push([li[7], li[8], li[9], li[5]+li[6]]);
+}
+if (matches.length) push_perf_exons(matches, last);
+
+// for each gene, count how many primary exons are perfect
+var pg_aux_cnt = {};
+for (var e = 0; e < perf_exons.length; ++e) {
+	if (!pri_exon[e]) continue;
+	var pe = perf_exons[e];
+	var n = pe? pe.length : 0;
+	for (var i = 0; i < n; ++i) {
+		var g = pe[i];
+		if (pg_aux_cnt[g] == null) pg_aux_cnt[g] = 1;
+		else ++pg_aux_cnt[g];
+	}
+}
+
+// find genes with perfect matches on the primary exons
+var perf_genes = [];
+for (var g in pg_aux_cnt)
+	if (pg_aux_cnt[g] == n_pri_exons)
+		perf_genes.push(parseInt(g));
+warn("- Found " +perf_genes.length+ " genes fully covered by perfect matches on the primary exon(s)");
+
+var h_perf_genes = {};
+for (var i = 0; i < perf_genes.length; ++i) {
+	if (dbg) print("PG", glist[perf_genes[i]]);
+	h_perf_genes[perf_genes[i]] = true;
+}
+
+/*******************
+ * Filter hit list *
+ *******************/
+
+// reorganize hits to exons
+function list2exons(list, flt_flag, perf_hash)
+{
+	var exons = [];
+	for (var i = 0; i < list.length; ++i) {
+		var li = list[i], c = chash[li[0]], g = ghash[li[1]];
+		if (flt_flag != null && flt_flag[c] == 1) continue;
+		if (perf_hash != null && !perf_hash[g]) continue;
+		if (exons[li[2]] == null) exons[li[2]] = [];
+		exons[li[2]].push([c, g, li[5] + li[6], li[4] - li[3]]);
+	}
+	return exons;
+}
+
+var exons = list2exons(list), flt_flag = [], ovlp_len = [];
+for (var c = 0; c < clist.length; ++c) flt_flag[c] = ovlp_len[c] = 0;
+for (var e = 0; e < exons.length; ++e) {
+	if (!pri_exon[e]) continue;
+	var ee = exons[e];
+	var max_len = [];
+	for (var c = 0; c < clist.length; ++c) max_len[c] = 0;
+	for (var i = 0; i < ee.length; ++i) {
+		var l = ee[i][3] - ee[i][2];
+		if (l < 1) l = 1;
+		if (max_len[ee[i][0]] < l) max_len[ee[i][0]] = l;
+	}
+	for (var c = 0; c < clist.length; ++c) ovlp_len[c] += max_len[c];
+	for (var i = 0; i < ee.length; ++i)
+		flt_flag[ee[i][0]] |= (!h_perf_genes[ee[i][1]] || ee[i][2])? 1 : 1<<1;	
+}
+
+var l_cons = 0, l_incons = 0;
+for (var c = 0; c < clist.length; ++c)
+	if (flt_flag[c]&2) l_cons += ovlp_len[c];
+	else if (flt_flag[c] == 1) l_incons += ovlp_len[c];
+
+warn("- Total length of contigs consistent/inconsistent with perfect genes: " +l_cons+ "/" +l_incons);
+var attempt_perf = (l_incons/(l_cons+l_incons) < thres_frac);
+
+/********************************
+ * Core function for genotyping *
+ ********************************/
+
+function type_gene(perf_mode)
+{
+	if (perf_mode) {
+		var flt_list = [];
+		for (var c = 0; c < clist.length; ++c)
+			if (flt_flag[c] == 1) flt_list.push(clist[c]);
+		warn("  - Filtered " +flt_list.length+ " inconsistent contig(s): [" +flt_list.join(",")+ "]");
+		exons = list2exons(list, flt_flag, h_perf_genes);
+	} else exons = list2exons(list);
+
+	/***********************
+	 * Score each genotype *
+	 ***********************/
+
+	// initialize genotype scores
+	var pair = [];
+	for (var i = 0; i < glist.length; ++i) {
+		pair[i] = [];
+		for (var j = 0; j <= i; ++j)
+			pair[i][j] = 0;
+	}
+
+	// these two arrays are used to output debugging information
+	var score = [], ctg = [];
+
+	function type_exon(e, gt_list)
+	{
+		function update_pair(x, m, is_pri)
+		{
+			var y, z;
+			y = (x>>14&0xff) + m < 0xff? (x>>14&0xff) + m : 0xff;
+			if (is_pri) z = (x>>22) + m < 0xff? (x>>22) + m : 0xff;
+			else z = x>>22;
+			return z<<22 | y<<14 | ((x&0x3fff) + (1<<6|is_pri));
+		}
+
+		score[e] = []; ctg[e] = [];
+		if (exons[e] == null) return;
+		var ee = exons[e], is_pri = pri_exon[e]? 1 : 0;
+		// find contigs and genes associated with the current exon
+		var ch = {}, gh = {};
+		for (var i = 0; i < ee.length; ++i)
+			if (elist[ee[i][1]][e] != null)
+				ch[ee[i][0]] = true, gh[ee[i][1]] = true;
+		var ga = [], ca = ctg[e];
+		for (var c in ch) ca.push(parseInt(c));
+		for (var g in gh) ga.push(parseInt(g));
+		var named_ca = [];
+		for (var i = 0; i < ca.length; ++i) named_ca.push(clist[ca[i]]);
+		warn("    - Processing exon "+(e+1)+" (" +ga.length+ " genes; " +ca.length+ " contigs: [" +named_ca.join(", ")+ "])...");
+		// set unmapped entries to high mismatch
+		var sc = score[e];
+		for (var k = 0; k < ga.length; ++k) {
+			var g = ga[k];
+			if (sc[g] == null) sc[g] = [];
+			for (var i = 0; i < ca.length; ++i)
+				sc[g][ca[i]] = 0xff;
+		}
+		// convert representation again and compute max_len[]
+		var max_len = [];
+		for (var i = 0; i < ee.length; ++i) {
+			var c = ee[i][0], g = ee[i][1];
+			if (gh[g] == null || ch[c] == null) continue;
+			sc[g][c] = sc[g][c] < ee[i][2]? sc[g][c] : ee[i][2];
+			if (max_len[c] == null) max_len[c] = 0;
+			max_len[c] = max_len[c] > ee[i][3]? max_len[c] : ee[i][3];
+		}
+		// drop mismapped contigs
+		var max_max_len = 0;
+		for (var k = 0; k < ca.length; ++k)
+			max_max_len = max_max_len > max_len[ca[k]]? max_max_len : max_len[ca[k]];
+		var dropped = [];
+		for (var k = 0; k < ca.length; ++k) {
+			var min = 0x7fffffff, c = ca[k];
+			for (var i = 0; i < ga.length; ++i) {
+				var g = ga[i];
+				min = min < sc[g][c]? min : sc[g][c];
+			}
+			dropped[c] = min > thres_nm? true : false;
+			if (max_len[c] < thres_len && max_len[c] < thres_ratio * max_max_len) dropped[c] = true;
+			if (dropped[c]) warn("      . Dropped low-quality contig " +clist[c]+ " (minNM=" +min+ "; maxLen=" +max_len[c]+ ")");
+		}
+		// fill the pair array
+		if (gt_list == null) {
+			for (var i = 0; i < ga.length; ++i) {
+				var m = 0, gi = ga[i], g1 = sc[gi];
+				// homozygous
+				for (var k = 0; k < ca.length; ++k) {
+					var c = ca[k];
+					if (!dropped[c]) m += g1[c];
+				}
+				pair[gi][gi] = update_pair(pair[gi][gi], m, is_pri);
+				// heterozygous
+				for (var j = i + 1; j < ga.length; ++j) {
+					var gj = ga[j], g2 = sc[gj], m = 0, a = [0, 0];
+					for (var k = 0; k < ca.length; ++k) {
+						var c = ca[k];
+						if (!dropped[c]) {
+							m += g1[c] < g2[c]? g1[c] : g2[c];
+							++a[g1[c]<g2[c]? 0:1];
+						}
+					}
+					if (a[0] == 0 || a[1] == 0) m = 0xff; // if all contigs are assigned to one gene, it is not good
+					if (gi < gj) pair[gj][gi] = update_pair(pair[gj][gi], m, is_pri);
+					else pair[gi][gj] = update_pair(pair[gi][gj], m, is_pri);
+				}
+			}
+		} else {
+			var tmp_pairs = [], min = 0xff;
+			for (var i = 0; i < gt_list.length; ++i) {
+				var gt = gt_list[i], m = 0;
+				var g1 = sc[gt[0]], g2 = sc[gt[1]], a = [0, 0];
+				if (g1 == null || g2 == null) continue;
+				if (gt[0] == gt[1]) {
+					for (var k = 0; k < ca.length; ++k) {
+						var c = ca[k];
+						if (!dropped[c]) m += g1[c];
+					}
+				} else {
+					var a = [0, 0];
+					for (k = 0; k < ca.length; ++k) {
+						var c = ca[k];
+						if (!dropped[c]) {
+							m += g1[c] < g2[c]? g1[c] : g2[c];
+							++a[g1[c]<g2[c]? 0:1];
+						}
+					}
+					if (a[0] == 0 || a[1] == 0) m = 0xff;
+				}
+				tmp_pairs.push([gt[0], gt[1], m]);
+				min = min < m? min : m;
+			}
+			if (min < 0xff) {
+				for (var i = 0; i < tmp_pairs.length; ++i) {
+					var t = tmp_pairs[i];
+					pair[t[0]][t[1]] = update_pair(pair[t[0]][t[1]], t[2], is_pri);
+				}
+			} else warn("      . Skipped exon " +(e+1)+ " as the assembly may be incomplete");
+		}
+	}
+
+	// type primary exons
+	warn("  - Processing primary exon(s)...");
+	for (var e = 0; e < exons.length; ++e)
+		if (pri_exon[e]) type_exon(e);
+
+	// generate the list of best genotypes on primary exons
+	var min_nm_pri = 0x7fffffff;
+	for (var i = 0; i < glist.length; ++i)
+		for (var j = 0; j <= i; ++j)
+			if ((pair[i][j]&63) == n_pri_exons)
+				min_nm_pri = min_nm_pri < pair[i][j]>>22? min_nm_pri : pair[i][j]>>22;
+
+	var gt_list = [];
+	for (var i = 0; i < glist.length; ++i)
+		for (var j = 0; j <= i; ++j)
+			if ((pair[i][j]&63) == n_pri_exons && pair[i][j]>>22 == min_nm_pri)
+				gt_list.push([i, j]);
+
+	warn("  - Collected " +gt_list.length+ " top genotypes on the primary exon(s); minimal edit distance: " +min_nm_pri);
+
+	// type other exons
+	warn("  - Processing other exon(s)...");
+	for (var e = 0; e < exons.length; ++e)
+		if (!pri_exon[e]) type_exon(e, gt_list);
+
+	/*****************************
+	 * Choose the best genotypes *
+	 *****************************/
+
+	// genotyping
+	var min_nm = 0x7fffffff;
+	for (var i = 0; i < glist.length; ++i)
+		for (var j = 0; j <= i; ++j)
+			if ((pair[i][j]&63) == n_pri_exons)
+				min_nm = min_nm < pair[i][j]>>14? min_nm : pair[i][j]>>14;
+
+	var out = [];
+	for (var i = 0; i < glist.length; ++i)
+		for (var j = 0; j <= i; ++j)
+			if ((pair[i][j]&63) == n_pri_exons && pair[i][j]>>14 <= min_nm + 1)
+				out.push([pair[i][j]>>14, pair[i][j]>>6&0xff, i, j, (gsuf[i] + gsuf[j])<<16|(gsub[i] + gsub[j])]);
+
+	out.sort(function(a, b) { return a[0]!=b[0]? a[0]-b[0] : a[1]!=b[1]? b[1]-a[1] : a[4]!=b[4]? a[4]-b[4] : a[2]!=b[2]? a[2]-b[2] : a[3]-b[3]});
+
+	return out;
+}
+
+/**********************
+ * Perform genotyping *
+ **********************/
+
+warn("- Typing in the imperfect mode...");
+var rst = type_gene(false);
+if (attempt_perf) {
+	warn("- Typing in the perfect mode...");
+	var rst_perf = type_gene(true);
+	warn("- Imperfect vs perfect mode: [" +(rst[0][0]>>8&0xff)+ "," +(rst[0][0]&0xff)+ "] vs [" +(rst_perf[0][0]>>8&0xff)+ "," +(rst_perf[0][0]&0xff)+ "]");
+	if (rst_perf[0][0] < rst[0][0]) {
+		warn("- Chose the result from the perfect mode");
+		rst = rst_perf;
+	} else warn("- Chose the result from the imperfect mode");
+} else warn("- Perfect mode is not attempted");
+
+/**********
+ * Output *
+ **********/
+
+for (var i = 0; i < rst.length; ++i)
+	print("GT", glist[rst[i][3]], glist[rst[i][2]], rst[i][0]>>8&0xff, rst[i][0]&0xff, rst[i][1]);
diff --git a/bwakit/typeHLA.sh b/bwakit/typeHLA.sh
new file mode 100755
index 0000000..b73100d
--- /dev/null
+++ b/bwakit/typeHLA.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+is_ctg=0
+
+if [ $# -gt 1 ] && [ $1 == '-A' ]; then
+	is_ctg=1
+	shift
+fi
+
+if [ $# -lt 2 ]; then
+	echo "Usage: $0 [-A] <prefix> <gene>"
+	exit 1
+fi
+
+preres="resource-human-HLA"
+root=`dirname $0`
+pre=$1.$2
+touch $pre.gt
+
+if [ ! -s $pre.fq ]; then
+	echo '** Empty input file. Abort!' >&2
+	exit 0
+fi
+
+if [ $is_ctg -eq 0 ]; then
+	echo "** De novo assembling..." >&2
+	len=`$root/seqtk comp $pre.fq | awk '{++x;y+=$2}END{printf("%.0f\n", y/x)}'`
+	$root/fermi2.pl unitig -f $root/fermi2 -r $root/ropebwt2 -t2 -l$len -p $pre.tmp $pre.fq > $pre.tmp.mak
+	make -f $pre.tmp.mak >&2
+	cp $pre.tmp.mag.gz $pre.mag.gz
+else
+	rm -f $pre.tmp.mag.gz
+	ln -s $pre.fq $pre.tmp.mag.gz
+fi
+
+echo "** Selecting contigs overlapping target exons..." >&2
+(ls $root/$preres/HLA-ALT-idx/*.fa.bwt | sed s,.bwt,, | xargs -i $root/bwa mem -t2 -B1 -O1 -E1 {} $pre.tmp.mag.gz 2>/dev/null) | grep -v ^@ | sort -k3,3 -k4,4n | gzip > $pre.tmp.ALT.sam.gz
+$root/k8 $root/typeHLA-selctg.js $2 $root/$preres/HLA-ALT-exons.bed $pre.tmp.ALT.sam.gz | $root/seqtk subseq $pre.tmp.mag.gz - | gzip -1 > $pre.tmp.fq.gz
+
+echo "** Mapping exons to de novo contigs..." >&2
+$root/bwa index -p $pre.tmp $pre.tmp.fq.gz 2>/dev/null
+$root/seqtk comp $root/$preres/HLA-CDS.fa | cut -f1 | grep ^$2 | $root/seqtk subseq $root/$preres/HLA-CDS.fa - | $root/bwa mem -aD.1 -t2 $pre.tmp - 2>/dev/null | gzip -1 > $pre.sam.gz
+
+echo "** Typing..." >&2
+$root/k8 $root/typeHLA.js $pre.sam.gz > $pre.gt
+
+# delete temporary files
+rm -f $pre.tmp.*
+[ $is_ctg -eq 1 ] && rm -f $pre.mag.gz
diff --git a/bwamem.c b/bwamem.c
index e9d9304..b32c166 100644
--- a/bwamem.c
+++ b/bwamem.c
@@ -2,6 +2,7 @@
 #include <string.h>
 #include <stdio.h>
 #include <assert.h>
+#include <limits.h>
 #include <math.h>
 #ifdef HAVE_PTHREAD
 #include <pthread.h>
@@ -57,6 +58,9 @@ mem_opt_t *mem_opt_init()
 	o->zdrop = 100;
 	o->pen_unpaired = 17;
 	o->pen_clip5 = o->pen_clip3 = 5;
+
+	o->max_mem_intv = 20;
+
 	o->min_seed_len = 19;
 	o->split_width = 10;
 	o->max_occ = 500;
@@ -68,7 +72,8 @@ mem_opt_t *mem_opt_init()
 	o->split_factor = 1.5;
 	o->chunk_size = 10000000;
 	o->n_threads = 1;
-	o->max_hits = 5;
+	o->max_XA_hits = 5;
+	o->max_XA_hits_alt = 200;
 	o->max_matesw = 50;
 	o->mask_level_redun = 0.95;
 	o->min_chain_weight = 0;
@@ -132,9 +137,26 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co
 		if (end - start < split_len || p->x[2] > opt->split_width) continue;
 		bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv);
 		for (i = 0; i < a->mem1.n; ++i)
-			if ((a->mem1.a[i].info>>32) - (uint32_t)a->mem1.a[i].info >= opt->min_seed_len)
+			if ((uint32_t)a->mem1.a[i].info - (a->mem1.a[i].info>>32) >= opt->min_seed_len)
 				kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
 	}
+	// third pass: LAST-like
+	if (opt->max_mem_intv > 0) {
+		x = 0;
+		while (x < len) {
+			if (seq[x] < 4) {
+				if (1) {
+					bwtintv_t m;
+					x = bwt_seed_strategy1(bwt, len, seq, x, opt->min_seed_len, opt->max_mem_intv, &m);
+					if (m.x[2] > 0) kv_push(bwtintv_t, a->mem, m);
+				} else { // for now, we never come to this block which is slower
+					x = bwt_smem1a(bwt, len, seq, x, start_width, opt->max_mem_intv, &a->mem1, a->tmpv);
+					for (i = 0; i < a->mem1.n; ++i)
+						kv_push(bwtintv_t, a->mem, a->mem1.a[i]);
+				}
+			} else ++x;
+		}
+	}
 	// sort
 	ks_introsort(mem_intv, a->mem.n, a->mem.a);
 }
@@ -151,7 +173,7 @@ typedef struct {
 
 typedef struct {
 	int n, m, first, rid;
-	uint32_t w:30, kept:2;
+	uint32_t w:29, kept:2, is_alt:1;
 	float frac_rep;
 	int64_t pos;
 	mem_seed_t *seeds;
@@ -252,7 +274,7 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn
 		bwtintv_t *p = &aux->mem.a[i];
 		int step, count, slen = (uint32_t)p->info - (p->info>>32); // seed length
 		int64_t k;
-		if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive
+		// if (slen < opt->min_seed_len) continue; // ignore if too short or too repetitive
 		step = p->x[2] > opt->max_occ? p->x[2] / opt->max_occ : 1;
 		for (k = count = 0; k < p->x[2] && count < opt->max_occ; k += step, ++count) {
 			mem_chain_t tmp, *lower, *upper;
@@ -272,6 +294,7 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn
 				tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t));
 				tmp.seeds[0] = s;
 				tmp.rid = rid;
+				tmp.is_alt = !!bns->anns[rid].is_alt;
 				kb_putp(chn, tree, &tmp);
 			}
 		}
@@ -325,7 +348,7 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *a)
 			int j = chains.a[k];
 			int b_max = chn_beg(a[j]) > chn_beg(a[i])? chn_beg(a[j]) : chn_beg(a[i]);
 			int e_min = chn_end(a[j]) < chn_end(a[i])? chn_end(a[j]) : chn_end(a[i]);
-			if (e_min > b_max) { // have overlap
+			if (e_min > b_max && (!a[j].is_alt || a[i].is_alt)) { // have overlap; don't consider ovlp where the kept chain is ALT while the current chain is primary
 				int li = chn_end(a[i]) - chn_beg(a[i]);
 				int lj = chn_end(a[j]) - chn_beg(a[j]);
 				int min_l = li < lj? li : lj;
@@ -371,9 +394,12 @@ KSORT_INIT(mem_ars2, mem_alnreg_t, alnreg_slt2)
 #define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb))))
 KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt)
 
-#define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash))
+#define alnreg_hlt(a, b)  ((a).score > (b).score || ((a).score == (b).score && ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && (a).hash < (b).hash))))
 KSORT_INIT(mem_ars_hash, mem_alnreg_t, alnreg_hlt)
 
+#define alnreg_hlt2(a, b) ((a).is_alt < (b).is_alt || ((a).is_alt == (b).is_alt && ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash))))
+KSORT_INIT(mem_ars_hash2, mem_alnreg_t, alnreg_hlt2)
+
 #define PATCH_MAX_R_BW 0.05f
 #define PATCH_MIN_SC_RATIO 0.90f
 
@@ -469,36 +495,73 @@ int mem_test_and_remove_exact(const mem_opt_t *opt, int n, mem_alnreg_t *a, int
 	return n - 1;
 }
 
-void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id)
+typedef kvec_t(int) int_v;
+
+static void mem_mark_primary_se_core(const mem_opt_t *opt, int n, mem_alnreg_t *a, int_v *z)
 { // similar to the loop in mem_chain_flt()
 	int i, k, tmp;
-	kvec_t(int) z;
-	if (n == 0) return;
-	kv_init(z);
-	for (i = 0; i < n; ++i) a[i].sub = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i);
-	ks_introsort(mem_ars_hash, n, a);
 	tmp = opt->a + opt->b;
 	tmp = opt->o_del + opt->e_del > tmp? opt->o_del + opt->e_del : tmp;
 	tmp = opt->o_ins + opt->e_ins > tmp? opt->o_ins + opt->e_ins : tmp;
-	kv_push(int, z, 0);
+	z->n = 0;
+	kv_push(int, *z, 0);
 	for (i = 1; i < n; ++i) {
-		for (k = 0; k < z.n; ++k) {
-			int j = z.a[k];
+		for (k = 0; k < z->n; ++k) {
+			int j = z->a[k];
 			int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb;
 			int e_min = a[j].qe < a[i].qe? a[j].qe : a[i].qe;
 			if (e_min > b_max) { // have overlap
 				int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? a[i].qe - a[i].qb : a[j].qe - a[j].qb;
 				if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap
 					if (a[j].sub == 0) a[j].sub = a[i].score;
-					if (a[j].score - a[i].score <= tmp) ++a[j].sub_n;
+					if (a[j].score - a[i].score <= tmp && (a[j].is_alt || !a[i].is_alt))
+						++a[j].sub_n;
 					break;
 				}
 			}
 		}
-		if (k == z.n) kv_push(int, z, i);
-		else a[i].secondary = z.a[k];
+		if (k == z->n) kv_push(int, *z, i);
+		else a[i].secondary = z->a[k];
+	}
+}
+
+int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id)
+{
+	int i, n_pri;
+	int_v z = {0,0,0};
+	if (n == 0) return 0;
+	for (i = n_pri = 0; i < n; ++i) {
+		a[i].sub = a[i].alt_sc = 0, a[i].secondary = a[i].secondary_all = -1, a[i].hash = hash_64(id+i);
+		if (!a[i].is_alt) ++n_pri;
+	}
+	ks_introsort(mem_ars_hash, n, a);
+	mem_mark_primary_se_core(opt, n, a, &z);
+	for (i = 0; i < n; ++i) {
+		mem_alnreg_t *p = &a[i];
+		p->secondary_all = i; // keep the rank in the first round
+		if (!p->is_alt && p->secondary >= 0 && a[p->secondary].is_alt)
+			p->alt_sc = a[p->secondary].score;
+	}
+	if (n_pri >= 0 && n_pri < n) {
+		kv_resize(int, z, n);
+		if (n_pri > 0) ks_introsort(mem_ars_hash2, n, a);
+		for (i = 0; i < n; ++i) z.a[a[i].secondary_all] = i;
+		for (i = 0; i < n; ++i) {
+			if (a[i].secondary >= 0) {
+				a[i].secondary_all = z.a[a[i].secondary];
+				if (a[i].is_alt) a[i].secondary = INT_MAX;
+			} else a[i].secondary_all = -1;
+		}
+		if (n_pri > 0) { // mark primary for hits to the primary assembly only
+			for (i = 0; i < n_pri; ++i) a[i].sub = 0, a[i].secondary = -1;
+			mem_mark_primary_se_core(opt, n_pri, a, &z);
+		}
+	} else {
+		for (i = 0; i < n; ++i)
+			a[i].secondary_all = a[i].secondary;
 	}
 	free(z.a);
+	return n_pri;
 }
 
 /*********************************
@@ -621,12 +684,12 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
 			// qd: distance ahead of the seed on query; rd: on reference
 			qd = s->qbeg - p->qb; rd = s->rbeg - p->rb;
 			max_gap = cal_max_gap(opt, qd < rd? qd : rd); // the maximal gap allowed in regions ahead of the seed
-			w = max_gap < opt->w? max_gap : opt->w; // bounded by the band width
+			w = max_gap < p->w? max_gap : p->w; // bounded by the band width
 			if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit
 			// similar to the previous four lines, but this time we look at the region behind
 			qd = p->qe - (s->qbeg + s->len); rd = p->re - (s->rbeg + s->len);
 			max_gap = cal_max_gap(opt, qd < rd? qd : rd);
-			w = max_gap < opt->w? max_gap : opt->w;
+			w = max_gap < p->w? max_gap : p->w;
 			if (qd - rd < w && rd - qd < w) break;
 		}
 		if (i < av->n) { // the seed is (almost) contained in an existing alignment; further testing is needed to confirm it is not leading to a different aln
@@ -753,7 +816,7 @@ static inline int get_rlen(int n_cigar, const uint32_t *cigar)
 	return l;
 }
 
-void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_, int softclip_all)
+void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_)
 {
 	int i, l_name;
 	mem_aln_t ptmp = list[which], *p = &ptmp, mtmp, *m = 0; // make a copy of the alignment to convert
@@ -782,7 +845,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
 		if (p->n_cigar) { // aligned
 			for (i = 0; i < p->n_cigar; ++i) {
 				int c = p->cigar[i]&0xf;
-				if (!softclip_all && (c == 3 || c == 4))
+				if (!(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt && (c == 3 || c == 4))
 					c = which? 4 : 3; // use hard clipping for supplementary alignments
 				kputw(p->cigar[i]>>4, str); kputc("MIDSH"[c], str);
 			}
@@ -810,7 +873,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
 		kputsn("*\t*", 3, str);
 	} else if (!p->is_rev) { // the forward strand
 		int i, qb = 0, qe = s->l_seq;
-		if (p->n_cigar && which && !softclip_all) { // have cigar && not the primary alignment && not softclip all
+		if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt) { // have cigar && not the primary alignment && not softclip all
 			if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qb += p->cigar[0]>>4;
 			if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qe -= p->cigar[p->n_cigar-1]>>4;
 		}
@@ -824,7 +887,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
 		} else kputc('*', str);
 	} else { // the reverse strand
 		int i, qb = 0, qe = s->l_seq;
-		if (p->n_cigar && which && !softclip_all) {
+		if (p->n_cigar && which && !(opt->flag&MEM_F_SOFTCLIP) && !p->is_alt) {
 			if ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3) qe -= p->cigar[0]>>4;
 			if ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3) qb += p->cigar[p->n_cigar-1]>>4;
 		}
@@ -854,7 +917,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
 			for (i = 0; i < n; ++i) {
 				const mem_aln_t *r = &list[i];
 				int k;
-				if (i == which || (list[i].flag&0x100)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit
+				if (i == which || (r->flag&0x100)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit
 				kputs(bns->anns[r->rid].name, str); kputc(',', str);
 				kputl(r->pos+1, str); kputc(',', str);
 				kputc("+-"[r->is_rev], str); kputc(',', str);
@@ -866,9 +929,19 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m
 				kputc(';', str);
 			}
 		}
+		if (p->alt_sc > 0)
+			ksprintf(str, "\tpa:f:%.3f", (double)p->score / p->alt_sc);
 	}
 	if (p->XA) { kputsn("\tXA:Z:", 6, str); kputs(p->XA, str); }
 	if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
+	if ((opt->flag&MEM_F_REF_HDR) && p->rid >= 0 && bns->anns[p->rid].anno != 0 && bns->anns[p->rid].anno[0] != 0) {
+		int tmp;
+		kputsn("\tXR:Z:", 6, str);
+		tmp = str->l;
+		kputs(bns->anns[p->rid].anno, str);
+		for (i = tmp; i < str->l; ++i) // replace TAB in the comment to SPACE
+			if (str->s[i] == '\t') str->s[i] = ' ';
+	}
 	kputc('\n', str);
 }
 
@@ -903,42 +976,43 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a)
 }
 
 // TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible
-void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m)
+void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m)
 {
 	extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, mem_alnreg_v *a, int l_query, const char *query);
 	kstring_t str;
 	kvec_t(mem_aln_t) aa;
-	int k;
+	int k, l;
 	char **XA = 0;
 
 	if (!(opt->flag & MEM_F_ALL))
 		XA = mem_gen_alt(opt, bns, pac, a, s->l_seq, s->seq);
 	kv_init(aa);
 	str.l = str.m = 0; str.s = 0;
-	for (k = 0; k < a->n; ++k) {
+	for (k = l = 0; k < a->n; ++k) {
 		mem_alnreg_t *p = &a->a[k];
 		mem_aln_t *q;
 		if (p->score < opt->T) continue;
-		if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue;
-		if (p->secondary >= 0 && p->score < a->a[p->secondary].score * opt->drop_ratio) continue;
+		if (p->secondary >= 0 && (p->is_alt || !(opt->flag&MEM_F_ALL))) continue;
+		if (p->secondary >= 0 && p->secondary < INT_MAX && p->score < a->a[p->secondary].score * opt->drop_ratio) continue;
 		q = kv_pushp(mem_aln_t, aa);
 		*q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p);
 		assert(q->rid >= 0); // this should not happen with the new code
 		q->XA = XA? XA[k] : 0;
 		q->flag |= extra_flag; // flag secondary
 		if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score
-		if (k && p->secondary < 0) // if supplementary
+		if (l && p->secondary < 0) // if supplementary
 			q->flag |= (opt->flag&MEM_F_NO_MULTI)? 0x10000 : 0x800;
-		if (k && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq;
+		if (l && !p->is_alt && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq;
+		++l;
 	}
 	if (aa.n == 0) { // no alignments good enough; then write an unaligned record
 		mem_aln_t t;
 		t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0);
 		t.flag |= extra_flag;
-		mem_aln2sam(bns, &str, s, 1, &t, 0, m, opt->flag&MEM_F_SOFTCLIP);
+		mem_aln2sam(opt, bns, &str, s, 1, &t, 0, m);
 	} else {
 		for (k = 0; k < aa.n; ++k)
-			mem_aln2sam(bns, &str, s, aa.n, aa.a, k, m, opt->flag&MEM_F_SOFTCLIP);
+			mem_aln2sam(opt, bns, &str, s, aa.n, aa.a, k, m);
 		for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar);
 		free(aa.a);
 	}
@@ -981,6 +1055,11 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse
 			printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re);
 		}
 	}
+	for (i = 0; i < regs.n; ++i) {
+		mem_alnreg_t *p = &regs.a[i];
+		if (p->rid >= 0 && bns->anns[p->rid].is_alt)
+			p->is_alt = 1;
+	}
 	return regs;
 }
 
@@ -1051,6 +1130,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *
 	assert(a.rid == ar->rid);
 	a.pos = pos - bns->anns[a.rid].offset;
 	a.score = ar->score; a.sub = ar->sub > ar->csub? ar->sub : ar->csub;
+	a.is_alt = ar->is_alt; a.alt_sc = ar->alt_sc;
 	free(query);
 	return a;
 }
@@ -1092,7 +1172,7 @@ static void worker2(void *data, int i, int tid)
 			mem_reg2ovlp(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i]);
 		} else {
 			mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i);
-			mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);
+			mem_reg2sam(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);
 		}
 		free(w->regs[i].a);
 	} else {
@@ -1106,16 +1186,15 @@ void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn
 {
 	extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n);
 	worker_t w;
-	mem_alnreg_v *regs;
 	mem_pestat_t pes[4];
 	double ctime, rtime;
 	int i;
 
 	ctime = cputime(); rtime = realtime();
 	global_bns = bns;
-	regs = malloc(n * sizeof(mem_alnreg_v));
+	w.regs = malloc(n * sizeof(mem_alnreg_v));
 	w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac;
-	w.seqs = seqs; w.regs = regs; w.n_processed = n_processed;
+	w.seqs = seqs; w.n_processed = n_processed;
 	w.pes = &pes[0];
 	w.aux = malloc(opt->n_threads * sizeof(smem_aux_t));
 	for (i = 0; i < opt->n_threads; ++i)
@@ -1126,10 +1205,10 @@ void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn
 	free(w.aux);
 	if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided
 		if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0
-		else mem_pestat(opt, bns->l_pac, n, regs, pes); // otherwise, infer the insert size distribution from data
+		else mem_pestat(opt, bns->l_pac, n, w.regs, pes); // otherwise, infer the insert size distribution from data
 	}
 	kt_for(opt->n_threads, worker2, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // generate alignment
-	free(regs);
+	free(w.regs);
 	if (bwa_verbose >= 3)
 		fprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\n", __func__, n, cputime() - ctime, realtime() - rtime);
 }
diff --git a/bwamem.h b/bwamem.h
index 7b14ca8..8ffe729 100644
--- a/bwamem.h
+++ b/bwamem.h
@@ -18,7 +18,9 @@ typedef struct __smem_i smem_i;
 #define MEM_F_NO_RESCUE 0x20
 #define MEM_F_SELF_OVLP 0x40
 #define MEM_F_ALN_REG   0x80
+#define MEM_F_REF_HDR	0x100
 #define MEM_F_SOFTCLIP  0x200
+#define MEM_F_SMARTPE   0x400
 
 typedef struct {
 	int a, b;               // match score and mismatch penalty
@@ -29,6 +31,8 @@ typedef struct {
 	int w;                  // band width
 	int zdrop;              // Z-dropoff
 
+	uint64_t max_mem_intv;
+
 	int T;                  // output score threshold; only affecting output
 	int flag;               // see MEM_F_* macros
 	int min_seed_len;       // minimum seed length
@@ -48,7 +52,7 @@ typedef struct {
 	int mapQ_coef_fac;
 	int max_ins;            // when estimating insert size distribution, skip pairs with insert longer than this value
 	int max_matesw;         // perform maximally max_matesw rounds of mate-SW for each end
-	int max_hits;           // if there are max_hits or fewer, output them all
+	int max_XA_hits, max_XA_hits_alt; // if there are max_hits or fewer, output them all
 	int8_t mat[25];         // scoring matrix; mat[0] == 0 if unset
 } mem_opt_t;
 
@@ -59,13 +63,15 @@ typedef struct {
 	int score;      // best local SW score
 	int truesc;     // actual score corresponding to the aligned region; possibly smaller than $score
 	int sub;        // 2nd best SW score
+	int alt_sc;
 	int csub;       // SW score of a tandem hit
 	int sub_n;      // approximate number of suboptimal hits
 	int w;          // actual band width used in extension
 	int seedcov;    // length of regions coverged by seeds
 	int secondary;  // index of the parent hit shadowing the current hit; <0 if primary
+	int secondary_all;
 	int seedlen0;   // length of the starting seed
-	int n_comp;     // number of sub-alignments chained together
+	int n_comp:30, is_alt:2; // number of sub-alignments chained together
 	float frac_rep;
 	uint64_t hash;
 } mem_alnreg_t;
@@ -82,12 +88,12 @@ typedef struct { // This struct is only used for the convenience of API.
 	int64_t pos;     // forward strand 5'-end mapping position
 	int rid;         // reference sequence index in bntseq_t; <0 for unmapped
 	int flag;        // extra flag
-	uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
+	uint32_t is_rev:1, is_alt:1, mapq:8, NM:22; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance
 	int n_cigar;     // number of CIGAR operations
 	uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234
 	char *XA;        // alternative mappings
 
-	int score, sub;
+	int score, sub, alt_sc;
 } mem_aln_t;
 
 #ifdef __cplusplus
@@ -97,6 +103,7 @@ extern "C" {
 	smem_i *smem_itr_init(const bwt_t *bwt);
 	void smem_itr_destroy(smem_i *itr);
 	void smem_set_query(smem_i *itr, int len, const uint8_t *query);
+	void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv);
 	const bwtintv_v *smem_next(smem_i *itr);
 
 	mem_opt_t *mem_opt_init(void);
diff --git a/bwamem_extra.c b/bwamem_extra.c
index 59bb68e..02e817a 100644
--- a/bwamem_extra.c
+++ b/bwamem_extra.c
@@ -1,3 +1,4 @@
+#include <limits.h>
 #include "bwa.h"
 #include "bwamem.h"
 #include "bntseq.h"
@@ -11,6 +12,8 @@ struct __smem_i {
 	const bwt_t *bwt;
 	const uint8_t *query;
 	int start, len;
+	int min_intv, max_len;
+	uint64_t max_intv;
 	bwtintv_v *matches; // matches; to be returned by smem_next()
 	bwtintv_v *sub;     // sub-matches inside the longest match; temporary
 	bwtintv_v *tmpvec[2]; // temporary arrays
@@ -25,6 +28,9 @@ smem_i *smem_itr_init(const bwt_t *bwt)
 	itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v));
 	itr->matches   = calloc(1, sizeof(bwtintv_v));
 	itr->sub       = calloc(1, sizeof(bwtintv_v));
+	itr->min_intv = 1;
+	itr->max_len  = INT_MAX;
+	itr->max_intv = 0;
 	return itr;
 }
 
@@ -44,21 +50,22 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query)
 	itr->len = len;
 }
 
+void smem_config(smem_i *itr, int min_intv, int max_len, uint64_t max_intv)
+{
+	itr->min_intv = min_intv;
+	itr->max_len  = max_len;
+	itr->max_intv = max_intv;
+}
+
 const bwtintv_v *smem_next(smem_i *itr)
 {
-	int i, max, max_i, ori_start;
+	int ori_start;
 	itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0;
 	if (itr->start >= itr->len || itr->start < 0) return 0;
 	while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
 	if (itr->start == itr->len) return 0;
 	ori_start = itr->start;
-	itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM
-	if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here
-	for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match
-		bwtintv_t *p = &itr->matches->a[i];
-		int len = (uint32_t)p->info - (p->info>>32);
-		if (max < len) max = len, max_i = i;
-	}
+	itr->start = bwt_smem1a(itr->bwt, itr->len, itr->query, ori_start, itr->min_intv, itr->max_intv, itr->matches, itr->tmpvec); // search for SMEM
 	return itr->matches;
 }
 
@@ -105,43 +112,54 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac,
 	s->sam = str.s;
 }
 
+static inline int get_pri_idx(double XA_drop_ratio, const mem_alnreg_t *a, int i)
+{
+	int k = a[i].secondary_all;
+	if (k >= 0 && a[i].score >= a[k].score * XA_drop_ratio) return k;
+	return -1;
+}
+
 // Okay, returning strings is bad, but this has happened a lot elsewhere. If I have time, I need serious code cleanup.
 char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query) // ONLY work after mem_mark_primary_se()
 {
-	int i, k, *cnt, tot;
-	kstring_t *aln = 0;
-	char **XA = 0;
+	int i, k, r, *cnt, tot;
+	kstring_t *aln = 0, str = {0,0,0};
+	char **XA = 0, *has_alt;
 
 	cnt = calloc(a->n, sizeof(int));
+	has_alt = calloc(a->n, 1);
 	for (i = 0, tot = 0; i < a->n; ++i) {
-		int j = a->a[i].secondary;
-		if (j >= 0 && a->a[i].score >= a->a[j].score * opt->XA_drop_ratio)
-			++cnt[j], ++tot;
+		r = get_pri_idx(opt->XA_drop_ratio, a->a, i);
+		if (r >= 0) {
+			++cnt[r], ++tot;
+			if (a->a[i].is_alt) has_alt[r] = 1;
+		}
 	}
 	if (tot == 0) goto end_gen_alt;
 	aln = calloc(a->n, sizeof(kstring_t));
 	for (i = 0; i < a->n; ++i) {
 		mem_aln_t t;
-		int j = a->a[i].secondary;
-		if (j < 0 || a->a[i].score < a->a[j].score * opt->XA_drop_ratio) continue; // we don't process the primary alignments as they will be converted to SAM later
-		if (cnt[j] > opt->max_hits) continue;
+		if ((r = get_pri_idx(opt->XA_drop_ratio, a->a, i)) < 0) continue;
+		if (cnt[r] > opt->max_XA_hits_alt || (!has_alt[r] && cnt[r] > opt->max_XA_hits)) continue;
 		t = mem_reg2aln(opt, bns, pac, l_query, query, &a->a[i]);
-		kputs(bns->anns[t.rid].name, &aln[j]);
-		kputc(',', &aln[j]); kputc("+-"[t.is_rev], &aln[j]); kputl(t.pos + 1, &aln[j]);
-		kputc(',', &aln[j]);
+		str.l = 0;
+		kputs(bns->anns[t.rid].name, &str);
+		kputc(',', &str); kputc("+-"[t.is_rev], &str); kputl(t.pos + 1, &str);
+		kputc(',', &str);
 		for (k = 0; k < t.n_cigar; ++k) {
-			kputw(t.cigar[k]>>4, &aln[j]);
-			kputc("MIDSHN"[t.cigar[k]&0xf], &aln[j]);
+			kputw(t.cigar[k]>>4, &str);
+			kputc("MIDSHN"[t.cigar[k]&0xf], &str);
 		}
-		kputc(',', &aln[j]); kputw(t.NM, &aln[j]);
-		kputc(';', &aln[j]);
+		kputc(',', &str); kputw(t.NM, &str);
+		kputc(';', &str);
 		free(t.cigar);
+		kputsn(str.s, str.l, &aln[r]);
 	}
 	XA = calloc(a->n, sizeof(char*));
 	for (k = 0; k < a->n; ++k)
 		XA[k] = aln[k].s;
 
 end_gen_alt:
-	free(cnt); free(aln);
+	free(has_alt); free(cnt); free(aln); free(str.s);
 	return XA;
 }
diff --git a/bwamem_pair.c b/bwamem_pair.c
index a3aeb80..7950736 100644
--- a/bwamem_pair.c
+++ b/bwamem_pair.c
@@ -70,6 +70,7 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *
 		if (q->n < MIN_DIR_CNT) {
 			fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
 			r->failed = 1;
+			free(q->a);
 			continue;
 		} else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
 		ks_introsort_64(q->n, q->a);
@@ -151,6 +152,7 @@ int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
 			memset(&b, 0, sizeof(mem_alnreg_t));
 			if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0
 				b.rid = a->rid;
+				b.is_alt = a->is_alt;
 				b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb;                                                                                                                                                                              
 				b.qe = is_rev? l_ms - aln.qb : aln.qe + 1; 
 				b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb;
@@ -177,14 +179,14 @@ int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
 	return n;
 }
 
-int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2])
+int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2], int n_pri[2])
 {
 	pair64_v v, u;
 	int r, i, k, y[4], ret; // y[] keeps the last hit
 	int64_t l_pac = bns->l_pac;
 	kv_init(v); kv_init(u);
 	for (r = 0; r < 2; ++r) { // loop through read number
-		for (i = 0; i < a[r].n; ++i) {
+		for (i = 0; i < n_pri[r]; ++i) {
 			pair64_t key;
 			mem_alnreg_t *e = &a[r].a[i];
 			key.x = e->rb < l_pac? e->rb : (l_pac<<1) - 1 - e->rb; // forward position
@@ -240,21 +242,25 @@ int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, cons
 	return ret;
 }
 
+void mem_aln2sam(const mem_opt_t *opt, const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m);
+
 #define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499))
 
 int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2])
 {
-	extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id);
+	extern int mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id);
 	extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a);
-	extern void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m);
-	extern void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m, int softclip_all);
+	extern void mem_reg2sam(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m);
 	extern char **mem_gen_alt(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_v *a, int l_query, const char *query);
 
-	int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1;
+	int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1, n_pri[2], n_aa[2];
 	kstring_t str;
-	mem_aln_t h[2];
+	mem_aln_t h[2], g[2], aa[2][2];
 
 	str.l = str.m = 0; str.s = 0;
+	memset(h, 0, sizeof(mem_aln_t) * 2);
+	memset(g, 0, sizeof(mem_aln_t) * 2);
+	n_aa[0] = n_aa[1] = 0;
 	if (!(opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment
 		mem_alnreg_v b[2];
 		kv_init(b[0]); kv_init(b[1]);
@@ -267,18 +273,18 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
 				n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]);
 		free(b[0].a); free(b[1].a);
 	}
-	mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0);
-	mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1);
+	n_pri[0] = mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0);
+	n_pri[1] = mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1);
 	if (opt->flag&MEM_F_NOPAIRING) goto no_pairing;
 	// pairing single-end hits
-	if (a[0].n && a[1].n && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) {
+	if (n_pri[0] && n_pri[1] && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z, n_pri)) > 0) {
 		int is_multi[2], q_pe, score_un, q_se[2];
 		char **XA[2];
 		// check if an end has multiple hits even after mate-SW
 		for (i = 0; i < 2; ++i) {
-			for (j = 1; j < a[i].n; ++j)
+			for (j = 1; j < n_pri[i]; ++j)
 				if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) break;
-			is_multi[i] = j < a[i].n? 1 : 0;
+			is_multi[i] = j < n_pri[i]? 1 : 0;
 		}
 		if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score
 		// compute mapQ for the best SE hit
@@ -310,42 +316,62 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co
 			q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]);
 			q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]);
 		}
-		// suboptimal hits
+		for (i = 0; i < 2; ++i) {
+			int k = a[i].a[z[i]].secondary_all;
+			if (k >= 0 && k < n_pri[i]) { // switch secondary and primary if both of them are non-ALT
+				assert(a[i].a[k].secondary_all < 0);
+				for (j = 0; j < a[i].n; ++j)
+					if (a[i].a[j].secondary_all == k || j == k)
+						a[i].a[j].secondary_all = z[i];
+				a[i].a[z[i]].secondary_all = -1;
+			}
+		}
 		if (!(opt->flag & MEM_F_ALL)) {
-			for (i = 0; i < 2; ++i) {
-				int k = a[i].a[z[i]].secondary;
-				if (k >= 0) { // switch secondary and primary
-					assert(a[i].a[k].secondary < 0);
-					for (j = 0; j < a[i].n; ++j)
-						if (a[i].a[j].secondary == k || j == k)
-							a[i].a[j].secondary = z[i];
-					a[i].a[z[i]].secondary = -1;
-				}
+			for (i = 0; i < 2; ++i)
 				XA[i] = mem_gen_alt(opt, bns, pac, &a[i], s[i].l_seq, s[i].seq);
-			}
 		} else XA[0] = XA[1] = 0;
 		// write SAM
-		h[0] = mem_reg2aln(opt, bns, pac, s[0].l_seq, s[0].seq, &a[0].a[z[0]]); h[0].mapq = q_se[0]; h[0].flag |= 0x40 | extra_flag; h[0].XA = XA[0]? XA[0][z[0]] : 0;
-		h[1] = mem_reg2aln(opt, bns, pac, s[1].l_seq, s[1].seq, &a[1].a[z[1]]); h[1].mapq = q_se[1]; h[1].flag |= 0x80 | extra_flag; h[1].XA = XA[1]? XA[1][z[1]] : 0;
-		mem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1], opt->flag&MEM_F_SOFTCLIP); s[0].sam = strdup(str.s); str.l = 0;
-		mem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0], opt->flag&MEM_F_SOFTCLIP); s[1].sam = str.s;
-		if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
-		free(h[0].cigar); // h[0].XA will be freed in the following block
-		free(h[1].cigar);
-		// free XA
 		for (i = 0; i < 2; ++i) {
-			if (XA[i]) {
-				for (j = 0; j < a[i].n; ++j) free(XA[i][j]);
-				free(XA[i]);
+			h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[z[i]]);
+			h[i].mapq = q_se[i];
+			h[i].flag |= 0x40<<i | extra_flag;
+			h[i].XA = XA[i]? XA[i][z[i]] : 0;
+			aa[i][n_aa[i]++] = h[i];
+			if (n_pri[i] < a[i].n) { // the read has ALT hits
+				mem_alnreg_t *p = &a[i].a[n_pri[i]];
+				if (p->score < opt->T || p->secondary >= 0 || !p->is_alt) continue;
+				g[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, p);
+				g[i].flag |= 0x800 | 0x40<<i | extra_flag;
+				g[i].XA = XA[i]? XA[i][n_pri[i]] : 0;
+				aa[i][n_aa[i]++] = g[i];
 			}
 		}
+		for (i = 0; i < n_aa[0]; ++i)
+			mem_aln2sam(opt, bns, &str, &s[0], n_aa[0], aa[0], i, &h[1]); // write read1 hits
+		s[0].sam = strdup(str.s); str.l = 0;
+		for (i = 0; i < n_aa[1]; ++i)
+			mem_aln2sam(opt, bns, &str, &s[1], n_aa[1], aa[1], i, &h[0]); // write read2 hits
+		s[1].sam = str.s;
+		if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
+		// free
+		for (i = 0; i < 2; ++i) {
+			free(h[i].cigar); free(g[i].cigar);
+			if (XA[i] == 0) continue;
+			for (j = 0; j < a[i].n; ++j) free(XA[i][j]);
+			free(XA[i]);
+		}
 	} else goto no_pairing;
 	return n;
 
 no_pairing:
 	for (i = 0; i < 2; ++i) {
-		if (a[i].n && a[i].a[0].score >= opt->T)
-			h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[0]);
+		int which = -1;
+		if (a[i].n) {
+			if (a[i].a[0].score >= opt->T) which = 0;
+			else if (n_pri[i] < a[i].n && a[i].a[n_pri[i]].score >= opt->T)
+				which = n_pri[i];
+		}
+		if (which >= 0) h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[which]);
 		else h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0);
 	}
 	if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it.
@@ -354,8 +380,8 @@ no_pairing:
 		d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist);
 		if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) extra_flag |= 2;
 	}
-	mem_reg2sam_se(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]);
-	mem_reg2sam_se(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]);
+	mem_reg2sam(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]);
+	mem_reg2sam(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]);
 	if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name);
 	free(h[0].cigar); free(h[1].cigar);
 	return n;
diff --git a/bwashm.c b/bwashm.c
new file mode 100644
index 0000000..163f764
--- /dev/null
+++ b/bwashm.c
@@ -0,0 +1,213 @@
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include "bwa.h"
+
+int bwa_shm_stage(bwaidx_t *idx, const char *hint, const char *_tmpfn)
+{
+	const char *name;
+	uint8_t *shm, *shm_idx;
+	uint16_t *cnt;
+	int shmid, to_init = 0, l;
+	char path[PATH_MAX + 1], *tmpfn = (char*)_tmpfn;
+
+	if (hint == 0 || hint[0] == 0) return -1;
+	for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
+	++name;
+
+	if ((shmid = shm_open("/bwactl", O_RDWR, 0)) < 0) {
+		shmid = shm_open("/bwactl", O_CREAT|O_RDWR|O_EXCL, 0644);
+		to_init = 1;
+	}
+	if (shmid < 0) return -1;
+	ftruncate(shmid, BWA_CTL_SIZE);
+	shm = mmap(0, BWA_CTL_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
+	cnt = (uint16_t*)shm;
+	if (to_init) {
+		memset(shm, 0, BWA_CTL_SIZE);
+		cnt[1] = 4;
+	}
+
+	if (idx->mem == 0) bwa_idx2mem(idx);
+
+	if (tmpfn) {
+		FILE *fp;
+		if ((fp = fopen(tmpfn, "wb")) != 0) {
+			int64_t rest = idx->l_mem;
+			while (rest > 0) {
+				int64_t l = rest < 0x1000000? rest : 0x1000000;
+				rest -= fwrite(&idx->mem[idx->l_mem - rest], 1, l, fp);
+			}
+			fclose(fp);
+			free(idx->mem); idx->mem = 0;
+		} else {
+			fprintf(stderr, "[W::%s] fail to create the temporary file. Option '-f' is ignored.\n", __func__);
+			tmpfn = 0;
+		}
+	}
+
+	strcat(strcpy(path, "/bwaidx-"), name);
+	if ((shmid = shm_open(path, O_CREAT|O_RDWR|O_EXCL, 0644)) < 0) {
+		shm_unlink(path);
+		perror("shm_open()");
+		return -1;
+	}
+	l = 8 + strlen(name) + 1;
+	if (cnt[1] + l > BWA_CTL_SIZE) return -1;
+	memcpy(shm + cnt[1], &idx->l_mem, 8);
+	memcpy(shm + cnt[1] + 8, name, l - 8);
+	cnt[1] += l; ++cnt[0];
+	ftruncate(shmid, idx->l_mem);
+	shm_idx = mmap(0, idx->l_mem, PROT_READ|PROT_WRITE, MAP_SHARED, shmid, 0);
+	if (tmpfn) {
+		FILE *fp;
+		fp = fopen(tmpfn, "rb");
+		int64_t rest = idx->l_mem;
+		while (rest > 0) {
+			int64_t l = rest < 0x1000000? rest : 0x1000000;
+			rest -= fread(&shm_idx[idx->l_mem - rest], 1, l, fp);
+		}
+		fclose(fp);
+		unlink(tmpfn);
+	} else {
+		memcpy(shm_idx, idx->mem, idx->l_mem);
+		free(idx->mem);
+	}
+	bwa_mem2idx(idx->l_mem, shm_idx, idx);
+	idx->is_shm = 1;
+	return 0;
+}
+
+bwaidx_t *bwa_idx_load_from_shm(const char *hint)
+{
+	const char *name;
+	uint8_t *shm, *shm_idx;
+	uint16_t *cnt, i;
+	char *p, path[PATH_MAX + 1];
+	int shmid;
+	int64_t l_mem;
+	bwaidx_t *idx;
+
+	if (hint == 0 || hint[0] == 0) return 0;
+	for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
+	++name;
+	if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return 0;
+	shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
+	cnt = (uint16_t*)shm;
+	if (cnt[0] == 0) return 0;
+	for (i = 0, p = (char*)(shm + 4); i < cnt[0]; ++i) {
+		memcpy(&l_mem, p, 8); p += 8;
+		if (strcmp(p, name) == 0) break;
+		p += strlen(p) + 1;
+	}
+	if (i == cnt[0]) return 0;
+
+	strcat(strcpy(path, "/bwaidx-"), name);
+	if ((shmid = shm_open(path, O_RDONLY, 0)) < 0) return 0;
+	shm_idx = mmap(0, l_mem, PROT_READ, MAP_SHARED, shmid, 0);
+	idx = calloc(1, sizeof(bwaidx_t));
+	bwa_mem2idx(l_mem, shm_idx, idx);
+	idx->is_shm = 1;
+	return idx;
+}
+
+int bwa_shm_test(const char *hint)
+{
+	int shmid;
+	uint16_t *cnt, i;
+	char *p, *shm;
+	const char *name;
+
+	if (hint == 0 || hint[0] == 0) return 0;
+	for (name = hint + strlen(hint) - 1; name >= hint && *name != '/'; --name);
+	++name;
+	if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return 0;
+	shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
+	cnt = (uint16_t*)shm;
+	for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
+		if (strcmp(p + 8, name) == 0) return 1;
+		p += strlen(p) + 9;
+	}
+	return 0;
+}
+
+int bwa_shm_list(void)
+{
+	int shmid;
+	uint16_t *cnt, i;
+	char *p, *shm;
+	if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return -1;
+	shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
+	cnt = (uint16_t*)shm;
+	for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
+		int64_t l_mem;
+		memcpy(&l_mem, p, 8); p += 8;
+		printf("%s\t%ld\n", p, (long)l_mem);
+		p += strlen(p) + 1;
+	}
+	return 0;
+}
+
+int bwa_shm_destroy(void)
+{
+	int shmid;
+	uint16_t *cnt, i;
+	char *p, *shm;
+	char path[PATH_MAX + 1];
+
+	if ((shmid = shm_open("/bwactl", O_RDONLY, 0)) < 0) return -1;
+	shm = mmap(0, BWA_CTL_SIZE, PROT_READ, MAP_SHARED, shmid, 0);
+	cnt = (uint16_t*)shm;
+	for (i = 0, p = shm + 4; i < cnt[0]; ++i) {
+		int64_t l_mem;
+		memcpy(&l_mem, p, 8); p += 8;
+		strcat(strcpy(path, "/bwaidx-"), p);
+		shm_unlink(path);
+		p += strlen(p) + 1;
+	}
+	munmap(shm, BWA_CTL_SIZE);
+	shm_unlink("/bwactl");
+	return 0;
+}
+
+int main_shm(int argc, char *argv[])
+{
+	int c, to_list = 0, to_drop = 0, ret = 0;
+	char *tmpfn = 0;
+	while ((c = getopt(argc, argv, "ldf:")) >= 0) {
+		if (c == 'l') to_list = 1;
+		else if (c == 'd') to_drop = 1;
+		else if (c == 'f') tmpfn = optarg;
+	}
+	if (optind == argc && !to_list && !to_drop) {
+		fprintf(stderr, "\nUsage: bwa shm [-d|-l] [-f tmpFile] [idxbase]\n\n");
+		fprintf(stderr, "Options: -d       destroy all indices in shared memory\n");
+		fprintf(stderr, "         -l       list names of indices in shared memory\n");
+		fprintf(stderr, "         -f FILE  temporary file to reduce peak memory\n\n");
+		return 1;
+	}
+	if (optind < argc && (to_list || to_drop)) {
+		fprintf(stderr, "[E::%s] open -l or -d cannot be used when 'idxbase' is present\n", __func__);
+		return 1;
+	}
+	if (optind < argc) {
+		if (bwa_shm_test(argv[optind]) == 0) {
+			bwaidx_t *idx;
+			idx = bwa_idx_load_from_disk(argv[optind], BWA_IDX_ALL);
+			if (bwa_shm_stage(idx, argv[optind], tmpfn) < 0) {
+				fprintf(stderr, "[E::%s] failed to stage the index in shared memory\n", __func__);
+				ret = 1;
+			}
+			bwa_idx_destroy(idx);
+		} else fprintf(stderr, "[M::%s] index '%s' is already in shared memory\n", __func__, argv[optind]);
+	}
+	if (to_list) bwa_shm_list();
+	if (to_drop) bwa_shm_destroy();
+	return ret;
+}
diff --git a/bwt.c b/bwt.c
index c9bf6a3..9083654 100644
--- a/bwt.c
+++ b/bwt.c
@@ -30,6 +30,7 @@
 #include <string.h>
 #include <assert.h>
 #include <stdint.h>
+#include <limits.h>
 #include "utils.h"
 #include "bwt.h"
 #include "kvec.h"
@@ -284,8 +285,8 @@ static void bwt_reverse_intvs(bwtintv_v *p)
 		}
 	}
 }
-
-int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
+// NOTE: $max_intv is not currently used in BWA-MEM
+int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
 {
 	int i, j, c, ret;
 	bwtintv_t ik, ok[4];
@@ -301,7 +302,10 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
 	ik.info = x + 1;
 
 	for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search
-		if (q[i] < 4) { // an A/C/G/T base
+		if (ik.x[2] < max_intv) { // an interval small enough
+			kv_push(bwtintv_t, *curr, ik);
+			break;
+		} else if (q[i] < 4) { // an A/C/G/T base
 			c = 3 - q[i]; // complement of q[i]
 			bwt_extend(bwt, &ik, ok, 0);
 			if (ok[c].x[2] != ik.x[2]) { // change of the interval size
@@ -323,8 +327,8 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
 		c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base
 		for (j = 0, curr->n = 0; j < prev->n; ++j) {
 			bwtintv_t *p = &prev->a[j];
-			bwt_extend(bwt, p, ok, 1);
-			if (c < 0 || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
+			if (c >= 0 && ik.x[2] >= max_intv) bwt_extend(bwt, p, ok, 1);
+			if (c < 0 || ik.x[2] < max_intv || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
 				if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches
 					if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
 						ik = *p; ik.info |= (uint64_t)(i + 1)<<32;
@@ -346,6 +350,34 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv,
 	return ret;
 }
 
+int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
+{
+	return bwt_smem1a(bwt, len, q, x, min_intv, 0, mem, tmpvec);
+}
+
+int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem)
+{
+	int i, c;
+	bwtintv_t ik, ok[4];
+
+	memset(mem, 0, sizeof(bwtintv_t));
+	if (q[x] > 3) return x + 1;
+	bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
+	for (i = x + 1; i < len; ++i) { // forward search
+		if (q[i] < 4) { // an A/C/G/T base
+			c = 3 - q[i]; // complement of q[i]
+			bwt_extend(bwt, &ik, ok, 0);
+			if (ok[c].x[2] < max_intv && i - x >= min_len) {
+				*mem = ok[c];
+				mem->info = (uint64_t)x<<32 | (i + 1);
+				return i + 1;
+			}
+			ik = ok[c];
+		} else return i + 1;
+	}
+	return len;
+}
+
 /*************************
  * Read/write BWT and SA *
  *************************/
diff --git a/bwt.h b/bwt.h
index d2ff0ac..c71d6b5 100644
--- a/bwt.h
+++ b/bwt.h
@@ -92,6 +92,7 @@ extern "C" {
 	void bwt_destroy(bwt_t *bwt);
 
 	void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW
+	void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size); // from BWT-SW
 	void bwt_cal_sa(bwt_t *bwt, int intv);
 
 	void bwt_bwtupdate_core(bwt_t *bwt);
@@ -118,8 +119,9 @@ extern "C" {
 	 * Return the end of the longest exact match starting from _x_.
 	 */
 	int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
+	int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
 
-	// SMEM iterator interface
+	int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);
 
 #ifdef __cplusplus
 }
diff --git a/bwt_gen.c b/bwt_gen.c
index 6139d80..76f28c9 100644
--- a/bwt_gen.c
+++ b/bwt_gen.c
@@ -1598,15 +1598,20 @@ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *o
 	}
 }
 
-void bwt_bwtgen(const char *fn_pac, const char *fn_bwt)
+void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size)
 {
 	BWTInc *bwtInc;
-	bwtInc = BWTIncConstructFromPacked(fn_pac, 10000000, 10000000);
+	bwtInc = BWTIncConstructFromPacked(fn_pac, block_size, block_size);
 	printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone);
 	BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0);
 	BWTIncFree(bwtInc);
 }
 
+void bwt_bwtgen(const char *fn_pac, const char *fn_bwt)
+{
+	bwt_bwtgen2(fn_pac, fn_bwt, 10000000);
+}
+
 int bwt_bwtgen_main(int argc, char *argv[])
 {
 	if (argc < 3) {
diff --git a/bwtindex.c b/bwtindex.c
index 9e3ec15..2bfd667 100644
--- a/bwtindex.c
+++ b/bwtindex.c
@@ -189,11 +189,11 @@ int bwa_index(int argc, char *argv[]) // the "index" command
 	extern void bwa_pac_rev_core(const char *fn, const char *fn_rev);
 
 	char *prefix = 0, *str, *str2, *str3;
-	int c, algo_type = 0, is_64 = 0;
+	int c, algo_type = 0, is_64 = 0, block_size = 10000000;
 	clock_t t;
 	int64_t l_pac;
 
-	while ((c = getopt(argc, argv, "6a:p:")) >= 0) {
+	while ((c = getopt(argc, argv, "6a:p:b:")) >= 0) {
 		switch (c) {
 		case 'a': // if -a is not set, algo_type will be determined later
 			if (strcmp(optarg, "div") == 0) algo_type = 1;
@@ -203,20 +203,26 @@ int bwa_index(int argc, char *argv[]) // the "index" command
 			break;
 		case 'p': prefix = strdup(optarg); break;
 		case '6': is_64 = 1; break;
+		case 'b':
+			block_size = strtol(optarg, &str, 10);
+			if (*str == 'G' || *str == 'g') block_size *= 1024 * 1024 * 1024;
+			else if (*str == 'M' || *str == 'm') block_size *= 1024 * 1024;
+			else if (*str == 'K' || *str == 'k') block_size *= 1024;
+			break;
 		default: return 1;
 		}
 	}
 
 	if (optind + 1 > argc) {
 		fprintf(stderr, "\n");
-		fprintf(stderr, "Usage:   bwa index [-a bwtsw|is] [-c] <in.fasta>\n\n");
+		fprintf(stderr, "Usage:   bwa index [options] <in.fasta>\n\n");
 		fprintf(stderr, "Options: -a STR    BWT construction algorithm: bwtsw or is [auto]\n");
 		fprintf(stderr, "         -p STR    prefix of the index [same as fasta name]\n");
+		fprintf(stderr, "         -b INT    block size for the bwtsw algorithm (effective with -a bwtsw) [%d]\n", block_size);
 		fprintf(stderr, "         -6        index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
 		fprintf(stderr, "\n");
 		fprintf(stderr,	"Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
-		fprintf(stderr, "         `-a div' do not work not for long genomes. Please choose `-a'\n");
-		fprintf(stderr, "         according to the length of the genome.\n\n");
+		fprintf(stderr, "         `-a div' do not work not for long genomes.\n\n");
 		return 1;
 	}
 	if (prefix == 0) {
@@ -242,7 +248,7 @@ int bwa_index(int argc, char *argv[]) // the "index" command
 		strcpy(str2, prefix); strcat(str2, ".bwt");
 		t = clock();
 		fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n");
-		if (algo_type == 2) bwt_bwtgen(str, str2);
+		if (algo_type == 2) bwt_bwtgen2(str, str2, block_size);
 		else if (algo_type == 1 || algo_type == 3) {
 			bwt_t *bwt;
 			bwt = bwt_pac2bwt(str, algo_type == 3);
diff --git a/example.c b/example.c
index a6c9bdd..4e8494d 100644
--- a/example.c
+++ b/example.c
@@ -7,10 +7,6 @@
 #include "kseq.h" // for the FASTA/Q parser
 KSEQ_DECLARE(gzFile)
 
-#ifdef USE_MALLOC_WRAPPERS
-#  include "malloc_wrap.h"
-#endif
-
 int main(int argc, char *argv[])
 {
 	bwaidx_t *idx;
@@ -47,10 +43,10 @@ int main(int argc, char *argv[])
 			if (ar.a[i].secondary >= 0) continue; // skip secondary alignments
 			a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR
 			// print alignment
-			err_printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq);
+			printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq);
 			for (k = 0; k < a.n_cigar; ++k) // print CIGAR
-				err_printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]);
-			err_printf("\t%d\n", a.NM); // print edit distance
+				printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]);
+			printf("\t%d\n", a.NM); // print edit distance
 			free(a.cigar); // don't forget to deallocate CIGAR
 		}
 		free(ar.a); // and deallocate the hit list
@@ -58,7 +54,7 @@ int main(int argc, char *argv[])
 
 	free(opt);
 	kseq_destroy(ks);
-	err_gzclose(fp);
+	gzclose(fp);
 	bwa_idx_destroy(idx);
 	return 0;
 }
diff --git a/fastmap.c b/fastmap.c
index 479e878..3cca4de 100644
--- a/fastmap.c
+++ b/fastmap.c
@@ -3,20 +3,98 @@
 #include <unistd.h>
 #include <stdlib.h>
 #include <string.h>
+#include <limits.h>
 #include <ctype.h>
 #include <math.h>
 #include "bwa.h"
 #include "bwamem.h"
 #include "kvec.h"
 #include "utils.h"
+#include "bntseq.h"
 #include "kseq.h"
-#include "utils.h"
 KSEQ_DECLARE(gzFile)
 
 extern unsigned char nst_nt4_table[256];
 
 void *kopen(const char *fn, int *_fd);
 int kclose(void *a);
+void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps);
+
+typedef struct {
+	kseq_t *ks, *ks2;
+	mem_opt_t *opt;
+	mem_pestat_t *pes0;
+	int64_t n_processed;
+	int copy_comment, actual_chunk_size;
+	bwaidx_t *idx;
+} ktp_aux_t;
+
+typedef struct {
+	ktp_aux_t *aux;
+	int n_seqs;
+	bseq1_t *seqs;
+} ktp_data_t;
+
+static void *process(void *shared, int step, void *_data)
+{
+	ktp_aux_t *aux = (ktp_aux_t*)shared;
+	ktp_data_t *data = (ktp_data_t*)_data;
+	int i;
+	if (step == 0) {
+		ktp_data_t *ret;
+		int64_t size = 0;
+		ret = calloc(1, sizeof(ktp_data_t));
+		ret->seqs = bseq_read(aux->actual_chunk_size, &ret->n_seqs, aux->ks, aux->ks2);
+		if (ret->seqs == 0) {
+			free(ret);
+			return 0;
+		}
+		if (!aux->copy_comment)
+			for (i = 0; i < ret->n_seqs; ++i) {
+				free(ret->seqs[i].comment);
+				ret->seqs[i].comment = 0;
+			}
+		for (i = 0; i < ret->n_seqs; ++i) size += ret->seqs[i].l_seq;
+		if (bwa_verbose >= 3)
+			fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, ret->n_seqs, (long)size);
+		return ret;
+	} else if (step == 1) {
+		const mem_opt_t *opt = aux->opt;
+		const bwaidx_t *idx = aux->idx;
+		if (opt->flag & MEM_F_SMARTPE) {
+			bseq1_t *sep[2];
+			int n_sep[2];
+			mem_opt_t tmp_opt = *opt;
+			bseq_classify(data->n_seqs, data->seqs, n_sep, sep);
+			if (bwa_verbose >= 3)
+				fprintf(stderr, "[M::%s] %d single-end sequences; %d paired-end sequences\n", __func__, n_sep[0], n_sep[1]);
+			if (n_sep[0]) {
+				tmp_opt.flag &= ~MEM_F_PE;
+				mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed, n_sep[0], sep[0], 0);
+				for (i = 0; i < n_sep[0]; ++i)
+					data->seqs[sep[0][i].id].sam = sep[0][i].sam;
+			}
+			if (n_sep[1]) {
+				tmp_opt.flag |= MEM_F_PE;
+				mem_process_seqs(&tmp_opt, idx->bwt, idx->bns, idx->pac, aux->n_processed + n_sep[0], n_sep[1], sep[1], aux->pes0);
+				for (i = 0; i < n_sep[1]; ++i)
+					data->seqs[sep[1][i].id].sam = sep[1][i].sam;
+			}
+			free(sep[0]); free(sep[1]);
+		} else mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, aux->n_processed, data->n_seqs, data->seqs, aux->pes0);
+		aux->n_processed += data->n_seqs;
+		return data;
+	} else if (step == 2) {
+		for (i = 0; i < data->n_seqs; ++i) {
+			if (data->seqs[i].sam) err_fputs(data->seqs[i].sam, stdout);
+			free(data->seqs[i].name); free(data->seqs[i].comment);
+			free(data->seqs[i].seq); free(data->seqs[i].qual); free(data->seqs[i].sam);
+		}
+		free(data->seqs); free(data);
+		return 0;
+	}
+	return 0;
+}
 
 static void update_a(mem_opt_t *opt, const mem_opt_t *opt0)
 {
@@ -37,24 +115,24 @@ static void update_a(mem_opt_t *opt, const mem_opt_t *opt0)
 int main_mem(int argc, char *argv[])
 {
 	mem_opt_t *opt, opt0;
-	int fd, fd2, i, c, n, copy_comment = 0;
+	int fd, fd2, i, c, ignore_alt = 0, no_mt_io = 0;
+	int fixed_chunk_size = -1;
 	gzFile fp, fp2 = 0;
-	kseq_t *ks, *ks2 = 0;
-	bseq1_t *seqs;
-	bwaidx_t *idx;
-	char *p, *rg_line = 0;
+	char *p, *rg_line = 0, *hdr_line = 0;
 	const char *mode = 0;
 	void *ko = 0, *ko2 = 0;
-	int64_t n_processed = 0;
-	mem_pestat_t pes[4], *pes0 = 0;
+	mem_pestat_t pes[4];
+	ktp_aux_t aux;
 
+	memset(&aux, 0, sizeof(ktp_aux_t));
 	memset(pes, 0, 4 * sizeof(mem_pestat_t));
 	for (i = 0; i < 4; ++i) pes[i].failed = 1;
 
-	opt = mem_opt_init();
+	aux.opt = opt = mem_opt_init();
 	memset(&opt0, 0, sizeof(mem_opt_t));
-	while ((c = getopt(argc, argv, "epaFMCSPHYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:")) >= 0) {
+	while ((c = getopt(argc, argv, "1epaFMCSPVYjk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:X:H:")) >= 0) {
 		if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
+		else if (c == '1') no_mt_io = 1;
 		else if (c == 'x') mode = optarg;
 		else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1;
 		else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1;
@@ -64,24 +142,34 @@ int main_mem(int argc, char *argv[])
 		else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
 		else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
 		else if (c == 'a') opt->flag |= MEM_F_ALL;
-		else if (c == 'p') opt->flag |= MEM_F_PE;
+		else if (c == 'p') opt->flag |= MEM_F_PE | MEM_F_SMARTPE;
 		else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
 		else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE;
 		else if (c == 'e') opt->flag |= MEM_F_SELF_OVLP;
 		else if (c == 'F') opt->flag |= MEM_F_ALN_REG;
 		else if (c == 'Y') opt->flag |= MEM_F_SOFTCLIP;
+		else if (c == 'V') opt->flag |= MEM_F_REF_HDR;
 		else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1;
 		else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1;
 		else if (c == 'v') bwa_verbose = atoi(optarg);
+		else if (c == 'j') ignore_alt = 1;
 		else if (c == 'r') opt->split_factor = atof(optarg), opt0.split_factor = 1.;
 		else if (c == 'D') opt->drop_ratio = atof(optarg), opt0.drop_ratio = 1.;
 		else if (c == 'm') opt->max_matesw = atoi(optarg), opt0.max_matesw = 1;
-		else if (c == 'h') opt->max_hits = atoi(optarg), opt0.max_hits = 1;
 		else if (c == 's') opt->split_width = atoi(optarg), opt0.split_width = 1;
 		else if (c == 'G') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1;
 		else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1;
 		else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1;
-		else if (c == 'C') copy_comment = 1;
+		else if (c == 'y') opt->max_mem_intv = atol(optarg), opt0.max_mem_intv = 1;
+		else if (c == 'C') aux.copy_comment = 1;
+		else if (c == 'K') fixed_chunk_size = atoi(optarg);
+		else if (c == 'X') opt->mask_level = atof(optarg);
+		else if (c == 'h') {
+			opt0.max_XA_hits = opt0.max_XA_hits_alt = 1;
+			opt->max_XA_hits = opt->max_XA_hits_alt = strtol(optarg, &p, 10);
+			if (*p != 0 && ispunct(*p) && isdigit(p[1]))
+				opt->max_XA_hits_alt = strtol(p+1, &p, 10);
+		}
 		else if (c == 'Q') {
 			opt0.mapQ_coef_len = 1;
 			opt->mapQ_coef_len = atoi(optarg);
@@ -103,8 +191,24 @@ int main_mem(int argc, char *argv[])
 				opt->pen_clip3 = strtol(p+1, &p, 10);
 		} else if (c == 'R') {
 			if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak
+		} else if (c == 'H') {
+			if (optarg[0] != '@') {
+				FILE *fp;
+				if ((fp = fopen(optarg, "r")) != 0) {
+					char *buf;
+					buf = calloc(1, 0x10000);
+					while (fgets(buf, 0xffff, fp)) {
+						i = strlen(buf);
+						assert(buf[i-1] == '\n'); // a long line
+						buf[i-1] = 0;
+						hdr_line = bwa_insert_header(buf, hdr_line);
+					}
+					free(buf);
+					fclose(fp);
+				}
+			} else hdr_line = bwa_insert_header(optarg, hdr_line);
 		} else if (c == 'I') { // specify the insert size distribution
-			pes0 = pes;
+			aux.pes0 = pes;
 			pes[1].failed = 0;
 			pes[1].avg = strtod(optarg, &p);
 			pes[1].std = pes[1].avg * .1;
@@ -123,6 +227,12 @@ int main_mem(int argc, char *argv[])
 		}
 		else return 1;
 	}
+
+	if (rg_line) {
+		hdr_line = bwa_insert_header(rg_line, hdr_line);
+		free(rg_line);
+	}
+
 	if (opt->n_threads < 1) opt->n_threads = 1;
 	if (optind + 1 >= argc || optind + 3 < argc) {
 		fprintf(stderr, "\n");
@@ -133,6 +243,7 @@ int main_mem(int argc, char *argv[])
 		fprintf(stderr, "       -w INT        band width for banded alignment [%d]\n", opt->w);
 		fprintf(stderr, "       -d INT        off-diagonal X-dropoff [%d]\n", opt->zdrop);
 		fprintf(stderr, "       -r FLOAT      look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
+		fprintf(stderr, "       -y INT        seed occurrence for the 3rd round seeding [%ld]\n", (long)opt->max_mem_intv);
 //		fprintf(stderr, "       -s INT        look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
 		fprintf(stderr, "       -c INT        skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
 		fprintf(stderr, "       -D FLOAT      drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio);
@@ -141,24 +252,30 @@ int main_mem(int argc, char *argv[])
 		fprintf(stderr, "       -S            skip mate rescue\n");
 		fprintf(stderr, "       -P            skip pairing; mate rescue performed unless -S also in use\n");
 		fprintf(stderr, "       -e            discard full-length exact matches\n");
+		fprintf(stderr, "\nScoring options:\n\n");
 		fprintf(stderr, "       -A INT        score for a sequence match, which scales options -TdBOELU unless overridden [%d]\n", opt->a);
 		fprintf(stderr, "       -B INT        penalty for a mismatch [%d]\n", opt->b);
 		fprintf(stderr, "       -O INT[,INT]  gap open penalties for deletions and insertions [%d,%d]\n", opt->o_del, opt->o_ins);
 		fprintf(stderr, "       -E INT[,INT]  gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins);
 		fprintf(stderr, "       -L INT[,INT]  penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3);
-		fprintf(stderr, "       -U INT        penalty for an unpaired read pair [%d]\n", opt->pen_unpaired);
+		fprintf(stderr, "       -U INT        penalty for an unpaired read pair [%d]\n\n", opt->pen_unpaired);
 		fprintf(stderr, "       -x STR        read type. Setting -x changes multiple parameters unless overriden [null]\n");
-		fprintf(stderr, "                     pacbio: -k17 -W40 -r10 -A2 -B5 -O2 -E1 -L0\n");
-		fprintf(stderr, "                     pbread: -k13 -W40 -c1000 -r10 -A2 -B5 -O2 -E1 -N25 -FeaD.001\n");
+		fprintf(stderr, "                     pacbio: -k17 -W40 -r10 -A1 -B1 -O1 -E1 -L0  (PacBio reads to ref)\n");
+		fprintf(stderr, "                     ont2d: -k14 -W20 -r10 -A1 -B1 -O1 -E1 -L0  (Oxford Nanopore 2D-reads to ref)\n");
+		fprintf(stderr, "                     intractg: -B9 -O16 -L5  (intra-species contigs to ref)\n");
+//		fprintf(stderr, "                     pbread: -k13 -W40 -c1000 -r10 -A1 -B1 -O1 -E1 -N25 -FeaD.001\n");
 		fprintf(stderr, "\nInput/output options:\n\n");
-		fprintf(stderr, "       -p            first query file consists of interleaved paired-end sequences\n");
+		fprintf(stderr, "       -p            smart pairing (ignoring in2.fq)\n");
 		fprintf(stderr, "       -R STR        read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
+		fprintf(stderr, "       -H STR/FILE   insert STR to header if it starts with @; or insert lines in FILE [null]\n");
+		fprintf(stderr, "       -j            treat ALT contigs as part of the primary assembly (i.e. ignore <idxbase>.alt file)\n");
 		fprintf(stderr, "\n");
 		fprintf(stderr, "       -v INT        verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
 		fprintf(stderr, "       -T INT        minimum score to output [%d]\n", opt->T);
-		fprintf(stderr, "       -h INT        if there are <INT hits with score >80%% of the max score, output all in XA [%d]\n", opt->max_hits);
+		fprintf(stderr, "       -h INT[,INT]  if there are <INT hits with score >80%% of the max score, output all in XA [%d,%d]\n", opt->max_XA_hits, opt->max_XA_hits_alt);
 		fprintf(stderr, "       -a            output all alignments for SE or unpaired PE\n");
 		fprintf(stderr, "       -C            append FASTA/FASTQ comment to SAM output\n");
+		fprintf(stderr, "       -V            output the reference FASTA header in the XR tag\n");
 		fprintf(stderr, "       -Y            use soft clipping for supplementary alignments\n");
 		fprintf(stderr, "       -M            mark shorter split hits as secondary\n\n");
 		fprintf(stderr, "       -I FLOAT[,FLOAT[,INT[,INT]]]\n");
@@ -173,23 +290,33 @@ int main_mem(int argc, char *argv[])
 	}
 
 	if (mode) {
-		if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "pbread1") == 0 || strcmp(mode, "pbread") == 0) {
-			if (!opt0.a) opt->a = 2, opt0.a = 1;
-			update_a(opt, &opt0);
-			if (!opt0.o_del) opt->o_del = 2;
+		if (strcmp(mode, "intractg") == 0) {
+			if (!opt0.o_del) opt->o_del = 16;
+			if (!opt0.o_ins) opt->o_ins = 16;
+			if (!opt0.b) opt->b = 9;
+			if (!opt0.pen_clip5) opt->pen_clip5 = 5;
+			if (!opt0.pen_clip3) opt->pen_clip3 = 5;
+		} else if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "pbread") == 0 || strcmp(mode, "ont2d") == 0) {
+			if (!opt0.o_del) opt->o_del = 1;
 			if (!opt0.e_del) opt->e_del = 1;
-			if (!opt0.o_ins) opt->o_ins = 2;
+			if (!opt0.o_ins) opt->o_ins = 1;
 			if (!opt0.e_ins) opt->e_ins = 1;
-			if (!opt0.b) opt->b = 5;
+			if (!opt0.b) opt->b = 1;
 			if (opt0.split_factor == 0.) opt->split_factor = 10.;
-			if (!opt0.min_chain_weight) opt->min_chain_weight = 40;
-			if (strcmp(mode, "pbread1") == 0 || strcmp(mode, "pbread") == 0) {
+			if (strcmp(mode, "pbread") == 0) { // pacbio read-to-read setting; NOT working well!
 				opt->flag |= MEM_F_ALL | MEM_F_SELF_OVLP | MEM_F_ALN_REG;
+				if (!opt0.min_chain_weight) opt->min_chain_weight = 40;
 				if (!opt0.max_occ) opt->max_occ = 1000;
 				if (!opt0.min_seed_len) opt->min_seed_len = 13;
 				if (!opt0.max_chain_extend) opt->max_chain_extend = 25;
 				if (opt0.drop_ratio == 0.) opt->drop_ratio = .001;
+			} else if (strcmp(mode, "ont2d") == 0) {
+				if (!opt0.min_chain_weight) opt->min_chain_weight = 20;
+				if (!opt0.min_seed_len) opt->min_seed_len = 14;
+				if (!opt0.pen_clip5) opt->pen_clip5 = 0;
+				if (!opt0.pen_clip3) opt->pen_clip3 = 0;
 			} else {
+				if (!opt0.min_chain_weight) opt->min_chain_weight = 40;
 				if (!opt0.min_seed_len) opt->min_seed_len = 17;
 				if (!opt0.pen_clip5) opt->pen_clip5 = 0;
 				if (!opt0.pen_clip3) opt->pen_clip3 = 0;
@@ -199,10 +326,16 @@ int main_mem(int argc, char *argv[])
 			return 1; // FIXME memory leak
 		}
 	} else update_a(opt, &opt0);
-//	if (opt->T < opt->min_HSP_score) opt->T = opt->min_HSP_score; // TODO: tie ->T to MEM_HSP_COEF
 	bwa_fill_scmat(opt->a, opt->b, opt->mat);
 
-	if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
+	aux.idx = bwa_idx_load_from_shm(argv[optind]);
+	if (aux.idx == 0) {
+		if ((aux.idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak
+	} else if (bwa_verbose >= 3)
+		fprintf(stderr, "[M::%s] load the bwa index from shared memory\n", __func__);
+	if (ignore_alt)
+		for (i = 0; i < aux.idx->bns->n_seqs; ++i)
+			aux.idx->bns->anns[i].is_alt = 0;
 
 	ko = kopen(argv[optind + 1], &fd);
 	if (ko == 0) {
@@ -210,11 +343,11 @@ int main_mem(int argc, char *argv[])
 		return 1;
 	}
 	fp = gzdopen(fd, "r");
-	ks = kseq_init(fp);
+	aux.ks = kseq_init(fp);
 	if (optind + 2 < argc) {
 		if (opt->flag&MEM_F_PE) {
 			if (bwa_verbose >= 2)
-				fprintf(stderr, "[W::%s] when '-p' is in use, the second query file will be ignored.\n", __func__);
+				fprintf(stderr, "[W::%s] when '-p' is in use, the second query file is ignored.\n", __func__);
 		} else {
 			ko2 = kopen(argv[optind + 2], &fd2);
 			if (ko2 == 0) {
@@ -222,41 +355,21 @@ int main_mem(int argc, char *argv[])
 				return 1;
 			}
 			fp2 = gzdopen(fd2, "r");
-			ks2 = kseq_init(fp2);
+			aux.ks2 = kseq_init(fp2);
 			opt->flag |= MEM_F_PE;
 		}
 	}
 	if (!(opt->flag & MEM_F_ALN_REG))
-		bwa_print_sam_hdr(idx->bns, rg_line);
-	while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) {
-		int64_t size = 0;
-		if ((opt->flag & MEM_F_PE) && (n&1) == 1) {
-			if (bwa_verbose >= 2)
-				fprintf(stderr, "[W::%s] odd number of reads in the PE mode; last read dropped\n", __func__);
-			n = n>>1<<1;
-		}
-		if (!copy_comment)
-			for (i = 0; i < n; ++i) {
-				free(seqs[i].comment); seqs[i].comment = 0;
-			}
-		for (i = 0; i < n; ++i) size += seqs[i].l_seq;
-		if (bwa_verbose >= 3)
-			fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size);
-		mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, pes0);
-		n_processed += n;
-		for (i = 0; i < n; ++i) {
-			if (seqs[i].sam) err_fputs(seqs[i].sam, stdout);
-			free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam);
-		}
-		free(seqs);
-	}
-
+		bwa_print_sam_hdr(aux.idx->bns, hdr_line);
+	aux.actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads;
+	kt_pipeline(no_mt_io? 1 : 2, process, &aux, 3);
+	free(hdr_line);
 	free(opt);
-	bwa_idx_destroy(idx);
-	kseq_destroy(ks);
+	bwa_idx_destroy(aux.idx);
+	kseq_destroy(aux.ks);
 	err_gzclose(fp); kclose(ko);
-	if (ks2) {
-		kseq_destroy(ks2);
+	if (aux.ks2) {
+		kseq_destroy(aux.ks2);
 		err_gzclose(fp2); kclose(ko2);
 	}
 	return 0;
@@ -264,7 +377,8 @@ int main_mem(int argc, char *argv[])
 
 int main_fastmap(int argc, char *argv[])
 {
-	int c, i, min_iwidth = 20, min_len = 17, print_seq = 0;
+	int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1, max_len = INT_MAX;
+	uint64_t max_intv = 0;
 	kseq_t *seq;
 	bwtint_t k;
 	gzFile fp;
@@ -272,16 +386,26 @@ int main_fastmap(int argc, char *argv[])
 	const bwtintv_v *a;
 	bwaidx_t *idx;
 
-	while ((c = getopt(argc, argv, "w:l:p")) >= 0) {
+	while ((c = getopt(argc, argv, "w:l:pi:I:L:")) >= 0) {
 		switch (c) {
 			case 'p': print_seq = 1; break;
 			case 'w': min_iwidth = atoi(optarg); break;
 			case 'l': min_len = atoi(optarg); break;
+			case 'i': min_intv = atoi(optarg); break;
+			case 'I': max_intv = atol(optarg); break;
+			case 'L': max_len  = atoi(optarg); break;
 		    default: return 1;
 		}
 	}
 	if (optind + 1 >= argc) {
-		fprintf(stderr, "Usage: bwa fastmap [-p] [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", min_len, min_iwidth);
+		fprintf(stderr, "\n");
+		fprintf(stderr, "Usage:   bwa fastmap [options] <idxbase> <in.fq>\n\n");
+		fprintf(stderr, "Options: -l INT    min SMEM length to output [%d]\n", min_len);
+		fprintf(stderr, "         -w INT    max interval size to find coordiantes [%d]\n", min_iwidth);
+		fprintf(stderr, "         -i INT    min SMEM interval size [%d]\n", min_intv);
+		fprintf(stderr, "         -l INT    max MEM length [%d]\n", max_len);
+		fprintf(stderr, "         -I INT    stop if MEM is longer than -l with a size less than INT [%ld]\n", (long)max_intv);
+		fprintf(stderr, "\n");
 		return 1;
 	}
 
@@ -289,6 +413,7 @@ int main_fastmap(int argc, char *argv[])
 	seq = kseq_init(fp);
 	if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1;
 	itr = smem_itr_init(idx->bwt);
+	smem_config(itr, min_intv, max_len, max_intv);
 	while (kseq_read(seq) >= 0) {
 		err_printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
 		if (print_seq) {
diff --git a/kseq.h b/kseq.h
index 642cd33..f3862c6 100644
--- a/kseq.h
+++ b/kseq.h
@@ -54,9 +54,9 @@
 #define __KS_BASIC(type_t, __bufsize)								\
 	static inline kstream_t *ks_init(type_t f)						\
 	{																\
-		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
+		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t));	\
 		ks->f = f;													\
-		ks->buf = (unsigned char*)malloc(__bufsize);						\
+		ks->buf = (unsigned char*)malloc(__bufsize);				\
 		return ks;													\
 	}																\
 	static inline void ks_destroy(kstream_t *ks)					\
@@ -74,8 +74,7 @@
 		if (ks->begin >= ks->end) {							\
 			ks->begin = 0;									\
 			ks->end = __read(ks->f, ks->buf, __bufsize);	\
-			if (ks->end < __bufsize) ks->is_eof = 1;		\
-			if (ks->end == 0) return -1;					\
+			if (ks->end == 0) { ks->is_eof = 1; return -1;}	\
 		}													\
 		return (int)ks->buf[ks->begin++];					\
 	}
@@ -95,17 +94,16 @@ typedef struct __kstring_t {
 #define __KS_GETUNTIL(__read, __bufsize)								\
 	static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
 	{																	\
+		int gotany = 0;													\
 		if (dret) *dret = 0;											\
 		str->l = append? str->l : 0;									\
-		if (ks->begin >= ks->end && ks->is_eof) return -1;				\
 		for (;;) {														\
 			int i;														\
 			if (ks->begin >= ks->end) {									\
 				if (!ks->is_eof) {										\
 					ks->begin = 0;										\
 					ks->end = __read(ks->f, ks->buf, __bufsize);		\
-					if (ks->end < __bufsize) ks->is_eof = 1;			\
-					if (ks->end == 0) break;							\
+					if (ks->end == 0) { ks->is_eof = 1; break; }		\
 				} else break;											\
 			}															\
 			if (delimiter == KS_SEP_LINE) { \
@@ -124,8 +122,9 @@ typedef struct __kstring_t {
 			if (str->m - str->l < (size_t)(i - ks->begin + 1)) {		\
 				str->m = str->l + (i - ks->begin) + 1;					\
 				kroundup32(str->m);										\
-				str->s = (char*)realloc(str->s, str->m);			\
+				str->s = (char*)realloc(str->s, str->m);				\
 			}															\
+			gotany = 1;													\
 			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
 			str->l = str->l + (i - ks->begin);							\
 			ks->begin = i + 1;											\
@@ -134,6 +133,7 @@ typedef struct __kstring_t {
 				break;													\
 			}															\
 		}																\
+		if (!gotany && ks_eof(ks)) return -1;							\
 		if (str->s == 0) {												\
 			str->m = 1;													\
 			str->s = (char*)calloc(1, 1);								\
@@ -155,7 +155,7 @@ typedef struct __kstring_t {
 #define __KSEQ_BASIC(SCOPE, type_t)										\
 	SCOPE kseq_t *kseq_init(type_t fd)									\
 	{																	\
-		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));				\
+		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));					\
 		s->f = ks_init(fd);												\
 		return s;														\
 	}																	\
diff --git a/kthread.c b/kthread.c
index a44426b..80f84cb 100644
--- a/kthread.c
+++ b/kthread.c
@@ -1,23 +1,30 @@
 #include <pthread.h>
 #include <stdlib.h>
+#include <limits.h>
+
+/************
+ * kt_for() *
+ ************/
 
 struct kt_for_t;
 
 typedef struct {
 	struct kt_for_t *t;
-	int i;
+	long i;
 } ktf_worker_t;
 
 typedef struct kt_for_t {
-	int n_threads, n;
+	int n_threads;
+	long n;
 	ktf_worker_t *w;
-	void (*func)(void*,int,int);
+	void (*func)(void*,long,int);
 	void *data;
 } kt_for_t;
 
-static inline int steal_work(kt_for_t *t)
+static inline long steal_work(kt_for_t *t)
 {
-	int i, k, min = 0x7fffffff, min_i = -1;
+	int i, min_i = -1;
+	long k, min = LONG_MAX;
 	for (i = 0; i < t->n_threads; ++i)
 		if (min > t->w[i].i) min = t->w[i].i, min_i = i;
 	k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads);
@@ -27,7 +34,7 @@ static inline int steal_work(kt_for_t *t)
 static void *ktf_worker(void *data)
 {
 	ktf_worker_t *w = (ktf_worker_t*)data;
-	int i;
+	long i;
 	for (;;) {
 		i = __sync_fetch_and_add(&w->i, w->t->n_threads);
 		if (i >= w->t->n) break;
@@ -38,7 +45,7 @@ static void *ktf_worker(void *data)
 	pthread_exit(0);
 }
 
-void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n)
+void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n)
 {
 	int i;
 	kt_for_t t;
@@ -51,3 +58,86 @@ void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n)
 	for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]);
 	for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
 }
+
+/*****************
+ * kt_pipeline() *
+ *****************/
+
+struct ktp_t;
+
+typedef struct {
+	struct ktp_t *pl;
+	int step, running;
+	void *data;
+} ktp_worker_t;
+
+typedef struct ktp_t {
+	void *shared;
+	void *(*func)(void*, int, void*);
+	int n_workers, n_steps;
+	ktp_worker_t *workers;
+	pthread_mutex_t mutex;
+	pthread_cond_t cv;
+} ktp_t;
+
+static void *ktp_worker(void *data)
+{
+	ktp_worker_t *w = (ktp_worker_t*)data;
+	ktp_t *p = w->pl;
+	while (w->step < p->n_steps) {
+		// test whether we can kick off the job with this worker
+		pthread_mutex_lock(&p->mutex);
+		for (;;) {
+			int i;
+			// test whether another worker is doing the same step
+			for (i = 0; i < p->n_workers; ++i) {
+				if (w == &p->workers[i]) continue; // ignore itself
+				if (p->workers[i].running && p->workers[i].step == w->step)
+					break;
+			}
+			if (i == p->n_workers) break; // no other workers doing w->step; then this worker will
+			pthread_cond_wait(&p->cv, &p->mutex);
+		}
+		w->running = 1;
+		pthread_mutex_unlock(&p->mutex);
+
+		// working on w->step
+		w->data = p->func(p->shared, w->step, w->step? w->data : 0); // for the first step, input is NULL
+
+		// update step and let other workers know
+		pthread_mutex_lock(&p->mutex);
+		w->step = w->step == p->n_steps - 1 || w->data? (w->step + 1) % p->n_steps : p->n_steps;
+		w->running = 0;
+		pthread_cond_broadcast(&p->cv);
+		pthread_mutex_unlock(&p->mutex);
+	}
+	pthread_exit(0);
+}
+
+void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps)
+{
+	ktp_t aux;
+	pthread_t *tid;
+	int i;
+
+	if (n_threads < 1) n_threads = 1;
+	aux.n_workers = n_threads;
+	aux.n_steps = n_steps;
+	aux.func = func;
+	aux.shared = shared_data;
+	pthread_mutex_init(&aux.mutex, 0);
+	pthread_cond_init(&aux.cv, 0);
+
+	aux.workers = alloca(n_threads * sizeof(ktp_worker_t));
+	for (i = 0; i < n_threads; ++i) {
+		ktp_worker_t *w = &aux.workers[i];
+		w->step = w->running = 0; w->pl = &aux; w->data = 0;
+	}
+
+	tid = alloca(n_threads * sizeof(pthread_t));
+	for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktp_worker, &aux.workers[i]);
+	for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0);
+
+	pthread_mutex_destroy(&aux.mutex);
+	pthread_cond_destroy(&aux.cv);
+}
diff --git a/main.c b/main.c
index b873b75..9f776d4 100644
--- a/main.c
+++ b/main.c
@@ -4,7 +4,7 @@
 #include "utils.h"
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.7.10-r789"
+#define PACKAGE_VERSION "0.7.12-r1039"
 #endif
 
 int bwa_fa2pac(int argc, char *argv[]);
@@ -22,11 +22,10 @@ int bwa_bwtsw2(int argc, char *argv[]);
 
 int main_fastmap(int argc, char *argv[]);
 int main_mem(int argc, char *argv[]);
+int main_shm(int argc, char *argv[]);
 
 int main_pemerge(int argc, char *argv[]);
 	
-char *bwa_pg;
-
 static int usage()
 {
 	fprintf(stderr, "\n");
@@ -43,6 +42,7 @@ static int usage()
 	fprintf(stderr, "         sampe         generate alignment (paired ended)\n");
 	fprintf(stderr, "         bwasw         BWA-SW for long queries\n");
 	fprintf(stderr, "\n");
+	fprintf(stderr, "         shm           manage indices in shared memory\n");
 	fprintf(stderr, "         fa2pac        convert FASTA to PAC format\n");
 	fprintf(stderr, "         pac2bwt       generate BWT from PAC\n");
 	fprintf(stderr, "         pac2bwtgen    alternative algorithm for generating BWT\n");
@@ -59,6 +59,7 @@ static int usage()
 
 int main(int argc, char *argv[])
 {
+	extern char *bwa_pg;
 	int i, ret;
 	double t_real;
 	kstring_t pg = {0,0,0};
@@ -81,6 +82,7 @@ int main(int argc, char *argv[])
 	else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
 	else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1);
 	else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1);
+	else if (strcmp(argv[1], "shm") == 0) ret = main_shm(argc-1, argv+1);
 	else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1);
 	else {
 		fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
diff --git a/utils.c b/utils.c
index 00be7f0..2983261 100644
--- a/utils.c
+++ b/utils.c
@@ -219,6 +219,17 @@ int err_fputs(const char *s, FILE *stream)
 	return ret;
 }
 
+int err_puts(const char *s)
+{
+	int ret = puts(s);
+	if (EOF == ret)
+	{
+		_err_fatal_simple("puts", strerror(errno));
+	}
+
+	return ret;
+}
+
 int err_fflush(FILE *stream) 
 {
     int ret = fflush(stream);
diff --git a/utils.h b/utils.h
index 5ef6ac4..11966b8 100644
--- a/utils.h
+++ b/utils.h
@@ -80,7 +80,7 @@ extern "C" {
 	int err_fputc(int c, FILE *stream);
 #define err_putchar(C) err_fputc((C), stdout)
 	int err_fputs(const char *s, FILE *stream);
-#define err_puts(S) err_fputs((S), stdout)
+	int err_puts(const char *s);
 	int err_fflush(FILE *stream);
 	int err_fclose(FILE *stream);
 	int err_gzclose(gzFile file);

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/bwa.git