[med-svn] [SCM] bwa branch, master, updated. debian/0.6.1-1-22-g71fd912

Thu Mar 7 05:47:00 UTC 2013

The following commit has been merged in the master branch:
commit 947c8141c58d61196b274d19fc9273ebe688ce2e
Author: Charles Plessy <plessy at debian.org>
Date:   Thu Mar 7 14:25:22 2013 +0900

    Imported Upstream version 0.7.0

diff --git a/Makefile b/Makefile
index 6f388f2..eab4198 100644
--- a/Makefile
+++ b/Makefile
@@ -1,17 +1,14 @@
 CC=			gcc
-CXX=		g++
 CFLAGS=		-g -Wall -O2
 CXXFLAGS=	$(CFLAGS)
 AR=			ar
 DFLAGS=		-DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64
-LOBJS=		bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o stdaln.o \
-			bwaseqio.o bwase.o kstring.o
-AOBJS=		QSufSort.o bwt_gen.o \
-			is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \
-			bwape.o cs2nt.o \
+LOBJS=		utils.o kstring.o ksw.o kopen.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o
+AOBJS=		QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
+			is.o bwtindex.o bwape.o \
 			bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \
 			bwtsw2_chain.o fastmap.o bwtsw2_pair.o
-PROG=		bwa
+PROG=		bwa bwamem-lite
 INCLUDES=	
 LIBS=		-lm -lz -lpthread
 SUBDIRS=	.
@@ -26,19 +23,29 @@ SUBDIRS=	.
 all:$(PROG)
 
 bwa:libbwa.a $(AOBJS) main.o
-		$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
+		$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa
+
+bwamem-lite:libbwa.a example.o
+		$(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ $(LIBS) -L. -lbwa
 
 libbwa.a:$(LOBJS)
 		$(AR) -csru $@ $(LOBJS)
 
-bwa.o:bwa.h
+ksw.o:ksw.h
+kstring.o:kstring.h
+utils.o:utils.h ksort.h kseq.h
+bntseq.o:bntseq.h
+bwt.o:bwt.h utils.h
+bwa.o:bwa.h bwt.h bntseq.h
+bwamem.o:ksw.h kbtree.h ksort.h kvec.h kstring.h utils.h bwamem.h
+bwamem_pair.o:ksw.h kvec.h kstring.h utils.h bwamem.h
 
 QSufSort.o:QSufSort.h
+bwt_gen.o:QSufSort.h
+
+fastmap.o:bwt.h bwamem.h
 
-bwt.o:bwt.h
-bwtio.o:bwt.h
 bwtaln.o:bwt.h bwtaln.h kseq.h
-bntseq.o:bntseq.h
 bwtgap.o:bwtgap.h bwtaln.h bwt.h
 
 bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h
diff --git a/NEWS b/NEWS
index d68c693..35202f1 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,56 @@
+Beta Release 0.7.0 (28 February, 2013)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This release comes with a new alignment algorithm, BWA-MEM, for 70bp-1Mbp query
+sequences. BWA-MEM essentially seeds alignments with a variant of the fastmap
+algorithm and extends seeds with banded affine-gap-penalty dynamic programming
+(i.e. the Smith-Waterman-Gotoh algorithm). For typical Illumina 100bp reads or
+longer low-divergence query sequences, BWA-MEM is about twice as fast as BWA
+and BWA-SW and is more accurate. It also supports split alignments like BWA-SW
+and may optionally output multiple hits like BWA. BWA-MEM does not guarantee
+to find hits within a certain edit distance, but BWA is not efficient for such
+a task given longer reads anyway, and the edit-distance criterion is arguably
+not as important in long-read alignment.
+
+In addition to the algorithmic improvements, BWA-MEM also implements a few
+handy features in practical aspects:
+
+ 1. BWA-MEM automatically switches between local and glocal (global wrt reads;
+	local wrt reference) alignment. It reports the end-to-end glocal alignment
+	if the glocal alignment is not much worse than the optimal local alignment.
+	Glocal alignment reduces reference bias.
+
+ 2. BWA-MEM automatically infers pair orientation from a batch of single-end
+    alignments. It allows more than one orientation if there are sufficient
+	supporting reads. This feature has not been tested on reads from Illumina
+	jumping library yet. (EXPERIMENTAL)
+
+ 3. BWA-MEM optionally takes one interleaved fastq for paired-end mapping. It
+    is possible to convert a name-sorted BAM to an interleaved fastq on the fly
+    and feed the data stream to BWA-MEM for mapping.
+
+ 4. BWA-MEM optionally copies FASTA/Q comments to the final SAM output, which
+    helps to transfer individual read annotations to the output.
+
+ 5. BWA-MEM supports more advanced piping. Users can now run:
+    (bwa mem ref.fa '<bzcat r1.fq.bz2' '<bzcat r2.fq.bz2') to map bzip'd read
+    files without relying on bash features.
+
+ 6. BWA-MEM provides a few basic APIs for single-end mapping. The `example.c'
+	program in the source code directory implements a full single-end mapper in
+	50 lines of code.
+
+The BWA-MEM algorithm is in the beta phase. It is not advised to use BWA-MEM
+for production use yet. However, when the implementation becomes stable after a
+few release cycles, existing BWA users are recommended to migrate to BWA-MEM
+for 76bp or longer Illumina reads and long query sequences. The original BWA
+short-read algorithm will not deliver satisfactory results for 150bp+ Illumina
+reads. Change of mappers will be necessary sooner or later.
+
+(0.7.0 beta: 28 February 2013, r313)
+
+
+
 Release 0.6.2 (19 June, 2012)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/QSufSort.c b/QSufSort.c
index e437ac3..36c5a51 100644
--- a/QSufSort.c
+++ b/QSufSort.c
@@ -59,12 +59,9 @@ void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsin
 	qsint_t i, j;
 	qsint_t s, negatedSortedGroupLength;
 	qsint_t numSymbolAggregated;
-	qsint_t maxNumInputSymbol;
 	qsint_t numSortedPos = 1;
 	qsint_t newAlphabetSize;
    
-	maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1;
-
 	if (!skipTransform) {
 		/* bucketing possible*/
 		newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol, 
diff --git a/bntseq.c b/bntseq.c
index adcd2d7..972837e 100644
--- a/bntseq.c
+++ b/bntseq.c
@@ -35,7 +35,7 @@
 #include "utils.h"
 
 #include "kseq.h"
-KSEQ_INIT(gzFile, gzread)
+KSEQ_DECLARE(gzFile)
 
 unsigned char nst_nt4_table[256] = {
 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
@@ -288,21 +288,26 @@ int bwa_fa2pac(int argc, char *argv[])
 	return 0;
 }
 
+int bns_pos2rid(const bntseq_t *bns, int64_t pos_f)
+{
+	int left, mid, right;
+	if (pos_f >= bns->l_pac) return -1;
+	left = 0; mid = 0; right = bns->n_seqs;
+	while (left < right) { // binary search
+		mid = (left + right) >> 1;
+		if (pos_f >= bns->anns[mid].offset) {
+			if (mid == bns->n_seqs - 1) break;
+			if (pos_f < bns->anns[mid+1].offset) break; // bracketed
+			left = mid + 1;
+		} else right = mid;
+	}
+	return mid;
+}
+
 int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
 {
 	int left, mid, right, nn;
-	if (ref_id) {
-		left = 0; mid = 0; right = bns->n_seqs;
-		while (left < right) {
-			mid = (left + right) >> 1;
-			if (pos_f >= bns->anns[mid].offset) {
-				if (mid == bns->n_seqs - 1) break;
-				if (pos_f < bns->anns[mid+1].offset) break; // bracketed
-				left = mid + 1;
-			} else right = mid;
-		}
-		*ref_id = mid;
-	}
+	if (ref_id) *ref_id = bns_pos2rid(bns, pos_f);
 	left = 0; right = bns->n_holes; nn = 0;
 	while (left < right) {
 		mid = (left + right) >> 1;
@@ -321,3 +326,26 @@ int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id)
 	}
 	return nn;
 }
+
+uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len)
+{
+	uint8_t *seq = 0;
+	if (end < beg) end ^= beg, beg ^= end, end ^= beg; // if end is smaller, swap
+	if (end > l_pac<<1) end = l_pac<<1;
+	if (beg < 0) beg = 0;
+	if (beg >= l_pac || end <= l_pac) {
+		int64_t k, l = 0;
+		*len = end - beg;
+		seq = malloc(end - beg);
+		if (beg >= l_pac) { // reverse strand
+			int64_t beg_f = (l_pac<<1) - 1 - end;
+			int64_t end_f = (l_pac<<1) - 1 - beg;
+			for (k = end_f; k > beg_f; --k)
+				seq[l++] = 3 - _get_pac(pac, k);
+		} else { // forward strand
+			for (k = beg; k < end; ++k)
+				seq[l++] = _get_pac(pac, k);
+		}
+	} else *len = 0; // if bridging the forward-reverse boundary, return nothing
+	return seq;
+}
diff --git a/bntseq.h b/bntseq.h
index 843db64..4061438 100644
--- a/bntseq.h
+++ b/bntseq.h
@@ -29,6 +29,7 @@
 #define BWT_BNTSEQ_H
 
 #include <stdint.h>
+#include <stdio.h>
 #include <zlib.h>
 
 #ifndef BWA_UBYTE
@@ -71,7 +72,9 @@ extern "C" {
 	bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename);
 	void bns_destroy(bntseq_t *bns);
 	int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only);
+	int bns_pos2rid(const bntseq_t *bns, int64_t pos_f);
 	int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id);
+	uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len);
 
 #ifdef __cplusplus
 }
diff --git a/bwa.1 b/bwa.1
index 66bc9a2..45b9921 100644
--- a/bwa.1
+++ b/bwa.1
@@ -1,47 +1,45 @@
-.TH bwa 1 "19 June 2012" "bwa-0.6.2" "Bioinformatics tools"
+.TH bwa 1 "27 February 2013" "bwa-0.7.0" "Bioinformatics tools"
 .SH NAME
 .PP
 bwa - Burrows-Wheeler Alignment Tool
 .SH SYNOPSIS
 .PP
-bwa index -a bwtsw database.fasta
+bwa index ref.fa
 .PP
-bwa aln database.fasta short_read.fastq > aln_sa.sai
+bwa mem ref.fa reads.fq > aln-se.sam
 .PP
-bwa samse database.fasta aln_sa.sai short_read.fastq > aln.sam
+bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
 .PP
-bwa sampe database.fasta aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln.sam
+bwa aln ref.fa short_read.fq > aln_sa.sai
 .PP
-bwa bwasw database.fasta long_read.fastq > aln.sam
+bwa samse ref.fa aln_sa.sai short_read.fq > aln-se.sam
+.PP
+bwa sampe ref.fa aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln-pe.sam
+.PP
+bwa bwasw ref.fa long_read.fq > aln.sam
 
 .SH DESCRIPTION
 .PP
-BWA is a fast light-weighted tool that aligns relatively short sequences
-(queries) to a sequence database (targe), such as the human reference
-genome. It implements two different algorithms, both based on
-Burrows-Wheeler Transform (BWT). The first algorithm is designed for
-short queries up to ~150bp with low error rate (<3%). It does gapped
-global alignment w.r.t. queries, supports paired-end reads, and is one
-of the fastest short read alignment algorithms to date while also
-visiting suboptimal hits. The second algorithm, BWA-SW, is designed for
-reads longer than 100bp with more errors. It performs a heuristic Smith-Waterman-like
-alignment to find high-scoring local hits and split hits. On
-low-error short queries, BWA-SW is a little slower and less accurate than the
-first algorithm, but on long queries, it is better.
-.PP
-For both algorithms, the database file in the FASTA format must be
-first indexed with the
-.B `index'
-command, which typically takes a few hours for a 3GB genome. The first algorithm is
-implemented via the
-.B `aln'
-command, which finds the suffix array (SA) coordinates of good hits of
-each individual read, and the
-.B `samse/sampe'
-command, which converts SA coordinates to chromosomal coordinate and
-pairs reads (for `sampe'). The second algorithm is invoked by the
-.B `bwasw'
-command. It works for single-end reads only.
+BWA is a software package for mapping low-divergent sequences against a large
+reference genome, such as the human genome. It consists of three algorithms:
+BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina
+sequence reads up to 100bp, while the other two are for longer sequences ranging from
+70bp to 1Mbp. BWA-MEM and BWA-SW share similar features such as long-read
+support and split alignment, but BWA-MEM, which is the latest, is generally
+recommended for high-quality queries as it is faster and more accurate.
+BWA-MEM also has better performance than BWA-backtrack for 70-100bp Illumina
+reads.
+
+For all the algorithms, BWA first needs to construct the FM-index for
+the reference genome (the
+.B index
+command). Alignment algorithms are invoked with different sub-commands:
+.BR aln / samse / sampe
+for BWA-backtrack,
+.B bwasw
+for BWA-SW and
+.B mem
+for the BWA-MEM algorithm.
 
 .SH COMMANDS AND OPTIONS
 .TP
@@ -53,9 +51,6 @@ Index database sequences in the FASTA format.
 .B OPTIONS:
 .RS
 .TP 10
-.B -c
-Build color-space index. The input fast should be in nucleotide space. (Disabled since 0.6.x)
-.TP
 .BI -p \ STR
 Prefix of the output database [same as db filename]
 .TP
@@ -77,6 +72,175 @@ genome.
 .RE
 
 .TP
+.B mem
+.B bwa mem
+.RB [ -aCHMpP ]
+.RB [ -t
+.IR nThreads ]
+.RB [ -k
+.IR minSeedLen ]
+.RB [ -w
+.IR bandWidth ]
+.RB [ -r
+.IR seedSplitRatio ]
+.RB [ -c
+.IR maxOcc ]
+.RB [ -A
+.IR matchScore ]
+.RB [ -B
+.IR mmPenalty ]
+.RB [ -O
+.IR gapOpenPen ]
+.RB [ -E
+.IR gapExtPen ]
+.RB [ -L
+.IR clipPen ]
+.RB [ -U
+.IR unpairPen ]
+.RB [ -R
+.IR RGline ]
+.RB [ -v
+.IR verboseLevel ]
+.I db.prefix
+.I reads.fq
+.RI [ mates.fq ]
+
+Align 70bp-1Mbp query sequences with the BWA-MEM algorithm. Briefly, the
+algorithm works by seeding alignments with maximal exact matches (MEMs) and
+then extending seeds with the affine-gap Smith-Waterman algorithm (SW).
+
+If
+.I mates.fq
+file is absent and option
+.B -p
+is not set, this command regards input reads as single-end. If
+.I mates.fq
+is present, this command assumes the
+.IR i -th
+read in
+.I reads.fq
+and the
+.IR i -th
+read in
+.I mates.fq
+constitute a read pair. If
+.B -p
+is used, the command assumes the
+.RI 2 i -th
+and the
+.RI (2 i +1)-th
+read in
+.I reads.fq
+constitute a read pair (such input file is said to be interleaved). In this case,
+.I mates.fq
+is ignored. In the paired-end mode, the
+.B mem
+command will infer the read orientation and the insert size distribution from a
+batch of reads.
+
+The BWA-MEM algorithm performs local alignment. It may produce multiple primary
+alignments for different parts of a query sequence. This is a crucial feature
+for long sequences. However, some tools such as Picard's markDuplicates do
+not work with split alignments. One may consider using option
+.B -M
+to flag shorter split hits as secondary.
+
+.B OPTIONS:
+.RS
+.TP 10
+.BI -t \ INT
+Number of threads [1]
+.TP
+.BI -k \ INT
+Minimum seed length. Matches shorter than
+.I INT
+will be missed. The alignment speed is usually insensitive to this value unless
+it significantly deviates from 20. [19]
+.TP
+.BI -w \ INT
+Band width. Essentially, gaps longer than
+.I INT
+will not be found. Note that the maximum gap length is also affected by the
+scoring matrix and the hit length, not solely determined by this option. [100]
+.TP
+.BI -r \ FLOAT
+Trigger re-seeding for a MEM longer than
+.IR minSeedLen * FLOAT .
+This is a key heuristic parameter for tuning the performance. Larger value
+yields fewer seeds, which leads to faster alignment speed but lower accuracy. [1.5]
+.TP
+.BI -c \ INT
+Discard a MEM if it has more than
+.I INT
+occurrences in the genome. This is an insensitive parameter. [10000]
+.TP
+.B -P
+In the paired-end mode, perform SW to rescue missing hits only but do not try to find
+hits that fit a proper pair.
+.TP
+.BI -A \ INT
+Matching score. [1]
+.TP
+.BI -B \ INT
+Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4]
+.TP
+.BI -O \ INT
+Gap open penalty. [6]
+.TP
+.BI -E \ INT
+Gap extension penalty. A gap of length k costs O + k*E (i.e.
+.B -O
+is for opening a zero-length gap). [1]
+.TP
+.BI -L \ INT
+Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best
+score reaching the end of query. If this score is larger than the best SW score
+minus the clipping penalty, clipping will not be applied. Note that in this
+case, the SAM AS tag reports the best SW score; clipping penalty is not
+deducted. [5]
+.TP
+.BI -U \ INT
+Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as
+.RI scoreRead1+scoreRead2- INT
+and scores a paired one as scoreRead1+scoreRead2-insertPenalty. It compares these
+two scores to determine whether we should force pairing. [9]
+.TP
+.B -p
+Assume the first input query file is interleaved paired-end FASTA/Q. See the command description for details.
+.TP
+.BI -R \ STR
+Complete read group header line. '\\t' can be used in
+.I STR
+and will be converted to a TAB in the output SAM. The read group ID will be
+attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'.
+[null]
+.TP
+.B -a
+Output all found alignments for single-end or unpaired paired-end reads. These
+alignments will be flagged as secondary alignments.
+.TP
+.B -C
+Append FASTA/Q comment to SAM output. This option can be used to
+transfer read meta information (e.g. barcode) to the SAM output. Note that the
+FASTA/Q comment (the string after a space in the header line) must conform to the SAM
+spec (e.g. BC:Z:CGTAC). Malformed comments lead to incorrect SAM output.
+.TP
+.B -H
+Use hard clipping 'H' in the SAM output. This option may dramatically reduce
+the redundancy of output when mapping long contig or BAC sequences.
+.TP
+.B -M
+Mark shorter split hits as secondary (for Picard compatibility).
+.TP
+.BI -v \ INT
+Control the verbose level of the output. This option has not been fully
+supported throughout BWA. Ideally, a value 0 for disabling all the output to
+stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for
+all normal messages; 4 or higher for debugging. When this option takes value
+4, the output is not SAM. [3]
+.RE
+
+.TP
 .B aln
 bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i
 nIndelEnd] [-k maxSeedDiff] [-l seedLen] [-t nThrds] [-cRN] [-M misMsc]
@@ -482,24 +646,6 @@ Pairing is slower for shorter reads. This is mainly because shorter
 reads have more spurious hits and converting SA coordinates to
 chromosomal coordinates are very costly.
 
-.SH NOTES ON LONG-READ ALIGNMENT
-.PP
-Command
-.B bwasw
-is designed for long-read alignment. BWA-SW essentially aligns the trie
-of the reference genome against the directed acyclic word graph (DAWG) of a
-read to find seeds not highly repetitive in the genome, and then performs a
-standard Smith-Waterman algorithm to extend the seeds. A key heuristic, called
-the Z-best heuristic, is that at each vertex in the DAWG, BWA-SW only keeps the
-top Z reference suffix intervals that match the vertex. BWA-SW is more accurate
-if the resultant alignment is supported by more seeds, and therefore BWA-SW
-usually performs better on long queries or queries with low divergence to the
-reference genome.
-
-BWA-SW is perhaps a better choice than BWA-short for 100bp single-end HiSeq reads
-mainly because it gives better gapped alignment. For paired-end reads, it is yet
-to know whether BWA-short or BWA-SW yield overall better results.
-
 .SH CHANGES IN BWA-0.6
 .PP
 Since version 0.6, BWA has been able to work with a reference genome longer than 4GB.
@@ -534,16 +680,23 @@ The full BWA package is distributed under GPLv3 as it uses source codes
 from BWT-SW which is covered by GPL. Sorting, hash table, BWT and IS
 libraries are distributed under the MIT license.
 .PP
-If you use the short-read alignment component, please cite the following
+If you use the BWA-backtrack algorithm, please cite the following
 paper:
 .PP
 Li H. and Durbin R. (2009) Fast and accurate short read alignment with
 Burrows-Wheeler transform. Bioinformatics, 25, 1754-1760. [PMID: 19451168]
 .PP
-If you use the long-read component (BWA-SW), please cite:
+If you use the BWA-SW algorithm, please cite:
 .PP
 Li H. and Durbin R. (2010) Fast and accurate long-read alignment with
 Burrows-Wheeler transform. Bioinformatics, 26, 589-595. [PMID: 20080505]
+.PP
+If you use the fastmap component of BWA, please cite:
+.PP
+Li H. (2012) Exploring single-sample SNP and INDEL calling with whole-genome de
+novo assembly. Bioinformatics, 28, 1838-1844. [PMID: 22569178]
+.PP
+The BWA-MEM algorithm has not been published yet.
 
 .SH HISTORY
 BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW
@@ -569,3 +722,11 @@ short-read aligners are being implemented.
 
 The BWA-SW algorithm is a new component of BWA. It was conceived in
 November 2008 and implemented ten months later.
+
+The BWA-MEM algorithm is based on an algorithm finding super-maximal exact
+matches (SMEMs), which was first published with the fermi assembler paper
+in 2012. I first implemented the basic SMEM algorithm in the
+.B fastmap
+command for an experiment and then extended the basic algorithm and added the
+extension part in February 2013 to make BWA-MEM a fully featured mapper.
+
diff --git a/bwa.c b/bwa.c
index 8e99f18..beea6d1 100644
--- a/bwa.c
+++ b/bwa.c
@@ -1,272 +1,324 @@
-#include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
-#include <math.h>
-#include "bwa.h"
-#include "bwt.h"
-#include "bwtgap.h"
+#include <zlib.h>
+#include <assert.h>
 #include "bntseq.h"
+#include "bwa.h"
+#include "ksw.h"
+#include "utils.h"
 
-#ifndef kroundup32
-#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
-#endif
-
-extern unsigned char nst_nt4_table[256];
-extern void seq_reverse(int len, uint8_t *seq, int is_comp);
-
-bwa_opt_t bwa_def_opt = { 11, 4, -1, 1, 6, 32, 2, 0.04 };
+int bwa_verbose = 3;
+char bwa_rg_id[256];
 
-struct bwa_idx_t {
-	bwt_t *bwt;
-	bntseq_t *bns;
-	uint8_t *pac;
-};
+/************************
+ * Batch FASTA/Q reader *
+ ************************/
 
-struct bwa_buf_t {
-	int max_buf;
-	bwa_pestat_t pes;
-	gap_stack_t *stack;
-	gap_opt_t *opt;
-	int *diff_tab;
-	uint8_t *buf;
-	int *logn;
-};
+#include "kseq.h"
+KSEQ_DECLARE(gzFile)
 
-bwa_idx_t *bwa_idx_load(const char *prefix)
+static inline void trim_readno(kstring_t *s)
 {
-	bwa_idx_t *p;
-	int l;
-	char *str;
-	l = strlen(prefix);
-	p = calloc(1, sizeof(bwa_idx_t));
-	str = malloc(l + 10);
-	strcpy(str, prefix);
-	p->bns = bns_restore(str);
-	strcpy(str + l, ".bwt");
-	p->bwt = bwt_restore_bwt(str);
-	str[l] = 0;
-	strcpy(str + l, ".sa");
-	bwt_restore_sa(str, p->bwt);
-	free(str);
-	p->pac = calloc(p->bns->l_pac/4+1, 1);
-	fread(p->pac, 1, p->bns->l_pac/4+1, p->bns->fp_pac);
-	fclose(p->bns->fp_pac);
-	p->bns->fp_pac = 0;
-	return p;
+	if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1]))
+		s->l -= 2, s->s[s->l] = 0;
 }
 
-void bwa_idx_destroy(bwa_idx_t *p)
-{
-	bns_destroy(p->bns);
-	bwt_destroy(p->bwt);
-	free(p->pac);
-	free(p);
+static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s)
+{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice
+	s->name = strdup(ks->name.s);
+	s->comment = ks->comment.l? strdup(ks->comment.s) : 0;
+	s->seq = strdup(ks->seq.s);
+	s->qual = ks->qual.l? strdup(ks->qual.s) : 0;
+	s->l_seq = strlen(s->seq);
 }
 
-bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score)
+bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_)
 {
-	extern gap_opt_t *gap_init_opt(void);
-	extern int bwa_cal_maxdiff(int l, double err, double thres);
-	int i;
-	bwa_buf_t *p;
-	p = malloc(sizeof(bwa_buf_t));
-	p->stack = gap_init_stack2(max_score);
-	p->opt = gap_init_opt();
-	p->opt->s_gapo = opt->s_gapo;
-	p->opt->s_gape = opt->s_gape;
-	p->opt->max_diff = opt->max_diff;
-	p->opt->max_gapo = opt->max_gapo;
-	p->opt->max_gape = opt->max_gape;
-	p->opt->seed_len = opt->seed_len;
-	p->opt->max_seed_diff = opt->max_seed_diff;
-	p->opt->fnr = opt->fnr;
-	p->diff_tab = calloc(BWA_MAX_QUERY_LEN, sizeof(int));
-	for (i = 1; i < BWA_MAX_QUERY_LEN; ++i)
-		p->diff_tab[i] = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr);
-	p->logn = calloc(256, sizeof(int));
-	for (i = 1; i != 256; ++i)
-		p->logn[i] = (int)(4.343 * log(i) + 0.499);
-	return p;
+	kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_;
+	int size = 0, m, n;
+	bseq1_t *seqs;
+	m = n = 0; seqs = 0;
+	while (kseq_read(ks) >= 0) {
+		if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads
+			fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__);
+			break;
+		}
+		if (n >= m) {
+			m = m? m<<1 : 256;
+			seqs = realloc(seqs, m * sizeof(bseq1_t));
+		}
+		trim_readno(&ks->name);
+		kseq2bseq1(ks, &seqs[n]);
+		size += seqs[n++].l_seq;
+		if (ks2) {
+			trim_readno(&ks2->name);
+			kseq2bseq1(ks2, &seqs[n]);
+			size += seqs[n++].l_seq;
+		}
+		if (size >= chunk_size) break;
+	}
+	if (size == 0) { // test if the 2nd file is finished
+		if (ks2 && kseq_read(ks2) >= 0)
+			fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__);
+	}
+	*n_ = n;
+	return seqs;
 }
 
-void bwa_buf_destroy(bwa_buf_t *p)
-{
-	gap_destroy_stack(p->stack);
-	free(p->diff_tab); free(p->logn); free(p->opt);
-	free(p);
-}
+/*****************
+ * CIGAR related *
+ *****************/
 
-bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq)
+// Generate CIGAR when the alignment end points are known
+uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM)
 {
-	extern int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width);
-	int i, seq_len, buf_len;
-	bwt_width_t *w, *seed_w;
-	uint8_t *s;
-	gap_opt_t opt2 = *buf->opt;
-	bwa_sai_t sai;
-
-	seq_len = strlen(seq);
-	// estimate the buffer length
-	buf_len = (buf->opt->seed_len + seq_len + 1) * sizeof(bwt_width_t) + seq_len;
-	if (buf_len > buf->max_buf) {
-		buf->max_buf = buf_len;
-		kroundup32(buf->max_buf);
-		buf->buf = realloc(buf->buf, buf->max_buf);
+	uint32_t *cigar = 0;
+	uint8_t tmp, *rseq;
+	int i;
+	int64_t rlen;
+	*n_cigar = 0; *NM = -1;
+	if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand
+	rseq = bns_get_seq(l_pac, pac, rb, re, &rlen);
+	if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range
+	if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position
+		for (i = 0; i < l_query>>1; ++i)
+			tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
+		for (i = 0; i < rlen>>1; ++i)
+			tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp;
 	}
-	memset(buf->buf, 0, buf_len);
-	seed_w = (bwt_width_t*)buf->buf;
-	w = seed_w + buf->opt->seed_len;
-	s = (uint8_t*)(w + seq_len + 1);
-	if (opt2.fnr > 0.) opt2.max_diff = buf->diff_tab[seq_len];
-	// copy the sequence
-	for (i = 0; i < seq_len; ++i)
-		s[i] = nst_nt4_table[(int)seq[i]];
-	seq_reverse(seq_len, s, 0);
-	// mapping
-	bwt_cal_width(idx->bwt, seq_len, s, w);
-	if (opt2.seed_len >= seq_len) opt2.seed_len = 0x7fffffff;
-	if (seq_len > buf->opt->seed_len)
-		bwt_cal_width(idx->bwt, buf->opt->seed_len, s + (seq_len - buf->opt->seed_len), seed_w);
-	for (i = 0; i < seq_len; ++i) // complement; I forgot why...
-		s[i] = s[i] > 3? 4 : 3 - s[i];
-	sai.sai = (bwa_sai1_t*)bwt_match_gap(idx->bwt, seq_len, s, w, seq_len <= buf->opt->seed_len? 0 : seed_w, &opt2, &sai.n, buf->stack);
-	return sai;
-}
-
-static void compute_NM(const uint8_t *pac, uint64_t l_pac, uint8_t *seq, int64_t pos, int n_cigar, uint32_t *cigar, int *n_mm, int *n_gaps)
-{
-	uint64_t x = pos, z;
-	int k, y = 0;
-	*n_mm = *n_gaps = 0;
-	for (k = 0; k < n_cigar; ++k) {
-		int l = cigar[k]>>4;
-		int op = cigar[k]&0xf;
-		if (op == 0) { // match/mismatch
-			for (z = 0; z < l && x + z < l_pac; ++z) {
-				int c = pac[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
-				if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) ++(*n_mm);
-			}
+	if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP
+		cigar = malloc(4);
+		cigar[0] = l_query<<4 | 0;
+		*n_cigar = 1;
+		for (i = 0, *score = 0; i < l_query; ++i)
+			*score += mat[rseq[i]*5 + query[i]];
+	} else {
+		int w, max_gap, min_w;
+		//printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n');
+		//printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n');
+		// set the band-width
+		max_gap = (int)((double)(((l_query+1)>>1) * mat[0] - q) / r + 1.);
+		max_gap = max_gap > 1? max_gap : 1;
+		w = (max_gap + abs(rlen - l_query) + 1) >> 1;
+		w = w < w_? w : w_;
+		min_w = abs(rlen - l_query) + 3;
+		w = w > min_w? w : min_w;
+		// NW alignment
+		*score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar);
+	}
+	{// compute NM
+		int k, x, y, n_mm = 0, n_gap = 0;
+		for (k = 0, x = y = 0; k < *n_cigar; ++k) {
+			int op  = cigar[k]&0xf;
+			int len = cigar[k]>>4;
+			if (op == 0) { // match
+				for (i = 0; i < len; ++i)
+					if (query[x + i] != rseq[y + i]) ++n_mm;
+				x += len; y += len;
+			} else if (op == 1) x += len, n_gap += len;
+			else if (op == 2) y += len, n_gap += len;
 		}
-		if (op == 1 || op == 2) (*n_gaps) += l;
-		if (op == 0 || op == 2) x += l;
-		if (op == 0 || op == 1 || op == 4) y += l;
+		*NM = n_mm + n_gap;
 	}
+	if (rb >= l_pac) // reverse back query
+		for (i = 0; i < l_query>>1; ++i)
+			tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp;
+
+ret_gen_cigar:
+	free(rseq);
+	return cigar;
 }
 
-void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln)
+int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re)
 {
-	extern bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand);
-	extern bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const uint8_t *seq, bwtint_t *_pos, int ext, int *n_cigar, int is_end_correct);
-	int strand, seq_len, i, n_gap, n_mm;
-	uint64_t pos3, pac_pos;
-	uint8_t *s[2];
-
-	memset(aln, 0, sizeof(bwa_aln_t));
-	seq_len = strlen(seq);
-	if (seq_len<<1 > buf->max_buf) {
-		buf->max_buf = seq_len<<1;
-		kroundup32(buf->max_buf);
-		buf->buf = realloc(buf->buf, buf->max_buf);
+	int ib, ie, is_rev;
+	int64_t fb, fe, mid = -1;
+	if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary
+		*qb = *qe = *rb = *re = -1;
+		return -1; // unable to fix
+	} else {
+		fb = bns_depos(bns, *rb < bns->l_pac? *rb : *re - 1, &is_rev);
+		ib = bns_pos2rid(bns, fb);
+		if (fb - bns->anns[ib].offset + (*re - *rb) <= bns->anns[ib].len) return 0; // no need to fix
+		fe = bns_depos(bns, *re - 1 < bns->l_pac? *re - 1 : *rb, &is_rev);
+		ie = bns_pos2rid(bns, fe);
+		if (ie - ib > 1) { // bridge three or more references
+			*qb = *qe = *rb = *re = -1;
+			return -2; // unable to fix
+		} else {
+			int l = bns->anns[ib].offset + bns->anns[ib].len - fb;
+			mid = is_rev? *re - l : *rb + l;
+		}
 	}
-	s[0] = buf->buf;
-	s[1] = s[0] + seq_len;
-	for (i = 0; i < seq_len; ++i)
-		s[0][i] = s[1][i] = nst_nt4_table[(int)seq[i]];
-	seq_reverse(seq_len, s[1], 1);
-	pac_pos = bwa_sa2pos(idx->bns, idx->bwt, sa, seq_len, &strand);
-	if (strand) aln->flag |= 16;
-	if (n_gaps) { // only for gapped alignment
-		int n_cigar;
-		bwa_cigar_t *cigar16;
-		cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, seq_len, s[strand], &pac_pos, strand? n_gaps : -n_gaps, &n_cigar, 1);
-		aln->n_cigar = n_cigar;
-		aln->cigar = malloc(n_cigar * 4);
-		for (i = 0, pos3 = pac_pos; i < n_cigar; ++i) {
-			int op = cigar16[i]>>14;
-			int len = cigar16[i]&0x3fff;
-			if (op == 3) op = 4; // the 16-bit CIGAR is different from the 32-bit CIGAR
-			aln->cigar[i] = len<<4 | op;
-			if (op == 0 || op == 2) pos3 += len;
+	if (mid >= 0) {
+		int i, score, n_cigar, y, NM;
+		uint32_t *cigar;
+		int64_t x;
+		cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM);
+		for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) {
+			int op = cigar[i]&0xf, len = cigar[i]>>4;
+			if (op == 0) {
+				if (x <= mid && mid < x + len) {
+					if (mid - *rb > *re - mid) { // the first part is longer
+						if (x == mid) { // need to check the previous operation
+							assert(i); // mid != *rb should always stand
+							if ((cigar[i-1]&0xf) == 1) *qe = y - (cigar[i-1]>>4), *re = x;
+							else if ((cigar[i-1]&0xf) == 2) *qe = y, *re = x - (cigar[i-1]>>4);
+							else abort(); // should not be here
+						} else *qe = y + (mid - x), *re = mid;
+					} else *qb = y + (mid - x), *rb = mid;
+					break;
+				} else x += len, y += len;
+			} else if (op == 1) { // insertion
+				y += len;
+			} else if (op == 2) { // deletion
+				if (x <= mid && mid < x + len) {
+					if (mid - *rb > *re - mid) *qe = y, *re = x;
+					else *qb = y, *rb = x + len;
+					break;
+				} else x += len;
+			} else abort(); // should not be here
 		}
-		free(cigar16);
-	} else { // ungapped
-		aln->n_cigar = 1;
-		aln->cigar = malloc(4);
-		aln->cigar[0] = seq_len<<4 | 0;
-		pos3 = pac_pos + seq_len;
+		free(cigar);
 	}
-	aln->n_n = bns_cnt_ambi(idx->bns, pac_pos, pos3 - pac_pos, &aln->ref_id);
-	aln->offset = pac_pos - idx->bns->anns[aln->ref_id].offset;
-	if (pos3 - idx->bns->anns[aln->ref_id].offset > idx->bns->anns[aln->ref_id].len) // read mapped beyond the end of a sequence
-		aln->flag |= 4; // read unmapped
-	compute_NM(idx->pac, idx->bns->l_pac, s[strand], pac_pos, aln->n_cigar, aln->cigar, &n_mm, &n_gap);
-	aln->n_mm = n_mm;
-	aln->n_gap = n_gap;
+	return 1;
 }
 
-/************************
- * Single-end alignment *
- ************************/
+/*********************
+ * Full index reader *
+ *********************/
 
-bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar)
+char *bwa_idx_infer_prefix(const char *hint) // probe "<hint>.64.bwt", then "<hint>.bwt"; returns a malloc'ed prefix the caller must free, or 0 if neither file opens
 {
-	bwa_one_t *one;
-	int best, cnt, i, seq_len;
-
-	seq_len = strlen(seq);
-	one = calloc(1, sizeof(bwa_one_t));
-	one->sai = bwa_sai(idx, buf, seq);
-	if (one->sai.n == 0) return one;
-	// count number of hits; randomly select one alignment
-	best = one->sai.sai[0].score;
-	for (i = cnt = 0; i < one->sai.n; ++i) {
-		bwa_sai1_t *p = &one->sai.sai[i];
-		if (p->score > best) break;
-		if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) {
-			one->which = p;
-			one->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48());
+	char *prefix;
+	int l_hint;
+	FILE *fp;
+	l_hint = strlen(hint);
+	prefix = malloc(l_hint + 3 + 4 + 1); // hint + ".64" (3) + ".bwt" (4) + NUL; NOTE(review): the malloc result is not checked
+	strcpy(prefix, hint);
+	strcpy(prefix + l_hint, ".64.bwt"); // first candidate: "<hint>.64.bwt"
+	if ((fp = fopen(prefix, "rb")) != 0) {
+		fclose(fp); // the file was opened only to test that it exists and is readable
+		prefix[l_hint + 3] = 0; // cut off ".bwt", leaving "<hint>.64" as the prefix
+		return prefix;
+	} else {
+		strcpy(prefix + l_hint, ".bwt"); // second candidate: "<hint>.bwt"
+		if ((fp = fopen(prefix, "rb")) == 0) {
+			free(prefix);
+			return 0; // neither candidate could be opened
+		} else {
+			fclose(fp);
+			prefix[l_hint] = 0; // cut off ".bwt", leaving the hint itself as the prefix
+			return prefix;
 		}
-		cnt += p->l - p->k + 1;
 	}
-	one->c1 = cnt;
-	for (; i < one->sai.n; ++i)
-		cnt += one->sai.sai[i].l - one->sai.sai[i].k + 1;
-	one->c2 = cnt - one->c1;
-	// estimate single-end mapping quality
-	one->mapQs = -1;
-	if (one->c1 == 0) one->mapQs = 23; // FIXME: is it possible?
-	else if (one->c1 > 1) one->mapQs = 0;
-	else {
-		int diff = one->which->n_mm + one->which->n_gapo + one->which->n_gape;
-		if (diff >= buf->diff_tab[seq_len]) one->mapQs = 25;
-		else if (one->c2 == 0) one->mapQs = 37;
+}
+
+bwt_t *bwa_idx_load_bwt(const char *hint) // load "<prefix>.bwt" and "<prefix>.sa" for the index located via hint; returns 0 if no index prefix is found
+{
+	char *tmp, *prefix;
+	bwt_t *bwt;
+	prefix = bwa_idx_infer_prefix(hint); // malloc'ed; freed at the end of this function
+	if (prefix == 0) {
+		if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
+		return 0;
+	}
+	tmp = calloc(strlen(prefix) + 5, 1); // prefix + longest suffix used below (".bwt", 4 chars) + NUL
+	strcat(strcpy(tmp, prefix), ".bwt"); // FM-index
+	bwt = bwt_restore_bwt(tmp);
+	strcat(strcpy(tmp, prefix), ".sa");  // partial suffix array (SA)
+	bwt_restore_sa(tmp, bwt); // NOTE(review): bwt is passed on without a null check; confirm how bwt_restore_bwt reports failure
+	free(tmp); free(prefix);
+	return bwt;
+}
+
+bwaidx_t *bwa_idx_load(const char *hint, int which) // load the index parts selected by the BWA_IDX_* bits in which; returns 0 if no index prefix is found
+{
+	bwaidx_t *idx;
+	char *prefix;
+	prefix = bwa_idx_infer_prefix(hint); // malloc'ed; freed before returning idx
+	if (prefix == 0) {
+		if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__);
+		return 0;
 	}
-	if (one->mapQs < 0) {
-		cnt = (one->c2 >= 255)? 255 : one->c2;
-		one->mapQs = 23 < buf->logn[cnt]? 0 : 23 - buf->logn[cnt];
+	idx = calloc(1, sizeof(bwaidx_t)); // zero-filled, so members that are not loaded below stay NULL
+	if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint); // bwa_idx_load_bwt() infers the prefix from hint again by itself
+	if (which & BWA_IDX_BNS) {
+		idx->bns = bns_restore(prefix);
+		if (which & BWA_IDX_PAC) { // only reached when BWA_IDX_BNS is set as well
+			idx->pac = calloc(idx->bns->l_pac/4+1, 1); // one byte per four 2-bit bases, plus one
+			fread(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
+			fclose(idx->bns->fp_pac); // NOTE(review): the fread result above is not checked before the file is closed
+			idx->bns->fp_pac = 0; // presumably so bns_destroy() does not close the handle a second time; TODO confirm
+		}
 	}
-	one->mapQ = one->mapQs;
-	// compute CIGAR on request
-	one->one.ref_id = -1;
-	if (gen_cigar) bwa_sa2aln(idx, buf, seq, one->sa, one->which->n_gapo + one->which->n_gape, &one->one);
-	return one;
+	free(prefix);
+	return idx;
 }
 
-void bwa_one_destroy(bwa_one_t *one)
+void bwa_idx_destroy(bwaidx_t *idx) // release an index returned by bwa_idx_load(); a null idx is accepted
 {
-	free(one->sai.sai);
-	free(one->one.cigar);
-	free(one);
+	if (idx == 0) return;
+	if (idx->bwt) bwt_destroy(idx->bwt); // any member may be NULL when bwa_idx_load() was asked for a subset
+	if (idx->bns) bns_destroy(idx->bns);
+	if (idx->pac) free(idx->pac); // pac was allocated with calloc in bwa_idx_load()
+	free(idx);
 }
 
-/************************
- * Paired-end alignment *
- ************************/
+/***********************
+ * SAM header routines *
+ ***********************/
+
+void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line) // print one @SQ line per reference sequence, then rg_line on its own line if it is non-null
+{
+	int i;
+	for (i = 0; i < bns->n_seqs; ++i)
+		err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); // err_printf is defined elsewhere; presumably a checked printf to stdout (TODO confirm)
+	if (rg_line) err_printf("%s\n", rg_line); // rg_line is printed verbatim
+}
 
-void bwa_pestat(bwa_buf_t *buf, int n, bwa_one_t **o[2])
+static char *bwa_escape(char *s) // decode the two-character sequences \t, \n, \r and \\ in place (the result is never longer than the input); returns s
 {
+	char *p, *q; // p reads, q writes; q never gets ahead of p, so rewriting in place is safe
+	for (p = q = s; *p; ++p) {
+		if (*p == '\\') {
+			if (*++p == '\0') break; // lone trailing backslash: stop on the terminator instead of letting the loop's ++p step past it
+			if (*p == 't') *q++ = '\t';
+			else if (*p == 'n') *q++ = '\n';
+			else if (*p == 'r') *q++ = '\r';
+			else if (*p == '\\') *q++ = '\\'; // any other escaped character is dropped together with its backslash
+		} else *q++ = *p;
+	}
+	*q = '\0';
+	return s;
 }
 
-void bwa_pe(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq[2], bwa_one_t *o[2])
+char *bwa_set_rg(const char *s) // validate read-group line s, copy its ID into bwa_rg_id and return a malloc'ed, unescaped copy of s; returns 0 on error
 {
+	char *p, *q, *r, *rg_line = 0;
+	memset(bwa_rg_id, 0, 256); // 256 is the size bwa_rg_id is declared with in bwa.h; zeroing also terminates the ID copied below
+	if (strstr(s, "@RG") != s) { // true unless s begins with "@RG"
+		if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__);
+		goto err_set_rg;
+	}
+	rg_line = strdup(s);
+	bwa_escape(rg_line); // turn the escape sequences typed on the command line into real TAB/newline characters, in place
+	if ((p = strstr(rg_line, "\tID:")) == 0) {
+		if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID at the read group line\n", __func__);
+		goto err_set_rg;
+	}
+	p += 4; // skip the 4-character tag, so p points at the ID value
+	for (q = p; *q && *q != '\t' && *q != '\n'; ++q); // q stops just past the end of the ID value
+	if (q - p + 1 > 256) { // the ID plus its terminating NUL must fit in bwa_rg_id
+		if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__);
+		goto err_set_rg;
+	}
+	for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
+		*r++ = *q;
+	return rg_line; // ownership of the copy passes to the caller
+
+err_set_rg:
+	free(rg_line); // rg_line is still 0 when the @RG prefix check failed; free(0) does nothing
+	return 0;
 }
+
diff --git a/bwa.h b/bwa.h
index e8172da..81d40e0 100644
--- a/bwa.h
+++ b/bwa.h
@@ -2,103 +2,45 @@
 #define BWA_H_
 
 #include <stdint.h>
+#include "bntseq.h"
+#include "bwt.h"
 
-#define BWA_DEF_MAX_SCORE 2048
-#define BWA_MAX_QUERY_LEN 1024
+#define BWA_IDX_BWT 0x1
+#define BWA_IDX_BNS 0x2
+#define BWA_IDX_PAC 0x4
+#define BWA_IDX_ALL 0x7
 
-// BWA index
-struct bwa_idx_t;
-typedef struct bwa_idx_t bwa_idx_t;
-
-// Buffer for BWA alignment
-struct bwa_buf_t;
-typedef struct bwa_buf_t bwa_buf_t;
-
-// BWA alignment options
-typedef struct {
-	int s_gapo, s_gape;               // gap open and extension penalties; the mismatch penalty is fixed at 3
-	int max_diff, max_gapo, max_gape; // max differences (-1 to use fnr for length-adjusted max diff), gap opens and gap extensions
-	int seed_len, max_seed_diff;      // seed length and max differences allowed in the seed
-	float fnr;                        // parameter for automatic length-adjusted max differences
-} bwa_opt_t;
-
-// default BWA alignment options
-extern bwa_opt_t bwa_def_opt; // = { 11, 4, -1, 1, 6, 32, 2, 0.04 }
-
-// an interval hit in the SA coordinate; basic unit in .sai files
 typedef struct {
-	uint32_t n_mm:16, n_gapo:8, n_gape:8;
-	int score;
-	uint64_t k, l; // [k,l] is the SA interval; each interval has l-k+1 hits
-} bwa_sai1_t;
+	bwt_t    *bwt; // FM-index
+	bntseq_t *bns; // information on the reference sequences
+	uint8_t  *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base
+} bwaidx_t; // filled by bwa_idx_load(); members not selected by its BWA_IDX_* mask stay NULL
 
-// all interval hits in the SA coordinate
 typedef struct {
-	int n; // number of interval hits
-	bwa_sai1_t *sai;
-} bwa_sai_t;
+	int l_seq; // presumably the length of seq; TODO confirm against bseq_read()
+	char *name, *comment, *seq, *qual, *sam; // presumably read name, FASTA/Q comment, bases, qualities and formatted SAM text; TODO confirm
+} bseq1_t; // element type of the array returned by bseq_read()
 
-// an alignment
-typedef struct {
-	uint32_t n_n:8, n_gap:12, n_mm:12; // number of ambiguous bases, gaps and mismatches in the alignment
-	int32_t ref_id;                    // referece sequence index (the first seq is indexed by 0)
-	uint32_t offset;                   // coordinate on the reference; zero-based
-	uint32_t n_cigar:16, flag:16;      // number of CIGAR operations; SAM flag
-	uint32_t *cigar;                   // CIGAR in the BAM 28+4 encoding; having n_cigar operations
-} bwa_aln_t;
-
-typedef struct {
-	int mapQs, mapQ, c1, c2;
-	uint64_t sa;
-	bwa_sai1_t *which;
-	bwa_sai_t sai;
-	bwa_aln_t one;
-} bwa_one_t;
-
-typedef struct {
-	double avg, std, ap_prior;
-	uint64_t low, high, high_bayesian;
-} bwa_pestat_t;
+extern int bwa_verbose;
+extern char bwa_rg_id[256];
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-	// load a BWA index
-	bwa_idx_t *bwa_idx_load(const char *prefix);
-	void bwa_idx_destroy(bwa_idx_t *p);
-
-	// allocate a BWA alignment buffer; if unsure, set opt to &bwa_def_opt and max_score to BWA_DEF_MAX_SCORE
-	bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score);
-	void bwa_buf_destroy(bwa_buf_t *p);
+	bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_);
 
-	/**
-	 * Find all the SA intervals
-	 *
-	 * @param idx    BWA index; multiple threads can share the same index
-	 * @param buf    BWA alignment buffer; each thread should have its own buffer
-	 * @param seq    NULL terminated C string, consisting of A/C/G/T/N only
-	 *
-	 * @return       SA intervals seq is matched to
-	 */
-	bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq);
+	uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);
+	int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re);
 
-	/**
-	 * Construct an alignment in the base-pair coordinate
-	 *
-	 * @param idx     BWA index
-	 * @param buf     BWA alignment buffer
-	 * @param seq     NULL terinated C string
-	 * @param sa      Suffix array value
-	 * @param n_gaps  Number of gaps (typically equal to bwa_sai1_t::n_gapo + bwa_sai1_t::n_gape
-	 *
-	 * @return        An alignment
-	 */
-	void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln);
+	char *bwa_idx_infer_prefix(const char *hint);
+	bwt_t *bwa_idx_load_bwt(const char *hint);
 
-	bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar);
+	bwaidx_t *bwa_idx_load(const char *hint, int which);
+	void bwa_idx_destroy(bwaidx_t *idx);
 
-	void bwa_one_destroy(bwa_one_t *one);
+	void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line);
+	char *bwa_set_rg(const char *s);
 
 #ifdef __cplusplus
 }
diff --git a/bwa.txt b/bwa.txt
new file mode 100644
index 0000000..d32ad96
--- /dev/null
+++ b/bwa.txt
@@ -0,0 +1,607 @@
+bwa(1)                       Bioinformatics tools                       bwa(1)
+
+
+
+NAME
+       bwa - Burrows-Wheeler Alignment Tool
+
+SYNOPSIS
+       bwa index ref.fa
+
+       bwa mem ref.fa reads.fq > aln-se.sam
+
+       bwa mem ref.fa read1.fq read2.fq > aln-pe.sam
+
+       bwa aln ref.fa short_read.fq > aln_sa.sai
+
+       bwa samse ref.fa aln_sa.sai short_read.fq > aln-se.sam
+
+       bwa sampe ref.fa aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln-pe.sam
+
+       bwa bwasw ref.fa long_read.fq > aln.sam
+
+
+DESCRIPTION
+       BWA is a software package for mapping low-divergent sequences against a
+       large  reference genome, such as the human genome. It consists of three
+       algorithms: BWA-backtrack, BWA-SW and BWA-MEM. The first  algorithm  is
+       designed  for  Illumina sequence reads up to 100bp, while the other two
+       for longer sequences ranging from 70bp to 1Mbp. BWA-MEM and BWA-SW share
+       similar  features  such  as  long-read support and split alignment, but
+       BWA-MEM, which is the latest, is generally recommended for high-quality
+       queries  as  it  is  faster and more accurate.  BWA-MEM also has better
+       performance than BWA-backtrack for 70-100bp Illumina reads.
+
+       For all the algorithms, BWA first needs to construct the  FM-index  for
+       the  reference  genome  (the  index  command). Alignment algorithms are
+       invoked with different sub-commands: aln/samse/sampe for BWA-backtrack,
+       bwasw for BWA-SW and mem for the BWA-MEM algorithm.
+
+
+COMMANDS AND OPTIONS
+       index  bwa index [-p prefix] [-a algoType] <in.db.fasta>
+
+              Index database sequences in the FASTA format.
+
+              OPTIONS:
+
+              -p STR    Prefix of the output database [same as db filename]
+
+              -a STR    Algorithm   for   constructing  BWT  index.  Available
+                        options are:
+
+                        is     IS linear-time algorithm for constructing  suf-
+                               fix  array. It requires 5.37N memory where N is
+                               the size of  the  database.  IS  is  moderately
+                               fast,  but  does  not work with database larger
+                               than 2GB. IS is the default  algorithm  due  to
+                               its  simplicity. The current codes for IS algo-
+                               rithm are reimplemented by Yuta Mori.
+
+                        bwtsw  Algorithm implemented in  BWT-SW.  This  method
+                               works with the whole human genome.
+
+
+       mem    bwa  mem  [-aCHMpP] [-t nThreads] [-k minSeedLen] [-w bandWidth]
+              [-r seedSplitRatio] [-c maxOcc] [-A matchScore]  [-B  mmPenalty]
+              [-O  gapOpenPen]  [-E gapExtPen] [-L clipPen] [-U unpairPen] [-R
+              RGline] [-v verboseLevel] db.prefix reads.fq [mates.fq]
+
+              Align 70bp-1Mbp query  sequences  with  the  BWA-MEM  algorithm.
+              Briefly,  the algorithm works by seeding alignments with maximal
+              exact matches (MEMs) and then extending seeds with  the  affine-
+              gap Smith-Waterman algorithm (SW).
+
+              If  mates.fq  file is absent and option -p is not set, this com-
+              mand regards input reads as single-end.  If mates.fq is present,
+              this command assumes the i-th read in reads.fq and the i-th read
+              in mates.fq constitute a read pair. If -p is used,  the  command
+              assumes  the 2i-th and the (2i+1)-th read in reads.fq constitute
+              a read pair (such input file is said to be interleaved). In this
+              case,  mates.fq is ignored. In the paired-end mode, the mem com-
+              mand will infer the read orientation and the insert size distri-
+              bution from a batch of reads.
+
+              The  BWA-MEM  algorithm performs local alignment. It may produce
+              multiple primary  alignments  for  different  parts  of a  query
+              sequence. This is a crucial feature for long sequences. However,
+              some tools such as Picard's markDuplicates  do  not  work   with
+              split  alignments.  One  may  consider  using option -M to  flag
+              shorter split hits as secondary.
+
+              OPTIONS:
+
+              -t INT    Number of threads [1]
+
+              -k INT    Minimum seed length. Matches shorter than INT will  be
+                        missed.  The alignment speed is usually insensitive to
+                        this value unless it significantly deviates from 20. [19]
+
+              -w INT    Band width. Essentially, gaps longer than INT will not
+                        be  found.  Note  that  the maximum gap length is also
+                        affected by the scoring matrix and the hit length, not
+                        solely determined by this option. [100]
+
+              -r FLOAT  Trigger   re-seeding   for  a  MEM  longer  than  min-
+                        SeedLen*FLOAT.  This is a key heuristic parameter  for
+                        tuning  the  performance.  Larger  value  yields fewer
+                        seeds, which leads to faster alignment speed but lower
+                        accuracy. [1.5]
+
+              -c INT    Discard a MEM if it has more than INT occurrences in the
+                        genome. This is an insensitive parameter. [10000]
+
+              -P        In the paired-end mode, perform SW to  rescue  missing
+                        hits  only  but  do  not  try  to find hits that fit a
+                        proper pair.
+
+              -A INT    Matching score. [1]
+
+              -B INT    Mismatch penalty. The sequence error rate is  approxi-
+                        mately: {.75 * exp[-log(4) * B/A]}. [4]
+
+              -O INT    Gap open penalty. [6]
+
+              -E INT    Gap extension penalty. A gap of length k costs O + k*E
+                        (i.e.  -O is for opening a zero-length gap). [1]
+
+              -L INT    Clipping penalty. When performing SW  extension,  BWA-
+                        MEM  keeps track of the best score reaching the end of
+                        query. If this score is larger than the best SW  score
+                        minus  the  clipping  penalty,  clipping  will  not be
+                        applied. Note that  in  this  case,  the  SAM  AS  tag
+                        reports  the  best  SW  score; clipping penalty is not
+                        deducted. [5]
+
+              -U INT    Penalty for an unpaired read pair. BWA-MEM  scores  an
+                        unpaired  read  pair  as scoreRead1+scoreRead2-INT and
+                        scores  a  paired   as   scoreRead1+scoreRead2-insert-
+                        Penalty.  It  compares  these  two scores to determine
+                        whether we should force pairing. [9]
+
+              -p        Assume the  first  input  query  file  is  interleaved
+                        paired-end  FASTA/Q.  See  the command description for
+                        details.
+
+              -R STR    Complete read group header line. '\t' can be  used  in
+                        STR  and will be converted to a TAB in the output SAM.
+                        The read group ID will be attached to  every  read  in
+                        the   output.  An  example  is  '@RG\tID:foo\tSM:bar'.
+                        [null]
+
+              -a        Output all found alignments for single-end or unpaired
+                        paired-end  reads. These alignments will be flagged as
+                        secondary alignments.
+
+              -C        Append  the  FASTA/Q  comment  to  SAM  output.   This
+                        option  can  be used to transfer read meta information
+                        (e.g. barcode)  to  the  SAM  output.  Note  that  the
+                        FASTA/Q  comment  (the  string  after  a  space in the
+                        header  line)  must  conform  to  the  SAM  spec (e.g.
+                        BC:Z:CGTAC).  Malformed  comments  lead  to  incorrect
+                        SAM output.
+
+              -H        Use hard clipping 'H' in the SAM output.  This  option
+                        may  dramatically reduce the redundancy of output when
+                        mapping long contig or BAC sequences.
+
+              -M        Mark shorter split hits as secondary (for Picard  com-
+                        patibility).
+
+              -v INT    Control  the  verbose level of the output. This option
+                        has not been fully supported throughout BWA.  Ideally,
+                        a  value  0  for disabling all the output to stderr; 1
+                        for outputting errors only; 2 for warnings and errors;
+                        3  for all normal messages; 4 or higher for debugging.
+                        When this option takes value 4, the output is not SAM.
+                        [3]
+
+
+       aln    bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i
+              nIndelEnd] [-k maxSeedDiff] [-l seedLen] [-t nThrds] [-cRN]  [-M
+              misMsc]  [-O  gapOsc]  [-E  gapEsc]  [-q trimQual] <in.db.fasta>
+              <in.query.fq> > <out.sai>
+
+              Find the SA coordinates of the input reads. Maximum  maxSeedDiff
+              differences  are  allowed  in  the first seedLen subsequence and
+              maximum maxDiff differences are allowed in the whole sequence.
+
+              OPTIONS:
+
+              -n NUM    Maximum edit distance if the  value  is  INT,  or  the
+                        fraction  of  missing alignments given 2% uniform base
+                        error rate if FLOAT. In the latter case,  the  maximum
+                        edit  distance  is  automatically chosen for different
+                        read lengths. [0.04]
+
+              -o INT    Maximum number of gap opens [1]
+
+              -e INT    Maximum number of gap extensions, -1 for  k-difference
+                        mode (disallowing long gaps) [-1]
+
+              -d INT    Disallow  a  long  deletion  within INT bp towards the
+                        3'-end [16]
+
+              -i INT    Disallow an indel within INT bp towards the ends [5]
+
+              -l INT    Take the first INT subsequence  as  seed.  If  INT  is
+                        larger  than  the query sequence, seeding will be dis-
+                        abled. For long reads, this option  typically  ranges
+                        from 25 to 35 for `-k 2'. [inf]
+
+              -k INT    Maximum edit distance in the seed [2]
+
+              -t INT    Number of threads (multi-threading mode) [1]
+
+              -M INT    Mismatch  penalty.  BWA will not search for suboptimal
+                        hits with a score lower than (bestScore-misMsc). [3]
+
+              -O INT    Gap open penalty [11]
+
+              -E INT    Gap extension penalty [4]
+
+              -R INT    Proceed with suboptimal alignments  if  there  are  no
+                        more  than  INT  equally  best  hits. This option only
+                        affects paired-end mapping. Increasing this  threshold
+                        helps  to  improve the pairing accuracy at the cost of
+                        speed, especially for short reads (~32bp).
+
+              -c        Reverse query but not complement it, which is required
+                        for  alignment  in  the  color  space. (Disabled since
+                        0.6.x)
+
+              -N        Disable iterative search. All hits with no  more  than
+                        maxDiff  differences  will be found. This mode is much
+                        slower than the default.
+
+              -q INT    Parameter for read trimming. BWA trims a read down  to
+                        argmax_x{\sum_{i=x+1}^l(INT-q_i)}  if  q_l<INT where l
+                        is the original read length. [0]
+
+              -I        The input is in the Illumina 1.3+ read format (quality
+                        equals ASCII-64).
+
+              -B INT    Length  of  barcode starting from the 5'-end. When INT
+                        is positive, the barcode of each read will be  trimmed
+                        before  mapping and will be written at the BC SAM tag.
+                        For paired-end reads, the barcodes from both ends  are
+                        concatenated. [0]
+
+              -b        Specify  the  input read sequence file is the BAM for-
+                        mat. For paired-end data, two ends in a pair  must  be
+                        grouped  together  and  options  -1  or -2 are usually
+                        applied to specify which end should be mapped. Typical
+                        command  lines  for mapping paired-end data in the BAM
+                        format are:
+
+                            bwa aln ref.fa -b1 reads.bam > 1.sai
+                            bwa aln ref.fa -b2 reads.bam > 2.sai
+                            bwa sampe ref.fa 1.sai 2.sai reads.bam reads.bam >
+                        aln.sam
+
+              -0        When  -b  is  specified,  only use single-end reads in
+                        mapping.
+
+              -1        When -b is specified, only use the  first  read  in  a
+                        read  pair  in  mapping (skip single-end reads and the
+                        second reads).
+
+              -2        When -b is specified, only use the second  read  in  a
+                        read pair in mapping.
+
+
+       samse  bwa samse [-n maxOcc] <in.db.fasta> <in.sai> <in.fq> > <out.sam>
+
+              Generate alignments in the SAM format  given  single-end  reads.
+              Repetitive hits will be randomly chosen.
+
+              OPTIONS:
+
+              -n INT    Maximum  number  of alignments to output in the XA tag
+                        for reads paired properly. If a read has more than INT
+                        hits, the XA tag will not be written. [3]
+
+              -r STR    Specify    the   read   group   in   a   format   like
+                        `@RG\tID:foo\tSM:bar'. [null]
+
+
+       sampe  bwa sampe [-a maxInsSize] [-o maxOcc] [-n maxHitPaired] [-N max-
+              HitDis] [-P] <in.db.fasta> <in1.sai> <in2.sai> <in1.fq> <in2.fq>
+              > <out.sam>
+
+              Generate alignments in the SAM format  given  paired-end  reads.
+              Repetitive read pairs will be placed randomly.
+
+              OPTIONS:
+
+              -a INT  Maximum  insert  size  for  a read pair to be considered
+                      being mapped properly. Since 0.4.5, this option is  only
+                      used  when  there are not enough good alignment to infer
+                      the distribution of insert sizes. [500]
+
+              -o INT  Maximum occurrences of a read for pairing. A  read  with
+                      more  occurrences  will be treated as a single-end read.
+                      Reducing this parameter helps faster pairing. [100000]
+
+              -P      Load the entire FM-index  into  memory  to  reduce  disk
+                      operations (base-space reads only). With this option, at
+                      least 1.25N bytes of memory are required, where N is the
+                      length of the genome.
+
+              -n INT  Maximum number of alignments to output in the XA tag for
+                      reads paired properly. If a read has more than INT hits,
+                      the XA tag will not be written. [3]
+
+              -N INT  Maximum number of alignments to output in the XA tag for
+                      discordant  read  pairs  (excluding  singletons).  If  a
+                      read  has  more  than  INT  hits, the XA tag will not be
+                      written. [10]
+
+              -r STR  Specify   the   read   group   in    a    format    like
+                      `@RG\tID:foo\tSM:bar'. [null]
+
+
+       bwasw  bwa  bwasw  [-a  matchScore]  [-b  mmPen]  [-q  gapOpenPen]  [-r
+              gapExtPen] [-t nThreads] [-w bandWidth] [-T thres] [-s  hspIntv]
+              [-z  zBest]  [-N  nHspRev]  [-c thresCoef] <in.db.fasta> <in.fq>
+              [mate.fq]
+
+              Align query  sequences  in  the  in.fq  file.  When  mate.fq  is
+              present,  perform paired-end alignment. The paired-end mode only
+              works for reads from Illumina short-insert libraries. In the paired-
+              end  mode, BWA-SW may still output split alignments but they are
+              all marked as not properly paired; the mate positions  will  not
+              be written if the mate has multiple local hits.
+
+              OPTIONS:
+
+              -a INT    Score of a match [1]
+
+              -b INT    Mismatch penalty [3]
+
+              -q INT    Gap open penalty [5]
+
+              -r INT    Gap  extension  penalty.  The penalty for a contiguous
+                        gap of size k is q+k*r. [2]
+
+              -t INT    Number of threads in the multi-threading mode [1]
+
+              -w INT    Band width in the banded alignment [33]
+
+              -T INT    Minimum score threshold divided by a [37]
+
+              -c FLOAT  Coefficient  for  threshold  adjustment  according  to
+                        query length. Given an l-long query, the threshold for
+                        a hit to be retained is a*max{T,c*log(l)}. [5.5]
+
+              -z INT    Z-best heuristics. Higher -z increases accuracy at the
+                        cost of speed. [1]
+
+              -s INT    Maximum SA interval size for initiating a seed. Higher
+                        -s increases accuracy at the cost of speed. [3]
+
+              -N INT    Minimum  number  of  seeds  supporting  the  resultant
+                        alignment to skip reverse alignment. [5]
+
+
+SAM ALIGNMENT FORMAT
+       The  output  of  the  `aln'  command is binary and designed for BWA use
+       only. BWA outputs the final  alignment  in  the  SAM  (Sequence  Align-
+       ment/Map) format. Each line consists of:
+
+
+       +----+-------+----------------------------------------------------------+
+       |Col | Field |                       Description                        |
+       +----+-------+----------------------------------------------------------+
+       | 1  | QNAME | Query (pair) NAME                                        |
+       | 2  | FLAG  | bitwise FLAG                                             |
+       | 3  | RNAME | Reference sequence NAME                                  |
+       | 4  | POS   | 1-based leftmost POSition/coordinate of clipped sequence |
+       | 5  | MAPQ  | MAPping Quality (Phred-scaled)                           |
+       | 6  | CIGAR | extended CIGAR string                                    |
+       | 7  | MRNM  | Mate Reference sequence NaMe (`=' if same as RNAME)      |
+       | 8  | MPOS  | 1-based Mate POSition                                    |
+       | 9  | ISIZE | Inferred insert SIZE                                     |
+       |10  | SEQ   | query SEQuence on the same strand as the reference       |
+       |11  | QUAL  | query QUALity (ASCII-33 gives the Phred base quality)    |
+       |12  | OPT   | variable OPTional fields in the format TAG:VTYPE:VALUE   |
+       +----+-------+----------------------------------------------------------+
+
+       Each bit in the FLAG field is defined as:
+
+
+               +----+--------+---------------------------------------+
+               |Chr |  Flag  |              Description              |
+               +----+--------+---------------------------------------+
+               | p  | 0x0001 | the read is paired in sequencing      |
+               | P  | 0x0002 | the read is mapped in a proper pair   |
+               | u  | 0x0004 | the query sequence itself is unmapped |
+               | U  | 0x0008 | the mate is unmapped                  |
+               | r  | 0x0010 | strand of the query (1 for reverse)   |
+               | R  | 0x0020 | strand of the mate                    |
+               | 1  | 0x0040 | the read is the first read in a pair  |
+               | 2  | 0x0080 | the read is the second read in a pair |
+               | s  | 0x0100 | the alignment is not primary          |
+               | f  | 0x0200 | QC failure                            |
+               | d  | 0x0400 | optical or PCR duplicate              |
+               +----+--------+---------------------------------------+
+
+       Please  check  <http://samtools.sourceforge.net>  for the format spec-
+       ification and the tools for post-processing the alignment.
+
+       BWA generates the following optional fields. Tags starting with `X' are
+       specific to BWA.
+
+
+               +----+------------------------------------------------+
+               |Tag |                    Meaning                     |
+               +----+------------------------------------------------+
+               |NM  | Edit distance                                  |
+               |MD  | Mismatching positions/bases                    |
+               |AS  | Alignment score                                |
+               |BC  | Barcode sequence                               |
+               +----+------------------------------------------------+
+               |X0  | Number of best hits                            |
+               |X1  | Number of suboptimal hits found by BWA         |
+               |XN  | Number of ambiguous bases in the reference     |
+               |XM  | Number of mismatches in the alignment          |
+               |XO  | Number of gap opens                            |
+               |XG  | Number of gap extensions                       |
+               |XT  | Type: Unique/Repeat/N/Mate-sw                  |
+               |XA  | Alternative hits; format: (chr,pos,CIGAR,NM;)* |
+               +----+------------------------------------------------+
+               |XS  | Suboptimal alignment score                     |
+               |XF  | Support from forward/reverse alignment         |
+               |XE  | Number of supporting seeds                     |
+               +----+------------------------------------------------+
+
+       Note  that XO and XG are generated by BWT search while the CIGAR string
+       by Smith-Waterman alignment. These two tags may  be  inconsistent  with
+       the CIGAR string. This is not a bug.
+
+
+NOTES ON SHORT-READ ALIGNMENT
+   Alignment Accuracy
+       When  seeding is disabled, BWA guarantees to find an alignment contain-
+       ing maximum maxDiff differences including maxGapO gap  opens  which  do
+       not  occur  within nIndelEnd bp towards either end of the query. Longer
+       gaps may be found if maxGapE is positive, but it is not  guaranteed  to
+       find  all  hits. When seeding is enabled, BWA further requires that the
+       first seedLen subsequence contains no  more  than  maxSeedDiff  differ-
+       ences.
+
+       When gapped alignment is disabled, BWA is expected to generate the same
+       alignment as Eland version 1, the Illumina alignment program.  However,
+       as BWA changes `N' in the database sequence to random nucleotides, hits
+       to these random sequences will also be counted. As a  consequence,  BWA
+       may mark a unique hit as a repeat, if the random sequences happen to be
+       identical to the sequences which should be unique in the database.
+
+       By default, if the best hit is not  highly  repetitive  (controlled  by
+       -R), BWA also finds all hits containing one more mismatch; otherwise, BWA
+       finds all equally best hits only. Base quality  is  NOT  considered  in
+       evaluating  hits.  In the paired-end mode, BWA pairs all hits it found.
+       It further performs Smith-Waterman alignment for unmapped reads to res-
+       cue  reads with a high error rate, and for high-quality anomalous pairs
+       to fix potential alignment errors.
+
+
+   Estimating Insert Size Distribution
+       BWA estimates the insert size distribution per 256*1024 read pairs.  It
+       first  collects  pairs of reads with both ends mapped with a single-end
+       quality 20 or higher and then calculates median (Q2), lower and  higher
+       quartile  (Q1  and  Q3).  It estimates the mean and the variance of the
+       insert size distribution from  pairs  whose  insert  sizes  are  within
+       interval  [Q1-2(Q3-Q1), Q3+2(Q3-Q1)]. The maximum distance x for a pair
+       considered to be properly paired (SAM flag 0x2) is calculated by  solv-
+       ing  equation  Phi((x-mu)/sigma)=x/L*p0, where mu is the mean, sigma is
+       the standard error of the insert size distribution, L is the length  of
+       the  genome,  p0  is  prior of anomalous pair and Phi() is the standard
+       cumulative distribution function.  For  mapping  Illumina  short-insert
+       reads  to  the  human  genome, x is about 6-7 sigma away from the mean.
+       Quartiles, mean, variance and x will be printed to the  standard  error
+       output.
+
+
+   Memory Requirement
+       With  bwtsw algorithm, 5GB memory is required for indexing the complete
+       human genome sequences. For short reads, the aln  command  uses  ~3.2GB
+       memory and the sampe command uses ~5.4GB.
+
+
+   Speed
+       Indexing the human genome sequences takes 3 hours with bwtsw algorithm.
+       Indexing smaller genomes with IS algorithms  is  faster,  but  requires
+       more memory.
+
+       The  speed  of alignment is largely determined by the error rate of the
+       query sequences (r). Firstly, BWA runs much  faster  for  near  perfect
+       hits  than for hits with many differences, and it stops searching for a
+       hit with l+2 differences if a l-difference hit is found. This means BWA
+       will  be  very  slow if r is high because in this case BWA has to visit
+       hits with many differences and looking for  these  hits  is  expensive.
+       Secondly,  the  alignment algorithm behind makes the speed sensitive to
+       [k log(N)/m], where k is the maximum allowed differences, N the size of
+       database and m the length of a query. In practice, we choose k w.r.t. r
+       and therefore r is the leading factor. I would not recommend to use BWA
+       on data with r>0.02.
+
+       Pairing  is  slower  for  shorter reads. This is mainly because shorter
+       reads have more spurious hits and converting SA coordinates to  chromo-
+       somal coordinates is very costly.
+
+
+CHANGES IN BWA-0.6
+       Since  version  0.6,  BWA has been able to work with a reference genome
+       longer than 4GB.  This feature makes it possible to integrate the  for-
+       ward  and  reverse complemented genome in one FM-index, which speeds up
+       both BWA-short and BWA-SW. As a tradeoff, BWA uses more memory  because
+       it has to keep all positions and ranks in 64-bit integers, twice larger
+       than 32-bit integers used in the previous versions.
+
+       The latest BWA-SW also works for paired-end reads longer than 100bp. In
+       comparison  to  BWA-short,  BWA-SW tends to be more accurate for highly
+       unique reads and more robust to relative  long  INDELs  and  structural
+       variants.   Nonetheless,  BWA-short usually has higher power to distin-
+       guish the optimal hit from many suboptimal hits. The choice of the map-
+       ping algorithm may depend on the application.
+
+
+SEE ALSO
+       BWA    website   <http://bio-bwa.sourceforge.net>,   Samtools   website
+       <http://samtools.sourceforge.net>
+
+
+AUTHOR
+       Heng Li at the Sanger Institute wrote the key source  codes  and  inte-
+       grated    the    following    codes   for   BWT   construction:   bwtsw
+       <http://i.cs.hku.hk/~ckwong3/bwtsw/>, implemented by Chi-Kwong Wong  at
+       the        University       of       Hong       Kong       and       IS
+       <http://yuta.256.googlepages.com/sais> originally proposed by  Nong  Ge
+       <http://www.cs.sysu.edu.cn/nong/>  at  the  Sun  Yat-Sen University and
+       implemented by Yuta Mori.
+
+
+LICENSE AND CITATION
+       The full BWA package is distributed under GPLv3 as it uses source codes
+       from  BWT-SW  which  is covered by GPL. Sorting, hash table, BWT and IS
+       libraries are distributed under the MIT license.
+
+       If you use the  BWA-backtrack  algorithm,  please  cite  the  following
+       paper:
+
+       Li  H. and Durbin R. (2009) Fast and accurate short read alignment with
+       Burrows-Wheeler  transform.  Bioinformatics,  25,   1754-1760.   [PMID:
+       19451168]
+
+       If you use the BWA-SW algorithm, please cite:
+
+       Li  H.  and Durbin R. (2010) Fast and accurate long-read alignment with
+       Burrows-Wheeler  transform.   Bioinformatics,   26,   589-595.   [PMID:
+       20080505]
+
+       If you use the fastmap component of BWA, please cite:
+
+       Li  H. (2012) Exploring single-sample SNP and INDEL calling with whole-
+       genome  de  novo  assembly.  Bioinformatics,  28,   1838-1844.   [PMID:
+       22569178]
+
+       The BWA-MEM algorithm has not been published yet.
+
+
+HISTORY
+       BWA  is  largely influenced by BWT-SW. It uses source codes from BWT-SW
+       and mimics its binary file formats; BWA-SW resembles BWT-SW in  several
+       ways.  The  initial  idea  about BWT-based alignment also came from the
+       group who developed BWT-SW. At the same time, BWA is  different  enough
+       from  BWT-SW. The short-read alignment algorithm bears no similarity to
+       Smith-Waterman algorithm any more. While BWA-SW learns from BWT-SW,  it
+       introduces  heuristics that can hardly be applied to the original algo-
+       rithm. In all, BWA does not guarantee to find all local  hits  as  what
+       BWT-SW  is  designed  to  do, but it is much faster than BWT-SW on both
+       short and long query sequences.
+
+       I started to write the first piece of codes on 24 May 2008 and got  the
+       initial  stable  version  on  02  June  2008. During this period, I was
+       acquainted that Professor Tak-Wah  Lam,  the  first  author  of  BWT-SW
+       paper,  was collaborating with Beijing Genomics Institute on SOAP2, the
+       successor to SOAP (Short Oligonucleotide Analysis Package).  SOAP2  has
+       come  out in November 2008. According to the SourceForge download page,
+       the third BWT-based short read aligner, bowtie, was first  released  in
+       August  2008.  At  the time of writing this manual, at least three more
+       BWT-based short-read aligners are being implemented.
+
+       The BWA-SW algorithm is a new component of BWA.  It  was  conceived  in
+       November 2008 and implemented ten months later.
+
+       The  BWA-MEM  algorithm  is based on an algorithm finding super-maximal
+       exact matches (SMEMs), which was first published with the fermi  assem-
+       bler paper in 2012. I first implemented the basic SMEM algorithm in the
+       fastmap command for an experiment and then extended the basic algorithm
+       and  added  the extension part in February 2013 to make BWA-MEM a fully
+       featured mapper.
+
+
+
+
+bwa-0.7.0                      27 February 2013                         bwa(1)
diff --git a/bwamem.c b/bwamem.c
new file mode 100644
index 0000000..52dc7fb
--- /dev/null
+++ b/bwamem.c
@@ -0,0 +1,848 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <math.h>
+#ifdef HAVE_PTHREAD
+#include <pthread.h>
+#endif
+
+#include "kstring.h"
+#include "bwamem.h"
+#include "bntseq.h"
+#include "ksw.h"
+#include "kvec.h"
+#include "ksort.h"
+#include "utils.h"
+
+/* Theory on probability and scoring *ungapped* alignment
+ *
+ * s'(a,b) = log[P(b|a)/P(b)] = log[4P(b|a)], assuming uniform base distribution
+ * s'(a,a) = log(4), s'(a,b) = log(4e/3), where e is the error rate
+ *
+ * Scale s'(a,b) to s(a,a) s.t. s(a,a)=x. Then s(a,b) = x*s'(a,b)/log(4), or conversely: s'(a,b)=s(a,b)*log(4)/x
+ *
+ * If the matching score is x and mismatch penalty is -y, we can compute error rate e:
+ *   e = .75 * exp[-log(4) * y/x]
+ *
+ * log P(seq) = \sum_i log P(b_i|a_i) = \sum_i {s'(a,b) - log(4)}
+ *   = \sum_i { s(a,b)*log(4)/x - log(4) } = log(4) * (S/x - l)
+ *
+ * where S=\sum_i s(a,b) is the alignment score. Converting to the phred scale:
+ *   Q(seq) = -10/log(10) * log P(seq) = 10*log(4)/log(10) * (l - S/x) = 6.02 * (l - S/x)
+ *
+ *
+ * Gap open (zero gap): q' = log[P(gap-open)], r' = log[P(gap-ext)] (see Durbin et al. (1998) Section 4.1)
+ * Then q = x*log[P(gap-open)]/log(4), r = x*log[P(gap-ext)]/log(4)
+ *
+ * When there are gaps, l should be the length of alignment matches (i.e. the M operator in CIGAR)
+ */
+
+mem_opt_t *mem_opt_init()
+{ // allocate a zero-filled option set on the heap, fill in every default and return it
+	mem_opt_t *opt = calloc(1, sizeof(mem_opt_t));
+	opt->flag = 0;
+	// scoring scheme (a/b feed the score matrix, q/r are the gap costs) and band width
+	opt->a = 1, opt->b = 4, opt->q = 6, opt->r = 1, opt->w = 100;
+	opt->pen_clip = 5; // compared against the end-to-end extension score in mem_chain2aln()
+	// seeding: read by mem_insert_seed() and smem_next()
+	opt->min_seed_len = 19;
+	opt->split_factor = 1.5;
+	opt->split_width = 10;
+	opt->max_occ = 10000;
+	// chaining and chain filtering: read by test_and_merge() and mem_chain_flt()
+	opt->max_chain_gap = 10000;
+	opt->mask_level = 0.50;
+	opt->chain_drop_ratio = 0.50;
+	// the remaining defaults are not consumed by the functions defined alongside this one
+	opt->pen_unpaired = 9; opt->max_ins = 10000; opt->max_matesw = 100;
+	opt->chunk_size = 10000000; opt->n_threads = 1;
+	mem_fill_scmat(opt->a, opt->b, opt->mat); // must come after a and b have been set
+	return opt;
+}
+
+void mem_fill_scmat(int a, int b, int8_t mat[25])
+{ // fill a row-major 5x5 score matrix: indices 0-3 are the regular bases, index 4 the ambiguous base
+	int row, col;
+	int8_t *cell = mat;
+	for (row = 0; row < 4; ++row) {
+		for (col = 0; col < 4; ++col) *cell++ = row == col? a : -b; // +a on the diagonal, -b elsewhere
+		*cell++ = 0; // a regular base against the ambiguous base scores zero
+	}
+	for (col = 0; col < 5; ++col) *cell++ = 0; // the whole ambiguous-base row scores zero
+}
+
+/***************************
+ * SMEM iterator interface *
+ ***************************/
+
+struct __smem_i { // iterator state used by smem_next() to walk along one query
+	const bwt_t *bwt; // index to search; stored by smem_itr_init() and not freed by smem_itr_destroy()
+	const uint8_t *query; // query bases; values above 3 are skipped as ambiguous by smem_next()
+	int start, len; // query offset the next search starts from, and the query length
+	bwtintv_v *matches; // matches; to be returned by smem_next()
+	bwtintv_v *sub;     // sub-matches inside the longest match; temporary
+	bwtintv_v *tmpvec[2]; // temporary arrays
+};
+
+smem_i *smem_itr_init(const bwt_t *bwt)
+{ // create an iterator bound to `bwt'; release it with smem_itr_destroy()
+	int t;
+	smem_i *it = calloc(1, sizeof(smem_i));
+	it->bwt = bwt; // only the pointer is kept; the index itself is not copied
+	it->matches = calloc(1, sizeof(bwtintv_v)); // every vector header starts zero-filled, i.e. empty
+	it->sub = calloc(1, sizeof(bwtintv_v));
+	for (t = 0; t < 2; ++t)
+		it->tmpvec[t] = calloc(1, sizeof(bwtintv_v));
+	return it;
+}
+
+void smem_itr_destroy(smem_i *itr)
+{ // free the four interval vectors (element storage first, then the header) and the iterator itself
+	bwtintv_v *vec[4];
+	int t;
+	vec[0] = itr->tmpvec[0]; vec[1] = itr->tmpvec[1]; vec[2] = itr->matches; vec[3] = itr->sub;
+	for (t = 0; t < 4; ++t) { free(vec[t]->a); free(vec[t]); }
+	free(itr); // itr->bwt and itr->query are left untouched
+}
+
+void smem_set_query(smem_i *itr, int len, const uint8_t *query)
+{ // point the iterator at a new query (the buffer is referenced, not copied) and rewind it
+	itr->len = len;
+	itr->query = query;
+	itr->start = 0; // the next smem_next() call searches from the first base
+}
+
+const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width)
+{ // return the next batch of matches (stored in itr and overwritten by the next call), or 0 once the query is exhausted
+	int i, max, max_i, ori_start;
+	itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0; // empty all working vectors
+	if (itr->start >= itr->len || itr->start < 0) return 0; // nothing left to search
+	while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases
+	if (itr->start == itr->len) return 0;
+	ori_start = itr->start;
+	itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM
+	if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here
+	for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match
+		bwtintv_t *p = &itr->matches->a[i];
+		int len = (uint32_t)p->info - (p->info>>32); // low 32 bits of info minus high 32 bits (the query start, see mem_insert_seed())
+		if (max < len) max = len, max_i = i;
+	}
+	if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] <= split_width) { // if the longest SMEM is unique and long
+		int j;
+		bwtintv_v *a = itr->tmpvec[0]; // reuse tmpvec[0] for merging
+		bwtintv_t *p = &itr->matches->a[max_i];
+		bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, itr->matches->a[max_i].x[2]+1, itr->sub, itr->tmpvec); // starting from the middle of the longest MEM
+		i = j = 0; a->n = 0;
+		while (i < itr->matches->n && j < itr->sub->n) { // ordered merge
+			int64_t xi = itr->matches->a[i].info>>32<<32 | (itr->len - (uint32_t)itr->matches->a[i].info); // merge key: high half of info on top, (len - low half) below
+			int64_t xj = itr->sub->a[j].info>>32<<32 | (itr->len - (uint32_t)itr->sub->a[j].info);
+			if (xi < xj) {
+				kv_push(bwtintv_t, *a, itr->matches->a[i]);
+				++i;
+			} else if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1 && (uint32_t)itr->sub->a[j].info > ori_start) { // keep a sub-match only if it is at least max/2 long and its low-half offset lies beyond ori_start
+				kv_push(bwtintv_t, *a, itr->sub->a[j]);
+				++j;
+			} else ++j; // otherwise drop the sub-match
+		}
+		for (; i < itr->matches->n; ++i) kv_push(bwtintv_t, *a, itr->matches->a[i]); // flush whatever is left of the original matches
+		for (; j < itr->sub->n; ++j) // flush the remaining sub-matches under the same filter
+			if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1 && (uint32_t)itr->sub->a[j].info > ori_start)
+				kv_push(bwtintv_t, *a, itr->sub->a[j]);
+		kv_copy(bwtintv_t, *itr->matches, *a); // the merged list replaces the original matches
+	}
+	return itr->matches;
+}
+
+/********************************
+ * Chaining while finding SMEMs *
+ ********************************/
+
+typedef struct {
+	int64_t rbeg; // start on the reference, in the coordinate returned by bwt_sa() (see mem_insert_seed())
+	int32_t qbeg, len; // start on the query and seed length
+} mem_seed_t;
+
+typedef struct {
+	int n, m; // seeds in use and allocated capacity of `seeds'; mem_chain_flt() temporarily negates n as a keep mark
+	int64_t pos; // tree key: reference start of the seed that opened the chain
+	mem_seed_t *seeds; // heap array; doubled by test_and_merge() when full
+} mem_chain_t;
+
+typedef struct { size_t n, m; mem_chain_t *a;  } mem_chain_v; // array of chains driven by the kv_*() macros in mem_chain()
+
+#include "kbtree.h"
+
+#define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos))
+KBTREE_INIT(chn, mem_chain_t, chain_cmp) // instantiate the `chn' tree of chains; chain_cmp yields -1/0/1 on pos
+
+static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t *p)
+{ // try to absorb seed p into chain c: return 1 if it was appended or is redundant, 0 if it does not fit
+	int64_t qend, rend, x, y;
+	const mem_seed_t *last = &c->seeds[c->n-1];
+	qend = last->qbeg + last->len;
+	rend = last->rbeg + last->len;
+	if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend)
+		return 1; // contained seed; do nothing
+	x = p->qbeg - last->qbeg; // always non-negative
+	y = p->rbeg - last->rbeg;
+	if (y >= 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain
+		if (c->n == c->m) { // the seed array is full: double its capacity
+			c->m <<= 1;
+			c->seeds = realloc(c->seeds, c->m * sizeof(mem_seed_t));
+		}
+		c->seeds[c->n++] = *p;
+		return 1;
+	}
+	return 0; // request to add a new chain
+}
+
+static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *itr)
+{ // turn each reference occurrence of each usable match into a seed and chain the seeds in `tree'
+	const bwtintv_v *a;
+	int split_len = (int)(opt->min_seed_len * opt->split_factor + .499);
+	split_len = split_len < itr->len? split_len : itr->len; // never longer than the query
+	while ((a = smem_next(itr, split_len, opt->split_width)) != 0) { // to find all SMEM and some internal MEM
+		int i;
+		for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start
+			bwtintv_t *p = &a->a[i];
+			int slen = (uint32_t)p->info - (p->info>>32); // seed length
+			int64_t k;
+			if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive
+			for (k = 0; k < p->x[2]; ++k) { // one seed per occurrence in the interval
+				mem_chain_t tmp, *lower, *upper;
+				mem_seed_t s;
+				int to_add = 0;
+				s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference
+				s.qbeg = p->info>>32;
+				s.len  = slen;
+				if (kb_size(tree)) {
+					kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain
+					if (!lower || !test_and_merge(opt, lower, &s)) to_add = 1; // no candidate chain, or the seed does not fit it
+				} else to_add = 1; // empty tree: the first seed always opens a chain
+				if (to_add) { // add the seed as a new chain
+					tmp.n = 1; tmp.m = 4;
+					tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t));
+					tmp.seeds[0] = s;
+					kb_putp(chn, tree, &tmp);
+				}
+			}
+		}
+	}
+}
+
+void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn)
+{ // print one line per chain to stdout: the seed count, then one tab-separated record per seed
+	int i, j;
+	for (i = 0; i < chn->n; ++i) {
+		mem_chain_t *p = &chn->a[i];
+		printf("%d", p->n);
+		for (j = 0; j < p->n; ++j) {
+			bwtint_t pos;
+			int is_rev, ref_id;
+			pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev); // also sets is_rev, used below to pick the strand character
+			if (is_rev) pos -= p->seeds[j].len - 1;
+			bns_cnt_ambi(bns, pos, p->seeds[j].len, &ref_id); // called for the reference id it stores in ref_id
+			printf("\t%d,%d,%ld(%s:%c%ld)", p->seeds[j].len, p->seeds[j].qbeg, (long)p->seeds[j].rbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
+		}
+		putchar('\n');
+	}
+}
+
+mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq)
+{ // seed the query against bwt and return the chains; chain.a and each chain's seed array are not freed here
+	mem_chain_v chain;
+	smem_i *itr;
+	kbtree_t(chn) *tree;
+
+	kv_init(chain);
+	if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match
+	tree = kb_init(chn, KB_DEFAULT_SIZE);
+	itr = smem_itr_init(bwt);
+	smem_set_query(itr, len, seq);
+	mem_insert_seed(opt, tree, itr); // fills the tree with chains
+
+	kv_resize(mem_chain_t, chain, kb_size(tree)); // room for every chain in the tree
+
+	#define traverse_func(p_) (chain.a[chain.n++] = *(p_))
+	__kb_traverse(mem_chain_t, tree, traverse_func); // copy each chain by value, so the seed arrays outlive the tree
+	#undef traverse_func
+
+	smem_itr_destroy(itr);
+	kb_destroy(chn, tree);
+	return chain;
+}
+
+/********************
+ * Filtering chains *
+ ********************/
+
+typedef struct {
+	int beg, end, w; // query interval [beg,end) spanned by a chain, and the chain weight
+	void *p, *p2; // p: the chain itself; p2: first lower-ranked chain found to overlap it significantly, or 0
+} flt_aux_t;
+
+#define flt_lt(a, b) ((a).w > (b).w)
+KSORT_INIT(mem_flt, flt_aux_t, flt_lt) // "less than" is inverted on purpose: sorting puts the heaviest chain first
+
+int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains)
+{ // drop chains shadowed by a much heavier overlapping chain; survivors are packed at the front of chains[] and their number is returned
+	flt_aux_t *a;
+	int i, j, n;
+	if (n_chn <= 1) return n_chn; // no need to filter
+	a = malloc(sizeof(flt_aux_t) * n_chn);
+	for (i = 0; i < n_chn; ++i) { // chain weight = min(bases covered on the query, bases covered on the reference)
+		mem_chain_t *c = &chains[i];
+		int64_t end;
+		int w = 0, tmp;
+		for (j = 0, end = 0; j < c->n; ++j) { // query coverage; overlapping parts of seeds are counted once
+			const mem_seed_t *s = &c->seeds[j];
+			if (s->qbeg >= end) w += s->len;
+			else if (s->qbeg + s->len > end) w += s->qbeg + s->len - end;
+			end = end > s->qbeg + s->len? end : s->qbeg + s->len;
+		}
+		tmp = w; w = 0; // restart the count; without this the reference coverage could never be the smaller value
+		for (j = 0, end = 0; j < c->n; ++j) { // reference coverage, same scheme
+			const mem_seed_t *s = &c->seeds[j];
+			if (s->rbeg >= end) w += s->len;
+			else if (s->rbeg + s->len > end) w += s->rbeg + s->len - end;
+			end = end > s->rbeg + s->len? end : s->rbeg + s->len; // advance along the reference, not the query
+		}
+		w = w < tmp? w : tmp;
+		a[i].beg = c->seeds[0].qbeg;
+		a[i].end = c->seeds[c->n-1].qbeg + c->seeds[c->n-1].len;
+		a[i].w = w; a[i].p = c; a[i].p2 = 0;
+	}
+	ks_introsort(mem_flt, n_chn, a);
+	{ // reorder chains such that the best chain appears first
+		mem_chain_t *swap;
+		swap = malloc(sizeof(mem_chain_t) * n_chn);
+		for (i = 0; i < n_chn; ++i) {
+			swap[i] = *((mem_chain_t*)a[i].p);
+			a[i].p = &chains[i]; // as we will memcpy() below, a[i].p is changed
+		}
+		memcpy(chains, swap, sizeof(mem_chain_t) * n_chn);
+		free(swap);
+	}
+	for (i = 1, n = 1; i < n_chn; ++i) { // a[0..n) holds the chains kept so far; the heaviest one is always kept
+		for (j = 0; j < n; ++j) {
+			int b_max = a[j].beg > a[i].beg? a[j].beg : a[i].beg;
+			int e_min = a[j].end < a[i].end? a[j].end : a[i].end;
+			if (e_min > b_max) { // have overlap
+				int min_l = a[i].end - a[i].beg < a[j].end - a[j].beg? a[i].end - a[i].beg : a[j].end - a[j].beg;
+				if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap
+					if (a[j].p2 == 0) a[j].p2 = a[i].p; // remember the best chain shadowed by a[j]; it is kept as well
+					if (a[i].w < a[j].w * opt->chain_drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1)
+						break; // a[i] is much lighter than an overlapping kept chain: drop it
+				}
+			}
+		}
+		if (j == n) a[n++] = a[i]; // if have no significant overlap with better chains, keep it.
+	}
+	for (i = 0; i < n; ++i) { // mark chains to be kept
+		mem_chain_t *c = (mem_chain_t*)a[i].p;
+		if (c->n > 0) c->n = -c->n; // a negative seed count is the temporary "keep" mark
+		c = (mem_chain_t*)a[i].p2;
+		if (c && c->n > 0) c->n = -c->n;
+	}
+	free(a);
+	for (i = 0; i < n_chn; ++i) { // free discarded chains
+		mem_chain_t *c = &chains[i];
+		if (c->n >= 0) {
+			free(c->seeds);
+			c->n = c->m = 0;
+		} else c->n = -c->n; // restore the real seed count of a kept chain
+	}
+	for (i = n = 0; i < n_chn; ++i) { // squeeze out discarded chains
+		if (chains[i].n > 0) {
+			if (n != i) chains[n++] = chains[i];
+			else ++n;
+		}
+	}
+	return n;
+}
+
+/******************************
+ * De-overlap single-end hits *
+ ******************************/
+
+#define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb))))
+KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt) // sort order: higher score first, ties broken by smaller rb, then smaller qb
+
+int mem_sort_and_dedup(int n, mem_alnreg_t *a)
+{ // sort hits in mem_ars order, drop each hit repeating the (score, rb, qb) of its predecessor, return the new count
+	int kept, cur;
+	if (n <= 1) return n;
+	ks_introsort(mem_ars, n, a);
+	for (cur = 1; cur < n; ++cur) { // pass 1: flag a duplicate by emptying its query interval
+		if (a[cur].score != a[cur-1].score || a[cur].rb != a[cur-1].rb || a[cur].qb != a[cur-1].qb) continue;
+		a[cur].qe = a[cur].qb;
+	}
+	for (kept = 1, cur = 1; cur < n; ++cur) { // pass 2: compact in place; a[0] always stays
+		if (a[cur].qe <= a[cur].qb) continue; // empty query interval: flagged above, or already empty on input
+		if (kept != cur) a[kept] = a[cur];
+		++kept;
+	}
+	return kept;
+}
+
+void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORTANT: must run mem_sort_and_dedup() before calling this function
+{ // similar to the loop in mem_chain_flt()
+	int i, k, tmp;
+	kvec_t(int) z; // indices of the hits accepted as primary so far
+	if (n == 0) return;
+	kv_init(z);
+	for (i = 0; i < n; ++i) a[i].sub = 0, a[i].secondary = -1; // NOTE(review): sub_n is incremented below but not reset here - presumably zeroed when the hit is created; confirm
+	tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r; // score margin used for sub_n: the larger of a+b and q+r
+	kv_push(int, z, 0); // a[0] is always primary
+	for (i = 1; i < n; ++i) {
+		for (k = 0; k < z.n; ++k) { // compare a[i] with every primary hit, in order
+			int j = z.a[k];
+			int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb;
+			int e_min = a[j].qe < a[i].qe? a[j].qe : a[i].qe;
+			if (e_min > b_max) { // have overlap
+				int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? a[i].qe - a[i].qb : a[j].qe - a[j].qb;
+				if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap
+					if (a[j].sub == 0) a[j].sub = a[i].score; // first overlapping hit sets the sub score of a[j]
+					if (a[j].score - a[i].score <= tmp) ++a[j].sub_n; // count overlapping hits scoring within the margin
+					break;
+				}
+			}
+		}
+		if (k == z.n) kv_push(int, z, i); // no significant overlap with any primary hit: a[i] is primary too
+		else a[i].secondary = z.a[k]; // otherwise record the index of the primary hit it overlaps
+	}
+	free(z.a);
+}
+
+/****************************************
+ * Construct the alignment from a chain *
+ ****************************************/
+
+static inline int cal_max_gap(const mem_opt_t *opt, int qlen)
+{ // gap length bound for a stretch of qlen query bases: (qlen*a - q)/r + 1, raised to at least 1, then capped at 2*w
+	int gap = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.);
+	if (gap < 1) gap = 1;
+	return gap < opt->w<<1? gap : opt->w<<1; // the cap is applied last, so it wins over the floor of 1
+}
+
+void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) // extend the seeds of chain c, longest first, and append the resulting regions to av
+{ // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds
+	int i, k;
+	int64_t rlen, rmax[2], tmp, max = 0;
+	const mem_seed_t *s;
+	uint8_t *rseq = 0;
+	uint64_t *srt;
+
+	if (c->n == 0) return;
+	// get the max possible span
+	rmax[0] = l_pac<<1; rmax[1] = 0;
+	for (i = 0; i < c->n; ++i) {
+		int64_t b, e;
+		const mem_seed_t *t = &c->seeds[i];
+		b = t->rbeg - (t->qbeg + cal_max_gap(opt, t->qbeg));
+		e = t->rbeg + t->len + ((l_query - t->qbeg - t->len) + cal_max_gap(opt, l_query - t->qbeg - t->len));
+		rmax[0] = rmax[0] < b? rmax[0] : b;
+		rmax[1] = rmax[1] > e? rmax[1] : e;
+		if (t->len > max) max = t->len;
+	}
+	rmax[0] = rmax[0] > 0? rmax[0] : 0;
+	rmax[1] = rmax[1] < l_pac<<1? rmax[1] : l_pac<<1;
+	if (rmax[0] < l_pac && l_pac < rmax[1]) { // crossing the forward-reverse boundary; then choose one side
+		if (l_pac - rmax[0] > rmax[1] - l_pac) rmax[1] = l_pac;
+		else rmax[0] = l_pac;
+	}
+	// retrieve the reference sequence
+	rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen);
+	if (rlen != rmax[1] - rmax[0]) { free(rseq); return; } // unexpected length: give up, but release rseq as the normal exit does
+
+	srt = malloc(c->n * 8);
+	for (i = 0; i < c->n; ++i)
+		srt[i] = (uint64_t)c->seeds[i].len<<32 | i; // seed length in the high half, seed index in the low half
+	ks_introsort_64(c->n, srt);
+
+	for (k = c->n - 1; k >= 0; --k) { // walk srt from its last entry down
+		mem_alnreg_t *a;
+		s = &c->seeds[(uint32_t)srt[k]];
+
+		for (i = 0; i < av->n; ++i) { // test whether extension has been made before
+			mem_alnreg_t *p = &av->a[i];
+			int64_t rd;
+			int qd, w, max_gap;
+			if (s->rbeg < p->rb || s->rbeg + s->len > p->re || s->qbeg < p->qb || s->qbeg + s->len > p->qe) continue; // not fully contained
+			// qd: distance ahead of the seed on query; rd: on reference
+			qd = s->qbeg - p->qb; rd = s->rbeg - p->rb;
+			max_gap = cal_max_gap(opt, qd < rd? qd : rd); // the maximal gap allowed in regions ahead of the seed
+			w = max_gap < opt->w? max_gap : opt->w; // bounded by the band width
+			if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit
+			// similar to the previous four lines, but this time we look at the region behind
+			qd = p->qe - (s->qbeg + s->len); rd = p->re - (s->rbeg + s->len);
+			max_gap = cal_max_gap(opt, qd < rd? qd : rd);
+			w = max_gap < opt->w? max_gap : opt->w;
+			if (qd - rd < w && rd - qd < w) break;
+		}
+		if (i < av->n) continue; // covered by an earlier region: skip this seed
+
+		a = kv_pushp(mem_alnreg_t, *av);
+		memset(a, 0, sizeof(mem_alnreg_t));
+
+		if (s->qbeg) { // left extension
+			uint8_t *rs, *qs;
+			int qle, tle, gtle, gscore;
+			qs = malloc(s->qbeg);
+			for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; // reversed query prefix, so the extension runs away from the seed
+			tmp = s->rbeg - rmax[0];
+			rs = malloc(tmp);
+			for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; // reversed reference prefix
+			a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle, &gtle, &gscore);
+			// check whether we prefer to reach the end of the query
+			if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; // local hits
+			else a->qb = 0, a->rb = s->rbeg - gtle; // reach the end
+			free(qs); free(rs);
+		} else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; // the seed starts at query position 0: nothing to extend on the left
+
+		if (s->qbeg + s->len != l_query) { // right extension
+			int qle, tle, qe, re, gtle, gscore;
+			qe = s->qbeg + s->len;
+			re = s->rbeg + s->len - rmax[0];
+			a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle, &gtle, &gscore);
+			// similar to the above
+			if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle;
+			else a->qe = l_query, a->re = rmax[0] + re + gtle;
+		} else a->qe = l_query, a->re = s->rbeg + s->len; // the seed ends at the query end: nothing to extend on the right
+		if (bwa_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re);
+
+		// compute seedcov
+		for (i = 0, a->seedcov = 0; i < c->n; ++i) {
+			const mem_seed_t *t = &c->seeds[i];
+			if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained
+				a->seedcov += t->len; // this is not very accurate, but for approx. mapQ, this is good enough
+		}
+	}
+	free(srt); free(rseq);
+}
+
+/*****************************
+ * Basic hit->SAM conversion *
+ *****************************/
+
+static inline int infer_bw(int l1, int l2, int score, int a, int q, int r)
+{ // estimate the band width needed to reproduce an alignment of lengths l1 and l2 with the given score
+	const int shorter = l1 < l2? l1 : l2, diff = abs(l1 - l2);
+	int bw;
+	if (diff == 0 && l1 * a - score < (q + r)<<1) return 0; // to get equal alignment length, we need at least two gaps
+	bw = (int)((double)(shorter * a - score - q) / r + 1.);
+	return bw < diff? diff : bw; // never narrower than the length difference
+}
+
+void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m)
+{ // Append one SAM line for read $s to $str. $p_ is the hit to print (NULL: emit an unmapped record); $m is the mate's hit (NULL: not paired); $is_hard selects hard ('H') instead of soft ('S') clipping
+#define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1)
+	int score, n_cigar, is_rev = 0, rid, mid, copy_mate = 0, NM = -1;
+	uint32_t *cigar = 0;
+	int64_t pos;
+	bwahit_t ptmp, *p = &ptmp; // work on a private copy so that the caller's hit is never modified
+
+	if (!p_) { // in this case, generate an unmapped alignment
+		memset(&ptmp, 0, sizeof(bwahit_t));
+		ptmp.rb = ptmp.re = -1;
+	} else ptmp = *p_;
+	p->flag |= m? 1 : 0; // is paired in sequencing
+	p->flag |= !is_mapped(p)? 4 : 0; // 0x4: the read itself is unmapped
+	p->flag |= m && !is_mapped(m)? 8 : 0; // 0x8: the mate is unmapped
+	if (m && !is_mapped(p) && is_mapped(m)) { // an unmapped read borrows the coordinate of its mapped mate
+		p->rb = m->rb; p->re = m->re; p->qb = 0; p->qe = s->l_seq;
+		copy_mate = 1;
+	}
+	p->flag |= p->rb >= bns->l_pac? 0x10 : 0; // is reverse strand
+	p->flag |= m && m->rb >= bns->l_pac? 0x20 : 0; // is mate on reverse strand
+	kputs(s->name, str); kputc('\t', str);
+	if (is_mapped(p)) { // has a coordinate, no matter whether it is mapped or copied from the mate
+		int sam_flag = p->flag&0xffff; // the flag that will be outputed to SAM; it is not always the same as p->flag. Keep every bit below the internal 0x10000: a 0xff mask dropped the 0x100 (secondary) bit
+		if (p->flag&0x10000) sam_flag |= 0x100; // internal bit 0x10000: report as secondary but still print SEQ/QUAL
+		if (!copy_mate) {
+			int w2;
+			w2 = infer_bw(p->qe - p->qb, p->re - p->rb, p->score, mat[0], q, r);
+			w2 = w2 < w? w2 : w; // never use a band wider than the caller's $w
+			cigar = bwa_gen_cigar(mat, q, r, w2, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM);
+			p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened)
+		} else n_cigar = 0, cigar = 0;
+		pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev);
+		bns_cnt_ambi(bns, pos, p->re - p->rb, &rid);
+		kputw(sam_flag, str); kputc('\t', str);
+		kputs(bns->anns[rid].name, str); kputc('\t', str); kputuw(pos - bns->anns[rid].offset + 1, str); kputc('\t', str);
+		kputw(p->qual, str); kputc('\t', str);
+		if (n_cigar) {
+			int i, clip5, clip3;
+			clip5 = is_rev? s->l_seq - p->qe : p->qb; // the clips swap ends when the read is printed on the reverse strand
+			clip3 = is_rev? p->qb : s->l_seq - p->qe;
+			if (clip5) { kputw(clip5, str); kputc("SH"[(is_hard!=0)], str); }
+			for (i = 0; i < n_cigar; ++i) {
+				kputw(cigar[i]>>4, str); kputc("MIDSH"[cigar[i]&0xf], str);
+			}
+			if (clip3) { kputw(clip3, str); kputc("SH"[(is_hard!=0)], str); }
+		} else kputc('*', str);
+	} else { // no coordinate
+		kputw(p->flag, str);
+		kputs("\t*\t0\t0\t*", str);
+		rid = -1;
+	}
+	if (m && is_mapped(m)) { // then print mate pos and isize
+		pos = bns_depos(bns, m->rb < bns->l_pac? m->rb : m->re - 1, &is_rev);
+		bns_cnt_ambi(bns, pos, m->re - m->rb, &mid);
+		kputc('\t', str);
+		if (mid == rid) kputc('=', str);
+		else kputs(bns->anns[mid].name, str);
+		kputc('\t', str); kputuw(pos - bns->anns[mid].offset + 1, str);
+		kputc('\t', str);
+		if (mid == rid) { // the insert size is only reported when both ends are on the same reference sequence
+			int64_t p0 = p->rb < bns->l_pac? p->rb : (bns->l_pac<<1) - 1 - p->rb;
+			int64_t p1 = m->rb < bns->l_pac? m->rb : (bns->l_pac<<1) - 1 - m->rb;
+			kputw(p0 - p1 + (p0 > p1? 1 : -1), str);
+		} else kputw(0, str);
+		kputc('\t', str);
+	} else kputsn("\t*\t0\t0\t", 7, str);
+	if (p->flag&0x100) { // for secondary alignments, don't write SEQ and QUAL
+		kputsn("*\t*", 3, str);
+	} else if (!(p->flag&0x10)) { // print SEQ and QUAL, the forward strand
+		int i, qb = 0, qe = s->l_seq;
+		if (!(p->flag&4) && is_hard) qb = p->qb, qe = p->qe; // with hard clipping only the aligned part of a mapped read is printed
+		ks_resize(str, str->l + (qe - qb) + 1);
+		for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]];
+		kputc('\t', str);
+		if (s->qual) { // printf qual
+			ks_resize(str, str->l + (qe - qb) + 1);
+			for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i];
+			str->s[str->l] = 0;
+		} else kputc('*', str);
+	} else { // the reverse strand
+		int i, qb = 0, qe = s->l_seq;
+		if (!(p->flag&4) && is_hard) qb = p->qb, qe = p->qe;
+		ks_resize(str, str->l + (qe - qb) + 1);
+		for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]]; // reverse complement
+		kputc('\t', str);
+		if (s->qual) { // printf qual
+			ks_resize(str, str->l + (qe - qb) + 1);
+			for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i]; // qualities are reversed as well
+			str->s[str->l] = 0;
+		} else kputc('*', str);
+	}
+	if (NM >= 0) { kputsn("\tNM:i:", 6, str); kputw(NM, str); } // NM stays -1 unless a CIGAR was generated above
+	if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); }
+	if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); }
+	if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); }
+	if (s->comment) { kputc('\t', str); kputs(s->comment, str); }
+	kputc('\n', str);
+	free(cigar);
+#undef is_mapped
+}
+
+/************************
+ * Integrated interface *
+ ************************/
+
+int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a)
+{ // Approximate the single-end mapping quality of region $a; the returned value is clamped to [0,60]
+	int mapq, l, sub = a->sub? a->sub : opt->min_seed_len * opt->a; // without a recorded suboptimal score, assume a competitor scoring like a minimal seed
+	double identity;
+	sub = a->csub > sub? a->csub : sub; // a tandem hit competes as well
+	if (sub >= a->score) return 0; // the best hit is not better than its competitor
+	l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb; // the longer of the query span and the reference span
+	mapq = a->score? (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0;
+	identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; // score lost against a perfect match of length l, counted in units of a+b
+	mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq; // scale down by identity^2 when identity is below 95%
+	if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499); // extra penalty growing with the number of suboptimal hits
+	if (mapq > 60) mapq = 60;
+	if (mapq < 0) mapq = 0;
+	return mapq;
+}
+
+void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h)
+{ // Convert aligned region $a into hit $h; every field of $h is overwritten
+	h->rb = a->rb; h->re = a->re; h->qb = a->qb; h->qe = a->qe;
+	h->score = a->score;
+	h->sub = a->secondary >= 0? -1 : a->sub > a->csub? a->sub : a->csub; // -1 for a secondary hit; otherwise the larger of sub and csub
+	h->qual = 0; // quality unset
+	h->flag = a->secondary >= 0? 0x100 : 0; // only the "secondary" bit is set
+}
+
+void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const bwahit_t *m)
+{ // Generate the SAM line(s) of read $s from regions $a and store the text in $s->sam. $extra_flag is OR-ed into the flag of each hit; $m is the mate's hit (NULL if unpaired)
+	int k;
+	kstring_t str;
+	str.l = str.m = 0; str.s = 0;
+	if (a->n > 0) {
+		int mapq0 = -1; // mapQ of the first region; later regions never get a higher mapQ
+		for (k = 0; k < a->n; ++k) {
+			bwahit_t h;
+			mem_alnreg_t *p = &a->a[k];
+			if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; // secondary hits are only printed with MEM_F_ALL
+			if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue; // skip a secondary hit scoring below half of its parent
+			mem_alnreg2hit(p, &h);
+			bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s->seq, &h.qb, &h.qe, &h.rb, &h.re);
+			h.flag |= extra_flag;
+			if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) h.flag |= 0x10000; // print the sequence, but flag as secondary (for Picard)
+			h.qual = p->secondary >= 0? 0 : mem_approx_mapq_se(opt, p);
+			if (k == 0) mapq0 = h.qual;
+			else if (h.qual > mapq0) h.qual = mapq0;
+			bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m);
+		}
+	} else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP, m); // no region at all: print an unmapped record
+	s->sam = str.s; // the buffer is handed over to $s; mem_process_seqs() frees it after printing
+}
+
+mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
+{ // Chain seeds of $seq, filter the chains and extend them into aligned regions. $seq is converted in place; primary/secondary marking is not done here (see mem_align1())
+	int i;
+	mem_chain_v chn;
+	mem_alnreg_v regs;
+
+	for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so
+		seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]];
+
+	chn = mem_chain(opt, bwt, l_seq, (uint8_t*)seq);
+	chn.n = mem_chain_flt(opt, chn.n, chn.a);
+	if (bwa_verbose >= 4) mem_print_chain(bns, &chn);
+
+	kv_init(regs);
+	for (i = 0; i < chn.n; ++i) {
+		mem_chain_t *p = &chn.a[i];
+		mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs);
+		free(chn.a[i].seeds); // the seeds of a chain are no longer needed once it has been turned into regions
+	}
+	free(chn.a);
+	regs.n = mem_sort_and_dedup(regs.n, regs.a);
+	return regs; // regs.a is not freed here; in this file worker2() frees it
+}
+
+mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq)
+{ // the difference from mem_align1_core() lies in that this routine calls mem_mark_primary_se()
+	mem_alnreg_v ar;
+	ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq); // note: $seq is converted to the 2-bit encoding in place
+	mem_mark_primary_se(opt, ar.n, ar.a);
+	return ar; // ar.a is returned to the caller without being freed
+}
+
+// API-only routine: turn aligned region $ar of $query (length $l_query) into a mem_aln_t with mapQ, CIGAR (soft clips included), NM, strand, reference id and position
+mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar)
+{
+	mem_aln_t a;
+	int w2, qb = ar->qb, qe = ar->qe, NM, score, is_rev;
+	int64_t pos, rb = ar->rb, re = ar->re; // local copies: $ar is const while bwa_fix_xref() takes pointers to the coordinates
+	memset(&a, 0, sizeof(mem_aln_t));
+	a.mapq = mem_approx_mapq_se(opt, ar);
+	bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re);
+	w2 = infer_bw(qe - qb, re - rb, ar->score, opt->a, opt->q, opt->r);
+	w2 = w2 < opt->w? w2 : opt->w; // never use a band wider than opt->w
+	a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM);
+	a.NM = NM;
+	pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev);
+	a.is_rev = is_rev;
+	if (qb != 0 || qe != l_query) { // add clipping to CIGAR
+		int clip5, clip3;
+		clip5 = is_rev? l_query - qe : qb; // the clips swap ends for a reverse-strand alignment
+		clip3 = is_rev? qb : l_query - qe;
+		a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2)); // room for up to two extra operations
+		if (clip5) {
+			memmove(a.cigar+1, a.cigar, a.n_cigar * 4);
+			a.cigar[0] = clip5<<4|3; // operation code 3 is 'S' in "MIDSH"
+			++a.n_cigar;
+		}
+		if (clip3) a.cigar[a.n_cigar++] = clip3<<4|3;
+	}
+	a.rid = bns_pos2rid(bns, pos);
+	a.pos = pos - bns->anns[a.rid].offset; // offset within the reference sequence (no +1 is added, unlike in the SAM output)
+	return a; // a.cigar is heap memory that is not freed here
+}
+
+typedef struct { // per-thread arguments shared by worker1() and worker2()
+	int start, step, n; // this worker handles items start, start+step, ... of $n sequences (of n>>1 pairs in the paired-end mode)
+	const mem_opt_t *opt;
+	const bwt_t *bwt;
+	const bntseq_t *bns;
+	const uint8_t *pac;
+	const mem_pestat_t *pes; // insert size statistics; only read by worker2() in the paired-end mode
+	bseq1_t *seqs; // the batch of query sequences
+	mem_alnreg_v *regs; // regs[i] holds the regions of seqs[i]: written by worker1(), consumed and freed by worker2()
+} worker_t;
+
+static void *worker1(void *data)
+{ // Phase 1: compute the aligned regions of every sequence assigned to this worker; $data is a worker_t*. Always returns 0
+	worker_t *w = (worker_t*)data;
+	int i;
+	if (!(w->opt->flag&MEM_F_PE)) {
+		for (i = w->start; i < w->n; i += w->step)
+			w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq);
+	} else { // for PE we align the two ends in the same thread in case the 2nd read is of worse quality, in which case some threads may be faster/slower
+		for (i = w->start; i < w->n>>1; i += w->step) { // here $i indexes pairs: ends are at 2i and 2i+1
+			w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq);
+			w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq);
+		}
+	}
+	return 0;
+}
+
+static void *worker2(void *data)
+{ // Phase 2: generate SAM text from the regions computed by worker1() and free these regions; $data is a worker_t*. Always returns 0
+	extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]);
+	worker_t *w = (worker_t*)data;
+	int i;
+	if (!(w->opt->flag&MEM_F_PE)) {
+		for (i = w->start; i < w->n; i += w->step) {
+			mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a);
+			mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0);
+			free(w->regs[i].a);
+		}
+	} else {
+		int n = 0; // sum of the values returned by mem_sam_pe(), reported below as the number of mate-SW
+		for (i = w->start; i < w->n>>1; i += w->step) { // $i indexes pairs: ends are at 2i and 2i+1
+			n += mem_sam_pe(w->opt, w->bns, w->pac, w->pes, i, &w->seqs[i<<1], &w->regs[i<<1]);
+			free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a);
+		}
+		fprintf(stderr, "[M::%s@%d] performed mate-SW for %d reads\n", __func__, w->start, n);
+	}
+	return 0;
+}
+
+void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs)
+{ // Align a batch of $n sequences, write their SAM lines to stdout and free the name/comment/seq/qual/sam buffers of every sequence
+	int i;
+	worker_t *w;
+	mem_alnreg_v *regs;
+	mem_pestat_t pes[4]; // only filled by mem_pestat() in the paired-end mode
+
+	w = calloc(opt->n_threads, sizeof(worker_t));
+	regs = malloc(n * sizeof(mem_alnreg_v));
+	for (i = 0; i < opt->n_threads; ++i) { // worker i takes items i, i+n_threads, i+2*n_threads, ...
+		worker_t *p = &w[i];
+		p->start = i; p->step = opt->n_threads; p->n = n;
+		p->opt = opt; p->bwt = bwt; p->bns = bns; p->pac = pac;
+		p->seqs = seqs; p->regs = regs;
+		p->pes = &pes[0];
+	}
+#ifdef HAVE_PTHREAD
+	if (opt->n_threads == 1) { // single thread: run both phases directly without creating threads
+		worker1(w);
+		if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes);
+		worker2(w);
+	} else {
+		pthread_t *tid;
+		tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t));
+		for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker1, &w[i]);
+		for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); // all regions must be available before the insert size is estimated
+		if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes);
+		for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker2, &w[i]);
+		for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0);
+		free(tid);
+	}
+#else
+	worker1(w);
+	if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes);
+	worker2(w);
+#endif
+	for (i = 0; i < n; ++i) { // output in the input order
+		fputs(seqs[i].sam, stdout);
+		free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam);
+	}
+	free(regs); free(w); // regs[i].a has already been freed by worker2()
+}
diff --git a/bwamem.h b/bwamem.h
new file mode 100644
index 0000000..c2f124c
--- /dev/null
+++ b/bwamem.h
@@ -0,0 +1,145 @@
+#ifndef BWAMEM_H_
+#define BWAMEM_H_
+
+#include "bwt.h"
+#include "bntseq.h"
+#include "bwa.h"
+
+#define MEM_MAPQ_COEF 30.0 // scaling coefficient of the mapQ estimate in mem_approx_mapq_se()
+#define MEM_MAPQ_MAX  60 // NOTE(review): presumably the mapQ ceiling; the mapQ code in bwamem.c and bwamem_pair.c caps with a literal 60 -- confirm
+
+struct __smem_i;
+typedef struct __smem_i smem_i; // opaque iterator type; the struct itself is not defined in this header
+
+#define MEM_F_HARDCLIP  0x1 // use hard clipping ('H') instead of soft clipping ('S') in the SAM output
+#define MEM_F_PE        0x2 // paired-end mode: sequences 2i and 2i+1 form a read pair
+#define MEM_F_NOPAIRING 0x4 // skip the pairing step in mem_sam_pe(); each end is then reported from its own hits
+#define MEM_F_ALL       0x8 // also output secondary alignments
+#define MEM_F_NO_MULTI  0x10 // flag additional primary hits of a read as secondary in SAM while still printing SEQ and QUAL
+
+typedef struct { // alignment parameters
+	int a, b, q, r;         // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r
+	int pen_unpaired;       // phred-scaled penalty for unpaired reads
+	int pen_clip;           // clipping penalty. This score is not deducted from the DP score.
+	int w;                  // band width
+
+	int flag;               // see MEM_F_* macros
+	int min_seed_len;       // minimum seed length
+	float split_factor;     // split into a seed if MEM is longer than min_seed_len*split_factor
+	int split_width;        // split into a seed if its occurrence is smaller than this value
+	int max_occ;            // skip a seed if its occurrence is larger than this value
+	int max_chain_gap;      // do not chain seed if it is max_chain_gap-bp away from the closest seed
+	int n_threads;          // number of threads
+	int chunk_size;         // process chunk_size-bp sequences in a batch
+	float mask_level;       // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits
+	float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain
+	int max_ins;            // when estimating insert size distribution, skip pairs with insert longer than this value
+	int max_matesw;         // perform maximally max_matesw rounds of mate-SW for each end
+	int8_t mat[25];         // scoring matrix; mat[0] == 0 if unset
+} mem_opt_t;
+
+typedef struct { // one aligned region of a query
+	int64_t rb, re; // [rb,re): reference sequence in the alignment
+	int qb, qe;     // [qb,qe): query sequence in the alignment
+	int score;      // best SW score
+	int sub;        // 2nd best SW score
+	int csub;       // SW score of a tandem hit
+	int sub_n;      // approximate number of suboptimal hits
+	int seedcov;    // length of regions covered by seeds
+	int secondary;  // index of the parent hit shadowing the current hit; <0 if primary
+} mem_alnreg_t;
+
+typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v;
+
+typedef struct { // insert size statistics of one read pair orientation, filled by mem_pestat()
+	int low, high, failed; // [low,high]: distance range accepted for a proper pair; failed!=0 if this orientation is to be skipped
+	double avg, std; // mean and standard deviation of the insert size
+} mem_pestat_t;
+
+typedef struct { // TODO: This is an intermediate struct only. Better get rid of it.
+	int64_t rb, re; // [rb,re): reference interval; bwamem.c and bwamem_pair.c set both to -1 for "no hit"
+	int qb, qe, flag, qual; // [qb,qe): query interval; flag: SAM flag bits plus the internal bit 0x10000 (see mem_sam_se()); qual: mapping quality
+	// optional info
+	int score, sub; // printed as AS and XS by bwa_hit2sam() when non-negative
+} bwahit_t;
+
+typedef struct { // This struct is only used for the convenience of API.
+	int rid; // index of the reference sequence in bns->anns[]
+	int pos; // position minus the offset of that reference sequence (see mem_reg2aln())
+	uint32_t is_rev:1, mapq:8, NM:23; // strand, mapping quality and the NM value returned by bwa_gen_cigar()
+	int n_cigar; // number of CIGAR operations
+	uint32_t *cigar; // each operation is length<<4|op, where op indexes "MIDSH"; heap memory
+} mem_aln_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	smem_i *smem_itr_init(const bwt_t *bwt); // SMEM iterator API, implemented outside this header; presumably creates an iterator on $bwt -- confirm
+	void smem_itr_destroy(smem_i *itr); // presumably releases an iterator returned by smem_itr_init() -- confirm
+	void smem_set_query(smem_i *itr, int len, const uint8_t *query); // presumably sets the query to iterate over -- confirm
+	const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width); // presumably returns the next batch of intervals -- confirm
+
+	mem_opt_t *mem_opt_init(void); // presumably allocates an option struct with default values -- confirm
+	void mem_fill_scmat(int a, int b, int8_t mat[25]); // presumably fills scoring matrix $mat from match score $a and mismatch penalty $b -- confirm
+
+	/**
+	 * Align a batch of sequences and generate the alignments in the SAM format
+	 *
+	 * This routine requires $seqs[i].{l_seq,seq,name} and writes $seqs[i].sam.
+	 * Note that $seqs[i].sam may consist of several SAM lines if the
+	 * corresponding sequence has multiple primary hits.
+	 *
+	 * In the paired-end mode (i.e. MEM_F_PE is set in $opt->flag), query
+	 * sequences must be interleaved: $n must be an even number and the 2i-th
+	 * sequence and the (2i+1)-th sequence constitute a read pair. In this
+	 * mode, there should be enough (typically >50) unique pairs for the
+	 * routine to infer the orientation and insert size.
+	 *
+	 * @param opt    alignment parameters
+	 * @param bwt    FM-index of the reference sequence
+	 * @param bns    Information of the reference
+	 * @param pac    2-bit encoded reference
+	 * @param n      number of query sequences
+	 * @param seqs   query sequences; $seqs[i].seq/sam to be modified after the call
+	 */
+	void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs);
+
+	/**
+	 * Find the aligned regions for one query sequence
+	 *
+	 * Note that this routine does not generate CIGAR. CIGAR should be
+	 * generated later by bwa_gen_cigar() defined in bwa.c.
+	 *
+	 * @param opt    alignment parameters
+	 * @param bwt    FM-index of the reference sequence
+	 * @param bns    Information of the reference
+	 * @param pac    2-bit encoded reference
+	 * @param l_seq  length of query sequence
+	 * @param seq    query sequence; conversion ACGTN/acgtn=>01234 to be applied
+	 *
+	 * @return       list of aligned regions.
+	 */
+	mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq);
+
+	mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar); // convert region $ar into an alignment with CIGAR, mapQ, NM and position; the returned cigar is heap memory
+
+	/**
+	 * Infer the insert size distribution from interleaved alignment regions
+	 *
+	 * This function can be called after mem_align1(), as long as paired-end
+	 * reads are properly interleaved.
+	 *
+	 * @param opt    alignment parameters
+	 * @param l_pac  length of concatenated reference sequence
+	 * @param n      number of query sequences; must be an even number
+	 * @param regs   region array of size $n; 2i-th and (2i+1)-th elements constitute a pair
+	 * @param pes    inferred insert size distribution (output)
+	 */
+	void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/bwamem_pair.c b/bwamem_pair.c
new file mode 100644
index 0000000..9ff12b3
--- /dev/null
+++ b/bwamem_pair.c
@@ -0,0 +1,314 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <math.h>
+#include "kstring.h"
+#include "bwamem.h"
+#include "kvec.h"
+#include "utils.h"
+#include "ksw.h"
+
+#define MIN_RATIO     0.8 // a pair is used by mem_pestat() only if, for each end, the competing score is at most MIN_RATIO times the best score
+#define MIN_DIR_CNT   10 // minimum number of pairs needed to analyze an orientation
+#define MIN_DIR_RATIO 0.05 // skip an orientation having fewer pairs than this fraction of the most frequent orientation
+#define OUTLIER_BOUND 2.0 // values beyond this multiple of (p75-p25) outside the quartiles are ignored when computing mean and std.dev
+#define MAPPING_BOUND 3.0 // same multiple, used for the proper-pair boundaries
+#define MAX_STDDEV    4.0 // the proper-pair boundaries are compared against mean -/+ MAX_STDDEV std.dev
+
+static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist)
+{ // Return the orientation code (0..3, printed as FF, FR, RF, RR by mem_pestat()) of two hits starting at $b1 and $b2, and write their distance to $*dist
+	int64_t p2;
+	int r1 = (b1 >= l_pac), r2 = (b2 >= l_pac); // a coordinate >= l_pac lies on the reverse strand
+	p2 = r1 == r2? b2 : (l_pac<<1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand
+	*dist = p2 > b1? p2 - b1 : b1 - p2;
+	return (r1 == r2? 0 : 1) ^ (p2 > b1? 0 : 3); // bit 0: different strands; both bits are flipped when read 2 does not lie after read 1
+}
+
+static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r)
+{ // Return the score of the first region in $r that significantly overlaps r->a[0] on the query, or min_seed_len*a if there is none
+	int j;
+	for (j = 1; j < r->n; ++j) { // choose unique alignment
+		int b_max = r->a[j].qb > r->a[0].qb? r->a[j].qb : r->a[0].qb;
+		int e_min = r->a[j].qe < r->a[0].qe? r->a[j].qe : r->a[0].qe;
+		if (e_min > b_max) { // have overlap
+			int min_l = r->a[j].qe - r->a[j].qb < r->a[0].qe - r->a[0].qb? r->a[j].qe - r->a[j].qb : r->a[0].qe - r->a[0].qb; // length of the shorter of the two query intervals
+			if (e_min - b_max >= min_l * opt->mask_level) break; // significant overlap
+		}
+	}
+	return j < r->n? r->a[j].score : opt->min_seed_len * opt->a;
+}
+
+void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4])
+{ // Estimate the insert size distribution of each of the four orientations from pairs whose two best hits have no strong competitor; results go to $pes[0..3], progress messages to stderr
+	int i, d, max;
+	uint64_v isize[4]; // insert sizes collected per orientation
+	memset(pes, 0, 4 * sizeof(mem_pestat_t));
+	memset(isize, 0, sizeof(isize)); // sized from the array itself rather than from an unrelated kvec_t(int)
+	for (i = 0; i < n>>1; ++i) {
+		int dir;
+		int64_t is;
+		mem_alnreg_v *r[2];
+		r[0] = (mem_alnreg_v*)&regs[i<<1|0];
+		r[1] = (mem_alnreg_v*)&regs[i<<1|1];
+		if (r[0]->n == 0 || r[1]->n == 0) continue; // both ends must have at least one hit
+		if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue; // skip ends whose best hit has a strong competitor
+		if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue;
+		dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is);
+		if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is);
+	}
+	if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, (long)isize[0].n, (long)isize[1].n, (long)isize[2].n, (long)isize[3].n); // casts make the size_t counts match %ld
+	for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two.
+		mem_pestat_t *r = &pes[d];
+		uint64_v *q = &isize[d];
+		int p25, p50, p75, x;
+		if (q->n < MIN_DIR_CNT) {
+			fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
+			r->failed = 1; free(q->a); // release the collected sizes on this path too; only q->n is read afterwards
+			continue;
+		} else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
+		ks_introsort_64(q->n, q->a);
+		p25 = q->a[(int)(.25 * q->n + .499)];
+		p50 = q->a[(int)(.50 * q->n + .499)];
+		p75 = q->a[(int)(.75 * q->n + .499)];
+		r->low  = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499);
+		if (r->low < 1) r->low = 1;
+		r->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499);
+		fprintf(stderr, "[M::%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75);
+		fprintf(stderr, "[M::%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r->low, r->high);
+		for (i = x = 0, r->avg = 0; i < q->n; ++i) // mean over the values inside [low,high]; x counts them
+			if (q->a[i] >= r->low && q->a[i] <= r->high)
+				r->avg += q->a[i], ++x;
+		r->avg /= x;
+		for (i = 0, r->std = 0; i < q->n; ++i)
+			if (q->a[i] >= r->low && q->a[i] <= r->high)
+				r->std += (q->a[i] - r->avg) * (q->a[i] - r->avg);
+		r->std = sqrt(r->std / x);
+		fprintf(stderr, "[M::%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r->avg, r->std);
+		r->low  = (int)(p25 - MAPPING_BOUND * (p75 - p25) + .499);
+		r->high = (int)(p75 + MAPPING_BOUND * (p75 - p25) + .499);
+		if (r->low  > r->avg - MAX_STDDEV * r->std) r->low  = (int)(r->avg - MAX_STDDEV * r->std + .499); // widen to at least avg - MAX_STDDEV*std
+		if (r->high < r->avg + MAX_STDDEV * r->std) r->high = (int)(r->avg + MAX_STDDEV * r->std + .499); // widen to at least avg + MAX_STDDEV*std (the test used to compare against avg - MAX_STDDEV*std)
+		if (r->low < 1) r->low = 1;
+		fprintf(stderr, "[M::%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r->low, r->high);
+		free(q->a);
+	}
+	for (d = 0, max = 0; d < 4; ++d) // size of the most frequent orientation
+		max = max > isize[d].n? max : isize[d].n;
+	for (d = 0; d < 4; ++d)
+		if (pes[d].failed == 0 && isize[d].n < max * MIN_DIR_RATIO) {
+			pes[d].failed = 1;
+			fprintf(stderr, "[M::%s] skip orientation %c%c\n", __func__, "FR"[d>>1&1], "FR"[d&1]);
+		}
+}
+
+int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma)
+{ // Mate-SW: given hit $a of one end, align the mate ($ms, length $l_ms) by SW inside the window implied by $pes for every orientation lacking a consistent mate hit in $ma; new hits are inserted into $ma. Returns the number of SW alignments performed
+	int i, r, skip[4], n = 0;
+	for (r = 0; r < 4; ++r)
+		skip[r] = pes[r].failed? 1 : 0;
+	for (i = 0; i < ma->n; ++i) { // check which orinentation has been found
+		int64_t dist;
+		r = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist);
+		if (dist >= pes[r].low && dist <= pes[r].high)
+			skip[r] = 1;
+	}
+	if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return 0; // consistent pair exist; no need to perform SW
+	for (r = 0; r < 4; ++r) {
+		int is_rev, is_larger;
+		uint8_t *seq, *rev = 0, *ref;
+		int64_t rb, re, len;
+		if (skip[r]) continue;
+		is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate
+		is_larger = !(r>>1); // whether the mate has larger coordinate
+		if (is_rev) {
+			rev = malloc(l_ms); // this is the reverse complement of $ms
+			for (i = 0; i < l_ms; ++i) rev[l_ms - 1 - i] = ms[i] < 4? 3 - ms[i] : 4;
+			seq = rev;
+		} else seq = (uint8_t*)ms;
+		if (!is_rev) {
+			rb = is_larger? a->rb + pes[r].low : a->rb - pes[r].high;
+			re = (is_larger? a->rb + pes[r].high: a->rb - pes[r].low) + l_ms; // if on the same strand, end position should be larger to make room for the seq length
+		} else {
+			rb = (is_larger? a->rb + pes[r].low : a->rb - pes[r].high) - l_ms; // similarly on opposite strands
+			re = is_larger? a->rb + pes[r].high: a->rb - pes[r].low;
+		}
+		if (rb < 0) rb = 0; // clamp the window to [0, 2*l_pac]
+		if (re > l_pac<<1) re = l_pac<<1;
+		ref = bns_get_seq(l_pac, pac, rb, re, &len);
+		if (len == re - rb) { // no funny things happening
+			kswr_t aln;
+			mem_alnreg_t b;
+			int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | opt->min_seed_len;
+			aln = ksw_align(l_ms, seq, len, ref, 5, opt->mat, opt->q, opt->r, xtra, 0);
+			memset(&b, 0, sizeof(mem_alnreg_t));
+			if (aln.score >= opt->min_seed_len) {
+				b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb; // coordinates of a reverse-complemented mate are mapped back to the original read
+				b.qe = is_rev? l_ms - aln.qb : aln.qe + 1; // ends are made exclusive
+				b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb;
+				b.re = is_rev? (l_pac<<1) - (rb + aln.tb) : rb + aln.te + 1;
+				b.score = aln.score;
+				b.csub = aln.score2;
+				b.secondary = -1;
+				b.seedcov = (b.re - b.rb < b.qe - b.qb? b.re - b.rb : b.qe - b.qb) >> 1; // half of the shorter of the reference and query spans
+//				printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re);
+				kv_push(mem_alnreg_t, *ma, b); // make room for a new element
+				// move b s.t. ma is sorted
+				for (i = 0; i < ma->n - 1; ++i) // find the insertion point
+					if (ma->a[i].score < b.score) break;
+				tmp = i;
+				for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1]; // shift the lower-scoring hits by one slot
+				ma->a[i] = b;
+			}
+			++n;
+		}
+		if (rev) free(rev);
+		free(ref);
+	}
+	return n;
+}
+
+int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2])
+{ // Find the best pair of hits (one per end) whose distance fits $pes. Returns its score (0 if there is none, in which case $z is untouched); sets $z[end] to the index of the chosen hit in $a[end], $*sub to the 2nd best pair score and $*n_sub to the number of other pairs scoring close to $*sub
+	extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h);
+	pair64_v v, u; // v: all hits of both ends keyed by forward position; u: candidate pairs
+	int r, i, k, y[4], ret; // y[] keeps the last hit
+	kv_init(v); kv_init(u);
+	for (r = 0; r < 2; ++r) { // loop through read number
+		for (i = 0; i < a[r].n; ++i) {
+			pair64_t key;
+			mem_alnreg_t *e = &a[r].a[i];
+			key.x = e->rb < l_pac? e->rb : (l_pac<<1) - 1 - e->rb; // forward position
+			key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= l_pac)<<1 | r; // score | hit index | strand | read number
+			kv_push(pair64_t, v, key);
+		}
+	}
+	ks_introsort_128(v.n, v.a);
+	y[0] = y[1] = y[2] = y[3] = -1;
+	//for (i = 0; i < v.n; ++i) printf("[%d]\t%d\t%c%ld\n", i, (int)(v.a[i].y&1)+1, "+-"[v.a[i].y>>1&1], (long)v.a[i].x);
+	for (i = 0; i < v.n; ++i) {
+		for (r = 0; r < 2; ++r) { // loop through direction
+			int dir = r<<1 | (v.a[i].y>>1&1), which;
+			if (pes[dir].failed) continue; // invalid orientation
+			which = r<<1 | ((v.a[i].y&1)^1); // strand/read-number code (low 2 bits of y) a partner must have: strand $r, the other read
+			if (y[which] < 0) continue; // no previous hits
+			for (k = y[which]; k >= 0; --k) { // TODO: this is a O(n^2) solution in the worst case; remember to check if this loop takes a lot of time (I doubt)
+				int64_t dist;
+				int q;
+				double ns;
+				pair64_t *p;
+				if ((v.a[k].y&3) != which) continue;
+				dist = (int64_t)v.a[i].x - v.a[k].x;
+				//printf("%d: %lld\n", k, dist);
+				if (dist > pes[dir].high) break; // v is sorted, so earlier hits are even farther away
+				if (dist < pes[dir].low)  continue;
+				ns = (dist - pes[dir].avg) / pes[dir].std; // deviation from the mean insert size in units of std.dev
+				q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. * erfc(fabs(ns) * M_SQRT1_2)) + .499); // .721 = 1/log(4)
+				if (q < 0) q = 0;
+				p = kv_pushp(pair64_t, u);
+				p->y = (uint64_t)k<<32 | i; // the two indices into v
+				p->x = (uint64_t)q<<32 | (hash_64(p->y ^ id<<8) & 0xffffffffU); // pair score in the high 32 bits, a hash in the low 32 bits
+				//printf("[%lld,%lld]\t%d\tdist=%ld\n", v.a[k].x, v.a[i].x, q, (long)dist);
+			}
+		}
+		y[v.a[i].y&3] = i;
+	}
+	if (u.n) { // found at least one proper pair
+		int tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r;
+		ks_introsort_128(u.n, u.a);
+		i = u.a[u.n-1].y >> 32; k = u.a[u.n-1].y << 32 >> 32; // after sorting, the best pair is the last element
+		z[v.a[i].y&1] = v.a[i].y<<32>>34; // index of the best pair
+		z[v.a[k].y&1] = v.a[k].y<<32>>34;
+		ret = u.a[u.n-1].x >> 32;
+		*sub = u.n > 1? u.a[u.n-2].x>>32 : 0;
+		for (i = (long)u.n - 2, *n_sub = 0; i >= 0; --i)
+			if (*sub - (int)(u.a[i].x>>32) <= tmp) ++*n_sub;
+	} else ret = 0, *sub = 0, *n_sub = 0;
+	free(u.a); free(v.a);
+	return ret;
+}
+
+int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2])
+{ // Generate SAM text for read pair $s from regions $a (mate-SW may add hits to $a) and store it in $s[0].sam and $s[1].sam. Returns the sum of the values returned by mem_matesw()
+	extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a);
+	extern void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const bwahit_t *m);
+	extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a);
+	extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h);
+	extern void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p, int is_hard, const bwahit_t *m);
+
+	int n = 0, i, j, z[2], o, subo, n_sub;
+	kstring_t str;
+	mem_alnreg_v b[2]; // copies of the top hits of each end, used as anchors for mate-SW
+	bwahit_t h[2];
+
+	str.l = str.m = 0; str.s = 0;
+	// perform SW for the best alignment
+	kv_init(b[0]); kv_init(b[1]);
+	for (i = 0; i < 2; ++i)
+		for (j = 0; j < a[i].n; ++j)
+			if (a[i].a[j].score >= a[i].a[0].score  - opt->pen_unpaired)
+				kv_push(mem_alnreg_t, b[i], a[i].a[j]);
+	for (i = 0; i < 2; ++i)
+		for (j = 0; j < b[i].n && j < opt->max_matesw; ++j)
+			n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); // hits of end i anchor the search for the other end
+	free(b[0].a); free(b[1].a);
+	mem_mark_primary_se(opt, a[0].n, a[0].a);
+	mem_mark_primary_se(opt, a[1].n, a[1].a);
+	if (opt->flag&MEM_F_NOPAIRING) goto no_pairing;
+	// pairing single-end hits
+	if (a[0].n && a[1].n && (o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) {
+		int is_multi[2], q_pe, extra_flag = 1, score_un, q_se[2];
+		// check if an end has multiple hits even after mate-SW
+		for (i = 0; i < 2; ++i) {
+			for (j = 1; j < a[i].n; ++j)
+				if (a[i].a[j].secondary < 0) break;
+			is_multi[i] = j < a[i].n? 1 : 0;
+		}
+		if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score
+		// compute mapQ for the best SE hit
+		score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired; // score of taking the two best single-end hits as an unpaired alignment
+		//q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0;
+		subo = subo > score_un? subo : score_un;
+		q_pe = (o - subo) * 6;
+		if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499);
+		if (q_pe < 0) q_pe = 0;
+		if (q_pe > 60) q_pe = 60;
+		// the following assumes no split hits
+		if (o > score_un) { // paired alignment is preferred
+			mem_alnreg_t *c[2];
+			c[0] = &a[0].a[z[0]]; c[1] = &a[1].a[z[1]];
+			for (i = 0; i < 2; ++i) {
+				if (c[i]->secondary >= 0) // the chosen hit was shadowed: take its parent's score as sub and mark it as non-secondary
+					c[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2;
+				q_se[i] = mem_approx_mapq_se(opt, c[i]);
+			}
+			q_se[0] = q_se[0] > q_pe? q_se[0] : q_pe < q_se[0] + 40? q_pe : q_se[0] + 40; // pairing may raise the mapQ of an end by at most 40
+			q_se[1] = q_se[1] > q_pe? q_se[1] : q_pe < q_se[1] + 40? q_pe : q_se[1] + 40;
+			extra_flag |= 2; // 0x2: proper pair
+			// cap at the tandem repeat score
+			q_se[0] = q_se[0] < (c[0]->score - c[0]->csub) * 6? q_se[0] : (c[0]->score - c[0]->csub) * 6;
+			q_se[1] = q_se[1] < (c[1]->score - c[1]->csub) * 6? q_se[1] : (c[1]->score - c[1]->csub) * 6;
+		} else { // the unpaired alignment is preferred
+			z[0] = z[1] = 0;
+			q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]);
+			q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]);
+		}
+		mem_alnreg2hit(&a[0].a[z[0]], &h[0]); h[0].qual = q_se[0]; h[0].flag |= 0x40 | extra_flag; // 0x40: first read in the pair
+		bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[0].seq, &h[0].qb, &h[0].qe, &h[0].rb, &h[0].re);
+		mem_alnreg2hit(&a[1].a[z[1]], &h[1]); h[1].qual = q_se[1]; h[1].flag |= 0x80 | extra_flag; // 0x80: second read in the pair
+		bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[1].seq, &h[1].qb, &h[1].qe, &h[1].rb, &h[1].re);
+		bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->flag&MEM_F_HARDCLIP, &h[1]); s[0].sam = strdup(str.s); str.l = 0; // s[0] gets a copy; the buffer is reused for s[1]
+		bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->flag&MEM_F_HARDCLIP, &h[0]); s[1].sam = str.s;
+	} else goto no_pairing;
+	return n;
+
+no_pairing: // report each end from its own hits; the best hit of the other end provides the mate coordinate
+	for (i = 0; i < 2; ++i) {
+		if (a[i].n) {
+			mem_alnreg2hit(&a[i].a[0], &h[i]);
+			bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re);
+		} else h[i].rb = h[i].re = -1; // no hit: bwa_hit2sam() treats such a mate as unmapped
+	}
+	mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41, &h[1]);
+	mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81, &h[0]);
+	return n;
+}
diff --git a/bwape.c b/bwape.c
index 779670f..0b2b8d6 100644
--- a/bwape.c
+++ b/bwape.c
@@ -10,6 +10,7 @@
 #include "utils.h"
 #include "stdaln.h"
 #include "bwase.h"
+#include "bwa.h"
 
 typedef struct {
 	int n;
@@ -21,24 +22,15 @@ typedef struct {
 	bwtint_t low, high, high_bayesian;
 } isize_info_t;
 
-typedef struct {
-	uint64_t x, y;
-} b128_t;
-
-#define b128_lt(a, b) ((a).x < (b).x)
 #define b128_eq(a, b) ((a).x == (b).x && (a).y == (b).y)
 #define b128_hash(a) ((uint32_t)(a).x)
 
 #include "khash.h"
-KHASH_INIT(b128, b128_t, poslist_t, 1, b128_hash, b128_eq)
-
-#include "ksort.h"
-KSORT_INIT(b128, b128_t, b128_lt)
-KSORT_INIT_GENERIC(uint64_t)
+KHASH_INIT(b128, pair64_t, poslist_t, 1, b128_hash, b128_eq)
 
 typedef struct {
-	kvec_t(b128_t) arr;
-	kvec_t(b128_t) pos[2];
+	pair64_v arr;
+	pair64_v pos[2];
 	kvec_t(bwt_aln1_t) aln[2];
 } pe_data_t;
 
@@ -69,19 +61,6 @@ pe_opt_t *bwa_init_pe_opt()
 	po->ap_prior = 1e-5;
 	return po;
 }
-
-static inline uint64_t hash_64(uint64_t key)
-{
-	key += ~(key << 32);
-	key ^= (key >> 22);
-	key += ~(key << 13);
-	key ^= (key >> 8);
-	key += (key << 3);
-	key ^= (key >> 15);
-	key += ~(key << 27);
-	key ^= (key >> 31);
-	return key;
-}
 /*
 static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x);
 {
@@ -120,7 +99,7 @@ static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double
 		free(isizes);
 		return -1;
 	}
-	ks_introsort(uint64_t, tot, isizes);
+	ks_introsort_64(tot, isizes);
 	p25 = isizes[(int)(tot*0.25 + 0.5)];
 	p50 = isizes[(int)(tot*0.50 + 0.5)];
 	p75 = isizes[(int)(tot*0.75 + 0.5)];
@@ -170,7 +149,7 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm,
 {
 	int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len;
 	uint64_t o_score, subo_score;
-	b128_t last_pos[2][2], o_pos[2];
+	pair64_t last_pos[2][2], o_pos[2];
 	max_len = p[0]->full_len;
 	if (max_len < p[1]->full_len) max_len = p[1]->full_len;
 	if (low_bound < max_len) low_bound = max_len;
@@ -206,11 +185,11 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm,
 
 	o_score = subo_score = (uint64_t)-1;
 	o_n = subo_n = 0;
-	ks_introsort(b128, d->arr.n, d->arr.a);
+	ks_introsort_128(d->arr.n, d->arr.a);
 	for (j = 0; j < 2; ++j) last_pos[j][0].x = last_pos[j][0].y = last_pos[j][1].x = last_pos[j][1].y = (uint64_t)-1;
 	if (opt->type == BWA_PET_STD) {
 		for (i = 0; i < d->arr.n; ++i) {
-			b128_t x = d->arr.a[i];
+			pair64_t x = d->arr.a[i];
 			int strand = x.y>>1&1;
 			if (strand == 1) { // reverse strand, then check
 				int y = 1 - (x.y&1);
@@ -221,19 +200,6 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm,
 				last_pos[x.y&1][1] = x;
 			}
 		}
-	} else if (opt->type == BWA_PET_SOLID) {
-		for (i = 0; i < d->arr.n; ++i) {
-			b128_t x = d->arr.a[i];
-			int strand = x.y>>1&1;
-			if ((strand^x.y)&1) { // push
-				int y = 1 - (x.y&1);
-				__pairing_aux(last_pos[y][1], x);
-				__pairing_aux(last_pos[y][0], x);
-			} else { // check
-				last_pos[x.y&1][0] = last_pos[x.y&1][1];
-				last_pos[x.y&1][1] = x;
-			}
-		}
 	} else {
 		fprintf(stderr, "[paring] not implemented yet!\n");
 		exit(1);
@@ -345,7 +311,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
 		if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT)
 			&& (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT))
 		{ // only when both ends mapped
-			b128_t x;
+			pair64_t x;
 			int j, k;
 			long long n_occ[2];
 			for (j = 0; j < 2; ++j) {
@@ -360,7 +326,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
 					bwt_aln1_t *r = d->aln[j].a + k;
 					bwtint_t l;
 					if (0 && r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table
-						b128_t key;
+						pair64_t key;
 						int ret;
 						key.x = r->k; key.y = r->l;
 						khint_t iter = kh_put(b128, g_hash, key, &ret);
@@ -377,14 +343,14 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
 						for (l = 0; l < kh_val(g_hash, iter).n; ++l) {
 							x.x = kh_val(g_hash, iter).a[l]>>1;
 							x.y = k<<2 | (kh_val(g_hash, iter).a[l]&1)<<1 | j;
-							kv_push(b128_t, d->arr, x);
+							kv_push(pair64_t, d->arr, x);
 						}
 					} else { // then calculate on the fly
 						for (l = r->k; l <= r->l; ++l) {
 							int strand;
 							x.x = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand);
 							x.y = k<<2 | strand<<1 | j;
-							kv_push(b128_t, d->arr, x);
+							kv_push(pair64_t, d->arr, x);
 						}
 					}
 				}
@@ -576,11 +542,11 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
 			++n_tot[is_singleton];
 			cigar[0] = cigar[1] = 0;
 			n_cigar[0] = n_cigar[1] = 0;
-			if (popt->type != BWA_PET_STD && popt->type != BWA_PET_SOLID) continue; // other types of pairing is not considered
+			if (popt->type != BWA_PET_STD) continue; // other types of pairing is not considered
 			for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified
 				ubyte_t *seq;
 				if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip
-				if (popt->type == BWA_PET_STD) {
+				{ // note that popt->type == BWA_PET_STD is always true here; in older versions, there was a branch for color-space FF/RR reads
 					if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate
 						__set_rght_coor(beg[k], end[k], p[1-k], p[k]);
 						seq = p[k]->rseq;
@@ -589,17 +555,6 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
 						seq = p[k]->seq;
 						seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly
 					}
-				} else { // BWA_PET_SOLID
-					if (p[1-k]->strand == 0) { // R3-F3 pairing
-						if (k == 0) __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
-						else __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
-						seq = p[k]->rseq;
-						seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed
-					} else { // F3-R3 pairing
-						if (k == 0) __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3
-						else __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3
-						seq = p[k]->seq;
-					}
 				}
 				// perform SW alignment
 				cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]);
@@ -656,14 +611,14 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs,
 	return pacseq;
 }
 
-void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt)
+void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line)
 {
 	extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
 	int i, j, n_seqs, tot_seqs = 0;
 	bwa_seq_t *seqs[2];
 	bwa_seqio_t *ks[2];
 	clock_t t;
-	bntseq_t *bns, *ntbns = 0;
+	bntseq_t *bns;
 	FILE *fp_sa[2];
 	gap_opt_t opt, opt0;
 	khint_t iter;
@@ -688,10 +643,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
 	opt0 = opt;
 	fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten!
 	ks[1] = bwa_open_reads(opt.mode, fn_fa[1]);
-	if (!(opt.mode & BWA_MODE_COMPREAD)) {
-		popt->type = BWA_PET_SOLID;
-		ntbns = bwa_open_nt(prefix);
-	} else { // for Illumina alignment only
+	{ // for Illumina alignment only
 		if (popt->is_preload) {
 			strcpy(str, prefix); strcat(str, ".bwt");  bwt = bwt_restore_bwt(str);
 			strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
@@ -702,7 +654,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
 	}
 
 	// core loop
-	bwa_print_sam_SQ(bns);
+	bwa_print_sam_hdr(bns, rg_line);
 	bwa_print_sam_PG();
 	while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) {
 		int cnt_chg;
@@ -724,7 +676,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
 
 		fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... ");
 		for (j = 0; j < 2; ++j)
-			bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, ntbns);
+			bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq);
 		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
 		if (pac == 0) free(pacseq);
 
@@ -749,7 +701,6 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
 
 	// destroy
 	bns_destroy(bns);
-	if (ntbns) bns_destroy(ntbns);
 	for (i = 0; i < 2; ++i) {
 		bwa_seq_close(ks[i]);
 		fclose(fp_sa[i]);
@@ -764,21 +715,15 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
 
 int bwa_sai2sam_pe(int argc, char *argv[])
 {
-	extern char *bwa_rg_line, *bwa_rg_id;
-	extern int bwa_set_rg(const char *s);
-	extern char *bwa_infer_prefix(const char *hint);
 	int c;
 	pe_opt_t *popt;
-	char *prefix;
+	char *prefix, *rg_line = 0;
 
 	popt = bwa_init_pe_opt();
 	while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) {
 		switch (c) {
 		case 'r':
-			if (bwa_set_rg(optarg) < 0) {
-				fprintf(stderr, "[%s] malformated @RG line\n", __func__);
-				return 1;
-			}
+			if ((rg_line = bwa_set_rg(optarg)) == 0) return 1;
 			break;
 		case 'a': popt->max_isize = atoi(optarg); break;
 		case 'o': popt->max_occ = atoi(optarg); break;
@@ -812,13 +757,11 @@ int bwa_sai2sam_pe(int argc, char *argv[])
 		fprintf(stderr, "\n");
 		return 1;
 	}
-	if ((prefix = bwa_infer_prefix(argv[optind])) == 0) {
+	if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
 		fprintf(stderr, "[%s] fail to locate the index\n", __func__);
-		free(bwa_rg_line); free(bwa_rg_id);
 		return 0;
 	}
-	bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt);
-	free(bwa_rg_line); free(bwa_rg_id); free(prefix);
-	free(popt);
+	bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line);
+	free(prefix); free(popt);
 	return 0;
 }
diff --git a/bwase.c b/bwase.c
index 35744e7..2dd783b 100644
--- a/bwase.c
+++ b/bwase.c
@@ -10,9 +10,9 @@
 #include "bntseq.h"
 #include "utils.h"
 #include "kstring.h"
+#include "bwa.h"
 
 int g_log_n[256];
-char *bwa_rg_line, *bwa_rg_id;
 
 void bwa_print_sam_PG();
 
@@ -71,8 +71,8 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma
 				}
 				rest -= q->l - q->k + 1;
 			} else { // Random sampling (http://code.activestate.com/recipes/272884/). In fact, we never come here. 
-				int j, i, k;
-				for (j = rest, i = q->l - q->k + 1, k = 0; j > 0; --j) {
+				int j, i;
+				for (j = rest, i = q->l - q->k + 1; j > 0; --j) {
 					double p = 1.0, x = drand48();
 					while (x < p) p -= p * j / (i--);
 					s->multi[z].pos = q->l - i;
@@ -296,18 +296,12 @@ void bwa_correct_trimmed(bwa_seq_t *s)
 	s->len = s->full_len;
 }
 
-void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns)
+void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq)
 {
-	ubyte_t *pacseq, *ntpac = 0;
+	ubyte_t *pacseq;
 	int i, j;
 	kstring_t *str;
 
-	if (ntbns) { // in color space
-		ntpac = (ubyte_t*)calloc(ntbns->l_pac/4+1, 1);
-		rewind(ntbns->fp_pac);
-		fread(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac);
-	}
-
 	if (!_pacseq) {
 		pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
 		rewind(bns->fp_pac);
@@ -328,28 +322,6 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t
 		s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos,
 									  (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1);
 	}
-#if 0
-	if (ntbns) { // in color space
-		for (i = 0; i < n_seqs; ++i) {
-			bwa_seq_t *s = seqs + i;
-			bwa_cs2nt_core(s, bns->l_pac, ntpac);
-			for (j = 0; j < s->n_multi; ++j) {
-				bwt_multi1_t *q = s->multi + j;
-				int n_cigar;
-				if (q->gap == 0) continue;
-				free(q->cigar);
-				q->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, q->strand? s->rseq : s->seq, &q->pos,
-											  (q->strand? 1 : -1) * q->gap, &n_cigar, 0);
-				q->n_cigar = n_cigar;
-			}
-			if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again
-				free(s->cigar);
-				s->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos,
-											  (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0);
-			}
-		}
-	}
-#endif
 	// generate MD tag
 	str = (kstring_t*)calloc(1, sizeof(kstring_t));
 	for (i = 0; i != n_seqs; ++i) {
@@ -357,18 +329,16 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t
 		if (s->type != BWA_TYPE_NO_MATCH) {
 			int nm;
 			s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq,
-								bns->l_pac, ntbns? ntpac : pacseq, str, &nm);
+								bns->l_pac, pacseq, str, &nm);
 			s->nm = nm;
 		}
 	}
 	free(str->s); free(str);
 
 	// correct for trimmed reads
-	if (!ntbns) // trimming is only enabled for Illumina reads
-		for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i);
+	for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i);
 
 	if (!_pacseq) free(pacseq);
-	free(ntpac);
 }
 
 int64_t pos_end(const bwa_seq_t *p)
@@ -442,11 +412,11 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
 
 		// print mate coordinate
 		if (mate && mate->type != BWA_TYPE_NO_MATCH) {
-			int m_seqid, m_is_N;
+			int m_seqid;
 			long long isize;
 			am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality
 			// redundant calculation here, but should not matter too much
-			m_is_N = bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid);
+			bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid);
 			err_printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name);
 			isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0;
 			if (p->type == BWA_TYPE_NO_MATCH) isize = 0;
@@ -464,7 +434,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
 			err_printf("%s", p->qual);
 		} else err_printf("*");
 
-		if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id);
+		if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id);
 		if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc);
 		if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len);
 		if (p->type != BWA_TYPE_NO_MATCH) {
@@ -512,74 +482,20 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
 			if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality
 			err_printf("%s", p->qual);
 		} else err_printf("*");
-		if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id);
+		if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id);
 		if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc);
 		if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len);
 		putchar('\n');
 	}
 }
 
-bntseq_t *bwa_open_nt(const char *prefix)
-{
-	bntseq_t *ntbns;
-	char *str;
-	str = (char*)calloc(strlen(prefix) + 10, 1);
-	strcat(strcpy(str, prefix), ".nt");
-	ntbns = bns_restore(str);
-	free(str);
-	return ntbns;
-}
-
-void bwa_print_sam_SQ(const bntseq_t *bns)
-{
-	int i;
-	for (i = 0; i < bns->n_seqs; ++i)
-		err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len);
-	if (bwa_rg_line) err_printf("%s\n", bwa_rg_line);
-}
-
 void bwase_initialize() 
 {
 	int i;
 	for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5);
 }
 
-char *bwa_escape(char *s)
-{
-	char *p, *q;
-	for (p = q = s; *p; ++p) {
-		if (*p == '\\') {
-			++p;
-			if (*p == 't') *q++ = '\t';
-			else if (*p == 'n') *q++ = '\n';
-			else if (*p == 'r') *q++ = '\r';
-			else if (*p == '\\') *q++ = '\\';
-		} else *q++ = *p;
-	}
-	*q = '\0';
-	return s;
-}
-
-int bwa_set_rg(const char *s)
-{
-	char *p, *q, *r;
-	if (strstr(s, "@RG") != s) return -1;
-	if (bwa_rg_line) free(bwa_rg_line);
-	if (bwa_rg_id) free(bwa_rg_id);
-	bwa_rg_line = strdup(s);
-	bwa_rg_id = 0;
-	bwa_escape(bwa_rg_line);
-	p = strstr(bwa_rg_line, "\tID:");
-	if (p == 0) return -1;
-	p += 4;
-	for (q = p; *q && *q != '\t' && *q != '\n'; ++q);
-	bwa_rg_id = calloc(q - p + 1, 1);
-	for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q)
-		*r++ = *q;
-	return 0;
-}
-
-void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ)
+void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ, const char *rg_line)
 {
 	extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa);
 	int i, n_seqs, tot_seqs = 0, m_aln;
@@ -587,7 +503,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
 	bwa_seq_t *seqs;
 	bwa_seqio_t *ks;
 	clock_t t;
-	bntseq_t *bns, *ntbns = 0;
+	bntseq_t *bns;
 	FILE *fp_sa;
 	gap_opt_t opt;
 
@@ -599,9 +515,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
 
 	m_aln = 0;
 	fread(&opt, sizeof(gap_opt_t), 1, fp_sa);
-	if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac
-		ntbns = bwa_open_nt(prefix);
-	bwa_print_sam_SQ(bns);
+	bwa_print_sam_hdr(bns, rg_line);
 	//bwa_print_sam_PG();
 	// set ks
 	ks = bwa_open_reads(opt.mode, fn_fa);
@@ -628,7 +542,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
 		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
 
 		fprintf(stderr, "[bwa_aln_core] refine gapped alignments... ");
-		bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns);
+		bwa_refine_gapped(bns, n_seqs, seqs, 0);
 		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock();
 
 		fprintf(stderr, "[bwa_aln_core] print alignments... ");
@@ -642,7 +556,6 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
 
 	// destroy
 	bwa_seq_close(ks);
-	if (ntbns) bns_destroy(ntbns);
 	bns_destroy(bns);
 	fclose(fp_sa);
 	free(aln);
@@ -650,17 +563,13 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f
 
 int bwa_sai2sam_se(int argc, char *argv[])
 {
-	extern char *bwa_infer_prefix(const char *hint);
 	int c, n_occ = 3;
-	char *prefix;
+	char *prefix, *rg_line = 0;
 	while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) {
 		switch (c) {
 		case 'h': break;
 		case 'r':
-			if (bwa_set_rg(optarg) < 0) {
-				fprintf(stderr, "[%s] malformated @RG line\n", __func__);
-				return 1;
-			}
+			if ((rg_line = bwa_set_rg(optarg)) == 0) return 1;
 			break;
 		case 'n': n_occ = atoi(optarg); break;
 		case 'f': xreopen(optarg, "w", stdout); break;
@@ -672,12 +581,10 @@ int bwa_sai2sam_se(int argc, char *argv[])
 		fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] <prefix> <in.sai> <in.fq>\n");
 		return 1;
 	}
-	if ((prefix = bwa_infer_prefix(argv[optind])) == 0) {
+	if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
 		fprintf(stderr, "[%s] fail to locate the index\n", __func__);
-		free(bwa_rg_line); free(bwa_rg_id);
 		return 0;
 	}
-	bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ);
-	free(bwa_rg_line); free(bwa_rg_id);
+	bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line);
 	return 0;
 }
diff --git a/bwase.h b/bwase.h
index f8e9b0a..26a9f68 100644
--- a/bwase.h
+++ b/bwase.h
@@ -14,7 +14,7 @@ extern "C" {
 	// Calculate the approximate position of the sequence from the specified bwt with loaded suffix array.
 	void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr);
 	// Refine the approximate position of the sequence to an actual placement for the sequence.
-	void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns);
+	void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq);
 	// Backfill certain alignment properties mainly centering around number of matches.
 	void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s);
 	// Calculate the end position of a read given a certain sequence.
diff --git a/bwaseqio.c b/bwaseqio.c
index e22d4cd..c1e9f97 100644
--- a/bwaseqio.c
+++ b/bwaseqio.c
@@ -5,7 +5,7 @@
 #include "bamlite.h"
 
 #include "kseq.h"
-KSEQ_INIT(gzFile, gzread)
+KSEQ_DECLARE(gzFile)
 
 extern unsigned char nst_nt4_table[256];
 static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
diff --git a/bwt.c b/bwt.c
index fcc141e..4ee9ea8 100644
--- a/bwt.c
+++ b/bwt.c
@@ -45,6 +45,14 @@ void bwt_gen_cnt_table(bwt_t *bwt)
 	}
 }
 
+static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inverse CSA
+{
+	bwtint_t x = k - (k > bwt->primary); // rows after the primary one are shifted down by one
+	x = bwt_B0(bwt, x); // x now holds the BWT symbol read at that index
+	x = bwt->L2[x] + bwt_occ(bwt, k, x); // L2[] entry of that symbol plus bwt_occ() of the symbol at k
+	return k == bwt->primary? 0 : x; // the primary row itself always maps to 0
+}
+
 // bwt->bwt and bwt->occ must be precalculated
 void bwt_cal_sa(bwt_t *bwt, int intv)
 {
@@ -95,23 +103,22 @@ static inline int __occ_aux(uint64_t y, int c)
 	return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56;
 }
 
-inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c)
+bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c)
 {
-	bwtint_t n, l, j;
-	uint32_t *p;
+	bwtint_t n;
+	uint32_t *p, *end;
 
 	if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c];
 	if (k == (bwtint_t)(-1)) return 0;
-	if (k >= bwt->primary) --k; // because $ is not in bwt
+	k -= (k >= bwt->primary); // because $ is not in bwt
 
 	// retrieve Occ at k/OCC_INTERVAL
 	n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c];
 	p += sizeof(bwtint_t); // jump to the start of the first BWT cell
 
 	// calculate Occ up to the last k/32
-	j = k >> 5 << 5;
-	for (l = k/OCC_INTERVAL*OCC_INTERVAL; l < j; l += 32, p += 2)
-		n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
+	end = p + (((k>>5) - ((k&~OCC_INTV_MASK)>>5))<<1);
+	for (; p < end; p += 2) n += __occ_aux((uint64_t)p[0]<<32 | p[1], c);
 
 	// calculate Occ
 	n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c);
@@ -121,7 +128,7 @@ inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c)
 }
 
 // an analogy to bwt_occ() but more efficient, requiring k <= l
-inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol)
+void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol)
 {
 	bwtint_t _k, _l;
 	_k = (k >= bwt->primary)? k-1 : k;
@@ -158,52 +165,53 @@ inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint
 	((bwt)->cnt_table[(b)&0xff] + (bwt)->cnt_table[(b)>>8&0xff]		\
 	 + (bwt)->cnt_table[(b)>>16&0xff] + (bwt)->cnt_table[(b)>>24])
 
-inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
+void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4])
 {
-	bwtint_t l, j, x;
-	uint32_t *p;
+	bwtint_t x;
+	uint32_t *p, tmp, *end;
 	if (k == (bwtint_t)(-1)) {
 		memset(cnt, 0, 4 * sizeof(bwtint_t));
 		return;
 	}
-	if (k >= bwt->primary) --k; // because $ is not in bwt
+	k -= (k >= bwt->primary); // because $ is not in bwt
 	p = bwt_occ_intv(bwt, k);
 	memcpy(cnt, p, 4 * sizeof(bwtint_t));
-	p += sizeof(bwtint_t);
-	j = k >> 4 << 4;
-	for (l = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; l < j; l += 16, ++p)
-		x += __occ_aux4(bwt, *p);
-	x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
+	p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))
+	end = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4)); // this is the end point of the following loop
+	for (x = 0; p < end; ++p) x += __occ_aux4(bwt, *p);
+	tmp = *p & ~((1U<<((~k&15)<<1)) - 1);
+	x += __occ_aux4(bwt, tmp) - (~k&15);
 	cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
 }
 
 // an analogy to bwt_occ4() but more efficient, requiring k <= l
-inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
+void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4])
 {
 	bwtint_t _k, _l;
-	_k = (k >= bwt->primary)? k-1 : k;
-	_l = (l >= bwt->primary)? l-1 : l;
-	if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
+	_k = k - (k >= bwt->primary);
+	_l = l - (l >= bwt->primary);
+	if (_l>>OCC_INTV_SHIFT != _k>>OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) {
 		bwt_occ4(bwt, k, cntk);
 		bwt_occ4(bwt, l, cntl);
 	} else {
-		bwtint_t i, j, x, y;
-		uint32_t *p;
-		if (k >= bwt->primary) --k; // because $ is not in bwt
-		if (l >= bwt->primary) --l;
+		bwtint_t x, y;
+		uint32_t *p, tmp, *endk, *endl;
+		k -= (k >= bwt->primary); // because $ is not in bwt
+		l -= (l >= bwt->primary);
 		p = bwt_occ_intv(bwt, k);
 		memcpy(cntk, p, 4 * sizeof(bwtint_t));
-		p += sizeof(bwtint_t);
+		p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t))
 		// prepare cntk[]
-		j = k >> 4 << 4;
-		for (i = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; i < j; i += 16, ++p)
-			x += __occ_aux4(bwt, *p);
+		endk = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4));
+		endl = p + ((l>>4) - ((l&~OCC_INTV_MASK)>>4));
+		for (x = 0; p < endk; ++p) x += __occ_aux4(bwt, *p);
 		y = x;
-		x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15);
+		tmp = *p & ~((1U<<((~k&15)<<1)) - 1);
+		x += __occ_aux4(bwt, tmp) - (~k&15);
 		// calculate cntl[] and finalize cntk[]
-		j = l >> 4 << 4;
-		for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p);
-		y += __occ_aux4(bwt, *p & ~((1U<<((~l&15)<<1)) - 1)) - (~l&15);
+		for (; p < endl; ++p) y += __occ_aux4(bwt, *p);
+		tmp = *p & ~((1U<<((~l&15)<<1)) - 1);
+		y += __occ_aux4(bwt, tmp) - (~l&15);
 		memcpy(cntl, cntk, 4 * sizeof(bwtint_t));
 		cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24;
 		cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24;
@@ -277,7 +285,7 @@ static void bwt_reverse_intvs(bwtintv_v *p)
 	}
 }
 
-int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2])
+int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2])
 {
 	int i, j, c, ret;
 	bwtintv_t ik, ok[4];
@@ -285,45 +293,45 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem
 
 	mem->n = 0;
 	if (q[x] > 3) return x + 1;
+	if (min_intv < 1) min_intv = 1; // the interval size should be at least 1
 	kv_init(a[0]); kv_init(a[1]);
-	prev = tmpvec[0]? tmpvec[0] : &a[0];
-	curr = tmpvec[1]? tmpvec[1] : &a[1];
-	bwt_set_intv(bwt, q[x], ik);
+	prev = tmpvec && tmpvec[0]? tmpvec[0] : &a[0]; // use the temporary vector if provided
+	curr = tmpvec && tmpvec[1]? tmpvec[1] : &a[1];
+	bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base
 	ik.info = x + 1;
 
 	for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search
-		if (q[i] < 4) {
-			c = 3 - q[i];
+		if (q[i] < 4) { // an A/C/G/T base
+			c = 3 - q[i]; // complement of q[i]
 			bwt_extend(bwt, &ik, ok, 0);
-			if (ok[c].x[2] != ik.x[2]) // change of the interval size
+			if (ok[c].x[2] != ik.x[2]) { // change of the interval size
 				kv_push(bwtintv_t, *curr, ik);
-			if (ok[c].x[2] == 0) break; // cannot be extended
+				if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further
+			}
 			ik = ok[c]; ik.info = i + 1;
 		} else { // an ambiguous base
 			kv_push(bwtintv_t, *curr, ik);
-			break; // cannot be extended; in this case, i<len always stands
+			break; // always terminate extension at an ambiguous base; in this case, i<len always holds
 		}
 	}
 	if (i == len) kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end
-	bwt_reverse_intvs(curr); // s.t. smaller intervals visited first
+	bwt_reverse_intvs(curr); // s.t. smaller intervals (i.e. longer matches) visited first
 	ret = curr->a[0].info; // this will be the returned value
 	swap = curr; curr = prev; prev = swap;
 
 	for (i = x - 1; i >= -1; --i) { // backward search for MEMs
-		if (q[i] > 3) break;
-		c = i < 0? 0 : q[i];
+		c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base
 		for (j = 0, curr->n = 0; j < prev->n; ++j) {
 			bwtintv_t *p = &prev->a[j];
 			bwt_extend(bwt, p, ok, 1);
-			if (ok[c].x[2] == 0 || i == -1) { // keep the hit if reaching the beginning or not extended further
-				if (curr->n == 0) { // curr->n to make sure there is no longer matches
+			if (c < 0 || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough
+				if (curr->n == 0) { // require curr->n==0 to make sure there are no longer matches
 					if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches
 						ik = *p; ik.info |= (uint64_t)(i + 1)<<32;
 						kv_push(bwtintv_t, *mem, ik);
 					}
 				} // otherwise the match is contained in another longer match
-			}
-			if (ok[c].x[2] && (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2])) {
+			} else if (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2]) {
 				ok[c].info = p->info;
 				kv_push(bwtintv_t, *curr, ok[c]);
 			}
@@ -333,7 +341,83 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem
 	}
 	bwt_reverse_intvs(mem); // s.t. sorted by the start coordinate
 
-	if (tmpvec[0] == 0) free(a[0].a);
-	if (tmpvec[1] == 0) free(a[1].a);
+	if (tmpvec == 0 || tmpvec[0] == 0) free(a[0].a);
+	if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a);
 	return ret;
 }
+
+/*************************
+ * Read/write BWT and SA *
+ *************************/
+
+void bwt_dump_bwt(const char *fn, const bwt_t *bwt) // write primary, L2[1..4] and the bwt[] array to file fn
+{
+	FILE *fp;
+	fp = xopen(fn, "wb");
+	fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
+	fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); // L2[0] is not written to the file
+	fwrite(bwt->bwt, 4, bwt->bwt_size, fp); // bwt_size cells of 4 bytes each
+	fclose(fp); // NOTE(review): fwrite()/fclose() return values are not checked, so a short write goes unnoticed
+}
+
+void bwt_dump_sa(const char *fn, const bwt_t *bwt) // write the SA header fields and sa[1..n_sa-1] to file fn
+{
+	FILE *fp;
+	fp = xopen(fn, "wb");
+	fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
+	fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); // L2[0] is not written to the file
+	fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
+	fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp);
+	fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); // sa[0] is not written to the file
+	fclose(fp); // NOTE(review): fwrite()/fclose() return values are not checked, so a short write goes unnoticed
+}
+
+void bwt_restore_sa(const char *fn, bwt_t *bwt) // load the SA from file fn into bwt; bwt->primary and bwt->seq_len must already be set
+{
+	char skipped[256]; // scratch space for the four values that are read and discarded below
+	FILE *fp;
+	bwtint_t primary; // also reused below to hold the seq_len value read from the file
+
+	fp = xopen(fn, "rb");
+	fread(&primary, sizeof(bwtint_t), 1, fp);
+	xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same.");
+	fread(skipped, sizeof(bwtint_t), 4, fp); // skip
+	fread(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
+	fread(&primary, sizeof(bwtint_t), 1, fp);
+	xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");
+
+	bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv; // NOTE(review): divides by sa_intv exactly as read from the file, without validation
+	bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
+	bwt->sa[0] = -1; // sa[0] is not read from the file; NOTE(review): calloc() result is not checked before this write
+
+	fread(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); // NOTE(review): none of the fread() return values in this function are checked
+	fclose(fp);
+}
+
+bwt_t *bwt_restore_bwt(const char *fn) // read a BWT from file fn into a newly calloc'ed bwt_t and return it
+{
+	bwt_t *bwt;
+	FILE *fp;
+
+	bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
+	fp = xopen(fn, "rb");
+	fseek(fp, 0, SEEK_END); // seek to the end so that ftell() reports the file size
+	bwt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 5) >> 2; // number of 4-byte cells left after the five bwtint_t header values
+	bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4);
+	fseek(fp, 0, SEEK_SET);
+	fread(&bwt->primary, sizeof(bwtint_t), 1, fp);
+	fread(bwt->L2+1, sizeof(bwtint_t), 4, fp); // fills L2[1..4]
+	fread(bwt->bwt, 4, bwt->bwt_size, fp); // NOTE(review): calloc()/fseek()/ftell()/fread() results are not checked in this function
+	bwt->seq_len = bwt->L2[4];
+	fclose(fp);
+	bwt_gen_cnt_table(bwt);
+
+	return bwt;
+}
+
+void bwt_destroy(bwt_t *bwt) // free bwt together with its sa[] and bwt[] arrays
+{
+	if (bwt == 0) return; // a null pointer is accepted and ignored
+	free(bwt->sa); free(bwt->bwt);
+	free(bwt);
+}
diff --git a/bwt.h b/bwt.h
index 5823f82..e7b0f97 100644
--- a/bwt.h
+++ b/bwt.h
@@ -30,8 +30,10 @@
 
 #include <stdint.h>
 
-// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line
-#define OCC_INTERVAL 0x80
+// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line because some parts of the code assume OCC_INTERVAL=0x80
+#define OCC_INTV_SHIFT 7
+#define OCC_INTERVAL   (1LL<<OCC_INTV_SHIFT)
+#define OCC_INTV_MASK  (OCC_INTERVAL - 1)
 
 #ifndef BWA_UBYTE
 #define BWA_UBYTE
@@ -74,13 +76,6 @@ typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v;
  * called bwt_B0 instead of bwt_B */
 #define bwt_B0(b, k) (bwt_bwt(b, k)>>((~(k)&0xf)<<1)&3)
 
-// inverse Psi function
-#define bwt_invPsi(bwt, k)												\
-	(((k) == (bwt)->primary)? 0 :										\
-	 ((k) < (bwt)->primary)?											\
-	 (bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k))		\
-	 : (bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1)))
-
 #define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)]+1, (ik).x[2] = (bwt)->L2[(int)(c)+1]-(bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)]+1, (ik).info = 0)
 
 #ifdef __cplusplus
@@ -121,7 +116,9 @@ extern "C" {
 	 * Given a query _q_, collect potential SMEMs covering position _x_ and store them in _mem_.
 	 * Return the end of the longest exact match starting from _x_.
 	 */
-	int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
+	int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
+
+	// SMEM iterator interface
 
 #ifdef __cplusplus
 }
diff --git a/bwt_lite.c b/bwt_lite.c
index dd411e1..902e0fc 100644
--- a/bwt_lite.c
+++ b/bwt_lite.c
@@ -65,7 +65,7 @@ inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c)
 	if (c == 0) n -= 15 - (k&15); // corrected for the masked bits
 	return n;
 }
-inline void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4])
+void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4])
 {
 	uint32_t x, b;
 	if (k == (uint32_t)(-1)) {
@@ -80,7 +80,7 @@ inline void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4])
 	x -= 15 - (k&15);
 	cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24;
 }
-inline void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4])
+void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4])
 {
 	bwtl_occ4(bwt, k, cntk);
 	bwtl_occ4(bwt, l, cntl);
diff --git a/bwtaln.c b/bwtaln.c
index efc7f66..96d4026 100644
--- a/bwtaln.c
+++ b/bwtaln.c
@@ -11,6 +11,7 @@
 #include "bwtaln.h"
 #include "bwtgap.h"
 #include "utils.h"
+#include "bwa.h"
 
 #ifdef HAVE_PTHREAD
 #include <pthread.h>
@@ -219,32 +220,6 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt)
 	bwa_seq_close(ks);
 }
 
-char *bwa_infer_prefix(const char *hint)
-{
-	char *prefix;
-	int l_hint;
-	FILE *fp;
-	l_hint = strlen(hint);
-	prefix = malloc(l_hint + 3 + 4 + 1);
-	strcpy(prefix, hint);
-	strcpy(prefix + l_hint, ".64.bwt");
-	if ((fp = fopen(prefix, "rb")) != 0) {
-		fclose(fp);
-		prefix[l_hint + 3] = 0;
-		return prefix;
-	} else {
-		strcpy(prefix + l_hint, ".bwt");
-		if ((fp = fopen(prefix, "rb")) == 0) {
-			free(prefix);
-			return 0;
-		} else {
-			fclose(fp);
-			prefix[l_hint] = 0;
-			return prefix;
-		}
-	}
-}
-
 int bwa_aln(int argc, char *argv[])
 {
 	int c, opte = -1;
@@ -252,7 +227,7 @@ int bwa_aln(int argc, char *argv[])
 	char *prefix;
 
 	opt = gap_init_opt();
-	while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) {
+	while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:LR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) {
 		switch (c) {
 		case 'n':
 			if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1;
@@ -272,7 +247,6 @@ int bwa_aln(int argc, char *argv[])
 		case 'L': opt->mode |= BWA_MODE_LOGGAP; break;
 		case 'R': opt->max_top2 = atoi(optarg); break;
 		case 'q': opt->trim_qual = atoi(optarg); break;
-		case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break;
 		case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break;
 		case 'f': xreopen(optarg, "wb", stdout); break;
 		case 'b': opt->mode |= BWA_MODE_BAM; break;
@@ -310,7 +284,6 @@ int bwa_aln(int argc, char *argv[])
 		fprintf(stderr, "         -q INT    quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual);
         fprintf(stderr, "         -f FILE   file to write output to instead of stdout\n");
 		fprintf(stderr, "         -B INT    length of barcode\n");
-//		fprintf(stderr, "         -c        input sequences are in the color space\n");
 		fprintf(stderr, "         -L        log-scaled gap penalty for long deletions\n");
 		fprintf(stderr, "         -N        non-iterative mode: search for all n-difference hits (slooow)\n");
 		fprintf(stderr, "         -I        the input is in the Illumina 1.3+ FASTQ-like format\n");
@@ -330,7 +303,7 @@ int bwa_aln(int argc, char *argv[])
 			k = l;
 		}
 	}
-	if ((prefix = bwa_infer_prefix(argv[optind])) == 0) {
+	if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) {
 		fprintf(stderr, "[%s] fail to locate the index\n", __func__);
 		free(opt);
 		return 0;
diff --git a/bwtaln.h b/bwtaln.h
index 39eaf4b..412cc04 100644
--- a/bwtaln.h
+++ b/bwtaln.h
@@ -107,7 +107,6 @@ typedef struct {
 } gap_opt_t;
 
 #define BWA_PET_STD   1
-#define BWA_PET_SOLID 2
 
 typedef struct {
 	int max_isize, force_isize;
diff --git a/bwtindex.c b/bwtindex.c
index 938e982..298153d 100644
--- a/bwtindex.c
+++ b/bwtindex.c
@@ -36,17 +36,160 @@
 #include "main.h"
 #include "utils.h"
 
-bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is);
-void bwa_pac_rev_core(const char *fn, const char *fn_rev);
+#ifdef _DIVBWT
+#include "divsufsort.h"
+#endif
 
-int bwa_index(int argc, char *argv[])
+int is_bwt(ubyte_t *T, int n);
+
+int64_t bwa_seq_len(const char *fn_pac) // sequence length derived from the size of fn_pac and the value of its final byte
+{
+	FILE *fp;
+	int64_t pac_len;
+	ubyte_t c;
+	fp = xopen(fn_pac, "rb");
+	fseek(fp, -1, SEEK_END); // position on the final byte of the file
+	pac_len = ftell(fp); // offset of the final byte, i.e. file size minus one
+	fread(&c, 1, 1, fp); // presumably the number of bases held in the last data byte -- confirm against the .pac writer
+	fclose(fp);
+	return (pac_len - 1) * 4 + (int)c; // four bases for each of the first pac_len-1 bytes, plus c
+}
+
+bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) // build the raw BWT of the 2-bit packed sequence in fn_pac; use_is selects is_bwt(), otherwise divbwt() (needs _DIVBWT)
+{
+	bwt_t *bwt;
+	ubyte_t *buf, *buf2;
+	int i, pac_size; // NOTE(review): int indices, like the int length taken by is_bwt(), limit this path to sequences below 2^31 bases
+	FILE *fp;
+
+	// initialization
+	bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
+	bwt->seq_len = bwa_seq_len(fn_pac);
+	bwt->bwt_size = (bwt->seq_len + 15) >> 4; // 16 bases per 32-bit word, rounded up
+	fp = xopen(fn_pac, "rb");
+
+	// prepare sequence
+	pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1); // 4 bases per byte, rounded up
+	buf2 = (ubyte_t*)calloc(pac_size, 1);
+	fread(buf2, 1, pac_size, fp);
+	fclose(fp);
+	memset(bwt->L2, 0, sizeof(bwt->L2)); // clear the whole array, whatever the width of bwtint_t
+	buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1);
+	for (i = 0; i < bwt->seq_len; ++i) {
+		buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3; // unpack base i; the first base of a byte sits in its top two bits
+		++bwt->L2[1+buf[i]];
+	}
+	for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1]; // turn per-base counts into cumulative counts
+	free(buf2);
+
+	// Burrows-Wheeler Transform
+	if (use_is) {
+		bwt->primary = is_bwt(buf, bwt->seq_len);
+	} else {
+#ifdef _DIVBWT
+		bwt->primary = divbwt(buf, buf, 0, bwt->seq_len);
+#else
+		err_fatal_simple("libdivsufsort is not compiled in.");
+#endif
+	}
+	bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4); // standard <stdint.h> type rather than the BSD-only u_int32_t
+	for (i = 0; i < bwt->seq_len; ++i)
+		bwt->bwt[i>>4] |= (uint32_t)buf[i] << ((15 - (i&15)) << 1); // shift as unsigned: base 2 or 3 at shift 30 would overflow a signed int
+	free(buf);
+	return bwt;
+}
+
+int bwa_pac2bwt(int argc, char *argv[]) // the "pac2bwt" command; IMPORTANT: bwt generated at this step CANNOT be used with BWA. bwtupdate is required!
+{
+	bwt_t *bwt;
+	int c, use_is = 1;
+	while ((c = getopt(argc, argv, "d")) >= 0) {
+		switch (c) {
+		case 'd': use_is = 0; break; // -d: take the divbwt() branch of bwt_pac2bwt() instead of is_bwt()
+		default: return 1;
+		}
+	}
+	if (optind + 2 > argc) { // both <in.pac> and <out.bwt> are required
+		fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
+		return 1;
+	}
+	bwt = bwt_pac2bwt(argv[optind], use_is);
+	bwt_dump_bwt(argv[optind+1], bwt);
+	bwt_destroy(bwt);
+	return 0; // 0 on success, 1 on a usage error
+}
+
+#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) // 2-bit base k of a plain packed BWT: 16 bases per 32-bit word, first base in the top bits
+
+void bwt_bwtupdate_core(bwt_t *bwt) // rebuild bwt->bwt with a record of four cumulative base counts inserted every OCC_INTERVAL bases
+{
+	bwtint_t i, k, c[4], n_occ;
+	uint32_t *buf;
+
+	n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // number of count records, including the final one
+	bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size
+	buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt
+	c[0] = c[1] = c[2] = c[3] = 0;
+	for (i = k = 0; i < bwt->seq_len; ++i) {
+		if (i % OCC_INTERVAL == 0) {
+			memcpy(buf + k, c, sizeof(bwtint_t) * 4);
+			k += sizeof(bwtint_t); // four counters of sizeof(bwtint_t) bytes each fill exactly sizeof(bwtint_t) 32-bit words
+		}
+		if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // copy one 32-bit word, i.e. 16 two-bit bases
+		++c[bwt_B00(bwt, i)];
+	}
+	// the last element
+	memcpy(buf + k, c, sizeof(bwtint_t) * 4);
+	xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size");
+	// update bwt
+	free(bwt->bwt); bwt->bwt = buf;
+}
+
+int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command
+{
+	bwt_t *bwt;
+	if (argc < 2) {
+		fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
+		return 1;
+	}
+	bwt = bwt_restore_bwt(argv[1]);
+	bwt_bwtupdate_core(bwt);
+	bwt_dump_bwt(argv[1], bwt); // written back to the same path that was read: the file is updated in place
+	bwt_destroy(bwt);
+	return 0; // 0 on success, 1 on a usage error
+}
+
+int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command
+{
+	bwt_t *bwt;
+	int c, sa_intv = 32;
+	while ((c = getopt(argc, argv, "i:")) >= 0) {
+		switch (c) {
+		case 'i': sa_intv = atoi(optarg); break; // -i INT: value handed to bwt_cal_sa() below; atoi() performs no validation
+		default: return 1;
+		}
+	}
+	if (optind + 2 > argc) { // both <in.bwt> and <out.sa> are required
+		fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
+		return 1;
+	}
+	bwt = bwt_restore_bwt(argv[optind]);
+	bwt_cal_sa(bwt, sa_intv);
+	bwt_dump_sa(argv[optind+1], bwt);
+	bwt_destroy(bwt);
+	return 0; // 0 on success, 1 on a usage error
+}
+
+int bwa_index(int argc, char *argv[]) // the "index" command
+{
+	extern void bwa_pac_rev_core(const char *fn, const char *fn_rev);
+
 	char *prefix = 0, *str, *str2, *str3;
-	int c, algo_type = 0, is_color = 0, is_64 = 0;
+	int c, algo_type = 0, is_64 = 0;
 	clock_t t;
 	int64_t l_pac;
 
-	while ((c = getopt(argc, argv, "6ca:p:")) >= 0) {
+	while ((c = getopt(argc, argv, "6a:p:")) >= 0) {
 		switch (c) {
 		case 'a': // if -a is not set, algo_type will be determined later
 			if (strcmp(optarg, "div") == 0) algo_type = 1;
@@ -55,7 +198,6 @@ int bwa_index(int argc, char *argv[])
 			else err_fatal(__func__, "unknown algorithm: '%s'.", optarg);
 			break;
 		case 'p': prefix = strdup(optarg); break;
-		case 'c': is_color = 1; break;
 		case '6': is_64 = 1; break;
 		default: return 1;
 		}
@@ -67,7 +209,6 @@ int bwa_index(int argc, char *argv[])
 		fprintf(stderr, "Options: -a STR    BWT construction algorithm: bwtsw or is [auto]\n");
 		fprintf(stderr, "         -p STR    prefix of the index [same as fasta name]\n");
 		fprintf(stderr, "         -6        index files named as <in.fasta>.64.* instead of <in.fasta>.* \n");
-//		fprintf(stderr, "         -c        build color-space index\n");
 		fprintf(stderr, "\n");
 		fprintf(stderr,	"Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n");
 		fprintf(stderr, "         `-a div' do not work not for long genomes. Please choose `-a'\n");
@@ -83,29 +224,13 @@ int bwa_index(int argc, char *argv[])
 	str2 = (char*)calloc(strlen(prefix) + 10, 1);
 	str3 = (char*)calloc(strlen(prefix) + 10, 1);
 
-	if (is_color == 0) { // nucleotide indexing
+	{ // nucleotide indexing
 		gzFile fp = xzopen(argv[optind], "r");
 		t = clock();
 		fprintf(stderr, "[bwa_index] Pack FASTA... ");
 		l_pac = bns_fasta2bntseq(fp, prefix, 0);
 		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
 		gzclose(fp);
-	} else { // color indexing
-		gzFile fp = xzopen(argv[optind], "r");
-		strcat(strcpy(str, prefix), ".nt");
-		t = clock();
-		fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... ");
-		l_pac = bns_fasta2bntseq(fp, str, 0);
-		fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
-		gzclose(fp);
-		{
-			char *tmp_argv[3];
-			tmp_argv[0] = argv[0]; tmp_argv[1] = str; tmp_argv[2] = prefix;
-			t = clock();
-			fprintf(stderr, "[bwa_index] Convert nucleotide PAC to color PAC... ");
-			bwa_pac2cspac(3, tmp_argv);
-			fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC);
-		}
 	}
 	if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT
 	{
diff --git a/bwtio.c b/bwtio.c
deleted file mode 100644
index 7508609..0000000
--- a/bwtio.c
+++ /dev/null
@@ -1,77 +0,0 @@
-#include <string.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include "bwt.h"
-#include "utils.h"
-
-void bwt_dump_bwt(const char *fn, const bwt_t *bwt)
-{
-	FILE *fp;
-	fp = xopen(fn, "wb");
-	fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
-	fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
-	fwrite(bwt->bwt, 4, bwt->bwt_size, fp);
-	fclose(fp);
-}
-
-void bwt_dump_sa(const char *fn, const bwt_t *bwt)
-{
-	FILE *fp;
-	fp = xopen(fn, "wb");
-	fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp);
-	fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp);
-	fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
-	fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp);
-	fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
-	fclose(fp);
-}
-
-void bwt_restore_sa(const char *fn, bwt_t *bwt)
-{
-	char skipped[256];
-	FILE *fp;
-	bwtint_t primary;
-
-	fp = xopen(fn, "rb");
-	fread(&primary, sizeof(bwtint_t), 1, fp);
-	xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same.");
-	fread(skipped, sizeof(bwtint_t), 4, fp); // skip
-	fread(&bwt->sa_intv, sizeof(bwtint_t), 1, fp);
-	fread(&primary, sizeof(bwtint_t), 1, fp);
-	xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");
-
-	bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv;
-	bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t));
-	bwt->sa[0] = -1;
-
-	fread(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp);
-	fclose(fp);
-}
-
-bwt_t *bwt_restore_bwt(const char *fn)
-{
-	bwt_t *bwt;
-	FILE *fp;
-
-	bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
-	fp = xopen(fn, "rb");
-	fseek(fp, 0, SEEK_END);
-	bwt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 5) >> 2;
-	bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4);
-	fseek(fp, 0, SEEK_SET);
-	fread(&bwt->primary, sizeof(bwtint_t), 1, fp);
-	fread(bwt->L2+1, sizeof(bwtint_t), 4, fp);
-	fread(bwt->bwt, 4, bwt->bwt_size, fp);
-	bwt->seq_len = bwt->L2[4];
-	fclose(fp);
-	bwt_gen_cnt_table(bwt);
-
-	return bwt;
-}
-
-void bwt_destroy(bwt_t *bwt)
-{
-	if (bwt == 0) return;
-	free(bwt->sa); free(bwt->bwt);
-	free(bwt);
-}
diff --git a/bwtmisc.c b/bwtmisc.c
deleted file mode 100644
index c35d684..0000000
--- a/bwtmisc.c
+++ /dev/null
@@ -1,230 +0,0 @@
-/* The MIT License
-
-   Copyright (c) 2008 Genome Research Ltd (GRL).
-
-   Permission is hereby granted, free of charge, to any person obtaining
-   a copy of this software and associated documentation files (the
-   "Software"), to deal in the Software without restriction, including
-   without limitation the rights to use, copy, modify, merge, publish,
-   distribute, sublicense, and/or sell copies of the Software, and to
-   permit persons to whom the Software is furnished to do so, subject to
-   the following conditions:
-
-   The above copyright notice and this permission notice shall be
-   included in all copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-*/
-
-/* Contact: Heng Li <lh3 at sanger.ac.uk> */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include "bntseq.h"
-#include "utils.h"
-#include "main.h"
-#include "bwt.h"
-
-#ifdef _DIVBWT
-#include "divsufsort.h"
-#endif
-
-int is_bwt(ubyte_t *T, int n);
-
-int64_t bwa_seq_len(const char *fn_pac)
-{
-	FILE *fp;
-	int64_t pac_len;
-	ubyte_t c;
-	fp = xopen(fn_pac, "rb");
-	fseek(fp, -1, SEEK_END);
-	pac_len = ftell(fp);
-	fread(&c, 1, 1, fp);
-	fclose(fp);
-	return (pac_len - 1) * 4 + (int)c;
-}
-
-bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is)
-{
-	bwt_t *bwt;
-	ubyte_t *buf, *buf2;
-	int i, pac_size;
-	FILE *fp;
-
-	// initialization
-	bwt = (bwt_t*)calloc(1, sizeof(bwt_t));
-	bwt->seq_len = bwa_seq_len(fn_pac);
-	bwt->bwt_size = (bwt->seq_len + 15) >> 4;
-	fp = xopen(fn_pac, "rb");
-
-	// prepare sequence
-	pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1);
-	buf2 = (ubyte_t*)calloc(pac_size, 1);
-	fread(buf2, 1, pac_size, fp);
-	fclose(fp);
-	memset(bwt->L2, 0, 5 * 4);
-	buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1);
-	for (i = 0; i < bwt->seq_len; ++i) {
-		buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3;
-		++bwt->L2[1+buf[i]];
-	}
-	for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1];
-	free(buf2);
-
-	// Burrows-Wheeler Transform
-	if (use_is) {
-		bwt->primary = is_bwt(buf, bwt->seq_len);
-	} else {
-#ifdef _DIVBWT
-		bwt->primary = divbwt(buf, buf, 0, bwt->seq_len);
-#else
-		err_fatal_simple("libdivsufsort is not compiled in.");
-#endif
-	}
-	bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4);
-	for (i = 0; i < bwt->seq_len; ++i)
-		bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1);
-	free(buf);
-	return bwt;
-}
-
-int bwa_pac2bwt(int argc, char *argv[])
-{
-	bwt_t *bwt;
-	int c, use_is = 1;
-	while ((c = getopt(argc, argv, "d")) >= 0) {
-		switch (c) {
-		case 'd': use_is = 0; break;
-		default: return 1;
-		}
-	}
-	if (optind + 2 > argc) {
-		fprintf(stderr, "Usage: bwa pac2bwt [-d] <in.pac> <out.bwt>\n");
-		return 1;
-	}
-	bwt = bwt_pac2bwt(argv[optind], use_is);
-	bwt_dump_bwt(argv[optind+1], bwt);
-	bwt_destroy(bwt);
-	return 0;
-}
-
-#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3)
-
-void bwt_bwtupdate_core(bwt_t *bwt)
-{
-	bwtint_t i, k, c[4], n_occ;
-	uint32_t *buf;
-
-	n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1;
-	bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size
-	buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt
-	c[0] = c[1] = c[2] = c[3] = 0;
-	for (i = k = 0; i < bwt->seq_len; ++i) {
-		if (i % OCC_INTERVAL == 0) {
-			memcpy(buf + k, c, sizeof(bwtint_t) * 4);
-			k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4)
-		}
-		if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2
-		++c[bwt_B00(bwt, i)];
-	}
-	// the last element
-	memcpy(buf + k, c, sizeof(bwtint_t) * 4);
-	xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size");
-	// update bwt
-	free(bwt->bwt); bwt->bwt = buf;
-}
-
-int bwa_bwtupdate(int argc, char *argv[])
-{
-	bwt_t *bwt;
-	if (argc < 2) {
-		fprintf(stderr, "Usage: bwa bwtupdate <the.bwt>\n");
-		return 1;
-	}
-	bwt = bwt_restore_bwt(argv[1]);
-	bwt_bwtupdate_core(bwt);
-	bwt_dump_bwt(argv[1], bwt);
-	bwt_destroy(bwt);
-	return 0;
-}
-
-const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4};
-
-/* this function is not memory efficient, but this will make life easier
-   Ideally we should also change .amb files as one 'N' in the nucleotide
-   sequence leads to two ambiguous colors. I may do this later... */
-uint8_t *bwa_pac2cspac_core(const bntseq_t *bns)
-{
-	uint8_t *pac, *cspac;
-	bwtint_t i;
-	int c1, c2;
-	pac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1);
-	cspac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1);
-	fread(pac, 1, bns->l_pac/4+1, bns->fp_pac);
-	rewind(bns->fp_pac);
-	c1 = pac[0]>>6; cspac[0] = c1<<6;
-	for (i = 1; i < bns->l_pac; ++i) {
-		c2 = pac[i>>2] >> (~i&3)*2 & 3;
-		cspac[i>>2] |= nst_color_space_table[(1<<c1)|(1<<c2)] << (~i&3)*2;
-		c1 = c2;
-	}
-	free(pac);
-	return cspac;
-}
-
-int bwa_pac2cspac(int argc, char *argv[])
-{
-	bntseq_t *bns;
-	uint8_t *cspac, ct;
-	char *str;
-	FILE *fp;
-
-	if (argc < 3) {
-		fprintf(stderr, "Usage: bwa pac2cspac <in.nt.prefix> <out.cs.prefix>\n");
-		return 1;
-	}
-	bns = bns_restore(argv[1]);
-	cspac = bwa_pac2cspac_core(bns);
-	bns_dump(bns, argv[2]);
-	// now write cspac
-	str = (char*)calloc(strlen(argv[2]) + 5, 1);
-	strcat(strcpy(str, argv[2]), ".pac");
-	fp = xopen(str, "wb");
-	fwrite(cspac, 1, bns->l_pac/4 + 1, fp);
-	ct = bns->l_pac % 4;
-	fwrite(&ct, 1, 1, fp);	
-	fclose(fp);
-	bns_destroy(bns);
-	free(cspac);
-	return 0;
-}
-
-int bwa_bwt2sa(int argc, char *argv[])
-{
-	bwt_t *bwt;
-	int c, sa_intv = 32;
-	while ((c = getopt(argc, argv, "i:")) >= 0) {
-		switch (c) {
-		case 'i': sa_intv = atoi(optarg); break;
-		default: return 1;
-		}
-	}
-	if (optind + 2 > argc) {
-		fprintf(stderr, "Usage: bwa bwt2sa [-i %d] <in.bwt> <out.sa>\n", sa_intv);
-		return 1;
-	}
-	bwt = bwt_restore_bwt(argv[optind]);
-	bwt_cal_sa(bwt, sa_intv);
-	bwt_dump_sa(argv[optind+1], bwt);
-	bwt_destroy(bwt);
-	return 0;
-}
diff --git a/bwtsw2.h b/bwtsw2.h
index 0a1b860..0ec9676 100644
--- a/bwtsw2.h
+++ b/bwtsw2.h
@@ -12,8 +12,8 @@
 #define BSW2_FLAG_RESCUED 0x800
 
 typedef struct {
-	int skip_sw:16, hard_clip:16;
-	int a, b, q, r, t, qr, bw, max_ins;
+	int skip_sw:8, cpy_cmt:8, hard_clip:16;
+	int a, b, q, r, t, qr, bw, max_ins, max_chain_gap;
 	int z, is, t_seeds, multi_2nd;
 	float mask_level, coef;
 	int n_threads, chunk_size;
@@ -45,7 +45,7 @@ typedef struct {
 
 typedef struct {
 	int l, tid;
-	char *name, *seq, *qual, *sam;
+	char *name, *seq, *qual, *sam, *comment;
 } bsw2seq1_t;
 
 #ifdef __cplusplus
diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c
index 710051d..bc12d20 100644
--- a/bwtsw2_aux.c
+++ b/bwtsw2_aux.c
@@ -13,9 +13,10 @@
 #include "bwtsw2.h"
 #include "stdaln.h"
 #include "kstring.h"
+#include "bwa.h"
 
 #include "kseq.h"
-KSEQ_INIT(gzFile, gzread)
+KSEQ_DECLARE(gzFile)
 
 #include "ksort.h"
 #define __left_lt(a, b) ((a).end > (b).end)
@@ -54,6 +55,8 @@ bsw2opt_t *bsw2_init_opt()
 	o->z = 1; o->is = 3; o->t_seeds = 5; o->hard_clip = 0; o->skip_sw = 0;
 	o->mask_level = 0.50f; o->coef = 5.5f;
 	o->qr = o->q + o->r; o->n_threads = 1; o->chunk_size = 10000000;
+	o->max_chain_gap = 10000;
+	o->cpy_cmt = 0;
 	return o;
 }
 
@@ -184,14 +187,14 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8
 		bsw2aux_t *q = b->aux + i;
 		uint8_t *query;
 		bwtint_t k;
-		int score, path_len, beg, end;
+		int path_len, beg, end;
 		if (p->l) continue;
 		beg = (p->flag & 0x10)? lq - p->end : p->beg;
 		end = (p->flag & 0x10)? lq - p->beg : p->end;
 		query = seq[(p->flag & 0x10)? 1 : 0] + beg;
 		for (k = p->k; k < p->k + p->len; ++k) // in principle, no out-of-boundary here
 			target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3;
-		score = aln_global_core(target, p->len, query, end - beg, &par, path, &path_len);
+		aln_global_core(target, p->len, query, end - beg, &par, path, &path_len);
 		q->cigar = aln_path2cigar32(path, path_len, &q->n_cigar);
 #if 0
 		if (name && score != p->G) { // debugging only
@@ -227,7 +230,7 @@ void bsw2_debug_hits(const bwtsw2_t *b)
 	for (i = 0; i < b->n; ++i) {
 		bsw2hit_t *p = b->hits + i;
 		if (p->G > 0)
-			printf("G=%d, len=%d, [%d,%d), k=%lu, l=%lu, #seeds=%d, is_rev=%d\n", p->G, p->len, p->beg, p->end, (long)p->k, (long)p->l, p->n_seeds, p->is_rev);
+			printf("G=%d, G2=%d, len=%d, [%d,%d), k=%lu, l=%lu, #seeds=%d, is_rev=%d\n", p->G, p->G2, p->len, p->beg, p->end, (long)p->k, (long)p->l, p->n_seeds, p->is_rev);
 	}
 }
 
@@ -286,12 +289,13 @@ static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8
 		}
 	}
 	b[0] = bb[0][1]; b[1] = bb[1][1]; // bb[*][1] are "narrow SA hits"
-	bsw2_chain_filter(opt, l, b);
+	bsw2_chain_filter(opt, l, b); // NB: only unique seeds are chained
 	for (k = 0; k < 2; ++k) {
 		bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, pool->aln_mem);
 		merge_hits(bb[k], l, 0); // bb[k][1] is merged to bb[k][0] here
 		bsw2_resolve_duphits(0, 0, bb[k][0], 0);
 		bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, pool->aln_mem);
+		bsw2_resolve_duphits(0, 0, bb[k][0], 0);
 		b[k] = bb[k][0];
 		free(bb[k]);		
 	}
@@ -549,7 +553,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks
 				if (p->flag&0x10) kputc(ks->qual[ks->l - 1 - j], &str);
 				else kputc(ks->qual[j], &str);
 			}
-		} else ksprintf(&str, "\t*");
+		} else kputs("\t*", &str);
 		// print optional tags
 		ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d\tNM:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds, q->nm);
 		if (q->nn) ksprintf(&str, "\tXN:i:%d", q->nn);
@@ -557,6 +561,12 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks
 		if (p->flag&BSW2_FLAG_MATESW) type |= 1;
 		if (p->flag&BSW2_FLAG_TANDEM) type |= 2;
 		if (type) ksprintf(&str, "\tXT:i:%d", type);
+		if (opt->cpy_cmt && ks->comment) {
+			int l = strlen(ks->comment);
+			if (l >= 6 && ks->comment[2] == ':' && ks->comment[4] == ':') {
+				kputc('\t', &str); kputs(ks->comment, &str);
+			}
+		}
 		kputc('\n', &str);
 	}
 	ks->sam = str.s;
@@ -747,23 +757,14 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *
 	_seq->n = 0;
 }
 
-static void kseq_to_bsw2seq(const kseq_t *ks, bsw2seq1_t *p)
-{
-	p->tid = -1;
-	p->l = ks->seq.l;
-	p->name = strdup(ks->name.s);
-	p->seq = strdup(ks->seq.s);
-	p->qual = ks->qual.l? strdup(ks->qual.s) : 0;
-	p->sam = 0;
-}
-
 void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2)
 {
 	gzFile fp, fp2;
 	kseq_t *ks, *ks2;
-	int l, size = 0, is_pe = 0;
+	int l, is_pe = 0, i, n;
 	uint8_t *pac;
 	bsw2seq_t *_seq;
+	bseq1_t *bseq;
 
 	pac = calloc(bns->l_pac/4+1, 1);
 	if (pac == 0) {
@@ -781,34 +782,25 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c
 		ks2 = kseq_init(fp2);
 		is_pe = 1;
 	} else fp2 = 0, ks2 = 0, is_pe = 0;
-	while (kseq_read(ks) >= 0) {
-		if (ks->name.l > 2 && ks->name.s[ks->name.l-2] == '/')
-			ks->name.l -= 2, ks->name.s[ks->name.l] = 0;
-		if (_seq->n == _seq->max) {
-			_seq->max = _seq->max? _seq->max<<1 : 1024;
+	while ((bseq = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) {
+		int size = 0;
+		if (n > _seq->max) {
+			_seq->max = n;
+			kroundup32(_seq->max);
 			_seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t));
 		}
-		kseq_to_bsw2seq(ks, &_seq->seq[_seq->n++]);
-		size += ks->seq.l;
-		if (ks2) {
-			if (kseq_read(ks2) >= 0) {
-				if (ks2->name.l > 2 && ks2->name.s[ks2->name.l-2] == '/')
-					ks2->name.l -= 2, ks2->name.s[ks2->name.l] = 0;
-				kseq_to_bsw2seq(ks2, &_seq->seq[_seq->n++]); // for PE, _seq->n here must be odd and we do not need to enlarge
-				size += ks->seq.l;
-			} else {
-				fprintf(stderr, "[%s] The second query file has fewer reads. Switched to the single-end mode for the following batches.\n", __func__);
-				is_pe = 0;
-			}
-		}
-		if (size > opt->chunk_size * opt->n_threads) {
-			fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size);
-			process_seqs(_seq, opt, bns, pac, target, is_pe);
-			size = 0;
+		_seq->n = n;
+		for (i = 0; i < n; ++i) {
+			bseq1_t *b = &bseq[i];
+			bsw2seq1_t *p = &_seq->seq[i];
+			p->tid = -1; p->l = b->l_seq;
+			p->name = b->name; p->seq = b->seq; p->qual = b->qual; p->comment = b->comment; p->sam = 0;
+			size += p->l;
 		}
+		fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp) ...\n", n, size);
+		free(bseq);
+		process_seqs(_seq, opt, bns, pac, target, is_pe);
 	}
-	fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size);
-	process_seqs(_seq, opt, bns, pac, target, is_pe);
 	// free
 	free(pac);
 	free(_seq->seq); free(_seq);
diff --git a/bwtsw2_chain.c b/bwtsw2_chain.c
index c734657..381d0b7 100644
--- a/bwtsw2_chain.c
+++ b/bwtsw2_chain.c
@@ -23,15 +23,15 @@ static int chaining(const bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t
 			hsaip_t *q = chain + k;
 			int x = p->qbeg - q->qbeg; // always positive
 			int y = p->tbeg - q->tbeg;
-			if (y > 0 && x - y <= opt->bw && y - x <= opt->bw) {
+			if (y > 0 && x < opt->max_chain_gap && y < opt->max_chain_gap && x - y <= opt->bw && y - x <= opt->bw) { // chained
 				if (p->qend > q->qend) q->qend = p->qend;
 				if (p->tend > q->tend) q->tend = p->tend;
 				++q->chain;
 				p->chain = shift + k;
 				break;
-			}
+			} else if (q->chain > opt->t_seeds * 2) k = 0; // if the chain is strong enough, do not check the previous chains
 		}
-		if (k < 0) {
+		if (k < 0) { // not added to any previous chains
 			chain[m] = *p;
 			chain[m].chain = 1;
 			chain[m].idx = p->chain = shift + m;
@@ -44,7 +44,7 @@ static int chaining(const bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t
 void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2])
 {
 	hsaip_t *z[2], *chain[2];
-	int i, j, k, n[2], m[2];
+	int i, j, k, n[2], m[2], thres = opt->t_seeds * 2;
 	char *flag;
 	// initialization
 	n[0] = b[0]->n; n[1] = b[1]->n;
@@ -71,6 +71,7 @@ void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2])
 		int tmp = p->qbeg;
 		p->qbeg = len - p->qend; p->qend = len - tmp;
 	}
+	//for (k = 0; k < m[0]; ++k) printf("%d, [%d,%d), [%d,%d)\n", chain[0][k].chain, chain[0][k].tbeg, chain[0][k].tend, chain[0][k].qbeg, chain[0][k].qend);
 	// filtering
 	flag = calloc(m[0] + m[1], 1);
 	ks_introsort(hsaip, m[0] + m[1], chain[0]);
@@ -79,7 +80,7 @@ void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2])
 		for (j = 0; j < k; ++j) {
 			hsaip_t *q = chain[0] + j;
 			if (flag[q->idx]) continue;
-			if (q->qend >= p->qend && q->chain > p->chain * opt->t_seeds * 2) {
+			if (q->qend >= p->qend && q->chain > p->chain * thres && p->chain < thres) {
 				flag[p->idx] = 1;
 				break;
 			}
diff --git a/bwtsw2_main.c b/bwtsw2_main.c
index 50355fe..ab126f2 100644
--- a/bwtsw2_main.c
+++ b/bwtsw2_main.c
@@ -6,19 +6,17 @@
 #include "bwt.h"
 #include "bwtsw2.h"
 #include "utils.h"
+#include "bwa.h"
 
 int bwa_bwtsw2(int argc, char *argv[])
 {
-	extern char *bwa_infer_prefix(const char *hint);
 	bsw2opt_t *opt;
-	bwt_t *target;
-	char buf[1024], *prefix;
-	bntseq_t *bns;
+	bwaidx_t *idx;
 	int c;
 
 	opt = bsw2_init_opt();
 	srand48(11);
-	while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:MI:S")) >= 0) {
+	while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:MI:SG:C")) >= 0) {
 		switch (c) {
 		case 'q': opt->q = atoi(optarg); break;
 		case 'r': opt->r = atoi(optarg); break;
@@ -37,6 +35,8 @@ int bwa_bwtsw2(int argc, char *argv[])
 		case 'f': xreopen(optarg, "w", stdout); break;
 		case 'I': opt->max_ins = atoi(optarg); break;
 		case 'S': opt->skip_sw = 1; break;
+		case 'C': opt->cpy_cmt = 1; break;
+		case 'G': opt->max_chain_gap = atoi(optarg); break;
 		}
 	}
 	opt->qr = opt->q + opt->r;
@@ -54,6 +54,7 @@ int bwa_bwtsw2(int argc, char *argv[])
 		fprintf(stderr, "         -t INT   number of threads [%d]\n", opt->n_threads);
 		fprintf(stderr, "         -f FILE  file to output results to instead of stdout\n");
 		fprintf(stderr, "         -H       in SAM output, use hard clipping instead of soft clipping\n");
+		fprintf(stderr, "         -C       copy FASTA/Q comment to SAM output\n");
 		fprintf(stderr, "         -M       mark multi-part alignments as secondary\n");
 		fprintf(stderr, "         -S       skip Smith-Waterman read pairing\n");
 		fprintf(stderr, "         -I INT   ignore pairs with insert >=INT for inferring the size distr [%d]\n", opt->max_ins);
@@ -62,7 +63,8 @@ int bwa_bwtsw2(int argc, char *argv[])
 		fprintf(stderr, "         -c FLOAT coefficient of length-threshold adjustment [%.1f]\n", opt->coef);
 		fprintf(stderr, "         -z INT   Z-best [%d]\n", opt->z);
 		fprintf(stderr, "         -s INT   maximum seeding interval size [%d]\n", opt->is);
-		fprintf(stderr, "         -N INT   # seeds to trigger reverse alignment [%d]\n", opt->t_seeds);
+		fprintf(stderr, "         -N INT   # seeds to trigger rev aln; 2*INT is also the chaining threshold [%d]\n", opt->t_seeds);
+		fprintf(stderr, "         -G INT   maximum gap size during chaining [%d]\n", opt->max_chain_gap);
 		fprintf(stderr, "\n");
 		fprintf(stderr, "Note: For long Illumina, 454 and Sanger reads, assembly contigs, fosmids and\n");
 		fprintf(stderr, "      BACs, the default setting usually works well. For the current PacBio\n");
@@ -77,19 +79,10 @@ int bwa_bwtsw2(int argc, char *argv[])
 	opt->t *= opt->a;
 	opt->coef *= opt->a;
 
-	if ((prefix = bwa_infer_prefix(argv[optind])) == 0) {
-		fprintf(stderr, "[%s] fail to locate the index\n", __func__);
-		return 0;
-	}
-	strcpy(buf, prefix); target = bwt_restore_bwt(strcat(buf, ".bwt"));
-	strcpy(buf, prefix); bwt_restore_sa(strcat(buf, ".sa"), target);
-	bns = bns_restore(prefix);
-
-	bsw2_aln(opt, bns, target, argv[optind+1], optind+2 < argc? argv[optind+2] : 0);
-
-	bns_destroy(bns);
-	bwt_destroy(target);
-	free(opt); free(prefix);
+	if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 0;
+	bsw2_aln(opt, idx->bns, idx->bwt, argv[optind+1], optind+2 < argc? argv[optind+2] : 0);
+	bwa_idx_destroy(idx);
+	free(opt);
 	
 	return 0;
 }
diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c
index a6f4d80..cf29087 100644
--- a/bwtsw2_pair.c
+++ b/bwtsw2_pair.c
@@ -6,6 +6,7 @@
 #include "bntseq.h"
 #include "bwtsw2.h"
 #include "kstring.h"
+#include "utils.h"
 #ifndef _NO_SSE2
 #include "ksw.h"
 #else
@@ -24,7 +25,6 @@ typedef struct {
 
 bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins)
 {
-	extern void ks_introsort_uint64_t(size_t n, uint64_t *a);
 	int i, k, x, p25, p50, p75, tmp, max_len = 0;
 	uint64_t *isize;
 	bsw2pestat_t r;
@@ -44,7 +44,7 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins)
 		max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg;
 		isize[k++] = l;
 	}
-	ks_introsort_uint64_t(k, isize);
+	ks_introsort_64(k, isize);
 	p25 = isize[(int)(.25 * k + .499)];
 	p50 = isize[(int)(.50 * k + .499)];
 	p75 = isize[(int)(.75 * k + .499)];
@@ -74,9 +74,9 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins)
 	r.low  = tmp > max_len? tmp : max_len;
 	if (r.low < 1) r.low = 1;
 	r.high = (int)(p75 + 3. * (p75 - p25) + .499);
-	if (r.low > r.avg - MAX_STDDEV * 4.) r.low = (int)(r.avg - MAX_STDDEV * 4. + .499);
+	if (r.low > r.avg - MAX_STDDEV * r.std) r.low = (int)(r.avg - MAX_STDDEV * r.std + .499);
 	r.low = tmp > max_len? tmp : max_len;
-	if (r.high < r.avg - MAX_STDDEV * 4.) r.high = (int)(r.avg + MAX_STDDEV * 4. + .499);
+	if (r.high < r.avg - MAX_STDDEV * r.std) r.high = (int)(r.avg + MAX_STDDEV * r.std + .499);
 	ksprintf(msg, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high);
 	free(isize);
 	return r;
@@ -127,35 +127,24 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b
 			seq[i] = nst_nt4_table[(int)mseq[i]];
 	}
 #ifndef _NO_SSE2
-	{
-		ksw_query_t *q;
-		ksw_aux_t aux[2];
-		// forward Smith-Waterman
-		aux[0].T = opt->t; aux[0].gapo = opt->q; aux[0].gape = opt->r; aux[1] = aux[0];
-		q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat);
-		ksw_sse2(q, end - beg, ref, &aux[0]);
-		free(q);
-		if (aux[0].score < opt->t) {
-			free(seq);
-			return;
-		}
-		++aux[0].qe; ++aux[0].te;
-		// reverse Smith-Waterman
-		seq_reverse(aux[0].qe, seq, 0);
-		seq_reverse(aux[0].te, ref, 0);
-		q = ksw_qinit(aux[0].qe * g_mat[0] < 250? 1 : 2, aux[0].qe, seq, 5, g_mat);
-		ksw_sse2(q, aux[0].te, ref, &aux[1]);
-		free(q);
-		++aux[1].qe; ++aux[1].te;
-		// write output
-		a->G = aux[0].score;
-		a->G2 = aux[0].score2 > aux[1].score2? aux[0].score2 : aux[1].score2;
+	{ // FIXME!!! The following block has not been tested since the update of the ksw library
+		int flag = KSW_XSUBO | KSW_XSTART | (l_mseq * g_mat[0] < 250? KSW_XBYTE : 0) | opt->t;
+		kswr_t aln;
+		aln = ksw_align(l_mseq, seq, end - beg, ref, 5, g_mat, opt->q, opt->r, flag, 0);
+		a->G = aln.score;
+		a->G2 = aln.score2;
+		if (a->G < opt->t) a->G = 0;
 		if (a->G2 < opt->t) a->G2 = 0;
 		if (a->G2) a->flag |= BSW2_FLAG_TANDEM;
-		a->k = beg + (aux[0].te - aux[1].te);
-		a->len = aux[1].te;
-		a->beg = aux[0].qe - aux[1].qe;
-		a->end = aux[0].qe;
+		a->k = beg + aln.tb;
+		a->len = aln.te - aln.tb + 1;
+		a->beg = aln.qb;
+		a->end = aln.qe + 1;
+		/*
+		printf("[Q] "); for (i = 0; i < l_mseq; ++i) putchar("ACGTN"[(int)seq[i]]); putchar('\n');
+		printf("[R] "); for (i = 0; i < end - beg; ++i) putchar("ACGTN"[(int)ref[i]]); putchar('\n');
+		printf("G=%d,G2=%d,beg=%d,end=%d,k=%lld,len=%d\n", a->G, a->G2, a->beg, a->end, a->k, a->len);
+		*/
 	}
 #else
 	{
@@ -168,6 +157,7 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b
 		a->G = aln_local_core(ref, end - beg, seq, l_mseq, &ap, path, 0, opt->t, &a->G2);
 		if (a->G < opt->t) a->G = 0;
 		if (a->G2 < opt->t) a->G2 = 0;
+		if (a->G2) a->flag |= BSW2_FLAG_TANDEM;
 		a->k = beg + path[0].i - 1;
 		a->len = path[1].i - path[0].i + 1;
 		a->beg = path[0].j - 1;
diff --git a/cs2nt.c b/cs2nt.c
deleted file mode 100644
index dfbce60..0000000
--- a/cs2nt.c
+++ /dev/null
@@ -1,191 +0,0 @@
-#include <string.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include "bwtaln.h"
-#include "stdaln.h"
-
-/*
-  Here is a delicate example. ref_nt=ATTAAC(RBRBG), read_cs=RBBOG. If we
-  decode as ATTGAC(RBGOG), there are one color change and one nt change;
-  if we decode as ATTAAC(RBRBG), there are two color changes.
-
-  In DP, if color quality is smaller than COLOR_MM, we will use COLOR_MM
-  as the penalty; otherwise, we will use color quality as the
-  penalty. This means we always prefer two consistent color changes over
-  a nt change, but if a color has high quality, we may prefer one nt
-  change.
-
-  In the above example, the penalties of the two types of decoding are
-  q(B)+25 and q(B)+q(O), respectively. If q(O)>25, we prefer the first;
-  otherwise the second. Note that no matter what we choose, the fourth
-  base will get a low nt quality.
- */
-
-#define COLOR_MM 19
-#define NUCL_MM  25
-
-static const int nst_ntnt2cs_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4 };
-
-/*
-  {A,C,G,T,N} -> {0,1,2,3,4}
-  nt_ref[0..size]: nucleotide reference: 0/1/2/3/4
-  cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N
-  nt_read[0..size]: nucleotide read sequence: 0/1/2/3 (returned)
-  btarray[0..4*size]: backtrack array (working space)
- */
-void cs2nt_DP(int size, const uint8_t *nt_ref, const uint8_t *cs_read, uint8_t *nt_read, uint8_t *btarray)
-{
-	int h[8], curr, last;
-	int x, y, xmin, hmin, k;
-
-	// h[0..3] and h[4..7] are the current and last best score array, depending on curr and last
-
-	// recursion: initial value
-	if (nt_ref[0] >= 4) memset(h, 0, sizeof(int) << 2);
-	else {
-		for (x = 0; x != 4; ++x) h[x] = NUCL_MM;
-		h[nt_ref[0]] = 0;
-	}
-	// recursion: main loop
-	curr = 1; last = 0;
-	for (k = 1; k <= size; ++k) {
-		for (x = 0; x != 4; ++x) {
-			int min = 0x7fffffff, ymin = 0;
-			for (y = 0; y != 4; ++y) {
-				int s = h[last<<2|y];
-				if ((cs_read[k-1]&0x3f) != 63 && cs_read[k-1]>>6 != nst_ntnt2cs_table[1<<x|1<<y])
-					s += ((cs_read[k-1]&0x3f) < COLOR_MM)? COLOR_MM : (cs_read[k-1]&0x3f); // color mismatch
-				if (nt_ref[k] < 4 && nt_ref[k] != x) s += NUCL_MM; // nt mismatch
-				if (s < min) {
-					min = s; ymin = y;
-				}
-			}
-			h[curr<<2|x] = min; btarray[k<<2|x] = ymin;
-		}
-		last = curr; curr = 1 - curr; // swap
-	}
-	// back trace
-	hmin = 0x7fffffff; xmin = 0;
-	for (x = 0; x != 4; ++x) {
-		if (h[last<<2|x] < hmin) {
-			hmin = h[last<<2|x]; xmin = x;
-		}
-	}
-	nt_read[size] = xmin;
-	for (k = size - 1; k >= 0; --k)
-		nt_read[k] = btarray[(k+1)<<2 | nt_read[k+1]];
-}
-/*
-  nt_read[0..size]: nucleotide read sequence: 0/1/2/3
-  cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N
-  tarray[0..size*2-1]: temporary array
- */
-uint8_t *cs2nt_nt_qual(int size, const uint8_t *nt_read, const uint8_t *cs_read, uint8_t *tarray)
-{
-	int k, c1, c2;
-	uint8_t *t2array = tarray + size;
-	// get the color sequence of nt_read
-	c1 = nt_read[0];
-	for (k = 1; k <= size; ++k) {
-		c2 = nt_read[k]; // in principle, there is no 'N' in nt_read[]; just in case
-		tarray[k-1] = (c1 >= 4 || c2 >= 4)? 4 : nst_ntnt2cs_table[1<<c1 | 1<<c2];
-		c1 = c2;
-	}
-	for (k = 1; k != size; ++k) {
-		int q = 0;
-		if (tarray[k-1] == cs_read[k-1]>>6 && tarray[k] == cs_read[k]>>6) {
-			q = (int)(cs_read[k-1]&0x3f) + (int)(cs_read[k]&0x3f) + 10;
-		} else if (tarray[k-1] == cs_read[k-1]>>6) {
-			q = (int)(cs_read[k-1]&0x3f) - (int)(cs_read[k]&0x3f);
-		} else if (tarray[k] == cs_read[k]>>6) {
-			q = (int)(cs_read[k]&0x3f) - (int)(cs_read[k-1]&0x3f);
-		} // else, q = 0
-		if (q < 0) q = 0;
-		if (q > 60) q = 60;
-		t2array[k] = nt_read[k]<<6 | q;
-		if ((cs_read[k-1]&0x3f) == 63 || (cs_read[k]&0x3f) == 63) t2array[k] = 0;
-	}
-	return t2array + 1; // of size-2
-}
-
-// this function will be called when p->seq has been reversed by refine_gapped()
-void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac)
-{
-	uint8_t *ta, *nt_read, *btarray, *tarray, *nt_ref, *cs_read, *new_nt_read;
-	int i, len;
-	uint8_t *seq;
-
-	// set temporary arrays
-	if (p->type == BWA_TYPE_NO_MATCH) return;
-	len = p->len + p->n_gapo + p->n_gape + 100; // leave enough space
-	ta = (uint8_t*)malloc(len * 7);
-	nt_ref = ta;
-	cs_read = nt_ref + len;
-	nt_read = cs_read + len;
-	btarray = nt_read + len;
-	tarray = nt_read + len;
-
-#define __gen_csbase(_cs, _i, _seq) do {							\
-		int q = p->qual[p->strand? p->len - 1 - (_i) : (_i)] - 33;	\
-		if (q > 60) q = 60;											\
-		if (_seq[_i] > 3) q = 63;									\
-		(_cs) = _seq[_i]<<6 | q;									\
-	} while (0)
-
-	// generate len, nt_ref[] and cs_read
-	seq = p->strand? p->rseq : p->seq;
-	nt_ref[0] = p->pos? bns_pac(pac, p->pos-1) : 4;
-	if (p->cigar == 0) { // no gap or clipping
-		len = p->len;
-		for (i = 0; i < p->len; ++i) {
-			__gen_csbase(cs_read[i], i, seq);
-			nt_ref[i+1] = bns_pac(pac, p->pos + i);
-		}
-	} else {
-		int k, z;
-		bwtint_t x, y;
-		x = p->pos; y = 0;
-		for (k = z = 0; k < p->n_cigar; ++k) {
-			int l = __cigar_len(p->cigar[k]);
-			if (__cigar_op(p->cigar[k]) == FROM_M) {
-				for (i = 0; i < l; ++i, ++x, ++y) {
-					__gen_csbase(cs_read[z], y, seq);
-					nt_ref[z+1] = bns_pac(pac, x);
-					++z;
-				}
-			} else if (__cigar_op(p->cigar[k]) == FROM_I) {
-				for (i = 0; i < l; ++i, ++y) {
-					__gen_csbase(cs_read[z], y, seq);
-					nt_ref[z+1] = 4;
-					++z;
-				}
-			} else if (__cigar_op(p->cigar[k]) == FROM_S) y += l;
-			else x += l;
-		}
-		len = z;
-	}
-
-	cs2nt_DP(len, nt_ref, cs_read, nt_read, btarray);
-	new_nt_read = cs2nt_nt_qual(len, nt_read, cs_read, tarray);
-
-	// update p
-	p->len = p->full_len = len - 1;
-	for (i = 0; i < p->len; ++i) {
-		if ((new_nt_read[i]&0x3f) == 63) {
-			p->qual[i] = 33; seq[i] = 4;
-		} else {
-			p->qual[i] = (new_nt_read[i]&0x3f) + 33;
-			seq[i] = new_nt_read[i]>>6;
-		}
-	}
-	p->qual[p->len] = seq[p->len] = 0;
-	if (p->strand) {
-		memcpy(p->seq, seq, p->len);
-		seq_reverse(p->len, p->seq, 1);
-		seq_reverse(p->len, p->qual, 0);
-	} else {
-		memcpy(p->rseq, seq, p->len);
-		seq_reverse(p->len, p->rseq, 1);
-	}
-	free(ta);
-}
diff --git a/example.c b/example.c
new file mode 100644
index 0000000..6564cbd
--- /dev/null
+++ b/example.c
@@ -0,0 +1,67 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <zlib.h>
+#include <string.h>
+#include <assert.h>
+#include "bwamem.h"
+#include "kseq.h" // for the FASTA/Q parser
+KSEQ_DECLARE(gzFile)
+
+/*
+ * Minimal example of the BWA-MEM API: load an index, align every read of a
+ * FASTA/Q file ("-" reads stdin) and print one line per non-secondary hit with
+ * the read name, strand, reference name, position, mapping quality, CIGAR and
+ * edit distance. Returns 0 on success and 1 on a usage or open/load error.
+ */
+int main(int argc, char *argv[])
+{
+	bwaidx_t *idx;
+	gzFile fp;
+	kseq_t *ks;
+	mem_opt_t *opt;
+
+	if (argc < 3) {
+		fprintf(stderr, "Usage: bwamem-lite <idx.base> <reads.fq>\n");
+		return 1;
+	}
+
+	// Fail with a message instead of assert(): assertions vanish under NDEBUG,
+	// which would leave a null index or stream to be dereferenced below.
+	idx = bwa_idx_load(argv[1], BWA_IDX_ALL); // load the BWA index
+	if (idx == 0) {
+		fprintf(stderr, "[E::%s] fail to load the index '%s'\n", __func__, argv[1]);
+		return 1;
+	}
+	fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r");
+	if (fp == 0) {
+		fprintf(stderr, "[E::%s] fail to open '%s'\n", __func__, argv[2]);
+		bwa_idx_destroy(idx);
+		return 1;
+	}
+	ks = kseq_init(fp); // initialize the FASTA/Q parser
+	opt = mem_opt_init(); // initialize the BWA-MEM parameters to the default values
+
+	while (kseq_read(ks) >= 0) { // read one sequence
+		mem_alnreg_v ar;
+		int i, k;
+		ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac, ks->seq.l, ks->seq.s); // get all the hits
+		for (i = 0; i < ar.n; ++i) { // traverse each hit
+			mem_aln_t a;
+			if (ar.a[i].secondary >= 0) continue; // skip secondary alignments
+			a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, (uint8_t*)ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR
+			// print alignment; pos is cast so the argument always matches %ld, whatever integer type it is declared as
+			printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq);
+			for (k = 0; k < a.n_cigar; ++k) // print CIGAR
+				printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]);
+			printf("\t%d\n", a.NM); // print edit distance
+			free(a.cigar); // don't forget to deallocate CIGAR
+		}
+		free(ar.a); // and deallocate the hit list
+	}
+
+	free(opt);
+	kseq_destroy(ks);
+	gzclose(fp);
+	bwa_idx_destroy(idx);
+	return 0;
+}
diff --git a/fastmap.c b/fastmap.c
index 4d7a675..56cfb01 100644
--- a/fastmap.c
+++ b/fastmap.c
@@ -2,91 +2,169 @@
 #include <stdio.h>
 #include <unistd.h>
 #include <stdlib.h>
-#include "bntseq.h"
-#include "bwt.h"
+#include "bwa.h"
+#include "bwamem.h"
 #include "kvec.h"
+#include "utils.h"
 #include "kseq.h"
-KSEQ_INIT(gzFile, gzread)
+KSEQ_DECLARE(gzFile)
 
 extern unsigned char nst_nt4_table[256];
 
-typedef struct {
-	const bwt_t *bwt;
-	const uint8_t *query;
-	int start, len;
-	bwtintv_v *tmpvec[2], *matches;
-} smem_i;
+void *kopen(const char *fn, int *_fd);
+int kclose(void *a);
 
-smem_i *smem_iter_init(const bwt_t *bwt)
+/*
+ * Entry point of the "bwa mem" sub-command: parses the options into a
+ * mem_opt_t, loads the index, prints the SAM header and then feeds the query
+ * file(s), one batch of reads at a time, to mem_process_seqs().
+ * Returns 0 on success; returns 1 after printing the usage text, when
+ * bwa_set_rg() returns NULL for the -R argument, or when the index cannot
+ * be loaded.
+ */
+int main_mem(int argc, char *argv[])
 {
-	smem_i *iter;
-	iter = calloc(1, sizeof(smem_i));
-	iter->bwt = bwt;
-	iter->tmpvec[0] = calloc(1, sizeof(bwtintv_v));
-	iter->tmpvec[1] = calloc(1, sizeof(bwtintv_v));
-	iter->matches   = calloc(1, sizeof(bwtintv_v));
-	return iter;
-}
+	mem_opt_t *opt;
+	int fd, fd2, i, c, n, copy_comment = 0;
+	gzFile fp, fp2 = 0;
+	kseq_t *ks, *ks2 = 0;
+	bseq1_t *seqs;
+	bwaidx_t *idx;
+	char *rg_line = 0;
+	void *ko = 0, *ko2 = 0;
 
-void smem_iter_destroy(smem_i *iter)
-{
-	free(iter->tmpvec[0]->a);
-	free(iter->tmpvec[1]->a);
-	free(iter->matches->a);
-	free(iter);
-}
+	opt = mem_opt_init();
+	while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:")) >= 0) {
+		if (c == 'k') opt->min_seed_len = atoi(optarg);
+		else if (c == 'w') opt->w = atoi(optarg);
+		else if (c == 'A') opt->a = atoi(optarg);
+		else if (c == 'B') opt->b = atoi(optarg);
+		else if (c == 'O') opt->q = atoi(optarg);
+		else if (c == 'E') opt->r = atoi(optarg);
+		else if (c == 'L') opt->pen_clip = atoi(optarg);
+		else if (c == 'U') opt->pen_unpaired = atoi(optarg);
+		else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1;
+		else if (c == 'P') opt->flag |= MEM_F_NOPAIRING;
+		else if (c == 'H') opt->flag |= MEM_F_HARDCLIP;
+		else if (c == 'a') opt->flag |= MEM_F_ALL;
+		else if (c == 'p') opt->flag |= MEM_F_PE;
+		else if (c == 'M') opt->flag |= MEM_F_NO_MULTI;
+		else if (c == 'c') opt->max_occ = atoi(optarg);
+		else if (c == 'v') bwa_verbose = atoi(optarg);
+		else if (c == 'r') opt->split_factor = atof(optarg);
+		else if (c == 'C') copy_comment = 1;
+		else if (c == 'R') {
+			if ((rg_line = bwa_set_rg(optarg)) == 0) {
+				free(opt); // do not leak the options on the error path
+				return 1;
+			}
+		} else if (c == 's') opt->split_width = atoi(optarg);
+	}
+	if (opt->n_threads < 1) opt->n_threads = 1;
+	if (optind + 1 >= argc) {
+		fprintf(stderr, "\n");
+		fprintf(stderr, "Usage: bwa mem [options] <idxbase> <in1.fq> [in2.fq]\n\n");
+		fprintf(stderr, "Algorithm options:\n\n");
+		fprintf(stderr, "       -t INT     number of threads [%d]\n", opt->n_threads);
+		fprintf(stderr, "       -k INT     minimum seed length [%d]\n", opt->min_seed_len);
+		fprintf(stderr, "       -w INT     band width for banded alignment [%d]\n", opt->w);
+		fprintf(stderr, "       -r FLOAT   look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor);
+//		fprintf(stderr, "       -s INT     look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width);
+		fprintf(stderr, "       -c INT     skip seeds with more than INT occurrences [%d]\n", opt->max_occ);
+		fprintf(stderr, "       -P         skip pairing; perform mate SW only\n");
+		fprintf(stderr, "       -A INT     score for a sequence match [%d]\n", opt->a);
+		fprintf(stderr, "       -B INT     penalty for a mismatch [%d]\n", opt->b);
+		fprintf(stderr, "       -O INT     gap open penalty [%d]\n", opt->q);
+		fprintf(stderr, "       -E INT     gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r);
+		fprintf(stderr, "       -L INT     penalty for clipping [%d]\n", opt->pen_clip);
+		fprintf(stderr, "       -U INT     penalty for an unpaired read pair [%d]\n", opt->pen_unpaired);
+		fprintf(stderr, "\nInput/output options:\n\n");
+		fprintf(stderr, "       -p         first query file consists of interleaved paired-end sequences\n");
+		fprintf(stderr, "       -R STR     read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
+		fprintf(stderr, "\n");
+		fprintf(stderr, "       -v INT     verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose);
+		fprintf(stderr, "       -a         output all alignments for SE or unpaired PE\n");
+		fprintf(stderr, "       -C         append FASTA/FASTQ comment to SAM output\n");
+		fprintf(stderr, "       -H         hard clipping\n");
+		fprintf(stderr, "       -M         mark shorter split hits as secondary (for Picard/GATK compatibility)\n");
+		fprintf(stderr, "\nNote: Please read the man page for detailed description of the command line and options.\n");
+		fprintf(stderr, "\n");
+		free(opt);
+		return 1;
+	}
 
-void smem_set_query(smem_i *iter, int len, const uint8_t *query)
-{
-	iter->query = query;
-	iter->start = 0;
-	iter->len = len;
-}
+	mem_fill_scmat(opt->a, opt->b, opt->mat);
+	if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) {
+		free(opt); // NOTE(review): ownership of rg_line is not visible here, so it is left untouched
+		return 1;
+	}
+	bwa_print_sam_hdr(idx->bns, rg_line);
 
-int smem_next(smem_i *iter)
-{
-	iter->tmpvec[0]->n = iter->tmpvec[1]->n = iter->matches->n = 0;
-	if (iter->start >= iter->len || iter->start < 0) return -1;
-	while (iter->start < iter->len && iter->query[iter->start] > 3) ++iter->start; // skip ambiguous bases
-	if (iter->start == iter->len) return -1;
-	iter->start = bwt_smem1(iter->bwt, iter->len, iter->query, iter->start, iter->matches, iter->tmpvec);
-	return iter->start;
+	ko = kopen(argv[optind + 1], &fd);
+	fp = gzdopen(fd, "r");
+	ks = kseq_init(fp);
+	if (optind + 2 < argc) {
+		if (opt->flag&MEM_F_PE) {
+			if (bwa_verbose >= 2)
+				fprintf(stderr, "[W::%s] when '-p' is in use, the second query file will be ignored.\n", __func__);
+		} else {
+			ko2 = kopen(argv[optind + 2], &fd2);
+			fp2 = gzdopen(fd2, "r");
+			ks2 = kseq_init(fp2);
+			opt->flag |= MEM_F_PE;
+		}
+	}
+	while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { // next batch; n receives the number of sequences read
+		int64_t size = 0;
+		if (!copy_comment) // without -C the FASTA/FASTQ comments are dropped before alignment
+			for (i = 0; i < n; ++i) {
+				free(seqs[i].comment); seqs[i].comment = 0;
+			}
+		for (i = 0; i < n; ++i) size += seqs[i].l_seq;
+		if (bwa_verbose >= 3)
+			fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size);
+		mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n, seqs);
+		free(seqs); // frees the array only; NOTE(review): per-read buffers are presumably released inside mem_process_seqs() - confirm
+	}
+
+	free(opt);
+	bwa_idx_destroy(idx);
+	kseq_destroy(ks);
+	gzclose(fp); kclose(ko);
+	if (ks2) {
+		kseq_destroy(ks2);
+		gzclose(fp2); kclose(ko2);
+	}
+	return 0;
 }
 
 int main_fastmap(int argc, char *argv[])
 {
-	int c, i, min_iwidth = 20, min_len = 17, print_seq = 0;
+	int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, split_width = 0;
 	kseq_t *seq;
 	bwtint_t k;
 	gzFile fp;
-	bwt_t *bwt;
-	bntseq_t *bns;
-	smem_i *iter;
+	smem_i *itr;
+	const bwtintv_v *a;
+	bwaidx_t *idx;
 
-	while ((c = getopt(argc, argv, "w:l:s")) >= 0) {
+	while ((c = getopt(argc, argv, "w:l:ps:")) >= 0) {
 		switch (c) {
-			case 's': print_seq = 1; break;
+			case 's': split_width = atoi(optarg); break;
+			case 'p': print_seq = 1; break;
 			case 'w': min_iwidth = atoi(optarg); break;
 			case 'l': min_len = atoi(optarg); break;
 		}
 	}
 	if (optind + 1 >= argc) {
-		fprintf(stderr, "Usage: bwa fastmap [-s] [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", min_len, min_iwidth);
+		fprintf(stderr, "Usage: bwa fastmap [-p] [-s splitWidth=%d] [-l minLen=%d] [-w maxSaSize=%d] <idxbase> <in.fq>\n", split_width, min_len, min_iwidth);
 		return 1;
 	}
 
 	fp = gzopen(argv[optind + 1], "r");
 	seq = kseq_init(fp);
-	{ // load the packed sequences, BWT and SA
-		char *tmp = calloc(strlen(argv[optind]) + 5, 1);
-		strcat(strcpy(tmp, argv[optind]), ".bwt");
-		bwt = bwt_restore_bwt(tmp);
-		strcat(strcpy(tmp, argv[optind]), ".sa");
-		bwt_restore_sa(tmp, bwt);
-		free(tmp);
-		bns = bns_restore(argv[optind]);
-	}
-	iter = smem_iter_init(bwt);
+	idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS);
+	itr = smem_itr_init(idx->bwt);
 	while (kseq_read(seq) >= 0) {
 		printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l);
 		if (print_seq) {
@@ -95,10 +159,10 @@ int main_fastmap(int argc, char *argv[])
 		} else putchar('\n');
 		for (i = 0; i < seq->seq.l; ++i)
 			seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]];
-		smem_set_query(iter, seq->seq.l, (uint8_t*)seq->seq.s);
-		while (smem_next(iter) > 0) {
-			for (i = 0; i < iter->matches->n; ++i) {
-				bwtintv_t *p = &iter->matches->a[i];
+		smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s);
+		while ((a = smem_next(itr, min_len<<1, split_width)) != 0) {
+			for (i = 0; i < a->n; ++i) {
+				bwtintv_t *p = &a->a[i];
 				if ((uint32_t)p->info - (p->info>>32) < min_len) continue;
 				printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]);
 				if (p->x[2] <= min_iwidth) {
@@ -106,10 +170,10 @@ int main_fastmap(int argc, char *argv[])
 						bwtint_t pos;
 						int len, is_rev, ref_id;
 						len  = (uint32_t)p->info - (p->info>>32);
-						pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev);
+						pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &is_rev);
 						if (is_rev) pos -= len - 1;
-						bns_cnt_ambi(bns, pos, len, &ref_id);
-						printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1);
+						bns_cnt_ambi(idx->bns, pos, len, &ref_id);
+						printf("\t%s:%c%ld", idx->bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - idx->bns->anns[ref_id].offset) + 1);
 					}
 				} else fputs("\t*", stdout);
 				putchar('\n');
@@ -118,9 +182,8 @@ int main_fastmap(int argc, char *argv[])
 		puts("//");
 	}
 
-	smem_iter_destroy(iter);
-	bns_destroy(bns);
-	bwt_destroy(bwt);
+	smem_itr_destroy(itr);
+	bwa_idx_destroy(idx);
 	kseq_destroy(seq);
 	gzclose(fp);
 	return 0;
diff --git a/kbtree.h b/kbtree.h
new file mode 100644
index 0000000..5ed5330
--- /dev/null
+++ b/kbtree.h
@@ -0,0 +1,384 @@
+/*-
+ * Copyright 1997-1999, 2001, John-Mark Gurney.
+ *           2008-2009, Attractive Chaos <attractor at live.co.uk>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef __AC_KBTREE_H
+#define __AC_KBTREE_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+typedef struct { /* node header; the key array starts 4 bytes after the node start (see __KB_KEY) */
+	int32_t is_internal:1, n:31; /* is_internal: set on nodes that carry child pointers; n: number of keys held by the node */
+} kbnode_t;
+
+#define	__KB_KEY(type, x)	((type*)((char*)x + 4)) /* key array of node x, located 4 bytes into the node */
+#define __KB_PTR(btr, x)	((kbnode_t**)((char*)x + btr->off_ptr)) /* child-pointer array of node x, at byte offset btr->off_ptr */
+
+#define __KB_TREE_T(name)	/* declares the tree handle type kbtree_<name>_t */	\
+	typedef struct {							\
+		kbnode_t *root;							\
+		int	off_key, off_ptr, ilen, elen;	/* off_ptr: byte offset of the child-pointer array; ilen/elen: allocation size of internal/leaf nodes; off_key is not assigned in the visible code */	\
+		int	n, t;	/* n = 2*t-1 is the key count at which a node gets split */	\
+		int	n_keys, n_nodes;	/* counters bumped on every put and on every node allocation */	\
+	} kbtree_##name##_t;
+
+#define __KB_INIT(name, key_t)	/* defines kb_init_<name>(size): allocate an empty tree; size is presumably the target node size in bytes - confirm */	\
+	kbtree_##name##_t *kb_init_##name(int size)							\
+	{																	\
+		kbtree_##name##_t *b;											\
+		b = (kbtree_##name##_t*)calloc(1, sizeof(kbtree_##name##_t));	/* NOTE(review): the result is not checked before use */	\
+		b->t = ((size - 4 - sizeof(void*)) / (sizeof(void*) + sizeof(key_t)) + 1) >> 1; \
+		if (b->t < 2) {	/* size too small for a usable node: release the handle and return 0 */	\
+			free(b); return 0;											\
+		}																\
+		b->n = 2 * b->t - 1;											\
+		b->off_ptr = 4 + b->n * sizeof(key_t);	/* child pointers follow the 4-byte header and the n key slots */	\
+		b->ilen = (4 + sizeof(void*) + b->n * (sizeof(void*) + sizeof(key_t)) + 3) >> 2 << 2; /* rounded up to a multiple of 4 */ \
+		b->elen = (b->off_ptr + 3) >> 2 << 2;							\
+		b->root = (kbnode_t*)calloc(1, b->ilen);	/* the root starts as an empty, zero-filled node */	\
+		++b->n_nodes;													\
+		return b;														\
+	}
+
+#define __kb_destroy(b) do {	/* frees every node of tree b and b itself, walking the tree with an explicit stack */	\
+		int i, max = 8;													\
+		kbnode_t *x, **top, **stack = 0;								\
+		if (b) {														\
+			top = stack = (kbnode_t**)calloc(max, sizeof(kbnode_t*));	\
+			*top++ = (b)->root;											\
+			while (top != stack) {										\
+				x = *--top;												\
+				if (x->is_internal == 0) { free(x); continue; }	/* no children to visit */	\
+				for (i = 0; i <= x->n; ++i)	/* n keys give n+1 child slots */	\
+					if (__KB_PTR(b, x)[i]) {							\
+						if (top - stack == max) {	/* stack full: double its capacity */	\
+							max <<= 1;									\
+							stack = (kbnode_t**)realloc(stack, max * sizeof(kbnode_t*)); \
+							top = stack + (max>>1);						\
+						}												\
+						*top++ = __KB_PTR(b, x)[i];						\
+					}													\
+				free(x);												\
+			}															\
+		}																\
+		free(b); free(stack);											\
+	} while (0)
+
+#define __kb_get_first(key_t, b, ret) do {	/* stores in ret the first key of the node reached by following child 0 from the root */	\
+		kbnode_t *__x = (b)->root;			\
+		while (__KB_PTR(b, __x)[0] != 0)	/* NOTE(review): also evaluated on leaf nodes, which are allocated with elen bytes only - confirm this read stays in bounds */	\
+			__x = __KB_PTR(b, __x)[0];		\
+		(ret) = __KB_KEY(key_t, __x)[0];	\
+	} while (0)
+
+#define __KB_GET_AUX0(name, key_t, __cmp)	/* defines __kb_get_aux_<name>(): locate key *k inside node x; returns -1 for an empty node */	\
+	static inline int __kb_get_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \
+	{																	\
+		int tr, *rr, begin, end, n = x->n >> 1;							\
+		if (x->n == 0) return -1;										\
+		if (__cmp(*k, __KB_KEY(key_t, x)[n]) < 0) {	/* compare with the middle key to pick the half to scan */	\
+			begin = 0; end = n;											\
+		} else { begin = n; end = x->n - 1; }							\
+		rr = r? r : &tr;	/* r may be NULL; the comparison result then goes to a scratch variable */	\
+		n = end;														\
+		while (n >= begin && (*rr = __cmp(*k, __KB_KEY(key_t, x)[n])) < 0) --n; /* scan downwards past keys greater than *k */ \
+		return n;														\
+	}
+
+#define __KB_GET_AUX1(name, key_t, __cmp)	/* defines __kb_getp_aux_<name>(): binary search for *k inside node x */	\
+	static inline int __kb_getp_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \
+	{																	\
+		int tr, *rr, begin = 0, end = x->n;								\
+		if (x->n == 0) return -1;	/* empty node; *r is left untouched */	\
+		rr = r? r : &tr;												\
+		while (begin < end) {	/* find the first key that is not less than *k */	\
+			int mid = (begin + end) >> 1;								\
+			if (__cmp(__KB_KEY(key_t, x)[mid], *k) < 0) begin = mid + 1; \
+			else end = mid;												\
+		}																\
+		if (begin == x->n) { *rr = 1; return x->n - 1; }	/* every key is less than *k */	\
+		if ((*rr = __cmp(*k, __KB_KEY(key_t, x)[begin])) < 0) --begin;	/* *rr == 0 signals an exact match at the returned index */	\
+		return begin;													\
+	}
+
+#define __KB_GET(name, key_t)	/* defines kb_getp_<name>() and kb_get_<name>(): look a key up, returning a pointer to the stored key or 0 */	\
+	static key_t *kb_getp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
+	{																	\
+		int i, r = 0;													\
+		kbnode_t *x = b->root;											\
+		while (x) {														\
+			i = __kb_getp_aux_##name(x, k, &r);							\
+			if (i >= 0 && r == 0) return &__KB_KEY(key_t, x)[i];		\
+			if (x->is_internal == 0) return 0;	/* reached a leaf without a match */	\
+			x = __KB_PTR(b, x)[i + 1];	/* descend into the child to the right of key i */	\
+		}																\
+		return 0;														\
+	}																	\
+	static inline key_t *kb_get_##name(kbtree_##name##_t *b, const key_t k) /* by-value convenience wrapper */ \
+	{																	\
+		return kb_getp_##name(b, &k);									\
+	}
+
+#define __KB_INTERVAL(name, key_t)	/* defines kb_intervalp_<name>() and kb_interval_<name>(): report the stored keys bracketing k */	\
+	static void kb_intervalp_##name(kbtree_##name##_t *b, const key_t * __restrict k, key_t **lower, key_t **upper)	\
+	{																	\
+		int i, r = 0;													\
+		kbnode_t *x = b->root;											\
+		*lower = *upper = 0;	/* an output stays 0 when no such key is met on the search path */	\
+		while (x) {														\
+			i = __kb_getp_aux_##name(x, k, &r);							\
+			if (i >= 0 && r == 0) {	/* exact match: both bounds point at the key itself */	\
+				*lower = *upper = &__KB_KEY(key_t, x)[i];				\
+				return;													\
+			}															\
+			if (i >= 0) *lower = &__KB_KEY(key_t, x)[i];				\
+			if (i < x->n - 1) *upper = &__KB_KEY(key_t, x)[i + 1];		\
+			if (x->is_internal == 0) return;							\
+			x = __KB_PTR(b, x)[i + 1];	/* keep narrowing the bounds one level down */	\
+		}																\
+	}																	\
+	static inline void kb_interval_##name(kbtree_##name##_t *b, const key_t k, key_t **lower, key_t **upper) /* by-value convenience wrapper */ \
+	{																	\
+		kb_intervalp_##name(b, &k, lower, upper);						\
+	}
+
+#define __KB_PUT(name, key_t, __cmp)	/* defines kb_putp_<name>() / kb_put_<name>() and their helpers; no check for an already-present equal key is made */	\
+	/* splits the full child y (child i of x) in two; x must be an internal node */	\
+	static void __kb_split_##name(kbtree_##name##_t *b, kbnode_t *x, int i, kbnode_t *y) \
+	{																	\
+		kbnode_t *z;													\
+		z = (kbnode_t*)calloc(1, y->is_internal? b->ilen : b->elen);	/* new right sibling, sized like y */	\
+		++b->n_nodes;													\
+		z->is_internal = y->is_internal;								\
+		z->n = b->t - 1;												\
+		memcpy(__KB_KEY(key_t, z), __KB_KEY(key_t, y) + b->t, sizeof(key_t) * (b->t - 1)); /* z receives the keys above y's middle key */ \
+		if (y->is_internal) memcpy(__KB_PTR(b, z), __KB_PTR(b, y) + b->t, sizeof(void*) * b->t); \
+		y->n = b->t - 1;												\
+		memmove(__KB_PTR(b, x) + i + 2, __KB_PTR(b, x) + i + 1, sizeof(void*) * (x->n - i)); \
+		__KB_PTR(b, x)[i + 1] = z;										\
+		memmove(__KB_KEY(key_t, x) + i + 1, __KB_KEY(key_t, x) + i, sizeof(key_t) * (x->n - i)); \
+		__KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[b->t - 1];	/* the middle key of y moves up into x */	\
+		++x->n;															\
+	}																	\
+	static void __kb_putp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k) \
+	{																	\
+		int i = x->n - 1;												\
+		if (x->is_internal == 0) {	/* leaf: shift the larger keys right and store *k */	\
+			i = __kb_getp_aux_##name(x, k, 0);							\
+			if (i != x->n - 1)											\
+				memmove(__KB_KEY(key_t, x) + i + 2, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
+			__KB_KEY(key_t, x)[i + 1] = *k;								\
+			++x->n;														\
+		} else {														\
+			i = __kb_getp_aux_##name(x, k, 0) + 1;						\
+			if (__KB_PTR(b, x)[i]->n == 2 * b->t - 1) {	/* child is full: split it before descending */	\
+				__kb_split_##name(b, x, i, __KB_PTR(b, x)[i]);			\
+				if (__cmp(*k, __KB_KEY(key_t, x)[i]) > 0) ++i;			\
+			}															\
+			__kb_putp_aux_##name(b, __KB_PTR(b, x)[i], k);				\
+		}																\
+	}																	\
+	static void kb_putp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
+	{																	\
+		kbnode_t *r, *s;												\
+		++b->n_keys;													\
+		r = b->root;													\
+		if (r->n == 2 * b->t - 1) {	/* full root: put a new root above it and split the old one */	\
+			++b->n_nodes;												\
+			s = (kbnode_t*)calloc(1, b->ilen);							\
+			b->root = s; s->is_internal = 1; s->n = 0;					\
+			__KB_PTR(b, s)[0] = r;										\
+			__kb_split_##name(b, s, 0, r);								\
+			r = s;														\
+		}																\
+		__kb_putp_aux_##name(b, r, k);									\
+	}																	\
+	static inline void kb_put_##name(kbtree_##name##_t *b, const key_t k) /* by-value convenience wrapper */ \
+	{																	\
+		kb_putp_##name(b, &k);											\
+	}
+
+
+#define __KB_DEL(name, key_t) /* generates __kb_delp_aux_, kb_delp_ and kb_del_ for tree type "name" */ \
+	static key_t __kb_delp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k, int s) \
+	{ /* removes one key below x and returns it; s=0: the key matching *k, s=1: the last key, s=2: the first key */ \
+		int yn, zn, i, r = 0;											\
+		kbnode_t *xp, *y, *z;											\
+		key_t kp;														\
+		if (x == 0) return *k;											\
+		if (s) { /* s can only be 0, 1 or 2 */							\
+			r = x->is_internal == 0? 0 : s == 1? 1 : -1;				\
+			i = s == 1? x->n - 1 : -1;									\
+		} else i = __kb_getp_aux_##name(x, k, &r); /* helper defined elsewhere; presumably r == 0 means key i equals *k -- confirm */ \
+		if (x->is_internal == 0) { /* leaf: take key i out and close the gap */ \
+			if (s == 2) ++i; /* i was -1: use the first key */			\
+			kp = __KB_KEY(key_t, x)[i]; /* NOTE(review): with s == 0, r is not checked here; looks like *k must be present -- confirm */ \
+			memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
+			--x->n;														\
+			return kp;													\
+		}																\
+		if (r == 0) { /* internal node and s == 0 (for s != 0, r is 1 or -1 here) */ \
+			if ((yn = __KB_PTR(b, x)[i]->n) >= b->t) { /* child i has >= t keys: replace key i with that subtree's last key */ \
+				xp = __KB_PTR(b, x)[i];									\
+				kp = __KB_KEY(key_t, x)[i];								\
+				__KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 1); \
+				return kp;												\
+			} else if ((zn = __KB_PTR(b, x)[i + 1]->n) >= b->t) { /* else child i+1 has >= t keys: use its first key */ \
+				xp = __KB_PTR(b, x)[i + 1];								\
+				kp = __KB_KEY(key_t, x)[i];								\
+				__KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 2); \
+				return kp;												\
+			} else if (yn == b->t - 1 && zn == b->t - 1) { /* both have t-1 keys: append *k and all of z to y, free z, continue in y */ \
+				y = __KB_PTR(b, x)[i]; z = __KB_PTR(b, x)[i + 1];		\
+				__KB_KEY(key_t, y)[y->n++] = *k;						\
+				memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, z), z->n * sizeof(key_t)); \
+				if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, z), (z->n + 1) * sizeof(void*)); \
+				y->n += z->n;											\
+				memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
+				memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \
+				--x->n;													\
+				free(z);												\
+				return __kb_delp_aux_##name(b, y, k, s);				\
+			}															\
+		}																\
+		++i; /* from here on, i is the index of the child to descend into */ \
+		if ((xp = __KB_PTR(b, x)[i])->n == b->t - 1) { /* that child has only t-1 keys: enlarge it before descending */ \
+			if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n >= b->t) { /* rotate one key in from the left sibling, through x */ \
+				memmove(__KB_KEY(key_t, xp) + 1, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \
+				if (xp->is_internal) memmove(__KB_PTR(b, xp) + 1, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \
+				__KB_KEY(key_t, xp)[0] = __KB_KEY(key_t, x)[i - 1];		\
+				__KB_KEY(key_t, x)[i - 1] = __KB_KEY(key_t, y)[y->n - 1]; \
+				if (xp->is_internal) __KB_PTR(b, xp)[0] = __KB_PTR(b, y)[y->n]; \
+				--y->n; ++xp->n;										\
+			} else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n >= b->t) { /* ... or from the right sibling */ \
+				__KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i];	\
+				__KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[0];			\
+				if (xp->is_internal) __KB_PTR(b, xp)[xp->n] = __KB_PTR(b, y)[0]; \
+				--y->n;													\
+				memmove(__KB_KEY(key_t, y), __KB_KEY(key_t, y) + 1, y->n * sizeof(key_t)); \
+				if (y->is_internal) memmove(__KB_PTR(b, y), __KB_PTR(b, y) + 1, (y->n + 1) * sizeof(void*)); \
+			} else if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n == b->t - 1) { /* merge xp into the left sibling and descend there */ \
+				__KB_KEY(key_t, y)[y->n++] = __KB_KEY(key_t, x)[i - 1];	\
+				memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, xp), xp->n * sizeof(key_t));	\
+				if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \
+				y->n += xp->n;											\
+				memmove(__KB_KEY(key_t, x) + i - 1, __KB_KEY(key_t, x) + i, (x->n - i) * sizeof(key_t)); \
+				memmove(__KB_PTR(b, x) + i, __KB_PTR(b, x) + i + 1, (x->n - i) * sizeof(void*)); \
+				--x->n;													\
+				free(xp);												\
+				xp = y;													\
+			} else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n == b->t - 1) { /* merge the right sibling into xp */ \
+				__KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i];	\
+				memmove(__KB_KEY(key_t, xp) + xp->n, __KB_KEY(key_t, y), y->n * sizeof(key_t));	\
+				if (xp->is_internal) memmove(__KB_PTR(b, xp) + xp->n, __KB_PTR(b, y), (y->n + 1) * sizeof(void*)); \
+				xp->n += y->n;											\
+				memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \
+				memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \
+				--x->n;													\
+				free(y);												\
+			}															\
+		}																\
+		return __kb_delp_aux_##name(b, xp, k, s);						\
+	}																	\
+	static key_t kb_delp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \
+	{ /* delete starting at the root; returns whatever __kb_delp_aux_##name() returned */ \
+		kbnode_t *x;													\
+		key_t ret;														\
+		ret = __kb_delp_aux_##name(b, b->root, k, 0);					\
+		--b->n_keys; /* decremented unconditionally */					\
+		if (b->root->n == 0 && b->root->is_internal) { /* root has no key left: child 0 becomes the new root */ \
+			--b->n_nodes;												\
+			x = b->root;												\
+			b->root = __KB_PTR(b, x)[0];								\
+			free(x);													\
+		}																\
+		return ret;														\
+	}																	\
+	static inline key_t kb_del_##name(kbtree_##name##_t *b, const key_t k) \
+	{ /* by-value wrapper around kb_delp_##name() */					\
+		return kb_delp_##name(b, &k);									\
+	}
+
+typedef struct { /* one frame of the explicit stack used by __kb_traverse() */
+	kbnode_t *x; /* node of this frame; 0 for the frame pushed below a leaf */
+	int i; /* index of the next child/key of x to handle */
+} __kbstack_t;
+
+#define __kb_traverse(key_t, b, __func) do { /* calls __func(key_t*) on every key of b: child i is walked before key i */ \
+		int __kmax = 8;													\
+		__kbstack_t *__kstack, *__kp;									\
+		__kp = __kstack = (__kbstack_t*)calloc(__kmax, sizeof(__kbstack_t)); \
+		__kp->x = (b)->root; __kp->i = 0;								\
+		for (;;) {														\
+			while (__kp->x && __kp->i <= __kp->x->n) { /* push child i of the top frame (x = 0 when the top is a leaf) */ \
+				if (__kp - __kstack == __kmax - 1) { /* stack is full: double it */ \
+					__kmax <<= 1;										\
+					__kstack = (__kbstack_t*)realloc(__kstack, __kmax * sizeof(__kbstack_t)); \
+					__kp = __kstack + (__kmax>>1) - 1; /* same frame, in the possibly moved array */ \
+				}														\
+				(__kp+1)->i = 0; (__kp+1)->x = __kp->x->is_internal? __KB_PTR(b, __kp->x)[__kp->i] : 0; \
+				++__kp;													\
+			}															\
+			--__kp; /* pop the exhausted frame */						\
+			if (__kp >= __kstack) {										\
+				if (__kp->x && __kp->i < __kp->x->n) __func(&__KB_KEY(key_t, __kp->x)[__kp->i]); \
+				++__kp->i;												\
+			} else break; /* the root frame itself was popped: done */	\
+		}																\
+		free(__kstack);													\
+	} while (0)
+
+#define KBTREE_INIT(name, key_t, __cmp) /* instantiates the type and all functions for keys of key_t compared with __cmp */ \
+	__KB_TREE_T(name)							\
+	__KB_INIT(name, key_t)						\
+	__KB_GET_AUX1(name, key_t, __cmp)			\
+	__KB_GET(name, key_t)						\
+	__KB_INTERVAL(name, key_t)					\
+	__KB_PUT(name, key_t, __cmp)				\
+	__KB_DEL(name, key_t)
+
+#define KB_DEFAULT_SIZE 512
+
+#define kbtree_t(name) kbtree_##name##_t /* type name generated for "name" */
+#define kb_init(name, s) kb_init_##name(s)
+#define kb_destroy(name, b) __kb_destroy(b) /* "name" is ignored here */
+#define kb_get(name, b, k) kb_get_##name(b, k)
+#define kb_put(name, b, k) kb_put_##name(b, k)
+#define kb_del(name, b, k) kb_del_##name(b, k)
+#define kb_interval(name, b, k, l, u) kb_interval_##name(b, k, l, u)
+#define kb_getp(name, b, k) kb_getp_##name(b, k) /* the *p variants forward k unchanged to the kb_*p_ functions */
+#define kb_putp(name, b, k) kb_putp_##name(b, k)
+#define kb_delp(name, b, k) kb_delp_##name(b, k)
+#define kb_intervalp(name, b, k, l, u) kb_intervalp_##name(b, k, l, u)
+
+#define kb_size(b) ((b)->n_keys) /* number of keys currently counted in the tree */
+
+#define kb_generic_cmp(a, b) (((b) < (a)) - ((a) < (b))) /* 1 if a > b, -1 if a < b, else 0; evaluates both arguments twice */
+#define kb_str_cmp(a, b) strcmp(a, b)
+
+#endif
diff --git a/khash.h b/khash.h
index de6be6d..2422044 100644
--- a/khash.h
+++ b/khash.h
@@ -1,6 +1,6 @@
 /* The MIT License
 
-   Copyright (c) 2008, 2009 by attractor <attractor at live.co.uk>
+   Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor at live.co.uk>
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
@@ -33,7 +33,6 @@ int main() {
 	khiter_t k;
 	khash_t(32) *h = kh_init(32);
 	k = kh_put(32, h, 5, &ret);
-	if (!ret) kh_del(32, h, k);
 	kh_value(h, k) = 10;
 	k = kh_get(32, h, 10);
 	is_missing = (k == kh_end(h));
@@ -47,6 +46,29 @@ int main() {
 */
 
 /*
+  2011-12-29 (0.2.7):
+
+    * Minor code clean up; no actual effect.
+
+  2011-09-16 (0.2.6):
+
+	* The capacity is a power of 2. This seems to dramatically improve the
+	  speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
+
+	   - http://code.google.com/p/ulib/
+	   - http://nothings.org/computer/judy/
+
+	* Allow to optionally use linear probing which usually has better
+	  performance for random input. Double hashing is still the default as it
+	  is more robust to certain non-random input.
+
+	* Added Wang's integer hash function (not used by default). This hash
+	  function is more robust to certain non-random input.
+
+  2011-02-14 (0.2.5):
+
+    * Allow to declare global functions.
+
   2009-09-26 (0.2.4):
 
     * Improve portability
@@ -86,11 +108,9 @@ int main() {
   @header
 
   Generic hash table library.
-
-  @copyright Heng Li
  */
 
-#define AC_VERSION_KHASH_H "0.2.4"
+#define AC_VERSION_KHASH_H "0.2.6"
 
 #include <stdlib.h>
 #include <string.h>
@@ -111,24 +131,14 @@ typedef unsigned long long khint64_t;
 #endif
 
 #ifdef _MSC_VER
-#define inline __inline
+#define kh_inline __inline
+#else
+#define kh_inline inline
 #endif
 
 typedef khint32_t khint_t;
 typedef khint_t khiter_t;
 
-#define __ac_HASH_PRIME_SIZE 32
-static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
-{
-  0ul,          3ul,          11ul,         23ul,         53ul,
-  97ul,         193ul,        389ul,        769ul,        1543ul,
-  3079ul,       6151ul,       12289ul,      24593ul,      49157ul,
-  98317ul,      196613ul,     393241ul,     786433ul,     1572869ul,
-  3145739ul,    6291469ul,    12582917ul,   25165843ul,   50331653ul,
-  100663319ul,  201326611ul,  402653189ul,  805306457ul,  1610612741ul,
-  3221225473ul, 4294967291ul
-};
-
 #define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
 #define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
 #define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
@@ -137,88 +147,128 @@ static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
 #define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
 #define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
 
+#ifdef KHASH_LINEAR
+#define __ac_inc(k, m) 1 /* linear probing: always step to the next bucket */
+#else
+#define __ac_inc(k, m) ((((k)>>3 ^ (k)<<3) | 1) & (m)) /* probe step from hash k under mask m; "| 1" keeps it odd; fully parenthesized so it is safe inside any expression */
+#endif
+
+#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4) /* number of khint32_t flag words for m buckets (16 buckets per word) */
+
+#ifndef kroundup32 /* rounds a 32-bit x up to a power of two, in place; x is evaluated several times */
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef kcalloc /* allocator hooks: each defaults to the C library unless defined before this point */
+#define kcalloc(N,Z) calloc(N,Z)
+#endif
+#ifndef kmalloc
+#define kmalloc(Z) malloc(Z)
+#endif
+#ifndef krealloc
+#define krealloc(P,Z) realloc(P,Z)
+#endif
+#ifndef kfree
+#define kfree(P) free(P)
+#endif
+
 static const double __ac_HASH_UPPER = 0.77;
 
-#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
-	typedef struct {													\
-		khint_t n_buckets, size, n_occupied, upper_bound;				\
-		khint32_t *flags;												\
-		khkey_t *keys;													\
-		khval_t *vals;													\
-	} kh_##name##_t;													\
-	static inline kh_##name##_t *kh_init_##name() {						\
-		return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t));		\
+#define __KHASH_TYPE(name, khkey_t, khval_t) /* declares the table struct kh_<name>_t */ \
+	typedef struct { \
+		khint_t n_buckets, size, n_occupied, upper_bound; /* slots; live keys; slots filled since the last rehash (live + deleted); n_occupied level at which kh_put resizes */ \
+		khint32_t *flags; /* 2 bits per bucket ("empty" and "deleted"), 16 buckets per word */ \
+		khkey_t *keys; \
+		khval_t *vals; /* only (re)allocated by kh_resize when kh_is_map is true */ \
+	} kh_##name##_t;
+
+#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) /* extern declarations for the functions defined by __KHASH_IMPL */ \
+	extern kh_##name##_t *kh_init_##name(void);							\
+	extern void kh_destroy_##name(kh_##name##_t *h);					\
+	extern void kh_clear_##name(kh_##name##_t *h);						\
+	extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); 	\
+	extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+	extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+	extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	SCOPE kh_##name##_t *kh_init_##name(void) {							\
+		return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t));		\
 	}																	\
-	static inline void kh_destroy_##name(kh_##name##_t *h)				\
+	SCOPE void kh_destroy_##name(kh_##name##_t *h)						\
 	{																	\
 		if (h) {														\
-			free(h->keys); free(h->flags);								\
-			free(h->vals);												\
-			free(h);													\
+			kfree((void *)h->keys); kfree(h->flags);					\
+			kfree((void *)h->vals);										\
+			kfree(h);													\
 		}																\
 	}																	\
-	static inline void kh_clear_##name(kh_##name##_t *h)				\
+	SCOPE void kh_clear_##name(kh_##name##_t *h)						\
 	{																	\
 		if (h && h->flags) {											\
-			memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \
+			memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
 			h->size = h->n_occupied = 0;								\
 		}																\
 	}																	\
-	static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
+	SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) 	\
 	{																	\
 		if (h->n_buckets) {												\
-			khint_t inc, k, i, last;									\
-			k = __hash_func(key); i = k % h->n_buckets;					\
-			inc = 1 + k % (h->n_buckets - 1); last = i;					\
+			khint_t inc, k, i, last, mask;								\
+			mask = h->n_buckets - 1;									\
+			k = __hash_func(key); i = k & mask;							\
+			inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \
 			while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
-				if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
-				else i += inc;											\
+				i = (i + inc) & mask; 									\
 				if (i == last) return h->n_buckets;						\
 			}															\
 			return __ac_iseither(h->flags, i)? h->n_buckets : i;		\
 		} else return 0;												\
 	}																	\
-	static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
-	{																	\
+	SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+	{ /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
 		khint32_t *new_flags = 0;										\
 		khint_t j = 1;													\
 		{																\
-			khint_t t = __ac_HASH_PRIME_SIZE - 1;						\
-			while (__ac_prime_list[t] > new_n_buckets) --t;				\
-			new_n_buckets = __ac_prime_list[t+1];						\
-			if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0;	\
-			else {														\
-				new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t));	\
-				memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \
-				if (h->n_buckets < new_n_buckets) {						\
-					h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
-					if (kh_is_map)										\
-						h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
-				}														\
+			kroundup32(new_n_buckets); 									\
+			if (new_n_buckets < 4) new_n_buckets = 4;					\
+			if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0;	/* requested size is too small */ \
+			else { /* hash table size to be changed (shrink or expand); rehash */ \
+				new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t));	\
+				if (!new_flags) return -1;								\
+				memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+				if (h->n_buckets < new_n_buckets) {	/* expand */		\
+					khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+					if (!new_keys) return -1;							\
+					h->keys = new_keys;									\
+					if (kh_is_map) {									\
+						khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+						if (!new_vals) return -1;						\
+						h->vals = new_vals;								\
+					}													\
+				} /* otherwise shrink */								\
 			}															\
 		}																\
-		if (j) {														\
+		if (j) { /* rehashing is needed */								\
 			for (j = 0; j != h->n_buckets; ++j) {						\
 				if (__ac_iseither(h->flags, j) == 0) {					\
 					khkey_t key = h->keys[j];							\
 					khval_t val;										\
+					khint_t new_mask;									\
+					new_mask = new_n_buckets - 1; 						\
 					if (kh_is_map) val = h->vals[j];					\
 					__ac_set_isdel_true(h->flags, j);					\
-					while (1) {											\
+					while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
 						khint_t inc, k, i;								\
 						k = __hash_func(key);							\
-						i = k % new_n_buckets;							\
-						inc = 1 + k % (new_n_buckets - 1);				\
-						while (!__ac_isempty(new_flags, i)) {			\
-							if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \
-							else i += inc;								\
-						}												\
+						i = k & new_mask;								\
+						inc = __ac_inc(k, new_mask);					\
+						while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \
 						__ac_set_isempty_false(new_flags, i);			\
-						if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \
+						if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
 							{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
 							if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
-							__ac_set_isdel_true(h->flags, i);			\
-						} else {										\
+							__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
+						} else { /* write the element and jump out of the loop */ \
 							h->keys[i] = key;							\
 							if (kh_is_map) h->vals[i] = val;			\
 							break;										\
@@ -226,35 +276,39 @@ static const double __ac_HASH_UPPER = 0.77;
 					}													\
 				}														\
 			}															\
-			if (h->n_buckets > new_n_buckets) {							\
-				h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
-				if (kh_is_map)											\
-					h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
+			if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
+				h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+				if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
 			}															\
-			free(h->flags);												\
+			kfree(h->flags); /* free the working space */				\
 			h->flags = new_flags;										\
 			h->n_buckets = new_n_buckets;								\
 			h->n_occupied = h->size;									\
 			h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
 		}																\
+		return 0;														\
 	}																	\
-	static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+	SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
 	{																	\
 		khint_t x;														\
-		if (h->n_occupied >= h->upper_bound) {							\
-			if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \
-			else kh_resize_##name(h, h->n_buckets + 1);					\
-		}																\
+		if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
+			if (h->n_buckets > (h->size<<1)) {							\
+				if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
+					*ret = -1; return h->n_buckets;						\
+				}														\
+			} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
+				*ret = -1; return h->n_buckets;							\
+			}															\
+		} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
 		{																\
-			khint_t inc, k, i, site, last;								\
-			x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \
-			if (__ac_isempty(h->flags, i)) x = i;						\
+			khint_t inc, k, i, site, last, mask = h->n_buckets - 1;		\
+			x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
+			if (__ac_isempty(h->flags, i)) x = i; /* for speed up */	\
 			else {														\
-				inc = 1 + k % (h->n_buckets - 1); last = i;				\
+				inc = __ac_inc(k, mask); last = i;						\
 				while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
 					if (__ac_isdel(h->flags, i)) site = i;				\
-					if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
-					else i += inc;										\
+					i = (i + inc) & mask; 								\
 					if (i == last) { x = site; break; }					\
 				}														\
 				if (x == h->n_buckets) {								\
@@ -263,20 +317,20 @@ static const double __ac_HASH_UPPER = 0.77;
 				}														\
 			}															\
 		}																\
-		if (__ac_isempty(h->flags, x)) {								\
+		if (__ac_isempty(h->flags, x)) { /* not present at all */		\
 			h->keys[x] = key;											\
 			__ac_set_isboth_false(h->flags, x);							\
 			++h->size; ++h->n_occupied;									\
 			*ret = 1;													\
-		} else if (__ac_isdel(h->flags, x)) {							\
+		} else if (__ac_isdel(h->flags, x)) { /* deleted */				\
 			h->keys[x] = key;											\
 			__ac_set_isboth_false(h->flags, x);							\
 			++h->size;													\
 			*ret = 2;													\
-		} else *ret = 0;												\
+		} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
 		return x;														\
 	}																	\
-	static inline void kh_del_##name(kh_##name##_t *h, khint_t x)		\
+	SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x)				\
 	{																	\
 		if (x != h->n_buckets && !__ac_iseither(h->flags, x)) {			\
 			__ac_set_isdel_true(h->flags, x);							\
@@ -284,6 +338,17 @@ static const double __ac_HASH_UPPER = 0.77;
 		}																\
 	}
 
+#define KHASH_DECLARE(name, khkey_t, khval_t) /* type plus extern prototypes only, no function bodies */ \
+	__KHASH_TYPE(name, khkey_t, khval_t) 								\
+	__KHASH_PROTOTYPES(name, khkey_t, khval_t)
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) /* type plus function bodies; SCOPE is placed in front of every function */ \
+	__KHASH_TYPE(name, khkey_t, khval_t) 								\
+	__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) /* KHASH_INIT2 with SCOPE = static kh_inline */ \
+	KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
 /* --- BEGIN OF HASH FUNCTIONS --- */
 
 /*! @function
@@ -311,10 +376,10 @@ static const double __ac_HASH_UPPER = 0.77;
   @param  s     Pointer to a null terminated string
   @return       The hash value
  */
-static inline khint_t __ac_X31_hash_string(const char *s)
+static kh_inline khint_t __ac_X31_hash_string(const char *s)
 {
-	khint_t h = *s;
-	if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
+	khint_t h = (khint_t)*s;
+	if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
 	return h;
 }
 /*! @function
@@ -328,9 +393,21 @@ static inline khint_t __ac_X31_hash_string(const char *s)
  */
 #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
 
+static kh_inline khint_t __ac_Wang_hash(khint_t key)
+{ /* integer hash: mixes the bits of key with shifts, additions and xors, and returns the result */
+    key += ~(key << 15);
+    key ^=  (key >> 10);
+    key +=  (key << 3);
+    key ^=  (key >> 6);
+    key += ~(key << 11);
+    key ^=  (key >> 16);
+    return key;
+}
+#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)(k)) /* hashes its own argument; it used to expand to an unrelated identifier "key" */
+
 /* --- END OF HASH FUNCTIONS --- */
 
-/* Other necessary macros... */
+/* Other convenient macros... */
 
 /*!
   @abstract Type of the hash table.
@@ -396,7 +473,6 @@ static inline khint_t __ac_X31_hash_string(const char *s)
  */
 #define kh_del(name, h, k) kh_del_##name(h, k)
 
-
 /*! @function
   @abstract     Test whether a bucket contains data.
   @param  h     Pointer to the hash table [khash_t(name)*]
@@ -455,6 +531,34 @@ static inline khint_t __ac_X31_hash_string(const char *s)
  */
 #define kh_n_buckets(h) ((h)->n_buckets)
 
+/*! @function
+  @abstract     Iterate over the entries in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  kvar  Variable to which key will be assigned
+  @param  vvar  Variable to which value will be assigned
+  @param  code  Block of code to execute
+ */
+#define kh_foreach(h, kvar, vvar, code) { khint_t __i;		\
+	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
+		if (!kh_exist(h,__i)) continue; /* skip buckets that hold no data */ \
+		(kvar) = kh_key(h,__i);								\
+		(vvar) = kh_val(h,__i);								\
+		code; /* break/continue in code act on this for loop */ \
+	} }
+
+/*! @function
+  @abstract     Iterate over the values in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  vvar  Variable to which value will be assigned
+  @param  code  Block of code to execute
+ */
+#define kh_foreach_value(h, vvar, code) { khint_t __i;		\
+	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
+		if (!kh_exist(h,__i)) continue; /* skip buckets that hold no data */ \
+		(vvar) = kh_val(h,__i);								\
+		code; /* break/continue in code act on this for loop */ \
+	} }
+
 /* More conenient interfaces */
 
 /*! @function
diff --git a/kopen.c b/kopen.c
new file mode 100644
index 0000000..8887932
--- /dev/null
+++ b/kopen.c
@@ -0,0 +1,345 @@
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <ctype.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#ifndef _WIN32
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#endif
+
+#ifdef _WIN32
+#define _KO_NO_NET
+#endif
+
+#ifndef _KO_NO_NET
+static int socket_wait(int fd, int is_read)
+{
+	/* Wait until fd is readable (is_read != 0) or writable (is_read == 0),
+	 * for at most 5 seconds. The return value is that of select(). */
+	struct timeval timeout = { 5, 0 };
+	fd_set watched;
+	int n_ready;
+	FD_ZERO(&watched);
+	FD_SET(fd, &watched);
+	n_ready = is_read? select(fd + 1, &watched, 0, 0, &timeout)
+					 : select(fd + 1, 0, &watched, 0, &timeout);
+	if (n_ready == -1) perror("select");
+	return n_ready;
+}
+
+static int socket_connect(const char *host, const char *port)
+{ /* connect a TCP socket to the first address found for host:port; returns the socket, or -1 after perror() */
+#define __err_connect(func) do { perror(func); if (fd != -1) close(fd); if (res) freeaddrinfo(res); return -1; } while (0)
+	/* the error macro releases whatever has been acquired so far: the socket and the address list */
+	int on = 1, fd = -1;
+	struct linger lng = { 0, 0 };
+	struct addrinfo hints, *res = 0;
+	memset(&hints, 0, sizeof(struct addrinfo));
+	hints.ai_family = AF_UNSPEC;
+	hints.ai_socktype = SOCK_STREAM;
+	if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo");
+	if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
+	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
+	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
+	if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
+	freeaddrinfo(res);
+	return fd;
+#undef __err_connect
+}
+
+static int http_open(const char *fn)
+{ /* GET the http:// URL fn (through $http_proxy if set); returns a socket positioned after the response header, or -1 */
+	char *p, *proxy, *q, *http_host, *host, *port, *path, *buf;
+	int fd, ret, l;
+
+	/* parse URL; adapted from khttp_parse_url() in knetfile.c */
+	if (strstr(fn, "http://") != fn) return -1; /* not an HTTP URL; 0 would be taken for descriptor 0 */
+	// set ->http_host
+	for (p = (char*)fn + 7; *p && *p != '/'; ++p);
+	l = p - fn - 7;
+	http_host = calloc(l + 1, 1);
+	strncpy(http_host, fn + 7, l);
+	http_host[l] = 0;
+	for (q = http_host; *q && *q != ':'; ++q);
+	if (*q == ':') *q++ = 0;
+	// get http_proxy
+	proxy = getenv("http_proxy");
+	// set host, port and path
+	if (proxy == 0) {
+		host = strdup(http_host); // when there is no proxy, server name is identical to http_host name.
+		port = strdup(*q? q : "80");
+		path = strdup(*p? p : "/");
+	} else {
+		host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
+		for (q = host; *q && *q != ':'; ++q);
+		if (*q == ':') *q++ = 0; 
+		port = strdup(*q? q : "80");
+		path = strdup(fn);
+	}
+
+	/* connect; adapted from khttp_connect() in knetfile.c */
+	buf = calloc(0x10000, 1); // 64KB: holds the request first, then the response header
+	fd = buf? socket_connect(host, port) : -1;
+	if (fd != -1) { // send the request; an oversized request or a failed/short write counts as a failed open
+		l = snprintf(buf, 0x10000, "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n", path, http_host);
+		if (l < 0 || l >= 0x10000 || write(fd, buf, l) != l) { close(fd); fd = -1; }
+	}
+	l = 0;
+	while (fd != -1 && l < 0xffff && read(fd, buf + l, 1) == 1) { // read HTTP header, bounded by the buffer; FIXME: bad efficiency
+		if (buf[l] == '\n' && l >= 3)
+			if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
+		++l;
+	}
+	if (buf) buf[l] = 0;
+	if (fd != -1 && l < 14) { // prematured header
+		close(fd);
+		fd = -1;
+	}
+	ret = fd != -1? strtol(buf + 8, &p, 0) : -1; // HTTP return code; it follows the 8-byte "HTTP/1.x"
+	if (fd != -1 && ret != 200) { // fd is closed at most once
+		close(fd);
+		fd = -1;
+	}
+	free(buf); free(http_host); free(host); free(port); free(path);
+	return fd;
+}
+
+typedef struct { /* state of one FTP control connection */
+	int max_response, ctrl_fd; /* bytes allocated for response; descriptor of the control socket */
+	char *response; /* last reply read by kftp_get_response(); grown with realloc() */
+} ftpaux_t;
+
+static int kftp_get_response(ftpaux_t *aux)
+{ /* read one FTP reply into aux->response; returns its numeric code, 0 if the socket is not readable, -1 on a short reply or out of memory */
+	unsigned char c;
+	int n = 0;
+	char *p;
+	if (socket_wait(aux->ctrl_fd, 1) <= 0) return 0;
+	while (read(aux->ctrl_fd, &c, 1) == 1) { // stops on EOF and on error alike; FIXME: this is *VERY BAD* for unbuffered I/O
+		if (n >= aux->max_response) {
+			int max = aux->max_response? aux->max_response<<1 : 256;
+			if ((p = realloc(aux->response, max)) == 0) return -1; // the old buffer is still owned by aux
+			aux->response = p; aux->max_response = max;
+		}
+		aux->response[n++] = c;
+		if (c == '\n') {
+			if (n >= 4 && isdigit((unsigned char)aux->response[0]) && isdigit((unsigned char)aux->response[1])
+				&& isdigit((unsigned char)aux->response[2]) && aux->response[3] != '-') break;
+			n = 0; // not the final "NNN " line of a reply: start over with the next line
+		}
+	}
+	if (n < 2) return -1;
+	aux->response[n-2] = 0;
+	return strtol(aux->response, &p, 0);
+}
+
+static int kftp_send_cmd(ftpaux_t *aux, const char *cmd, int is_get)
+{ /* send cmd on the control socket; returns the reply code if is_get, else 0; -1 if the command could not be sent */
+	if (socket_wait(aux->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
+	if (write(aux->ctrl_fd, cmd, strlen(cmd)) < 0) return -1; // do not wait for a reply to a command that never left
+	return is_get? kftp_get_response(aux) : 0;
+}
+
+static int ftp_open(const char *fn)
+{ /* anonymous passive-mode RETR of the ftp:// URL fn; returns the data socket, or -1 on any failure */
+	char *p, *host = 0, *port = 0, *retr = 0;
+	char host2[80], port2[10];
+	int v[6], l, fd = -1, ret, pasv_port, pasv_ip[4];
+	ftpaux_t aux;
+	
+	/* parse URL */
+	if (strstr(fn, "ftp://") != fn) return -1; /* -1, not 0: 0 is a valid descriptor */
+	for (p = (char*)fn + 6; *p && *p != '/'; ++p);
+	if (*p != '/') return -1; /* no path to retrieve */
+	l = p - fn - 6;
+	port = strdup("21");
+	host = calloc(l + 1, 1);
+	strncpy(host, fn + 6, l);
+	retr = calloc(strlen(p) + 8, 1);
+	sprintf(retr, "RETR %s\r\n", p);
+	
+	/* connect to ctrl */
+	memset(&aux, 0, sizeof(ftpaux_t));
+	aux.ctrl_fd = socket_connect(host, port);
+	if (aux.ctrl_fd == -1) goto ftp_open_end; /* fail to connect ctrl */
+
+	/* connect to the data stream */
+	kftp_get_response(&aux);
+	kftp_send_cmd(&aux, "USER anonymous\r\n", 1);
+	kftp_send_cmd(&aux, "PASS kopen@\r\n", 1);
+	kftp_send_cmd(&aux, "TYPE I\r\n", 1);
+	kftp_send_cmd(&aux, "PASV\r\n", 1);
+	for (p = aux.response; p && *p && *p != '('; ++p); /* response is still 0 if nothing was ever read */
+	if (p == 0 || *p != '(') goto ftp_open_end;
+	++p;
+	if (sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]) != 6) goto ftp_open_end;
+	for (l = 0; l < 6; ++l) if (v[l] < 0 || v[l] > 255) goto ftp_open_end; /* byte values only: keeps host2/port2 within bounds */
+	memcpy(pasv_ip, v, 4 * sizeof(int));
+	pasv_port = (v[4]<<8&0xff00) + v[5];
+	kftp_send_cmd(&aux, retr, 0);
+	sprintf(host2, "%d.%d.%d.%d", pasv_ip[0], pasv_ip[1], pasv_ip[2], pasv_ip[3]);
+	sprintf(port2, "%d", pasv_port);
+	fd = socket_connect(host2, port2);
+	if (fd == -1) goto ftp_open_end;
+	ret = kftp_get_response(&aux);
+	if (ret != 150) {
+		close(fd);
+		fd = -1;
+	}
+ftp_open_end:
+	if (aux.ctrl_fd != -1) close(aux.ctrl_fd); /* the control socket is released on every path, not only on success */
+	free(host); free(port); free(retr); free(aux.response);
+	return fd;
+}
+#endif /* !defined(_KO_NO_NET) */
+
+static char **cmd2argv(const char *cmd)
+{
+	/* Split cmd on whitespace into a NULL-terminated vector. All words share
+	 * one buffer, argv[0]: free(argv[0]), then free(argv). 0 if cmd is blank. */
+	int first = 0, last = (int)strlen(cmd), len, n_words = 1, j;
+	char **vec, *copy;
+	while (last > 0 && isspace(cmd[last-1])) --last; /* drop trailing blanks */
+	while (first < last && isspace(cmd[first])) ++first; /* drop leading blanks */
+	if (first == last) return 0;
+	len = last - first;
+	for (j = first + 1; j < last; ++j) /* a blank right after a non-blank ends a word */
+		n_words += (isspace(cmd[j]) && !isspace(cmd[j-1]));
+	vec = (char**)calloc(n_words + 1, sizeof(void*));
+	vec[0] = copy = (char*)calloc(len + 1, 1);
+	memcpy(copy, cmd + first, len);
+	n_words = 1;
+	for (j = 1; j < len; ++j) {
+		if (isspace(copy[j])) copy[j] = 0; /* separators become terminators */
+		else if (copy[j] && copy[j-1] == 0) vec[n_words++] = &copy[j]; /* first byte of a word */
+	}
+	return vec;
+}
+
+#define KO_STDIN    1 /* fn was "-" */
+#define KO_FILE     2 /* fn was opened with open() */
+#define KO_PIPE     3 /* fn started with '<': read the output of a child process */
+#define KO_HTTP     4 /* fn started with "http://" */
+#define KO_FTP      5 /* fn started with "ftp://" */
+
+typedef struct { /* handle created by kopen() and freed by kclose() */
+	int type, fd; /* one of the KO_* values; descriptor to read from */
+	pid_t pid; /* child process id; only assigned for KO_PIPE */
+} koaux_t;
+
+void *kopen(const char *fn, int *_fd)
+{ /* open fn ("-", a path, "<command", or an http:// / ftp:// URL) for reading: *_fd gets the descriptor (-1 on failure), the return value is the handle for kclose() or 0; for URLs a handle is returned even when *_fd is -1 */
+	koaux_t *aux = 0;
+	*_fd = -1;
+	if (strstr(fn, "http://") == fn) {
+		aux = calloc(1, sizeof(koaux_t));
+		aux->type = KO_HTTP;
+		aux->fd = http_open(fn);
+	} else if (strstr(fn, "ftp://") == fn) {
+		aux = calloc(1, sizeof(koaux_t));
+		aux->type = KO_FTP;
+		aux->fd = ftp_open(fn);
+	} else if (strcmp(fn, "-") == 0) {
+		aux = calloc(1, sizeof(koaux_t));
+		aux->type = KO_STDIN;
+		aux->fd = STDIN_FILENO;
+	} else {
+		const char *p, *q;
+		for (p = fn; *p; ++p)
+			if (!isspace(*p)) break;
+		if (*p == '<') { // pipe open
+			int need_shell, pfd[2];
+			pid_t pid;
+			// a simple check to see if we need to invoke a shell; not always working
+			for (q = p + 1; *q; ++q)
+				if (ispunct(*q) && *q != '.' && *q != '_' && *q != '-' && *q != ':')
+					break;
+			need_shell = (*q != 0);
+			if (pipe(pfd) != 0) return 0; // no pipe was created, so there is nothing to close
+			pid = vfork();
+			if (pid == -1) { /* vfork() error */
+				close(pfd[0]); close(pfd[1]);
+				return 0;
+			}
+			if (pid == 0) { /* the child process */
+				char **argv; /* FIXME: I do not know if this will lead to a memory leak */
+				close(pfd[0]);
+				dup2(pfd[1], STDOUT_FILENO);
+				close(pfd[1]);
+				if (!need_shell) {
+					argv = cmd2argv(p + 1);
+					if (argv) execvp(argv[0], argv); /* argv is 0 when nothing but blanks follows '<' */
+					if (argv) { free(argv[0]); free(argv); } /* only reached when execvp() failed */
+				} else execl("/bin/sh", "sh", "-c", p + 1, NULL);
+				_exit(1); /* _exit, not exit: a vfork() child must not run the parent's atexit/stdio cleanup */
+			} else { /* parent process */
+				close(pfd[1]);
+				aux = calloc(1, sizeof(koaux_t));
+				aux->type = KO_PIPE;
+				aux->fd = pfd[0];
+				aux->pid = pid;
+			}
+		} else {
+#ifdef _WIN32
+			*_fd = open(fn, O_RDONLY | O_BINARY);
+#else
+			*_fd = open(fn, O_RDONLY);
+#endif
+			if (*_fd >= 0) { /* open() reports failure with -1; descriptor 0 is valid */
+				aux = calloc(1, sizeof(koaux_t));
+				aux->type = KO_FILE;
+				aux->fd = *_fd;
+			}
+		}
+	}
+	if (aux) *_fd = aux->fd; /* aux is still 0 when open() failed; *_fd is -1 then */
+	return aux;
+}
+
+int kclose(void *a)
+{
+	/* Release a handle obtained from kopen(). The descriptor itself is not
+	 * closed here. Always returns 0. */
+	koaux_t *handle = (koaux_t*)a;
+	int wstatus;
+	/* pipes only: poll the child once without blocking; unless that very call reaped it, send it signal 15 */
+	if (handle->type == KO_PIPE
+		&& waitpid(handle->pid, &wstatus, WNOHANG) != handle->pid) kill(handle->pid, 15);
+	free(handle);
+	return 0;
+}
+
+#ifdef _KO_MAIN
+#define BUF_SIZE 0x10000
+int main(int argc, char *argv[])
+{ /* test driver: copies whatever kopen() opens for argv[1] to stdout */
+	unsigned char chunk[BUF_SIZE];
+	void *handle;
+	FILE *in;
+	int n_read, fd;
+	if (argc == 1) {
+		fprintf(stderr, "Usage: kopen <file>\n");
+		return 1;
+	}
+	handle = kopen(argv[1], &fd);
+	if ((in = fdopen(fd, "r")) == 0) {
+		fprintf(stderr, "ERROR: fail to open the input\n");
+		return 1;
+	}
+	for (;;) { /* a short read ends the copy */
+		n_read = fread(chunk, 1, BUF_SIZE, in);
+		if (n_read != 0) fwrite(chunk, 1, n_read, stdout);
+		if (n_read != BUF_SIZE) break;
+	}
+	fclose(in);
+	kclose(handle);
+	return 0;
+}
+#endif
diff --git a/kseq.h b/kseq.h
index ad8937c..a5cec7c 100644
--- a/kseq.h
+++ b/kseq.h
@@ -1,6 +1,6 @@
 /* The MIT License
 
-   Copyright (c) 2008, by Heng Li <lh3 at sanger.ac.uk>
+   Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor at live.co.uk>
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
@@ -23,6 +23,8 @@
    SOFTWARE.
 */
 
+/* Last Modified: 05MAR2012 */
+
 #ifndef AC_KSEQ_H
 #define AC_KSEQ_H
 
@@ -30,9 +32,14 @@
 #include <string.h>
 #include <stdlib.h>
 
+#define KS_SEP_SPACE 0 // isspace(): ' ', \t, \n, \v, \f, \r
+#define KS_SEP_TAB   1 // isspace() && !' '
+#define KS_SEP_LINE  2 // line separator: "\n" (Unix) or "\r\n" (Windows)
+#define KS_SEP_MAX   2
+
 #define __KS_TYPE(type_t)						\
 	typedef struct __kstream_t {				\
-		char *buf;								\
+		unsigned char *buf;						\
 		int begin, end, is_eof;					\
 		type_t f;								\
 	} kstream_t;
@@ -45,7 +52,7 @@
 	{																\
 		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t));	\
 		ks->f = f;													\
-		ks->buf = (char*)malloc(__bufsize);							\
+		ks->buf = (unsigned char*)malloc(__bufsize);				\
 		return ks;													\
 	}																\
 	static inline void ks_destroy(kstream_t *ks)					\
@@ -82,10 +89,10 @@ typedef struct __kstring_t {
 #endif
 
 #define __KS_GETUNTIL(__read, __bufsize)								\
-	static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+	static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
 	{																	\
 		if (dret) *dret = 0;											\
-		str->l = 0;														\
+		str->l = append? str->l : 0;									\
 		if (ks->begin >= ks->end && ks->is_eof) return -1;				\
 		for (;;) {														\
 			int i;														\
@@ -97,14 +104,20 @@ typedef struct __kstring_t {
 					if (ks->end == 0) break;							\
 				} else break;											\
 			}															\
-			if (delimiter) {											\
+			if (delimiter == KS_SEP_LINE) { \
+				for (i = ks->begin; i < ks->end; ++i) \
+					if (ks->buf[i] == '\n') break; \
+			} else if (delimiter > KS_SEP_MAX) {						\
 				for (i = ks->begin; i < ks->end; ++i)					\
 					if (ks->buf[i] == delimiter) break;					\
-			} else {													\
+			} else if (delimiter == KS_SEP_SPACE) {						\
 				for (i = ks->begin; i < ks->end; ++i)					\
 					if (isspace(ks->buf[i])) break;						\
-			}															\
-			if (str->m - str->l < i - ks->begin + 1) {					\
+			} else if (delimiter == KS_SEP_TAB) {						\
+				for (i = ks->begin; i < ks->end; ++i)					\
+					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
+			} else i = 0; /* never come to here! */						\
+			if (str->m - str->l < (size_t)(i - ks->begin + 1)) {		\
 				str->m = str->l + (i - ks->begin) + 1;					\
 				kroundup32(str->m);										\
 				str->s = (char*)realloc(str->s, str->m);				\
@@ -117,9 +130,15 @@ typedef struct __kstring_t {
 				break;													\
 			}															\
 		}																\
+		if (str->s == 0) {												\
+			str->m = 1;													\
+			str->s = (char*)calloc(1, 1);								\
+		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
 		str->s[str->l] = '\0';											\
 		return str->l;													\
-	}
+	} \
+	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
 
 #define KSTREAM_INIT(type_t, __read, __bufsize) \
 	__KS_TYPE(type_t)							\
@@ -127,19 +146,16 @@ typedef struct __kstring_t {
 	__KS_GETC(__read, __bufsize)				\
 	__KS_GETUNTIL(__read, __bufsize)
 
-#define __KSEQ_BASIC(type_t)											\
-	static inline kseq_t *kseq_init(type_t fd)							\
+#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
+
+#define __KSEQ_BASIC(SCOPE, type_t)										\
+	SCOPE kseq_t *kseq_init(type_t fd)									\
 	{																	\
 		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));					\
 		s->f = ks_init(fd);												\
 		return s;														\
 	}																	\
-	static inline void kseq_rewind(kseq_t *ks)							\
-	{																	\
-		ks->last_char = 0;												\
-		ks->f->is_eof = ks->f->begin = ks->f->end = 0;					\
-	}																	\
-	static inline void kseq_destroy(kseq_t *ks)							\
+	SCOPE void kseq_destroy(kseq_t *ks)									\
 	{																	\
 		if (!ks) return;												\
 		free(ks->name.s); free(ks->comment.s); free(ks->seq.s);	free(ks->qual.s); \
@@ -152,44 +168,46 @@ typedef struct __kstring_t {
    -1   end-of-file
    -2   truncated quality string
  */
-#define __KSEQ_READ														\
-	static int kseq_read(kseq_t *seq)									\
-	{																	\
-		int c;															\
-		kstream_t *ks = seq->f;											\
+#define __KSEQ_READ(SCOPE) \
+	SCOPE int kseq_read(kseq_t *seq) \
+	{ \
+		int c; \
+		kstream_t *ks = seq->f; \
 		if (seq->last_char == 0) { /* then jump to the next header line */ \
-			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@');	\
-			if (c == -1) return -1; /* end of file */					\
-			seq->last_char = c;											\
-		} /* the first header char has been read */						\
-		seq->comment.l = seq->seq.l = seq->qual.l = 0;					\
-		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1;			\
-		if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0);			\
+			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+			if (c == -1) return -1; /* end of file */ \
+			seq->last_char = c; \
+		} /* else: the first header char has been read in the previous call */ \
+		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
+		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
+		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
+		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
+			seq->seq.m = 256; \
+			seq->seq.s = (char*)malloc(seq->seq.m); \
+		} \
 		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
-			if (isgraph(c)) { /* printable non-space character */		\
-				if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
-					seq->seq.m = seq->seq.l + 2;						\
-					kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
-					seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
-				}														\
-				seq->seq.s[seq->seq.l++] = (char)c;						\
-			}															\
-		}																\
+			if (c == '\n') continue; /* skip empty lines */ \
+			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
+			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
+		} \
 		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */	\
-		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */		\
-		if (c != '+') return seq->seq.l; /* FASTA */					\
-		if (seq->qual.m < seq->seq.m) {	/* allocate enough memory */	\
-			seq->qual.m = seq->seq.m;									\
-			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m);		\
-		}																\
+		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
+			seq->seq.m = seq->seq.l + 2; \
+			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
+			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+		} \
+		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
+		if (c != '+') return seq->seq.l; /* FASTA */ \
+		if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
+			seq->qual.m = seq->seq.m; \
+			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+		} \
 		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
-		if (c == -1) return -2; /* we should not stop here */			\
-		while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l)		\
-			if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c;	\
-		seq->qual.s[seq->qual.l] = 0; /* null terminated string */		\
+		if (c == -1) return -2; /* error: no quality string */ \
+		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
 		seq->last_char = 0;	/* we have not come to the next header line */ \
-		if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
-		return seq->seq.l;												\
+		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
+		return seq->seq.l; \
 	}
 
 #define __KSEQ_TYPE(type_t)						\
@@ -199,10 +217,19 @@ typedef struct __kstring_t {
 		kstream_t *f;							\
 	} kseq_t;
 
-#define KSEQ_INIT(type_t, __read)				\
-	KSTREAM_INIT(type_t, __read, 4096)			\
+#define KSEQ_INIT2(SCOPE, type_t, __read)		\
+	KSTREAM_INIT(type_t, __read, 16384)			\
 	__KSEQ_TYPE(type_t)							\
-	__KSEQ_BASIC(type_t)						\
-	__KSEQ_READ
+	__KSEQ_BASIC(SCOPE, type_t)					\
+	__KSEQ_READ(SCOPE)
+
+#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
+
+#define KSEQ_DECLARE(type_t) \
+	__KS_TYPE(type_t) \
+	__KSEQ_TYPE(type_t) \
+	extern kseq_t *kseq_init(type_t fd); \
+	void kseq_destroy(kseq_t *ks); \
+	int kseq_read(kseq_t *seq);
 
 #endif
diff --git a/ksort.h b/ksort.h
index 52812e1..ad66a17 100644
--- a/ksort.h
+++ b/ksort.h
@@ -139,7 +139,7 @@ typedef struct {
 			tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \
 		}																\
 	}																	\
-	inline void __ks_insertsort_##name(type_t *s, type_t *t)			\
+	static inline void __ks_insertsort_##name(type_t *s, type_t *t)		\
 	{																	\
 		type_t *i, *j, swap_tmp;										\
 		for (i = s + 1; i < t; ++i)										\
diff --git a/kstring.h b/kstring.h
index 398901f..81d7d60 100644
--- a/kstring.h
+++ b/kstring.h
@@ -16,19 +16,33 @@ typedef struct __kstring_t {
 } kstring_t;
 #endif
 
-static inline int kputs(const char *p, kstring_t *s)
+static inline void ks_resize(kstring_t *s, size_t size) /* make sure s can hold at least size bytes; never shrinks the buffer */
+{
+	if (s->m < size) {
+		s->m = size;
+		kroundup32(s->m); /* defined outside this hunk; presumably rounds the capacity up to a power of two */
+		s->s = (char*)realloc(s->s, s->m); /* NOTE(review): the result is not checked for NULL */
+	}
+}
+
+static inline int kputsn(const char *p, int l, kstring_t *s)
 {
-	int l = strlen(p);
 	if (s->l + l + 1 >= s->m) {
 		s->m = s->l + l + 2;
 		kroundup32(s->m);
 		s->s = (char*)realloc(s->s, s->m);
 	}
-	strcpy(s->s + s->l, p);
+	memcpy(s->s + s->l, p, l);
 	s->l += l;
+	s->s[s->l] = 0;
 	return l;
 }
 
+static inline int kputs(const char *p, kstring_t *s)
+{
+	return kputsn(p, strlen(p), s);
+}
+
 static inline int kputc(int c, kstring_t *s)
 {
 	if (s->l + 1 >= s->m) {
@@ -41,6 +55,40 @@ static inline int kputc(int c, kstring_t *s)
 	return c;
 }
 
+static inline int kputw(int c, kstring_t *s) /* append the decimal text of c to s; returns 0 (or kputc()'s result when c == 0) */
+{
+	char buf[16]; /* digits least-significant first, followed by the sign */
+	int l; unsigned x; /* x is unsigned so that taking the magnitude of INT_MIN is well defined */
+	if (c == 0) return kputc('0', s);
+	for (l = 0, x = c < 0? 0u - (unsigned)c : (unsigned)c; x > 0; x /= 10) buf[l++] = x%10 + '0';
+	if (c < 0) buf[l++] = '-';
+	if (s->l + l + 1 >= s->m) { /* grow so that l more chars plus the NUL fit */
+		s->m = s->l + l + 2;
+		kroundup32(s->m);
+		s->s = (char*)realloc(s->s, s->m); /* NOTE(review): the result is not checked for NULL */
+	}
+	for (; l > 0; --l) s->s[s->l++] = buf[l - 1]; /* copy back: sign first, then most significant digit */
+	s->s[s->l] = 0;
+	return 0;
+}
+
+static inline int kputuw(unsigned c, kstring_t *s) /* append the decimal text of c to s; returns 0 (or kputc()'s result when c == 0) */
+{
+	char buf[16]; /* digits, least significant first */
+	int l, i;
+	unsigned x;
+	if (c == 0) return kputc('0', s);
+	for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0';
+	if (s->l + l + 1 >= s->m) { /* grow so that l more chars plus the NUL fit */
+		s->m = s->l + l + 2;
+		kroundup32(s->m);
+		s->s = (char*)realloc(s->s, s->m); /* NOTE(review): the result is not checked for NULL */
+	}
+	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; /* copy back in most-significant-first order */
+	s->s[s->l] = 0;
+	return 0;
+}
+
 int ksprintf(kstring_t *s, const char *fmt, ...);
 
 #endif
diff --git a/ksw.c b/ksw.c
index bd29e96..b97fed5 100644
--- a/ksw.c
+++ b/ksw.c
@@ -23,7 +23,6 @@
    SOFTWARE.
 */
 
-#ifndef _NO_SSE2
 #include <stdlib.h>
 #include <stdint.h>
 #include <emmintrin.h>
@@ -37,22 +36,35 @@
 #define UNLIKELY(x) (x)
 #endif
 
-struct _ksw_query_t {
+const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 };
+
+struct _kswq_t {
 	int qlen, slen;
 	uint8_t shift, mdiff, max, size;
 	__m128i *qp, *H0, *H1, *E, *Hmax;
 };
 
-ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
+/**
+ * Initialize the query data structure
+ *
+ * @param size   Number of bytes used to store a score; valid values are 1 or 2
+ * @param qlen   Length of the query sequence
+ * @param query  Query sequence
+ * @param m      Size of the alphabet
+ * @param mat    Scoring matrix in a one-dimension array
+ *
+ * @return       Query data structure
+ */
+kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat)
 {
-	ksw_query_t *q;
+	kswq_t *q;
 	int slen, a, tmp, p;
 
 	size = size > 1? 2 : 1;
 	p = 8 * (3 - size); // # values per __m128i
 	slen = (qlen + p - 1) / p; // segmented length
-	q = malloc(sizeof(ksw_query_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
-	q->qp = (__m128i*)(((size_t)q + sizeof(ksw_query_t) + 15) >> 4 << 4); // align memory
+	q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
+	q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
 	q->H0 = q->qp + slen * m;
 	q->H1 = q->H0 + slen;
 	q->E  = q->H1 + slen;
@@ -91,11 +103,12 @@ ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const in
 	return q;
 }
 
-int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e)
+kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e)
 {
-	int slen, i, m_b, n_b, te = -1, gmax = 0;
+	int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
 	uint64_t *b;
 	__m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax;
+	kswr_t r;
 
 #define __max_16(ret, xx) do { \
 		(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
@@ -106,10 +119,13 @@ int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) /
 	} while (0)
 
 	// initialization
+	r = g_defr;
+	minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
+	endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
 	m_b = n_b = 0; b = 0;
 	zero = _mm_set1_epi32(0);
-	gapoe = _mm_set1_epi8(a->gapo + a->gape);
-	gape = _mm_set1_epi8(a->gape);
+	gapoe = _mm_set1_epi8(_gapo + _gape);
+	gape = _mm_set1_epi8(_gape);
 	shift = _mm_set1_epi8(q->shift);
 	H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
 	slen = q->slen;
@@ -165,11 +181,11 @@ int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) /
 end_loop16:
 		//int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n");
 		__max_16(imax, max); // imax is the maximum number in max
-		if (imax >= a->T) { // write the b array; this condition adds branching unfornately
+		if (imax >= minsc) { // write the b array; this condition adds branching unfortunately
 			if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append
 				if (n_b == m_b) {
 					m_b = m_b? m_b<<1 : 8;
-					b = realloc(b, 8 * m_b);
+					b = (uint64_t*)realloc(b, 8 * m_b);
 				}
 				b[n_b++] = (uint64_t)imax<<32 | i;
 			} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
@@ -178,34 +194,38 @@ end_loop16:
 			gmax = imax; te = i; // te is the end position on the target
 			for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
 				_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
-			if (gmax + q->shift >= 255) break;
+			if (gmax + q->shift >= 255 || gmax >= endsc) break;
 		}
 		S = H1; H1 = H0; H0 = S; // swap H0 and H1
 	}
-	a->score = gmax; a->te = te;
-	{ // get a->qe, the end of query match; find the 2nd best score
+	r.score = gmax + q->shift < 255? gmax : 255;
+	r.te = te;
+	if (r.score != 255) { // get r.qe, the end of query match; find the 2nd best score
 		int max = -1, low, high, qlen = slen * 16;
 		uint8_t *t = (uint8_t*)Hmax;
-		for (i = 0, a->qe = -1; i < qlen; ++i, ++t)
-			if ((int)*t > max) max = *t, a->qe = i / 16 + i % 16 * slen;
+		for (i = 0; i < qlen; ++i, ++t)
+			if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen;
 		//printf("%d,%d\n", max, gmax);
-		i = (a->score + q->max - 1) / q->max;
-		low = te - i; high = te + i;
-		for (i = 0, a->score2 = 0; i < n_b; ++i) {
-			int e = (int32_t)b[i];
-			if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2)
-				a->score2 = b[i]>>32, a->te2 = e;
+		if (b) {
+			i = (r.score + q->max - 1) / q->max;
+			low = te - i; high = te + i;
+			for (i = 0; i < n_b; ++i) {
+				int e = (int32_t)b[i];
+				if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
+					r.score2 = b[i]>>32, r.te2 = e;
+			}
 		}
 	}
 	free(b);
-	return a->score + q->shift >= 255? 255 : a->score;
+	return r;
 }
 
-int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e)
+kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e)
 {
-	int slen, i, m_b, n_b, te = -1, gmax = 0;
+	int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
 	uint64_t *b;
 	__m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax;
+	kswr_t r;
 
 #define __max_8(ret, xx) do { \
 		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
@@ -215,10 +235,13 @@ int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) //
 	} while (0)
 
 	// initialization
+	r = g_defr;
+	minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
+	endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
 	m_b = n_b = 0; b = 0;
 	zero = _mm_set1_epi32(0);
-	gapoe = _mm_set1_epi16(a->gapo + a->gape);
-	gape = _mm_set1_epi16(a->gape);
+	gapoe = _mm_set1_epi16(_gapo + _gape);
+	gape = _mm_set1_epi16(_gape);
 	H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
 	slen = q->slen;
 	for (i = 0; i < slen; ++i) {
@@ -260,11 +283,11 @@ int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) //
 		}
 end_loop8:
 		__max_8(imax, max);
-		if (imax >= a->T) {
+		if (imax >= minsc) {
 			if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) {
 				if (n_b == m_b) {
 					m_b = m_b? m_b<<1 : 8;
-					b = realloc(b, 8 * m_b);
+					b = (uint64_t*)realloc(b, 8 * m_b);
 				}
 				b[n_b++] = (uint64_t)imax<<32 | i;
 			} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
@@ -273,31 +296,244 @@ end_loop8:
 			gmax = imax; te = i;
 			for (j = 0; LIKELY(j < slen); ++j)
 				_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
+			if (gmax >= endsc) break;
 		}
 		S = H1; H1 = H0; H0 = S;
 	}
-	a->score = gmax; a->te = te;
+	r.score = gmax; r.te = te;
 	{
 		int max = -1, low, high, qlen = slen * 8;
 		uint16_t *t = (uint16_t*)Hmax;
-		for (i = 0, a->qe = -1; i < qlen; ++i, ++t)
-			if ((int)*t > max) max = *t, a->qe = i / 8 + i % 8 * slen;
-		i = (a->score + q->max - 1) / q->max;
-		low = te - i; high = te + i;
-		for (i = 0, a->score2 = 0; i < n_b; ++i) {
-			int e = (int32_t)b[i];
-			if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2)
-				a->score2 = b[i]>>32, a->te2 = e;
+		for (i = 0, r.qe = -1; i < qlen; ++i, ++t)
+			if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen;
+		if (b) {
+			i = (r.score + q->max - 1) / q->max;
+			low = te - i; high = te + i;
+			for (i = 0; i < n_b; ++i) {
+				int e = (int32_t)b[i];
+				if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
+					r.score2 = b[i]>>32, r.te2 = e;
+			}
 		}
 	}
 	free(b);
-	return a->score;
+	return r;
 }
 
-int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a)
+static void revseq(int l, uint8_t *s)
 {
-	if (q->size == 1) return ksw_sse2_16(q, tlen, target, a);
-	else return ksw_sse2_8(q, tlen, target, a);
+	int i, t;
+	for (i = 0; i < l>>1; ++i)
+		t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t;
+}
+
+kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry) // align query to target; with KSW_XSTART also fill r.qb/r.tb via a second pass on the reversed prefixes
+{
+	int size;
+	kswq_t *q;
+	kswr_t r, rr;
+	kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int);
+
+	q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat); // reuse the caller's cached profile if there is one
+	if (qry && *qry == 0) *qry = q; // hand the new profile to the caller, who then owns it
+	func = q->size == 2? ksw_i16 : ksw_u8;
+	size = q->size; // saved because q may be freed before the second pass
+	r = func(q, tlen, target, gapo, gape, xtra);
+	if (qry == 0) free(q); // no cache slot: the profile was private to this call
+	if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r; // start positions not requested, or score below the threshold
+	revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end
+	q = ksw_qinit(size, r.qe + 1, query, m, mat);
+	rr = func(q, tlen, target, gapo, gape, KSW_XSTOP | r.score); // NOTE(review): tlen is passed although only the first r.te+1 target bases were reversed
+	revseq(r.qe + 1, query); revseq(r.te + 1, target); // restore the caller's buffers, which were reversed in place
+	free(q);
+	if (r.score == rr.score) // the reverse pass reproduced the score, so its end points give the start points
+		r.tb = r.te - rr.te, r.qb = r.qe - rr.qe;
+	return r;
+}
+
+/********************
+ *** SW extension ***
+ ********************/
+
+typedef struct {
+	int32_t h, e;
+} eh_t;
+
+int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore) // banded extension starting from score h0; returns the best score found (never below h0 clamped at 0)
+{
+	eh_t *eh; // score array
+	int8_t *qp; // query profile
+	int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore;
+	if (h0 < 0) h0 = 0;
+	// allocate memory
+	qp = malloc(qlen * m); // NOTE(review): neither allocation is checked for NULL
+	eh = calloc(qlen + 1, 8); // 8 bytes per element: eh_t holds two int32_t
+	// generate the query profile
+	for (k = i = 0; k < m; ++k) {
+		const int8_t *p = &mat[k * m];
+		for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
+	}
+	// fill the first row
+	eh[0].h = h0; eh[1].h = h0 > gapoe? h0 - gapoe : 0; // eh[1] is written unconditionally, so qlen must be >= 1
+	for (j = 2; j <= qlen && eh[j-1].h > gape; ++j)
+		eh[j].h = eh[j-1].h - gape;
+	// adjust $w if it is too large
+	k = m * m;
+	for (i = 0, max = 0; i < k; ++i) // get the max score
+		max = max > mat[i]? max : mat[i];
+	max_gap = (int)((double)(qlen * max - gapo) / gape + 1.); // NOTE(review): gape is used as a divisor, so gape == 0 is not supported
+	max_gap = max_gap > 1? max_gap : 1;
+	w = w < max_gap? w : max_gap;
+	// DP loop
+	max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1;
+	beg = 0, end = qlen;
+	for (i = 0; LIKELY(i < tlen); ++i) {
+		int f = 0, h1, m = 0, mj = -1; // this m (row maximum) shadows the alphabet-size parameter m
+		int8_t *q = &qp[target[i] * qlen];
+		// compute the first column
+		h1 = h0 - (gapo + gape * (i + 1));
+		if (h1 < 0) h1 = 0;
+		// apply the band and the constraint (if provided)
+		if (beg < i - w) beg = i - w;
+		if (end > i + w + 1) end = i + w + 1;
+		if (end > qlen) end = qlen;
+		for (j = beg; LIKELY(j < end); ++j) {
+			// At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1)
+			// Similar to SSE2-SW, cells are computed in the following order:
+			//   H(i,j)   = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
+			//   E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape
+			//   F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape
+			eh_t *p = &eh[j];
+			int h = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j)
+			p->h = h1;          // set H(i,j-1) for the next row
+			h += q[j];
+			h = h > e? h : e;
+			h = h > f? h : f;
+			h1 = h;             // save H(i,j) to h1 for the next column
+			mj = m > h? mj : j;
+			m = m > h? m : h;   // m is stored at eh[mj+1]
+			h -= gapoe;
+			h = h > 0? h : 0;
+			e -= gape;
+			e = e > h? e : h;   // computed E(i+1,j)
+			p->e = e;           // save E(i+1,j) for the next row
+			f -= gape;
+			f = f > h? f : h;   // computed F(i,j+1)
+		}
+		eh[end].h = h1; eh[end].e = 0;
+		if (j == qlen) { // this row reached the end of the query: track the best such score and its target row
+			max_ie = gscore > h1? max_ie : i;
+			gscore = gscore > h1? gscore : h1;
+		}
+		if (m == 0) break; // no positive cell in this row, so stop extending
+		if (m > max) max = m, max_i = i, max_j = mj;
+		// update beg and end for the next round
+		for (j = mj; j >= beg && eh[j].h; --j);
+		beg = j + 1;
+		for (j = mj + 2; j <= end && eh[j].h; ++j);
+		end = j;
+		//beg = 0; end = qlen; // uncomment this line for debugging
+	}
+	free(eh); free(qp);
+	if (_qle) *_qle = max_j + 1; // query length of the best extension; 0 if no row beat h0
+	if (_tle) *_tle = max_i + 1; // target length of the best extension; 0 if no row beat h0
+	if (_gtle) *_gtle = max_ie + 1; // target length for the best score that reaches the query end; 0 if none
+	if (_gscore) *_gscore = gscore; // best score that reaches the query end; -1 if none
+	return max;
+}
+
+/********************
+ * Global alignment *
+ ********************/
+
+#define MINUS_INF -0x40000000
+
+static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int op, int len) /* append len x op to a CIGAR array; returns the (possibly reallocated) array */
+{
+	if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) { /* different op from the last element: start a new one */
+		if (*n_cigar == *m_cigar) { /* array full: double the capacity, starting at 4 */
+			*m_cigar = *m_cigar? (*m_cigar)<<1 : 4;
+			cigar = realloc(cigar, (*m_cigar) << 2); /* 4 bytes per element; NOTE(review): the result is not checked for NULL */
+		}
+		cigar[(*n_cigar)++] = len<<4 | op; /* element layout: op in the low 4 bits, length above them */
+	} else cigar[(*n_cigar)-1] += len<<4; /* same op as the last element: just extend its length */
+	return cigar;
+}
+
+int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_) // banded global alignment; returns the score and, if n_cigar_ and cigar_ are both non-NULL, a CIGAR array allocated here and handed to the caller
+{
+	eh_t *eh;
+	int8_t *qp; // query profile
+	int i, j, k, gapoe = gapo + gape, score, n_col;
+	uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex
+	if (n_cigar_) *n_cigar_ = 0;
+	// allocate memory
+	n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix
+	z = malloc((size_t)n_col * tlen); // size_t arithmetic: n_col * tlen can exceed INT_MAX for long sequences
+	qp = malloc(qlen * m);
+	eh = calloc(qlen + 1, 8);
+	// generate the query profile
+	for (k = i = 0; k < m; ++k) {
+		const int8_t *p = &mat[k * m];
+		for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
+	}
+	// fill the first row
+	eh[0].h = 0; eh[0].e = MINUS_INF;
+	for (j = 1; j <= qlen && j <= w; ++j)
+		eh[j].h = -(gapo + gape * j), eh[j].e = MINUS_INF;
+	for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band
+	// DP loop
+	for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop
+		int32_t f = MINUS_INF, h1, beg, end;
+		int8_t *q = &qp[target[i] * qlen];
+		uint8_t *zi = &z[(size_t)i * n_col]; // row offset computed in size_t for the same reason as the allocation
+		beg = i > w? i - w : 0;
+		end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence
+		h1 = beg == 0? -(gapo + gape * (i + 1)) : MINUS_INF;
+		for (j = beg; LIKELY(j < end); ++j) {
+			// This loop is organized in a similar way to ksw_extend() and ksw_sse2(), except:
+			// 1) not checking h>0; 2) recording direction for backtracking
+			eh_t *p = &eh[j];
+			int32_t h = p->h, e = p->e;
+			uint8_t d; // direction
+			p->h = h1;
+			h += q[j];
+			d = h >= e? 0 : 1;
+			h = h >= e? h : e;
+			d = h >= f? d : 2;
+			h = h >= f? h : f;
+			h1 = h;
+			h -= gapoe;
+			e -= gape;
+			d |= e > h? 1<<2 : 0;
+			e = e > h? e : h;
+			p->e = e;
+			f -= gape;
+			d |= f > h? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two
+			f = f > h? f : h;
+			zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell
+		}
+		eh[end].h = h1; eh[end].e = MINUS_INF;
+	}
+	score = eh[qlen].h;
+	if (n_cigar_ && cigar_) { // backtrack
+		int n_cigar = 0, m_cigar = 0, which = 0;
+		uint32_t *cigar = 0, tmp;
+		i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell
+		while (i >= 0 && k >= 0) {
+			which = z[(size_t)i * n_col + (k - (i > w? i - w : 0))] >> (which<<1) & 3;
+			if (which == 0)      cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k; // op 0 consumes one target and one query base
+			else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i; // op 2 consumes a target base only
+			else                 cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k; // op 1 consumes a query base only
+		}
+		if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1);
+		if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1);
+		for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
+			tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
+		*n_cigar_ = n_cigar, *cigar_ = cigar;
+	}
+	free(eh); free(qp); free(z);
+	return score;
 }
 
 /*******************************************
@@ -333,30 +569,33 @@ unsigned char seq_nt4_table[256] = {
 
 int main(int argc, char *argv[])
 {
-	int c, sa = 1, sb = 3, i, j, k, forward_only = 0, size = 2;
+	int c, sa = 1, sb = 3, i, j, k, forward_only = 0, max_rseq = 0;
 	int8_t mat[25];
-	ksw_aux_t a;
+	int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART;
+	uint8_t *rseq = 0;
 	gzFile fpt, fpq;
 	kseq_t *kst, *ksq;
+
 	// parse command line
-	a.gapo = 5; a.gape = 2; a.T = 10;
-	while ((c = getopt(argc, argv, "a:b:q:r:ft:s:")) >= 0) {
+	while ((c = getopt(argc, argv, "a:b:q:r:ft:1")) >= 0) {
 		switch (c) {
 			case 'a': sa = atoi(optarg); break;
 			case 'b': sb = atoi(optarg); break;
-			case 'q': a.gapo = atoi(optarg); break;
-			case 'r': a.gape = atoi(optarg); break;
-			case 't': a.T = atoi(optarg); break;
+			case 'q': gapo = atoi(optarg); break;
+			case 'r': gape = atoi(optarg); break;
+			case 't': minsc = atoi(optarg); break;
 			case 'f': forward_only = 1; break;
-			case 's': size = atoi(optarg); break;
+			case '1': xtra |= KSW_XBYTE; break;
 		}
 	}
 	if (optind + 2 > argc) {
-		fprintf(stderr, "Usage: ksw [-s%d] [-a%d] [-b%d] [-q%d] [-r%d] <target.fa> <query.fa>\n", size, sa, sb, a.gapo, a.gape);
+		fprintf(stderr, "Usage: ksw [-1] [-f] [-a%d] [-b%d] [-q%d] [-r%d] [-t%d] <target.fa> <query.fa>\n", sa, sb, gapo, gape, minsc);
 		return 1;
 	}
+	if (minsc > 0xffff) minsc = 0xffff;
+	xtra |= KSW_XSUBO | minsc;
 	// initialize scoring matrix
-	for (i = k = 0; i < 5; ++i) {
+	for (i = k = 0; i < 4; ++i) {
 		for (j = 0; j < 4; ++j)
 			mat[k++] = i == j? sa : -sb;
 		mat[k++] = 0; // ambiguous base
@@ -367,35 +606,34 @@ int main(int argc, char *argv[])
 	fpq = gzopen(argv[optind+1], "r"); ksq = kseq_init(fpq);
 	// all-pair alignment
 	while (kseq_read(ksq) > 0) {
-		ksw_query_t *q[2];
-		for (i = 0; i < ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]];
-		q[0] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat);
+		kswq_t *q[2] = {0, 0};
+		kswr_t r;
+		for (i = 0; i < (int)ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]];
 		if (!forward_only) { // reverse
-			for (i = 0; i < ksq->seq.l/2; ++i) {
-				int t = ksq->seq.s[i];
-				ksq->seq.s[i] = ksq->seq.s[ksq->seq.l-1-i];
-				ksq->seq.s[ksq->seq.l-1-i] = t;
+			if ((int)ksq->seq.m > max_rseq) {
+				max_rseq = ksq->seq.m;
+				rseq = (uint8_t*)realloc(rseq, max_rseq);
 			}
-			for (i = 0; i < ksq->seq.l; ++i)
-				ksq->seq.s[i] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i];
-			q[1] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat);
-		} else q[1] = 0;
+			for (i = 0, j = ksq->seq.l - 1; i < (int)ksq->seq.l; ++i, --j)
+				rseq[j] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i];
+		}
 		gzrewind(fpt); kseq_rewind(kst);
 		while (kseq_read(kst) > 0) {
-			int s;
-			for (i = 0; i < kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]];
-			s = ksw_sse2(q[0], kst->seq.l, (uint8_t*)kst->seq.s, &a);
-			printf("%s\t%s\t+\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1);
-			if (q[1]) {
-				s = ksw_sse2(q[1], kst->seq.l, (uint8_t*)kst->seq.s, &a);
-				printf("%s\t%s\t-\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1);
+			for (i = 0; i < (int)kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]];
+			r = ksw_align(ksq->seq.l, (uint8_t*)ksq->seq.s, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[0]);
+			if (r.score >= minsc)
+				printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, r.qb, r.qe+1, r.score, r.score2, r.te2);
+			if (rseq) {
+				r = ksw_align(ksq->seq.l, rseq, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[1]);
+				if (r.score >= minsc)
+					printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, (int)ksq->seq.l - r.qb, (int)ksq->seq.l - 1 - r.qe, r.score, r.score2, r.te2);
 			}
 		}
 		free(q[0]); free(q[1]);
 	}
+	free(rseq);
 	kseq_destroy(kst); gzclose(fpt);
 	kseq_destroy(ksq); gzclose(fpq);
 	return 0;
 }
-#endif // _KSW_MAIN
-#endif // _NO_SSE2
+#endif
diff --git a/ksw.h b/ksw.h
index d93d6a9..d2975de 100644
--- a/ksw.h
+++ b/ksw.h
@@ -1,51 +1,108 @@
 #ifndef __AC_KSW_H
 #define __AC_KSW_H
 
-struct _ksw_query_t;
-typedef struct _ksw_query_t ksw_query_t;
+#include <stdint.h>
+
+#define KSW_XBYTE  0x10000
+#define KSW_XSTOP  0x20000
+#define KSW_XSUBO  0x40000
+#define KSW_XSTART 0x80000
+
+struct _kswq_t;
+typedef struct _kswq_t kswq_t;
 
 typedef struct {
-	// input
-	unsigned gapo, gape; // the first gap costs gapo+gape
-	unsigned T; // threshold
-	// output
-	int score, te, qe, score2, te2;
-} ksw_aux_t;
+	int score; // best score
+	int te, qe; // target end and query end
+	int score2, te2; // second best score and ending position on the target
+	int tb, qb; // target start and query start
+} kswr_t;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 	/**
-	 * Initialize the query data structure
+	 * Aligning two sequences
+	 *
+	 * @param qlen    length of the query sequence (typically <tlen)
+	 * @param query   query sequence with 0 <= query[i] < m
+	 * @param tlen    length of the target sequence
+	 * @param target  target sequence
+	 * @param m       number of residue types
+	 * @param mat     m*m scoring matrix in one-dimension array
+	 * @param gapo    gap open penalty; a gap of length l costs "-(gapo+l*gape)"
+	 * @param gape    gap extension penalty
+	 * @param xtra    extra information (see below)
+	 * @param qry     query profile (see below)
+	 *
+	 * @return        alignment information in a struct; unset values are set to -1
+	 *
+	 * When xtra==0, ksw_align() uses a signed two-byte integer to store a
+	 * score and only finds the best score and the end positions. The 2nd best
+	 * score and the start positions are not computed. The default behavior can
+	 * be tuned by setting KSW_X* flags:
+	 *
+	 *   KSW_XBYTE:  use an unsigned byte to store a score. If overflow occurs,
+	 *               kswr_t::score will be set to 255
+	 *
+	 *   KSW_XSUBO:  track the 2nd best score and the ending position on the
+	 *               target if the 2nd best is higher than (xtra&0xffff)
+	 *
+	 *   KSW_XSTOP:  stop if the maximum score is above (xtra&0xffff)
 	 *
-	 * @param size   Number of bytes used to store a score; valid valures are 1 or 2
-	 * @param qlen   Length of the query sequence
-	 * @param query  Query sequence
-	 * @param m      Size of the alphabet
-	 * @param mat    Scoring matrix in a one-dimension array
+	 *   KSW_XSTART: find the start positions
 	 *
-	 * @return       Query data structure
+	 * When *qry==NULL, ksw_align() will compute and allocate the query profile
+	 * and when the function returns, *qry will point to the profile, which can
+	 * be deallocated simply by free(). If one query is aligned against multiple
+	 * target sequences, *qry should be set to NULL during the first call and
+	 * freed after the last call. Note that qry can equal 0. In this case, the
+	 * query profile will be deallocated in ksw_align().
 	 */
-	ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat); // to free, simply call free()
+	kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry);
 
 	/**
-	 * Compute the maximum local score for queries initialized with ksw_qinit(1, ...)
+	 * Banded global alignment
 	 *
-	 * @param q       Query data structure returned by ksw_qinit(1, ...)
-	 * @param tlen    Length of the target sequence
-	 * @param target  Target sequence
-	 * @param a       Auxiliary data structure (see ksw.h)
+	 * @param qlen    query length
+	 * @param query   query sequence with 0 <= query[i] < m
+	 * @param tlen    target length
+	 * @param target  target sequence with 0 <= target[i] < m
+	 * @param m       number of residue types
+	 * @param mat     m*m scoring matrix in one-dimension array
+	 * @param gapo    gap open penalty; a gap of length l costs "-(gapo+l*gape)"
+	 * @param gape    gap extension penalty
+	 * @param w       band width
+	 * @param n_cigar (out) number of CIGAR elements
+	 * @param cigar   (out) BAM-encoded CIGAR; caller needs to deallocate with free()
 	 *
-	 * @return        The maximum local score; if the returned value equals 255, the SW may not be finished
+	 * @return        score of the alignment
 	 */
-	int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a);
+	int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar, uint32_t **cigar);
 
-	/** Compute the maximum local score for queries initialized with ksw_qinit(2, ...) */
-	int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a);
-
-	/** Unified interface for ksw_sse2_8() and ksw_sse2_16() */
-	int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a);
+	/**
+	 * Extend alignment
+	 *
+	 * The routine aligns $query and $target, assuming their upstream sequences,
+	 * which are not provided, have been aligned with score $h0. In return,
+	 * region [0,*qle) on the query and [0,*tle) on the target sequences are
+	 * aligned together. If *gscore>=0, *gscore keeps the best score such that
+	 * the entire query sequence is aligned; *gtle keeps the position on the
+	 * target where *gscore is achieved. Returning *gscore and *gtle helps the
+	 * caller to decide whether an end-to-end hit or a partial hit is preferred.
+	 *
+	 * The first 9 parameters are identical to those in ksw_global()
+	 *
+	 * @param h0      alignment score of upstream sequences
+	 * @param qle     (out) length of the query in the alignment
+	 * @param tle     (out) length of the target in the alignment
+	 * @param gtle    (out) length of the target if query is fully aligned
+	 * @param gscore  (out) score of the best end-to-end alignment; negative if not found
+	 *
+	 * @return        best semi-local alignment score
+	 */
+	int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *qle, int *tle, int *gtle, int *gscore);
 
 #ifdef __cplusplus
 }
diff --git a/kvec.h b/kvec.h
index 57204d6..9c9ca6e 100644
--- a/kvec.h
+++ b/kvec.h
@@ -1,6 +1,6 @@
 /* The MIT License
 
-   Copyright (c) 2008, by Attractive Chaos <attractivechaos at aol.co.uk>
+   Copyright (c) 2008, by Attractive Chaos <attractor at live.co.uk>
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
@@ -76,15 +76,15 @@ int main() {
 		(v).a[(v).n++] = (x);										\
 	} while (0)
 
-#define kv_pushp(type, v) (((v).n == (v).m)?							\
+#define kv_pushp(type, v) ((((v).n == (v).m)?							\
 						   ((v).m = ((v).m? (v).m<<1 : 2),				\
 							(v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0)	\
-						   : 0), ((v).a + ((v).n++))
+						   : 0), &(v).a[(v).n++])
 
-#define kv_a(type, v, i) ((v).m <= (size_t)(i)?						\
+#define kv_a(type, v, i) (((v).m <= (size_t)(i)? \
 						  ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
 						   (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
-						  : (v).n <= (size_t)(i)? (v).n = (i)			\
-						  : 0), (v).a[(i)]
+						  : (v).n <= (size_t)(i)? (v).n = (i) + 1 \
+						  : 0), (v).a[(i)])
 
 #endif
diff --git a/main.c b/main.c
index 0e7af77..ba60cf7 100644
--- a/main.c
+++ b/main.c
@@ -4,7 +4,7 @@
 #include "utils.h"
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.6.2-r126"
+#define PACKAGE_VERSION "0.7.0-r313"
 #endif
 
 static int usage()
@@ -20,14 +20,13 @@ static int usage()
 	fprintf(stderr, "         sampe         generate alignment (paired ended)\n");
 	fprintf(stderr, "         bwasw         BWA-SW for long queries\n");
 	fprintf(stderr, "         fastmap       identify super-maximal exact matches\n");
+	fprintf(stderr, "         mem           BWA-MEM algorithm\n");
 	fprintf(stderr, "\n");
 	fprintf(stderr, "         fa2pac        convert FASTA to PAC format\n");
 	fprintf(stderr, "         pac2bwt       generate BWT from PAC\n");
 	fprintf(stderr, "         pac2bwtgen    alternative algorithm for generating BWT\n");
 	fprintf(stderr, "         bwtupdate     update .bwt to the new format\n");
 	fprintf(stderr, "         bwt2sa        generate SA from BWT and Occ\n");
-	fprintf(stderr, "         pac2cspac     convert PAC to color-space PAC\n");
-	fprintf(stderr, "         stdsw         standard SW/NW alignment\n");
 	fprintf(stderr, "\n");
 	return 1;
 }
@@ -50,15 +49,13 @@ int main(int argc, char *argv[])
 	else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1);
 	else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1);
 	else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1);
-	else if (strcmp(argv[1], "sw") == 0) ret = bwa_stdsw(argc-1, argv+1);
 	else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1);
 	else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1);
-	else if (strcmp(argv[1], "pac2cspac") == 0) ret = bwa_pac2cspac(argc-1, argv+1);
-	else if (strcmp(argv[1], "stdsw") == 0) ret = bwa_stdsw(argc-1, argv+1);
 	else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
 	else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
 	else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1);
 	else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1);
+	else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1);
 	else {
 		fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
 		return 1;
diff --git a/main.h b/main.h
index 026a80b..3e70362 100644
--- a/main.h
+++ b/main.h
@@ -6,7 +6,6 @@ extern "C" {
 #endif
 
 	int bwa_fa2pac(int argc, char *argv[]);
-	int bwa_pac2cspac(int argc, char *argv[]);
 	int bwa_pac2bwt(int argc, char *argv[]);
 	int bwa_bwtupdate(int argc, char *argv[]);
 	int bwa_bwt2sa(int argc, char *argv[]);
@@ -17,11 +16,10 @@ extern "C" {
 	int bwa_sai2sam_se(int argc, char *argv[]);
 	int bwa_sai2sam_pe(int argc, char *argv[]);
 
-	int bwa_stdsw(int argc, char *argv[]);
-
 	int bwa_bwtsw2(int argc, char *argv[]);
 
 	int main_fastmap(int argc, char *argv[]);
+	int main_mem(int argc, char *argv[]);
 
 #ifdef __cplusplus
 }
diff --git a/simple_dp.c b/simple_dp.c
deleted file mode 100644
index 7c078c2..0000000
--- a/simple_dp.c
+++ /dev/null
@@ -1,162 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <string.h>
-#include <zlib.h>
-#include <stdint.h>
-#include "stdaln.h"
-#include "utils.h"
-
-#include "kseq.h"
-KSEQ_INIT(gzFile, gzread)
-
-typedef struct {
-	int l;
-	unsigned char *s;
-	char *n;
-} seq1_t;
-
-typedef struct {
-	int n_seqs, m_seqs;
-	seq1_t *seqs;
-} seqs_t;
-
-unsigned char aln_rev_table[256] = {
-	'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
-	'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
-	'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
-	'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
-	'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N',
-	'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N',
-	'N','t','v','g', 'h','N','N','c', 'd','N','N','m', 'N','k','N','N',
-	'N','N','y','s', 'a','N','b','w', 'x','r','N','N', 'N','N','N','N',
-	'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
-	'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
-	'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
-	'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
-	'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
-	'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
-	'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N',
-	'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N'
-};
-
-static int g_is_global = 0, g_thres = 1, g_strand = 0, g_aa = 0;
-static AlnParam g_aln_param;
-
-static void revseq(int len, uint8_t *seq)
-{
-	int i;
-	for (i = 0; i < len>>1; ++i) {
-		uint8_t tmp = aln_rev_table[seq[len-1-i]];
-		seq[len-1-i] = aln_rev_table[seq[i]];
-		seq[i] = tmp;
-	}
-	if (len&1) seq[i] = aln_rev_table[seq[i]];
-}
-
-static seqs_t *load_seqs(const char *fn)
-{
-	seqs_t *s;
-	seq1_t *p;
-	gzFile fp;
-	int l;
-	kseq_t *seq;
-
-	fp = xzopen(fn, "r");
-	seq = kseq_init(fp);
-	s = (seqs_t*)calloc(1, sizeof(seqs_t));
-	s->m_seqs = 256;
-	s->seqs = (seq1_t*)calloc(s->m_seqs, sizeof(seq1_t));
-	while ((l = kseq_read(seq)) >= 0) {
-		if (s->n_seqs == s->m_seqs) {
-			s->m_seqs <<= 1;
-			s->seqs = (seq1_t*)realloc(s->seqs, s->m_seqs * sizeof(seq1_t));
-		}
-		p = s->seqs + (s->n_seqs++);
-		p->l = seq->seq.l;
-		p->s = (unsigned char*)malloc(p->l + 1);
-		memcpy(p->s, seq->seq.s, p->l);
-		p->s[p->l] = 0;
-		p->n = strdup((const char*)seq->name.s);
-	}
-	kseq_destroy(seq);
-	gzclose(fp);
-	fprintf(stderr, "[load_seqs] %d sequences are loaded.\n", s->n_seqs);
-	return s;
-}
-
-static void aln_1seq(const seqs_t *ss, const char *name, int l, const char *s, char strand)
-{
-	int i;
-	for (i = 0; i < ss->n_seqs; ++i) {
-		AlnAln *aa;
-		seq1_t *p = ss->seqs + i;
-		g_aln_param.band_width = l + p->l;
-		aa = aln_stdaln_aux(s, (const char*)p->s, &g_aln_param, g_is_global, g_thres, l, p->l);
-		if (aa->score >= g_thres || g_is_global) {
-			printf(">%s\t%d\t%d\t%s\t%c\t%d\t%d\t%d\t%d\t", p->n, aa->start1? aa->start1 : 1, aa->end1, name, strand,
-				   aa->start2? aa->start2 : 1, aa->end2, aa->score, aa->subo);
-			// NB: I put the short sequence as the first sequence in SW, an insertion to
-			// the reference becomes a deletion from the short sequence. Therefore, I use
-			// "MDI" here rather than "MID", and print ->out2 first rather than ->out1.
-			for (i = 0; i != aa->n_cigar; ++i)
-				printf("%d%c", aa->cigar32[i]>>4, "MDI"[aa->cigar32[i]&0xf]);
-			printf("\n%s\n%s\n%s\n", aa->out2, aa->outm, aa->out1);
-		}
-		aln_free_AlnAln(aa);
-	}
-}
-
-static void aln_seqs(const seqs_t *ss, const char *fn)
-{
-	gzFile fp;
-	kseq_t *seq;
-	int l;
-
-	fp = xzopen(fn, "r");
-	seq = kseq_init(fp);
-	while ((l = kseq_read(seq)) >= 0) {
-		if (g_strand&1) aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '+');
-		if (g_strand&2) {
-			revseq(l, (uint8_t*)seq->seq.s);
-			aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '-');
-		}
-	}
-	kseq_destroy(seq);
-	gzclose(fp);
-}
-
-int bwa_stdsw(int argc, char *argv[])
-{
-	int c;
-	seqs_t *ss;
-
-	while ((c = getopt(argc, argv, "gT:frp")) >= 0) {
-		switch (c) {
-		case 'g': g_is_global = 1; break;
-		case 'T': g_thres = atoi(optarg); break;
-		case 'f': g_strand |= 1; break;
-		case 'r': g_strand |= 2; break;
-		case 'p': g_aa = 1; break;
-		}
-	}
-	if (g_strand == 0) g_strand = 3;
-	if (g_aa) g_strand = 1;
-	if (optind + 1 >= argc) {
-		fprintf(stderr, "\nUsage:   bwa stdsw [options] <seq1.long.fa> <seq2.short.fa>\n\n");
-		fprintf(stderr, "Options: -T INT    minimum score [%d]\n", g_thres);
-		fprintf(stderr, "         -p        protein alignment (suppressing -r)\n");
-		fprintf(stderr, "         -f        forward strand only\n");
-		fprintf(stderr, "         -r        reverse strand only\n");
-		fprintf(stderr, "         -g        global alignment\n\n");
-		fprintf(stderr, "Note: This program is specifically designed for alignment between multiple short\n");
-		fprintf(stderr, "      sequences and ONE long sequence. It outputs the suboptimal score on the long\n");
-		fprintf(stderr, "      sequence.\n\n");
-		return 1;
-	}
-	g_aln_param = g_aa? aln_param_aa2aa : aln_param_blast;
-	g_aln_param.gap_end = 0;
-	ss = load_seqs(argv[optind]);
-	aln_seqs(ss, argv[optind+1]);
-	return 0;
-}
diff --git a/solid2fastq.pl b/solid2fastq.pl
deleted file mode 100755
index c60ad81..0000000
--- a/solid2fastq.pl
+++ /dev/null
@@ -1,111 +0,0 @@
-#!/usr/bin/perl -w
-
-# Author: lh3
-# Note: Ideally, this script should be written in C. It is a bit slow at present.
-# Also note that this script is different from the one contained in MAQ.
-
-use strict;
-use warnings;
-use Getopt::Std;
-
-my %opts;
-my $version = '0.1.4';
-my $usage = qq{
-Usage: solid2fastq.pl <in.title> <out.prefix>
-
-Note: <in.title> is the string showed in the `# Title:' line of a
-      ".csfasta" read file. Then <in.title>F3.csfasta is read sequence
-      file and <in.title>F3_QV.qual is the quality file. If
-      <in.title>R3.csfasta is present, this script assumes reads are
-      paired; otherwise reads will be regarded as single-end.
-
-      The read name will be <out.prefix>:panel_x_y/[12] with `1' for R3
-      tag and `2' for F3. Usually you may want to use short <out.prefix>
-      to save diskspace. Long <out.prefix> also causes troubles to maq.
-
-};
-
-getopts('', \%opts);
-die($usage) if (@ARGV != 2);
-my ($title, $pre) = @ARGV;
-my (@fhr, @fhw);
-my @fn_suff = ('F3.csfasta', 'F3_QV.qual', 'R3.csfasta', 'R3_QV.qual');
-my $is_paired = (-f "$title$fn_suff[2]" || -f "$title$fn_suff[2].gz")? 1 : 0;
-if ($is_paired) { # paired end
-  for (0 .. 3) {
-	my $fn = "$title$fn_suff[$_]";
-	$fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz");
-	open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n");
-  }
-  open($fhw[0], "|gzip >$pre.read2.fastq.gz") || die; # this is NOT a typo
-  open($fhw[1], "|gzip >$pre.read1.fastq.gz") || die;
-  open($fhw[2], "|gzip >$pre.single.fastq.gz") || die;
-  my (@df, @dr);
-  @df = &read1(1); @dr = &read1(2);
-  while (@df && @dr) {
-	if ($df[0] eq $dr[0]) { # mate pair
-	  print {$fhw[0]} $df[1]; print {$fhw[1]} $dr[1];
-	  @df = &read1(1); @dr = &read1(2);
-	} else {
-	  if ($df[0] le $dr[0]) {
-		print {$fhw[2]} $df[1];
-		@df = &read1(1);
-	  } else {
-		print {$fhw[2]} $dr[1];
-		@dr = &read1(2);
-	  }
-	}
-  }
-  if (@df) {
-	print {$fhw[2]} $df[1];
-	while (@df = &read1(1, $fhr[0], $fhr[1])) {
-	  print {$fhw[2]} $df[1];
-	}
-  }
-  if (@dr) {
-	print {$fhw[2]} $dr[1];
-	while (@dr = &read1(2, $fhr[2], $fhr[3])) {
-	  print {$fhw[2]} $dr[1];
-	}
-  }
-  close($fhr[$_]) for (0 .. $#fhr);
-  close($fhw[$_]) for (0 .. $#fhw);
-} else { # single end
-  for (0 .. 1) {
-	my $fn = "$title$fn_suff[$_]";
-	$fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz");
-	open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n");
-  }
-  open($fhw[2], "|gzip >$pre.single.fastq.gz") || die;
-  my @df;
-  while (@df = &read1(1, $fhr[0], $fhr[1])) {
-	print {$fhw[2]} $df[1];
-  }
-  close($fhr[$_]) for (0 .. $#fhr);
-  close($fhw[2]);
-}
-
-sub read1 {
-  my $i = shift(@_);
-  my $j = ($i-1)<<1;
-  my ($key, $seq);
-  my ($fhs, $fhq) = ($fhr[$j], $fhr[$j|1]);
-  while (<$fhs>) {
-	my $t = <$fhq>;
-	if (/^>(\d+)_(\d+)_(\d+)_[FR]3/) {
-	  $key = sprintf("%.4d_%.4d_%.4d", $1, $2, $3); # this line could be improved on 64-bit machines
-	  die(qq/** unmatched read name: '$_' != '$_'\n/) unless ($_ eq $t);
-	  my $name = "$pre:$1_$2_$3/$i";
-	  $_ = substr(<$fhs>, 2);
-	  tr/0123./ACGTN/;
-	  my $s = $_;
-	  $_ = <$fhq>;
-	  s/-1\b/0/eg;
-	  s/^(\d+)\s*//;
-	  s/(\d+)\s*/chr($1+33)/eg;
-	  $seq = qq/\@$name\n$s+\n$_\n/;
-	  last;
-	}
-  }
-  return defined($seq)? ($key, $seq) : ();
-}
diff --git a/stdaln.c b/stdaln.c
index eb41882..cd064cf 100644
--- a/stdaln.c
+++ b/stdaln.c
@@ -542,13 +542,12 @@ int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2,
 	int start, end, max_score;
 	int thres, *suba, *ss;
 
-	int gap_open, gap_ext, b;
+	int gap_open, gap_ext;
 	int *score_matrix, N_MATRIX_ROW;
 
 	/* initialize some align-related parameters. just for compatibility */
 	gap_open = ap->gap_open;
 	gap_ext = ap->gap_ext;
-	b = ap->band_width;
 	score_matrix = ap->matrix;
 	N_MATRIX_ROW = ap->row;
 	thres = _thres > 0? _thres : -_thres;
@@ -862,7 +861,7 @@ uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar)
 int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap,
 					path_t *path, int *path_len, int G0, uint8_t *_mem)
 {
-	int q, r, qr, tmp_len;
+	int q, r, qr;
 	int32_t **s_array, *score_array;
 	int is_overflow, of_base;
 	uint32_t *eh;
@@ -889,7 +888,6 @@ int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2
 		s_array[i] = (int32_t*)_p, _p += 4 * len1;
 	/* initialization */
 	aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array);
-	tmp_len = len1 + 1;
 	start = 1; end = 2;
 	end_i = end_j = 0;
 	score = 0;
diff --git a/utils.c b/utils.c
index 8c1ad7e..20b09ee 100644
--- a/utils.c
+++ b/utils.c
@@ -35,6 +35,18 @@
 #include <sys/time.h>
 #include "utils.h"
 
+#include "ksort.h"
+#define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y))
+KSORT_INIT(128, pair64_t, pair64_lt)
+KSORT_INIT(64,  uint64_t, ks_lt_generic)
+
+#include "kseq.h"
+KSEQ_INIT2(, gzFile, gzread)
+
+/********************
+ * System utilities *
+ ********************/
+
 FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
 {
 	FILE *fp = 0;
@@ -46,6 +58,7 @@ FILE *err_xopen_core(const char *func, const char *fn, const char *mode)
 	}
 	return fp;
 }
+
 FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp)
 {
 	if (freopen(fn, mode, fp) == 0) {
@@ -56,6 +69,7 @@ FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE
 	}
 	return fp;
 }
+
 gzFile err_xzopen_core(const char *func, const char *fn, const char *mode)
 {
 	gzFile fp;
@@ -67,6 +81,7 @@ gzFile err_xzopen_core(const char *func, const char *fn, const char *mode)
 	}
 	return fp;
 }
+
 void err_fatal(const char *header, const char *fmt, ...)
 {
 	va_list args;
@@ -86,68 +101,54 @@ void err_fatal_simple_core(const char *func, const char *msg)
 
 size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
 {
-    size_t ret = fwrite(ptr, size, nmemb, stream);
-    if (ret != nmemb) 
-    {
-        err_fatal_simple_core("fwrite", strerror(errno));
-    }
-    return ret;
+	size_t ret = fwrite(ptr, size, nmemb, stream);
+	if (ret != nmemb) 
+		err_fatal_simple_core("fwrite", strerror(errno));
+	return ret;
 }
 
 int err_printf(const char *format, ...) 
 {
-    va_list arg;
-    int done;
-
-    va_start(arg, format);
-    done = vfprintf(stdout, format, arg);
-    int saveErrno = errno;
-    va_end(arg);
-
-    if (done < 0) 
-    {
-        err_fatal_simple_core("vfprintf(stdout)", strerror(saveErrno));
-    }
-    return done;
+	va_list arg;
+	int done;
+	va_start(arg, format);
+	done = vfprintf(stdout, format, arg);
+	int saveErrno = errno;
+	va_end(arg);
+	if (done < 0) err_fatal_simple_core("vfprintf(stdout)", strerror(saveErrno));
+	return done;
 }
 
 int err_fprintf(FILE *stream, const char *format, ...) 
 {
-    va_list arg;
-    int done;
-
-    va_start(arg, format);
-    done = vfprintf(stream, format, arg);
-    int saveErrno = errno;
-    va_end(arg);
-
-    if (done < 0) 
-    {
-        err_fatal_simple_core("vfprintf", strerror(saveErrno));
-    }
-    return done;
+	va_list arg;
+	int done;
+	va_start(arg, format);
+	done = vfprintf(stream, format, arg);
+	int saveErrno = errno;
+	va_end(arg);
+	if (done < 0) err_fatal_simple_core("vfprintf", strerror(saveErrno));
+	return done;
 }
 
 int err_fflush(FILE *stream) 
 {
-    int ret = fflush(stream);
-    if (ret != 0) 
-    {
-        err_fatal_simple_core("fflush", strerror(errno));
-    }
-    return ret;
+	int ret = fflush(stream);
+	if (ret != 0) err_fatal_simple_core("fflush", strerror(errno));
+	return ret;
 }
 
 int err_fclose(FILE *stream) 
 {
-    int ret = fclose(stream);
-    if (ret != 0) 
-    {
-        err_fatal_simple_core("fclose", strerror(errno));
-    }
-    return ret;
+	int ret = fclose(stream);
+	if (ret != 0) err_fatal_simple_core("fclose", strerror(errno));
+	return ret;
 }
 
+/*********
+ * Timer *
+ *********/
+
 double cputime()
 {
 	struct rusage r;
diff --git a/utils.h b/utils.h
index b6839e9..a3db251 100644
--- a/utils.h
+++ b/utils.h
@@ -28,6 +28,7 @@
 #ifndef LH3_UTILS_H
 #define LH3_UTILS_H
 
+#include <stdint.h>
 #include <stdio.h>
 #include <zlib.h>
 
@@ -38,14 +39,19 @@
 #define ATTRIBUTE(list)
 #endif
 
-
-
 #define err_fatal_simple(msg) err_fatal_simple_core(__func__, msg)
 #define xopen(fn, mode) err_xopen_core(__func__, fn, mode)
 #define xreopen(fn, mode, fp) err_xreopen_core(__func__, fn, mode, fp)
 #define xzopen(fn, mode) err_xzopen_core(__func__, fn, mode)
 #define xassert(cond, msg) if ((cond) == 0) err_fatal_simple_core(__func__, msg)
 
+typedef struct {
+	uint64_t x, y;
+} pair64_t;
+
+typedef struct { size_t n, m; uint64_t *a; } uint64_v;
+typedef struct { size_t n, m; pair64_t *a; } pair64_v;
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -66,8 +72,24 @@ extern "C" {
 	double cputime();
 	double realtime();
 
+	void ks_introsort_64 (size_t n, uint64_t *a);
+	void ks_introsort_128(size_t n, pair64_t *a);
+
 #ifdef __cplusplus
 }
 #endif
 
+static inline uint64_t hash_64(uint64_t key)
+{
+	key += ~(key << 32);
+	key ^= (key >> 22);
+	key += ~(key << 13);
+	key ^= (key >> 8);
+	key += (key << 3);
+	key ^= (key >> 15);
+	key += ~(key << 27);
+	key ^= (key >> 31);
+	return key;
+}
+
 #endif

-- 
Burrows-Wheeler Aligner