[med-svn] [vsearch] 01/04: Imported Upstream version 2.0.0

Andreas Tille tille at debian.org
Thu Jun 30 08:33:39 UTC 2016


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository vsearch.

commit ecb9cf5f8bb758b6c534bf550b1ba065df752163
Author: Andreas Tille <tille at debian.org>
Date:   Thu Jun 30 10:05:17 2016 +0200

    Imported Upstream version 2.0.0
---
 configure.ac       |   2 +-
 man/vsearch.1      |  87 +++++++---
 src/chimera.cc     |   2 +-
 src/dynlibs.cc     |  17 +-
 src/dynlibs.h      |   4 +
 src/eestats.cc     |   2 +-
 src/fasta.cc       | 316 +++-------------------------------
 src/fasta.h        |  63 ++-----
 src/fastq.cc       | 368 ++++-----------------------------------
 src/fastq.h        |  71 ++------
 src/fastqops.cc    |   8 +-
 src/fastx.cc       | 491 ++++++++++++++++++++++++++++++++++++++---------------
 src/fastx.h        |  53 +++++-
 src/maps.cc        |  53 ++++++
 src/maps.h         |   1 +
 src/mergepairs.cc  |   4 +-
 src/msa.cc         |  82 +++------
 src/rerep.cc       |   2 +-
 src/search.cc      |   2 +-
 src/searchcore.cc  |   4 +-
 src/searchexact.cc |   2 +-
 src/util.cc        |   2 +-
 src/vsearch.cc     | 113 +++++++++++-
 src/vsearch.h      |   4 +-
 24 files changed, 783 insertions(+), 970 deletions(-)

diff --git a/configure.ac b/configure.ac
index 79d3c5c..6260dd7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,7 @@
 # Process this file with autoconf to produce a configure script.
 
 AC_PREREQ([2.63])
-AC_INIT([vsearch], [1.11.1], [torognes at ifi.uio.no])
+AC_INIT([vsearch], [2.0.0], [torognes at ifi.uio.no])
 AM_INIT_AUTOMAKE([subdir-objects])
 AC_LANG([C++])
 AC_CONFIG_SRCDIR([src/vsearch.cc])
diff --git a/man/vsearch.1 b/man/vsearch.1
index 4572afb..672d911 100644
--- a/man/vsearch.1
+++ b/man/vsearch.1
@@ -1,5 +1,5 @@
 .\" ============================================================================
-.TH vsearch 1 "April 13, 2016" "version 1.11.1" "USER COMMANDS"
+.TH vsearch 1 "June 24, 2016" "version 2.0.0" "USER COMMANDS"
 .\" ============================================================================
 .SH NAME
 vsearch \(em chimera detection, clustering, dereplication and
@@ -169,12 +169,15 @@ both of the symbols are ambiguous (RYSWKMDBHVN) in which case the
 score is zero. Alignment of two identical ambiguous symbols (e.g. R vs
 R) also receives a score of zero.
 .PP
-\fBvsearch\fR can be compiled to accepted compressed fasta files as
-input (gz and bzip2 formats). On the other hand, special files like
-pipes, named pipes, or sockets cannot be used as input. To present a
-progress indicator, \fBvsearch\fR needs to seek to the end of
-\fIfilename\fR to find its length. Consequently, \fIfilename\fR must
-be a regular file, not a stream.
+Input files compressed with gzip or bzip2 are automatically
+detected and decompressed if vsearch was compiled with the appropriate
+libraries. Input from pipes is supported, but then compressed input
+must be indicated using the \-\-gzip_decompress or
+\-\-bzip2_decompress options. For input files the name '\-'
+represents standard input (/dev/stdin). Multiple FASTA or FASTQ files
+may be piped into vsearch for dereplication or other operations.  When
+reading from a pipe, the progress indicator is not updated.  For
+output files the name '\-' represents standard output (/dev/stdout).
 .\" ----------------------------------------------------------------------------
 .SS Options
 \fBvsearch\fR recognizes a large number of command-line options. For
@@ -182,16 +185,26 @@ easier navigation, options are grouped below by theme (chimera
 detection, clustering, dereplication and rereplication, FASTA/FASTQ
 file processing, masking, pairwise alignment, searching, shuffling,
 sorting, and subsampling). We start with the general options that
-apply to all themes.
+apply to all themes. Options may start with a single (\-) or double
+dash (\-\-). Option names may be shortened as long as they are not
+ambiguous (e.g. \-\-derep_f).
 .PP
 General options:
 .RS
 .TP 9
+.B \-\-bzip2_decompress
+Decompress input using bzip2. The option is required only when reading
+from a pipe, otherwise compression is automatically detected.
+.TP
 .BI \-\-fasta_width\~ "positive integer"
 Fasta files produced by \fBvsearch\fR are wrapped (sequences are
 written on lines of \fIinteger\fR nucleotides, 80 by default). Set
 that value to 0 to eliminate the wrapping.
 .TP
+.B \-\-gzip_decompress
+Decompress input using gzip. The option is required only when reading
+from a pipe, otherwise compression is automatically detected.
+.TP
 .B \-\-help | \-\-h
 Display help text and exit.
 .TP
@@ -1163,8 +1176,11 @@ length. Output order may vary when using multiple threads.
 Write search results to \fIfilename\fR using a blast-like
 tab-separated format of twelve fields (listed below), with one line
 per query-target matching (or lack of matching if \-\-output_no_hits
-is used). Output order may vary when using multiple threads. A similar
-output can be obtain with \-\-userout \fIfilename\fR and
+is used). Warning, vsearch uses global pairwise alignments, not
+blast's seed-and-extend algorithm. Therefore, some common blast output
+values (alignment start and end, evalue, bit score) are reported
+differently. Output order may vary when using multiple threads. A
+similar output can be obtain with \-\-userout \fIfilename\fR and
 \-\-userfields
 query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+bits.  A
 complete list and description is available in the section "Userfields"
@@ -1193,18 +1209,20 @@ integer value).
 positive integer value).
 .IP \n+[step].
 \fIqlo\fR: first nucleotide of the query aligned with the
-target. Always equal to 1 if there is an alignment, 0 otherwise.
+target. Always equal to 1 if there is an alignment, 0 otherwise (see
+\fIqilo\fR to ignore initial gaps).
 .IP \n+[step].
 \fIqhi\fR: last nucleotide of the query aligned with the
-target. Always equal to the length of the pairwise alignment. The
-field is set to 0 if there is no alignment.
+target. Always equal to the length of the pairwise alignment, 0
+otherwise (see \fIqihi\fR to ignore terminal gaps).
 .IP \n+[step].
 \fItlo\fR: irst nucleotide of the target aligned with the
-query. Always equal to 1 if there is an alignment, 0 otherwise.
+query. Always equal to 1 if there is an alignment, 0 otherwise (see
+\fItilo\fR to ignore initial gaps).
 .IP \n+[step].
 \fIthi\fR: last nucleotide of the target aligned with the
-query. Always equal to the length of the pairwise alignment. The field
-is set to 0 if there is no alignment.
+query. Always equal to the length of the pairwise alignment, 0
+otherwise (see \fItihi\fR to ignore terminal gaps).
 .IP \n+[step].
 \fIevalue\fR: expectancy-value (not computed for nucleotide
 alignments). Always set to -1.
@@ -1814,8 +1832,8 @@ is not computed by \fBvsearch\fR. Always set to +0.
 .TP
 .B qhi
 Last nucleotide of the query aligned with the target. Always equal to
-the length of the pairwise alignment. The field is set to 0 if there
-is no alignment.
+the length of the pairwise alignment, 0 otherwise (see \fIqihi\fR to
+ignore terminal gaps).
 .TP
 .B qihi
 Last nucleotide of the query aligned with the target (ignoring
@@ -1833,7 +1851,8 @@ if there is no alignment.
 .TP
 .B qlo
 First nucleotide of the query aligned with the target. Always equal to
-1 if there is an alignment, 0 otherwise.
+1 if there is an alignment, 0 otherwise (see \fIqilo\fR to ignore
+initial gaps).
 .TP
 .B qrow
 Print the sequence of the query segment as seen in the pairwise
@@ -1873,8 +1892,8 @@ is not computed by \fBvsearch\fR. Always set to +0.
 .TP
 .B thi
 Last nucleotide of the target aligned with the query. Always equal to
-the length of the pairwise alignment. The field is set to 0 if there
-is no alignment.
+the length of the pairwise alignment, 0 otherwise (see \fItihi\fR to
+ignore terminal gaps).
 .TP
 .B tihi
 Last nucleotide of the target aligned with the query (ignoring
@@ -1892,7 +1911,8 @@ if there is no alignment.
 .TP
 .B tlo
 First nucleotide of the target aligned with the query. Always equal to
-1 if there is an alignment, 0 otherwise.
+1 if there is an alignment, 0 otherwise (see \fItilo\fR to ignore
+initial gaps).
 .TP
 .B trow
 Print the sequence of the target segment as seen in the pairwise
@@ -2452,6 +2472,29 @@ specified with \-\-fastaout and \-\-fastaout_discarded when \-\-eeout
 or \-\-fastq_eeout option is in effect for fastq_filter and
 fastq_mergepairs. The options \-\-eeout and \-\-fastq_eeout are now
 equivalent.
+.TP
+.BR v1.11.2\~ "released June 21, 2016"
+Two bugs were fixed. The first issue was related to the \-\-query_cov
+option that used a different coverage definition than the qcov
+userfield. The coverage is now defined as the fraction of the whole
+query sequence length that is aligned with matching or mismatching
+residues in the target. All gaps are ignored. The other issue was
+related to the consensus sequences produced during clustering when
+only N's were present in some positions. Previously these would be
+converted to A's in the consensus. The behaviour is changed so that
+N's are produced in the consensus, and it should now be more
+compatible with usearch.
+.TP
+.BR v2.0.0\~ "released June 24, 2016"
+This major new version supports reading from pipes. Two new options
+are added: \-\-gzip_decompress and \-\-bzip2_decompress. One of these
+options must be specified if reading compressed input from a pipe, but
+are not required when reading from ordinary files. The vsearch header
+that was previously written to stdout is now written to stderr. This
+enables piping of results for further processing. The file name '\-'
+now represent standard input (/dev/stdin) or standard outout
+(/dev/stdout) when reading or writing files, respectively. Code for
+reading FASTA and FASTQ files have been refactored.
 .RE
 .LP
 .\" ============================================================================
diff --git a/src/chimera.cc b/src/chimera.cc
index a01fd50..c750dd9 100644
--- a/src/chimera.cc
+++ b/src/chimera.cc
@@ -79,7 +79,7 @@ const double chimera_id = 0.55;
 static int tophits;
 static pthread_attr_t attr;
 static pthread_t * pthread;
-static fasta_handle query_fasta_h;
+static fastx_handle query_fasta_h;
 
 /* mutexes and global data protected by mutex */
 static pthread_mutex_t mutex_input;
diff --git a/src/dynlibs.cc b/src/dynlibs.cc
index 2cc804c..4f6cc2e 100644
--- a/src/dynlibs.cc
+++ b/src/dynlibs.cc
@@ -70,6 +70,10 @@ void * gz_lib;
 gzFile (*gzdopen_p)(int, const char *);
 int (*gzclose_p)(gzFile);
 int (*gzread_p)(gzFile, void *, unsigned);
+int (*gzgetc_p)(gzFile);
+int (*gzrewind_p)(gzFile);
+int (*gzungetc_p)(int, gzFile);
+const char * (*gzerror_p)(gzFile, int*);
 #endif
 
 #ifdef HAVE_BZLIB_H
@@ -90,12 +94,13 @@ void dynlibs_open()
   gz_lib = dlopen(gz_libname, RTLD_LAZY);
   if (gz_lib)
     {
-      gzdopen_p = (gzFile (*)(int, const char*))
-        dlsym(gz_lib, "gzdopen");
-      gzclose_p = (int (*)(gzFile))
-        dlsym(gz_lib, "gzclose");
-      gzread_p = (int (*)(gzFile, void*, unsigned))
-        dlsym(gz_lib, "gzread");
+      gzdopen_p = (gzFile (*)(int, const char*)) dlsym(gz_lib, "gzdopen");
+      gzclose_p = (int (*)(gzFile)) dlsym(gz_lib, "gzclose");
+      gzread_p = (int (*)(gzFile, void*, unsigned)) dlsym(gz_lib, "gzread");
+      gzgetc_p = (int (*)(gzFile)) dlsym(gz_lib, "gzgetc");
+      gzrewind_p = (int (*)(gzFile)) dlsym(gz_lib, "gzrewind");
+      gzerror_p = (const char * (*)(gzFile, int*)) dlsym(gz_lib, "gzerror");
+      gzungetc_p = (int (*)(int, gzFile)) dlsym(gz_lib, "gzungetc");
     }
 #endif
 
diff --git a/src/dynlibs.h b/src/dynlibs.h
index 711b570..0d13cbc 100644
--- a/src/dynlibs.h
+++ b/src/dynlibs.h
@@ -63,6 +63,10 @@ extern void * gz_lib;
 extern gzFile (*gzdopen_p)(int, const char *);
 extern int (*gzclose_p)(gzFile);
 extern int (*gzread_p)(gzFile, void*, unsigned);
+extern int (*gzgetc_p)(gzFile);
+extern int (*gzrewind_p)(gzFile);
+extern int (*gzungetc_p)(int, gzFile);
+extern const char * (*gzerror_p)(gzFile, int*);
 #endif
 
 #ifdef HAVE_BZLIB_H
diff --git a/src/eestats.cc b/src/eestats.cc
index ff2aad4..bb6ea6b 100644
--- a/src/eestats.cc
+++ b/src/eestats.cc
@@ -72,7 +72,7 @@ long ee_start(int pos, int resolution)
 
 void fastq_eestats()
 {
-  fastq_handle h = fastq_open(opt_fastq_eestats);
+  fastx_handle h = fastq_open(opt_fastq_eestats);
 
   unsigned long filesize = fastq_get_size(h);
 
diff --git a/src/fasta.cc b/src/fasta.cc
index 6a80abf..d788c4b 100644
--- a/src/fasta.cc
+++ b/src/fasta.cc
@@ -60,279 +60,22 @@
 
 #include "vsearch.h"
 
-#define FASTA_BUFFER_ALLOC 8192
-
-#ifdef HAVE_BZLIB_H
-#define BZ_VERBOSE_0 0
-#define BZ_VERBOSE_1 1
-#define BZ_VERBOSE_2 2
-#define BZ_VERBOSE_3 3
-#define BZ_VERBOSE_4 4
-#define BZ_MORE_MEM 0  /* faster decompression using more memory */
-#define BZ_LESS_MEM 1  /* slower decompression but requires less memory */
-#endif
-
-#define FORMAT_PLAIN 1
-#define FORMAT_BZIP  2
-#define FORMAT_GZIP  3
-
-static unsigned char MAGIC_GZIP[] = "\x1f\x8b";
-static unsigned char MAGIC_BZIP[] = "BZ";
-
-void buffer_init(struct fasta_buffer_s * buffer)
+fastx_handle fasta_open(const char * filename)
 {
-  buffer->alloc = FASTA_BUFFER_ALLOC;
-  buffer->data = (char*) xmalloc(buffer->alloc);
-  buffer->data[0] = 0;
-  buffer->length = 0;
-  buffer->position = 0;
-}
-
-void buffer_free(struct fasta_buffer_s * buffer)
-{
-  if (buffer->data)
-    free(buffer->data);
-  buffer->data = 0;
-  buffer->alloc = 0;
-  buffer->length = 0;
-  buffer->position = 0;
-}
-
-fasta_handle fasta_open(const char * filename)
-{
-  fasta_handle h = (fasta_handle) xmalloc(sizeof(struct fasta_s));
-  
-  h->fp = NULL;
-  h->fp = fopen(filename, "rb");
-  if(!h->fp)
-    fatal("Error: Unable to open fasta file for reading (%s)", filename);
-  
-  /* detect compression (plain, gzipped or bzipped) */
-  
-  unsigned char magic[2];
-  h->format = FORMAT_PLAIN;
-  if (fread(&magic, 1, 2, h->fp) >= 2)
-    {
-      if (!memcmp(magic, MAGIC_GZIP, 2))
-        h->format = FORMAT_GZIP;
-      else if (!memcmp(magic, MAGIC_BZIP, 2))
-        h->format = FORMAT_BZIP;
-    }
-
-  /* Get size of original (uncompressed) file */
-
-  if (fseek(h->fp, 0, SEEK_END))
-    fatal("Error: Unable to seek in fasta file (%s)", filename);
-  h->file_size = ftell(h->fp);
-  rewind(h->fp);
-
-  if (h->format == FORMAT_GZIP)
-    {
-      /* GZIP: Keep original file open, then open as gzipped file as well */
-#ifdef HAVE_ZLIB_H
-      if (!gz_lib)
-        fatal("Files compressed with gzip are not supported");
-      if (! (h->fp_gz = (*gzdopen_p)(fileno(h->fp), "rb")))
-        fatal("Unable to open gzip compressed fasta file (%s)", filename);
-#else
-      fatal("Files compressed with gzip are not supported");
-#endif
-    }
-
-  if (h->format == FORMAT_BZIP)
-    {
-      /* BZIP2: Keep original file open, then open as bzipped file as well */
-#ifdef HAVE_BZLIB_H
-      if (!bz2_lib)
-        fatal("Files compressed with bzip2 are not supported");
-      int bzError;
-      if (! (h->fp_bz = (*BZ2_bzReadOpen_p)(& bzError, h->fp,
-                                            BZ_VERBOSE_0, BZ_MORE_MEM,
-                                            NULL, 0)))
-        fatal("Unable to open bzip2 compressed fasta file (%s)", filename);
-#else
-      fatal("Files compressed with bzip2 are not supported");
-#endif
-    }
+  fastx_handle h = fastx_open(filename);
 
-
-
-  h->stripped_all = 0;
-
-  for(int i=0; i<256; i++)
-    h->stripped[i] = 0;
-
-  h->file_position = 0;
-
-  buffer_init(& h->file_buffer);
-  buffer_init(& h->header_buffer);
-  buffer_init(& h->sequence_buffer);
-
-  h->lineno = 1;
-  h->lineno_start = 1;
-  h->seqno = -1;
+  if (fastx_is_fastq(h))
+    fatal("FASTA file expected, FASTQ file found (%s)", filename);
 
   return h;
 }
 
-void fasta_close(fasta_handle h)
-{
-  /* Warn about stripped chars */
-
-  if (h->stripped_all)
-    {
-      fprintf(stderr, "WARNING: invalid characters stripped from fasta file:");
-      for (int i=0; i<256;i++)
-        if (h->stripped[i])
-          fprintf(stderr, " %c(%lu)", i, h->stripped[i]);
-      fprintf(stderr, "\n");
-
-      if (opt_log)
-        {
-          fprintf(fp_log, "WARNING: invalid characters stripped from fasta file:");
-          for (int i=0; i<256;i++)
-            if (h->stripped[i])
-              fprintf(fp_log, " %c(%lu)", i, h->stripped[i]);
-          fprintf(fp_log, "\n");
-        }
-    }
-
-#ifdef HAVE_BZLIB_H
-  int bz_error;
-#endif
-  
-  switch(h->format)
-    {
-    case FORMAT_PLAIN:
-      fclose(h->fp);
-      h->fp = 0;
-      break;
-
-    case FORMAT_GZIP:
-#ifdef HAVE_ZLIB_H
-      (*gzclose_p)(h->fp_gz);
-      h->fp_gz = 0;
-      h->fp = 0;
-      break;
-#endif
-      
-    case FORMAT_BZIP:
-#ifdef HAVE_BZLIB_H
-      (*BZ2_bzReadClose_p)(&bz_error, h->fp_bz);
-      h->fp_bz = 0;
-      h->fp = 0;
-      break;
-#endif
-
-    default:
-      fatal("Internal error");
-    }
-  
-  buffer_free(& h->file_buffer);
-  buffer_free(& h->header_buffer);
-  buffer_free(& h->sequence_buffer);
-
-  h->file_size = 0;
-  h->file_position = 0;
-
-  h->lineno = 0;
-  h->seqno = -1;
-
-  free(h);
-}
-
-
-unsigned long fasta_file_fill_buffer(fasta_handle h)
-{
-  /* read more data if necessary */
-  unsigned long rest = h->file_buffer.length - h->file_buffer.position;
-  
-  if (rest > 0)
-    return rest;
-  else
-    {
-      unsigned long space = h->file_buffer.alloc - h->file_buffer.length;
-
-      if (space == 0)
-        {
-          /* back to beginning of buffer */
-          h->file_buffer.position = 0;
-          h->file_buffer.length = 0;
-          space = h->file_buffer.alloc;
-        }
-      
-      int bytes_read = 0;
-      
-#ifdef HAVE_BZLIB_H
-      int bzError = 0;
-#endif
-
-      switch(h->format)
-        {
-        case FORMAT_PLAIN:
-          bytes_read = fread(h->file_buffer.data
-                             + h->file_buffer.position,
-                             1,
-                             space,
-                             h->fp);
-          break;
-
-        case FORMAT_GZIP:
-#ifdef HAVE_ZLIB_H
-          bytes_read = (*gzread_p)(h->fp_gz,
-                                   h->file_buffer.data
-                                   + h->file_buffer.position,
-                                   space);
-          if (bytes_read < 0)
-            fatal("Error reading gzip compressed fasta file");
-          break;
-#endif
-          
-        case FORMAT_BZIP:
-#ifdef HAVE_BZLIB_H
-          bytes_read = (*BZ2_bzRead_p)(& bzError,
-                                       h->fp_bz,
-                                       h->file_buffer.data
-                                       + h->file_buffer.position,
-                                       space);
-          if ((bytes_read < 0) ||
-              ! ((bzError == BZ_OK) ||
-                 (bzError == BZ_STREAM_END) ||
-                 (bzError == BZ_SEQUENCE_ERROR)))
-            fatal("Error reading bzip2 compressed fasta file");
-          break;
-#endif
-          
-        default:
-          fatal("Internal error");
-        }
-      
-      h->file_buffer.length += bytes_read;
-      return bytes_read;
-    }
-}
-
-void buffer_extend(struct fasta_buffer_s * buffer, char * buf, unsigned long len)
+void fasta_close(fastx_handle h)
 {
-  if (buffer->length + len + 1 > buffer->alloc)
-    {
-      /* alloc space for len more characters + terminating zero,
-         but round up to nearest block size */
-      buffer->alloc = 
-        FASTA_BUFFER_ALLOC * 
-        (((buffer->length + len) / FASTA_BUFFER_ALLOC) + 1);
-      buffer->data = (char*) xrealloc(buffer->data, buffer->alloc);
-    }
-
-  /* copy string */
-  memcpy(buffer->data + buffer->length, buf, len);
-  buffer->length += len;
-
-  /* add terminator */
-  buffer->data[buffer->length] = 0;
+  fastx_close(h);
 }
 
-void fasta_truncate_header(fasta_handle h, bool truncateatspace)
+void fasta_truncate_header(fastx_handle h, bool truncateatspace)
 {
   /* Truncate the zero-terminated header string by inserting a new
      terminator (zero byte) at the first space/tab character
@@ -346,7 +89,7 @@ void fasta_truncate_header(fasta_handle h, bool truncateatspace)
   h->header_buffer.data[h->header_buffer.length] = 0;
 }
 
-void fasta_filter_sequence(fasta_handle h,
+void fasta_filter_sequence(fastx_handle h,
                            unsigned int * char_action,
                            char * char_mapping)
 {
@@ -408,7 +151,7 @@ void fasta_filter_sequence(fasta_handle h,
   h->sequence_buffer.length = q - h->sequence_buffer.data;
 }
 
-bool fasta_next(fasta_handle h,
+bool fasta_next(fastx_handle h,
                 bool truncateatspace,
                 char * char_mapping)
 {
@@ -417,7 +160,7 @@ bool fasta_next(fasta_handle h,
   h->header_buffer.length = 0;
   h->sequence_buffer.length = 0;
 
-  unsigned long rest = fasta_file_fill_buffer(h);
+  unsigned long rest = fastx_file_fill_buffer(h);
 
   if (rest == 0)
     return 0;
@@ -427,7 +170,10 @@ bool fasta_next(fasta_handle h,
   /* check initial > character */
   
   if (h->file_buffer.data[h->file_buffer.position] != '>')
-    fatal("Invalid FASTA - header must start with > character");
+    {
+      fprintf(stderr, "Found character %02x\n", (unsigned char)(h->file_buffer.data[h->file_buffer.position]));
+      fatal("Invalid FASTA - header must start with > character");
+    }
   h->file_buffer.position++;
   rest--;
 
@@ -435,7 +181,7 @@ bool fasta_next(fasta_handle h,
   while (lf == 0)
     {
       /* get more data if buffer empty*/
-      rest = fasta_file_fill_buffer(h);
+      rest = fastx_file_fill_buffer(h);
       if (rest == 0)
         fatal("Invalid FASTA - header must be terminated with newline");
       
@@ -464,7 +210,7 @@ bool fasta_next(fasta_handle h,
   while (1)
     {
       /* get more data, if necessary */
-      rest = fasta_file_fill_buffer(h);
+      rest = fastx_file_fill_buffer(h);
 
       /* end if no more data */
       if (rest == 0)
@@ -496,62 +242,50 @@ bool fasta_next(fasta_handle h,
   fasta_truncate_header(h, truncateatspace);
   fasta_filter_sequence(h, char_fasta_action, char_mapping);
 
-#ifdef HAVE_ZLIB_H
-  if (h->format == FORMAT_GZIP)
-    {
-      /* Circumvent the missing gzoffset function in zlib 1.2.3 and earlier */
-      int fd = dup(fileno(h->fp));
-      h->file_position = lseek(fd, 0, SEEK_CUR);
-      close(fd);
-    }
-  else
-#endif
-    h->file_position = ftell(h->fp);
-  
   return 1;
 }
 
-long fasta_get_abundance(fasta_handle h)
+long fasta_get_abundance(fastx_handle h)
 {
   return abundance_get(global_abundance, h->header_buffer.data);
 }
 
-unsigned long fasta_get_position(fasta_handle h)
+unsigned long fasta_get_position(fastx_handle h)
 {
   return h->file_position;
 }
 
-unsigned long fasta_get_size(fasta_handle h)
+unsigned long fasta_get_size(fastx_handle h)
 {
   return h->file_size;
 }
 
-unsigned long fasta_get_lineno(fasta_handle h)
+unsigned long fasta_get_lineno(fastx_handle h)
 {
   return h->lineno_start;
 }
 
-unsigned long fasta_get_seqno(fasta_handle h)
+unsigned long fasta_get_seqno(fastx_handle h)
 {
   return h->seqno;
 }
 
-unsigned long fasta_get_header_length(fasta_handle h)
+unsigned long fasta_get_header_length(fastx_handle h)
 {
   return h->header_buffer.length;
 }
 
-unsigned long fasta_get_sequence_length(fasta_handle h)
+unsigned long fasta_get_sequence_length(fastx_handle h)
 {
   return h->sequence_buffer.length;
 }
 
-char * fasta_get_header(fasta_handle h)
+char * fasta_get_header(fastx_handle h)
 {
   return h->header_buffer.data;
 }
 
-char * fasta_get_sequence(fasta_handle h)
+char * fasta_get_sequence(fastx_handle h)
 {
   return h->sequence_buffer.data;
 }
diff --git a/src/fasta.h b/src/fasta.h
index 0c1d5a9..38f5f4d 100644
--- a/src/fasta.h
+++ b/src/fasta.h
@@ -58,61 +58,24 @@
 
 */
 
-struct fasta_buffer_s
-{
-  char * data;
-  unsigned long length;
-  unsigned long alloc;
-  unsigned long position;
-};
-
-struct fasta_s
-{
-  FILE * fp;
-
-#ifdef HAVE_ZLIB_H
-  gzFile fp_gz;
-#endif
-
-#ifdef HAVE_BZLIB_H
-  BZFILE * fp_bz;
-#endif
-
-  struct fasta_buffer_s file_buffer;
-  struct fasta_buffer_s header_buffer;
-  struct fasta_buffer_s sequence_buffer;
-
-  unsigned long file_size;
-  unsigned long file_position;
-
-  unsigned long lineno;
-  unsigned long lineno_start;
-  long seqno;
-
-  unsigned long stripped_all;
-  unsigned long stripped[256];
-
-  int format;
-};
-
-typedef struct fasta_s * fasta_handle;
 
 /* fasta input */
 
-fasta_handle fasta_open(const char * filename);
-void fasta_close(fasta_handle h);
-bool fasta_next(fasta_handle h,
+void fasta_open_rest(fastx_handle h);
+fastx_handle fasta_open(const char * filename);
+void fasta_close(fastx_handle h);
+bool fasta_next(fastx_handle h,
                 bool truncateatspace,
                 char * char_mapping);
-unsigned long fasta_get_position(fasta_handle h);
-unsigned long fasta_get_size(fasta_handle h);
-unsigned long fasta_get_lineno(fasta_handle h);
-unsigned long fasta_get_seqno(fasta_handle h);
-char * fasta_get_header(fasta_handle h);
-char * fasta_get_sequence(fasta_handle h);
-unsigned long fasta_get_header_length(fasta_handle h);
-unsigned long fasta_get_sequence_length(fasta_handle h);
-long fasta_get_abundance(fasta_handle h);
+unsigned long fasta_get_position(fastx_handle h);
+unsigned long fasta_get_size(fastx_handle h);
+unsigned long fasta_get_lineno(fastx_handle h);
+unsigned long fasta_get_seqno(fastx_handle h);
+char * fasta_get_header(fastx_handle h);
+char * fasta_get_sequence(fastx_handle h);
+unsigned long fasta_get_header_length(fastx_handle h);
+unsigned long fasta_get_sequence_length(fastx_handle h);
+long fasta_get_abundance(fastx_handle h);
 
 /* fasta output */
 
diff --git a/src/fastq.cc b/src/fastq.cc
index 894d4ea..b45cedc 100644
--- a/src/fastq.cc
+++ b/src/fastq.cc
@@ -60,27 +60,6 @@
 
 #include "vsearch.h"
 
-#define FASTQ_BUFFER_ALLOC 8192
-
-#ifdef HAVE_BZLIB_H
-#define BZ_VERBOSE_0 0
-#define BZ_VERBOSE_1 1
-#define BZ_VERBOSE_2 2
-#define BZ_VERBOSE_3 3
-#define BZ_VERBOSE_4 4
-#define BZ_MORE_MEM 0  /* faster decompression using more memory */
-#define BZ_LESS_MEM 1  /* slower decompression but requires less memory */
-#endif
-
-#define FORMAT_PLAIN 1
-#define FORMAT_BZIP  2
-#define FORMAT_GZIP  3
-
-static unsigned char MAGIC_GZIP[] = "\x1f\x8b";
-static unsigned char MAGIC_BZIP[] = "BZ";
-
-static char map_identity[256];
-
 void fastq_fatal(unsigned long lineno, const char * msg)
 {
   char * string; 
@@ -99,286 +78,8 @@ void fastq_fatal(unsigned long lineno, const char * msg)
     fatal("Out of memory");
 }
 
-void buffer_init(struct fastq_buffer_s * buffer)
-{
-  buffer->alloc = FASTQ_BUFFER_ALLOC;
-  buffer->data = (char*) xmalloc(buffer->alloc);
-  buffer->data[0] = 0;
-  buffer->length = 0;
-  buffer->position = 0;
-}
-
-void buffer_free(struct fastq_buffer_s * buffer)
-{
-  if (buffer->data)
-    free(buffer->data);
-  buffer->data = 0;
-  buffer->alloc = 0;
-  buffer->length = 0;
-  buffer->position = 0;
-}
-
-void buffer_makespace(struct fastq_buffer_s * buffer, unsigned long x)
-{
-  /* make sure there is space for x more chars in buffer */
-
-  if (buffer->length + x > buffer->alloc)
-    {
-      /* alloc space for x more characters,
-         but round up to nearest block size */
-      buffer->alloc = 
-        ((buffer->length + x + FASTQ_BUFFER_ALLOC - 1) / FASTQ_BUFFER_ALLOC)
-        * FASTQ_BUFFER_ALLOC;
-      buffer->data = (char*) xrealloc(buffer->data, buffer->alloc);
-    }
-}
-
-void buffer_truncate(struct fastq_buffer_s * buffer, bool truncateatspace)
-{
-  /* Truncate the zero-terminated header string by inserting a new
-     terminator (zero byte) at the first space/tab character
-     (if truncateatspace) or first linefeed. */
-  
-  if (truncateatspace)
-    buffer->length = strcspn(buffer->data, " \t\n");
-  else
-    buffer->length = strcspn(buffer->data, "\n");
-  
-  buffer->data[buffer->length] = 0;
-}
-
-
-fastq_handle fastq_open(const char * filename)
-{
-  fastq_handle h = (fastq_handle) xmalloc(sizeof(struct fastq_s));
-  
-  h->fp = NULL;
-  h->fp = fopen(filename, "rb");
-  if(!h->fp)
-    fatal("Error: Unable to open fastq file for reading (%s)", filename);
-  
-  /* detect compression (plain, gzipped or bzipped) */
-  
-  unsigned char magic[2];
-  h->format = FORMAT_PLAIN;
-  if (fread(&magic, 1, 2, h->fp) >= 2)
-    {
-      if (!memcmp(magic, MAGIC_GZIP, 2))
-        h->format = FORMAT_GZIP;
-      else if (!memcmp(magic, MAGIC_BZIP, 2))
-        h->format = FORMAT_BZIP;
-    }
-
-  /* Get size of original (uncompressed) file */
-
-  if (fseek(h->fp, 0, SEEK_END))
-    fatal("Error: Unable to seek in fastq file (%s)", filename);
-  h->file_size = ftell(h->fp);
-  rewind(h->fp);
-
-  if (h->format == FORMAT_GZIP)
-    {
-      /* GZIP: Keep original file open, then open as gzipped file as well */
-#ifdef HAVE_ZLIB_H
-      if (!gz_lib)
-        fatal("Files compressed with gzip are not supported");
-      if (! (h->fp_gz = (*gzdopen_p)(dup(fileno(h->fp)), "rb")))
-        fatal("Unable to open gzip compressed fastq file (%s)", filename);
-#else
-      fatal("Files compressed with gzip are not supported");
-#endif
-    }
-
-  if (h->format == FORMAT_BZIP)
-    {
-      /* BZIP2: Keep original file open, then open as bzipped file as well */
-#ifdef HAVE_BZLIB_H
-      if (!bz2_lib)
-        fatal("Files compressed with bzip2 are not supported");
-      int bzError;
-      if (! (h->fp_bz = (*BZ2_bzReadOpen_p)(& bzError, h->fp,
-                                            BZ_VERBOSE_0, BZ_MORE_MEM,
-                                            NULL, 0)))
-        fatal("Unable to open bzip2 compressed fastq file (%s)", filename);
-#else
-      fatal("Files compressed with bzip2 are not supported");
-#endif
-    }
-
-  h->stripped_all = 0;
-
-  for(int i=0; i<256; i++)
-    h->stripped[i] = 0;
-
-  h->file_position = 0;
-
-  buffer_init(& h->file_buffer);
-  buffer_init(& h->header_buffer);
-  buffer_init(& h->sequence_buffer);
-  buffer_init(& h->quality_buffer);
-
-  h->lineno = 1;
-  h->lineno_start = 1;
-  h->seqno = -1;
-
-  for(int i=0; i<256; i++)
-    map_identity[i] = i;
-
-  return h;
-}
-
-void fastq_close(fastq_handle h)
-{
-  /* Warn about stripped chars */
-
-  if (h->stripped_all)
-    {
-      fprintf(stderr, "WARNING: invalid characters stripped from fastq file:");
-      for (int i=0; i<256;i++)
-        if (h->stripped[i])
-          fprintf(stderr, " %c(%lu)", i, h->stripped[i]);
-      fprintf(stderr, "\n");
-
-      if (opt_log)
-        {
-          fprintf(fp_log, "WARNING: invalid characters stripped from fastq file:");
-          for (int i=0; i<256;i++)
-            if (h->stripped[i])
-              fprintf(fp_log, " %c(%lu)", i, h->stripped[i]);
-          fprintf(fp_log, "\n");
-        }
-    }
-
-#ifdef HAVE_BZLIB_H
-  int bz_error;
-#endif
-  
-  switch(h->format)
-    {
-    case FORMAT_PLAIN:
-      break;
-
-    case FORMAT_GZIP:
-#ifdef HAVE_ZLIB_H
-      (*gzclose_p)(h->fp_gz);
-      h->fp_gz = 0;
-      break;
-#endif
-      
-    case FORMAT_BZIP:
-#ifdef HAVE_BZLIB_H
-      (*BZ2_bzReadClose_p)(&bz_error, h->fp_bz);
-      h->fp_bz = 0;
-      break;
-#endif
-
-    default:
-      fatal("Internal error");
-    }
-
-  fclose(h->fp);
-  h->fp = 0;
-  
-  buffer_free(& h->file_buffer);
-  buffer_free(& h->header_buffer);
-  buffer_free(& h->sequence_buffer);
-  buffer_free(& h->quality_buffer);
-
-  h->file_size = 0;
-  h->file_position = 0;
-
-  h->lineno = 0;
-  h->seqno = -1;
-
-  free(h);
-  h=0;
-}
-
-
-unsigned long fastq_file_fill_buffer(fastq_handle h)
-{
-  /* read more data if necessary */
-  unsigned long rest = h->file_buffer.length - h->file_buffer.position;
-  
-  if (rest > 0)
-    return rest;
-  else
-    {
-      unsigned long space = h->file_buffer.alloc - h->file_buffer.length;
-
-      if (space == 0)
-        {
-          /* back to beginning of buffer */
-          h->file_buffer.position = 0;
-          h->file_buffer.length = 0;
-          space = h->file_buffer.alloc;
-        }
-      
-      int bytes_read = 0;
-      
-#ifdef HAVE_BZLIB_H
-      int bzError = 0;
-#endif
-
-      switch(h->format)
-        {
-        case FORMAT_PLAIN:
-          bytes_read = fread(h->file_buffer.data
-                             + h->file_buffer.position,
-                             1,
-                             space,
-                             h->fp);
-          break;
-
-        case FORMAT_GZIP:
-#ifdef HAVE_ZLIB_H
-          bytes_read = (*gzread_p)(h->fp_gz,
-                                   h->file_buffer.data
-                                   + h->file_buffer.position,
-                                   space);
-          if (bytes_read < 0)
-            fatal("Error reading gzip compressed fastq file");
-          break;
-#endif
-          
-        case FORMAT_BZIP:
-#ifdef HAVE_BZLIB_H
-          bytes_read = (*BZ2_bzRead_p)(& bzError,
-                                       h->fp_bz,
-                                       h->file_buffer.data
-                                       + h->file_buffer.position,
-                                       space);
-          if ((bytes_read < 0) ||
-              ! ((bzError == BZ_OK) ||
-                 (bzError == BZ_STREAM_END) ||
-                 (bzError == BZ_SEQUENCE_ERROR)))
-            fatal("Error reading bzip2 compressed fastq file");
-          break;
-#endif
-          
-        default:
-          fatal("Internal error");
-        }
-      
-      h->file_buffer.length += bytes_read;
-      return bytes_read;
-    }
-}
-
-void buffer_extend(struct fastq_buffer_s * dest_buffer,
-                  char * source_buf,
-                  unsigned long len)
-{
-  buffer_makespace(dest_buffer, len+1);
-  memcpy(dest_buffer->data + dest_buffer->length,
-         source_buf,
-         len);
-  dest_buffer->length += len;
-  dest_buffer->data[dest_buffer->length] = 0;
-}
-
-void buffer_filter_extend(fastq_handle h,
-                          struct fastq_buffer_s * dest_buffer,
+void buffer_filter_extend(fastx_handle h,
+                          struct fastx_buffer_s * dest_buffer,
                           char * source_buf,
                           unsigned long len,
                           unsigned int * char_action,
@@ -446,7 +147,22 @@ void buffer_filter_extend(fastq_handle h,
   dest_buffer->length += q - d;
 }
 
-bool fastq_next(fastq_handle h,
+fastx_handle fastq_open(const char * filename)
+{
+  fastx_handle h = fastx_open(filename);
+
+  if (!fastx_is_fastq(h))
+    fatal("FASTQ file expected, FASTA file found (%s)", filename);
+
+  return h;
+}
+
+void fastq_close(fastx_handle h)
+{
+  fastx_close(h);
+}
+
+bool fastq_next(fastx_handle h,
                 bool truncateatspace,
                 char * char_mapping)
 {
@@ -459,7 +175,7 @@ bool fastq_next(fastq_handle h,
 
   h->lineno_start = h->lineno;
 
-  unsigned long rest = fastq_file_fill_buffer(h);
+  unsigned long rest = fastx_file_fill_buffer(h);
 
   /* check end of file */
 
@@ -479,7 +195,7 @@ bool fastq_next(fastq_handle h,
   while (lf == 0)
     {
       /* get more data if buffer empty */
-      rest = fastq_file_fill_buffer(h);
+      rest = fastx_file_fill_buffer(h);
       if (rest == 0)
         fastq_fatal(h->lineno, "Unexpected end of file");
       
@@ -510,7 +226,7 @@ bool fastq_next(fastq_handle h,
   while (1)
     {
       /* get more data, if necessary */
-      rest = fastq_file_fill_buffer(h);
+      rest = fastx_file_fill_buffer(h);
 
       /* cannot end here */
       if (rest == 0)
@@ -548,7 +264,7 @@ bool fastq_next(fastq_handle h,
   unsigned long lineno_plus = h->lineno;
 
   /* read + line */
-  fastq_buffer_s plusline_buffer;
+  fastx_buffer_s plusline_buffer;
   buffer_init(&plusline_buffer);
 
   /* skip + character */
@@ -559,7 +275,7 @@ bool fastq_next(fastq_handle h,
   while (lf == 0)
     {
       /* get more data if buffer empty */
-      rest = fastq_file_fill_buffer(h);
+      rest = fastx_file_fill_buffer(h);
 
       /* cannot end here */
       if (rest == 0)
@@ -614,7 +330,7 @@ bool fastq_next(fastq_handle h,
   while (1)
     {
       /* get more data, if necessary */
-      rest = fastq_file_fill_buffer(h);
+      rest = fastx_file_fill_buffer(h);
 
       /* end if no more data */
       if (rest == 0)
@@ -642,7 +358,7 @@ bool fastq_next(fastq_handle h,
                            & h->quality_buffer,
                            h->file_buffer.data + h->file_buffer.position,
                            len,
-                           char_fq_action_qual, map_identity, lineno_qual);
+                           char_fq_action_qual, chrmap_identity, lineno_qual);
       h->file_buffer.position += len;
       rest -= len;
     }
@@ -656,69 +372,57 @@ bool fastq_next(fastq_handle h,
 
   buffer_truncate(& h->header_buffer, truncateatspace);
 
-#ifdef HAVE_ZLIB_H
-  if (h->format == FORMAT_GZIP)
-    {
-      /* Circumvent the missing gzoffset function in zlib 1.2.3 and earlier */
-      int fd = dup(fileno(h->fp));
-      h->file_position = lseek(fd, 0, SEEK_CUR);
-      close(fd);
-    }
-  else
-#endif
-    h->file_position = ftell(h->fp);
-  
   h->seqno++;
 
   return 1;
 }
 
-char * fastq_get_quality(fastq_handle h)
+char * fastq_get_quality(fastx_handle h)
 {
   return h->quality_buffer.data;
 }
 
-unsigned long fastq_get_quality_length(fastq_handle h)
+unsigned long fastq_get_quality_length(fastx_handle h)
 {
   return h->quality_buffer.length;
 }
 
-unsigned long fastq_get_position(fastq_handle h)
+unsigned long fastq_get_position(fastx_handle h)
 {
   return h->file_position;
 }
 
-unsigned long fastq_get_size(fastq_handle h)
+unsigned long fastq_get_size(fastx_handle h)
 {
   return h->file_size;
 }
 
-unsigned long fastq_get_lineno(fastq_handle h)
+unsigned long fastq_get_lineno(fastx_handle h)
 {
   return h->lineno_start;
 }
 
-unsigned long fastq_get_seqno(fastq_handle h)
+unsigned long fastq_get_seqno(fastx_handle h)
 {
   return h->seqno;
 }
 
-unsigned long fastq_get_header_length(fastq_handle h)
+unsigned long fastq_get_header_length(fastx_handle h)
 {
   return h->header_buffer.length;
 }
 
-unsigned long fastq_get_sequence_length(fastq_handle h)
+unsigned long fastq_get_sequence_length(fastx_handle h)
 {
   return h->sequence_buffer.length;
 }
 
-char * fastq_get_header(fastq_handle h)
+char * fastq_get_header(fastx_handle h)
 {
   return h->header_buffer.data;
 }
 
-char * fastq_get_sequence(fastq_handle h)
+char * fastq_get_sequence(fastx_handle h)
 {
   return h->sequence_buffer.data;
 }
@@ -802,7 +506,7 @@ void fastq_print_relabel(FILE * fp,
   fastq_print_quality(fp, quality);
 }
 
-long fastq_get_abundance(fastq_handle h)
+long fastq_get_abundance(fastx_handle h)
 {
   return abundance_get(global_abundance, h->header_buffer.data);
 }
diff --git a/src/fastq.h b/src/fastq.h
index 0bffae2..daad7c7 100644
--- a/src/fastq.h
+++ b/src/fastq.h
@@ -58,64 +58,23 @@
 
 */
 
-struct fastq_buffer_s
-{
-  char * data;
-  unsigned long length;
-  unsigned long alloc;
-  unsigned long position;
-};
-
-struct fastq_s
-{
-  FILE * fp;
-
-#ifdef HAVE_ZLIB_H
-  gzFile fp_gz;
-#endif
-
-#ifdef HAVE_BZLIB_H
-  BZFILE * fp_bz;
-#endif
-
-  struct fastq_buffer_s file_buffer;
-
-  struct fastq_buffer_s header_buffer;
-  struct fastq_buffer_s sequence_buffer;
-  struct fastq_buffer_s quality_buffer;
-
-  unsigned long file_size;
-  unsigned long file_position;
-
-  unsigned long lineno;
-  unsigned long lineno_start;
-  long seqno;
-
-  unsigned long stripped_all;
-  unsigned long stripped[256];
-
-  int format;
-
-};
-
-typedef struct fastq_s * fastq_handle;
-
-fastq_handle fastq_open(const char * filename);
-void fastq_close(fastq_handle h);
-bool fastq_next(fastq_handle h,
+void fastq_open_rest(fastx_handle h);
+fastx_handle fastq_open(const char * filename);
+void fastq_close(fastx_handle h);
+bool fastq_next(fastx_handle h,
                 bool truncateatspace,
                 char * char_mapping);
-unsigned long fastq_get_position(fastq_handle h);
-unsigned long fastq_get_size(fastq_handle h);
-unsigned long fastq_get_lineno(fastq_handle h);
-unsigned long fastq_get_seqno(fastq_handle h);
-char * fastq_get_header(fastq_handle h);
-char * fastq_get_sequence(fastq_handle h);
-char * fastq_get_quality(fastq_handle h);
-long fastq_get_abundance(fastq_handle h);
-unsigned long fastq_get_header_length(fastq_handle h);
-unsigned long fastq_get_sequence_length(fastq_handle h);
-unsigned long fastq_get_quality_length(fastq_handle h);
+unsigned long fastq_get_position(fastx_handle h);
+unsigned long fastq_get_size(fastx_handle h);
+unsigned long fastq_get_lineno(fastx_handle h);
+unsigned long fastq_get_seqno(fastx_handle h);
+char * fastq_get_header(fastx_handle h);
+char * fastq_get_sequence(fastx_handle h);
+char * fastq_get_quality(fastx_handle h);
+long fastq_get_abundance(fastx_handle h);
+unsigned long fastq_get_header_length(fastx_handle h);
+unsigned long fastq_get_sequence_length(fastx_handle h);
+unsigned long fastq_get_quality_length(fastx_handle h);
 
 void fastq_print(FILE * fp, char * header, char * sequence, char * quality);
 
diff --git a/src/fastqops.cc b/src/fastqops.cc
index 2281e31..4721e3b 100644
--- a/src/fastqops.cc
+++ b/src/fastqops.cc
@@ -82,7 +82,7 @@ int fastq_get_qual(char q)
 
 void fastq_filter()
 {
-  fastq_handle h = fastq_open(opt_fastq_filter);
+  fastx_handle h = fastq_open(opt_fastq_filter);
 
   unsigned long filesize = fastq_get_size(h);
 
@@ -347,7 +347,7 @@ void fastq_chars()
       maxrun[c] = 0;
     }
 
-  fastq_handle h = fastq_open(opt_fastq_chars);
+  fastx_handle h = fastq_open(opt_fastq_chars);
 
   unsigned long filesize = fastq_get_size(h);
 
@@ -525,7 +525,7 @@ double q2p(double q)
 
 void fastq_stats()
 {
-  fastq_handle h = fastq_open(opt_fastq_stats);
+  fastx_handle h = fastq_open(opt_fastq_stats);
 
   unsigned long filesize = fastq_get_size(h);
 
@@ -951,7 +951,7 @@ void fastx_revcomp()
 
 void fastq_convert()
 {
-  fastq_handle h = fastq_open(opt_fastq_convert);
+  fastx_handle h = fastq_open(opt_fastq_convert);
 
   if (!h)
     fatal("Unable to open FASTQ file");
diff --git a/src/fastx.cc b/src/fastx.cc
index 7953d69..0d5344a 100644
--- a/src/fastx.cc
+++ b/src/fastx.cc
@@ -60,7 +60,10 @@
 
 #include "vsearch.h"
 
-/* file type detector and wrapper for fastq and fastx parser */
+/* file compression and format detector */
+/* basic file buffering function for fastq and fastx parsers */
+
+#define FASTX_BUFFER_ALLOC 8192
 
 #ifdef HAVE_BZLIB_H
 #define BZ_VERBOSE_0 0
@@ -79,151 +82,308 @@
 static unsigned char MAGIC_GZIP[] = "\x1f\x8b";
 static unsigned char MAGIC_BZIP[] = "BZ";
 
-int fastx_detect(const char * filename)
+
+void buffer_init(struct fastx_buffer_s * buffer)
+{
+  buffer->alloc = FASTX_BUFFER_ALLOC;
+  buffer->data = (char*) xmalloc(buffer->alloc);
+  buffer->data[0] = 0;
+  buffer->length = 0;
+  buffer->position = 0;
+}
+
+void buffer_free(struct fastx_buffer_s * buffer)
+{
+  if (buffer->data)
+    free(buffer->data);
+  buffer->data = 0;
+  buffer->alloc = 0;
+  buffer->length = 0;
+  buffer->position = 0;
+}
+
+void buffer_makespace(struct fastx_buffer_s * buffer, unsigned long x)
+{
+  /* make sure there is space for x more chars in buffer */
+
+  if (buffer->length + x > buffer->alloc)
+    {
+      /* alloc space for x more characters,
+         but round up to nearest block size */
+      buffer->alloc = 
+        ((buffer->length + x + FASTX_BUFFER_ALLOC - 1) / FASTX_BUFFER_ALLOC)
+        * FASTX_BUFFER_ALLOC;
+      buffer->data = (char*) xrealloc(buffer->data, buffer->alloc);
+    }
+}
+
+void buffer_truncate(struct fastx_buffer_s * buffer, bool truncateatspace)
 {
+  /* Truncate the zero-terminated header string by inserting a new
+     terminator (zero byte) at the first space/tab character
+     (if truncateatspace) or first linefeed. */
+  
+  if (truncateatspace)
+    buffer->length = strcspn(buffer->data, " \t\n");
+  else
+    buffer->length = strcspn(buffer->data, "\n");
+  
+  buffer->data[buffer->length] = 0;
+}
+
+void buffer_extend(struct fastx_buffer_s * dest_buffer,
+                  char * source_buf,
+                  unsigned long len)
+{
+  buffer_makespace(dest_buffer, len+1);
+  memcpy(dest_buffer->data + dest_buffer->length,
+         source_buf,
+         len);
+  dest_buffer->length += len;
+  dest_buffer->data[dest_buffer->length] = 0;
+}
+
+fastx_handle fastx_open(const char * filename)
+{
+  fastx_handle h = (fastx_handle) xmalloc(sizeof(struct fastx_s));
+
+  h->fp = 0;
+
 #ifdef HAVE_ZLIB_H
-  gzFile fp_gz = 0;
+  h->fp_gz = 0;
 #endif
 
 #ifdef HAVE_BZLIB_H
-  BZFILE * fp_bz = 0;
+  h->fp_bz = 0;
+  int bzError = 0;
 #endif
+ 
+  h->fp = fopen(filename, "rb");
+  if (!h->fp)
+    fatal("Unable to open file for reading (%s)", filename);
 
-  int format;
+  /* Get mode and size of original (uncompressed) file */
 
-  FILE * fp = fopen(filename, "rb");
-  if (!fp)
-    fatal("Error: Unable to open file for reading (%s)", filename);
-  
-  /* detect compression (plain, gzipped or bzipped) */
-  
-  unsigned char magic[2];
-  format = FORMAT_PLAIN;
-  if (fread(&magic, 1, 2, fp) >= 2)
+  struct stat fs;
+  h->file_size = 0;
+
+  if (fstat(fileno(h->fp), & fs))
+    fatal("Unable to fstat on input file (%s)", filename);
+
+  h->is_pipe = ! S_ISREG(fs.st_mode);
+
+  h->file_size = 0;
+
+  if (! h->is_pipe)
     {
-      if (!memcmp(magic, MAGIC_GZIP, 2))
-        format = FORMAT_GZIP;
-      else if (!memcmp(magic, MAGIC_BZIP, 2))
-        format = FORMAT_BZIP;
+      if (fseek(h->fp, 0, SEEK_END))
+        fatal("Unable to seek in input file (%s)", filename);
+      h->file_size = ftell(h->fp);
+      rewind(h->fp);
     }
 
-  /* close and reopen to avoid problems with gzip library */
-  /* rewind was not enough */
+  if (opt_gzip_decompress)
+    {
+      h->format = FORMAT_GZIP;
+    }
+  else if (opt_bzip2_decompress)
+    {
+      h->format = FORMAT_BZIP;
+    }
+  else if (h->is_pipe)
+    {
+      h->format = FORMAT_PLAIN;
+    }
+  else
+    {
+      /* autodetect compression (plain, gzipped or bzipped) */
+
+      /* read two characters and compare with magic */
 
-  fclose(fp);
-  fp = fopen(filename, "rb");
-  if (!fp)
-    fatal("Error: Unable to open file for reading (%s)", filename);
+      unsigned char magic[2];
 
-  if (format == FORMAT_GZIP)
+      h->format = FORMAT_PLAIN;
+      if (fread(&magic, 1, 2, h->fp) < 2)
+        fatal("Unable to read from file (%s)", filename);
+
+      if (memcmp(magic, MAGIC_GZIP, 2) == 0)
+        h->format = FORMAT_GZIP;
+      else if (memcmp(magic, MAGIC_BZIP, 2) == 0)
+        h->format = FORMAT_BZIP;
+
+      /* close and reopen to avoid problems with gzip library */
+      /* rewind was not enough */
+
+      fclose(h->fp);
+      h->fp = fopen(filename, "rb");
+      if (!h->fp)
+        fatal("Unable to open file for reading (%s)", filename);
+    }
+
+  if (h->format == FORMAT_GZIP)
     {
-      /* GZIP: Keep original file open, then open as bzipped file as well */
+      /* GZIP: Keep original file open, then open as gzipped file as well */
 #ifdef HAVE_ZLIB_H
       if (!gz_lib)
         fatal("Files compressed with gzip are not supported");
-      if (! (fp_gz = (*gzdopen_p)(fileno(fp), "rb")))
+      if (! (h->fp_gz = (*gzdopen_p)(fileno(h->fp), "rb"))) // dup?
         fatal("Unable to open gzip compressed file (%s)", filename);
 #else
       fatal("Files compressed with gzip are not supported");
 #endif
     }
 
-  if (format == FORMAT_BZIP)
+  if (h->format == FORMAT_BZIP)
     {
       /* BZIP2: Keep original file open, then open as bzipped file as well */
 #ifdef HAVE_BZLIB_H
       if (!bz2_lib)
         fatal("Files compressed with bzip2 are not supported");
-      int bzError;
-      if (! (fp_bz = (*BZ2_bzReadOpen_p)(& bzError, fp,
-                                       BZ_VERBOSE_0, BZ_MORE_MEM, NULL, 0)))
+      if (! (h->fp_bz = (*BZ2_bzReadOpen_p)(& bzError, h->fp,
+                                         BZ_VERBOSE_0, BZ_MORE_MEM,
+                                         NULL, 0)))
         fatal("Unable to open bzip2 compressed file (%s)", filename);
 #else
       fatal("Files compressed with bzip2 are not supported");
 #endif
     }
 
-  /* read one char and see if it starts with > or @ */
+  /* init buffers */
 
-  const int BUFFERLEN = 1;
-  char buffer[BUFFERLEN];
-  
-  int bytes_read = 0;
+  h->file_position = 0;
+
+  buffer_init(& h->file_buffer);
+
+  /* start filling up file buffer */
+
+  unsigned long rest = fastx_file_fill_buffer(h);
   
-#ifdef HAVE_BZLIB_H
-  int bzError = 0;
-#endif
- 
-  switch(format)
+  if (rest < 2)
+    fatal("File too small");
+
+
+  /* examine first char and see if it starts with > or @ */
+
+  int filetype = 0;
+  char * first = h->file_buffer.data;
+
+  if (*first == '>')
     {
-    case FORMAT_PLAIN:
-      bytes_read = fread(buffer,
-                         1,
-                         BUFFERLEN,
-                         fp);
-      break;
+      filetype = 1;
+      h->is_fastq = 0;
+    }
+  else if (*first == '@')
+    {
+      filetype = 2;
+      h->is_fastq = 1;
+    }
+
+  if (filetype == 0)
+    {
+      /* close files if unrecognized file type */
       
-    case FORMAT_GZIP:
+      switch(h->format)
+        {
+        case FORMAT_PLAIN:
+          break;
+          
+        case FORMAT_GZIP:
 #ifdef HAVE_ZLIB_H
-      bytes_read = (*gzread_p)(fp_gz,
-                             buffer,
-                             BUFFERLEN);
-      if (bytes_read < 0)
-        fatal("Error reading gzip compressed file (%s)", filename);
-      break;
+          (*gzclose_p)(h->fp_gz);
+          h->fp_gz = 0;
+          break;
 #endif
-      
-    case FORMAT_BZIP:
+          
+        case FORMAT_BZIP:
 #ifdef HAVE_BZLIB_H
-      bytes_read = (*BZ2_bzRead_p)(& bzError,
-                                 fp_bz,
-                                 buffer,
-                                 BUFFERLEN);
-      if ((bytes_read < 0) ||
-          ! ((bzError == BZ_OK) ||
-             (bzError == BZ_STREAM_END) ||
-             (bzError == BZ_SEQUENCE_ERROR)))
-        fatal("Error reading bzip2 compressed file (%s)", filename);
-      break;
+          (*BZ2_bzReadClose_p)(&bzError, h->fp_bz);
+          h->fp_bz = 0;
+          break;
 #endif
+          
+        default:
+          fatal("Internal error");
+        }
       
-    default:
-      fatal("Internal error");
+      fclose(h->fp);
+      h->fp = 0;
+
+      if (memcmp(first, MAGIC_GZIP, 2) == 0)
+        fatal("File appears to be gzip compressed. Please use --gzip_decompress");
+
+      if (memcmp(first, MAGIC_BZIP, 2) == 0)
+        fatal("File appears to be bzip2 compressed. Please use --bzip2_decompress");
+
+      fatal("File type not recognized.");
+
+      return 0;
     }
 
-  if (bytes_read < BUFFERLEN)
-    fatal("Error reading file (%s)", filename);
+  /* more initialization */
 
-  int filetype = 0;
-  if (buffer[0] == '>')
-    filetype = 1;
-  else if (buffer[0] == '@')
-    filetype = 2;
+  buffer_init(& h->header_buffer);
+  buffer_init(& h->sequence_buffer);
+  buffer_init(& h->quality_buffer);
+
+  h->stripped_all = 0;
+
+  for(int i=0; i<256; i++)
+    h->stripped[i] = 0;
 
-  /* close files */
+  h->lineno = 1;
+  h->lineno_start = 1;
+  h->seqno = -1;
+
+  return h;
+}
+
+bool fastx_is_fastq(fastx_handle h)
+{
+  return h->is_fastq;
+}
+
+void fastx_close(fastx_handle h)
+{
+  /* Warn about stripped chars */
+
+  if (h->stripped_all)
+    {
+      fprintf(stderr, "WARNING: invalid characters stripped from fastq file:");
+      for (int i=0; i<256;i++)
+        if (h->stripped[i])
+          fprintf(stderr, " %c(%lu)", i, h->stripped[i]);
+      fprintf(stderr, "\n");
+
+      if (opt_log)
+        {
+          fprintf(fp_log, "WARNING: invalid characters stripped from fastq file:");
+          for (int i=0; i<256;i++)
+            if (h->stripped[i])
+              fprintf(fp_log, " %c(%lu)", i, h->stripped[i]);
+          fprintf(fp_log, "\n");
+        }
+    }
 
 #ifdef HAVE_BZLIB_H
   int bz_error;
 #endif
-  
-  switch(format)
+
+  switch(h->format)
     {
     case FORMAT_PLAIN:
-      fclose(fp);
-      fp = 0;
       break;
 
     case FORMAT_GZIP:
 #ifdef HAVE_ZLIB_H
-      (*gzclose_p)(fp_gz);
-      fp_gz = 0;
+      (*gzclose_p)(h->fp_gz);
+      h->fp_gz = 0;
       break;
 #endif
-      
+
     case FORMAT_BZIP:
 #ifdef HAVE_BZLIB_H
-      (*BZ2_bzReadClose_p)(&bz_error, fp_bz);
-      fp_bz = 0;
+      (*BZ2_bzReadClose_p)(&bz_error, h->fp_bz);
+      h->fp_bz = 0;
       break;
 #endif
 
@@ -231,46 +391,107 @@ int fastx_detect(const char * filename)
       fatal("Internal error");
     }
 
-  return filetype;
-}
+  fclose(h->fp);
+  h->fp = 0;
 
-bool fastx_is_fastq(fastx_handle h)
-{
-  return h->is_fastq;
+  buffer_free(& h->file_buffer);
+  buffer_free(& h->header_buffer);
+  buffer_free(& h->sequence_buffer);
+  buffer_free(& h->quality_buffer);
+
+  h->file_size = 0;
+  h->file_position = 0;
+
+  h->lineno = 0;
+  h->seqno = -1;
+
+  free(h);
+  h=0;
 }
 
-fastx_handle fastx_open(const char * filename)
+unsigned long fastx_file_fill_buffer(fastx_handle h)
 {
-  int filetype = fastx_detect(filename);
+  /* read more data if necessary */
+  unsigned long rest = h->file_buffer.length - h->file_buffer.position;
 
-  if (filetype == 0)
-    return 0;
+  if (rest > 0)
+    return rest;
+  else
+    {
+      unsigned long space = h->file_buffer.alloc - h->file_buffer.length;
 
-  fastx_handle h = (fastx_handle) xmalloc(sizeof(struct fastx_s));
+      if (space == 0)
+        {
+          /* back to beginning of buffer */
+          h->file_buffer.position = 0;
+          h->file_buffer.length = 0;
+          space = h->file_buffer.alloc;
+        }
 
-  h->is_fastq = (filetype == 2);
+      int bytes_read = 0;
 
-  if (h->is_fastq)
-    h->handle.fastq = fastq_open(filename);
-  else
-    h->handle.fasta = fasta_open(filename);
+#ifdef HAVE_BZLIB_H
+      int bzError = 0;
+#endif
 
-  return h;
-}
+      switch(h->format)
+        {
+        case FORMAT_PLAIN:
+          bytes_read = fread(h->file_buffer.data
+                             + h->file_buffer.position,
+                             1,
+                             space,
+                             h->fp);
+          break;
+
+        case FORMAT_GZIP:
+#ifdef HAVE_ZLIB_H
+          bytes_read = (*gzread_p)(h->fp_gz,
+                                   h->file_buffer.data
+                                   + h->file_buffer.position,
+                                   space);
+          if (bytes_read < 0)
+            fatal("Unable to read gzip compressed file");
+          break;
+#endif
 
-void fastx_close(fastx_handle h)
-{
-  if (h->is_fastq)
-    {
-      fastq_close(h->handle.fastq);
-      h->handle.fastq = 0;
-    }
-  else
-    {
-      fasta_close(h->handle.fasta);
-      h->handle.fasta = 0;
+        case FORMAT_BZIP:
+#ifdef HAVE_BZLIB_H
+          bytes_read = (*BZ2_bzRead_p)(& bzError,
+                                       h->fp_bz,
+                                       h->file_buffer.data
+                                       + h->file_buffer.position,
+                                       space);
+          if ((bytes_read < 0) ||
+              ! ((bzError == BZ_OK) ||
+                 (bzError == BZ_STREAM_END) ||
+                 (bzError == BZ_SEQUENCE_ERROR)))
+            fatal("Unable to read from bzip2 compressed file");
+          break;
+#endif
+
+        default:
+          fatal("Internal error");
+        }
+
+      if (!h->is_pipe)
+        {
+#ifdef HAVE_ZLIB_H
+          if (h->format == FORMAT_GZIP)
+            {
+              /* Circumvent the missing gzoffset function in zlib 1.2.3 and earlier */
+              int fd = dup(fileno(h->fp));
+              h->file_position = lseek(fd, 0, SEEK_CUR);
+              close(fd);
+            }
+          else
+#endif
+            h->file_position = ftell(h->fp);
+        }
+
+      h->file_buffer.length += bytes_read;
+      return bytes_read;
     }
-  free(h);
 }
 
 bool fastx_next(fastx_handle h,
@@ -278,83 +499,83 @@ bool fastx_next(fastx_handle h,
                 char * char_mapping)
 {
   if (h->is_fastq)
-    return fastq_next(h->handle.fastq, truncateatspace, char_mapping);
+    return fastq_next(h, truncateatspace, char_mapping);
   else
-    return fasta_next(h->handle.fasta, truncateatspace, char_mapping);
+    return fasta_next(h, truncateatspace, char_mapping);
 }
 
 unsigned long fastx_get_position(fastx_handle h)
 {
   if (h->is_fastq)
-    return fastq_get_position(h->handle.fastq);
+    return fastq_get_position(h);
   else
-    return fasta_get_position(h->handle.fasta);
+    return fasta_get_position(h);
 }
 
 
 unsigned long fastx_get_size(fastx_handle h)
 {
   if (h->is_fastq)
-    return fastq_get_size(h->handle.fastq);
+    return fastq_get_size(h);
   else
-    return fasta_get_size(h->handle.fasta);
+    return fasta_get_size(h);
 }
 
 
 unsigned long fastx_get_lineno(fastx_handle h)
 {
   if (h->is_fastq)
-    return fastq_get_lineno(h->handle.fastq);
+    return fastq_get_lineno(h);
   else
-    return fasta_get_lineno(h->handle.fasta);
+    return fasta_get_lineno(h);
 }
 
 
 unsigned long fastx_get_seqno(fastx_handle h)
 {
   if (h->is_fastq)
-    return fastq_get_seqno(h->handle.fastq);
+    return fastq_get_seqno(h);
   else
-    return fasta_get_seqno(h->handle.fasta);
+    return fasta_get_seqno(h);
 }
 
 char * fastx_get_header(fastx_handle h)
 {
   if (h->is_fastq)
-    return fastq_get_header(h->handle.fastq);
+    return fastq_get_header(h);
   else
-    return fasta_get_header(h->handle.fasta);
+    return fasta_get_header(h);
 }
 
 char * fastx_get_sequence(fastx_handle h)
 {
   if (h->is_fastq)
-    return fastq_get_sequence(h->handle.fastq);
+    return fastq_get_sequence(h);
   else
-    return fasta_get_sequence(h->handle.fasta);
+    return fasta_get_sequence(h);
 }
 
 unsigned long fastx_get_header_length(fastx_handle h)
 {
   if (h->is_fastq)
-    return fastq_get_header_length(h->handle.fastq);
+    return fastq_get_header_length(h);
   else
-    return fasta_get_header_length(h->handle.fasta);
+    return fasta_get_header_length(h);
 }
 
 unsigned long fastx_get_sequence_length(fastx_handle h)
 {
   if (h->is_fastq)
-    return fastq_get_sequence_length(h->handle.fastq);
+    return fastq_get_sequence_length(h);
   else
-    return fasta_get_sequence_length(h->handle.fasta);
+    return fasta_get_sequence_length(h);
 }
 
 
 char * fastx_get_quality(fastx_handle h)
 {
   if (h->is_fastq)
-    return fastq_get_quality(h->handle.fastq);
+    return fastq_get_quality(h);
   else
     return 0;
 }
@@ -362,8 +583,8 @@ char * fastx_get_quality(fastx_handle h)
 long fastx_get_abundance(fastx_handle h)
 {
   if (h->is_fastq)
-    return fastq_get_abundance(h->handle.fastq);
+    return fastq_get_abundance(h);
   else
-    return fasta_get_abundance(h->handle.fasta);
+    return fasta_get_abundance(h);
 }
 
diff --git a/src/fastx.h b/src/fastx.h
index de9d68f..d3b396f 100644
--- a/src/fastx.h
+++ b/src/fastx.h
@@ -58,18 +58,59 @@
 
 */
 
+struct fastx_buffer_s
+{
+  char * data;
+  unsigned long length;
+  unsigned long alloc;
+  unsigned long position;
+};
+
+void buffer_init(struct fastx_buffer_s * buffer);
+void buffer_free(struct fastx_buffer_s * buffer);
+void buffer_extend(struct fastx_buffer_s * dest_buffer,
+                   char * source_buf,
+                   unsigned long len);
+void buffer_makespace(struct fastx_buffer_s * buffer, unsigned long x);
+void buffer_truncate(struct fastx_buffer_s * buffer, bool truncateatspace);
+
 struct fastx_s
 {
+  bool is_pipe;
   bool is_fastq;
-  union
-  {
-    fasta_handle fasta;
-    fastq_handle fastq;
-  } handle;
+
+  FILE * fp;
+
+#ifdef HAVE_ZLIB_H
+  gzFile fp_gz;
+#endif
+
+#ifdef HAVE_BZLIB_H
+  BZFILE * fp_bz;
+#endif
+
+  struct fastx_buffer_s file_buffer;
+
+  struct fastx_buffer_s header_buffer;
+  struct fastx_buffer_s sequence_buffer;
+  struct fastx_buffer_s quality_buffer;
+
+  unsigned long file_size;
+  unsigned long file_position;
+
+  unsigned long lineno;
+  unsigned long lineno_start;
+  long seqno;
+
+  unsigned long stripped_all;
+  unsigned long stripped[256];
+
+  int format;
 };
 
 typedef struct fastx_s * fastx_handle;
 
+
 /* fastx input */
 
 bool fastx_is_fastq(fastx_handle h);
@@ -89,3 +130,5 @@ unsigned long fastx_get_sequence_length(fastx_handle h);
 
 char * fastx_get_quality(fastx_handle h);
 long fastx_get_abundance(fastx_handle h);
+
+unsigned long fastx_file_fill_buffer(fastx_handle h);
diff --git a/src/maps.cc b/src/maps.cc
index 043212c..7ad4351 100644
--- a/src/maps.cc
+++ b/src/maps.cc
@@ -430,3 +430,56 @@ char chrmap_no_change[256] =
     'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N',
     'N','N','N','N','N','N','N','N','N','N','N','N','N','N','N','N'
   };
+
+char chrmap_identity[256] =
+  {
+    /* identity map */
+
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+
+    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+
+    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+    0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+
+    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+    0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+
+    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+    0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+
+    0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+    0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+
+    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+    0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+
+    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+    0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+
+    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+
+    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+
+    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+
+    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+
+    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+    0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+
+    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+
+    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+
+    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+  };
diff --git a/src/maps.h b/src/maps.h
index fd51831..10bf8ce 100644
--- a/src/maps.h
+++ b/src/maps.h
@@ -72,3 +72,4 @@ extern char chrmap_complement[256];
 extern char chrmap_normalize[256];
 extern char chrmap_upcase[256];
 extern char chrmap_no_change[256];
+extern char chrmap_identity[256];
diff --git a/src/mergepairs.cc b/src/mergepairs.cc
index 33d478c..1d5fbd0 100644
--- a/src/mergepairs.cc
+++ b/src/mergepairs.cc
@@ -77,8 +77,8 @@ static FILE * fp_fastqout_notmerged_rev = 0;
 static FILE * fp_fastaout_notmerged_fwd = 0;
 static FILE * fp_fastaout_notmerged_rev = 0;
 static FILE * fp_eetabbedout = 0;
-static fastq_handle fastq_fwd;
-static fastq_handle fastq_rev;
+static fastx_handle fastq_fwd;
+static fastx_handle fastq_rev;
 static long merged = 0;
 static long notmerged = 0;
 static long total = 0;
diff --git a/src/msa.cc b/src/msa.cc
index af873c6..959fc9b 100644
--- a/src/msa.cc
+++ b/src/msa.cc
@@ -62,80 +62,48 @@
 
 /* Compute consensus sequence and msa of clustered sequences */
 
+#define PROFSIZE 6
+
 static char * aln;
 static int alnpos;
 static int * profile;
 
 void msa_add(char c)
 {
-  int * p = profile + 4 * alnpos;
+  int * p = profile + PROFSIZE * alnpos;
 
   switch(toupper(c))
     {
     case 'A':
-      p[0] += 12;
+      p[0]++;
       break;
     case 'C':
-      p[1] += 12;
+      p[1]++;
       break;
     case 'G':
-      p[2] += 12;
+      p[2]++;
       break;
     case 'T':
     case 'U':
-      p[3] += 12;
+      p[3]++;
       break;
     case 'R':
-      p[0] += 6;
-      p[2] += 6;
-      break;
     case 'Y':
-      p[1] += 6;
-      p[3] += 6;
-      break;
     case 'S':
-      p[1] += 6;
-      p[2] += 6;
-      break;
     case 'W':
-      p[0] += 6;
-      p[3] += 6;
-      break;
     case 'K':
-      p[2] += 6;
-      p[3] += 6;
-      break;
     case 'M':
-      p[0] += 6;
-      p[1] += 6;
-      break;
     case 'B':
-      p[1] += 4;
-      p[2] += 4;
-      p[3] += 4;
-      break;
     case 'D':
-      p[0] += 4;
-      p[2] += 4;
-      p[3] += 4;
-      break;
     case 'H':
-      p[0] += 4;
-      p[1] += 4;
-      p[3] += 4;
-      break;
     case 'V':
-      p[0] += 4;
-      p[1] += 4;
-      p[2] += 4;
-      break;
     case 'N':
-      p[0] += 3;
-      p[1] += 3;
-      p[2] += 3;
-      p[3] += 3;
+      p[4]++;
+      break;
+    case '-':
+      p[5]++;
       break;
-    }   
+    }
 
   aln[alnpos++] = c;
 }
@@ -185,8 +153,8 @@ void msa(FILE * fp_msaout, FILE * fp_consout, FILE * fp_profile,
   alnlen += centroid_len;
 
   /* allocate memory for profile (for consensus) and aligned seq */
-  profile = (int *) xmalloc(4 * sizeof(int) * alnlen);
-  memset(profile, 0, 4 * sizeof(int) * alnlen);
+  profile = (int *) xmalloc(PROFSIZE * sizeof(int) * alnlen);
+  memset(profile, 0, PROFSIZE * sizeof(int) * alnlen);
   aln = (char *) xmalloc(alnlen+1);
   char * cons = (char *) xmalloc(alnlen+1);
   
@@ -313,26 +281,32 @@ void msa(FILE * fp_msaout, FILE * fp_consout, FILE * fp_profile,
         }
       else
         {
-          /* find most common symbol */
+          /* find most common symbol of A, C, G and T */
           char best_sym = 0;
-          int best_count = -1;
-          int nongap_count = 0;
+          int best_count = 0;
           for(int c=0; c<4; c++)
             {
-              int count = profile[4*i+c];
+              int count = profile[PROFSIZE*i+c];
               if (count > best_count)
                 {
                   best_count = count;
-                  best_sym = c;
+                  best_sym = c+1;
                 }
-              nongap_count += count;
             }
 
-          int gap_count = 12 * target_count - nongap_count;
+          /* if no A, C, G, or T, check if there are any N's */
+          int n_count = profile[PROFSIZE*i+4];
+          if ((best_count == 0) && (n_count > 0))
+            {
+              best_count = n_count;
+              best_sym = 15; // N
+            }
 
+          /* compare to the number of gap symbols */
+          int gap_count = profile[PROFSIZE*i+5];
           if (best_count >= gap_count)
             {
-              char sym = sym_nt_2bit[(int)best_sym];
+              char sym = sym_nt_4bit[(int)best_sym];
               aln[i] = sym;
               cons[conslen++] = sym;
             }
diff --git a/src/rerep.cc b/src/rerep.cc
index d886f93..cf15b1b 100644
--- a/src/rerep.cc
+++ b/src/rerep.cc
@@ -74,7 +74,7 @@ void rereplicate()
         fatal("Unable to open fasta output file for writing");
     }
 
-  fasta_handle fh = fasta_open(opt_rereplicate);
+  fastx_handle fh = fasta_open(opt_rereplicate);
   long filesize = fasta_get_size(fh);
 
   progress_init("Rereplicating", filesize);
diff --git a/src/search.cc b/src/search.cc
index 0fad7bb..8b5dcee 100644
--- a/src/search.cc
+++ b/src/search.cc
@@ -68,7 +68,7 @@ static pthread_t * pthread;
 static int tophits; /* the maximum number of hits to keep */
 static int seqcount; /* number of database sequences */
 static pthread_attr_t attr;
-static fasta_handle query_fasta_h;
+static fastx_handle query_fasta_h;
 
 /* global data protected by mutex */
 static pthread_mutex_t mutex_input;
diff --git a/src/searchcore.cc b/src/searchcore.cc
index 9bdcfd9..c4f3e23 100644
--- a/src/searchcore.cc
+++ b/src/searchcore.cc
@@ -433,9 +433,9 @@ int search_acceptable_aligned(struct searchinfo_s * si,
       ((!opt_rightjust) || (hit->trim_q_right +
                             hit->trim_t_right == 0)) &&
       /* query_cov */
-      (hit->internal_alignmentlength >= opt_query_cov * si->qseqlen) &&
+      (hit->matches + hit->mismatches >= opt_query_cov * si->qseqlen) &&
       /* target_cov */
-      (hit->internal_alignmentlength >=
+      (hit->matches + hit->mismatches >=
        opt_target_cov * db_getsequencelen(hit->target)) &&
       /* maxid */
       (hit->id <= 100.0 * opt_maxid) &&
diff --git a/src/searchexact.cc b/src/searchexact.cc
index 677ccca..739a46d 100644
--- a/src/searchexact.cc
+++ b/src/searchexact.cc
@@ -68,7 +68,7 @@ static pthread_t * pthread;
 static int tophits; /* the maximum number of hits to keep */
 static int seqcount; /* number of database sequences */
 static pthread_attr_t attr;
-static fasta_handle query_fasta_h;
+static fastx_handle query_fasta_h;
 
 /* global data protected by mutex */
 static pthread_mutex_t mutex_input;
diff --git a/src/util.cc b/src/util.cc
index f9afc5c..c28aac0 100644
--- a/src/util.cc
+++ b/src/util.cc
@@ -93,7 +93,7 @@ void progress_update(unsigned long progress)
           fprintf(stderr, "  \r%s %.0f%%", progress_prompt,
                   100.0 * progress / progress_size);
         else
-          fprintf(stderr, "  \r%s ?%%", progress_prompt);
+          fprintf(stderr, "  \r%s 0%%", progress_prompt);
         progress_next = progress + progress_chunk;
       }
 }
diff --git a/src/vsearch.cc b/src/vsearch.cc
index bd62f36..8cfdc10 100644
--- a/src/vsearch.cc
+++ b/src/vsearch.cc
@@ -62,6 +62,7 @@
 
 /* options */
 
+bool opt_bzip2_decompress;
 bool opt_clusterout_id;
 bool opt_clusterout_sort;
 bool opt_eeout;
@@ -69,6 +70,7 @@ bool opt_fasta_score;
 bool opt_fastq_allowmergestagger;
 bool opt_fastq_eeout;
 bool opt_fastq_nostagger;
+bool opt_gzip_decompress;
 bool opt_quiet;
 bool opt_relabel_keep;
 bool opt_relabel_md5;
@@ -264,6 +266,9 @@ FILE * fp_log = 0;
 
 abundance_t * global_abundance;
 
+char * STDIN_NAME = (char*) "/dev/stdin";
+char * STDOUT_NAME = (char*) "/dev/stdout";
+
 #define cpuid(f1, f2, a, b, c, d)                                \
   __asm__ __volatile__ ("cpuid"                                  \
                         : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \
@@ -514,6 +519,7 @@ void args_init(int argc, char **argv)
   opt_alnout = 0;
   opt_blast6out = 0;
   opt_borderline = 0;
+  opt_bzip2_decompress = 0;
   opt_centroids = 0;
   opt_chimeras = 0;
   opt_cluster_fast = 0;
@@ -587,6 +593,7 @@ void args_init(int argc, char **argv)
   opt_gap_open_target_interior=20;
   opt_gap_open_target_left=2;
   opt_gap_open_target_right=2;
+  opt_gzip_decompress = 0;
   opt_hardmask = 0;
   opt_help = 0;
   opt_id = -1.0;
@@ -862,6 +869,8 @@ void args_init(int argc, char **argv)
     {"minhsp",                required_argument, 0, 0 },
     {"band",                  required_argument, 0, 0 },
     {"hspw",                  required_argument, 0, 0 },
+    {"gzip_decompress",       no_argument,       0, 0 },
+    {"bzip2_decompress",      no_argument,       0, 0 },
     { 0, 0, 0, 0 }
   };
 
@@ -1600,6 +1609,16 @@ void args_init(int argc, char **argv)
           fprintf(stderr, "WARNING: Option --hspw is ignored\n");
           break;
 
+        case 173:
+          /* gzip_decompress */
+          opt_gzip_decompress = 1;
+          break;
+
+        case 174:
+          /* bzip2_decompress */
+          opt_bzip2_decompress = 1;
+          break;
+
         default:
           fatal("Internal error in option parsing");
         }
@@ -1751,6 +1770,9 @@ void args_init(int argc, char **argv)
   if (opt_fastq_qminout > opt_fastq_qmaxout)
     fatal("The argument to --fastq_qminout cannot be larger than to --fastq_qmaxout");
 
+  if (opt_gzip_decompress && opt_bzip2_decompress)
+    fatal("Specify either --gzip_decompress or --bzip2_decompress, not both");
+
   /* TODO: check valid range of gap penalties */
 
   /* adapt/adjust parameters */
@@ -1794,6 +1816,87 @@ void args_init(int argc, char **argv)
       else
         opt_minseqlength = 1;
     }
+
+  /* replace filename "-" by "/dev/stdin" for input file options */
+
+  char * * stdin_options[] =
+    {
+      & opt_allpairs_global,
+      & opt_cluster_fast,
+      & opt_cluster_size,
+      & opt_cluster_smallmem,
+      & opt_db,
+      & opt_derep_fulllength,
+      & opt_derep_prefix,
+      & opt_fastq_chars,
+      & opt_fastq_convert,
+      & opt_fastq_eestats,
+      & opt_fastq_filter,
+      & opt_fastq_mergepairs,
+      & opt_fastq_stats,
+      & opt_fastx_mask,
+      & opt_fastx_revcomp,
+      & opt_fastx_subsample,
+      & opt_maskfasta,
+      & opt_rereplicate,
+      & opt_reverse,
+      & opt_search_exact,
+      & opt_shuffle,
+      & opt_sortbylength,
+      & opt_sortbysize,
+      & opt_uchime_denovo,
+      & opt_uchime_ref,
+      & opt_usearch_global,
+      0
+    };
+
+  int i = 0;
+  while(char * * stdin_opt = stdin_options[i++])
+    if ((*stdin_opt) && (!strcmp(*stdin_opt, "-")))
+      *stdin_opt = STDIN_NAME;
+
+
+  /* replace filename "-" by "/dev/stdout" for output file options */
+
+  char * * stdout_options[] =
+    {
+      & opt_alnout,
+      & opt_blast6out,
+      & opt_borderline,
+      & opt_centroids,
+      & opt_chimeras,
+      & opt_consout,
+      & opt_dbmatched,
+      & opt_dbnotmatched,
+      & opt_eetabbedout,
+      & opt_fastaout,
+      & opt_fastaout_discarded,
+      & opt_fastaout_notmerged_fwd,
+      & opt_fastaout_notmerged_rev,
+      & opt_fastapairs,
+      & opt_fastqout,
+      & opt_fastqout_discarded,
+      & opt_fastqout_notmerged_fwd,
+      & opt_fastqout_notmerged_rev,
+      & opt_log,
+      & opt_matched,
+      & opt_msaout,
+      & opt_nonchimeras,
+      & opt_notmatched,
+      & opt_output,
+      & opt_profile,
+      & opt_samout,
+      & opt_uc,
+      & opt_uchimealns,
+      & opt_uchimeout,
+      & opt_userout,
+      0
+    };
+
+  int o = 0;
+  while(char * * stdout_opt = stdout_options[o++])
+    if ((*stdout_opt) && (!strcmp(*stdout_opt, "-")))
+      *stdout_opt = STDOUT_NAME;
 }
 
 
@@ -1810,7 +1913,9 @@ void cmd_help()
       fprintf(stdout,
               "\n"
               "General options\n"
+              "  --bzip2_decompress          decompress input with bzip2 (required for pipes)\n"
               "  --fasta_width INT           width of FASTA seq lines, 0 for no wrap (80)\n"
+              "  --gzip_decompress           decompress input with gzip (required for pipes)\n"
               "  --help | --h                display help information\n"
               "  --log FILENAME              write messages, timing and memory info to file\n"
               "  --maxseqlength INT          maximum sequence length (50000)\n"
@@ -2343,8 +2448,10 @@ void cmd_uchime()
   if (opt_minh <= 0.0)
     fatal("Argument to --minh must be > 0");
 
+#if 0
   if (opt_abskew <= 1.0)
     fatal("Argument to --abskew must be > 1");
+#endif
 
   chimera();
 }
@@ -2401,9 +2508,9 @@ void show_header()
 {
   if (! opt_quiet)
     {
-      fprintf(stdout, "%s\n", progheader);
-      fprintf(stdout, "https://github.com/torognes/vsearch\n");
-      fprintf(stdout, "\n");
+      fprintf(stderr, "%s\n", progheader);
+      fprintf(stderr, "https://github.com/torognes/vsearch\n");
+      fprintf(stderr, "\n");
     }
 }
 
diff --git a/src/vsearch.h b/src/vsearch.h
index a2353ec..43ba02c 100644
--- a/src/vsearch.h
+++ b/src/vsearch.h
@@ -138,9 +138,9 @@
 #include "cpu.h"
 #include "allpairs.h"
 #include "subsample.h"
+#include "fastx.h"
 #include "fasta.h"
 #include "fastq.h"
-#include "fastx.h"
 #include "fastqops.h"
 #include "dbhash.h"
 #include "searchexact.h"
@@ -159,6 +159,7 @@
 
 /* options */
 
+extern bool opt_bzip2_decompress;
 extern bool opt_clusterout_id;
 extern bool opt_clusterout_sort;
 extern bool opt_eeout;
@@ -166,6 +167,7 @@ extern bool opt_fasta_score;
 extern bool opt_fastq_allowmergestagger;
 extern bool opt_fastq_eeout;
 extern bool opt_fastq_nostagger;
+extern bool opt_gzip_decompress;
 extern bool opt_quiet;
 extern bool opt_relabel_keep;
 extern bool opt_relabel_md5;

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/vsearch.git



More information about the debian-med-commit mailing list