[med-svn] [plink1.9] 01/02: Imported Upstream version 1.90~b3l-150418

Dylan Aïssi bob.dybian-guest at moszumanska.debian.org
Mon Apr 27 03:25:07 UTC 2015


This is an automated email from the git hooks/post-receive script.

bob.dybian-guest pushed a commit to branch master
in repository plink1.9.

commit 71a56b4eed006fdeec03b1aaa664070b112037f8
Author: Dylan Aïssi <bob.dybian at gmail.com>
Date:   Mon Apr 27 05:21:36 2015 +0200

    Imported Upstream version 1.90~b3l-150418
---
 Makefile         |    2 +-
 bgzf.c           | 1123 +++++++++++++++++++++++++++++++++++++++++++++++++
 bgzf.h           |  313 ++++++++++++++
 hfile.c          |  578 ++++++++++++++++++++++++++
 hfile.h          |  212 ++++++++++
 hfile_internal.h |   76 ++++
 hts.h            |  456 ++++++++++++++++++++
 hts_defs.h       |   55 +++
 khash.h          |  619 +++++++++++++++++++++++++++
 pigz.c           |  463 +++++++++++++++++----
 pigz.h           |  148 ++++++-
 plink.c          |  525 +++++++++++++++--------
 plink_assoc.c    |    8 +-
 plink_calc.c     |  993 +++++++++++++++++++++++---------------------
 plink_calc.h     |    4 +-
 plink_cluster.c  |    7 +-
 plink_common.c   |  153 +++----
 plink_common.h   |  149 ++++---
 plink_data.c     |  376 ++++++++++++-----
 plink_dosage.c   |  330 +++++----------
 plink_family.c   | 1216 +++++++++++++++++++++++++++++++++++++++++++++++++-----
 plink_family.h   |   26 +-
 plink_filter.c   |  233 +++++++----
 plink_filter.h   |    8 +-
 plink_glm.c      |   45 +-
 plink_help.c     |   78 ++--
 plink_ld.c       |   43 +-
 plink_misc.c     |  326 +++++++--------
 plink_misc.h     |   10 +-
 plink_set.c      |    1 +
 yarn.h           |    5 +
 31 files changed, 6969 insertions(+), 1612 deletions(-)

diff --git a/Makefile b/Makefile
index 5c8ac77..c7ef5ee 100644
--- a/Makefile
+++ b/Makefile
@@ -49,7 +49,7 @@ ifdef NO_LAPACK
   BLASFLAGS=
 endif
 
-SRC = plink.c plink_assoc.c plink_calc.c plink_cluster.c plink_cnv.c plink_common.c plink_data.c plink_dosage.c plink_family.c plink_filter.c plink_glm.c plink_help.c plink_homozyg.c plink_lasso.c plink_ld.c plink_matrix.c plink_misc.c plink_rserve.c plink_set.c plink_stats.c SFMT.c dcdflib.c pigz.c yarn.c Rconnection.cc
+SRC = plink.c plink_assoc.c plink_calc.c plink_cluster.c plink_cnv.c plink_common.c plink_data.c plink_dosage.c plink_family.c plink_filter.c plink_glm.c plink_help.c plink_homozyg.c plink_lasso.c plink_ld.c plink_matrix.c plink_misc.c plink_rserve.c plink_set.c plink_stats.c SFMT.c dcdflib.c pigz.c yarn.c Rconnection.cc hfile.c bgzf.c
 
 # In the event that you are still concurrently using PLINK 1.07, we suggest
 # renaming that binary to "plink107" and "plink1".  (Previously,
diff --git a/bgzf.c b/bgzf.c
new file mode 100644
index 0000000..a6c9c9b
--- /dev/null
+++ b/bgzf.c
@@ -0,0 +1,1123 @@
+/* The MIT License
+
+   Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+                 2011, 2012 Attractive Chaos <attractor at live.co.uk>
+   Copyright (C) 2009, 2013, 2014 Genome Research Ltd
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include <errno.h>
+#include <unistd.h>
+#include <assert.h>
+#include <sys/types.h>
+
+#include "hts.h"
+#include "bgzf.h"
+#include "hfile.h"
+
+#define BGZF_CACHE
+
+#ifndef _WIN32
+  #define BGZF_MT
+#endif
+
+#define BLOCK_HEADER_LENGTH 18
+#define BLOCK_FOOTER_LENGTH 8
+
+
+/* BGZF/GZIP header (specialized from RFC 1952; little endian):
+ +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+ | 31|139|  8|  4|              0|  0|255|      6| 66| 67|      2|BLK_LEN|
+ +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+  BGZF extension:
+                ^                              ^   ^   ^
+                |                              |   |   |
+               FLG.EXTRA                     XLEN  B   C
+
+  BGZF format is compatible with GZIP. It limits the size of each compressed
+  block to 2^16 bytes and adds an extra "BC" field in the gzip header which
+  records the size.
+
+*/
+static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0";
+
+#ifdef BGZF_CACHE
+typedef struct {
+    int size;            // uncompressed length of the cached block (copied from fp->block_length)
+    uint8_t *block;      // heap copy of the uncompressed block, BGZF_MAX_BLOCK_SIZE bytes
+    int64_t end_offset;  // block address plus compressed size; hseek() target after a cache hit
+} cache_t;
+#include "khash.h"
+KHASH_MAP_INIT_INT64(cache, cache_t)
+#endif
+
+typedef struct
+{
+    uint64_t uaddr;  // offset w.r.t. uncompressed data
+    uint64_t caddr;  // offset w.r.t. compressed data
+}
+bgzidx1_t;
+
+struct __bgzidx_t
+{
+    int noffs, moffs;       // the size of the index, n:used, m:allocated
+    bgzidx1_t *offs;        // offsets
+    uint64_t ublock_addr;   // offset of the current block (uncompressed data)
+};
+
+void bgzf_index_destroy(BGZF *fp);
+int bgzf_index_add_block(BGZF *fp);
+
+static inline void packInt16(uint8_t *buffer, uint16_t value)
+{
+    buffer[0] = (uint8_t)(value & 0xff);   // low byte first (little endian)
+    buffer[1] = (uint8_t)(value >> 8);     // then the high byte
+}
+
+static inline int unpackInt16(const uint8_t *buffer)
+{
+    return (buffer[1] << 8) | buffer[0];   // little-endian 16-bit value
+}
+
+static inline void packInt32(uint8_t *buffer, uint32_t value)
+{
+    // Store the four bytes least-significant first (little endian).
+    int byte;
+    for (byte = 0; byte < 4; ++byte)
+        buffer[byte] = (uint8_t)(value >> (8 * byte));
+}
+
+// Create a BGZF reader for hfpr, using hpeek() on up to 18 bytes to tell plain, gzip and BGZF input apart. Returns NULL on read or allocation failure.
+static BGZF *bgzf_read_init(hFILE *hfpr)
+{
+    BGZF *fp;
+    uint8_t magic[18];
+    ssize_t n = hpeek(hfpr, magic, 18);
+    if (n < 0) return NULL;
+    fp = (BGZF*)calloc(1, sizeof(BGZF));
+    if (fp == NULL) return NULL;
+
+    fp->is_write = 0;
+    fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+    fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+    if (!fp->uncompressed_block || !fp->compressed_block) { free(fp->uncompressed_block); free(fp->compressed_block); free(fp); return NULL; }
+    fp->is_compressed = (n==18 && magic[0]==0x1f && magic[1]==0x8b) ? 1 : 0;   // needs a full 18-byte header starting with the gzip magic
+    fp->is_gzip = ( !fp->is_compressed || ((magic[3]&4) && memcmp(&magic[12], "BC\2\0",4)==0) ) ? 0 : 1;   // compressed, but without the BGZF "BC" extra field
+#ifdef BGZF_CACHE
+    fp->cache = kh_init(cache);
+#endif
+    return fp;
+}
+
+// Derive the compression level from a mode string: -1 selects the default level, -2 means plain uncompressed output
+static int mode2level(const char *__restrict mode)
+{
+    const char *p = mode;
+    int level = -1;
+    while (*p && (*p < '0' || *p > '9')) ++p;    // stop at the first digit, if any
+    if (*p) level = (int)*p - '0';
+    if (strchr(mode, 'u') != NULL) level = -2;   // 'u' overrides any digit
+    return level;
+}
+// Allocate a BGZF writer configured from mode ('u' = plain, 'g' = gzip, digit = level). Returns NULL on allocation or zlib failure.
+static BGZF *bgzf_write_init(const char *mode)
+{
+    int compress_level = mode2level(mode);
+    BGZF *fp = (BGZF*)calloc(1, sizeof(BGZF));
+    if (fp == NULL) return NULL;
+    fp->is_write = 1;
+    if ( compress_level==-2 ) return fp;    // plain output: is_compressed stays 0 and no block buffers are needed
+    fp->is_compressed = 1;
+    fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+    fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE);
+    if (!fp->uncompressed_block || !fp->compressed_block) goto fail;
+    fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
+    if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
+    if ( strchr(mode,'g') )
+    {
+        // gzip output
+        fp->is_gzip = 1;
+        fp->gz_stream = (z_stream*)calloc(1,sizeof(z_stream));  // zeroed, so zalloc/zfree/opaque are NULL
+        if ( !fp->gz_stream ) goto fail;
+        if ( deflateInit2(fp->gz_stream, fp->compress_level, Z_DEFLATED, 15|16, 8, Z_DEFAULT_STRATEGY)!=Z_OK ) goto fail;
+    }
+    return fp;
+fail:   // release everything allocated so far; free(NULL) is a no-op
+    free(fp->gz_stream); free(fp->uncompressed_block); free(fp->compressed_block); free(fp);
+    return NULL;
+}
+
+// Open path for BGZF reading ('r') or writing ('w'/'a'). Returns NULL on failure; errno is EINVAL for an unrecognised mode.
+BGZF *bgzf_open(const char *path, const char *mode)
+{
+    BGZF *fp = 0;
+    hFILE *hfp;
+    assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
+    if (strchr(mode, 'r')) {
+        if ((hfp = hopen(path, mode)) == 0) return 0;
+        fp = bgzf_read_init(hfp);
+    } else if (strchr(mode, 'w') || strchr(mode, 'a')) {
+        if ((hfp = hopen(path, mode)) == 0) return 0;
+        fp = bgzf_write_init(mode);
+    }
+    else { errno = EINVAL; return 0; }
+
+    // Either init can fail (e.g. deflateInit2 in the gzip writer); close the hFILE instead of dereferencing NULL.
+    if (fp == 0) { hclose_abruptly(hfp); return NULL; }
+    fp->fp = hfp;
+    fp->is_be = ed_is_big();
+    return fp;
+}
+
+// Like bgzf_open() but wraps the already-open descriptor fd via hdopen(). Returns NULL on failure.
+BGZF *bgzf_dopen(int fd, const char *mode)
+{
+    BGZF *fp = 0;
+    hFILE *hfp;
+    assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
+    if (strchr(mode, 'r')) {
+        if ((hfp = hdopen(fd, mode)) == 0) return 0;
+        fp = bgzf_read_init(hfp);
+    } else if (strchr(mode, 'w') || strchr(mode, 'a')) {
+        if ((hfp = hdopen(fd, mode)) == 0) return 0;
+        fp = bgzf_write_init(mode);
+    }
+    else { errno = EINVAL; return 0; }
+
+    // The write-side init can return NULL too, so check both paths before touching fp.
+    if (fp == 0) { hclose_abruptly(hfp); return NULL; } // FIXME this closes fd
+    fp->fp = hfp;
+    fp->is_be = ed_is_big();
+    return fp;
+}
+
+// Wrap an existing hFILE in a BGZF handle for reading ('r') or writing ('w'/'a'). Returns NULL on failure, in which case hfp is not closed here.
+BGZF *bgzf_hopen(hFILE *hfp, const char *mode)
+{
+    BGZF *fp = NULL;
+    assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE);
+    if (strchr(mode, 'r')) {
+        fp = bgzf_read_init(hfp);
+    } else if (strchr(mode, 'w') || strchr(mode, 'a')) {
+        fp = bgzf_write_init(mode);
+    }
+    else { errno = EINVAL; return 0; }
+    if (fp == NULL) return NULL;    // either init can fail; never dereference a NULL handle
+    fp->fp = hfp;
+    fp->is_be = ed_is_big();
+    return fp;
+}
+
+// Compress slen bytes from src into a single BGZF block at _dst. On entry *dlen is the capacity of _dst; on success it is the total block size. Returns 0, or -1 on any zlib failure.
+static int bgzf_compress(void *_dst, int *dlen, void *src, int slen, int level)
+{
+    uint32_t crc;
+    z_stream zs;
+    uint8_t *dst = (uint8_t*)_dst;
+    // compress the body
+    zs.zalloc = NULL; zs.zfree = NULL; zs.opaque = NULL;
+    zs.next_in  = (Bytef*)src;
+    zs.avail_in = slen;
+    zs.next_out = dst + BLOCK_HEADER_LENGTH;
+    zs.avail_out = *dlen - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
+    if (deflateInit2(&zs, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) return -1; // -15 to disable zlib header/footer
+    if (deflate(&zs, Z_FINISH) != Z_STREAM_END) { deflateEnd(&zs); return -1; } // release the zlib state instead of leaking it
+    if (deflateEnd(&zs) != Z_OK) return -1;
+    *dlen = zs.total_out + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+    // write the header
+    memcpy(dst, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block
+    packInt16(&dst[16], *dlen - 1); // write the compressed length; -1 to fit 2 bytes
+    // write the footer
+    crc = crc32(crc32(0L, NULL, 0L), (Bytef*)src, slen);
+    packInt32((uint8_t*)&dst[*dlen - 8], crc);
+    packInt32((uint8_t*)&dst[*dlen - 4], slen);
+    return 0;
+}
+
+// Feed slen bytes of src through the persistent stream fp->gz_stream into _dst; slen==0 finishes the stream. *dlen is the capacity on entry and the bytes produced on return. level is unused here.
+static int bgzf_gzip_compress(BGZF *fp, void *_dst, int *dlen, void *src, int slen, int level)
+{
+    z_stream *strm = fp->gz_stream;
+    int mode = (slen == 0) ? Z_FINISH : Z_NO_FLUSH;
+    strm->next_out  = (uint8_t*)_dst;
+    strm->avail_out = *dlen;
+    strm->next_in   = (Bytef*)src;
+    strm->avail_in  = slen;
+    if ( deflate(strm, mode) == Z_STREAM_ERROR ) return -1;
+    *dlen -= strm->avail_out;   // what is left of the capacity is unused output space
+    return 0;
+}
+
+// Compress the first block_length bytes of fp->uncompressed_block into fp->compressed_block, as a BGZF block or (when fp->is_gzip) through the gzip stream. Returns the compressed size, or -1 with BGZF_ERR_ZLIB set.
+static int deflate_block(BGZF *fp, int block_length)
+{
+    int out_len = BGZF_MAX_BLOCK_SIZE;
+    int failed;
+    if ( fp->is_gzip )
+        failed = bgzf_gzip_compress(fp, fp->compressed_block, &out_len, fp->uncompressed_block, block_length, fp->compress_level) != 0;
+    else
+        failed = bgzf_compress(fp->compressed_block, &out_len, fp->uncompressed_block, block_length, fp->compress_level) != 0;
+
+    if ( failed )
+    {
+        fp->errcode |= BGZF_ERR_ZLIB;
+        return -1;
+    }
+    fp->block_offset = 0;
+    return out_len;
+}
+
+// Inflate the block in fp->compressed_block into fp->uncompressed_block. block_length is the size of the whole block, header and footer included. Returns the uncompressed size, or -1 with BGZF_ERR_ZLIB set.
+static int inflate_block(BGZF* fp, int block_length)
+{
+    z_stream zs;
+    zs.zalloc = NULL;
+    zs.zfree = NULL; zs.opaque = NULL;
+    zs.next_in = (Bytef*)fp->compressed_block + BLOCK_HEADER_LENGTH;
+    zs.avail_in = block_length - BLOCK_HEADER_LENGTH;   // payload plus footer: never offer zlib bytes beyond the end of this block
+    zs.next_out = (Bytef*)fp->uncompressed_block;
+    zs.avail_out = BGZF_MAX_BLOCK_SIZE;
+
+    if (inflateInit2(&zs, -15) != Z_OK) {
+        fp->errcode |= BGZF_ERR_ZLIB;
+        return -1;
+    }
+    if (inflate(&zs, Z_FINISH) != Z_STREAM_END) {
+        inflateEnd(&zs);
+        fp->errcode |= BGZF_ERR_ZLIB;
+        return -1;
+    }
+    if (inflateEnd(&zs) != Z_OK) {
+        fp->errcode |= BGZF_ERR_ZLIB;
+        return -1;
+    }
+    return zs.total_out;
+}
+
+// Inflate more data from a plain gzip stream into fp->uncompressed_block. With cached!=0 the input already set up in fp->gz_stream is used first. Returns the number of bytes produced, 0 at end of input, or a negative value on error.
+static int inflate_gzip_block(BGZF *fp, int cached)
+{
+    int ret = Z_OK;
+    do
+    {
+        if ( !cached && fp->gz_stream->avail_out!=0 )
+        {
+            ssize_t nread = hread(fp->fp, fp->compressed_block, BGZF_BLOCK_SIZE);
+            if ( nread<=0 ) { fp->gz_stream->avail_in = 0; return nread; }   // 0 at EOF; a read error must not turn into a huge unsigned avail_in
+            fp->gz_stream->avail_in = nread;
+            fp->gz_stream->next_in = (Bytef*)(fp->compressed_block);
+        }
+        else cached = 0;
+        do
+        {
+            fp->gz_stream->next_out = (Bytef*)fp->uncompressed_block + fp->block_offset;
+            fp->gz_stream->avail_out = BGZF_MAX_BLOCK_SIZE - fp->block_offset;
+            ret = inflate(fp->gz_stream, Z_NO_FLUSH);
+            if ( ret==Z_BUF_ERROR ) continue;   // non-critical error
+            if ( ret<0 ) return -1;
+            if ( BGZF_MAX_BLOCK_SIZE != fp->gz_stream->avail_out ) return BGZF_MAX_BLOCK_SIZE - fp->gz_stream->avail_out;   // hand back whatever was produced
+        }
+        while ( fp->gz_stream->avail_out == 0 );
+    }
+    while (ret != Z_STREAM_END);
+    return BGZF_MAX_BLOCK_SIZE - fp->gz_stream->avail_out;
+}
+
+// Classify a block header. Returns: 0 on success (BGZF header); -1 on non-BGZF GZIP header; -2 on error
+static int check_header(const uint8_t *header)
+{
+    int is_gzip = header[0] == 31 && header[1] == 139 && header[2] == 8;
+    if ( !is_gzip ) return -2;
+    if ( (header[3] & 4) == 0 || unpackInt16((uint8_t*)&header[10]) != 6 ) return -1;   // FLG.FEXTRA must be set and XLEN must be 6
+    if ( header[12] != 'B' || header[13] != 'C' ) return -1;                            // subfield identifier "BC"
+    return unpackInt16((uint8_t*)&header[14]) == 2 ? 0 : -1;                            // subfield length 2
+}
+
+#ifdef BGZF_CACHE
+// Release every cached block and then the hash table itself; does nothing for a writer handle.
+static void free_cache(BGZF *fp)
+{
+    khash_t(cache) *table = (khash_t(cache)*)fp->cache;
+    khint_t it;
+    if (fp->is_write) return;
+    for (it = kh_begin(table); it < kh_end(table); ++it) { if (kh_exist(table, it)) free(kh_val(table, it).block); }
+    kh_destroy(cache, table);
+}
+
+// Try to satisfy a block read from the cache. Returns 0 on a miss; on a hit copies the block into fp->uncompressed_block, seeks the file to the cached end offset and returns the cached size. Exits the process if that seek fails.
+static int load_block_from_cache(BGZF *fp, int64_t block_address)
+{
+    khash_t(cache) *table = (khash_t(cache)*)fp->cache;
+    khint_t it = kh_get(cache, table, block_address);
+    cache_t *entry;
+    if (it == kh_end(table)) return 0;   // not cached
+    entry = &kh_val(table, it);
+    if (fp->block_length != 0) fp->block_offset = 0;
+    fp->block_address = block_address;
+    fp->block_length = entry->size;
+    memcpy(fp->uncompressed_block, entry->block, BGZF_MAX_BLOCK_SIZE);
+    if ( hseek(fp->fp, entry->end_offset, SEEK_SET) < 0 )
+    {
+        // todo: move the error up
+        fprintf(stderr, "Could not hseek to %" PRId64 "\n", entry->end_offset);
+        exit(1);
+    }
+    return entry->size;
+}
+
+// Store a copy of the block currently in fp->uncompressed_block in the cache, keyed by fp->block_address. size is the compressed size of the block. Caching is skipped silently if memory runs out.
+static void cache_block(BGZF *fp, int size)
+{
+    int ret;
+    khint_t k;
+    cache_t *p; uint8_t *copy;
+    khash_t(cache) *h = (khash_t(cache)*)fp->cache;
+    if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return;
+    if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > (uint32_t)fp->cache_size) {
+        /* A better way would be to remove the oldest block in the
+         * cache, but here we remove a random one for simplicity. This
+         * should not have a big impact on performance. */
+        for (k = kh_begin(h); k < kh_end(h); ++k)
+            if (kh_exist(h, k)) break;
+        if (k < kh_end(h)) {
+            free(kh_val(h, k).block);
+            kh_del(cache, h, k);
+        }
+    }
+    if ((copy = (uint8_t*)malloc(BGZF_MAX_BLOCK_SIZE)) == NULL) return; // out of memory: the cache is only an optimisation
+    k = kh_put(cache, h, fp->block_address, &ret);
+    if (ret <= 0) { free(copy); return; } // 0: key already present (a bug!); negative: the insert itself failed
+    p = &kh_val(h, k);
+    p->size = fp->block_length; p->end_offset = fp->block_address + size;
+    p->block = (uint8_t*)memcpy(copy, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE);
+}
+#else
+static void free_cache(BGZF *fp) {}
+static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;}
+static void cache_block(BGZF *fp, int size) {}
+#endif
+
+int bgzf_read_block(BGZF *fp)
+{
+    uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block;
+    int count, size = 0, block_length, remaining;
+    // Load the next chunk of data into fp->uncompressed_block and update fp->block_length / fp->block_address. Returns 0 on success (block_length==0 means no more data), -1 on error with fp->errcode set.
+    // Reading an uncompressed file
+    if ( !fp->is_compressed )
+    {
+        count = hread(fp->fp, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE);   // NOTE(review): only count==0 is tested below; a negative result is not treated as an error
+        if ( count==0 )
+        {
+            fp->block_length = 0;
+            return 0;
+        }
+        if (fp->block_length != 0) fp->block_offset = 0;
+        fp->block_address += count;
+        fp->block_length = count;
+        return 0;
+    }
+
+    // Reading compressed file
+    int64_t block_address;
+    block_address = htell(fp->fp);
+    if ( fp->is_gzip && fp->gz_stream ) // is this an initialized gzip stream?
+    {
+        count = inflate_gzip_block(fp, 0);
+        if ( count<0 )
+        {
+            fp->errcode |= BGZF_ERR_ZLIB;
+            return -1;
+        }
+        fp->block_length = count;
+        fp->block_address = block_address;
+        return 0;
+    }
+    if (fp->cache_size && load_block_from_cache(fp, block_address)) return 0;   // cache hit: block already copied in
+    count = hread(fp->fp, header, sizeof(header));
+    if (count == 0) { // no data read
+        fp->block_length = 0;
+        return 0;
+    }
+    int ret;
+    if ( count != sizeof(header) || (ret=check_header(header))==-2 )   // short read or not a gzip header at all
+    {
+        fp->errcode |= BGZF_ERR_HEADER;
+        return -1;
+    }
+    if ( ret==-1 )
+    {
+        // GZIP, not BGZF
+        uint8_t *cblock = (uint8_t*)fp->compressed_block;
+        memcpy(cblock, header, sizeof(header));
+        count = hread(fp->fp, cblock+sizeof(header), BGZF_BLOCK_SIZE - sizeof(header)) + sizeof(header);   // NOTE(review): the hread() result is not checked for errors before the addition
+        int nskip = 10;   // start just past the first 10 header bytes, where the optional fields begin
+
+        // Check optional fields to skip: FLG.FNAME,FLG.FCOMMENT,FLG.FHCRC,FLG.FEXTRA
+        // Note: Some of these fields are untested, I did not have appropriate data available
+        if ( header[3] & 0x4 ) // FLG.FEXTRA
+        {
+            nskip += unpackInt16(&cblock[nskip]) + 2;   // 2-byte length followed by that many bytes; NOTE(review): not bounds-checked against count
+        }
+        if ( header[3] & 0x8 ) // FLG.FNAME
+        {
+            while ( nskip<BGZF_BLOCK_SIZE && cblock[nskip] ) nskip++;   // skip the NUL-terminated string
+            if ( nskip==BGZF_BLOCK_SIZE )
+            {
+                fp->errcode |= BGZF_ERR_HEADER;
+                return -1;
+            }
+            nskip++;
+        }
+        if ( header[3] & 0x10 ) // FLG.FCOMMENT
+        {
+            while ( nskip<BGZF_BLOCK_SIZE && cblock[nskip] ) nskip++;   // skip the NUL-terminated string
+            if ( nskip==BGZF_BLOCK_SIZE )
+            {
+                fp->errcode |= BGZF_ERR_HEADER;
+                return -1;
+            }
+            nskip++;
+        }
+        if ( header[3] & 0x2 ) nskip += 2;  //  FLG.FHCRC
+
+        fp->is_gzip = 1;
+        fp->gz_stream = (z_stream*) calloc(1,sizeof(z_stream));   // NOTE(review): allocation result is not checked
+        int ret = inflateInit2(fp->gz_stream, -15);   // shadows the outer ret; -15 selects raw deflate, the header was skipped by hand above
+        if (ret != Z_OK)
+        {
+            fp->errcode |= BGZF_ERR_ZLIB;
+            return -1;
+        }
+        fp->gz_stream->avail_in = count - nskip;
+        fp->gz_stream->next_in  = cblock + nskip;
+        count = inflate_gzip_block(fp, 1);   // cached=1: consume the bytes already sitting in cblock first
+        if ( count<0 )
+        {
+            fp->errcode |= BGZF_ERR_ZLIB;
+            return -1;
+        }
+        fp->block_length = count;
+        fp->block_address = block_address;
+        if ( fp->idx_build_otf ) return -1; // cannot build index for gzip
+        return 0;
+    }
+    size = count;   // BGZF block: size accumulates the compressed bytes read for this block
+    block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1"
+    compressed_block = (uint8_t*)fp->compressed_block;
+    memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
+    remaining = block_length - BLOCK_HEADER_LENGTH;   // NOTE(review): negative if the stored length is smaller than the header
+    count = hread(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
+    if (count != remaining) {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+    size += count;
+    if ((count = inflate_block(fp, block_length)) < 0) return -1;
+    if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek.
+    fp->block_address = block_address;
+    fp->block_length = count;
+    if ( fp->idx_build_otf )
+    {
+        bgzf_index_add_block(fp);   // NOTE(review): return value ignored
+        fp->idx->ublock_addr += count;
+    }
+    cache_block(fp, size);
+    return 0;
+}
+
+// Copy up to length uncompressed bytes into data, loading further blocks as needed. Returns the number of bytes copied (fewer than length if the data runs out) or -1 if a block could not be read.
+ssize_t bgzf_read(BGZF *fp, void *data, size_t length)
+{
+    ssize_t done = 0;
+    uint8_t *dest = (uint8_t*)data;
+    if (length <= 0) return 0;
+    assert(fp->is_write == 0);
+    // the intptr_t casts are a kludge to address signed vs. unsigned comparison warnings
+    while (done < ((intptr_t)length)) {
+        int chunk, avail = fp->block_length - fp->block_offset;
+        if (avail <= 0) {
+            if (bgzf_read_block(fp) != 0) return -1;
+            avail = fp->block_length - fp->block_offset;
+            if (avail <= 0) break;
+        }
+        chunk = avail;
+        if (((intptr_t)(length - done)) < avail) chunk = length - done;
+        memcpy(dest, (uint8_t*)fp->uncompressed_block + fp->block_offset, chunk);
+        fp->block_offset += chunk;
+        dest += chunk;
+        done += chunk;
+    }
+    if (fp->block_offset == fp->block_length) {
+        fp->block_address = htell(fp->fp);
+        fp->block_offset = fp->block_length = 0;
+    }
+    fp->uncompressed_address += done;
+    return done;
+}
+
+ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length)
+{
+    return hread(fp->fp, data, length);   // straight from the underlying hFILE: no decompression, no block bookkeeping
+}
+
+#ifdef BGZF_MT
+
+typedef struct {
+    struct bgzf_mtaux_t *mt;   // shared state of the thread pool this worker belongs to
+    void *buf;                 // per-worker scratch buffer (BGZF_MAX_BLOCK_SIZE bytes) for compressed output
+    int i, errcode, toproc, compress_level;   // i: worker index and first block to handle; toproc: set to 1 when a batch is ready
+} worker_t;
+
+typedef struct bgzf_mtaux_t {
+    int n_threads, n_blks, curr, done;   // n_blks: capacity of blk[]; curr: blocks currently queued; done: set to make workers exit
+    volatile int proc_cnt;               // number of workers that have finished the current batch
+    void **blk;                          // queued blocks, each BGZF_MAX_BLOCK_SIZE bytes; compressed in place
+    int *len;                            // length of each queued block (uncompressed before, compressed after processing)
+    worker_t *w;                         // one entry per worker; worker 0 runs on the calling thread
+    pthread_t *tid;                      // thread ids; tid[0] is not used
+    pthread_mutex_t lock;                // protects toproc/done signalling
+    pthread_cond_t cv;                   // signalled when a batch is ready or when shutting down
+} mtaux_t;
+
+static int worker_aux(worker_t *w)   // run one batch for worker w; returns 1 when the worker should exit, 0 otherwise
+{
+    int i, stop = 0;
+    // wait for condition: to process or all done
+    pthread_mutex_lock(&w->mt->lock);
+    while (!w->toproc && !w->mt->done)
+        pthread_cond_wait(&w->mt->cv, &w->mt->lock);
+    if (w->mt->done) stop = 1;
+    w->toproc = 0;   // consume the request while still holding the lock
+    pthread_mutex_unlock(&w->mt->lock);
+    if (stop) return 1; // to quit the thread
+    w->errcode = 0;
+    for (i = w->i; i < w->mt->curr; i += w->mt->n_threads) {   // this worker takes blocks i, i+n_threads, i+2*n_threads, ...
+        int clen = BGZF_MAX_BLOCK_SIZE;
+        if (bgzf_compress(w->buf, &clen, w->mt->blk[i], w->mt->len[i], w->compress_level) != 0)
+            w->errcode |= BGZF_ERR_ZLIB;
+        memcpy(w->mt->blk[i], w->buf, clen);   // replace the queued block with its compressed form; NOTE(review): also done when compression failed
+        w->mt->len[i] = clen;
+    }
+    __sync_fetch_and_add(&w->mt->proc_cnt, 1);   // atomically report this worker as finished with the batch
+    return 0;
+}
+
+static void *mt_worker(void *data)   // pthread entry point; data is the worker_t of this thread
+{
+    while (worker_aux((worker_t*)data) == 0);   // keep processing batches until worker_aux() asks to stop
+    return 0;
+}
+
+int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks)   // enable multi-threaded compression on a writer; returns 0, or -1 if fp is not a writer, already has threads, or n_threads <= 1
+{
+    int i;
+    mtaux_t *mt;
+    pthread_attr_t attr;
+    if (!fp->is_write || fp->mt || n_threads <= 1) return -1;
+    mt = (mtaux_t*)calloc(1, sizeof(mtaux_t));   // NOTE(review): none of the allocations in this function are checked
+    mt->n_threads = n_threads;
+    mt->n_blks = n_threads * n_sub_blks;   // queue capacity: n_sub_blks blocks per thread
+    mt->len = (int*)calloc(mt->n_blks, sizeof(int));
+    mt->blk = (void**)calloc(mt->n_blks, sizeof(void*));
+    for (i = 0; i < mt->n_blks; ++i)
+        mt->blk[i] = malloc(BGZF_MAX_BLOCK_SIZE);
+    mt->tid = (pthread_t*)calloc(mt->n_threads, sizeof(pthread_t)); // tid[0] is not used, as the worker 0 is launched by the master
+    mt->w = (worker_t*)calloc(mt->n_threads, sizeof(worker_t));
+    for (i = 0; i < mt->n_threads; ++i) {
+        mt->w[i].i = i;
+        mt->w[i].mt = mt;
+        mt->w[i].compress_level = fp->compress_level;
+        mt->w[i].buf = malloc(BGZF_MAX_BLOCK_SIZE);
+    }
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);   // threads are joined in mt_destroy()
+    pthread_mutex_init(&mt->lock, 0);
+    pthread_cond_init(&mt->cv, 0);
+    for (i = 1; i < mt->n_threads; ++i) // worker 0 is effectively launched by the master thread
+        pthread_create(&mt->tid[i], &attr, mt_worker, &mt->w[i]);   // NOTE(review): result not checked; attr is never destroyed
+    fp->mt = mt;
+    return 0;
+}
+
+static void mt_destroy(mtaux_t *mt)   // stop and join all worker threads, then free everything owned by mt, including mt itself
+{
+    int i;
+    // signal all workers to quit
+    pthread_mutex_lock(&mt->lock);
+    mt->done = 1; mt->proc_cnt = 0;
+    pthread_cond_broadcast(&mt->cv);
+    pthread_mutex_unlock(&mt->lock);
+    for (i = 1; i < mt->n_threads; ++i) pthread_join(mt->tid[i], 0); // worker 0 is effectively launched by the master thread
+    // free other data allocated on heap
+    for (i = 0; i < mt->n_blks; ++i) free(mt->blk[i]);
+    for (i = 0; i < mt->n_threads; ++i) free(mt->w[i].buf);
+    free(mt->blk); free(mt->len); free(mt->w); free(mt->tid);
+    pthread_cond_destroy(&mt->cv);
+    pthread_mutex_destroy(&mt->lock);
+    free(mt);
+}
+
+static void mt_queue(BGZF *fp)   // move the pending uncompressed data of fp into the next free queue slot
+{
+    mtaux_t *mt = fp->mt;
+    assert(mt->curr < mt->n_blks); // guaranteed by the caller
+    memcpy(mt->blk[mt->curr], fp->uncompressed_block, fp->block_offset);
+    mt->len[mt->curr] = fp->block_offset;
+    fp->block_offset = 0;   // the block buffer is empty again
+    ++mt->curr;
+}
+
+static int mt_flush_queue(BGZF *fp)   // compress every queued block with all workers, then write them out in order; returns 0, or -1 if fp->errcode is set
+{
+    int i;
+    mtaux_t *mt = fp->mt;
+    // signal all the workers to compress
+    pthread_mutex_lock(&mt->lock);
+    for (i = 0; i < mt->n_threads; ++i) mt->w[i].toproc = 1;
+    mt->proc_cnt = 0;
+    pthread_cond_broadcast(&mt->cv);
+    pthread_mutex_unlock(&mt->lock);
+    // worker 0 is doing things here
+    worker_aux(&mt->w[0]);
+    // wait for all the threads to complete
+    while (mt->proc_cnt < mt->n_threads);   // busy-wait until every worker has incremented proc_cnt
+    // dump data to disk
+    for (i = 0; i < mt->n_threads; ++i) fp->errcode |= mt->w[i].errcode;
+    for (i = 0; i < mt->curr; ++i)
+        if (hwrite(fp->fp, mt->blk[i], mt->len[i]) != mt->len[i]) {
+            fp->errcode |= BGZF_ERR_IO;
+            break;
+        }
+    mt->curr = 0;   // the queue is empty again, even after a write error
+    return (fp->errcode == 0)? 0 : -1;
+}
+
+// Flush a full block: immediately when single-threaded, otherwise queue it and only compress once the queue is full.
+static int lazy_flush(BGZF *fp)
+{
+    if (!fp->mt) return bgzf_flush(fp);
+    if (fp->block_offset) mt_queue(fp);             // hand the pending block to the worker queue
+    if (fp->mt->curr < fp->mt->n_blks) return 0;    // queue not full yet: defer the compression
+    return mt_flush_queue(fp);
+}
+
+#else  // ~ #ifdef BGZF_MT
+
+int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks)
+{
+    return 0;
+}
+
+static inline int lazy_flush(BGZF *fp)
+{
+    return bgzf_flush(fp);
+}
+
+#endif // ~ #ifdef BGZF_MT
+
+int bgzf_flush(BGZF *fp)   // compress and write out any buffered data of a writer; returns 0 on success (or for a reader), -1 on error
+{
+    if (!fp->is_write) return 0;
+#ifdef BGZF_MT
+    if (fp->mt) {
+        if (fp->block_offset) mt_queue(fp); // guaranteed that assertion does not fail
+        return mt_flush_queue(fp);
+    }
+#endif
+    while (fp->block_offset > 0) {   // deflate_block() resets block_offset to 0 on success, which ends the loop
+        if ( fp->idx_build_otf )
+        {
+            bgzf_index_add_block(fp);   // NOTE(review): return value ignored
+            fp->idx->ublock_addr += fp->block_offset;
+        }
+        int block_length = deflate_block(fp, fp->block_offset);
+        if (block_length < 0) return -1;
+        if (hwrite(fp->fp, fp->compressed_block, block_length) != block_length) {
+            fp->errcode |= BGZF_ERR_IO; // possibly truncated file
+            return -1;
+        }
+        fp->block_address += block_length;
+    }
+    return 0;
+}
+
+int bgzf_flush_try(BGZF *fp, ssize_t size)
+{
+    // Flush only when appending size more bytes would not fit in the current block.
+    return (fp->block_offset + size > BGZF_BLOCK_SIZE) ? lazy_flush(fp) : 0;
+}
+
+// Append length bytes from data to the stream, buffering them in blocks of BGZF_BLOCK_SIZE when compressing. Returns the hwrite() result for plain output, otherwise the number of bytes accepted, or -1 if flushing a full block failed.
+ssize_t bgzf_write(BGZF *fp, const void *data, size_t length)
+{
+    const uint8_t *src = (const uint8_t*)data;
+    ssize_t left = length;
+
+    // Plain (uncompressed) output bypasses the block buffer entirely.
+    if ( !fp->is_compressed ) return hwrite(fp->fp, data, length);
+
+    assert(fp->is_write);
+    while (left > 0) {
+        // Copy as much as still fits into the current block.
+        int chunk = BGZF_BLOCK_SIZE - fp->block_offset;
+        if (chunk > left) chunk = left;
+        memcpy((uint8_t*)fp->uncompressed_block + fp->block_offset, src, chunk);
+        fp->block_offset += chunk;
+        src += chunk;
+        left -= chunk;
+        if (fp->block_offset == BGZF_BLOCK_SIZE && lazy_flush(fp) != 0) return -1;
+    }
+    return length - left;
+}
+
+ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length)
+{
+    return hwrite(fp->fp, data, length);   // straight to the underlying hFILE: no compression, no block buffering
+}
+
+// Flush and close fp and release its resources. Returns 0 on success, -1 on error. NOTE(review): on the early error returns fp and its buffers are not freed.
+int bgzf_close(BGZF* fp)
+{
+    int block_length;
+    if (fp == 0) return -1;
+    if (fp->is_write && fp->is_compressed) {
+        if (bgzf_flush(fp) != 0) return -1;
+        fp->compress_level = -1;
+        block_length = deflate_block(fp, 0); // write an empty block
+        if (block_length < 0) return -1;     // errcode already set by deflate_block(); never pass a negative length to hwrite()
+        if (hwrite(fp->fp, fp->compressed_block, block_length) < 0 || hflush(fp->fp) != 0) {
+            fp->errcode |= BGZF_ERR_IO;
+            return -1;
+        }
+#ifdef BGZF_MT
+        if (fp->mt) mt_destroy(fp->mt);
+#endif
+    }
+    if ( fp->is_gzip )
+    {
+        if (!fp->is_write) (void)inflateEnd(fp->gz_stream);
+        else (void)deflateEnd(fp->gz_stream);
+        free(fp->gz_stream);
+    }
+    if (hclose(fp->fp) != 0) return -1;
+    bgzf_index_destroy(fp);
+    free(fp->uncompressed_block);
+    free(fp->compressed_block);
+    free_cache(fp);
+    free(fp);
+    return 0;
+}
+
+void bgzf_set_cache_size(BGZF *fp, int cache_size)
+{
+    if (fp != NULL) fp->cache_size = cache_size;    // a NULL handle is silently ignored
+}
+
+// Check whether the file ends with the 28-byte empty BGZF block. Returns 1 if it does, 0 if not, 2 if the seek fails with ESPIPE, -1 on any other error. The file position is restored on success.
+int bgzf_check_EOF(BGZF *fp)
+{
+    uint8_t tail[28];
+    off_t saved = htell(fp->fp);
+    if (hseek(fp->fp, -28, SEEK_END) < 0) {
+        if (errno != ESPIPE) return -1;
+        hclearerr(fp->fp); return 2;
+    }
+    if ( hread(fp->fp, tail, 28) != 28 || hseek(fp->fp, saved, SEEK_SET) < 0 ) return -1;
+    return memcmp("\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0", tail, 28) == 0;
+}
+
+// Seek a reader to the virtual offset pos: pos >> 16 is the file offset of a block and the low 16 bits are the offset inside that block once uncompressed.
+// Only SEEK_SET on a read handle is supported. Returns 0 on success, -1 on error with fp->errcode updated.
+int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
+{
+    const int within_block = pos & 0xFFFF;
+    const int64_t file_offset = pos >> 16;
+
+    if (fp->is_write || where != SEEK_SET) {
+        fp->errcode |= BGZF_ERR_MISUSE;
+        return -1;
+    }
+    if (hseek(fp->fp, file_offset, SEEK_SET) < 0) {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+    fp->block_length = 0;  // indicates current block has not been loaded
+    fp->block_address = file_offset;
+    fp->block_offset = within_block;
+    return 0;
+}
+
+// Return 1 if the first 16 bytes of file fn match the BGZF magic, 0 if they do not (or the file cannot be opened or is too short), -1 if closing the file fails.
+int bgzf_is_bgzf(const char *fn)
+{
+    uint8_t head[16];
+    int got;
+    hFILE *in = hopen(fn, "r");
+    if (in == 0) return 0;
+    got = hread(in, head, 16);
+    if ( hclose(in) < 0 ) return -1;
+    return (got == 16 && memcmp(g_magic, head, 16) == 0) ? 1 : 0;
+}
+
+// Read one uncompressed byte. Returns the byte (0..255), -1 at end of file, or -2 if the next block could not be read.
+int bgzf_getc(BGZF *fp)
+{
+    int ch;
+    if (fp->block_offset >= fp->block_length) {
+        if (bgzf_read_block(fp) != 0) return -2; /* error */
+        if (fp->block_length == 0) return -1; /* end-of-file */
+    }
+    ch = ((unsigned char*)fp->uncompressed_block)[fp->block_offset];
+    if (++fp->block_offset == fp->block_length) {
+        fp->block_address = htell(fp->fp);
+        fp->block_offset = fp->block_length = 0;
+    }
+    fp->uncompressed_address++;
+    return ch;
+}
+
+// kroundup32(x): modifies x in place.  It decrements x, ORs x with itself
+// shifted right by 1, 2, 4, 8 and 16 bits (setting every bit below the
+// highest set bit), then increments it again.  x is evaluated several
+// times, so it must be a modifiable lvalue without side effects.
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+// Reads bytes up to (and consuming) the next delim into str, spanning block
+// boundaries as needed.  The delimiter is not stored; when delim is '\n' a
+// trailing '\r' is stripped as well.  str->s is grown with realloc() when
+// required and is NUL-terminated on success.
+//
+// Returns the length of the line (0 for an empty line).  Returns -1 at
+// end-of-file or -2 on a read error when no bytes had been collected; if
+// some bytes were already collected they are returned as a line instead.
+// Returns -3 if the string buffer could not be grown; str->s then still
+// points to the previous, valid buffer.
+//
+// fp->uncompressed_address is advanced by the number of bytes consumed,
+// including the delimiter when one was found.
+int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
+{
+    int l, state = 0;
+    str->l = 0;
+    do {
+        unsigned char *buf;
+        if (fp->block_offset >= fp->block_length) {
+            if (bgzf_read_block(fp) != 0) { state = -2; break; }
+            if (fp->block_length == 0) { state = -1; break; }
+        }
+        buf = (unsigned char*)fp->uncompressed_block;
+        for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l);
+        if (l < fp->block_length) state = 1;
+        l -= fp->block_offset;
+        if (str->l + l + 1 >= str->m) {
+            // Grow through temporaries so that a failed realloc() neither
+            // leaks the old buffer nor leaves str->m out of sync with it.
+            size_t new_m = str->l + l + 2;
+            char *new_s;
+            kroundup32(new_m);
+            new_s = (char*)realloc(str->s, new_m);
+            if (new_s == NULL) { state = -3; break; }
+            str->s = new_s;
+            str->m = new_m;
+        }
+        memcpy(str->s + str->l, buf + fp->block_offset, l);
+        str->l += l;
+        fp->block_offset += l + 1;
+        if (fp->block_offset >= fp->block_length) {
+            fp->block_address = htell(fp->fp);
+            fp->block_offset = 0;
+            fp->block_length = 0;
+        }
+    } while (state == 0);
+    if (state == -3) {
+        // Bytes gathered in earlier iterations have been consumed from the
+        // stream; account for them and terminate the text if there is room.
+        fp->uncompressed_address += str->l;
+        if (str->s != NULL && str->l < str->m) str->s[str->l] = 0;
+        return state;
+    }
+    if (str->l == 0 && state < 0) return state;
+    // state == 1 means a delimiter was found and skipped; it is one more
+    // consumed byte that is not part of str->l.
+    fp->uncompressed_address += str->l + (state == 1 ? 1 : 0);
+    if ( delim=='\n' && str->l>0 && str->s[str->l-1]=='\r' ) str->l--;
+    str->s[str->l] = 0;
+    return str->l;
+}
+
+// Frees the index attached to fp, if there is one, then clears fp->idx and
+// the build-on-the-fly flag.  Does nothing when no index is attached.
+void bgzf_index_destroy(BGZF *fp)
+{
+    bgzidx_t *index = fp->idx;
+    if ( index != NULL )
+    {
+        free(index->offs);
+        free(index);
+        fp->idx = NULL;
+        fp->idx_build_otf = 0;
+    }
+}
+
+// Discards any existing index on fp and attaches a fresh zero-initialised
+// one, setting idx_build_otf so the index is built on the fly.
+//
+// Returns 0 on success, or -1 if the allocation fails (fp->idx is then
+// NULL and the flag stays cleared).
+int bgzf_index_build_init(BGZF *fp)
+{
+    bgzidx_t *fresh;
+
+    bgzf_index_destroy(fp);
+    fresh = (bgzidx_t*) calloc(1,sizeof(bgzidx_t));
+    fp->idx = fresh;
+    if ( fresh == NULL ) return -1;
+
+    fp->idx_build_otf = 1;
+    return 0;
+}
+
+// Appends one record to fp's index: the uncompressed address held in
+// fp->idx->ublock_addr paired with the current fp->block_address.  The
+// offs array is grown (capacity rounded up with kroundup32) when full.
+//
+// Returns 0 on success, or -1 if the array could not be grown; in that
+// case the index is left exactly as it was before the call.
+int bgzf_index_add_block(BGZF *fp)
+{
+    bgzidx_t *idx = fp->idx;
+
+    if ( idx->noffs + 1 > idx->moffs )
+    {
+        uint32_t new_moffs = idx->noffs + 1;
+        bgzidx1_t *new_offs;
+        kroundup32(new_moffs);
+        // realloc() into a temporary: assigning its result straight to
+        // idx->offs would leak the existing records on failure, and the
+        // counters must not advance unless the storage really exists.
+        new_offs = (bgzidx1_t*) realloc(idx->offs, new_moffs*sizeof(bgzidx1_t));
+        if ( !new_offs ) return -1;
+        idx->offs  = new_offs;
+        idx->moffs = new_moffs;
+    }
+    idx->offs[ idx->noffs ].uaddr = idx->ublock_addr;
+    idx->offs[ idx->noffs ].caddr = fp->block_address;
+    idx->noffs++;
+    return 0;
+}
+
+// Writes one 64-bit value to out, passing it through ed_swap_8p() first
+// when swap is non-zero.  Returns 0 on success and -1 on a short write.
+static int bgzf_idx_write_u64(FILE *out, uint64_t value, int swap)
+{
+    const void *src = swap ? (const void*)ed_swap_8p(&value) : (const void*)&value;
+    return fwrite(src, 1, sizeof(value), out) == sizeof(value) ? 0 : -1;
+}
+
+// Saves fp's index to the file named bname, or bname immediately followed
+// by suffix when suffix is non-NULL.  fp is flushed first.
+//
+// The file holds a 64-bit record count (noffs - 1) followed, for each
+// record from index 1 onwards, by its 64-bit caddr and then its 64-bit
+// uaddr.  Values are byte-swapped when fp->is_be is set.
+//
+// Returns 0 on success; -1 if flushing fp, allocating the file name,
+// opening the file, any write, or closing the file fails.
+int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix)
+{
+    if (bgzf_flush(fp) != 0) return -1;
+
+    assert(fp->idx);
+    char *tmp = NULL;
+    if ( suffix )
+    {
+        size_t blen = strlen(bname);
+        size_t slen = strlen(suffix);
+        tmp = (char*) malloc(blen + slen + 1);
+        if ( !tmp ) return -1;
+        memcpy(tmp,bname,blen);
+        memcpy(tmp+blen,suffix,slen+1);
+    }
+
+    FILE *idx = fopen(tmp?tmp:bname,"wb");
+    free(tmp);
+    if ( !idx ) return -1;
+
+    // Note that the index contains one extra record when indexing files opened
+    // for reading. The terminating record is not present when opened for writing.
+    // This is not a bug.
+
+    int i;
+    int swap = fp->is_be ? 1 : 0;
+    int failed = 0;
+    uint64_t count = fp->idx->noffs - 1;
+
+    if ( bgzf_idx_write_u64(idx, count, swap) != 0 ) failed = 1;
+    for (i=1; !failed && i<fp->idx->noffs; i++)
+    {
+        if ( bgzf_idx_write_u64(idx, fp->idx->offs[i].caddr, swap) != 0 ||
+             bgzf_idx_write_u64(idx, fp->idx->offs[i].uaddr, swap) != 0 ) failed = 1;
+    }
+    // fclose() flushes buffered data, so its result decides success as well.
+    if ( fclose(idx) != 0 ) failed = 1;
+    return failed ? -1 : 0;
+}
+
+
+// Reads one 64-bit value from in, converting it with ed_swap_8() when swap
+// is non-zero.  Returns 0 on success and -1 on a short read.
+static int bgzf_idx_read_u64(FILE *in, int swap, uint64_t *value)
+{
+    uint64_t raw;
+    if ( fread(&raw, 1, sizeof(raw), in) != sizeof(raw) ) return -1;
+    *value = swap ? ed_swap_8(raw) : raw;
+    return 0;
+}
+
+// Loads an index from the file named bname, or bname immediately followed
+// by suffix when suffix is non-NULL, and attaches it to fp.
+//
+// The file is expected to hold a 64-bit record count followed by that many
+// (caddr, uaddr) pairs of 64-bit values; a leading record with both
+// addresses zero is added in memory.  Values are byte-swapped when
+// fp->is_be is set.
+//
+// Returns 0 on success.  Returns -1 if the name cannot be allocated, the
+// file cannot be opened, the file is truncated, the record count is
+// implausibly large, or memory runs out.  On failure fp->idx is left
+// unchanged; on success any index previously attached to fp is freed.
+int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix)
+{
+    char *tmp = NULL;
+    if ( suffix )
+    {
+        size_t blen = strlen(bname);
+        size_t slen = strlen(suffix);
+        tmp = (char*) malloc(blen + slen + 1);
+        if ( !tmp ) return -1;
+        memcpy(tmp,bname,blen);
+        memcpy(tmp+blen,suffix,slen+1);
+    }
+
+    FILE *idx = fopen(tmp?tmp:bname,"rb");
+    free(tmp);
+    if ( !idx ) return -1;
+
+    bgzidx_t *loaded = NULL;
+    int swap = fp->is_be ? 1 : 0;
+    uint64_t count, i;
+
+    if ( bgzf_idx_read_u64(idx, swap, &count) != 0 ) goto fail;
+
+    // The count comes from the file, so bound it before using it as an
+    // allocation size: it must stay within the range of the int indices
+    // used to walk the index, and the byte size must not overflow size_t.
+    if ( count >= 0x7fffffff ) goto fail;
+    if ( count + 1 > ((size_t)-1) / sizeof(bgzidx1_t) ) goto fail;
+
+    loaded = (bgzidx_t*) calloc(1,sizeof(bgzidx_t));
+    if ( !loaded ) goto fail;
+    loaded->noffs = loaded->moffs = 1 + count;
+    loaded->offs  = (bgzidx1_t*) malloc((size_t)(count + 1)*sizeof(bgzidx1_t));
+    if ( !loaded->offs ) goto fail;
+    loaded->offs[0].caddr = loaded->offs[0].uaddr = 0;
+
+    for (i=1; i<=count; i++)
+    {
+        uint64_t caddr, uaddr;
+        if ( bgzf_idx_read_u64(idx, swap, &caddr) != 0 ) goto fail;
+        if ( bgzf_idx_read_u64(idx, swap, &uaddr) != 0 ) goto fail;
+        loaded->offs[i].caddr = caddr;
+        loaded->offs[i].uaddr = uaddr;
+    }
+    fclose(idx);
+
+    // Swap the new index in only once it is complete, releasing the old one.
+    if ( fp->idx )
+    {
+        free(fp->idx->offs);
+        free(fp->idx);
+    }
+    fp->idx = loaded;
+    return 0;
+
+fail:
+    if ( loaded )
+    {
+        free(loaded->offs);
+        free(loaded);
+    }
+    fclose(idx);
+    return -1;
+}
+
+// Positions fp at offset uoffset of the uncompressed data.
+//
+// When fp->is_compressed is zero the underlying stream is sought directly.
+// Otherwise the attached index is searched for the last record whose uaddr
+// is <= uoffset, that block is loaded, and the remaining distance becomes
+// the offset inside the block.
+//
+// Returns 0 on success.  Returns -1 with BGZF_ERR_MISUSE set when where is
+// not SEEK_SET or uoffset is negative.  Returns -1 with BGZF_ERR_IO set
+// when no usable index is attached or the underlying seek fails, and on
+// the uncompressed path when reading the block fails; a block-read failure
+// on the indexed path returns -1 without adding a flag here.
+int bgzf_useek(BGZF *fp, long uoffset, int where)
+{
+    // Only absolute, non-negative positions are meaningful here; reject
+    // anything else the same way bgzf_seek() rejects misuse.
+    if ( where != SEEK_SET || uoffset < 0 )
+    {
+        fp->errcode |= BGZF_ERR_MISUSE;
+        return -1;
+    }
+
+    if ( !fp->is_compressed )
+    {
+        if (hseek(fp->fp, uoffset, SEEK_SET) < 0)
+        {
+            fp->errcode |= BGZF_ERR_IO;
+            return -1;
+        }
+        fp->block_length = 0;  // indicates current block has not been loaded
+        fp->block_address = uoffset;
+        fp->block_offset = 0;
+        if ( bgzf_read_block(fp) < 0 )
+        {
+            fp->errcode |= BGZF_ERR_IO;
+            return -1;
+        }
+        fp->uncompressed_address = uoffset;
+        return 0;
+    }
+
+    if ( !fp->idx || fp->idx->noffs < 1 )
+    {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+
+    // binary search: on exit ilo is the first record with uaddr > uoffset
+    int ilo = 0, ihi = fp->idx->noffs - 1;
+    while ( ilo<=ihi )
+    {
+        int mid = ilo + (ihi - ilo)/2;
+        if ( uoffset < ((intptr_t)fp->idx->offs[mid].uaddr) ) ihi = mid - 1;
+        else ilo = mid + 1;
+    }
+    int i = ilo-1;
+    if ( i < 0 )
+    {
+        // every indexed block starts beyond uoffset, so there is no block to load
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+    if (hseek(fp->fp, fp->idx->offs[i].caddr, SEEK_SET) < 0)
+    {
+        fp->errcode |= BGZF_ERR_IO;
+        return -1;
+    }
+    fp->block_length = 0;  // indicates current block has not been loaded
+    fp->block_address = fp->idx->offs[i].caddr;
+    fp->block_offset = 0;
+    if ( bgzf_read_block(fp) < 0 ) return -1;
+    if ( uoffset - fp->idx->offs[i].uaddr > 0 )
+    {
+        fp->block_offset = uoffset - fp->idx->offs[i].uaddr;
+        assert( fp->block_offset <= fp->block_length );     // todo: skipped, unindexed, blocks
+    }
+    fp->uncompressed_address = uoffset;
+    return 0;
+}
+
+// Returns fp's position in the uncompressed data, as tracked in
+// fp->uncompressed_address (currently maintained only when reading).
+long bgzf_utell(BGZF *fp)
+{
+    const long upos = fp->uncompressed_address;
+    return upos;
+}
+
diff --git a/bgzf.h b/bgzf.h
new file mode 100644
index 0000000..acdb673
--- /dev/null
+++ b/bgzf.h
@@ -0,0 +1,313 @@
+/* The MIT License
+
+   Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+                 2011, 2012 Attractive Chaos <attractor at live.co.uk>
+   Copyright (C) 2009, 2013, 2014 Genome Research Ltd
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+/* The BGZF library was originally written by Bob Handsaker from the Broad
+ * Institute. It was later improved by the SAMtools developers. */
+
+#ifndef HTSLIB_BGZF_H
+#define HTSLIB_BGZF_H
+
+#include "plink_common.h"
+#include <sys/types.h>
+
+// Block size limits.
+#define BGZF_BLOCK_SIZE     0xff00 // make sure compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE
+#define BGZF_MAX_BLOCK_SIZE 0x10000
+
+// Bit flags that are OR'ed into the errcode field of struct BGZF.
+#define BGZF_ERR_ZLIB   1
+#define BGZF_ERR_HEADER 2
+#define BGZF_ERR_IO     4 // e.g. seeking the underlying stream failed
+#define BGZF_ERR_MISUSE 8 // e.g. bgzf_seek() on a write handle or without SEEK_SET
+
+struct hFILE;
+struct bgzf_mtaux_t;
+typedef struct __bgzidx_t bgzidx_t;
+
+// State of one open BGZF stream.
+struct BGZF {
+    // errcode:       OR of BGZF_ERR_* flags recorded by failing calls
+    // is_write:      non-zero when opened for writing (bgzf_seek() rejects such handles)
+    // is_be:         when set, index values are byte-swapped on load and dump
+    // is_compressed: when zero, bgzf_useek() seeks the underlying stream directly
+    // compress_level, is_gzip: presumably the zlib level and the gzip-vs-BGZF
+    //                choice made in the open mode string -- TODO confirm in bgzf.c
+    int errcode:16, is_write:2, is_be:2, compress_level:9, is_compressed:2, is_gzip:1;
+    int cache_size; // set by bgzf_set_cache_size()
+    // block_length: bytes available in uncompressed_block; 0 means the block
+    //               has not been loaded.  block_offset: position within it.
+    int block_length, block_offset;
+    // block_address: offset of the current block in the underlying stream.
+    // uncompressed_address: position in the uncompressed data, currently
+    //               maintained only when reading.
+    int64_t block_address, uncompressed_address;
+    void *uncompressed_block, *compressed_block;
+    void *cache; // a pointer to a hash table
+    struct hFILE *fp; // actual file handle
+    struct bgzf_mtaux_t *mt; // only used for multi-threading
+    bgzidx_t *idx;      // BGZF index
+    int idx_build_otf;  // build index on the fly, set by bgzf_index_build_init()
+    z_stream *gz_stream;// for gzip-compressed files
+};
+#ifndef HTS_BGZF_TYPEDEF
+typedef struct BGZF BGZF;
+#define HTS_BGZF_TYPEDEF
+#endif
+
+// Growable string buffer, as filled in by bgzf_getline().
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+    // l: number of characters stored (bgzf_getline() writes a NUL at s[l]);
+    // m: allocated size of s in bytes
+    size_t l, m;
+    char *s; // buffer, grown with realloc()
+} kstring_t;
+#endif
+
+// #ifdef __cplusplus
+// extern "C" {
+// #endif
+
+    /******************
+     * Basic routines *
+     ******************/
+
+    /**
+     * Open an existing file descriptor for reading or writing.
+     *
+     * @param fd    file descriptor
+     * @param mode  mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for
+     *              writing, 'a' for appending, 'g' for gzip rather than BGZF
+     *              compression (with 'w' only), and digit specifies the zlib
+     *              compression level. 
+     *              Note that there is a distinction between 'u' and '0': the
+     *              first yields plain uncompressed output whereas the latter
+     *              outputs uncompressed data wrapped in the zlib format.
+     * @return      BGZF file handler; 0 on error
+     */
+    BGZF* bgzf_dopen(int fd, const char *mode);
+
+    #define bgzf_fdopen(fd, mode) bgzf_dopen((fd), (mode)) // for backward compatibility
+
+    /**
+     * Open the specified file for reading or writing.
+     */
+    BGZF* bgzf_open(const char* path, const char *mode);
+
+    /**
+     * Open an existing hFILE stream for reading or writing.
+     */
+    BGZF* bgzf_hopen(struct hFILE *fp, const char *mode);
+
+    /**
+     * Close the BGZF and free all associated resources.
+     *
+     * @param fp    BGZF file handler
+     * @return      0 on success and -1 on error
+     */
+    int bgzf_close(BGZF *fp);
+
+    /**
+     * Read up to _length_ bytes from the file storing into _data_.
+     *
+     * @param fp     BGZF file handler
+     * @param data   data array to read into
+     * @param length size of data to read
+     * @return       number of bytes actually read; 0 on end-of-file and -1 on error
+     */
+    ssize_t bgzf_read(BGZF *fp, void *data, size_t length);
+
+    /**
+     * Write _length_ bytes from _data_ to the file.  If no I/O errors occur,
+     * the complete _length_ bytes will be written (or queued for writing).
+     *
+     * @param fp     BGZF file handler
+     * @param data   data array to write
+     * @param length size of data to write
+     * @return       number of bytes written (i.e., _length_); negative on error
+     */
+    ssize_t bgzf_write(BGZF *fp, const void *data, size_t length);
+
+    /**
+     * Read up to _length_ bytes directly from the underlying stream without
+     * decompressing.  Bypasses BGZF blocking, so must be used with care in
+     * specialised circumstances only.
+     *
+     * @param fp     BGZF file handler
+     * @param data   data array to read into
+     * @param length number of raw bytes to read
+     * @return       number of bytes actually read; 0 on end-of-file and -1 on error
+     */
+    ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length);
+
+    /**
+     * Write _length_ bytes directly to the underlying stream without
+     * compressing.  Bypasses BGZF blocking, so must be used with care
+     * in specialised circumstances only.
+     *
+     * @param fp     BGZF file handler
+     * @param data   data array to write
+     * @param length number of raw bytes to write
+     * @return       number of bytes actually written; -1 on error
+     */
+    ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length);
+
+    /**
+     * Write the data in the buffer to the file.
+     */
+    int bgzf_flush(BGZF *fp);
+
+    /**
+     * Return a virtual file pointer to the current location in the file.
+     * No interpretation of the value should be made, other than a subsequent
+     * call to bgzf_seek can be used to position the file at the same point.
+     * Return value is non-negative on success.
+     */
+    #define bgzf_tell(fp) (((fp)->block_address << 16) | ((fp)->block_offset & 0xFFFF))
+
+    /**
+     * Set the file to read from the location specified by _pos_.
+     *
+     * @param fp     BGZF file handler
+     * @param pos    virtual file offset returned by bgzf_tell()
+     * @param whence must be SEEK_SET
+     * @return       0 on success and -1 on error
+     */
+    int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence);
+
+    /**
+     * Check if the BGZF end-of-file (EOF) marker is present
+     *
+     * @param fp    BGZF file handler opened for reading
+     * @return      1 if the EOF marker is present and correct;
+     *              2 if it can't be checked, e.g., because fp isn't seekable;
+     *              0 if the EOF marker is absent;
+     *              -1 (with errno set) on error
+     */
+    int bgzf_check_EOF(BGZF *fp);
+
+    /**
+     * Check if a file is in the BGZF format
+     *
+     * @param fn    file name
+     * @return      1 if _fn_ is BGZF; 0 if not or on I/O error
+     */
+     int bgzf_is_bgzf(const char *fn);
+
+    /*********************
+     * Advanced routines *
+     *********************/
+
+    /**
+     * Set the cache size. Only effective when compiled with -DBGZF_CACHE.
+     *
+     * @param fp    BGZF file handler
+     * @param size  size of cache in bytes; 0 to disable caching (default)
+     */
+    void bgzf_set_cache_size(BGZF *fp, int size);
+
+    /**
+     * Flush the file if the remaining buffer size is smaller than _size_
+     * @return      0 if flushing succeeded or was not needed; negative on error
+     */
+    int bgzf_flush_try(BGZF *fp, ssize_t size);
+
+    /**
+     * Read one byte from a BGZF file. It is faster than bgzf_read()
+     * @param fp     BGZF file handler
+     * @return       byte read; -1 on end-of-file; -2 on error
+     */
+    int bgzf_getc(BGZF *fp);
+
+    /**
+     * Read one line from a BGZF file. It is faster than bgzf_getc()
+     *
+     * @param fp     BGZF file handler
+     * @param delim  delimiter
+     * @param str    string to write to; must be initialized
+     * @return       length of the string (0 for an empty line); -1 on end-of-file; <= -2 on error
+     */
+    int bgzf_getline(BGZF *fp, int delim, kstring_t *str);
+
+    /**
+     * Read the next BGZF block.
+     */
+    int bgzf_read_block(BGZF *fp);
+
+    /**
+     * Enable multi-threading (only effective on writing and when the
+     * library was compiled with -DBGZF_MT)
+     *
+     * @param fp          BGZF file handler; must be opened for writing
+     * @param n_threads   #threads used for writing
+     * @param n_sub_blks  #blocks processed by each thread; a value 64-256 is recommended
+     */
+    int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks);
+
+
+    /*******************
+     * bgzidx routines *
+     *******************/
+
+    /**
+     *  Position BGZF at the uncompressed offset
+     *
+     *  @param fp           BGZF file handler; must be opened for reading
+     *  @param uoffset      file offset in the uncompressed data
+     *  @param where        must be SEEK_SET (the only mode supported at present)
+     *
+     *  Returns 0 on success and -1 on error.
+     */
+    int bgzf_useek(BGZF *fp, long uoffset, int where);
+
+    /**
+     *  Position in uncompressed BGZF
+     *
+     *  @param fp           BGZF file handler; must be opened for reading
+     *
+     *  Returns the current offset on success and -1 on error.
+     */
+    long bgzf_utell(BGZF *fp);
+
+    /**
+     * Tell BGZF to build index while compressing.
+     *
+     * @param fp          BGZF file handler; can be opened for reading or writing.
+     *
+     * Returns 0 on success and -1 on error.
+     */
+    int bgzf_index_build_init(BGZF *fp);
+
+    /**
+     * Load BGZF index
+     *
+     * @param fp          BGZF file handler
+     * @param bname       base name
+     * @param suffix      suffix to add to bname (can be NULL)
+     *
+     * Returns 0 on success and -1 on error.
+     */
+    int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix);
+
+    /**
+     * Save BGZF index
+     *
+     * @param fp          BGZF file handler
+     * @param bname       base name
+     * @param suffix      suffix to add to bname (can be NULL)
+     *
+     * Returns 0 on success and -1 on error.
+     */
+    int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix);
+
+// #ifdef __cplusplus
+// }
+// #endif
+
+#endif
diff --git a/hfile.c b/hfile.c
new file mode 100644
index 0000000..9ab1ea9
--- /dev/null
+++ b/hfile.c
@@ -0,0 +1,578 @@
+/*  hfile.c -- buffered low-level input/output streams.
+
+    Copyright (C) 2013-2015 Genome Research Ltd.
+
+    Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#include "plink_common.h"
+#include <errno.h>
+
+#include "hfile.h"
+#include "hfile_internal.h"
+
+/* hFILE fields are used as follows:
+
+   char *buffer;     // Pointer to the start of the I/O buffer
+   char *begin;      // First not-yet-read character / unused position
+   char *end;        // First unfilled/unfillable position
+   char *limit;      // Pointer to the first position past the buffer
+
+   const hFILE_backend *backend;  // Methods to refill/flush I/O buffer
+
+   off_t offset;     // Offset within the stream of buffer position 0
+   int at_eof:1;     // For reading, whether EOF has been seen
+   int has_errno;    // Error number from the last failure on this stream
+
+For reading, begin is the first unread character in the buffer and end is the
+first unfilled position:
+
+   -----------ABCDEFGHIJKLMNO---------------
+   ^buffer    ^begin         ^end           ^limit
+
+For writing, begin is the first unused position and end is unused so remains
+equal to buffer:
+
+   ABCDEFGHIJKLMNOPQRSTUVWXYZ---------------
+   ^buffer                   ^begin         ^limit
+   ^end
+
+Thus if begin > end then there is a non-empty write buffer, if begin < end
+then there is a non-empty read buffer, and if begin == end then both buffers
+are empty.  In all cases, the stream's file position indicator corresponds
+to the position pointed to by begin.  */
+
+/* Allocates an hFILE-derived object of struct_size bytes together with its
+   I/O buffer, and resets the buffer pointers, offset, EOF flag and error
+   number.  A capacity of 0 selects 32768 bytes; for modes containing 'r'
+   the capacity is capped at 32768.  The backend is not set here.
+
+   Returns the new object, or NULL if either allocation failed.  */
+hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity)
+{
+    hFILE *fp = (hFILE *) malloc(struct_size);
+
+    if (fp != NULL) {
+        size_t bufsize = (capacity == 0)? 32768 : capacity;
+        // FIXME For now, clamp input buffer sizes so mpileup doesn't eat memory
+        if (bufsize > 32768 && strchr(mode, 'r') != NULL) bufsize = 32768;
+
+        fp->buffer = (char *) malloc(bufsize);
+        if (fp->buffer != NULL) {
+            fp->limit = fp->buffer + bufsize;
+            fp->begin = fp->buffer;
+            fp->end = fp->buffer;
+
+            fp->has_errno = 0;
+            fp->at_eof = 0;
+            fp->offset = 0;
+            return fp;
+        }
+    }
+
+    // Either allocation failed; hfile_destroy() copes with a NULL fp and
+    // with a NULL buffer.
+    hfile_destroy(fp);
+    return NULL;
+}
+
+/* Frees fp's buffer and then fp itself, leaving errno as it was on entry.
+   A NULL fp is accepted.  */
+void hfile_destroy(hFILE *fp)
+{
+    const int saved_errno = errno;
+
+    if (fp != NULL) {
+        free(fp->buffer);
+    }
+    free(fp);
+
+    errno = saved_errno;
+}
+
+/* Non-zero when begin lies beyond end, i.e. there is buffered data that
+   has not been written out yet.  */
+static inline int writebuffer_is_nonempty(hFILE *fp)
+{
+    return fp->end < fp->begin;
+}
+
+/* Tops up the read buffer with a single backend read.  Unread characters
+   are first moved to the start of the buffer.  Returns how many characters
+   were added (0 if the buffer is already full or EOF has been seen), or a
+   negative value if the backend reported an error, in which case errno is
+   saved in has_errno.  */
+static ssize_t refill_buffer(hFILE *fp)
+{
+    ssize_t nread = 0;
+
+    // Slide the unread tail down so the free space is contiguous
+    if (fp->begin > fp->buffer) {
+        const size_t unread = fp->end - fp->begin;
+        fp->offset += fp->begin - fp->buffer;
+        memmove(fp->buffer, fp->begin, unread);
+        fp->begin = fp->buffer;
+        fp->end = fp->buffer + unread;
+    }
+
+    // Fill fp->[end,limit) unless there is nothing to gain from trying
+    if (!fp->at_eof && fp->end != fp->limit) {
+        nread = fp->backend->read(fp, fp->end, fp->limit - fp->end);
+        if (nread < 0) { fp->has_errno = errno; return nread; }
+        if (nread == 0) fp->at_eof = 1;
+    }
+
+    fp->end += nread;
+    return nread;
+}
+
+/* Slow path of hgetc(), used when the buffer is empty: refills it and
+   returns the next character, or EOF if nothing could be read.  */
+int hgetc2(hFILE *fp)
+{
+    if (refill_buffer(fp) <= 0) return EOF;
+    return (unsigned char) *(fp->begin++);
+}
+
+/* Copies up to nbytes upcoming characters into buffer without consuming
+   them, refilling the read buffer until enough are available or a refill
+   adds nothing.  Returns the number of characters copied, or a negative
+   value if a refill failed.  */
+ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes)
+{
+    size_t avail = fp->end - fp->begin;
+
+    while (avail < nbytes) {
+        ssize_t got = refill_buffer(fp);
+        if (got < 0) return got;
+        if (got == 0) break;
+        avail += got;
+    }
+
+    const size_t ncopy = (avail > nbytes)? nbytes : avail;
+    memcpy(buffer, fp->begin, ncopy);
+    return ncopy;
+}
+
+/* Slow path of hread(): entered with an empty buffer after nread of the
+   requested nbytes have already been stored in destv.  Returns the total
+   number of bytes delivered (including those nread), or a negative value
+   on a read error.  */
+ssize_t hread2(hFILE *fp, void *destv, size_t nbytes, size_t nread)
+{
+    const size_t capacity = fp->limit - fp->buffer;
+    char *out = (char *) destv + nread;
+    size_t wanted = nbytes - nread;
+    size_t total = nread;
+
+    // While the outstanding request is large relative to the buffer, let
+    // the backend read straight into the caller's memory
+    while (!fp->at_eof && wanted * 2 >= capacity) {
+        ssize_t got = fp->backend->read(fp, out, wanted);
+        if (got < 0) { fp->has_errno = errno; return got; }
+        if (got == 0) fp->at_eof = 1;
+        fp->offset += got;
+        out += got;
+        wanted -= got;
+        total += got;
+    }
+
+    // Serve whatever is left through the buffer
+    while (!fp->at_eof && wanted > 0) {
+        size_t chunk;
+        ssize_t got = refill_buffer(fp);
+        if (got < 0) return got;
+
+        chunk = fp->end - fp->begin;
+        if (chunk > wanted) chunk = wanted;
+        memcpy(out, fp->begin, chunk);
+        fp->begin += chunk;
+        out += chunk;
+        wanted -= chunk;
+        total += chunk;
+    }
+
+    return total;
+}
+
+/* Pushes the buffered output, fp->[buffer,begin), through the backend,
+   repeating the write until everything has been accepted.  Returns 0 on
+   success or the backend's negative result on failure (errno is saved in
+   has_errno).  */
+static ssize_t flush_buffer(hFILE *fp)
+{
+    const char *cursor;
+
+    for (cursor = fp->buffer; cursor < fp->begin; ) {
+        ssize_t written = fp->backend->write(fp, cursor, fp->begin - cursor);
+        if (written < 0) { fp->has_errno = errno; return written; }
+        fp->offset += written;
+        cursor += written;
+    }
+
+    fp->begin = fp->buffer;  // Buffer is empty again
+    return 0;
+}
+
+/* Writes out buffered data and then asks the backend to flush.  Returns 0
+   on success or EOF if either step fails.  */
+int hflush(hFILE *fp)
+{
+    if (flush_buffer(fp) < 0) return EOF;
+    if (fp->backend->flush(fp) >= 0) return 0;
+
+    fp->has_errno = errno;
+    return EOF;
+}
+
+/* Slow path of hputc(), used when the buffer is full: flushes it and then
+   stores c.  Returns c, or EOF if the flush failed.  */
+int hputc2(int c, hFILE *fp)
+{
+    if (flush_buffer(fp) < 0) return EOF;
+
+    *fp->begin = c;
+    fp->begin++;
+    return c;
+}
+
+/* Slow path of hwrite() and hputs2(): entered with a full buffer after
+   ncopied of the totalbytes source bytes have already been buffered.
+   Returns totalbytes on success or a negative value on a write error.  */
+ssize_t hwrite2(hFILE *fp, const void *srcv, size_t totalbytes, size_t ncopied)
+{
+    const size_t capacity = fp->limit - fp->buffer;
+    const char *pending = (const char *) srcv + ncopied;
+    size_t left = totalbytes - ncopied;
+
+    ssize_t rc = flush_buffer(fp);
+    if (rc < 0) return rc;
+
+    // While the remainder is large relative to the buffer, hand it to the
+    // backend directly from the caller's memory
+    while (left * 2 >= capacity) {
+        ssize_t written = fp->backend->write(fp, pending, left);
+        if (written < 0) { fp->has_errno = errno; return written; }
+        fp->offset += written;
+        pending += written;
+        left -= written;
+    }
+
+    // The small tail simply goes into the (now empty) buffer
+    memcpy(fp->begin, pending, left);
+    fp->begin += left;
+
+    return totalbytes;
+}
+
+/* Slow path of hputs(), used when the buffer is full.  Returns 0 on
+   success or EOF if writing failed.  */
+int hputs2(const char *text, size_t totalbytes, size_t ncopied, hFILE *fp)
+{
+    if (hwrite2(fp, text, totalbytes, ncopied) < 0) return EOF;
+    return 0;
+}
+
+/* Moves the stream position.  Pending output is flushed first; otherwise,
+   for SEEK_CUR, the offset is adjusted by the amount of read-ahead sitting
+   in the buffer, because the backend's position is at end rather than at
+   begin.  On success the read buffer is discarded and the EOF flag is
+   cleared.  Returns the new position, or a negative value on failure.  */
+off_t hseek(hFILE *fp, off_t offset, int whence)
+{
+    off_t newpos;
+
+    if (writebuffer_is_nonempty(fp)) {
+        int rc = flush_buffer(fp);
+        if (rc < 0) return rc;
+    }
+    else if (whence == SEEK_CUR) {
+        offset -= fp->end - fp->begin;
+    }
+
+    newpos = fp->backend->seek(fp, offset, whence);
+    if (newpos < 0) { fp->has_errno = errno; return newpos; }
+
+    // The seek worked, so whatever was buffered for reading is stale
+    fp->at_eof = 0;
+    fp->begin = fp->buffer;
+    fp->end = fp->buffer;
+    fp->offset = newpos;
+    return newpos;
+}
+
+/* Flushes pending output, closes the backend and frees fp.  Returns 0 if
+   no error was recorded on the stream and both steps succeeded; otherwise
+   sets errno to the most recent error and returns EOF.  */
+int hclose(hFILE *fp)
+{
+    int err_code = fp->has_errno;
+
+    if (writebuffer_is_nonempty(fp)) {
+        if (hflush(fp) < 0) err_code = fp->has_errno;
+    }
+    if (fp->backend->close(fp) < 0) err_code = errno;
+    hfile_destroy(fp);
+
+    if (err_code == 0) return 0;
+
+    errno = err_code;
+    return EOF;
+}
+
+/* Closes the backend and frees fp without flushing, ignoring any error
+   from the close and leaving errno as it was on entry.  */
+void hclose_abruptly(hFILE *fp)
+{
+    const int saved_errno = errno;
+
+    (void) fp->backend->close(fp);  /* Ignore subsequent errors */
+    hfile_destroy(fp);
+
+    errno = saved_errno;
+}
+
+
+/***************************
+ * File descriptor backend *
+ ***************************/
+
+// #include <sys/socket.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+// #ifdef _WIN32
+// #define HAVE_CLOSESOCKET
+// #endif
+
+/* For Unix, it doesn't matter whether a file descriptor is a socket.
+   However Windows insists on send()/recv() and its own closesocket()
+   being used when fd happens to be a socket.  */
+
+/* hFILE specialisation for the file descriptor backend.  base comes first
+   so that a pointer to this struct can be cast to and from hFILE *.  */
+typedef struct {
+    hFILE base;
+    int fd;  // descriptor passed to read(), write(), lseek() and close()
+  // int is_socket:1;
+} hFILE_fd;
+
+/* Backend read: calls read() on the descriptor, retrying for as long as
+   the call fails with EINTR.  Returns read()'s result.  (The socket recv()
+   variant is disabled in this build.)  */
+static ssize_t fd_read(hFILE *fpv, void *buffer, size_t nbytes)
+{
+    const int fd = ((hFILE_fd *) fpv)->fd;
+
+    for (;;) {
+        ssize_t n = read(fd, buffer, nbytes);
+        if (n >= 0 || errno != EINTR) return n;
+    }
+}
+
+/* Backend write: calls write() on the descriptor, retrying for as long as
+   the call fails with EINTR.  Returns write()'s result.  (The socket
+   send() variant is disabled in this build.)  */
+static ssize_t fd_write(hFILE *fpv, const void *buffer, size_t nbytes)
+{
+    const int fd = ((hFILE_fd *) fpv)->fd;
+
+    for (;;) {
+        ssize_t n = write(fd, buffer, nbytes);
+        if (n >= 0 || errno != EINTR) return n;
+    }
+}
+
+/* Backend seek: forwards to lseek() on the descriptor.  */
+static off_t fd_seek(hFILE *fpv, off_t offset, int whence)
+{
+    return lseek(((hFILE_fd *) fpv)->fd, offset, whence);
+}
+
+/* Backend flush: asks the operating system to commit the descriptor's
+   data.  On Windows this uses FlushFileBuffers(), mapping failures to
+   EBADF, EINVAL or EIO; elsewhere it uses fsync(), retrying on EINTR and
+   treating EINVAL and ENOTSUP as success.  Returns 0 on success and a
+   negative value on failure.  */
+static int fd_flush(hFILE *fpv)
+{
+    hFILE_fd *fp = (hFILE_fd *) fpv;
+#ifdef _WIN32
+    // See the patch at
+    // https://lists.gnu.org/archive/html/bug-gnulib/2008-10/msg00004.html .
+    HANDLE hh = (HANDLE)_get_osfhandle(fp->fd);
+    if (hh == INVALID_HANDLE_VALUE) {
+        errno = EBADF;
+        return -1;
+    }
+    if (FlushFileBuffers(hh)) return 0;
+
+    if (GetLastError() == ERROR_INVALID_HANDLE) errno = EINVAL;
+    else errno = EIO;
+    return -1;
+#else
+    for (;;) {
+        int rc = fsync(fp->fd);
+        if (rc >= 0) return rc;
+        // Ignore invalid-for-fsync(2) errors due to being, e.g., a pipe,
+        // and operation-not-supported errors (Mac OS X)
+        if (errno == EINVAL || errno == ENOTSUP) return 0;
+        if (errno != EINTR) return rc;
+    }
+#endif
+}
+
+/* Backend close: closes the descriptor, retrying for as long as the call
+   fails with EINTR.  Returns the result of the final close attempt.  */
+static int fd_close(hFILE *fpv)
+{
+    hFILE_fd *fp = (hFILE_fd *) fpv;
+
+    for (;;) {
+#ifdef HAVE_CLOSESOCKET
+        const int rc = fp->is_socket? closesocket(fp->fd) : close(fp->fd);
+#else
+        const int rc = close(fp->fd);
+#endif
+        if (rc >= 0 || errno != EINTR) return rc;
+    }
+}
+
+/* Method table installed as base.backend by hopen_fd() and hdopen().  */
+static const struct hFILE_backend fd_backend =
+{
+    fd_read, fd_write, fd_seek, fd_flush, fd_close
+};
+
+/* Suggests a buffer size for fd: 0 if fstat() fails, otherwise 512 on
+   Windows and the st_blksize reported by fstat() elsewhere.  */
+static size_t blksize(int fd)
+{
+    struct stat st;
+
+    if (fstat(fd, &st) == 0) {
+#ifdef _WIN32
+        return 512;
+#else
+        return st.st_blksize;
+#endif
+    }
+    return 0;
+}
+
+/* Opens filename with the flags derived from mode (permission bits 0666)
+   and wraps the descriptor in a new hFILE using the fd backend.  Returns
+   NULL if either step fails; the descriptor is closed again in that case
+   and errno keeps the value set by the failing call.  */
+static hFILE *hopen_fd(const char *filename, const char *mode)
+{
+    const int fd = open(filename, hfile_oflags(mode), 0666);
+    if (fd < 0) return NULL;
+
+    hFILE_fd *fp = (hFILE_fd *) hfile_init(sizeof (hFILE_fd), mode, blksize(fd));
+    if (fp == NULL) {
+        // Do not let close() disturb the errno describing the failure
+        const int saved_errno = errno;
+        (void) close(fd);
+        errno = saved_errno;
+        return NULL;
+    }
+
+    fp->base.backend = &fd_backend;
+    fp->fd = fd;
+    return &fp->base;
+}
+
+/* Wraps an already-open descriptor in a new hFILE using the fd backend.
+   Returns NULL if the hFILE cannot be allocated; fd is left open.  */
+hFILE *hdopen(int fd, const char *mode)
+{
+    hFILE *base = hfile_init(sizeof (hFILE_fd), mode, blksize(fd));
+    if (base == NULL) return NULL;
+
+    hFILE_fd *wrapper = (hFILE_fd *) base;
+    wrapper->fd = fd;
+    wrapper->base.backend = &fd_backend;
+    return &wrapper->base;
+}
+
+/* Opens standard input when mode contains 'r', standard output otherwise.  */
+static hFILE *hopen_fd_stdinout(const char *mode)
+{
+    // TODO Set binary mode (for Windows)
+    if (strchr(mode, 'r') != NULL) return hdopen(STDIN_FILENO, mode);
+    return hdopen(STDOUT_FILENO, mode);
+}
+
+/* Translates an fopen-style mode string into open(2) flags.  Characters
+   are processed left to right: 'r' selects O_RDONLY, 'w' selects O_WRONLY
+   and adds O_CREAT|O_TRUNC, 'a' selects O_WRONLY and adds
+   O_CREAT|O_APPEND, '+' selects O_RDWR; other characters are ignored.
+   O_BINARY is added where the platform defines it.  */
+int hfile_oflags(const char *mode)
+{
+    int rw_mode = 0, extra = 0;
+    const char *p = mode;
+
+    while (*p != '\0') {
+        const char ch = *p++;
+        if (ch == 'r') {
+            rw_mode = O_RDONLY;
+        }
+        else if (ch == 'w') {
+            rw_mode = O_WRONLY;
+            extra |= O_CREAT | O_TRUNC;
+        }
+        else if (ch == 'a') {
+            rw_mode = O_WRONLY;
+            extra |= O_CREAT | O_APPEND;
+        }
+        else if (ch == '+') {
+            rw_mode = O_RDWR;
+        }
+    }
+
+#ifdef O_BINARY
+    extra |= O_BINARY;
+#endif
+
+    return rw_mode | extra;
+}
+
+
+/*********************
+ * In-memory backend *
+ *********************/
+
+/* hFILE specialisation for the in-memory backend.  */
+typedef struct {
+    hFILE base;
+    const char *buffer;  // the data served by mem_read()
+    size_t length, pos;  // total size of buffer, and the current read position in it
+} hFILE_mem;
+
+/* Backend read: copies up to nbytes from the current position of the
+   in-memory data and advances the position.  Returns the number of bytes
+   copied, which is 0 once the end of the data has been reached.  */
+static ssize_t mem_read(hFILE *fpv, void *buffer, size_t nbytes)
+{
+    hFILE_mem *fp = (hFILE_mem *) fpv;
+    const size_t remaining = fp->length - fp->pos;
+    const size_t ncopy = (nbytes > remaining)? remaining : nbytes;
+
+    memcpy(buffer, fp->buffer + fp->pos, ncopy);
+    fp->pos += ncopy;
+    return ncopy;
+}
+
+/* Backend seek for in-memory data.  The target is the origin selected by
+   whence plus offset and must lie within [0, length].  Returns the new
+   position, or -1 with errno set to EINVAL for an unknown whence or an
+   out-of-range target.  */
+static off_t mem_seek(hFILE *fpv, off_t offset, int whence)
+{
+    hFILE_mem *fp = (hFILE_mem *) fpv;
+    size_t base;
+    int out_of_range;
+
+    if (whence == SEEK_SET) base = 0;
+    else if (whence == SEEK_CUR) base = fp->pos;
+    else if (whence == SEEK_END) base = fp->length;
+    else { errno = EINVAL; return -1; }
+
+    if (offset < 0) {
+        // Moving backwards: cannot go further back than the origin itself
+        const size_t distance = -offset;
+        out_of_range = distance > base;
+    }
+    else {
+        // Moving forwards: cannot go past the end of the data
+        const size_t distance = offset;
+        out_of_range = distance > fp->length - base;
+    }
+
+    if (out_of_range) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    fp->pos = base + offset;
+    return fp->pos;
+}
+
+/* Backend close for in-memory data: nothing to do, always returns 0.  */
+static int mem_close(hFILE *fpv)
+{
+    (void) fpv;
+    return 0;
+}
+
+/* Method table for the in-memory backend.  The two NULL entries sit in
+   the positions that fd_backend fills with fd_write and fd_flush, so this
+   backend presumably provides no write or flush method -- TODO confirm
+   against the struct hFILE_backend declaration.  */
+static const struct hFILE_backend mem_backend =
+{
+    mem_read, NULL, mem_seek, NULL, mem_close
+};
+
+/*
+static hFILE *hopen_mem(const char *data, const char *mode)
+{
+    // TODO Implement write modes, which will require memory allocation
+    if (strchr(mode, 'r') == NULL) { errno = EINVAL; return NULL; }
+
+    hFILE_mem *fp = (hFILE_mem *) hfile_init(sizeof (hFILE_mem), mode, 0);
+    if (fp == NULL) return NULL;
+
+    fp->buffer = data;
+    fp->length = strlen(data);
+    fp->pos = 0;
+    fp->base.backend = &mem_backend;
+    return &fp->base;
+}
+*/
+
+
+/******************************
+ * hopen() backend dispatcher *
+ ******************************/
+
+/* Opens fname as a stream: the name "-" selects standard input or output
+   according to mode, anything else is opened as an ordinary file.  The
+   network ("http://", "ftp://"), iRODS ("irods:") and in-memory ("data:")
+   dispatch branches are commented out in this copy.  */
+hFILE *hopen(const char *fname, const char *mode)
+{
+    if (strcmp(fname, "-") != 0) return hopen_fd(fname, mode);
+    return hopen_fd_stdinout(mode);
+}
+
+/*
+int hisremote(const char *fname)
+{
+    // FIXME Make a new backend entry to return this
+    if (strncmp(fname, "http://", 7) == 0 ||
+        strncmp(fname, "https://", 8) == 0 ||
+        strncmp(fname, "ftp://", 6) == 0) return 1;
+#ifdef HAVE_IRODS
+    else if (strncmp(fname, "irods:", 6) == 0) return 1;
+#endif
+    else return 0;
+}
+*/
diff --git a/hfile.h b/hfile.h
new file mode 100644
index 0000000..f8b59f3
--- /dev/null
+++ b/hfile.h
@@ -0,0 +1,212 @@
+/*  hfile.h -- buffered low-level input/output streams.
+
+    Copyright (C) 2013-2015 Genome Research Ltd.
+
+    Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#ifndef HTSLIB_HFILE_H
+#define HTSLIB_HFILE_H
+
+#include <string.h>
+
+#include <sys/types.h>
+
+#include "hts_defs.h"
+
+// #ifdef __cplusplus
+// extern "C" {
+// #endif
+
+/* These fields are declared here solely for the benefit of the inline functions
+   below.  They may change in future releases.  User code should not use them
+   directly; you should imagine that hFILE is an opaque incomplete type.  */
+struct hFILE_backend;
+typedef struct hFILE {
+    char *buffer, *begin, *end, *limit;  /* begin is the cursor: reads take from [begin,end), writes fill [begin,limit) */
+    const struct hFILE_backend *backend; /* backend method table */
+    off_t offset;       /* stream offset of the start of buffer; htell() adds begin - buffer */
+    unsigned at_eof:1;  /* unsigned so a set flag reads back as 1; a plain-int 1-bit field may hold only 0 and -1 */
+    int has_errno;      /* 0, or the error value reported by herrno() */
+} hFILE;
+
+/*!
+  @abstract  Open the named file or URL as a stream
+  @return    An hFILE pointer, or NULL (with errno set) if an error occurred.
+*/
+hFILE *hopen(const char *filename, const char *mode) HTS_RESULT_USED;
+
+/*!
+  @abstract  Associate a stream with an existing open file descriptor
+  @return    An hFILE pointer, or NULL (with errno set) if an error occurred.
+  @notes     For socket descriptors (on Windows), mode should contain 's'.
+*/
+hFILE *hdopen(int fd, const char *mode) HTS_RESULT_USED;
+
+/*!
+  @abstract  Report whether the file name or URL denotes remote storage
+  @return    0 if local, 1 if remote.
+  @notes     "Remote" means involving e.g. explicit network access, with the
+    implication that callers may wish to cache such files' contents locally.
+*/
+// int hisremote(const char *filename) HTS_RESULT_USED;
+
+/*!
+  @abstract  Flush (for output streams) and close the stream
+  @return    0 if successful, or EOF (with errno set) if an error occurred.
+*/
+int hclose(hFILE *fp) HTS_RESULT_USED;
+
+/*!
+  @abstract  Close the stream, without flushing or propagating errors
+  @notes     For use while cleaning up after an error only.  Preserves errno.
+*/
+void hclose_abruptly(hFILE *fp);
+
+/*!
+  @abstract  Return the stream's error indicator
+  @return    Non-zero (in fact, an errno value) if an error has occurred.
+  @notes     This would be called herror() and return true/false to parallel
+    ferror(3), but a networking-related herror(3) function already exists.  */
+static inline int herrno(hFILE *fp)
+{
+    return fp->has_errno;  /* plain read of the stored indicator; hclearerr() resets it to 0 */
+}
+
+/*!
+  @abstract  Clear the stream's error indicator
+*/
+static inline void hclearerr(hFILE *fp)
+{
+    fp->has_errno = 0;  /* 0 is the "no error" value that herrno() reports */
+}
+
+/*!
+  @abstract  Reposition the read/write stream offset
+  @return    The resulting offset within the stream (as per lseek(2)),
+    or negative if an error occurred.
+*/
+off_t hseek(hFILE *fp, off_t offset, int whence) HTS_RESULT_USED;
+
+/*!
+  @abstract  Report the current stream offset
+  @return    The offset within the stream, starting from zero.
+*/
+static inline off_t htell(hFILE *fp)
+{
+    return fp->offset + (fp->begin - fp->buffer);  /* fp->offset plus the cursor's distance into the buffer */
+}
+
+/*!
+  @abstract  Read one character from the stream
+  @return    The character read, or EOF on end-of-file or error
+*/
+static inline int hgetc(hFILE *fp)
+{
+    extern int hgetc2(hFILE *);  /* out-of-line fallback, used when [begin, end) is empty */
+    return (fp->end > fp->begin)? (unsigned char) *(fp->begin++) : hgetc2(fp);  /* cast: a buffered byte is returned as a non-negative int */
+}
+
+/*!
+  @abstract  Peek at characters to be read without removing them from buffers
+  @param fp      The file stream
+  @param buffer  The buffer to which the peeked bytes will be written
+  @param nbytes  The number of bytes to peek at; limited by the size of the
+    internal buffer, which could be as small as 4K.
+  @return    The number of bytes peeked, which may be less than nbytes if EOF
+    is encountered; or negative, if there was an I/O error.
+  @notes  The characters peeked at remain in the stream's internal buffer,
+    and will be returned by later hread() etc calls.
+*/
+ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes) HTS_RESULT_USED;
+
+/*!
+  @abstract  Read a block of characters from the file
+  @return    The number of bytes read, or negative if an error occurred.
+  @notes     The full nbytes requested will be returned, except as limited
+    by EOF or I/O errors.
+*/
+static inline ssize_t HTS_RESULT_USED
+hread(hFILE *fp, void *buffer, size_t nbytes)
+{
+    extern ssize_t hread2(hFILE *, void *, size_t, size_t);
+    const size_t avail = fp->end - fp->begin;  /* bytes already buffered */
+    const size_t ncopied = (avail < nbytes)? avail : nbytes;
+    memcpy(buffer, fp->begin, ncopied);
+    fp->begin += ncopied;
+    if (ncopied < nbytes) return hread2(fp, buffer, nbytes, ncopied);  /* buffer ran dry: hand over with the count copied so far */
+    return (ssize_t) ncopied;
+}
+
+/*!
+  @abstract  Write a character to the stream
+  @return    The character written, or EOF if an error occurred.
+*/
+static inline int hputc(int c, hFILE *fp)
+{
+    extern int hputc2(int, hFILE *);  /* out-of-line path for a full buffer */
+    if (fp->begin >= fp->limit) return hputc2(c, fp);
+    *fp->begin++ = c;
+    return c;
+}
+
+/*!
+  @abstract  Write a string to the stream
+  @return    0 if successful, or EOF if an error occurred.
+*/
+static inline int hputs(const char *text, hFILE *fp)
+{
+    extern int hputs2(const char *, size_t, size_t, hFILE *);
+    const size_t total = strlen(text), room = fp->limit - fp->begin;
+    const size_t ncopied = (room < total)? room : total;
+
+    memcpy(fp->begin, text, ncopied);  /* copy as much as fits in the buffer */
+    fp->begin += ncopied;
+    return (ncopied < total)? hputs2(text, total, ncopied, fp) : 0;
+}
+
+/*!
+  @abstract  Write a block of characters to the file
+  @return    Either nbytes, or negative if an error occurred.
+  @notes     In the absence of I/O errors, the full nbytes will be written.
+*/
+static inline ssize_t HTS_RESULT_USED
+hwrite(hFILE *fp, const void *buffer, size_t nbytes)
+{
+    extern ssize_t hwrite2(hFILE *, const void *, size_t, size_t);
+    const size_t room = fp->limit - fp->begin;  /* free space left in the buffer */
+    const size_t ncopied = (room < nbytes)? room : nbytes;
+
+    memcpy(fp->begin, buffer, ncopied);
+    fp->begin += ncopied;
+    return (ncopied < nbytes)? hwrite2(fp, buffer, nbytes, ncopied) : (ssize_t) ncopied;
+}
+
+/*!
+  @abstract  For writing streams, flush buffered output to the underlying stream
+  @return    0 if successful, or EOF if an error occurred.
+*/
+int hflush(hFILE *fp) HTS_RESULT_USED;
+
+// #ifdef __cplusplus
+// }
+// #endif
+
+#endif
diff --git a/hfile_internal.h b/hfile_internal.h
new file mode 100644
index 0000000..0997705
--- /dev/null
+++ b/hfile_internal.h
@@ -0,0 +1,76 @@
+/*  hfile_internal.h -- internal parts of low-level input/output streams.
+
+    Copyright (C) 2013-2015 Genome Research Ltd.
+
+    Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#ifndef HFILE_INTERNAL_H
+#define HFILE_INTERNAL_H
+
+#include "hfile.h"
+
+struct hFILE_backend {
+    /* As per read(2), returning the number of bytes read (possibly 0) or
+       negative (and setting errno) on errors.  Front-end code will call this
+       repeatedly if necessary to attempt to get the desired byte count.  */
+    ssize_t (*read)(hFILE *fp, void *buffer, size_t nbytes) HTS_RESULT_USED;
+
+    /* As per write(2), returning the number of bytes written or negative (and
+       setting errno) on errors.  Front-end code will call this repeatedly if
+       necessary until the desired block is written or an error occurs.  */
+    ssize_t (*write)(hFILE *fp, const void *buffer, size_t nbytes)
+        HTS_RESULT_USED;
+
+    /* As per lseek(2), returning the resulting offset within the stream or
+       negative (and setting errno) on errors.  */
+    off_t (*seek)(hFILE *fp, off_t offset, int whence) HTS_RESULT_USED;
+
+    /* Performs low-level flushing, if any, e.g., fsync(2); for writing streams
+       only.  Returns 0 for success or negative (and sets errno) on errors. */
+    int (*flush)(hFILE *fp) HTS_RESULT_USED;
+
+    /* Closes the underlying stream (for output streams, the buffer will
+       already have been flushed), returning 0 for success or negative (and
+       setting errno) on errors, as per close(2).  */
+    int (*close)(hFILE *fp) HTS_RESULT_USED;
+};
+
+/* These are called from the hopen() dispatcher, and should call hfile_init()
+   to malloc a struct "derived" from hFILE and initialise it appropriately,
+   including setting base.backend to their own backend vector.  */
+hFILE *hopen_irods(const char *filename, const char *mode);
+hFILE *hopen_net(const char *filename, const char *mode);
+
+/* May be called by hopen_*() functions to decode a fopen()-style mode into
+   open(2)-style flags.  */
+int hfile_oflags(const char *mode);
+
+/* Must be called by hopen_*() functions to allocate the hFILE struct and set
+   up its base.  Capacity is a suggested buffer size (e.g., via fstat(2))
+   or 0 for a default-sized buffer.  */
+hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity);
+
+/* May be called by hopen_*() functions to undo the effects of hfile_init()
+   in the event opening the stream subsequently fails.  (This is safe to use
+   even if fp is NULL.  This takes care to preserve errno.)  */
+void hfile_destroy(hFILE *fp);
+
+#endif
diff --git a/hts.h b/hts.h
new file mode 100644
index 0000000..084c162
--- /dev/null
+++ b/hts.h
@@ -0,0 +1,456 @@
+/*  hts.h -- format-neutral I/O, indexing, and iterator API functions.
+
+    Copyright (C) 2012-2014 Genome Research Ltd.
+    Copyright (C) 2012 Broad Institute.
+
+    Author: Heng Li <lh3 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#ifndef HTSLIB_HTS_H
+#define HTSLIB_HTS_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifndef HTS_BGZF_TYPEDEF
+typedef struct BGZF BGZF;
+#define HTS_BGZF_TYPEDEF
+#endif
+struct cram_fd;
+struct hFILE;
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+    size_t l, m;
+    char *s;
+} kstring_t;
+#endif
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/**
+ * hts_expand()  - expands memory block pointed to by $ptr;
+ * hts_expand0()   the latter sets the newly allocated part to 0.
+ *
+ * @param n     requested number of elements of type type_t
+ * @param m     size of memory allocated
+ */
+#define hts_expand(type_t, n, m, ptr) if ((n) > (m)) { \
+        (m) = (n); kroundup32(m); \
+        (ptr) = (type_t*)realloc((ptr), (m) * sizeof(type_t)); \
+    }
+#define hts_expand0(type_t, n, m, ptr) if ((n) > (m)) { \
+        int t = (m); (m) = (n); kroundup32(m); \
+        (ptr) = (type_t*)realloc((ptr), (m) * sizeof(type_t)); \
+        memset(((type_t*)ptr)+t,0,sizeof(type_t)*((m)-t)); \
+    }
+
+/************
+ * File I/O *
+ ************/
+
+// Add new entries only at the end (but before the *_maximum entry)
+// of these enums, as their numbering is part of the htslib ABI.
+
+enum htsFormatCategory {
+    unknown_category,
+    sequence_data,    // Sequence data -- SAM, BAM, CRAM, etc
+    variant_data,     // Variant calling data -- VCF, BCF, etc
+    index_file,       // Index file associated with some data file
+    region_list,      // Coordinate intervals or regions -- BED, etc
+    category_maximum = 32767
+};
+
+enum htsExactFormat {
+    unknown_format,
+    binary_format, text_format,
+    sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed,
+    format_maximum = 32767
+};
+
+enum htsCompression {
+    no_compression, gzip, bgzf, custom,
+    compression_maximum = 32767
+};
+
+typedef struct htsFormat {
+    enum htsFormatCategory category;
+    enum htsExactFormat format;
+    struct { short major, minor; } version;
+    enum htsCompression compression;
+    short compression_level;  // currently unused
+    void *specific;  // currently unused
+} htsFormat;
+
+// Maintainers note htsFile cannot be an opaque structure because some of its
+// fields are part of libhts.so's ABI (hence these fields must not be moved):
+//  - fp is used in the public sam_itr_next()/etc macros
+//  - is_bin is used directly in samtools <= 1.1 and bcftools <= 1.1
+//  - is_write and is_cram are used directly in samtools <= 1.1
+//  - fp is used directly in samtools (up to and including current develop)
+//  - line is used directly in bcftools (up to and including current develop)
+typedef struct {
+    uint32_t is_bin:1, is_write:1, is_be:1, is_cram:1, dummy:28;
+    int64_t lineno;
+    kstring_t line;
+    char *fn, *fn_aux;
+    union {
+        BGZF *bgzf;
+        struct cram_fd *cram;
+        struct hFILE *hfile;
+        void *voidp;
+    } fp;
+    htsFormat format;
+} htsFile;
+
+// REQUIRED_FIELDS
+enum sam_fields {
+    SAM_QNAME = 0x00000001,
+    SAM_FLAG  = 0x00000002,
+    SAM_RNAME = 0x00000004,
+    SAM_POS   = 0x00000008,
+    SAM_MAPQ  = 0x00000010,
+    SAM_CIGAR = 0x00000020,
+    SAM_RNEXT = 0x00000040,
+    SAM_PNEXT = 0x00000080,
+    SAM_TLEN  = 0x00000100,
+    SAM_SEQ   = 0x00000200,
+    SAM_QUAL  = 0x00000400,
+    SAM_AUX   = 0x00000800,
+    SAM_RGAUX = 0x00001000,
+};
+
+enum cram_option {
+    CRAM_OPT_DECODE_MD,
+    CRAM_OPT_PREFIX,
+    CRAM_OPT_VERBOSITY,
+    CRAM_OPT_SEQS_PER_SLICE,
+    CRAM_OPT_SLICES_PER_CONTAINER,
+    CRAM_OPT_RANGE,
+    CRAM_OPT_VERSION,
+    CRAM_OPT_EMBED_REF,
+    CRAM_OPT_IGNORE_MD5,
+    CRAM_OPT_REFERENCE,
+    CRAM_OPT_MULTI_SEQ_PER_SLICE,
+    CRAM_OPT_NO_REF,
+    CRAM_OPT_USE_BZIP2,
+    CRAM_OPT_SHARED_REF,
+    CRAM_OPT_NTHREADS,
+    CRAM_OPT_THREAD_POOL,
+    CRAM_OPT_USE_LZMA,
+    CRAM_OPT_USE_RANS,
+    CRAM_OPT_REQUIRED_FIELDS,
+};
+
+/**********************
+ * Exported functions *
+ **********************/
+
+extern int hts_verbose;
+
+/*! @abstract Table for converting a nucleotide character to 4-bit encoding.
+The input character may be either an IUPAC ambiguity code, '=' for 0, or
+'0'/'1'/'2'/'3' for a result of 1/2/4/8.  The result is encoded as 1/2/4/8
+for A/C/G/T or combinations of these bits for ambiguous bases.
+*/
+extern const unsigned char seq_nt16_table[256];
+
+/*! @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC
+ambiguity code letter (or '=' when given 0).
+*/
+extern const char seq_nt16_str[];
+
+/*! @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits.
+Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous).
+*/
+extern const int seq_nt16_int[];
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+  @abstract  Get the htslib version number
+  @return    For released versions, a string like "N.N[.N]"; or git describe
+  output if using a library built within a Git repository.
+*/
+const char *hts_version(void);
+
+/*!
+  @abstract    Determine format by peeking at the start of a file
+  @param fp    File opened for reading, positioned at the beginning
+  @param fmt   Format structure that will be filled out on return
+  @return      0 for success, or negative if an error occurred.
+*/
+int hts_detect_format(struct hFILE *fp, htsFormat *fmt);
+
+/*!
+  @abstract    Get a human-readable description of the file format
+  @return      Description string, to be freed by the caller after use.
+*/
+char *hts_format_description(const htsFormat *format);
+
+/*!
+  @abstract       Open a SAM/BAM/CRAM/VCF/BCF/etc file
+  @param fn       The file name or "-" for stdin/stdout
+  @param mode     Mode matching /[rwa][bcuz0-9]+/
+  @discussion
+      With 'r' opens for reading; any further format mode letters are ignored
+      as the format is detected by checking the first few bytes or BGZF blocks
+      of the file.  With 'w' or 'a' opens for writing or appending, with format
+      specifier letters:
+        b  binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc)
+        c  CRAM format
+        g  gzip compressed
+        u  uncompressed
+        z  bgzf compressed
+        [0-9]  zlib compression level
+      Note that there is a distinction between 'u' and '0': the first yields
+      plain uncompressed output whereas the latter outputs uncompressed data
+      wrapped in the zlib format.
+  @example
+      [rw]b .. compressed BCF, BAM, FAI
+      [rw]u .. uncompressed BCF
+      [rw]z .. compressed VCF
+      [rw]  .. uncompressed VCF
+*/
+htsFile *hts_open(const char *fn, const char *mode);
+
+/*!
+  @abstract       Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file
+  @param fp       The already-open file handle
+  @param mode     Open mode, as per hts_open()
+*/
+htsFile *hts_hopen(struct hFILE *fp, const char *fn, const char *mode);
+
+/*!
+  @abstract  Close a file handle, flushing buffered data for output streams
+  @param fp  The file handle to be closed
+  @return    0 for success, or negative if an error occurred.
+*/
+int hts_close(htsFile *fp);
+
+/*!
+  @abstract  Returns the file's format information
+  @param fp  The file handle
+  @return    Read-only pointer to the file's htsFormat.
+*/
+const htsFormat *hts_get_format(htsFile *fp);
+
+/*!
+  @abstract  Sets a specified CRAM option on the open file handle.
+  @param fp  The file handle of the open file.
+  @param opt The CRAM_OPT_* option.
+  @param ... Optional arguments, dependent on the option used.
+  @return    0 for success, or negative if an error occurred.
+*/
+int hts_set_opt(htsFile *fp, enum cram_option opt, ...);
+
+int hts_getline(htsFile *fp, int delimiter, kstring_t *str);
+char **hts_readlines(const char *fn, int *_n);
+/*!
+    @abstract       Parse comma-separated list or read list from a file
+    @param fn       File name or comma-separated list
+    @param is_file
+    @param _n       Size of the output array (number of items read)
+    @return         NULL on failure or pointer to newly allocated array of
+                    strings
+*/
+char **hts_readlist(const char *fn, int is_file, int *_n);
+
+/*!
+  @abstract  Create extra threads to aid compress/decompression for this file
+  @param fp  The file handle
+  @param n   The number of worker threads to create
+  @return    0 for success, or negative if an error occurred.
+  @notes     THIS THREADING API IS LIKELY TO CHANGE IN FUTURE.
+*/
+int hts_set_threads(htsFile *fp, int n);
+
+/*!
+  @abstract  Set .fai filename for a file opened for reading
+  @return    0 for success, negative on failure
+  @discussion
+      Called before *_hdr_read(), this provides the name of a .fai file
+      used to provide a reference list if the htsFile contains no @SQ headers.
+*/
+int hts_set_fai_filename(htsFile *fp, const char *fn_aux);
+
+#ifdef __cplusplus
+}
+#endif
+
+/************
+ * Indexing *
+ ************/
+
+/*!
+These HTS_IDX_* macros are used as special tid values for hts_itr_query()/etc,
+producing iterators operating as follows:
+ - HTS_IDX_NOCOOR iterates over unmapped reads sorted at the end of the file
+ - HTS_IDX_START  iterates over the entire file
+ - HTS_IDX_REST   iterates from the current position to the end of the file
+ - HTS_IDX_NONE   always returns "no more alignment records"
+When one of these special tid values is used, beg and end are ignored.
+When REST or NONE is used, idx is also ignored and may be NULL.
+*/
+#define HTS_IDX_NOCOOR (-2)
+#define HTS_IDX_START  (-3)
+#define HTS_IDX_REST   (-4)
+#define HTS_IDX_NONE   (-5)
+
+#define HTS_FMT_CSI 0
+#define HTS_FMT_BAI 1
+#define HTS_FMT_TBI 2
+#define HTS_FMT_CRAI 3
+
+struct __hts_idx_t;
+typedef struct __hts_idx_t hts_idx_t;
+
+typedef struct {
+    uint64_t u, v;
+} hts_pair64_t;
+
+typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end);
+
+typedef struct {
+    uint32_t read_rest:1, finished:1, dummy:29;
+    int tid, beg, end, n_off, i;
+    int curr_tid, curr_beg, curr_end;
+    uint64_t curr_off;
+    hts_pair64_t *off;
+    hts_readrec_func *readrec;
+    struct {
+        int n, m;
+        int *a;
+    } bins;
+} hts_itr_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    #define hts_bin_first(l) (((1<<(((l)<<1) + (l))) - 1) / 7)
+    #define hts_bin_parent(l) (((l) - 1) >> 3)
+
+    hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls);
+    void hts_idx_destroy(hts_idx_t *idx);
+    int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped);
+    void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset);
+
+    void hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt);
+    hts_idx_t *hts_idx_load(const char *fn, int fmt);
+
+    uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta);
+    void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy);
+
+    int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped);
+    uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx);
+
+    const char *hts_parse_reg(const char *s, int *beg, int *end);
+    hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec);
+    void hts_itr_destroy(hts_itr_t *iter);
+
+    typedef int (*hts_name2id_f)(void*, const char*);
+    typedef const char *(*hts_id2name_f)(void*, int);
+    typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec);
+
+    hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec);
+    int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data);
+    const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr); // free only the array, not the values
+
+    /**
+     * hts_file_type() - Convenience function to determine file type
+     * DEPRECATED:  This function has been replaced by hts_detect_format().
+     * It and these FT_* macros will be removed in a future HTSlib release.
+     */
+    #define FT_UNKN   0
+    #define FT_GZ     1
+    #define FT_VCF    2
+    #define FT_VCF_GZ (FT_GZ|FT_VCF)
+    #define FT_BCF    (1<<2)
+    #define FT_BCF_GZ (FT_GZ|FT_BCF)
+    #define FT_STDIN  (1<<3)
+    int hts_file_type(const char *fname);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+static inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls)
+{
+    int level = n_lvls, shift = min_shift, first = ((1 << (3 * n_lvls)) - 1) / 7;  /* first: lowest bin number on the current level */
+    for (--end; level > 0; shift += 3, first -= 1 << (3 * --level))  /* each pass moves to a level whose bins span 8x more */
+        if ((beg >> shift) == (end >> shift)) return first + (beg >> shift);
+    return 0;
+}
+
+static inline int hts_bin_bot(int bin, int n_lvls)
+{
+    int level = 0;  /* number of hts_bin_parent() steps needed to reach bin 0 */
+    for (int cur = bin; cur != 0; cur = hts_bin_parent(cur)) ++level;
+    return (bin - hts_bin_first(level)) << ((n_lvls - level) * 3);
+}
+
+/**************
+ * Endianness *
+ **************/
+
+static inline int ed_is_big(void)
+{
+    const long probe = 1;  /* the lowest-addressed byte is zero only when the host is big-endian */
+    return *(const char *) &probe == 0;
+}
+static inline uint16_t ed_swap_2(uint16_t v)
+{
+    return (uint16_t) (((unsigned) v >> 8) | ((unsigned) v << 8));  /* the cast drops bits shifted above bit 15 */
+}
+static inline void *ed_swap_2p(void *x)
+{
+    *(uint16_t*)x = ed_swap_2(*(uint16_t*)x);  /* in-place swap; NOTE(review): accesses x as uint16_t -- presumably callers pass suitably aligned storage, confirm */
+    return x;  /* same pointer that was passed in */
+}
+static inline uint32_t ed_swap_4(uint32_t v)
+{
+    const uint32_t halves = (v >> 16) | (v << 16);  /* exchange the 16-bit halves */
+    return ((halves >> 8) & 0x00FF00FFU) | ((halves << 8) & 0xFF00FF00U);  /* then the bytes inside each half */
+}
+static inline void *ed_swap_4p(void *x)
+{
+    *(uint32_t*)x = ed_swap_4(*(uint32_t*)x);  /* in-place swap; NOTE(review): accesses x as uint32_t -- presumably callers pass suitably aligned storage, confirm */
+    return x;  /* same pointer that was passed in */
+}
+static inline uint64_t ed_swap_8(uint64_t v)
+{
+    const uint64_t words = (v >> 32) | (v << 32);  /* exchange the 32-bit words */
+    const uint64_t halves = ((words >> 16) & 0x0000FFFF0000FFFFLLU) | ((words << 16) & 0xFFFF0000FFFF0000LLU);
+    return ((halves >> 8) & 0x00FF00FF00FF00FFLLU) | ((halves << 8) & 0xFF00FF00FF00FF00LLU);
+}
+static inline void *ed_swap_8p(void *x)
+{
+    *(uint64_t*)x = ed_swap_8(*(uint64_t*)x);  /* in-place swap; NOTE(review): accesses x as uint64_t -- presumably callers pass suitably aligned storage, confirm */
+    return x;  /* same pointer that was passed in */
+}
+
+#endif
diff --git a/hts_defs.h b/hts_defs.h
new file mode 100644
index 0000000..f0cab80
--- /dev/null
+++ b/hts_defs.h
@@ -0,0 +1,55 @@
+/*  hts_defs.h -- Miscellaneous definitions.
+
+    Copyright (C) 2013-2015 Genome Research Ltd.
+
+    Author: John Marshall <jm18 at sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#ifndef HTSLIB_HTS_DEFS_H
+#define HTSLIB_HTS_DEFS_H
+
+#if (defined __clang__ && __clang_major__ >= 2) || (defined __GNUC__ && __GNUC__ >= 3)
+#define HTS_NORETURN __attribute__ ((__noreturn__))  /* marks functions that never return to their caller */
+#else
+#define HTS_NORETURN  /* attribute not known to be supported: expands to nothing */
+#endif
+
+#if (defined __clang__ && __clang_major__ >= 3) || \
+    (defined __GNUC__ && (__GNUC__ > 4 || (__GNUC__==4 && __GNUC_MINOR__ >= 5)))
+#define HTS_RESULT_USED __attribute__ ((__warn_unused_result__))
+#else
+#define HTS_RESULT_USED
+#endif
+
+#if defined __clang__ || \
+    (defined __GNUC__ && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95)))
+#define HTS_UNUSED __attribute__ ((__unused__))
+#else
+#define HTS_UNUSED
+#endif
+
+#if (defined __clang__ && (__clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 1))) || \
+    (defined __GNUC__ && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)))
+#define HTS_DEPRECATED(x) __attribute__ ((__deprecated__(x))) 
+#else
+#define HTS_DEPRECATED(x)
+#endif
+
+#endif
diff --git a/khash.h b/khash.h
new file mode 100644
index 0000000..5e55088
--- /dev/null
+++ b/khash.h
@@ -0,0 +1,619 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor at live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/*
+  An example:
+
+#include "khash.h"
+KHASH_MAP_INIT_INT(32, char)
+int main() {
+	int ret, is_missing;
+	khiter_t k;
+	khash_t(32) *h = kh_init(32);
+	k = kh_put(32, h, 5, &ret);
+	kh_value(h, k) = 10;
+	k = kh_get(32, h, 10);
+	is_missing = (k == kh_end(h));
+	k = kh_get(32, h, 5);
+	kh_del(32, h, k);
+	for (k = kh_begin(h); k != kh_end(h); ++k)
+		if (kh_exist(h, k)) kh_value(h, k) = 1;
+	kh_destroy(32, h);
+	return 0;
+}
+*/
+
+/*
+  2013-05-02 (0.2.8):
+
+	* Use quadratic probing. When the capacity is power of 2, stepping function
+	  i*(i+1)/2 guarantees to traverse each bucket. It is better than double
+	  hashing on cache performance and is more robust than linear probing.
+
+	  In theory, double hashing should be more robust than quadratic probing.
+	  However, my implementation is probably not for large hash tables, because
+	  the second hash function is closely tied to the first hash function,
+	  which reduces the effectiveness of double hashing.
+
+	Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php
+
+  2011-12-29 (0.2.7):
+
+    * Minor code clean up; no actual effect.
+
+  2011-09-16 (0.2.6):
+
+	* The capacity is a power of 2. This seems to dramatically improve the
+	  speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
+
+	   - http://code.google.com/p/ulib/
+	   - http://nothings.org/computer/judy/
+
+	* Allow to optionally use linear probing which usually has better
+	  performance for random input. Double hashing is still the default as it
+	  is more robust to certain non-random input.
+
+	* Added Wang's integer hash function (not used by default). This hash
+	  function is more robust to certain non-random input.
+
+  2011-02-14 (0.2.5):
+
+    * Allow to declare global functions.
+
+  2009-09-26 (0.2.4):
+
+    * Improve portability
+
+  2008-09-19 (0.2.3):
+
+	* Corrected the example
+	* Improved interfaces
+
+  2008-09-11 (0.2.2):
+
+	* Improved speed a little in kh_put()
+
+  2008-09-10 (0.2.1):
+
+	* Added kh_clear()
+	* Fixed a compiling error
+
+  2008-09-02 (0.2.0):
+
+	* Changed to token concatenation which increases flexibility.
+
+  2008-08-31 (0.1.2):
+
+	* Fixed a bug in kh_get(), which has not been tested previously.
+
+  2008-08-31 (0.1.1):
+
+	* Added destructor
+*/
+
+
+#ifndef __AC_KHASH_H
+#define __AC_KHASH_H
+
+/*!
+  @header
+
+  Generic hash table library.
+ */
+
+#define AC_VERSION_KHASH_H "0.2.8"
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+/* compiler specific configuration */
+
+#if UINT_MAX == 0xffffffffu
+typedef unsigned int khint32_t;
+#elif ULONG_MAX == 0xffffffffu
+typedef unsigned long khint32_t;
+#endif
+
+#if ULONG_MAX == ULLONG_MAX
+typedef unsigned long khint64_t;
+#else
+typedef unsigned long long khint64_t;
+#endif
+
+#ifndef kh_inline
+#ifdef _MSC_VER
+#define kh_inline __inline
+#else
+#define kh_inline inline
+#endif
+#endif /* kh_inline */
+
+typedef khint32_t khint_t;
+typedef khint_t khiter_t;
+
+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
+
+#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef kcalloc
+#define kcalloc(N,Z) calloc(N,Z)
+#endif
+#ifndef kmalloc
+#define kmalloc(Z) malloc(Z)
+#endif
+#ifndef krealloc
+#define krealloc(P,Z) realloc(P,Z)
+#endif
+#ifndef kfree
+#define kfree(P) free(P)
+#endif
+
+static const double __ac_HASH_UPPER = 0.77;
+
+#define __KHASH_TYPE(name, khkey_t, khval_t) \
+	typedef struct kh_##name##_s { \
+		khint_t n_buckets, size, n_occupied, upper_bound; \
+		khint32_t *flags; \
+		khkey_t *keys; \
+		khval_t *vals; \
+	} kh_##name##_t;
+
+#define __KHASH_PROTOTYPES(name, khkey_t, khval_t)	 					\
+	extern kh_##name##_t *kh_init_##name(void);							\
+	extern void kh_destroy_##name(kh_##name##_t *h);					\
+	extern void kh_clear_##name(kh_##name##_t *h);						\
+	extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); 	\
+	extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+	extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+	extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	SCOPE kh_##name##_t *kh_init_##name(void) {							\
+		return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t));		\
+	}																	\
+	SCOPE void kh_destroy_##name(kh_##name##_t *h)						\
+	{																	\
+		if (h) {														\
+			kfree((void *)h->keys); kfree(h->flags);					\
+			kfree((void *)h->vals);										\
+			kfree(h);													\
+		}																\
+	}																	\
+	SCOPE void kh_clear_##name(kh_##name##_t *h)						\
+	{																	\
+		if (h && h->flags) {											\
+			memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
+			h->size = h->n_occupied = 0;								\
+		}																\
+	}																	\
+	SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) 	\
+	{																	\
+		if (h->n_buckets) {												\
+			khint_t k, i, last, mask, step = 0; \
+			mask = h->n_buckets - 1;									\
+			k = __hash_func(key); i = k & mask;							\
+			last = i; \
+			while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+				i = (i + (++step)) & mask; \
+				if (i == last) return h->n_buckets;						\
+			}															\
+			return __ac_iseither(h->flags, i)? h->n_buckets : i;		\
+		} else return 0;												\
+	}																	\
+	SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+	{ /* Rehash h into new_n_buckets (rounded up to a power of 2, minimum 4) buckets. Returns 0 on success, or when the request is too small for h->size and nothing is done; returns -1 if an allocation needed to grow fails. Working space is only the new flag array, 0.25*n_buckets bytes, instead of [sizeof(key_t)+sizeof(val_t)+.25]*n_buckets. */ \
+		khint32_t *new_flags = 0;										\
+		khint_t j = 1; /* stays non-zero while a rehash is still wanted */ \
+		{																\
+			kroundup32(new_n_buckets); 									\
+			if (new_n_buckets < 4) new_n_buckets = 4;					\
+			if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0;	/* requested size is too small */ \
+			else { /* hash table size to be changed (shrink or expand); rehash */ \
+				new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t));	\
+				if (!new_flags) return -1;								\
+				memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); /* 0xaa sets the "empty" bit of every 2-bit flag */ \
+				if (h->n_buckets < new_n_buckets) {	/* expand */		\
+					khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+					if (!new_keys) { kfree(new_flags); return -1; }		\
+					h->keys = new_keys;									\
+					if (kh_is_map) {									\
+						khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+						if (!new_vals) { kfree(new_flags); return -1; }	\
+						h->vals = new_vals;								\
+					}													\
+				} /* otherwise shrink */								\
+			}															\
+		}																\
+		if (j) { /* rehashing is needed */								\
+			for (j = 0; j != h->n_buckets; ++j) {						\
+				if (__ac_iseither(h->flags, j) == 0) {					\
+					khkey_t key = h->keys[j];							\
+					khval_t val;										\
+					khint_t new_mask;									\
+					new_mask = new_n_buckets - 1; 						\
+					if (kh_is_map) val = h->vals[j];					\
+					__ac_set_isdel_true(h->flags, j); /* slot j of the old table has been picked up */ \
+					while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
+						khint_t k, i, step = 0; \
+						k = __hash_func(key);							\
+						i = k & new_mask;								\
+						while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
+						__ac_set_isempty_false(new_flags, i);			\
+						if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
+							{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+							if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
+							__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
+						} else { /* write the element and jump out of the loop */ \
+							h->keys[i] = key;							\
+							if (kh_is_map) h->vals[i] = val;			\
+							break;										\
+						}												\
+					}													\
+				}														\
+			}															\
+			if (h->n_buckets > new_n_buckets) { /* shrink the hash table; if krealloc fails the old, larger arrays are still valid, so keep them */ \
+				khkey_t *shrunk_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); if (shrunk_keys) h->keys = shrunk_keys; \
+				if (kh_is_map) { khval_t *shrunk_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); if (shrunk_vals) h->vals = shrunk_vals; } \
+			}															\
+			kfree(h->flags); /* free the working space */				\
+			h->flags = new_flags;										\
+			h->n_buckets = new_n_buckets;								\
+			h->n_occupied = h->size;									\
+			h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
+		}																\
+		return 0;														\
+	}																	\
+	SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+	{																	\
+		khint_t x;														\
+		if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
+			if (h->n_buckets > (h->size<<1)) {							\
+				if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
+					*ret = -1; return h->n_buckets;						\
+				}														\
+			} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
+				*ret = -1; return h->n_buckets;							\
+			}															\
+		} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
+		{																\
+			khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
+			x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
+			if (__ac_isempty(h->flags, i)) x = i; /* for speed up */	\
+			else {														\
+				last = i; \
+				while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+					if (__ac_isdel(h->flags, i)) site = i;				\
+					i = (i + (++step)) & mask; \
+					if (i == last) { x = site; break; }					\
+				}														\
+				if (x == h->n_buckets) {								\
+					if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
+					else x = i;											\
+				}														\
+			}															\
+		}																\
+		if (__ac_isempty(h->flags, x)) { /* not present at all */		\
+			h->keys[x] = key;											\
+			__ac_set_isboth_false(h->flags, x);							\
+			++h->size; ++h->n_occupied;									\
+			*ret = 1;													\
+		} else if (__ac_isdel(h->flags, x)) { /* deleted */				\
+			h->keys[x] = key;											\
+			__ac_set_isboth_false(h->flags, x);							\
+			++h->size;													\
+			*ret = 2;													\
+		} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
+		return x;														\
+	}																	\
+	SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x)				\
+	{																	\
+		if (x != h->n_buckets && !__ac_iseither(h->flags, x)) {			\
+			__ac_set_isdel_true(h->flags, x);							\
+			--h->size;													\
+		}																\
+	}
+
+#define KHASH_DECLARE(name, khkey_t, khval_t)		 					\
+	__KHASH_TYPE(name, khkey_t, khval_t) 								\
+	__KHASH_PROTOTYPES(name, khkey_t, khval_t)
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	__KHASH_TYPE(name, khkey_t, khval_t) 								\
+	__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+  @abstract     Integer hash function
+  @param  key   The integer [khint32_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int_hash_func(key) (khint32_t)(key)
+/*! @function
+  @abstract     Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     64-bit integer hash function
+  @param  key   The integer [khint64_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+  @abstract     64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     const char* hash function
+  @param  s     Pointer to a null terminated string
+  @return       The hash value
+ */
+static kh_inline khint_t __ac_X31_hash_string(const char *s)
+{
+	khint_t h = (khint_t)*s;
+	if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
+	return h;
+}
+/*! @function
+  @abstract     Another interface to const char* hash function
+  @param  key   Pointer to a null terminated string [const char*]
+  @return       The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+  @abstract     Const char* comparison function
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+static kh_inline khint_t __ac_Wang_hash(khint_t key)
+{
+    key += ~(key << 15);
+    key ^=  (key >> 10);
+    key +=  (key << 3);
+    key ^=  (key >> 6);
+    key += ~(key << 11);
+    key ^=  (key >> 16);
+    return key;
+}
+#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)(key)) /* alternative integer hash built on __ac_Wang_hash */
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other convenient macros... */
+
+/*!
+  @abstract Type of the hash table.
+  @param  name  Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+  @abstract     Initiate a hash table.
+  @param  name  Name of the hash table [symbol]
+  @return       Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+  @abstract     Destroy a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+  @abstract     Reset a hash table without deallocating memory.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+  @abstract     Resize a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  s     New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+  @abstract     Insert a key to the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @param  r     Extra return code: -1 if the operation failed;
+                0 if the key is present in the hash table;
+                1 if the bucket is empty (never used); 2 if the element in
+				the bucket has been deleted [int*]
+  @return       Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+  @abstract     Retrieve a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+  @abstract     Remove a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+/*! @function
+  @abstract     Test whether a bucket contains data.
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+  @abstract     Get key given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+  @abstract     Get value given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Value [type of values]
+  @discussion   For hash sets, calling this results in segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Get the start iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+  @abstract     Get the end iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Get the number of elements in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+  @abstract     Get the number of buckets in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Iterate over the entries in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  kvar  Variable to which key will be assigned
+  @param  vvar  Variable to which value will be assigned
+  @param  code  Block of code to execute
+ */
+#define kh_foreach(h, kvar, vvar, code) { khint_t __i;		\
+	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
+		if (!kh_exist(h,__i)) continue;						\
+		(kvar) = kh_key(h,__i);								\
+		(vvar) = kh_val(h,__i);								\
+		code;												\
+	} }
+
+/*! @function
+  @abstract     Iterate over the values in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  vvar  Variable to which value will be assigned
+  @param  code  Block of code to execute
+ */
+#define kh_foreach_value(h, vvar, code) { khint_t __i;		\
+	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
+		if (!kh_exist(h,__i)) continue;						\
+		(vvar) = kh_val(h,__i);								\
+		code;												\
+	} }
+
+/* More convenient interfaces */
+
+/*! @function
+  @abstract     Instantiate a hash set containing integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name)										\
+	KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t)								\
+	KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash set containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT64(name)										\
+	KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT64(name, khval_t)								\
+	KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef const char *kh_cstr_t;
+/*! @function
+  @abstract     Instantiate a hash set containing const char* keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_STR(name)										\
+	KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing const char* keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_STR(name, khval_t)								\
+	KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
+
+#endif /* __AC_KHASH_H */
diff --git a/pigz.c b/pigz.c
index 6605925..2a4b7ab 100644
--- a/pigz.c
+++ b/pigz.c
@@ -291,12 +291,6 @@
    input buffers to about the same number.
  */
 
-#include <stdint.h>
-#include <inttypes.h>
-#define BLOCKSIZE 131072LU
-// extra allocated input buffer space, to simplify callback function logic
-#define SUPERSIZE 131072LU
-
 #ifdef _WIN32
 // stopgap non-parallel code for Windows
 
@@ -305,44 +299,162 @@
 #include <windows.h>
 #include "zlib-1.2.8/zlib.h"
 
+#include "pigz.h"
+
 void pigz_init(uint32_t setprocs) {
   return;
 }
 
-void parallel_compress(char* out_fname, uint32_t do_append, uint32_t(* emitn)(uint32_t, unsigned char*)) {
-  unsigned char buf[BLOCKSIZE + SUPERSIZE];
+void parallel_compress(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, uint32_t(* emitn)(uint32_t, unsigned char*)) {
+  // minor issue: this currently writes \n instead of \r\n linebreaks.
   uint32_t overflow_ct = 0;
   gzFile gz_outfile = gzopen(out_fname, do_append? "ab": "wb");
+  unsigned char* write_ptr;
   uint32_t last_size;
   if (!gz_outfile) {
     printf("\nError: Failed to open %s.\n", out_fname);
     exit(2);
   }
   do {
-    last_size = emitn(overflow_ct, buf);
-    if (last_size > BLOCKSIZE) {
-      overflow_ct = last_size - BLOCKSIZE;
-      last_size = BLOCKSIZE;
+    last_size = emitn(overflow_ct, overflow_buf);
+    if (last_size > PIGZ_BLOCK_SIZE) {
+      overflow_ct = last_size - PIGZ_BLOCK_SIZE;
+      last_size = PIGZ_BLOCK_SIZE;
     } else {
       overflow_ct = 0;
     }
     if (last_size) {
-      if (!gzwrite(gz_outfile, buf, last_size)) {
-	printf("\nError: File write failure.\n");
+      if (!gzwrite(gz_outfile, overflow_buf, last_size)) {
+	fputs("\nError: File write failure.\n", stdout);
 	gzclose(gz_outfile);
 	exit(6);
       }
     }
     if (overflow_ct) {
-      memcpy(buf, &(buf[BLOCKSIZE]), overflow_ct);
+      write_ptr = &(overflow_buf[PIGZ_BLOCK_SIZE]);
+      while (overflow_ct > PIGZ_BLOCK_SIZE) {
+	if (!gzwrite(gz_outfile, write_ptr, PIGZ_BLOCK_SIZE)) {
+	  fputs("\nError: File write failure.\n", stdout);
+	  gzclose(gz_outfile);
+	  exit(6);
+	}
+	write_ptr = &(write_ptr[PIGZ_BLOCK_SIZE]);
+	overflow_ct -= PIGZ_BLOCK_SIZE;
+      }
+      memcpy(overflow_buf, write_ptr, overflow_ct);
     }
   } while (last_size);
   if (gzclose(gz_outfile) != Z_OK) {
-    printf("\nError: File write failure.\n");
+    fputs("\nError: File write failure.\n", stdout);
     exit(6);
   }
 }
 
+int32_t pzwrite_init(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, Pigz_state* ps_ptr) {
+    ps_ptr->outfile = fopen(out_fname, do_append? "ab" : "wb");
+    ps_ptr->gz_outfile = NULL;
+    if (!ps_ptr->outfile) {
+        printf("\nError: Failed to open %s.\n", out_fname);
+        return 2; // RET_OPEN_FAIL
+    }
+    ps_ptr->overflow_buf = overflow_buf;
+    return 0;
+}
+
+void compressed_pzwrite_init(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, Pigz_state* ps_ptr) {
+    ps_ptr->outfile = NULL;
+    ps_ptr->gz_outfile = gzopen(out_fname, do_append? "ab" : "wb");
+    if (!ps_ptr->gz_outfile) {
+        printf("\nError: Failed to open %s.\n", out_fname);
+        exit(2);
+    }
+    ps_ptr->overflow_buf = overflow_buf;
+}
+
+int32_t flex_pzwrite_init(uint32_t output_gz, char* out_fname, unsigned char* overflow_buf, uint32_t do_append, Pigz_state* ps_ptr) {
+    // Nonzero output_gz selects the gzip writer (reported as 0 here); otherwise the plain-file status is returned.
+    if (output_gz) {
+        compressed_pzwrite_init(out_fname, overflow_buf, do_append, ps_ptr);
+        return 0;
+    }
+    return pzwrite_init(out_fname, overflow_buf, do_append, ps_ptr);
+}
+
+int32_t force_pzwrite(Pigz_state* ps_ptr, char** writep_ptr, uint32_t write_min) {
+    unsigned char* writep = (unsigned char*)(*writep_ptr);
+    if (ps_ptr->overflow_buf != writep) {
+        if (!fwrite(ps_ptr->overflow_buf, writep - ps_ptr->overflow_buf, 1, ps_ptr->outfile)) {
+	    return 6; // RET_WRITE_FAIL
+	}
+        *writep_ptr = (char*)(ps_ptr->overflow_buf);
+    }
+    return 0;
+}
+
+void force_compressed_pzwrite(Pigz_state* ps_ptr, char** writep_ptr, uint32_t write_min) {
+    unsigned char* writep = (unsigned char*)(*writep_ptr);
+    if (ps_ptr->overflow_buf != writep) {
+        if (!gzwrite(ps_ptr->gz_outfile, ps_ptr->overflow_buf, writep - ps_ptr->overflow_buf)) {
+	    fputs("\nError: File write failure.\n", stdout);
+            gzclose(ps_ptr->gz_outfile);
+            exit(6);
+        }
+        *writep_ptr = (char*)(ps_ptr->overflow_buf);
+    }
+}
+
+int32_t flex_pzputs_std(Pigz_state* ps_ptr, char** writep_ptr, char* ss, uint32_t sslen) {
+    unsigned char* writep = (unsigned char*)(*writep_ptr);
+    unsigned char* readp = (unsigned char*)ss;
+    uint32_t cur_write_space = 2 * PIGZ_BLOCK_SIZE - ((uintptr_t)(writep - ps_ptr->overflow_buf));
+    while (sslen > cur_write_space) {
+        memcpy(writep, readp, cur_write_space);
+	if (is_uncompressed_pzwrite(ps_ptr)) {
+	    if (!fwrite(ps_ptr->overflow_buf, 2 * PIGZ_BLOCK_SIZE, 1, ps_ptr->outfile)) {
+                return 6;
+	    }
+	} else {
+	    if (!gzwrite(ps_ptr->gz_outfile, ps_ptr->overflow_buf, 2 * PIGZ_BLOCK_SIZE)) {
+	        fputs("\nError: File write failure.\n", stdout);
+	        gzclose(ps_ptr->gz_outfile);
+	        exit(6);
+	    }
+	}
+        writep = ps_ptr->overflow_buf;
+        readp = &(readp[cur_write_space]);
+	sslen -= cur_write_space;
+	cur_write_space = 2 * PIGZ_BLOCK_SIZE;
+    }
+    memcpy(writep, readp, sslen);
+    *writep_ptr = (char*)(&(writep[sslen]));
+    return flex_pzwrite(ps_ptr, writep_ptr);
+}
+
+int32_t pzwrite_close_null(Pigz_state* ps_ptr, char* writep) {
+    // Flush buffered bytes, close the stream, drop the buffer pointer; returns 1 if any step failed, else 0.
+    int32_t flush_err = force_pzwrite(ps_ptr, &writep, 0) || ferror(ps_ptr->outfile);
+    int32_t close_err = fclose(ps_ptr->outfile);
+    ps_ptr->overflow_buf = NULL;
+    return flush_err || close_err;
+}
+
+void compressed_pzwrite_close_null(Pigz_state* ps_ptr, char* writep) {
+    force_compressed_pzwrite(ps_ptr, &writep, 0);
+    ps_ptr->overflow_buf = NULL;
+    if (gzclose(ps_ptr->gz_outfile) != Z_OK) {
+        fputs("\nError: File write failure.\n", stdout);
+        exit(6);
+    }
+}
+
+int32_t flex_pzwrite_close_null(Pigz_state* ps_ptr, char* writep) {
+    // Gzip streams go through the void close routine (reported as 0 here); plain files return their own status.
+    if (!is_uncompressed_pzwrite(ps_ptr)) {
+        compressed_pzwrite_close_null(ps_ptr, writep);
+        return 0;
+    }
+    return pzwrite_close_null(ps_ptr, writep);
+}
 #else
 
 #define VERSION "pigz 2.3\n"
@@ -399,6 +511,9 @@ void parallel_compress(char* out_fname, uint32_t do_append, uint32_t(* emitn)(ui
                            release(), peek_lock(), free_lock(), yarn_name */
 #endif
 
+#include "pigz.h"
+
+
 /* for local functions and globals */
 #define local static
 
@@ -667,37 +782,6 @@ local unsigned long crc32_comb(unsigned long crc1, unsigned long crc2,
 #define BASE 65521U     /* largest prime smaller than 65536 */
 #define LOW16 0xffff    /* mask lower 16 bits */
 
-
-/* -- pool of spaces for buffer management -- */
-
-/* These routines manage a pool of spaces.  Each pool specifies a fixed size
-   buffer to be contained in each space.  Each space has a use count, which
-   when decremented to zero returns the space to the pool.  If a space is
-   requested from the pool and the pool is empty, a space is immediately
-   created unless a specified limit on the number of spaces has been reached.
-   Only if the limit is reached will it wait for a space to be returned to the
-   pool.  Each space knows what pool it belongs to, so that it can be returned.
- */
-
-/* a space (one buffer for each space) */
-struct space {
-    lock *use;              /* use count -- return to pool when zero */
-    unsigned char *buf;     /* buffer of size size */
-    size_t size;            /* current size of this buffer */
-    size_t len;             /* for application usage (initially zero) */
-    struct pool *pool;      /* pool to return to */
-    struct space *next;     /* for pool linked list */
-};
-
-/* pool of spaces (one pool for each type needed) */
-struct pool {
-    lock *have;             /* unused spaces available, lock for list */
-    struct space *head;     /* linked list of available buffers */
-    size_t size;            /* size of new buffers in this pool */
-    int limit;              /* number of new spaces allowed, or -1 */
-    int made;               /* number of buffers made */
-};
-
 /* initialize a pool (pool structure itself provided, not allocated) -- the
    limit is the maximum number of spaces in the pool, or -1 to indicate no
    limit, i.e., to never wait for a buffer to return to the pool */
@@ -892,7 +976,7 @@ local void setup_jobs(void)
     /* initialize buffer pools (initial size for out_pool not critical, since
        buffers will be grown in size if needed -- initial size chosen to make
        this unlikely -- same for lens_pool) */
-    new_pool(&in_pool, g.block + SUPERSIZE, INBUFS(g.procs));
+    new_pool(&in_pool, g.block, INBUFS(g.procs));
     new_pool(&out_pool, OUTPOOL(g.block), -1);
     new_pool(&dict_pool, DICT, -1);
     new_pool(&lens_pool, g.block >> (RSYNCBITS - 1), -1);
@@ -1192,17 +1276,24 @@ local void write_thread(void *dummy)
    value calculations and one other thread for writing the output -- compress
    threads will be launched and left running (waiting actually) to support
    subsequent calls of parallel_compress() */
-void parallel_compress(char* out_fname, uint32_t do_append, uint32_t(* emitn)(uint32_t, unsigned char*))
+void parallel_compress(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, uint32_t(* emitn)(uint32_t, unsigned char*))
 {
-    unsigned char overflow_buf[SUPERSIZE];
+    // overflow_buf must have size >= PIGZ_BLOCK_SIZE + maximum emission 
+
+    // if overflow_ct is nonzero, this points to the first uncompressed
+    // character in overflow_buf
+    unsigned char* read_ptr = NULL;
+
     uint32_t overflow_ct;
     long seq;                       /* sequence number */
     struct space *curr;             /* input data to compress */
     struct space *next;             /* input data that follows curr */
     struct space *dict;             /* dictionary for next compression */
     struct job *job;                /* job for compress, then write */
+
     int more;                       /* true if more input to read */
     size_t len;                     /* for various length computations */
+    uint32_t cur_len;
 
     g.outf = out_fname;
     g.outd = open(g.outf, O_WRONLY | (do_append? O_APPEND : (O_CREAT | O_TRUNC)), 0644);
@@ -1217,14 +1308,17 @@ void parallel_compress(char* out_fname, uint32_t do_append, uint32_t(* emitn)(ui
      the output of the compress threads) */
     seq = 0;
     next = get_space(&in_pool);
-    next->len = emitn(0, next->buf);
-    if (next->len > BLOCKSIZE) {
-	overflow_ct = next->len - BLOCKSIZE;
-	memcpy(overflow_buf, &(next->buf[BLOCKSIZE]), overflow_ct);
-	next->len = BLOCKSIZE;
+    cur_len = emitn(0, overflow_buf);
+    if (cur_len > PIGZ_BLOCK_SIZE) {
+        memcpy(next->buf, overflow_buf, PIGZ_BLOCK_SIZE);
+	next->len = PIGZ_BLOCK_SIZE;
+	read_ptr = &(overflow_buf[PIGZ_BLOCK_SIZE]);
     } else {
-	overflow_ct = 0;
+	memcpy(next->buf, overflow_buf, cur_len);
+        next->len = cur_len;
     }
+    overflow_ct = cur_len - next->len;
+
     dict = NULL;
     do {
         /* create a new job */
@@ -1238,14 +1332,27 @@ void parallel_compress(char* out_fname, uint32_t do_append, uint32_t(* emitn)(ui
 
         /* get more input if we don't already have some */
 	next = get_space(&in_pool);
-	memcpy(next->buf, overflow_buf, overflow_ct);
-	next->len = emitn(overflow_ct, next->buf);
-	if (next->len > BLOCKSIZE) {
-	    overflow_ct = next->len - BLOCKSIZE;
-	    memcpy(overflow_buf, &(next->buf[BLOCKSIZE]), overflow_ct);
-	    next->len = BLOCKSIZE;
+	if (overflow_ct >= PIGZ_BLOCK_SIZE) {
+	    // no need to call emitn(), since we still have >= 128K of text
+	    // from the previous call to compress
+	    memcpy(next->buf, read_ptr, PIGZ_BLOCK_SIZE);
+	    next->len = PIGZ_BLOCK_SIZE;
+	    read_ptr = &(read_ptr[PIGZ_BLOCK_SIZE]);
+	    overflow_ct -= PIGZ_BLOCK_SIZE;
 	} else {
-	    overflow_ct = 0;
+	    if (overflow_ct) {
+	        memcpy(overflow_buf, read_ptr, overflow_ct);
+	    }
+	    cur_len = emitn(overflow_ct, overflow_buf);
+	    if (cur_len > PIGZ_BLOCK_SIZE) {
+	        memcpy(next->buf, overflow_buf, PIGZ_BLOCK_SIZE);
+		next->len = PIGZ_BLOCK_SIZE;
+		read_ptr = &(overflow_buf[PIGZ_BLOCK_SIZE]);
+	    } else {
+	        memcpy(next->buf, overflow_buf, cur_len);
+		next->len = cur_len;
+	    }
+	    overflow_ct = cur_len - next->len;
 	}
 
         /* if rsyncable, generate block lengths and prepare curr for job to
@@ -1302,6 +1409,203 @@ void parallel_compress(char* out_fname, uint32_t do_append, uint32_t(* emitn)(ui
     finish_jobs();
 }
 
+
+// about time to implement this without the awkward callback interface...
+int32_t pzwrite_init(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, Pigz_state* ps_ptr) {
+    // Raw POSIX descriptor: unbuffered, and no Windows support is needed.
+    ps_ptr->outd = open(out_fname, O_WRONLY | (do_append? O_APPEND : (O_CREAT | O_TRUNC)), 0644);
+    if (ps_ptr->outd != -1) {
+        ps_ptr->overflow_buf = overflow_buf;
+        return 0;
+    }
+    printf("\nError: Failed to open %s.\n", out_fname);
+    return 2; // RET_OPEN_FAIL
+}
+
+void compressed_pzwrite_init(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, Pigz_state* ps_ptr) {
+    ps_ptr->outd = -1; // -1 is what is_uncompressed_pzwrite() tests to detect compressed mode
+    g.outf = out_fname;
+    g.outd = open(g.outf, O_WRONLY | (do_append? O_APPEND : (O_CREAT | O_TRUNC)), 0644); // NOTE(review): result is not checked for -1 here
+
+    /* if first time or after an option change, setup the job lists */
+    setup_jobs();
+
+    /* start write thread */
+    writeth = launch(write_thread, NULL);
+
+    ps_ptr->overflow_buf = overflow_buf;
+    ps_ptr->next = NULL; // force_compressed_pzwrite() treats NULL as "no input block staged yet"
+}
+
+int32_t flex_pzwrite_init(uint32_t output_gz, char* out_fname, unsigned char* overflow_buf, uint32_t do_append, Pigz_state* ps_ptr) {
+    // Nonzero output_gz selects the compressed writer, which reports no error.
+    if (output_gz) {
+        compressed_pzwrite_init(out_fname, overflow_buf, do_append, ps_ptr);
+        return 0;
+    }
+    return pzwrite_init(out_fname, overflow_buf, do_append, ps_ptr);
+}
+
+int32_t force_pzwrite(Pigz_state* ps_ptr, char** writep_ptr, uint32_t write_min) {
+    // Writes all bytes in [overflow_buf, *writep_ptr) to ps_ptr->outd, then
+    // rewinds *writep_ptr to overflow_buf.  Returns 0 or 6; write_min is unused.
+    unsigned char* buf = ps_ptr->overflow_buf;
+    uint32_t len = (uintptr_t)(((unsigned char*)(*writep_ptr)) - buf);
+    while (len) {
+        ssize_t ret = write(ps_ptr->outd, buf, len); // resume at buf so a short write is not replayed from the start
+        if (ret < 1) {
+            return 6; // RET_WRITE_FAIL
+        }
+        buf += ret;
+        len -= ret;
+    }
+    *writep_ptr = (char*)(ps_ptr->overflow_buf);
+    return 0;
+}
+
+void force_compressed_pzwrite(Pigz_state* ps_ptr, char** writep_ptr, uint32_t write_min) {
+    // Queues the bytes in [overflow_buf, *writep_ptr) as compression jobs and moves any unconsumed tail to the buffer
+    // start.  Caller must not request a length-0 write (write_min == 0) until it's time to close the file.
+    unsigned char* writep = (unsigned char*)(*writep_ptr);
+    unsigned char* readp = ps_ptr->overflow_buf;
+    uint32_t cur_len = (uintptr_t)(writep - readp);
+
+    struct space* curr;             /* input data to compress */
+    struct job *job;                /* job for compress, then write */
+
+    int more;                       /* true if more input to read */
+    size_t len;                     /* for various length computations */
+    if (!ps_ptr->next) { // first call on this stream: stage the first input block
+        ps_ptr->seq = 0;
+        ps_ptr->next = get_space(&in_pool);
+        if (cur_len > PIGZ_BLOCK_SIZE) {
+	    memcpy(ps_ptr->next->buf, readp, PIGZ_BLOCK_SIZE);
+            ps_ptr->next->len = PIGZ_BLOCK_SIZE;
+            readp = &(readp[PIGZ_BLOCK_SIZE]);
+	    cur_len -= PIGZ_BLOCK_SIZE;
+	} else {
+	    memcpy(ps_ptr->next->buf, readp, cur_len);
+            ps_ptr->next->len = cur_len;
+	    readp = writep;
+	    cur_len = 0;
+	}
+        ps_ptr->dict = NULL;
+	if ((cur_len <= PIGZ_BLOCK_SIZE) && write_min) {
+	    // need more input to handle dict properly
+	    if (cur_len) {
+		memcpy(ps_ptr->overflow_buf, readp, cur_len);
+	    }
+	    *writep_ptr = (char*)(&(ps_ptr->overflow_buf[cur_len]));
+	    return;
+	}
+    }
+
+    do {
+	// create a new job
+	job = (struct job*)malloc(sizeof(struct job));
+	if (job == NULL) {
+	    bail("not enough memory", "");
+	}
+	job->calc = new_lock(0);
+        curr = ps_ptr->next; // the previously staged block becomes this job's input
+	ps_ptr->next = get_space(&in_pool);
+        if (cur_len > PIGZ_BLOCK_SIZE) {
+	    memcpy(ps_ptr->next->buf, readp, PIGZ_BLOCK_SIZE);
+	    ps_ptr->next->len = PIGZ_BLOCK_SIZE;
+            readp = &(readp[PIGZ_BLOCK_SIZE]);
+	} else {
+	    memcpy(ps_ptr->next->buf, readp, cur_len);
+	    ps_ptr->next->len = cur_len;
+	    readp = writep;
+	}
+	job->lens = NULL;
+	job->in = curr;
+	more = (cur_len != 0); // cur_len is still the pre-copy count, so this is "the block just staged is non-empty"
+	job->more = more;
+        job->out = ps_ptr->dict;
+	if (more) {
+	    if (curr->len >= DICT || job->out == NULL) {
+	        ps_ptr->dict = curr;
+	        use_space(ps_ptr->dict);
+	    } else {
+	        ps_ptr->dict = get_space(&dict_pool);
+                len = DICT - curr->len;
+                memcpy(ps_ptr->dict->buf, job->out->buf + (job->out->len - len), len);
+                memcpy(ps_ptr->dict->buf + len, curr->buf, curr->len);
+                ps_ptr->dict->len = DICT;
+	    }
+	}
+	job->seq = ps_ptr->seq;
+        if (++(ps_ptr->seq) < 1) { // sequence counter is no longer positive
+	    bail("input too long: ", "");
+	}
+	if (cthreads < ps_ptr->seq && cthreads < g.procs) {
+	    (void)launch(compress_thread, NULL);
+            cthreads++;
+	}
+        possess(compress_have);
+        job->next = NULL;
+        *compress_tail = job;
+        compress_tail = &(job->next);
+	twist(compress_have, BY, +1);
+	cur_len = (uintptr_t)(writep - readp); // bytes still waiting in overflow_buf
+    } while ((cur_len >= write_min) && more);
+    if (cur_len) { // slide the unconsumed tail down to the buffer start
+        memcpy(ps_ptr->overflow_buf, readp, cur_len);
+    }
+    *writep_ptr = (char*)(&(ps_ptr->overflow_buf[cur_len]));
+}
+
+int32_t flex_pzputs_std(Pigz_state* ps_ptr, char** writep_ptr, char* ss, uint32_t sslen) {
+    // Appends ss[0..sslen), flushing whenever the 2 * PIGZ_BLOCK_SIZE overflow_buf fills.  Returns 0 or a write error code.
+    unsigned char* writep = (unsigned char*)(*writep_ptr);
+    uint32_t cur_write_space = 2 * PIGZ_BLOCK_SIZE - ((uintptr_t)(writep - ps_ptr->overflow_buf));
+    while (sslen > cur_write_space) {
+        memcpy(writep, ss, cur_write_space);
+        writep += cur_write_space; // the flush only emits bytes below writep, so count the chunk just copied
+        ss += cur_write_space;
+        sslen -= cur_write_space;
+        if (is_uncompressed_pzwrite(ps_ptr)) {
+            int32_t ii = force_pzwrite(ps_ptr, (char**)(&writep), PIGZ_BLOCK_SIZE + 1);
+            if (ii) {
+                return ii;
+            }
+        } else {
+            force_compressed_pzwrite(ps_ptr, (char**)(&writep), PIGZ_BLOCK_SIZE + 1);
+        } // a compressed flush can leave a partial block buffered, so re-measure the free space
+        cur_write_space = 2 * PIGZ_BLOCK_SIZE - ((uintptr_t)(writep - ps_ptr->overflow_buf));
+    }
+    memcpy(writep, ss, sslen);
+    *writep_ptr = (char*)(&(writep[sslen]));
+    return flex_pzwrite(ps_ptr, writep_ptr);
+}
+
+int32_t pzwrite_close_null(Pigz_state* ps_ptr, char* writep) {
+    const int32_t flush_err = force_pzwrite(ps_ptr, &writep, 0);
+    const int32_t close_err = close(ps_ptr->outd);
+    ps_ptr->overflow_buf = NULL; // the *_close_cond helpers test this to skip a closed state
+    return (flush_err != 0) || (close_err != 0);
+}
+
+void compressed_pzwrite_close_null(Pigz_state* ps_ptr, char* writep) {
+    force_compressed_pzwrite(ps_ptr, &writep, 0); // write_min == 0: queue everything still buffered
+    drop_space(ps_ptr->next);
+    /* wait for the write thread to complete (we leave the compress threads out
+       there and waiting in case there is another stream to compress) */
+    join(writeth);
+    writeth = NULL;
+    finish_jobs();
+    ps_ptr->overflow_buf = NULL; // the *_close_cond helpers test this to skip a closed state
+}
+
+int32_t flex_pzwrite_close_null(Pigz_state* ps_ptr, char* writep) {
+    // The compressed close returns void, so only the plain close yields a code.
+    if (!is_uncompressed_pzwrite(ps_ptr)) {
+        compressed_pzwrite_close_null(ps_ptr, writep);
+        return 0;
+    }
+    return pzwrite_close_null(ps_ptr, writep);
+}
 #endif
 
 /* catch termination signal */
@@ -1330,45 +1634,56 @@ void pigz_init(uint32_t setprocs)
 #endif
     yarn_prefix = g.prog;
     yarn_abort = cut_short;
-    g.block = BLOCKSIZE;            /* 128K */
+    g.block = PIGZ_BLOCK_SIZE;            /* 128K */
     g.verbosity = 1;                /* normal message level */
 }
 #endif // _WIN32
 
 // provide identical interface for uncompressed writing, to simplify code that
 // can generate either compressed or uncompressed output
-int32_t write_uncompressed(char* out_fname, uint32_t do_append, uint32_t(* emitn)(uint32_t, unsigned char*)) {
-  unsigned char buf[BLOCKSIZE + SUPERSIZE];
+int32_t write_uncompressed(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, uint32_t(* emitn)(uint32_t, unsigned char*)) {
   uint32_t overflow_ct = 0;
   // if it's potentially worth compressing, it should be text, hence mode "w"
   // instead of "wb"
+  // (er, that actually does the wrong thing on Windows.  Fixed in pzwrite.)
   FILE* outfile = fopen(out_fname, do_append? "a" : "w");
+  unsigned char* write_ptr;
   uint32_t last_size;
   if (!outfile) {
     printf("\nError: Failed to open %s.\n", out_fname);
     return 2; // RET_OPEN_FAIL
   }
   do {
-    last_size = emitn(overflow_ct, buf);
-    if (last_size > BLOCKSIZE) {
-      overflow_ct = last_size - BLOCKSIZE;
-      last_size = BLOCKSIZE;
+    last_size = emitn(overflow_ct, overflow_buf);
+    if (last_size > PIGZ_BLOCK_SIZE) {
+      overflow_ct = last_size - PIGZ_BLOCK_SIZE;
+      last_size = PIGZ_BLOCK_SIZE;
     } else {
       overflow_ct = 0;
     }
     if (last_size) {
-      if (!fwrite(buf, last_size, 1, outfile)) {
-	printf("\nError: File write failure.\n");
+      if (!fwrite(overflow_buf, last_size, 1, outfile)) {
+	fputs("\nError: File write failure.\n", stdout);
 	fclose(outfile);
 	return 6; // RET_WRITE_FAIL
       }
     }
     if (overflow_ct) {
-      memcpy(buf, &(buf[BLOCKSIZE]), overflow_ct);
+      write_ptr = &(overflow_buf[PIGZ_BLOCK_SIZE]);
+      while (overflow_ct > PIGZ_BLOCK_SIZE) {
+	if (!fwrite(write_ptr, PIGZ_BLOCK_SIZE, 1, outfile)) {
+	  fputs("\nError: File write failure.\n", stdout);
+	  fclose(outfile);
+	  return 6;
+	}
+	write_ptr = &(write_ptr[PIGZ_BLOCK_SIZE]);
+	overflow_ct -= PIGZ_BLOCK_SIZE;
+      }
+      memcpy(overflow_buf, write_ptr, overflow_ct);
     }
   } while (last_size);
   if (fclose(outfile)) {
-    printf("\nError: File write failure.\n");
+    fputs("\nError: File write failure.\n", stdout);
     return 6;
   }
   return 0;
diff --git a/pigz.h b/pigz.h
index c77926b..37913cf 100644
--- a/pigz.h
+++ b/pigz.h
@@ -6,10 +6,154 @@
 
 #define PIGZ_BLOCK_SIZE 131072
 
-void parallel_compress(char* out_fname, uint32_t do_append, uint32_t(* emitn)(uint32_t, unsigned char*));
+#ifndef _WIN32
+/* -- pool of spaces for buffer management -- */
+
+/* These routines manage a pool of spaces.  Each pool specifies a fixed size
+   buffer to be contained in each space.  Each space has a use count, which
+   when decremented to zero returns the space to the pool.  If a space is
+   requested from the pool and the pool is empty, a space is immediately
+   created unless a specified limit on the number of spaces has been reached.
+   Only if the limit is reached will it wait for a space to be returned to the
+   pool.  Each space knows what pool it belongs to, so that it can be returned.
+ */
+
+#include "yarn.h"
+
+/* a space (one buffer for each space) */
+struct space {
+    lock *use;              /* use count -- return to pool when zero */
+    unsigned char *buf;     /* buffer holding `size` bytes */
+    size_t size;            /* current size of this buffer */
+    size_t len;             /* for application usage, e.g. bytes filled (initially zero) */
+    struct pool *pool;      /* pool to return to */
+    struct space *next;     /* for pool linked list */
+};
+
+/* pool of spaces (one pool for each type needed) */
+struct pool {
+    lock *have;             /* unused spaces available, lock for list */
+    struct space *head;     /* linked list of available buffers */
+    size_t size;            /* size of new buffers in this pool */
+    int limit;              /* number of new spaces allowed, or -1 (presumably "no limit" -- confirm) */
+    int made;               /* number of buffers made */
+};
+
+// Note that this does NOT actually capture anywhere near all of pigz's state;
+// there are plenty of global variables that prevent multiple
+// parallel_compress2 instances from running concurrently.  It's just the bare
+// minimum to remove parallel_compress's callback requirement.
+typedef struct {
+    unsigned char* overflow_buf; // caller-supplied staging buffer; NULL when not open
+    long seq; // sequence number given to the next compression job
+    struct space* dict; // dictionary handed to the next compression job
+    struct space* next; // input block staged for the next job; NULL until the first flush
+    int outd; // descriptor for uncompressed writing; -1 in compressed mode
+} Pigz_state;
+
+static inline uint32_t is_uncompressed_pzwrite(Pigz_state* ps_ptr) {
+    return (ps_ptr->outd == -1)? 0 : 1; // compressed_pzwrite_init() stores -1 in outd
+}
+#else
+typedef struct {
+    unsigned char* overflow_buf; // staging buffer; NULL when not open
+    FILE* outfile; // non-NULL means uncompressed output (see is_uncompressed_pzwrite)
+    gzFile gz_outfile; // presumably the zlib handle for compressed output -- confirm
+} Pigz_state;
+
+static inline uint32_t is_uncompressed_pzwrite(Pigz_state* ps_ptr) {
+    return ps_ptr->outfile? 1 : 0; // a non-NULL FILE* marks uncompressed mode
+}
+#endif // _WIN32 / NOTHREAD
+
+// This interface is obsolete; compressed_pzwrite/flex_pzwrite is far easier to
+// use.
+void parallel_compress(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, uint32_t(* emitn)(uint32_t, unsigned char*));
+
+
+static inline void pzwrite_init_null(Pigz_state* ps_ptr) {
+    ps_ptr->overflow_buf = NULL; // NULL makes the *_close_cond helpers a no-op for this state
+}
+
+int32_t pzwrite_init(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, Pigz_state* ps_ptr);
+
+void compressed_pzwrite_init(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, Pigz_state* ps_ptr);
+
+int32_t flex_pzwrite_init(uint32_t output_gz, char* out_fname, unsigned char* overflow_buf, uint32_t do_append, Pigz_state* ps_ptr);
+
+int32_t force_pzwrite(Pigz_state* ps_ptr, char** writep_ptr, uint32_t write_min);
+
+void force_compressed_pzwrite(Pigz_state* ps_ptr, char** writep_ptr, uint32_t write_min);
+
+static inline int32_t pzwrite(Pigz_state* ps_ptr, char** writep_ptr) {
+    if ((uintptr_t)(((unsigned char*)(*writep_ptr)) - ps_ptr->overflow_buf) <= PIGZ_BLOCK_SIZE) {
+        return 0; // at most one block buffered: nothing to flush yet
+    }
+    return force_pzwrite(ps_ptr, writep_ptr, PIGZ_BLOCK_SIZE + 1);
+}
+
+static inline void compressed_pzwrite(Pigz_state* ps_ptr, char** writep_ptr) {
+    if ((uintptr_t)(((unsigned char*)(*writep_ptr)) - ps_ptr->overflow_buf) > PIGZ_BLOCK_SIZE) { // more than one block buffered
+        force_compressed_pzwrite(ps_ptr, writep_ptr, PIGZ_BLOCK_SIZE + 1);
+    }
+}
+
+static inline int32_t flex_pzwrite(Pigz_state* ps_ptr, char** writep_ptr) {
+    if ((uintptr_t)(((unsigned char*)(*writep_ptr)) - ps_ptr->overflow_buf) <= PIGZ_BLOCK_SIZE) {
+        return 0; // at most one block buffered: nothing to flush yet
+    } else if (is_uncompressed_pzwrite(ps_ptr)) {
+        return force_pzwrite(ps_ptr, writep_ptr, PIGZ_BLOCK_SIZE + 1);
+    }
+    force_compressed_pzwrite(ps_ptr, writep_ptr, PIGZ_BLOCK_SIZE + 1);
+    return 0;
+}
+
+// Assumes overflow_buf has size 2 * PIGZ_BLOCK_SIZE.
+int32_t flex_pzputs_std(Pigz_state* ps_ptr, char** writep_ptr, char* ss, uint32_t sslen);
+
+// designed to write allele codes, which are usually length-1, but could have
+// length in the millions.  Assumes overflow_buf has size 2 * PIGZ_BLOCK_SIZE.
+static inline int32_t flex_pzputs_allele(Pigz_state* ps_ptr, char** writep_ptr, char* allele_code, uint32_t allele_len) {
+    // Anything other than a single-character code takes the general path.
+    if (allele_len != 1) {
+        return flex_pzputs_std(ps_ptr, writep_ptr, allele_code, allele_len);
+    }
+    // common case: store the one byte directly and advance the write pointer
+    *((*writep_ptr)++) = *allele_code;
+    return flex_pzwrite(ps_ptr, writep_ptr);
+}
+
+int32_t pzwrite_close_null(Pigz_state* ps_ptr, char* writep);
+
+void compressed_pzwrite_close_null(Pigz_state* ps_ptr, char* writep);
+
+int32_t flex_pzwrite_close_null(Pigz_state* ps_ptr, char* writep);
+
+static inline void pzwrite_close_cond(Pigz_state* ps_ptr, char* writep) {
+    if (ps_ptr->overflow_buf) { // non-NULL buffer: the state is still open
+        pzwrite_close_null(ps_ptr, writep); // return value is discarded here
+    }
+}
+
+static inline void compressed_pzwrite_close_cond(Pigz_state* ps_ptr, char* writep) {
+    if (ps_ptr->overflow_buf) { // non-NULL buffer: the state is still open
+        compressed_pzwrite_close_null(ps_ptr, writep);
+    }
+}
+
+static inline void flex_pzwrite_close_cond(Pigz_state* ps_ptr, char* writep) {
+    // Conditional close: a NULL overflow_buf means the state was only
+    // null-initialized or has already been closed, so there is nothing to
+    // do.  Otherwise flex_pzwrite_close_null() performs the compressed vs.
+    // uncompressed dispatch; its return value is discarded here.
+    if (ps_ptr->overflow_buf) {
+        flex_pzwrite_close_null(ps_ptr, writep);
+    }
+}
+
 
 void pigz_init(uint32_t setprocs);
 
-int32_t write_uncompressed(char* out_fname, uint32_t do_append, uint32_t(* emitn)(uint32_t, unsigned char*));
+int32_t write_uncompressed(char* out_fname, unsigned char* overflow_buf, uint32_t do_append, uint32_t(* emitn)(uint32_t, unsigned char*));
 
 #endif // __PIGZ_H__
diff --git a/plink.c b/plink.c
index 151e2e7..b50c0c7 100644
--- a/plink.c
+++ b/plink.c
@@ -91,7 +91,7 @@
 
 const char ver_str[] =
 #ifdef STABLE_BUILD
-  "PLINK v1.90b3b"
+  "PLINK v1.90b3l"
 #else
   "PLINK v1.90p"
 #endif
@@ -104,7 +104,7 @@ const char ver_str[] =
   " 32-bit"
 #endif
   // include trailing space if day < 10, so character length stays the same
-  " (17 Jan 2015)";
+  " (18 Apr 2015)";
 const char ver_str2[] =
 #ifdef STABLE_BUILD
   "" // (don't want this when version number has a trailing letter)
@@ -276,18 +276,14 @@ static inline uint32_t are_marker_cms_needed(uint64_t calculation_type, char* cm
 }
 
 static inline uint32_t are_marker_alleles_needed(uint64_t calculation_type, char* freqname, Homozyg_info* homozyg_ptr, Two_col_params* a1alleles, Two_col_params* a2alleles, uint32_t ld_modifier, uint32_t snp_only, uint32_t clump_modifier, uint32_t cluster_modifier) {
-  return (freqname || (calculation_type & (CALC_FREQ | CALC_HARDY | CALC_MAKE_BED | CALC_MAKE_BIM | CALC_RECODE | CALC_REGRESS_PCS | CALC_MODEL | CALC_GLM | CALC_LASSO | CALC_LIST_23_INDELS | CALC_EPI | CALC_TESTMISHAP | CALC_SCORE | CALC_MENDEL | CALC_TDT | CALC_FLIPSCAN | CALC_QFAM | CALC_HOMOG | CALC_DUPVAR | CALC_RPLUGIN)) || ((calculation_type & CALC_HOMOZYG) && (homozyg_ptr->modifier & HOMOZYG_GROUP_VERBOSE)) || ((calculation_type & CALC_LD) && (ld_modifier & LD_INPHASE)) || ((calc [...]
+  return (freqname || (calculation_type & (CALC_FREQ | CALC_HARDY | CALC_MAKE_BED | CALC_MAKE_BIM | CALC_RECODE | CALC_REGRESS_PCS | CALC_MODEL | CALC_GLM | CALC_LASSO | CALC_LIST_23_INDELS | CALC_EPI | CALC_TESTMISHAP | CALC_SCORE | CALC_MENDEL | CALC_TDT | CALC_FLIPSCAN | CALC_QFAM | CALC_HOMOG | CALC_DUPVAR | CALC_RPLUGIN | CALC_DFAM)) || ((calculation_type & CALC_HOMOZYG) && (homozyg_ptr->modifier & HOMOZYG_GROUP_VERBOSE)) || ((calculation_type & CALC_LD) && (ld_modifier & LD_INPHASE [...]
 }
 
 static inline int32_t relationship_or_ibc_req(uint64_t calculation_type) {
   return (relationship_req(calculation_type) || (calculation_type & CALC_IBC));
 }
 
-static inline int32_t distance_wt_req(uint64_t calculation_type, char* read_dists_fname, uint32_t dist_calc_type) {
-  return (((calculation_type & CALC_DISTANCE) || ((!read_dists_fname) && ((calculation_type & (CALC_IBS_TEST | CALC_GROUPDIST | CALC_REGRESS_DISTANCE))))) && (!(dist_calc_type & DISTANCE_FLAT_MISSING)));
-}
-
-int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, char* famname, char* cm_map_fname, char* cm_map_chrname, char* phenoname, char* extractname, char* excludename, char* keepname, char* removename, char* keepfamname, char* removefamname, char* filtername, char* freqname, char* read_dists_fname, char* read_dists_id_fname, char* evecname, char* mergename1, char* mergename2, char* mergename3, char* missing_mid_template, char* missing_marker_id_match, char* makephen [...]
+int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, char* famname, char* cm_map_fname, char* cm_map_chrname, char* phenoname, char* extractname, char* excludename, char* keepname, char* removename, char* keepfamname, char* removefamname, char* filtername, char* freqname, char* distance_wts_fname, char* read_dists_fname, char* read_dists_id_fname, char* evecname, char* mergename1, char* mergename2, char* mergename3, char* missing_mid_template, char* missing_marke [...]
   FILE* bedfile = NULL;
   FILE* phenofile = NULL;
   uintptr_t unfiltered_marker_ct = 0;
@@ -345,9 +341,6 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
   uintptr_t* ac_excl_bitfield = NULL;
   double* pheno_d = NULL;
   double* orig_pheno_d = NULL;
-  double* marker_weights = NULL;
-  uint32_t marker_weight_sum = 0;
-  uint32_t* marker_weights_i = NULL;
   char* sample_ids = NULL;
   uintptr_t max_sample_id_len = 4;
   char* paternal_ids = NULL;
@@ -369,7 +362,6 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
   uint64_t dists_alloc = 0;
   double missing_phenod = (double)missing_pheno;
   double ci_zt = 0.0;
-  uint32_t wt_needed = distance_wt_req(calculation_type, read_dists_fname, dist_calc_type);
   uintptr_t bed_offset = 3;
   uint32_t* marker_pos = NULL;
   uint32_t hh_exists = 0;
@@ -406,13 +398,10 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
   int32_t* hwe_haph_allfs = NULL;
   pthread_t threads[MAX_THREADS];
   uint32_t* uiptr;
-  double* dptr;
-  double* dptr2;
   double* rel_ibc;
   uintptr_t uljj;
   uint32_t ujj;
   uint32_t ukk;
-  double dxx;
   char* outname_end2;
   int32_t ii;
   int64_t llyy;
@@ -422,8 +411,6 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
   uint32_t sample_f_male_ct;
   Pedigree_rel_info pri;
   uintptr_t marker_uidx;
-  uintptr_t marker_uidx_stop;
-  uintptr_t marker_idx;
 
   if ((cm_map_fname || update_cm) && (!marker_cms_needed)) {
     LOGPRINTF("Error: --%s results would never be used.  (Did you forget --make-bed?)\n", cm_map_fname? "cm-map" : "update-cm");
@@ -724,15 +711,17 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	} else if ((calculation_type & CALC_GLM) && (glm_modifier & GLM_LOGISTIC)) {
 	  logprint("Error: --logistic without --all-pheno requires a case/control phenotype.\n");
 	  goto plink_ret_INVALID_CMDLINE;
-	} else if (calculation_type & (CALC_CMH | CALC_HOMOG | CALC_TESTMISS | CALC_TDT)) {
+	} else if (calculation_type & (CALC_CMH | CALC_HOMOG | CALC_TESTMISS | CALC_TDT | CALC_DFAM)) {
 	  if (calculation_type & CALC_CMH) {
 	    logprint("Error: --mh and --mh2 require a case/control phenotype.\n");
 	  } else if (calculation_type & CALC_HOMOG) {
 	    logprint("Error: --homog requires a case/control phenotype.\n");
 	  } else if (calculation_type & CALC_TESTMISS) {
 	    logprint("Error: --test-missing requires a case/control phenotype.\n");
-	  } else {
+	  } else if (calculation_type & CALC_TDT) {
 	    logprint("Error: --tdt requires a case/control phenotype.\n");
+	  } else {
+	    logprint("Error: --dfam requires a case/control phenotype.\n");
 	  }
 	  goto plink_ret_INVALID_CMDLINE;
 	}
@@ -1074,6 +1063,17 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
       LOGPRINTF("%d %s removed due to founder status (--filter-%s).\n", ii, species_str(ii), (filter_flags & FILTER_BINARY_FOUNDERS)? "founders" : "nonfounders");
     }
 
+    if (thin_keep_sample_prob != 1.0) {
+      if (random_thin_samples(thin_keep_sample_prob, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct)) {
+        goto plink_ret_ALL_SAMPLES_EXCLUDED;
+      }
+    } else if (thin_keep_sample_ct) {
+      retval = random_thin_samples_ct(thin_keep_sample_ct, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct);
+      if (retval) {
+        goto plink_ret_1;
+      }
+    }
+
     if (mind_thresh < 1.0) {
       retval = mind_filter(bedfile, bed_offset, outname, outname_end, mind_thresh, unfiltered_marker_ct, marker_exclude, marker_exclude_ct, unfiltered_sample_ct, sample_exclude, &sample_exclude_ct, sample_ids, max_sample_id_len, sex_male, chrom_info_ptr, om_ip);
       if (retval) {
@@ -1110,7 +1110,15 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
     }
   }
   if (g_thread_ct > 1) {
-    if ((calculation_type & (CALC_RELATIONSHIP | CALC_REL_CUTOFF | CALC_GDISTANCE_MASK | CALC_IBS_TEST | CALC_GROUPDIST | CALC_REGRESS_DISTANCE | CALC_GENOME | CALC_REGRESS_REL | CALC_UNRELATED_HERITABILITY | CALC_LD | CALC_PCA | CALC_MAKE_PERM_PHENO | CALC_QFAM)) || ((calculation_type & CALC_MODEL) && (model_modifier & (MODEL_PERM | MODEL_MPERM))) || ((calculation_type & CALC_GLM) && (glm_modifier & (GLM_PERM | GLM_MPERM))) || ((calculation_type & CALC_TESTMISS) && (testmiss_modifier &  [...]
+    if ((calculation_type & (CALC_RELATIONSHIP | CALC_REL_CUTOFF | CALC_GDISTANCE_MASK | CALC_IBS_TEST | CALC_GROUPDIST | CALC_REGRESS_DISTANCE | CALC_GENOME | CALC_REGRESS_REL | CALC_UNRELATED_HERITABILITY | CALC_LD | CALC_PCA | CALC_MAKE_PERM_PHENO | CALC_QFAM)) || ((calculation_type & CALC_MODEL) && (model_modifier & (MODEL_PERM | MODEL_MPERM))) || ((calculation_type & CALC_GLM) && (glm_modifier & (GLM_PERM | GLM_MPERM))) || ((calculation_type & CALC_TESTMISS) && (testmiss_modifier &  [...]
+#ifndef _WIN32
+        || ((calculation_type & CALC_FREQ) && (misc_flags & MISC_FREQ_GZ))
+        || ((calculation_type & CALC_MISSING_REPORT) && (misc_flags & MISC_MISSING_GZ))
+        || ((calculation_type & CALC_HARDY) && (hwe_modifier & HWE_GZ))
+        || ((calculation_type & CALC_HET) && (misc_flags & MISC_HET_GZ))
+	|| ((calculation_type & CALC_RECODE) && (((recode_modifier & (RECODE_VCF | RECODE_BGZ)) == (RECODE_VCF | RECODE_BGZ))))
+#endif
+) {
       LOGPRINTF("Using up to %u threads (change this with --threads).\n", g_thread_ct);
     } else {
       logprint("Using 1 thread (no multithreaded calculations invoked).\n");
@@ -1207,13 +1215,13 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
     }
     fill_ulong_zero(marker_reverse, uii);
     if (bedfile) {
-      retval = calc_freqs_and_hwe(bedfile, outname, outname_end, unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len, unfiltered_sample_ct, sample_exclude, sample_exclude_ct, sample_ids, max_sample_id_len, founder_info, nonfounders, (misc_flags / MISC_MAF_SUCC) & 1, set_allele_freqs, bed_offset, (hwe_thresh > 0.0) || (calculation_type & CALC_HARDY), hwe_modifier & HWE_THRESH_ALL, (pheno_nm_ct && pheno_c)? ((calculation_type / CALC [...]
+      retval = calc_freqs_and_hwe(bedfile, outname, outname_end, unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, marker_ids, max_marker_id_len, unfiltered_sample_ct, sample_exclude, sample_exclude_ct, sample_ids, max_sample_id_len, founder_info, nonfounders, (misc_flags / MISC_MAF_SUCC) & 1, set_allele_freqs, bed_offset, (hwe_thresh > 0.0) || (calculation_type & CALC_HARDY), hwe_modifier & HWE_THRESH_ALL, (pheno_nm_ct && pheno_c)? ((calculation_type / CALC [...]
       if (retval) {
 	goto plink_ret_1;
       }
 
       if (freqname) {
-	retval = read_external_freqs(freqname, unfiltered_marker_ct, marker_exclude, marker_exclude_ct, marker_ids, max_marker_id_len, chrom_info_ptr, marker_allele_ptrs, set_allele_freqs, nchrobs, (misc_flags / MISC_MAF_SUCC) & 1, exponent, wt_needed, marker_weights);
+	retval = read_external_freqs(freqname, unfiltered_marker_ct, marker_exclude, marker_exclude_ct, marker_ids, max_marker_id_len, chrom_info_ptr, marker_allele_ptrs, set_allele_freqs, nchrobs, (misc_flags / MISC_MAF_SUCC) & 1);
 	if (retval) {
 	  goto plink_ret_1;
 	}
@@ -1245,24 +1253,16 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	if (misc_flags & MISC_FREQ_COUNTS) {
 	  logprint("Note: --freq 'counts' modifier has no effect on cluster-stratified report.\n");
 	}
-	memcpy(outname_end, ".frq.strat", 11);
-	retval = write_stratified_freqs(bedfile, bed_offset, outname, plink_maxsnp, unfiltered_marker_ct, marker_exclude, chrom_info_ptr, marker_ids, max_marker_id_len, marker_allele_ptrs, max_marker_allele_len, unfiltered_sample_ct, sample_ct, sample_f_ct, founder_info, nonfounders, sex_male, sample_f_male_ct, marker_reverse, cluster_ct, cluster_map, cluster_starts, cluster_ids, max_cluster_id_len);
+	retval = write_stratified_freqs(bedfile, bed_offset, outname, outname_end, (misc_flags / MISC_FREQ_GZ) & 1, plink_maxsnp, unfiltered_marker_ct, marker_exclude, chrom_info_ptr, marker_ids, max_marker_id_len, marker_allele_ptrs, max_marker_allele_len, unfiltered_sample_ct, sample_ct, sample_f_ct, founder_info, nonfounders, sex_male, sample_f_male_ct, marker_reverse, cluster_ct, cluster_map, cluster_starts, cluster_ids, max_cluster_id_len);
       } else {
-	if (misc_flags & MISC_FREQX) {
-	  memcpy(outname_end, ".frqx", 6);
-	} else if (misc_flags & MISC_FREQ_COUNTS) {
-	  memcpy(outname_end, ".frq.count", 11);
-	} else {
-	  memcpy(outname_end, ".frq", 5);
-	}
-	retval = write_freqs(outname, plink_maxsnp, unfiltered_marker_ct, marker_exclude, set_allele_freqs, chrom_info_ptr, marker_ids, max_marker_id_len, marker_allele_ptrs, max_marker_allele_len, hwe_ll_allfs, hwe_lh_allfs, hwe_hh_allfs, hwe_hapl_allfs, hwe_haph_allfs, sample_f_ct, sample_f_male_ct, nonfounders, misc_flags, marker_reverse);
+	retval = write_freqs(outname, outname_end, plink_maxsnp, unfiltered_marker_ct, marker_exclude, set_allele_freqs, chrom_info_ptr, marker_ids, max_marker_id_len, marker_allele_ptrs, max_marker_allele_len, hwe_ll_allfs, hwe_lh_allfs, hwe_hh_allfs, hwe_hapl_allfs, hwe_haph_allfs, sample_f_ct, sample_f_male_ct, nonfounders, misc_flags, marker_reverse);
       }
       if (retval || (!(calculation_type & (~(CALC_MERGE | CALC_WRITE_CLUSTER | CALC_FREQ))))) {
 	goto plink_ret_1;
       }
     }
     if (calculation_type & CALC_MISSING_REPORT) {
-      retval = write_missingness_reports(bedfile, bed_offset, outname, outname_end, plink_maxfid, plink_maxiid, plink_maxsnp, unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, chrom_info_ptr, om_ip, marker_ids, max_marker_id_len, unfiltered_sample_ct, sample_ct, sample_exclude, pheno_nm, sex_male, sample_male_ct, sample_ids, max_sample_id_len, cluster_ct, cluster_map, cluster_starts, cluster_ids, max_cluster_id_len, hh_exists);
+      retval = write_missingness_reports(bedfile, bed_offset, outname, outname_end, (misc_flags / MISC_MISSING_GZ) & 1, plink_maxfid, plink_maxiid, plink_maxsnp, unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, chrom_info_ptr, om_ip, marker_ids, max_marker_id_len, unfiltered_sample_ct, sample_ct, sample_exclude, pheno_nm, sex_male, sample_male_ct, sample_ids, max_sample_id_len, cluster_ct, cluster_map, cluster_starts, cluster_ids, max_cluster_id_len, hh_exists);
       if (retval || (!(calculation_type & (~(CALC_MERGE | CALC_WRITE_CLUSTER | CALC_FREQ | CALC_MISSING_REPORT))))) {
 	goto plink_ret_1;
       }
@@ -1327,10 +1327,6 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	  }
 	}
       }
-
-      if (wt_needed) {
-	calc_marker_weights(exponent, unfiltered_marker_ct, marker_exclude, unfiltered_marker_ct - marker_exclude_ct, hwe_ll_allfs, hwe_lh_allfs, hwe_hh_allfs, marker_weights);
-      }
       wkspace_reset(hwe_lls);
     }
     if (sip->fname) {
@@ -1372,50 +1368,6 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
     }
   }
 
-  if (wt_needed) {
-    // normalize included marker weights to add to just under 2^32.  (switch to
-    // 2^64 if/when 32-bit performance becomes less important than accuracy on
-    // 50+ million marker sets.)
-    dxx = 0.0;
-    marker_uidx = 0;
-    marker_idx = 0;
-    do {
-      marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
-      marker_uidx_stop = next_set_ul(marker_exclude, marker_uidx, unfiltered_marker_ct);
-      marker_idx += marker_uidx_stop - marker_uidx;
-      dptr = &(marker_weights[marker_uidx]);
-      dptr2 = &(marker_weights[marker_uidx_stop]);
-      marker_uidx = marker_uidx_stop;
-      do {
-        dxx += *dptr++;
-      } while (dptr < dptr2);
-    } while (marker_idx < marker_ct);
-    // subtract marker_ct to guard against marker_weight_sum overflow from
-    // rounding
-    dxx = (4294967296.0 - ((double)((intptr_t)marker_ct))) / dxx;
-    if (wkspace_alloc_ui_checked(&marker_weights_i, marker_idx * sizeof(int32_t))) {
-      goto plink_ret_NOMEM;
-    }
-    marker_uidx = 0;
-    marker_idx = 0;
-    uiptr = marker_weights_i;
-    do {
-      marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
-      marker_uidx_stop = next_set_ul(marker_exclude, marker_uidx, unfiltered_marker_ct);
-      marker_idx += marker_uidx_stop - marker_uidx;
-      dptr = &(marker_weights[marker_uidx]);
-      dptr2 = &(marker_weights[marker_uidx_stop]);
-      marker_uidx = marker_uidx_stop;
-      do {
-        uii = (uint32_t)((*dptr++) * dxx + 0.5);
-        marker_weight_sum += uii;
-        *uiptr++ = uii;
-      } while (dptr < dptr2);
-    } while (marker_idx < marker_ct);
-    wkspace_left += topsize;
-    topsize = 0;
-  }
-
   if (relationship_or_ibc_req(calculation_type)) {
     if (relip->pca_cluster_names_flattened || relip->pca_clusters_fname) {
       retval = extract_clusters(unfiltered_sample_ct, sample_exclude, sample_ct, cluster_ct, cluster_map, cluster_starts, cluster_ids, max_cluster_id_len, relip->pca_cluster_names_flattened, relip->pca_clusters_fname, &pca_sample_exclude, &pca_sample_ct);
@@ -1434,7 +1386,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	ulii = unfiltered_sample_ct - pca_sample_ct;
       }
     }
-    retval = calc_rel(threads, parallel_idx, parallel_tot, calculation_type, relip, bedfile, bed_offset, outname, outname_end, unfiltered_marker_ct, marker_exclude, marker_reverse, marker_ct, unfiltered_sample_ct, pca_sample_exclude? pca_sample_exclude : sample_exclude, pca_sample_exclude? (&ulii) : (&sample_exclude_ct), sample_ids, max_sample_id_len, set_allele_freqs, &rel_ibc, chrom_info_ptr);
+    retval = calc_rel(threads, parallel_idx, parallel_tot, calculation_type, relip, bedfile, bed_offset, outname, outname_end, distance_wts_fname, (dist_calc_type & DISTANCE_WTS_NOHEADER), unfiltered_marker_ct, marker_exclude, marker_reverse, marker_ct, marker_ids, max_marker_id_len, unfiltered_sample_ct, pca_sample_exclude? pca_sample_exclude : sample_exclude, pca_sample_exclude? (&ulii) : (&sample_exclude_ct), sample_ids, max_sample_id_len, set_allele_freqs, &rel_ibc, chrom_info_ptr);
     if (retval) {
       goto plink_ret_1;
     }
@@ -1748,7 +1700,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
   } else
   */
   if (distance_req(calculation_type, read_dists_fname)) {
-    retval = calc_distance(threads, parallel_idx, parallel_tot, bedfile, bed_offset, outname, outname_end, calculation_type, dist_calc_type, marker_exclude, marker_ct, set_allele_freqs, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, chrom_info_ptr, wt_needed, marker_weight_sum, marker_weights_i, exponent);
+    retval = calc_distance(threads, parallel_idx, parallel_tot, bedfile, bed_offset, outname, outname_end, read_dists_fname, distance_wts_fname, distance_exp, calculation_type, dist_calc_type, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, set_allele_freqs, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, chrom_info_ptr);
     if (retval) {
       goto plink_ret_1;
     }
@@ -1811,7 +1763,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
   }
 
   if (calculation_type & CALC_HET) {
-    retval = het_report(bedfile, bed_offset, outname, outname_end, unfiltered_marker_ct, marker_exclude, marker_ct, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, plink_maxfid, plink_maxiid, max_sample_id_len, (misc_flags & MISC_HET_SMALL_SAMPLE)? founder_info : NULL, chrom_info_ptr, set_allele_freqs);
+    retval = het_report(bedfile, bed_offset, outname, outname_end, (misc_flags / MISC_HET_GZ) & 1, unfiltered_marker_ct, marker_exclude, marker_ct, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, plink_maxfid, plink_maxiid, max_sample_id_len, (misc_flags & MISC_HET_SMALL_SAMPLE)? founder_info : NULL, chrom_info_ptr, set_allele_freqs);
     if (retval) {
       goto plink_ret_1;
     }
@@ -1858,7 +1810,7 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
   }
 #endif
 
-  if (calculation_type & (CALC_MODEL | CALC_GXE | CALC_GLM | CALC_LASSO | CALC_CMH | CALC_HOMOG | CALC_TESTMISS | CALC_TDT | CALC_QFAM)) {
+  if (calculation_type & (CALC_MODEL | CALC_GXE | CALC_GLM | CALC_LASSO | CALC_CMH | CALC_HOMOG | CALC_TESTMISS | CALC_TDT | CALC_DFAM | CALC_QFAM)) {
     // can't use pheno_ctrl_ct in here since new phenotypes may be loaded, and
     // we don't bother updating it...
     if ((!pheno_all) && (!loop_assoc_fname)) {
@@ -2038,7 +1990,13 @@ int32_t plink(char* outname, char* outname_end, char* bedname, char* bimname, ch
 	}
       }
       if ((calculation_type & CALC_TDT) && pheno_c) {
-	retval = tdt(threads, bedfile, bed_offset, outname, outname_end2, ci_size, ci_zt, pfilter, output_min_p, mtest_adjust, adjust_lambda, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, marker_pos, marker_allele_ptrs, max_marker_allele_len, marker_reverse, unfiltered_sample_ct, sample_exclude, sample_ct, mperm_save, pheno_nm, pheno_c, founder_info, sex_nm, sex_male, sample_ids, max_sample_id_len, paternal_ids, max_paternal_id_len, maternal_ids,  [...]
+	retval = tdt(threads, bedfile, bed_offset, outname, outname_end2, ci_size, ci_zt, pfilter, output_min_p, mtest_adjust, adjust_lambda, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, marker_pos, marker_allele_ptrs, max_marker_allele_len, marker_reverse, unfiltered_sample_ct, sample_exclude, sample_ct, apip, mperm_save, pheno_nm, pheno_c, founder_info, sex_nm, sex_male, sample_ids, max_sample_id_len, paternal_ids, max_paternal_id_len, maternal [...]
+	if (retval) {
+	  goto plink_ret_1;
+	}
+      }
+      if ((calculation_type & CALC_DFAM) && pheno_c) {
+	retval = dfam(threads, bedfile, bed_offset, outname, outname_end2, pfilter, output_min_p, mtest_adjust, adjust_lambda, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, marker_allele_ptrs, max_marker_allele_len, marker_reverse, unfiltered_sample_ct, sample_exclude, sample_ct, cluster_ct, cluster_map, loop_assoc_fname? NULL : cluster_starts, apip, mperm_save, pheno_c, founder_info, sex_nm, sex_male, sample_ids, max_sample_id_len, paternal_ids,  [...]
 	if (retval) {
 	  goto plink_ret_1;
 	}
@@ -2936,6 +2894,7 @@ int32_t main(int32_t argc, char** argv) {
   char* filtervals_flattened = NULL;
   char* evecname = NULL;
   char* filtername = NULL;
+  char* distance_wts_fname = NULL;
   char* read_dists_fname = NULL;
   char* read_dists_id_fname = NULL;
   char* freqname = NULL;
@@ -3006,13 +2965,15 @@ int32_t main(int32_t argc, char** argv) {
   uint64_t misc_flags = 0;
   uint64_t filter_flags = 0;
   double thin_keep_prob = 1.0;
+  double thin_keep_sample_prob = 1.0;
   uint32_t thin_keep_ct = 0;
+  uint32_t thin_keep_sample_ct = 0;
   uint32_t min_bp_space = 0;
   uint32_t check_sex_f_yobs = 0;
   uint32_t check_sex_m_yobs = 0;
   double check_sex_fthresh = 0.2;
   double check_sex_mthresh = 0.8;
-  double exponent = 0.0;
+  double distance_exp = 0.0;
   double min_maf = 0.0;
   double max_maf = 0.5;
   double geno_thresh = 1.0;
@@ -3568,7 +3529,6 @@ int32_t main(int32_t argc, char** argv) {
 	  memcpy(flagptr, "snp", 4);
 	  break;
 	} else if (!strcmp(argptr, "exponent")) {
-	  fputs("Note: --exponent flag has been renamed to --distance-exp.\n", stdout);
 	  memcpy(flagptr, "distance-exp", 13);
 	  break;
 	}
@@ -3644,6 +3604,9 @@ int32_t main(int32_t argc, char** argv) {
 	} else if (!strcmp(argptr, "max-ac")) {
 	  memcpy(flagptr, "max-mac", 8);
 	  break;
+	} else if (!strcmp(argptr, "max-indv")) {
+	  memcpy(flagptr, "thin-indiv-count", 17);
+	  break;
 	}
 	goto main_flag_copy;
       case 'n':
@@ -4012,7 +3975,7 @@ int32_t main(int32_t argc, char** argv) {
     case 'R':
       if (*argptr2 == '\0') {
 #if defined __cplusplus && !defined _WIN32
-        UNSTABLE;
+        UNSTABLE("R");
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 2)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
@@ -4587,7 +4550,7 @@ int32_t main(int32_t argc, char** argv) {
 	      goto main_ret_INVALID_CMDLINE;
 	    }
 	    if (scan_posint_defcap(&(argv[cur_arg + uii][6]), &(cluster.cmh_mperm_val))) {
-	      sprintf(logbuf, "Error: Invalid --bd mperm parameter '%s'.\n", argv[cur_arg + uii]);
+	      sprintf(logbuf, "Error: Invalid --bd mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
               goto main_ret_INVALID_CMDLINE_WWA;
 	    }
             cluster.modifier |= CLUSTER_CMH_MPERM;
@@ -4936,7 +4899,7 @@ int32_t main(int32_t argc, char** argv) {
         cluster.modifier |= CLUSTER_MISSING;
 	goto main_param_zero;
       } else if (!memcmp(argptr2, "file", 5)) {
-        UNSTABLE;
+        UNSTABLE("cfile");
 	if (load_rare || load_params) {
 	  goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
 	}
@@ -4958,7 +4921,7 @@ int32_t main(int32_t argc, char** argv) {
 	memcpy(memcpya(mapname, sptr, uii), ".cnv.map", 9);
 	load_rare = LOAD_RARE_CNV;
       } else if (!memcmp(argptr2, "nv-count", 9)) {
-	UNSTABLE;
+	UNSTABLE("cnv-count");
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
@@ -4968,15 +4931,15 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	cnv_intersect_filter_type = CNV_COUNT;
       } else if (!memcmp(argptr2, "nv-del", 7)) {
-	UNSTABLE;
+	UNSTABLE("cnv-del");
 	cnv_calc_type |= CNV_DEL;
 	goto main_param_zero;
       } else if (!memcmp(argptr2, "nv-disrupt", 11)) {
-	UNSTABLE;
+	UNSTABLE("cnv-disrupt");
 	cnv_overlap_type = CNV_DISRUPT;
 	goto main_param_zero;
       } else if (!memcmp(argptr2, "nv-dup", 7)) {
-	UNSTABLE;
+	UNSTABLE("cnv-dup");
 	if (cnv_calc_type & CNV_DEL) {
 	  logprint("Error: --cnv-dup cannot be used with --cnv-del.\n");
 	  goto main_ret_INVALID_CMDLINE_A;
@@ -4984,7 +4947,7 @@ int32_t main(int32_t argc, char** argv) {
 	cnv_calc_type |= CNV_DUP;
 	goto main_param_zero;
       } else if (!memcmp(argptr2, "nv-enrichment-test", 19)) {
-	UNSTABLE;
+	UNSTABLE("cnv-enrichment-test");
 	if (!cnv_intersect_filter_type) {
 	  logprint("Error: --cnv-enrichment-test must be used with --cnv-count.\n");
 	  goto main_ret_INVALID_CMDLINE_A;
@@ -5000,7 +4963,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	cnv_calc_type |= CNV_ENRICHMENT_TEST;
       } else if (!memcmp(argptr2, "nv-exclude", 11)) {
-	UNSTABLE;
+	UNSTABLE("cnv-exclude");
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
@@ -5014,11 +4977,11 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	cnv_intersect_filter_type = CNV_EXCLUDE;
       } else if (!memcmp(argptr2, "nv-exclude-off-by-1", 20)) {
-	UNSTABLE;
+	UNSTABLE("cnv-exclude-off-by-1");
         cnv_calc_type |= CNV_EXCLUDE_OFF_BY_1;
 	goto main_param_zero;
       } else if (!memcmp(argptr2, "nv-freq-exclude-above", 22)) {
-	UNSTABLE;
+	UNSTABLE("cnv-freq-exclude-above");
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
           goto main_ret_INVALID_CMDLINE_2A;
 	}
@@ -5028,7 +4991,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	cnv_freq_type = CNV_FREQ_EXCLUDE_ABOVE;
       } else if (!memcmp(argptr2, "nv-freq-exclude-below", 22)) {
-	UNSTABLE;
+	UNSTABLE("cnv-freq-exclude-below");
 	if (cnv_freq_type) {
 	  logprint("Error: --cnv-freq-exclude-below cannot be used with --cnv-freq-exclude-above.\n");
 	  goto main_ret_INVALID_CMDLINE;
@@ -5042,7 +5005,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	cnv_freq_type = CNV_FREQ_EXCLUDE_BELOW;
       } else if (!memcmp(argptr2, "nv-freq-exclude-exact", 22)) {
-	UNSTABLE;
+	UNSTABLE("cnv-freq-exclude-exact");
 	if (cnv_freq_type) {
 	  logprint("Error: --cnv-freq-exclude-exact cannot be used with\n--cnv-freq-exclude-above/-below.\n");
 	  goto main_ret_INVALID_CMDLINE;
@@ -5056,7 +5019,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	cnv_freq_type = CNV_FREQ_EXCLUDE_EXACT;
       } else if (!memcmp(argptr2, "nv-freq-include-exact", 22)) {
-	UNSTABLE;
+	UNSTABLE("cnv-freq-include-exact");
 	if (cnv_freq_type) {
 	  logprint("Error: --cnv-freq-include-exact cannot be used with\n--cnv-freq-exclude-above/-below/-exact.\n");
 	  goto main_ret_INVALID_CMDLINE;
@@ -5070,7 +5033,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	cnv_freq_type = CNV_FREQ_INCLUDE_EXACT;
       } else if (!memcmp(argptr2, "nv-freq-method2", 16)) {
-	UNSTABLE;
+	UNSTABLE("cnv-freq-method2");
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 1)) {
           goto main_ret_INVALID_CMDLINE_2A;
 	}
@@ -5086,7 +5049,7 @@ int32_t main(int32_t argc, char** argv) {
 	  cnv_freq_val2 = SMALLISH_EPSILON;
 	}
       } else if (!memcmp(argptr2, "nv-freq-overlap", 16)) {
-	UNSTABLE;
+	UNSTABLE("cnv-freq-overlap");
 	if (!(cnv_freq_type & CNV_FREQ_FILTER)) {
 	  logprint("Error: --cnv-freq-overlap must be used with --cnv-freq-include-exact or\n--cnv-freq-exclude-above/-below/-exact.\n");
 	  goto main_ret_INVALID_CMDLINE;
@@ -5108,7 +5071,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	cnv_freq_type |= CNV_FREQ_OVERLAP;
       } else if (!memcmp(argptr2, "nv-indiv-perm", 14)) {
-	UNSTABLE;
+	UNSTABLE("cnv-indiv-perm");
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 1)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
@@ -5120,7 +5083,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	cnv_calc_type |= CNV_SAMPLE_PERM;
       } else if (!memcmp(argptr2, "nv-intersect", 13)) {
-	UNSTABLE;
+	UNSTABLE("cnv-intersect");
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
@@ -5134,7 +5097,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	cnv_intersect_filter_type = CNV_INTERSECT;
       } else if (!memcmp(argptr2, "nv-kb", 6)) {
-	UNSTABLE;
+	UNSTABLE("cnv-kb");
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
@@ -5144,7 +5107,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	cnv_min_seglen = (int32_t)(dxx * 1000 * (1 + SMALL_EPSILON));
       } else if (!memcmp(argptr2, "nv-list", 8)) {
-	UNSTABLE;
+	UNSTABLE("cnv-list");
 	if ((load_rare & (~LOAD_RARE_CNV)) || load_params) {
 	  goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
 	}
@@ -5158,7 +5121,7 @@ int32_t main(int32_t argc, char** argv) {
 	strcpya(pedname, argv[cur_arg + 1]);
 	load_rare = LOAD_RARE_CNV;
       } else if (!memcmp(argptr2, "nv-make-map", 12)) {
-	UNSTABLE;
+	UNSTABLE("cnv-make-map");
 	if (!(load_rare & LOAD_RARE_CNV)) {
 	  logprint("Error: --cnv-make-map cannot be used without a .cnv fileset.\n");
 	  goto main_ret_INVALID_CMDLINE;
@@ -5176,7 +5139,7 @@ int32_t main(int32_t argc, char** argv) {
 	  cnv_calc_type |= CNV_MAKE_MAP | CNV_MAKE_MAP_LONG;
 	}
       } else if (!memcmp(argptr2, "nv-max-kb", 10)) {
-	UNSTABLE;
+	UNSTABLE("cnv-max-kb");
 	if (!(load_rare & LOAD_RARE_CNV)) {
 	  logprint("Error: --cnv-max-kb cannot be used without a .cnv fileset.\n");
 	  goto main_ret_INVALID_CMDLINE;
@@ -5194,7 +5157,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE;
 	}
       } else if (!memcmp(argptr2, "nv-max-score", 13)) {
-	UNSTABLE;
+	UNSTABLE("cnv-max-score");
 	if (!(load_rare & LOAD_RARE_CNV)) {
 	  logprint("Error: --cnv-max-score cannot be used without a .cnv fileset.\n");
 	  goto main_ret_INVALID_CMDLINE;
@@ -5207,7 +5170,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "nv-max-sites", 13)) {
-	UNSTABLE;
+	UNSTABLE("cnv-max-sites");
 	if (!(load_rare & LOAD_RARE_CNV)) {
 	  logprint("Error: --cnv-max-sites cannot be used without a .cnv fileset.\n");
 	  goto main_ret_INVALID_CMDLINE;
@@ -5220,7 +5183,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "nv-overlap", 11)) {
-	UNSTABLE;
+	UNSTABLE("cnv-overlap");
 	if (!(load_rare & LOAD_RARE_CNV)) {
 	  logprint("Error: --cnv-overlap cannot be used without a .cnv fileset.\n");
 	  goto main_ret_INVALID_CMDLINE;
@@ -5247,7 +5210,7 @@ int32_t main(int32_t argc, char** argv) {
 	  }
 	}
       } else if (!memcmp(argptr2, "nv-region-overlap", 18)) {
-	UNSTABLE;
+	UNSTABLE("cnv-region-overlap");
 	if (!(load_rare & LOAD_RARE_CNV)) {
 	  logprint("Error: --cnv-region-overlap cannot be used without a .cnv fileset.\n");
 	  goto main_ret_INVALID_CMDLINE;
@@ -5264,7 +5227,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	cnv_overlap_type = CNV_OVERLAP_REGION;
       } else if (!memcmp(argptr2, "nv-score", 9)) {
-	UNSTABLE;
+	UNSTABLE("cnv-score");
 	if (!(load_rare & LOAD_RARE_CNV)) {
 	  logprint("Error: --cnv-score cannot be used without a .cnv fileset.\n");
 	  goto main_ret_INVALID_CMDLINE;
@@ -5281,7 +5244,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE;
 	}
       } else if (!memcmp(argptr2, "nv-sites", 9)) {
-	UNSTABLE;
+	UNSTABLE("cnv-sites");
 	if (!(load_rare & LOAD_RARE_CNV)) {
 	  logprint("Error: --cnv-sites cannot be used without a .cnv fileset.\n");
 	  goto main_ret_INVALID_CMDLINE;
@@ -5298,7 +5261,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE;
 	}
       } else if (!memcmp(argptr2, "nv-subset", 10)) {
-	UNSTABLE;
+	UNSTABLE("cnv-subset");
 	if (!(load_rare & LOAD_RARE_CNV)) {
 	  logprint("Error: --cnv-subset cannot be used without a .cnv fileset.\n");
 	  goto main_ret_INVALID_CMDLINE;
@@ -5314,7 +5277,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_1;
 	}
       } else if (!memcmp(argptr2, "nv-test", 8)) {
-	UNSTABLE;
+	UNSTABLE("cnv-test");
 	if (!(load_rare & LOAD_RARE_CNV)) {
 	  logprint("Error: --cnv-test cannot be used without a .cnv fileset.\n");
 	  goto main_ret_INVALID_CMDLINE;
@@ -5349,7 +5312,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	cnv_calc_type |= CNV_TEST;
       } else if (!memcmp(argptr2, "nv-test-1sided", 15)) {
-	UNSTABLE;
+	UNSTABLE("cnv-test-1sided");
 	if (cnv_calc_type & CNV_TEST_FORCE_2SIDED) {
 	  logprint("Error: --cnv-test cannot be both 1-sided and 2-sided at the same time.\n");
 	  goto main_ret_INVALID_CMDLINE;
@@ -5357,7 +5320,7 @@ int32_t main(int32_t argc, char** argv) {
 	logprint("Note: --cnv-test-1sided flag deprecated.  Use '--cnv-test 1sided'.\n");
 	cnv_calc_type |= CNV_TEST_FORCE_1SIDED;
       } else if (!memcmp(argptr2, "nv-test-2sided", 15)) {
-	UNSTABLE;
+	UNSTABLE("cnv-test-2sided");
 	if (cnv_calc_type & CNV_TEST_FORCE_1SIDED) {
 	  logprint("Error: --cnv-test cannot be both 1-sided and 2-sided at the same time.\n");
 	  goto main_ret_INVALID_CMDLINE;
@@ -5365,7 +5328,7 @@ int32_t main(int32_t argc, char** argv) {
 	logprint("Note: --cnv-test-2sided flag deprecated.  Use '--cnv-test 2sided'.\n");
 	cnv_calc_type |= CNV_TEST_FORCE_2SIDED;
       } else if (!memcmp(argptr2, "nv-test-region", 15)) {
-	UNSTABLE;
+	UNSTABLE("cnv-test-region");
 	if (!(load_rare & LOAD_RARE_CNV)) {
 	  logprint("Error: --cnv-test-region cannot be used without a .cnv fileset.\n");
 	  goto main_ret_INVALID_CMDLINE;
@@ -5381,7 +5344,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	cnv_calc_type |= CNV_TEST_REGION;
       } else if (!memcmp(argptr2, "nv-test-window", 15)) {
-	UNSTABLE;
+	UNSTABLE("cnv-test-window");
 	if (!(load_rare & LOAD_RARE_CNV)) {
 	  logprint("Error: --cnv-test-window cannot be used without a .cnv fileset.\n");
 	  goto main_ret_INVALID_CMDLINE;
@@ -5400,7 +5363,7 @@ int32_t main(int32_t argc, char** argv) {
 	  cnv_test_window = (int32_t)(dxx * (1 + SMALL_EPSILON));
 	}
       } else if (!memcmp(argptr2, "nv-union-overlap", 17)) {
-	UNSTABLE;
+	UNSTABLE("cnv-union-overlap");
 	if (!(load_rare & LOAD_RARE_CNV)) {
 	  logprint("Error: --cnv-union-overlap cannot be used without a .cnv fileset.\n");
 	  goto main_ret_INVALID_CMDLINE;
@@ -5417,7 +5380,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	cnv_overlap_type = CNV_OVERLAP_UNION;
       } else if (!memcmp(argptr2, "nv-write", 9)) {
-	UNSTABLE;
+	UNSTABLE("cnv-write");
 	if (!(load_rare & LOAD_RARE_CNV)) {
 	  logprint("Error: --cnv-write cannot be used without a .cnv fileset.\n");
 	  goto main_ret_INVALID_CMDLINE;
@@ -5438,7 +5401,7 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	cnv_calc_type |= CNV_WRITE;
       } else if (!memcmp(argptr2, "nv-write-freq", 14)) {
-	UNSTABLE;
+	UNSTABLE("cnv-write-freq");
 	if (!(load_rare & LOAD_RARE_CNV)) {
 	  logprint("Error: --cnv-write freq cannot be used without a .cnv fileset.\n");
 	  goto main_ret_INVALID_CMDLINE;
@@ -5945,12 +5908,46 @@ int32_t main(int32_t argc, char** argv) {
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
-	if (scan_double(argv[cur_arg + 1], &exponent)) {
+	if (scan_double(argv[cur_arg + 1], &distance_exp)) {
 	  sprintf(logbuf, "Error: Invalid --distance-exp parameter '%s'.\n", argv[cur_arg + 1]);
 	  goto main_ret_INVALID_CMDLINE_WW;
 	}
+	fputs("Note: '--distance-exp [x]' deprecated.  Use '--distance-weights exp=[x]' instead.\n", stdout);
+      } else if (!memcmp(argptr2, "istance-wts", 12)) {
+	if (distance_exp != 0.0) {
+	  logprint("Error: --distance-wts cannot be used with --distance-exp.\n");
+	  goto main_ret_INVALID_CMDLINE;
+	} else if (calculation_type & CALC_PLINK1_DISTANCE_MATRIX) {
+	  logprint("Error: --distance-wts cannot be used with --distance-matrix.\n");
+	  goto main_ret_INVALID_CMDLINE;
+	}
+	if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 2)) {
+	  goto main_ret_INVALID_CMDLINE_2A;
+        }
+	if ((strlen(argv[cur_arg + 1]) > 4) && (!memcmp(argv[cur_arg + 1], "exp=", 4))) {
+	  if (scan_double(&(argv[cur_arg + 1][4]), &distance_exp)) {
+	    sprintf(logbuf, "Error: Invalid --distance-wts exponent '%s'.\n", &(argv[cur_arg + 1][4]));
+	    goto main_ret_INVALID_CMDLINE_WW;
+	  }
+	} else {
+	  UNSTABLE("distance-wts");
+	  uii = 1;
+	  if (param_ct == 2) {
+	    if (!strcmp(argv[cur_arg + 1], "noheader")) {
+	      uii = 2;
+	    } else if (strcmp(argv[cur_arg + 2], "noheader")) {
+	      sprintf(logbuf, "Error: Invalid --distance-wts parameter '%s'.\n", argv[cur_arg + 2]);
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	    dist_calc_type |= DISTANCE_WTS_NOHEADER;
+	  }
+	  retval = alloc_fname(&distance_wts_fname, argv[cur_arg + uii], argptr, 0);
+	  if (retval) {
+	    goto main_ret_1;
+	  }
+	}
       } else if (!memcmp(argptr2, "istance-matrix", 15)) {
-	if (exponent != 0.0) {
+	if (distance_exp != 0.0) {
 	  logprint("Error: --distance-matrix cannot be used with --distance-exp.\n");
 	  goto main_ret_INVALID_CMDLINE;
 	}
@@ -6152,6 +6149,56 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_A;
 	}
 	load_rare = LOAD_RARE_DOSAGE;
+      } else if (!memcmp(argptr2, "fam", 4)) {
+	UNSTABLE("dfam");
+	if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 4)) {
+	  goto main_ret_INVALID_CMDLINE_2A;
+	}
+	for (uii = 1; uii <= param_ct; uii++) {
+	  if (!strcmp(argv[cur_arg + uii], "no-unrelateds")) {
+	    family_info.dfam_modifier |= DFAM_NO_UNRELATEDS;
+	  } else if (!strcmp(argv[cur_arg + uii], "perm")) {
+	    if (family_info.dfam_modifier & DFAM_MPERM) {
+	      logprint("Error: --dfam 'mperm' and 'perm' cannot be used together.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    }
+	    family_info.dfam_modifier |= DFAM_PERM;
+	  } else if (!strcmp(argv[cur_arg + uii], "perm-count")) {
+	    family_info.dfam_modifier |= DFAM_PERM_COUNT;
+	  } else if ((strlen(argv[cur_arg + uii]) > 6) && (!memcmp(argv[cur_arg + uii], "mperm=", 6))) {
+	    if (family_info.dfam_modifier & DFAM_PERM) {
+	      logprint("Error: --dfam 'mperm' and 'perm' cannot be used together.\n");
+	      goto main_ret_INVALID_CMDLINE_A;
+	    } else if (family_info.dfam_modifier & DFAM_MPERM) {
+	      logprint("Error: Duplicate --dfam 'mperm' modifier.\n");
+	      goto main_ret_INVALID_CMDLINE;
+	    }
+	    if (scan_posint_defcap(&(argv[cur_arg + uii][6]), &family_info.dfam_mperm_val)) {
+	      sprintf(logbuf, "Error: Invalid --dfam mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
+	      goto main_ret_INVALID_CMDLINE_WWA;
+	    }
+	    family_info.dfam_modifier |= DFAM_MPERM;
+	  } else if (!strcmp(argv[cur_arg + uii], "set-test")) {
+	    family_info.dfam_modifier |= DFAM_SET_TEST;
+	  } else if (!strcmp(argv[cur_arg + uii], "mperm")) {
+	    logprint("Error: Improper --dfam mperm syntax.  (Use '--dfam mperm=[value]'.)\n");
+	    goto main_ret_INVALID_CMDLINE;
+	  } else {
+	    sprintf(logbuf, "Error: Invalid --dfam parameter '%s'.\n", argv[cur_arg + uii]);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	}
+	calculation_type |= CALC_DFAM;
+      } else if (!memcmp(argptr2, "fam-no-unrelateds", 18)) {
+	// keep this undocumented flag since it makes DFAM correspond to the
+	// original sib-TDT.
+	if (!(calculation_type & CALC_DFAM)) {
+	  logprint("Error: --dfam-no-unrelateds must be used with --dfam.\n");
+	  goto main_ret_INVALID_CMDLINE;
+	}
+	family_info.dfam_modifier |= DFAM_NO_UNRELATEDS;
+	logprint("Note: --dfam-no-unrelateds flag deprecated.  Use '--dfam no-unrelateds'.\n");
+	goto main_param_zero;
       } else if (!memcmp(argptr2, "prime", 6)) {
 	logprint("Note: --dprime flag deprecated.  Use e.g. '--r2 dprime'.\n");
 	ld_info.modifier |= LD_DPRIME;
@@ -6374,19 +6421,24 @@ int32_t main(int32_t argc, char** argv) {
 	filter_flags |= FILTER_FAM_REQ | FILTER_BINARY_NONFOUNDERS;
 	goto main_param_zero;
       } else if (!memcmp(argptr2, "req", 4)) {
-	if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 1)) {
+	if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 2)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
-	if (param_ct) {
-	  if (strcmp(argv[cur_arg + 1], "counts")) {
-            sprintf(logbuf, "Error: Invalid --freq parameter '%s'.\n", argv[cur_arg + 1]);
+	for (uii = 1; uii <= param_ct; uii++) {
+	  if (!strcmp(argv[cur_arg + uii], "counts")) {
+	    misc_flags |= MISC_FREQ_COUNTS;
+	  } else if (!strcmp(argv[cur_arg + uii], "gz")) {
+	    misc_flags |= MISC_FREQ_GZ;
+	  } else {
+            sprintf(logbuf, "Error: Invalid --freq parameter '%s'.\n", argv[cur_arg + uii]);
 	    goto main_ret_INVALID_CMDLINE_WWA;
 	  }
-	  misc_flags |= MISC_FREQ_COUNTS;
 	}
 	calculation_type |= CALC_FREQ;
 	if (misc_flags & MISC_FREQ_COUNTS) {
 	  // --keep-allele-order also set for backward compatibility
+	  // placed here instead of a few lines up because '--freq --counts' is
+	  // permitted
 	  misc_flags |= MISC_KEEP_ALLELE_ORDER;
 	}
       } else if (!memcmp(argptr2, "reqx", 5)) {
@@ -6394,9 +6446,18 @@ int32_t main(int32_t argc, char** argv) {
 	  logprint("Error: --freqx cannot be used with --freq.\n");
 	  goto main_ret_INVALID_CMDLINE_A;
 	}
+	if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 1)) {
+	  goto main_ret_INVALID_CMDLINE_2A;
+	}
+	if (param_ct) {
+	  if (strcmp(argv[cur_arg + 1], "gz")) {
+	    sprintf(logbuf, "Error: Invalid --freqx parameter '%s'.\n", argv[cur_arg + 1]);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  misc_flags |= MISC_FREQ_GZ;
+	}
 	calculation_type |= CALC_FREQ;
 	misc_flags |= MISC_FREQX;
-	goto main_param_zero;
       } else if (!memcmp(argptr2, "rom", 4)) {
 	if (chrom_flag_present) {
 	  logprint("Error: --from cannot be used with --autosome{-xy} or --{not-}chr.\n");
@@ -6597,6 +6658,10 @@ int32_t main(int32_t argc, char** argv) {
 	logprint("Note: --flip-scan-verbose flag deprecated.  Use '--flip-scan verbose'.\n");
         ld_info.modifier |= LD_FLIPSCAN_VERBOSE;
       } else if (!memcmp(argptr2, "amily", 6)) {
+	if (calculation_type & CALC_DFAM) {
+	  logprint("Error: --family cannot be used with --dfam.\n");
+	  goto main_ret_INVALID_CMDLINE_A;
+	}
         misc_flags |= MISC_FAMILY_CLUSTERS;
 	filter_flags |= FILTER_FAM_REQ;
 	goto main_param_zero;
@@ -6785,7 +6850,7 @@ int32_t main(int32_t argc, char** argv) {
 	mtest_adjust |= ADJUST_GC;
 	goto main_param_zero;
       } else if (!memcmp(argptr2, "file", 5)) {
-	UNSTABLE;
+	UNSTABLE("gfile");
 	if (load_rare || (load_params & (~LOAD_PARAMS_FAM))) {
 	  goto main_ret_INVALID_CMDLINE_INPUT_CONFLICT;
 	}
@@ -6978,30 +7043,36 @@ int32_t main(int32_t argc, char** argv) {
 	hwe_modifier |= HWE_THRESH_ALL;
 	goto main_param_zero;
       } else if (!memcmp(argptr2, "et", 3)) {
-	if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 1)) {
+	if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 2)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
-	if (param_ct) {
-	  if (strcmp(argv[cur_arg + 1], "small-sample")) {
-            sprintf(logbuf, "Error: Invalid --het parameter '%s'.\n", argv[cur_arg + 1]);
+	for (uii = 1; uii <= param_ct; uii++) {
+	  if (!strcmp(argv[cur_arg + uii], "small-sample")) {
+	    misc_flags |= MISC_HET_SMALL_SAMPLE;
+	  } else if (!strcmp(argv[cur_arg + uii], "gz")) {
+	    misc_flags |= MISC_HET_GZ;
+	  } else {
+            sprintf(logbuf, "Error: Invalid --het parameter '%s'.\n", argv[cur_arg + uii]);
             goto main_ret_INVALID_CMDLINE_WWA;
 	  }
-	  misc_flags |= MISC_HET_SMALL_SAMPLE;
 	}
         calculation_type |= CALC_HET;
       } else if ((!memcmp(argptr2, "ardy", 5)) || (!memcmp(argptr2, "ardy midp", 10))) {
-	if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 1)) {
+	if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 2)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	if (argptr2[4]) {
 	  hwe_modifier |= HWE_MIDP;
 	}
-        if (param_ct) {
-	  if (strcmp(argv[cur_arg + 1], "midp")) {
-            sprintf(logbuf, "Error: Invalid --hardy parameter '%s'.\n", argv[cur_arg + 1]);
+	for (uii = 1; uii <= param_ct; uii++) {
+	  if (!strcmp(argv[cur_arg + uii], "midp")) {
+            hwe_modifier |= HWE_MIDP;
+	  } else if (!strcmp(argv[cur_arg + uii], "gz")) {
+	    hwe_modifier |= HWE_GZ;
+	  } else {
+            sprintf(logbuf, "Error: Invalid --hardy parameter '%s'.\n", argv[cur_arg + uii]);
             goto main_ret_INVALID_CMDLINE_WWA;
 	  }
-          hwe_modifier |= HWE_MIDP;
 	}
 	calculation_type |= CALC_HARDY;
       } else if (!memcmp(argptr2, "omozyg", 7)) {
@@ -7923,6 +7994,10 @@ int32_t main(int32_t argc, char** argv) {
 	  }
 	}
 	calculation_type |= CALC_DUPVAR;
+      } else if (!memcmp(argptr2, "d-pred", 7)) {
+	logprint("Error: --ld-pred is currently under development.\n");
+	retval = RET_CALC_NOT_YET_SUPPORTED;
+	goto main_ret_1;
       } else if ((!memcmp(argptr2, "ookup", 6)) ||
                  (!memcmp(argptr2, "ookup-list", 11)) ||
                  (!memcmp(argptr2, "ookup-gene", 11)) ||
@@ -8099,6 +8174,10 @@ int32_t main(int32_t argc, char** argv) {
 	  logprint("Error: --make-grm-bin cannot be used with --make-grm-gz.\n");
 	  goto main_ret_INVALID_CMDLINE_A;
 	}
+	if (distance_exp != 0.0) {
+	  logprint("Error: '--distance-wts exp=[x]' cannot be used with --make-grm-gz.\n");
+	  goto main_ret_INVALID_CMDLINE_A;
+	}
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 2)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
@@ -8136,6 +8215,10 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	calculation_type |= CALC_RELATIONSHIP;
       } else if (!memcmp(argptr2, "ake-grm-bin", 12)) {
+	if (distance_exp != 0.0) {
+	  logprint("Error: '--distance-wts exp=[x]' cannot be used with --make-grm-bin.\n");
+	  goto main_ret_INVALID_CMDLINE_A;
+	}
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 1)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
@@ -8160,6 +8243,10 @@ int32_t main(int32_t argc, char** argv) {
 	  logprint("Error: --make-rel cannot be used with --make-grm-gz/--make-grm-bin.\n");
 	  goto main_ret_INVALID_CMDLINE_A;
 	}
+	if (distance_exp != 0.0) {
+	  logprint("Error: '--distance-wts exp=[x]' cannot be used with --make-rel.\n");
+	  goto main_ret_INVALID_CMDLINE_A;
+	}
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 3)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
@@ -8538,6 +8625,9 @@ int32_t main(int32_t argc, char** argv) {
 	} else if (glm_modifier & (GLM_PERM | GLM_MPERM)) {
 	  sprintf(logbuf, "Error: --mperm cannot be used with --%s %sperm.\n", (glm_modifier & GLM_LOGISTIC)? "logistic" : "linear", (glm_modifier & GLM_PERM)? "" : "m");
 	  goto main_ret_INVALID_CMDLINE_2A;
+	} else if (family_info.dfam_modifier & (DFAM_PERM | DFAM_MPERM)) {
+	  sprintf(logbuf, "Error: --mperm cannot be used with --dfam %sperm.\n", (family_info.dfam_modifier & DFAM_PERM)? "" : "m");
+	  goto main_ret_INVALID_CMDLINE_2A;
 	} else if (cluster.modifier & (CLUSTER_CMH_PERM | CLUSTER_CMH_MPERM)) {
 	  sprintf(logbuf, "Error: --mperm cannot be used with --%s %sperm.\n", (cluster.modifier & CLUSTER_CMH_BD)? "bd" : "mh", (cluster.modifier & CLUSTER_CMH_PERM)? "" : "m");
 	  goto main_ret_INVALID_CMDLINE_2A;
@@ -8581,6 +8671,8 @@ int32_t main(int32_t argc, char** argv) {
           testmiss_modifier |= TESTMISS_MPERM;
 	  family_info.tdt_mperm_val = mperm_val;
 	  family_info.tdt_modifier |= TDT_MPERM;
+	  family_info.dfam_mperm_val = mperm_val;
+	  family_info.dfam_modifier |= DFAM_MPERM;
 	  family_info.qfam_mperm_val = mperm_val;
 	  family_info.qfam_modifier |= QFAM_MPERM;
           cluster.cmh_mperm_val = mperm_val;
@@ -8763,8 +8855,17 @@ int32_t main(int32_t argc, char** argv) {
 	}
 	filter_flags |= FILTER_FAM_REQ | FILTER_MAKE_FOUNDERS;
       } else if (!memcmp(argptr2, "issing", 7)) {
+	if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 1)) {
+	  goto main_ret_INVALID_CMDLINE_2A;
+	}
+        if (param_ct) {
+	  if (strcmp(argv[cur_arg + 1], "gz")) {
+	    sprintf(logbuf, "Error: Invalid --missing parameter '%s'.\n", argv[cur_arg + 1]);
+	    goto main_ret_INVALID_CMDLINE_WWA;
+	  }
+	  misc_flags |= MISC_MISSING_GZ;
+	}
 	calculation_type |= CALC_MISSING_REPORT;
-	goto main_param_zero;
       } else if (!memcmp(argptr2, "h", 2)) {
 	if (calculation_type & CALC_CMH) {
 	  logprint("Error: --mh is redundant with --bd.\n");
@@ -8789,7 +8890,7 @@ int32_t main(int32_t argc, char** argv) {
 	      goto main_ret_INVALID_CMDLINE;
 	    }
 	    if (scan_posint_defcap(&(argv[cur_arg + uii][6]), &(cluster.cmh_mperm_val))) {
-	      sprintf(logbuf, "Error: Invalid --mh mperm parameter '%s'.\n", argv[cur_arg + uii]);
+	      sprintf(logbuf, "Error: Invalid --mh mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
               goto main_ret_INVALID_CMDLINE_WWA;
 	    }
             cluster.modifier |= CLUSTER_CMH_MPERM;
@@ -9079,7 +9180,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_NOMEM;
 	}
       } else if (!memcmp(argptr2, "ac", 3)) {
-	UNSTABLE;
+	UNSTABLE("mac");
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
@@ -9088,7 +9189,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
       } else if (!memcmp(argptr2, "ax-mac", 7)) {
-	UNSTABLE;
+	UNSTABLE("max-mac");
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
@@ -9322,7 +9423,11 @@ int32_t main(int32_t argc, char** argv) {
 	}
       } else if (!memcmp(argptr2, "xford-single-chr", 17)) {
 	if (!(load_params & LOAD_PARAMS_OXGEN)) {
-	  logprint("Error: --oxford-single-chr must be used with .gen input.\n");
+	  if (load_params & LOAD_PARAMS_OXBGEN) {
+	    logprint("Error: --oxford-single-chr must be used with .gen input.  (Single-chromosome\n.bgen files do not require this, since they still contain chromosome codes.)\n");
+	  } else {
+	    logprint("Error: --oxford-single-chr must be used with .gen input.\n");
+	  }
 	  goto main_ret_INVALID_CMDLINE_A;
 	}
         if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
@@ -9472,14 +9577,19 @@ int32_t main(int32_t argc, char** argv) {
 	  ppc_gap = (int32_t)(dxx * (1 + SMALL_EPSILON));
 	}
       } else if (!memcmp(argptr2, "erm", 4)) {
-	if ((model_modifier & MODEL_MPERM) && (calculation_type & CALC_MODEL)) {
-	  sprintf(logbuf, "Error: --perm cannot be used with --%s mperm.\n", (model_modifier & MODEL_ASSOC)? "assoc" : "model");
-	  goto main_ret_INVALID_CMDLINE_2A;
+	if (model_modifier & MODEL_MPERM) {
+          if (calculation_type & CALC_MODEL) {
+	    sprintf(logbuf, "Error: --perm cannot be used with --%s mperm.\n", (model_modifier & MODEL_ASSOC)? "assoc" : "model");
+	    goto main_ret_INVALID_CMDLINE_2A;
+	  } else {
+	    logprint("Error: --perm cannot be used with --mperm.\n");
+	    goto main_ret_INVALID_CMDLINE_A;
+	  }
 	} else if ((calculation_type & CALC_GLM) && (glm_modifier & (GLM_MPERM | GLM_NO_SNP))) {
 	  sprintf(logbuf, "Error: --perm cannot be used with --%s %s.\n", (glm_modifier & GLM_LOGISTIC)? "logistic" : "linear", (glm_modifier & GLM_MPERM)? "mperm" : "no-snp");
 	  goto main_ret_INVALID_CMDLINE_2A;
-	} else if (model_modifier & MODEL_MPERM) {
-	  logprint("Error: --perm cannot be used with --mperm.\n");
+	} else if (family_info.dfam_modifier & DFAM_MPERM) {
+	  logprint("Error: --perm cannot be used with --dfam mperm.\n");
 	  goto main_ret_INVALID_CMDLINE_A;
 	} else if (calculation_type & CALC_CMH) {
           if (cluster.modifier & CLUSTER_CMH_MPERM) {
@@ -9494,6 +9604,7 @@ int32_t main(int32_t argc, char** argv) {
         glm_modifier |= GLM_PERM;
         testmiss_modifier |= TESTMISS_PERM;
 	family_info.tdt_modifier |= TDT_PERM;
+	family_info.dfam_modifier |= DFAM_PERM;
 	family_info.qfam_modifier |= QFAM_PERM;
 	cluster.modifier |= CLUSTER_CMH_PERM;
 	logprint("Note: --perm flag deprecated.  Use e.g. '--model perm'.\n");
@@ -9502,6 +9613,8 @@ int32_t main(int32_t argc, char** argv) {
 	model_modifier |= MODEL_PERM_COUNT;
 	glm_modifier |= GLM_PERM_COUNT;
         testmiss_modifier |= TESTMISS_PERM_COUNT;
+	family_info.tdt_modifier |= TDT_PERM_COUNT;
+	family_info.dfam_modifier |= DFAM_PERM_COUNT;
         family_info.qfam_modifier |= QFAM_PERM_COUNT;
 	cluster.modifier |= CLUSTER_CMH_PERM_COUNT;
 	logprint("Note: --perm-count flag deprecated.  Use e.g. '--model perm-count'.\n");
@@ -9526,7 +9639,7 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_A;
 	}
 	pfilter = dxx;
-      } else if (!memcmp(argptr2, "erm-batch-size", 1)) {
+      } else if (!memcmp(argptr2, "erm-batch-size", 15)) {
 	if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
@@ -9815,11 +9928,13 @@ int32_t main(int32_t argc, char** argv) {
 	  logprint("Error: Only one QFAM test can be run at a time.\n");
 	  goto main_ret_INVALID_CMDLINE_A;
 	}
-	if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 2)) {
+	if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 3)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	for (uii = 1; uii <= param_ct; uii++) {
-	  if (!strcmp(argv[cur_arg + uii], "perm")) {
+	  if (!strcmp(argv[cur_arg + uii], "emp-se")) {
+	    family_info.qfam_modifier |= QFAM_EMP_SE;
+	  } else if (!strcmp(argv[cur_arg + uii], "perm")) {
 	    if (family_info.qfam_modifier & QFAM_MPERM) {
 	      sprintf(logbuf, "Error: --%s 'mperm' and 'perm' cannot be used together.\n", argptr);
 	      goto main_ret_INVALID_CMDLINE_2A;
@@ -9834,7 +9949,7 @@ int32_t main(int32_t argc, char** argv) {
 	      goto main_ret_INVALID_CMDLINE_2;
 	    }
 	    if (scan_posint_defcap(&(argv[cur_arg + uii][6]), &(family_info.qfam_mperm_val))) {
-	      sprintf(logbuf, "Error: Invalid --%s mperm parameter '%s'.\n", argptr, argv[cur_arg + uii]);
+	      sprintf(logbuf, "Error: Invalid --%s mperm parameter '%s'.\n", argptr, &(argv[cur_arg + uii][6]));
               goto main_ret_INVALID_CMDLINE_WWA;
 	    }
             family_info.qfam_modifier |= QFAM_MPERM;
@@ -10201,6 +10316,8 @@ int32_t main(int32_t argc, char** argv) {
 	      goto main_ret_INVALID_CMDLINE_A;
 	    }
 	    recode_modifier |= RECODE_DELIMX;
+	  } else if (!strcmp(argv[cur_arg + uii], "bgz")) {
+	    recode_modifier |= RECODE_BGZ;
 	  } else if (!strcmp(argv[cur_arg + uii], "beagle")) {
 	    if (recode_type_set(&recode_modifier, RECODE_BEAGLE)) {
 	      goto main_ret_INVALID_CMDLINE_A;
@@ -10295,6 +10412,10 @@ int32_t main(int32_t argc, char** argv) {
 	  logprint("Error: --recode 'include-alt' modifier must be used with 'A' or 'AD'.\n");
 	  goto main_ret_INVALID_CMDLINE_A;
 	}
+	if ((recode_modifier & RECODE_BGZ) && (!(recode_modifier & RECODE_VCF))) {
+	  logprint("Error: --recode 'bgz' modifier must be used with VCF output.\n");
+	  goto main_ret_INVALID_CMDLINE_A;
+	}
 	calculation_type |= CALC_RECODE;
       } else if (!memcmp(argptr2, "ecode-whap", 11)) {
         logprint("Error: --recode-whap flag retired since WHAP is no longer supported.\n");
@@ -10854,6 +10975,12 @@ int32_t main(int32_t argc, char** argv) {
 	  }
 	  glm_modifier |= GLM_SET_TEST;
 	}
+	if (calculation_type & CALC_TDT) {
+	  family_info.tdt_modifier |= TDT_SET_TEST;
+	}
+	if (calculation_type & CALC_DFAM) {
+	  family_info.dfam_modifier |= DFAM_SET_TEST;
+	}
 	if ((calculation_type & CALC_CMH) && (!(cluster.modifier & CLUSTER_CMH2))) {
 	  cluster.modifier |= CLUSTER_CMH_SET_TEST;
 	}
@@ -11399,6 +11526,35 @@ int32_t main(int32_t argc, char** argv) {
 	  goto main_ret_INVALID_CMDLINE_WWA;
 	}
 	filter_flags |= FILTER_BIM_REQ | FILTER_DOSAGEMAP | FILTER_NOCNV;
+      } else if (!memcmp(argptr2, "hin-indiv", 10)) {
+	UNSTABLE("thin-indiv");
+        if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
+          goto main_ret_INVALID_CMDLINE_2A;
+        }
+        if (scan_double(argv[cur_arg + 1], &thin_keep_sample_prob)) {
+          sprintf(logbuf, "Error: Invalid --thin-indiv %s retention probability '%s'.\n", g_species_singular, argv[cur_arg + 1]);
+          goto main_ret_INVALID_CMDLINE_WWA;
+        }
+        if (thin_keep_sample_prob < (0.5 / 4294967296.0)) {
+          LOGPRINTF("Error: --thin-indiv %s retention probability too small.\n", g_species_singular);
+          goto main_ret_INVALID_CMDLINE_A;
+        } else if (thin_keep_sample_prob >= (4294967295.5 / 4294967296.0)) {
+          LOGPRINTF("Error: --thin-indiv %s retention probability too large.\n", g_species_singular);
+          goto main_ret_INVALID_CMDLINE_A;
+        }
+      } else if (!memcmp(argptr2, "hin-indiv-count", 16)) {
+	UNSTABLE("thin-indiv-count");
+        if (thin_keep_sample_prob != 1.0) {
+          logprint("Error: --thin-indiv cannot be used with --thin-indiv-count.\n");
+          goto main_ret_INVALID_CMDLINE_WWA;
+        }
+        if (enforce_param_ct_range(param_ct, argv[cur_arg], 1, 1)) {
+          goto main_ret_INVALID_CMDLINE_2A;
+        }
+        if (scan_posint_defcap(argv[cur_arg + 1], &thin_keep_sample_ct)) {
+          sprintf(logbuf, "Error: Invalid --thin-indiv-count parameter '%s'.\n", argv[cur_arg + 1]);
+          goto main_ret_INVALID_CMDLINE_WWA;
+        }
       } else if (!memcmp(argptr2, "ests", 5)) {
 	if (!(calculation_type & CALC_GLM)) {
 	  logprint("Error: --tests must be used with --linear or --logistic.\n");
@@ -11460,7 +11616,7 @@ int32_t main(int32_t argc, char** argv) {
               goto main_ret_INVALID_CMDLINE;
 	    }
 	    if (scan_posint_defcap(&(argv[cur_arg + uii][6]), &testmiss_mperm_val)) {
-	      sprintf(logbuf, "Error: Invalid --test-missing mperm parameter '%s'.\n", argv[cur_arg + uii]);
+	      sprintf(logbuf, "Error: Invalid --test-missing mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
               goto main_ret_INVALID_CMDLINE_WWA;
 	    }
             testmiss_modifier |= TESTMISS_MPERM;
@@ -11481,7 +11637,7 @@ int32_t main(int32_t argc, char** argv) {
         calculation_type |= CALC_TESTMISHAP;
         goto main_param_zero;
       } else if (!memcmp(argptr2, "dt", 3)) {
-        if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 4)) {
+        if (enforce_param_ct_range(param_ct, argv[cur_arg], 0, 5)) {
 	  goto main_ret_INVALID_CMDLINE_2A;
 	}
 	for (uii = 1; uii <= param_ct; uii++) {
@@ -11515,6 +11671,8 @@ int32_t main(int32_t argc, char** argv) {
 	      goto main_ret_INVALID_CMDLINE_A;
 	    }
 	    family_info.tdt_modifier |= TDT_PERM;
+	  } else if (!strcmp(argv[cur_arg + uii], "perm-count")) {
+	    family_info.tdt_modifier |= TDT_PERM_COUNT;
 	  } else if ((strlen(argv[cur_arg + uii]) > 6) && (!memcmp(argv[cur_arg + uii], "mperm=", 6))) {
 	    if (family_info.tdt_modifier & TDT_PERM) {
 	      logprint("Error: --tdt 'mperm' and 'perm' cannot be used together.\n");
@@ -11524,7 +11682,7 @@ int32_t main(int32_t argc, char** argv) {
 	      goto main_ret_INVALID_CMDLINE;
 	    }
 	    if (scan_posint_defcap(&(argv[cur_arg + uii][6]), &family_info.tdt_mperm_val)) {
-	      sprintf(logbuf, "Error: Invalid --tdt mperm parameter '%s'.\n", argv[cur_arg + uii]);
+	      sprintf(logbuf, "Error: Invalid --tdt mperm parameter '%s'.\n", &(argv[cur_arg + uii][6]));
               goto main_ret_INVALID_CMDLINE_WWA;
 	    }
             family_info.tdt_modifier |= TDT_MPERM;
@@ -11634,7 +11792,7 @@ int32_t main(int32_t argc, char** argv) {
         logprint("Error: --unrelated-heritability requires " PROG_NAME_CAPS " to be built with LAPACK.\n");
 	goto main_ret_INVALID_CMDLINE;
 #else
-	UNSTABLE;
+	UNSTABLE("unrelated-heritability");
 	if (rel_info.modifier & REL_CALC_COV) {
 	  logprint("Error: --unrelated-heritability flag cannot coexist with a covariance\nmatrix calculation.\n");
 	  goto main_ret_INVALID_CMDLINE_A;
@@ -11760,7 +11918,7 @@ int32_t main(int32_t argc, char** argv) {
 	if (retval) {
 	  goto main_ret_1;
 	}
-        filter_flags |= FILTER_BIM_REQ;
+        filter_flags |= FILTER_FAM_REQ;
       } else if (!memcmp(argptr2, "pdate-map", 10)) {
 	if (cnv_calc_type & CNV_MAKE_MAP) {
 	  logprint("--update-map cannot be used with --cnv-make-map.\n");
@@ -12315,8 +12473,8 @@ int32_t main(int32_t argc, char** argv) {
     calculation_type |= CALC_PLINK1_IBS_MATRIX;
   }
   if (calculation_type & CALC_PLINK1_IBS_MATRIX) {
-    if (exponent != 0.0) {
-      logprint("Error: --ibs-matrix cannot be used with --distance-exp.\n");
+    if (distance_wts_fname || (distance_exp != 0.0)) {
+      logprint("Error: --ibs-matrix cannot be used with --distance-wts.\n");
       goto main_ret_INVALID_CMDLINE;
     }
     if (dist_calc_type & DISTANCE_IBS) {
@@ -12336,6 +12494,10 @@ int32_t main(int32_t argc, char** argv) {
       goto main_ret_INVALID_CMDLINE_A;
     }
   }
+  if (distance_wts_fname && (!(calculation_type & (CALC_DISTANCE | CALC_RELATIONSHIP)))) {
+    logprint("Error: --distance-wts must be used with --distance, --make-rel, --make-grm-bin,\nor --make-grm-gz.\n");
+    goto main_ret_INVALID_CMDLINE_A;
+  }
   if ((parallel_tot > 1) && (!(calculation_type & (CALC_LD | CALC_DISTANCE | CALC_GENOME | CALC_RELATIONSHIP)))) {
     if ((!(calculation_type & CALC_EPI)) || (!(epi_info.modifier & (EPI_FAST | EPI_REG)))) {
       logprint("Error: --parallel only affects --r/--r2, --distance, --genome, --make-rel,\n--make-grm-gz/--make-grm-bin, and --epistasis/--fast-epistasis.\n");
@@ -12367,8 +12529,11 @@ int32_t main(int32_t argc, char** argv) {
       goto main_ret_INVALID_CMDLINE_A;
     }
   }
-  if ((family_info.mendel_modifier & (MENDEL_DUOS | MENDEL_MULTIGEN)) && (!(calculation_type & CALC_MENDEL)) && (!(family_info.mendel_modifier & MENDEL_FILTER)) && (!(misc_flags & MISC_SET_ME_MISSING))) {
-    logprint("Error: --mendel-duos/--mendel-multigen must be used with\n--me/--mendel/--set-me-missing.\n");
+  if ((family_info.mendel_modifier & MENDEL_DUOS) && (!(calculation_type & CALC_MENDEL)) && (!(family_info.mendel_modifier & MENDEL_FILTER)) && (!(misc_flags & MISC_SET_ME_MISSING))) {
+    logprint("Error: --mendel-duos must be used with --me/--mendel/--set-me-missing.\n");
+    goto main_ret_INVALID_CMDLINE;
+  } else if ((family_info.mendel_modifier & MENDEL_MULTIGEN) && (!(calculation_type & (CALC_MENDEL | CALC_TDT | CALC_DFAM | CALC_QFAM))) && (!(family_info.mendel_modifier & MENDEL_FILTER)) && (!(misc_flags & MISC_SET_ME_MISSING))) {
+    logprint("Error: --mendel-multigen must be used with --me, --mendel, --set-me-missing, or\nan association test which checks for Mendel errors.\n");
     goto main_ret_INVALID_CMDLINE;
   }
   if (flip_subset_fname && (load_rare || (calculation_type != CALC_MAKE_BED) || (min_maf != 0.0) || (max_maf != 0.5) || (hwe_thresh != 0.0))) {
@@ -12519,8 +12684,18 @@ int32_t main(int32_t argc, char** argv) {
       goto main_ret_1;
       uii = 1;
     }
+    if (family_info.dfam_modifier & DFAM_SET_TEST) {
+      if (!(family_info.dfam_modifier & (DFAM_PERM | DFAM_MPERM))) {
+        logprint("Error: --dfam set-test requires permutation.\n");
+        goto main_ret_INVALID_CMDLINE_A;
+      }
+      logprint("Error: --dfam set-test is currently under development.\n");
+      retval = RET_CALC_NOT_YET_SUPPORTED;
+      goto main_ret_1;
+      uii = 1;
+    }
     if (cluster.modifier & CLUSTER_CMH_SET_TEST) {
-      if (!(family_info.tdt_modifier & (TDT_PERM | TDT_MPERM))) {
+      if (!(cluster.modifier & (CLUSTER_CMH_PERM | CLUSTER_CMH_MPERM))) {
         logprint("Error: --mh/--bd set-test requires permutation.\n");
         goto main_ret_INVALID_CMDLINE_A;
       }
@@ -12727,6 +12902,10 @@ int32_t main(int32_t argc, char** argv) {
     logprint("Error: --gen/--bgen cannot be used without --data or --sample.\n");
     goto main_ret_INVALID_CMDLINE_A;
   }
+  if ((merge_type & MERGE_EQUAL_POS) && (!(calculation_type & CALC_MERGE))) {
+    logprint("Error: --merge-equal-pos must be used with --merge/--bmerge/--merge-list.\n(Note that you are permitted to merge a fileset with itself.)\n");
+    goto main_ret_INVALID_CMDLINE_A;
+  }
   // short batch job?
   uii = 0;
   if ((!calculation_type) && (!(load_rare & (LOAD_RARE_LGEN | LOAD_RARE_DUMMY | LOAD_RARE_SIMULATE | LOAD_RARE_TRANSPOSE_MASK | LOAD_RARE_23 | LOAD_RARE_CNV | LOAD_RARE_VCF | LOAD_RARE_BCF)))) {
@@ -12860,6 +13039,7 @@ int32_t main(int32_t argc, char** argv) {
       logprint("Error: --dosage cannot be used with other PLINK computations.\n");
       goto main_ret_INVALID_CMDLINE;
     }
+    pigz_init(g_thread_ct);
     retval = plink1_dosage(&dosage_info, famname, mapname, outname, outname_end, phenoname, extractname, excludename, keepname, removename, keepfamname, removefamname, filtername, makepheno_str, phenoname_str, covar_fname, qual_filter, update_map, update_name, update_ids_fname, update_parents_fname, update_sex_fname, filtervals_flattened, filter_attrib_fname, filter_attrib_liststr, filter_attrib_sample_fname, filter_attrib_sample_liststr, qual_min_thresh, qual_max_thresh, thin_keep_prob, [...]
     // unconditional; note that plink1_dosage() currently doesn't even bother
     // to pop stuff off the stack when it's done
@@ -12956,7 +13136,7 @@ int32_t main(int32_t argc, char** argv) {
     } else if (!rel_info.ibc_type) {
       rel_info.ibc_type = 1;
     }
-    retval = plink(outname, outname_end, pedname, mapname, famname, cm_map_fname, cm_map_chrname, phenoname, extractname, excludename, keepname, removename, keepfamname, removefamname, filtername, freqname, read_dists_fname, read_dists_id_fname, evecname, mergename1, mergename2, mergename3, missing_mid_template, missing_marker_id_match, makepheno_str, phenoname_str, a1alleles, a2alleles, recode_allele_name, covar_fname, update_alleles_fname, read_genome_fname, qual_filter, update_chr, up [...]
+    retval = plink(outname, outname_end, pedname, mapname, famname, cm_map_fname, cm_map_chrname, phenoname, extractname, excludename, keepname, removename, keepfamname, removefamname, filtername, freqname, distance_wts_fname, read_dists_fname, read_dists_id_fname, evecname, mergename1, mergename2, mergename3, missing_mid_template, missing_marker_id_match, makepheno_str, phenoname_str, a1alleles, a2alleles, recode_allele_name, covar_fname, update_alleles_fname, read_genome_fname, qual_fi [...]
   }
   while (0) {
   main_ret_NOMEM:
@@ -13004,7 +13184,11 @@ int32_t main(int32_t argc, char** argv) {
 #ifdef STABLE_BUILD
     break;
   main_unstable_disabled:
-    logprint("Error: This flag's implementation is unfinished or unstable.  If you wish to\ntest it, use the latest development build.\n");
+    // see the UNSTABLE macro in plink_common.h
+    memcpy(logbuf, "Error: --", 9);
+    strcpy(sptr, " is either unfinished or not yet well-tested. If you wish to help with testing, use the latest development build.\n");
+    wordwrap(logbuf, 0);
+    logprintb();
     retval = RET_CALC_NOT_YET_SUPPORTED;
 #endif
   }
@@ -13026,6 +13210,7 @@ int32_t main(int32_t argc, char** argv) {
   free_cond(filtervals_flattened);
   free_cond(evecname);
   free_cond(filtername);
+  free_cond(distance_wts_fname);
   free_cond(read_dists_fname);
   free_cond(read_dists_id_fname);
   free_cond(freqname);
diff --git a/plink_assoc.c b/plink_assoc.c
index 9ef4a8e..dded1e3 100644
--- a/plink_assoc.c
+++ b/plink_assoc.c
@@ -810,7 +810,7 @@ void transpose_perms(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno_
 	  memcpy(perm_vecst, wbuf, 16);
 	  perm_vecst = &(perm_vecst[4]);
 	transpose_perms_loop_start:
-	  fill_ulong_zero((uintptr_t*)wbuf, 2);
+	  fill_uint_zero(wbuf, 4);
 	  wshift = 0;
 	}
 	wbptr = wbuf;
@@ -864,7 +864,7 @@ void transpose_perm1s(uintptr_t* perm_vecs, uint32_t perm_vec_ct, uint32_t pheno
 	  memcpy(perm_vecst, wbuf, 16);
 	  perm_vecst = &(perm_vecst[4]);
 	transpose_perm1s_loop_start:
-	  fill_ulong_zero((uintptr_t*)wbuf, 2);
+	  fill_uint_zero(wbuf, 2);
 	  wshift = 0;
 	}
 	wbptr = wbuf;
@@ -6421,7 +6421,7 @@ THREAD_RET_TYPE model_set_best_thread(void* arg) {
   }
 }
 
-int32_t model_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, char* outname_end2, uint32_t model_modifier, uint32_t model_mperm_val, double pfilter, double output_min_p, uint32_t mtest_adjust, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, uintptr_t* marker_exclude_mid, uintptr_t marker_ct_mid, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t* marker_reverse, Chrom_info* chrom_inf [...]
+int32_t model_assoc_set_test(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, char* outname_end2, uint32_t model_modifier, uint32_t model_mperm_val, double pfilter, double output_min_p, uint32_t mtest_adjust, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, uintptr_t* marker_exclude_mid, uintptr_t marker_ct_mid, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t* marker_reverse, Chrom_info* chrom_inf [...]
   // Could reuse more of the code in model_assoc() since there's considerable
   // overlap, but there are enough differences between the regular and set
   // permutation tests that separating this out and doing a fair bit of
@@ -8347,7 +8347,7 @@ int32_t model_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, cha
 	}
       }
     } else {
-      retval = model_assoc_set_test(threads, bedfile, bed_offset, outname, outname_end, outname_end2, model_modifier, model_mperm_val, pfilter, output_min_p, mtest_adjust, unfiltered_marker_ct, marker_exclude_orig, marker_ct_orig, marker_exclude, marker_ct, marker_ids, max_marker_id_len, marker_reverse, chrom_info_ptr, unfiltered_sample_ct, sex_male, apip, pheno_nm_ct, pheno_nm, pheno_c, founder_pnm, gender_req, ld_ignore_x, hh_exists, perm_batch_size, sip, loadbuf_raw);
+      retval = model_assoc_set_test(threads, bedfile, bed_offset, outname, outname_end, outname_end2, model_modifier, model_mperm_val, pfilter, output_min_p, mtest_adjust, unfiltered_marker_ct, marker_exclude_orig, marker_ct_orig, marker_exclude, marker_ct, marker_ids, max_marker_id_len, marker_reverse, chrom_info_ptr, unfiltered_sample_ct, sex_male, apip, pheno_nm_ct, pheno_nm, founder_pnm, gender_req, ld_ignore_x, hh_exists, perm_batch_size, sip, loadbuf_raw);
       if (retval) {
         goto model_assoc_ret_1;
       }
diff --git a/plink_calc.c b/plink_calc.c
index ef4e2a9..547c404 100644
--- a/plink_calc.c
+++ b/plink_calc.c
@@ -117,7 +117,7 @@ void rel_cleanup(Rel_info* relip) {
   free_cond(relip->pca_clusters_fname);
 }
 
-void update_rel_ibc(double* rel_ibc, uintptr_t* geno, double* set_allele_freqs, int32_t ibc_type, uint32_t sample_ct, uint32_t window_size) {
+void update_rel_ibc(double* rel_ibc, uintptr_t* geno, double* set_allele_freqs, double* main_weights, int32_t ibc_type, uint32_t sample_ct, uint32_t window_size) {
   // first calculate weight array, then loop
   uint32_t uii;
   uint32_t ujj;
@@ -199,134 +199,10 @@ void update_rel_ibc(double* rel_ibc, uintptr_t* geno, double* set_allele_freqs,
         }
       }
     }
-  }
-  for (ukk = 0; ukk < (BITCT * 5) / 32; ukk++) {
-    wtptr = &(wtarr[16 * ukk]);
-#ifdef __LP64__
-    if ((ukk == 2) || (ukk == 7)) {
-      for (uii = 0; uii < 8; uii++) {
-	twt = wtptr[uii + 8];
-	for (ujj = 0; ujj < 8; ujj++) {
-	  *wptr++ = twt + wtptr[ujj];
-	}
-	wptr = &(wptr[8]);
-      }
-    } else {
-      for (uii = 0; uii < 8; uii++) {
-	twt = wtptr[uii + 8];
-	for (ujj = 0; ujj < 8; ujj++) {
-	  *wptr++ = twt + wtptr[ujj];
-	}
-      }
-    }
-#else
-    if (ukk == 2) {
-      for (uii = 0; uii < 8; uii++) {
-	twt = wtptr[uii + 8];
-	for (ujj = 0; ujj < 8; ujj++) {
-	  *wptr++ = twt + wtptr[ujj];
-	}
-	wptr = &(wptr[8]);
-      }
-    } else {
-      for (uii = 0; uii < 8; uii++) {
-	twt = wtptr[uii + 8];
-	for (ujj = 0; ujj < 8; ujj++) {
-	  *wptr++ = twt + wtptr[ujj];
-	}
-      }
-    }
-#endif
-  }
-  for (umm = 0; umm < sample_ct; umm++) {
-    ulii = *geno++;
-#ifdef __LP64__
-    *rel_ibc += weights9[ulii >> 57] + weights8[(ulii >> 51) & 63] + weights7[(ulii >> 44) & 127] + weights6[(ulii >> 38) & 63] + weights5[(ulii >> 32) & 63] + weights4[(ulii >> 25) & 63] + weights3[(ulii >> 19) & 63] + weights2[(ulii >> 12) & 127] + weights1[(ulii >> 6) & 63] + weights[ulii & 63];
-#else
-    *rel_ibc += weights4[ulii >> 25] + weights3[(ulii >> 19) & 63] + weights2[(ulii >> 12) & 127] + weights1[(ulii >> 6) & 63] + weights[ulii & 63];
-#endif
-    rel_ibc++;
-  }
-}
-
-void update_rel_f_ibc(float* rel_ibc, uintptr_t* geno, float* set_allele_freqs, int32_t ibc_type, uint32_t sample_ct, uint32_t window_size) {
-  // first calculate weight array, then loop
-  uint32_t uii;
-  uint32_t ujj;
-  uint32_t ukk;
-  uint32_t umm;
-  float twt;
-  float* wtptr;
-  float mult = 1.0;
-  uintptr_t ulii;
-  float weights[BITCT * 12];
-  float* weights1 = &(weights[64]);
-  float* weights2 = &(weights[128]);
-  float* weights3 = &(weights[256]);
-  float* weights4 = &(weights[320]);
-#ifdef __LP64__
-  float* weights5 = &(weights[384]);
-  float* weights6 = &(weights[448]);
-  float* weights7 = &(weights[512]);
-  float* weights8 = &(weights[640]);
-  float* weights9 = &(weights[704]);
-#endif
-  float wtarr[BITCT2 * 5];
-  float *wptr = weights;
-  fill_float_zero(wtarr, BITCT2 * 5);
-  for (uii = 0; uii < window_size; uii += 1) {
-    if ((set_allele_freqs[uii] != 0.0) && (set_allele_freqs[uii] < (1.0 - EPSILON))) {
-      if (ibc_type) {
-        if (ibc_type == 2) {
-          wtarr[uii * 8] = 2;
-          wtarr[uii * 8 + 2] = 2.0 - 1.0 / (2 * set_allele_freqs[uii] * (1.0 - set_allele_freqs[uii]));
-          wtarr[uii * 8 + 3] = 2;
-        } else {
-          twt = 2 * set_allele_freqs[uii];
-          if (ibc_type == 1) {
-            mult = 1 / (twt * (1.0 - set_allele_freqs[uii]));
-          }
-          wtarr[uii * 8] = twt * twt * mult;
-          wtarr[uii * 8 + 2] = (1.0 - twt) * (1.0 - twt) * mult;
-          wtarr[uii * 8 + 3] = (2.0 - twt) * (2.0 - twt) * mult;
-        }
-      } else {
-        twt = 1.0 - set_allele_freqs[uii];
-        mult = 1 / (set_allele_freqs[uii] * twt);
-        wtarr[uii * 8] = 1.0 + set_allele_freqs[uii] * set_allele_freqs[uii] * mult;
-        wtarr[uii * 8 + 3] = 1.0 + twt * twt * mult;
-      }
-    } else {
-      if (ibc_type) {
-        if (ibc_type == -1) {
-          twt = 2 * set_allele_freqs[uii];
-          wtarr[uii * 8] = twt * twt;
-          wtarr[uii * 8 + 2] = (1.0 - twt) * (1.0 - twt);
-          wtarr[uii * 8 + 3] = (2.0 - twt) * (2.0 - twt);
-        } else if (ibc_type == 1) {
-	  wtarr[uii * 8 + 2] = INFINITY;
-          if (set_allele_freqs[uii] == 0.0) {
-            wtarr[uii * 8] = 0;
-            wtarr[uii * 8 + 3] = INFINITY;
-          } else {
-            wtarr[uii * 8] = INFINITY;
-            wtarr[uii * 8 + 3] = 0;
-          }
-        } else {
-          // need to set to 1 instead of 2 for agreement with GCTA
-          wtarr[uii * 8] = 1;
-          wtarr[uii * 8 + 2] = -INFINITY;
-          wtarr[uii * 8 + 3] = 1;
-        }
-      } else {
-        if (set_allele_freqs[uii] == 0.0) {
-          wtarr[uii * 8] = 1;
-          wtarr[uii * 8 + 3] = INFINITY;
-        } else {
-          wtarr[uii * 8] = INFINITY;
-          wtarr[uii * 8 + 3] = 1;
-        }
-      }
+    if (main_weights) {
+      wtarr[uii * 8] *= main_weights[uii];
+      wtarr[uii * 8 + 2] *= main_weights[uii];
+      wtarr[uii * 8 + 3] *= main_weights[uii];
     }
   }
   for (ukk = 0; ukk < (BITCT * 5) / 32; ukk++) {
@@ -378,7 +254,7 @@ void update_rel_f_ibc(float* rel_ibc, uintptr_t* geno, float* set_allele_freqs,
   }
 }
 
-void fill_weights(double* weights, double* set_allele_freqs, double exponent) {
+void fill_subset_weights(double* subset_weights, double* main_weights) {
   uint32_t uii;
   uint32_t ujj;
   uint32_t ukk;
@@ -390,7 +266,7 @@ void fill_weights(double* weights, double* set_allele_freqs, double exponent) {
 #ifdef __LP64__
   double twt[5];
   double twtf;
-  __m128d* wpairs = (__m128d*)weights;
+  __m128d* swpairs = (__m128d*)subset_weights;
   __m128d vpen;
   __m128d vfinal1;
   __m128d vfinal2;
@@ -399,9 +275,7 @@ void fill_weights(double* weights, double* set_allele_freqs, double exponent) {
   uint32_t uqq;
   double twt[7];
 #endif
-  for (uii = 0; uii < MULTIPLEX_DIST_EXP / 2; uii++) {
-    wtarr[uii] = pow(2 * set_allele_freqs[uii] * (1.0 - set_allele_freqs[uii]), -exponent);
-  }
+  memcpy(wtarr, main_weights, (MULTIPLEX_DIST_EXP / 2) * sizeof(double));
   for (uoo = 0; uoo < 2; uoo++) {
     wt = &(wtarr[7 * uoo]);
 #ifdef __LP64__
@@ -438,19 +312,19 @@ void fill_weights(double* weights, double* set_allele_freqs, double exponent) {
 #ifdef __LP64__
 	      twtf = twt[4];
 	      vpen = _mm_set1_pd(twtf);
-	      *wpairs++ = _mm_add_pd(vpen, vfinal1);
-	      *wpairs++ = _mm_add_pd(vpen, vfinal2);
+	      *swpairs++ = _mm_add_pd(vpen, vfinal1);
+	      *swpairs++ = _mm_add_pd(vpen, vfinal2);
 	      twtf += wt[1];
 	      vpen = _mm_set1_pd(twtf);
-	      *wpairs++ = _mm_add_pd(vpen, vfinal1);
-	      *wpairs++ = _mm_add_pd(vpen, vfinal2);
-	      *wpairs = *(wpairs - 2);
-	      wpairs++;
-	      *wpairs = *(wpairs - 2);
-	      wpairs++;
+	      *swpairs++ = _mm_add_pd(vpen, vfinal1);
+	      *swpairs++ = _mm_add_pd(vpen, vfinal2);
+	      *swpairs = *(swpairs - 2);
+	      swpairs++;
+	      *swpairs = *(swpairs - 2);
+	      swpairs++;
 	      vpen = _mm_set1_pd(twtf + wt[1]);
-	      *wpairs++ = _mm_add_pd(vpen, vfinal1);
-	      *wpairs++ = _mm_add_pd(vpen, vfinal2);
+	      *swpairs++ = _mm_add_pd(vpen, vfinal1);
+	      *swpairs++ = _mm_add_pd(vpen, vfinal2);
 #else
               twt[5] = twt[4];
               for (upp = 0; upp < 4; upp++) {
@@ -462,7 +336,7 @@ void fill_weights(double* weights, double* set_allele_freqs, double exponent) {
                   if (uqq & 1) {
                     twt[6] += wt[0];
                   }
-                  *weights++ = twt[6];
+                  *subset_weights++ = twt[6];
                 }
               }
 #endif
@@ -499,19 +373,19 @@ void fill_weights(double* weights, double* set_allele_freqs, double exponent) {
 	    }
 	    twtf = twt[3];
 	    vpen = _mm_set1_pd(twtf);
-	    *wpairs++ = _mm_add_pd(vpen, vfinal1);
-	    *wpairs++ = _mm_add_pd(vpen, vfinal2);
+	    *swpairs++ = _mm_add_pd(vpen, vfinal1);
+	    *swpairs++ = _mm_add_pd(vpen, vfinal2);
 	    twtf += wt[1];
 	    vpen = _mm_set1_pd(twtf);
-	    *wpairs++ = _mm_add_pd(vpen, vfinal1);
-	    *wpairs++ = _mm_add_pd(vpen, vfinal2);
-	    *wpairs = *(wpairs - 2);
-	    wpairs++;
-	    *wpairs = *(wpairs - 2);
-	    wpairs++;
+	    *swpairs++ = _mm_add_pd(vpen, vfinal1);
+	    *swpairs++ = _mm_add_pd(vpen, vfinal2);
+	    *swpairs = *(swpairs - 2);
+	    swpairs++;
+	    *swpairs = *(swpairs - 2);
+	    swpairs++;
 	    vpen = _mm_set1_pd(twtf + wt[1]);
-	    *wpairs++ = _mm_add_pd(vpen, vfinal1);
-	    *wpairs++ = _mm_add_pd(vpen, vfinal2);
+	    *swpairs++ = _mm_add_pd(vpen, vfinal1);
+	    *swpairs++ = _mm_add_pd(vpen, vfinal2);
           }
 	}
       }
@@ -520,7 +394,7 @@ void fill_weights(double* weights, double* set_allele_freqs, double exponent) {
 #endif
 }
 
-void fill_weights_r(double* weights, double* set_allele_freqs, uint32_t var_std) {
+void fill_subset_weights_r(double* subset_weights, double* set_allele_freqs, double* main_weights, uint32_t var_std) {
   uint32_t uii;
   uint32_t ujj;
   uint32_t ukk;
@@ -528,7 +402,7 @@ void fill_weights_r(double* weights, double* set_allele_freqs, uint32_t var_std)
   uint32_t unn;
   // 20 markers to process in quintuplets, for 64-bit; 10, for 32-bit.
   // Each quintuplet of markers requires 40 wtarr entries, and induces
-  // 2^15 writes to weights[].
+  // 2^15 writes to subset_weights[].
   double wtarr_raw[BITCT2 * 5 + 1];
   double* wtarr = wtarr_raw;
   double twt;
@@ -542,7 +416,7 @@ void fill_weights_r(double* weights, double* set_allele_freqs, uint32_t var_std)
   double mult = 1.0;
   double aux;
 #ifdef __LP64__
-  __m128d* wpairs = (__m128d*)weights;
+  __m128d* swpairs = (__m128d*)subset_weights;
   __m128d vpen;
   __m128d vfinal1;
   __m128d vfinal2;
@@ -609,6 +483,11 @@ void fill_weights_r(double* weights, double* set_allele_freqs, uint32_t var_std)
         wtarr[uii * 8 + 6] = 0;
       }
     }
+    if (main_weights) {
+      for (ujj = 0; ujj < 7; ujj++) {
+	wtarr[uii * 8 + ujj] *= main_weights[uii];
+      }
+    }
     wtarr[uii * 8 + 7] = 0;
   }
   for (unn = 0; unn < BITCT / 16; unn++) {
@@ -629,133 +508,13 @@ void fill_weights_r(double* weights, double* set_allele_freqs, uint32_t var_std)
             twt4 = twt3 + wtptr[umm + 8];
 #ifdef __LP64__
             vpen = _mm_set1_pd(twt4);
-            *wpairs++ = _mm_add_pd(vpen, vfinal1);
-            *wpairs++ = _mm_add_pd(vpen, vfinal2);
-            *wpairs++ = _mm_add_pd(vpen, vfinal3);
-            *wpairs++ = _mm_add_pd(vpen, vfinal4);
+            *swpairs++ = _mm_add_pd(vpen, vfinal1);
+            *swpairs++ = _mm_add_pd(vpen, vfinal2);
+            *swpairs++ = _mm_add_pd(vpen, vfinal3);
+            *swpairs++ = _mm_add_pd(vpen, vfinal4);
 #else
             for (uoo = 0; uoo < 8; uoo++) {
-              *weights++ = twt4 + wtptr[uoo];
-            }
-#endif
-          }
-        }
-      }
-    }
-  }
-}
-
-void fill_weights_r_f(float* weights_f, float* set_allele_freqs_f, uint32_t var_std) {
-  uint32_t uii;
-  uint32_t ujj;
-  uint32_t ukk;
-  uint32_t umm;
-  uint32_t unn;
-  // 20 markers to process in quintuplets, for 64-bit; 10, for 32-bit.
-  // Each quintuplet of markers requires 40 wtarr entries, and induces
-  // 2^15 writes to weights_f[].
-  float wtarr_raw[BITCT2 * 5 + 3];
-  float* wtarr = wtarr_raw;
-  float twt;
-  float twt2;
-  float twt3;
-  float twt4;
-  float* wtptr;
-  float mean;
-  float mean_m1;
-  float mean_m2;
-  float mult = 1.0;
-  float aux;
-#ifdef __LP64__
-  __m128* wquads = (__m128*)weights_f;
-  __m128 vpen;
-  __m128 vfinal1;
-  __m128 vfinal2;
-#else
-  uint32_t uoo;
-#endif
-  uii = (((uintptr_t)wtarr) & 15);
-  if (uii) {
-    // force 16-byte alignment; can't do this at compile-time since stack
-    // pointer has no 16-byte align guarantee.
-    // yes, this assumes floats are 4 bytes.
-    wtarr = &(wtarr[4 - (uii / 4)]);
-  }
-  for (uii = 0; uii < MULTIPLEX_REL / 3; uii += 1) {
-    if (((set_allele_freqs_f[uii] != 0.0) && (set_allele_freqs_f[uii] < (1.0 - EPSILON))) || (!var_std)) {
-      if (set_allele_freqs_f[uii] < 0.5) {
-	mean = 2 * set_allele_freqs_f[uii];
-	mean_m1 = mean - 1.0;
-	mean_m2 = mean - 2.0;
-        if (var_std) {
-	  mult = 1 / (mean * (1.0 - set_allele_freqs_f[uii]));
-        }
-        aux = mean * mult;
-	wtarr[uii * 8] = mean * aux;
-        wtarr[uii * 8 + 1] = 0;
-	wtarr[uii * 8 + 2] = mean_m1 * aux;
-	wtarr[uii * 8 + 3] = mean_m2 * aux;
-	wtarr[uii * 8 + 4] = mean_m1 * mean_m1 * mult;
-	wtarr[uii * 8 + 5] = mean_m2 * mean_m1 * mult;
-	wtarr[uii * 8 + 6] = mean_m2 * mean_m2 * mult;
-      } else {
-	mean = 2 * (1.0 - set_allele_freqs_f[uii]);
-	mean_m1 = mean - 1.0;
-	mean_m2 = mean - 2.0;
-        if (var_std) {
-	  mult = 1 / (mean * set_allele_freqs_f[uii]);
-        }
-        aux = mean_m2 * mult;
-	wtarr[uii * 8] = mean_m2 * aux;
-        wtarr[uii * 8 + 1] = 0;
-	wtarr[uii * 8 + 2] = mean_m1 * aux;
-	wtarr[uii * 8 + 3] = mean * aux;
-	wtarr[uii * 8 + 4] = mean_m1 * mean_m1 * mult;
-	wtarr[uii * 8 + 5] = mean_m1 * mean * mult;
-	wtarr[uii * 8 + 6] = mean * mean * mult;
-      }
-    } else {
-      if (set_allele_freqs_f[uii] == 0.0) {
-        wtarr[uii * 8] = 0;
-        wtarr[uii * 8 + 1] = 0;
-        wtarr[uii * 8 + 2] = -1;
-        wtarr[uii * 8 + 3] = -2;
-        wtarr[uii * 8 + 4] = INFINITY;
-        wtarr[uii * 8 + 5] = INFINITY;
-        wtarr[uii * 8 + 6] = INFINITY;
-      } else {
-        wtarr[uii * 8] = INFINITY;
-        wtarr[uii * 8 + 1] = 0;
-        wtarr[uii * 8 + 2] = INFINITY;
-        wtarr[uii * 8 + 3] = -2;
-        wtarr[uii * 8 + 4] = INFINITY;
-        wtarr[uii * 8 + 5] = -1;
-        wtarr[uii * 8 + 6] = 0;
-      }
-    }
-    wtarr[uii * 8 + 7] = 0;
-  }
-  for (unn = 0; unn < BITCT / 16; unn++) {
-    wtptr = &(wtarr[40 * unn]);
-#ifdef __LP64__
-    vfinal1 = _mm_load_ps(wtptr);
-    vfinal2 = _mm_load_ps(&(wtptr[4]));
-#endif
-    for (uii = 0; uii < 8; uii++) {
-      twt = wtptr[uii + 32];
-      for (ujj = 0; ujj < 8; ujj++) {
-        twt2 = twt + wtptr[ujj + 24];
-        for (ukk = 0; ukk < 8; ukk++) {
-          twt3 = twt2 + wtptr[ukk + 16];
-          for (umm = 0; umm < 8; umm++) {
-            twt4 = twt3 + wtptr[umm + 8];
-#ifdef __LP64__
-            vpen = _mm_set1_ps(twt4);
-            *wquads++ = _mm_add_ps(vpen, vfinal1);
-            *wquads++ = _mm_add_ps(vpen, vfinal2);
-#else
-            for (uoo = 0; uoo < 8; uoo++) {
-              *weights_f++ = twt4 + wtptr[uoo];
+              *subset_weights++ = twt4 + wtptr[uoo];
             }
 #endif
           }
@@ -968,8 +727,8 @@ static int32_t* g_idists;
 static uintptr_t* g_pheno_nm = NULL;
 static uintptr_t* g_pheno_c = NULL;
 static unsigned char* g_geno = NULL;
-static double* g_weights;
-static uint32_t* g_weights_i;
+static double* g_subset_weights;
+static uint32_t* g_subset_weights_i;
 static double g_reg_tot_xy;
 static double g_reg_tot_x;
 static double g_reg_tot_y;
@@ -1262,7 +1021,7 @@ void incr_dists_i(uint32_t* idists, uintptr_t* geno, uintptr_t* masks, uint32_t
   }
 }
 
-void incr_wt_dist_missing(uint32_t* mtw, uint32_t* weights_i, uintptr_t* mmasks, uint32_t start_idx, uint32_t end_idx) {
+void incr_wt_dist_missing(uint32_t* mtw, uint32_t* subset_weights_i, uintptr_t* mmasks, uint32_t start_idx, uint32_t end_idx) {
   uintptr_t* glptr;
   uintptr_t ulii;
   uintptr_t uljj;
@@ -1275,7 +1034,7 @@ void incr_wt_dist_missing(uint32_t* mtw, uint32_t* weights_i, uintptr_t* mmasks,
       for (ujj = 0; ujj < uii; ujj++) {
 	uljj = (*glptr++) & ulii;
         while (uljj) {
-          mtw[ujj] += weights_i[CTZLU(uljj)];
+          mtw[ujj] += subset_weights_i[CTZLU(uljj)];
           uljj &= uljj - 1;
         }
       }
@@ -1328,8 +1087,8 @@ THREAD_RET_TYPE calc_ibs_thread(void* arg) {
   while (1) {
     is_last_block = g_is_last_thread_block;
     if (weighted_missing_ptr) {
-      // g_weights_i moves around
-      incr_wt_dist_missing(weighted_missing_ptr, g_weights_i, mmasks_ptr, ulii, end_idx);
+      // g_subset_weights_i moves around
+      incr_wt_dist_missing(weighted_missing_ptr, g_subset_weights_i, mmasks_ptr, ulii, end_idx);
     }
     if (flat_missing_ptr) {
       incr_dists_rm(flat_missing_ptr, mmasks_ptr, ulii, end_idx);
@@ -1843,17 +1602,17 @@ THREAD_RET_TYPE calc_wdist_thread(void* arg) {
   uintptr_t* geno_ptr = (uintptr_t*)g_geno;
   uintptr_t* masks_ptr = g_masks;
   uintptr_t* mmasks_ptr = g_mmasks;
-  double* weights_ptr = g_weights;
-  uint32_t* weights_i_ptr = g_weights_i;
+  double* subset_weights_ptr = g_subset_weights;
+  uint32_t* subset_weights_i_ptr = g_subset_weights_i;
   uint32_t* weighted_missing_ptr = &(g_missing_tot_weights[offset]);
   uint32_t end_idx = g_thread_start[tidx + 1];
   uint32_t is_last_block;
   while (1) {
     is_last_block = g_is_last_thread_block;
-    incr_dists(dists_ptr, geno_ptr, masks_ptr, weights_ptr, ulii, end_idx);
+    incr_dists(dists_ptr, geno_ptr, masks_ptr, subset_weights_ptr, ulii, end_idx);
     if (is_last_block || (g_thread_spawn_ct & 1)) {
-      // weights_i is stationary here
-      incr_wt_dist_missing(weighted_missing_ptr, weights_i_ptr, mmasks_ptr, ulii, end_idx);
+      // subset_weights_i is stationary here
+      incr_wt_dist_missing(weighted_missing_ptr, subset_weights_i_ptr, mmasks_ptr, ulii, end_idx);
     }
     if ((!tidx) || is_last_block) {
       THREAD_RETURN;
@@ -1914,12 +1673,12 @@ THREAD_RET_TYPE calc_rel_thread(void* arg) {
   uintptr_t* masks_ptr = g_masks;
   uintptr_t* mmasks_ptr = g_mmasks;
   uint32_t* missing_ptr = &(g_missing_dbl_excluded[offset]);
-  double* weights_ptr = g_weights;
+  double* subset_weights_ptr = g_subset_weights;
   uint32_t end_idx = g_thread_start[tidx + 1];
   uint32_t is_last_block;
   while (1) {
     is_last_block = g_is_last_thread_block;
-    incr_dists_r(rel_ptr, geno_ptr, masks_ptr, (uint32_t)tidx, weights_ptr);
+    incr_dists_r(rel_ptr, geno_ptr, masks_ptr, (uint32_t)tidx, subset_weights_ptr);
     if (is_last_block || ((g_thread_spawn_ct % 3) == 2)) {
       incr_dists_rm(missing_ptr, mmasks_ptr, ulii, end_idx);
     }
@@ -1930,45 +1689,30 @@ THREAD_RET_TYPE calc_rel_thread(void* arg) {
   }
 }
 
-void incr_dists_r_f(float* dists_f, uintptr_t* geno, uintptr_t* masks, float* weights_f, uint32_t start_idx, uint32_t end_idx) {
-  uintptr_t* glptr;
-  uintptr_t* maskptr;
-  uintptr_t ulii;
-  uintptr_t uljj;
-  uintptr_t basemask;
-  float* weights1 = &(weights_f[32768]);
-#ifdef __LP64__
-  float* weights2 = &(weights_f[65536]);
-  float* weights3 = &(weights_f[98304]);
-#endif
-  uint32_t uii;
-  uint32_t ujj;
-  for (uii = start_idx; uii < end_idx; uii++) {
-    glptr = geno;
-    ulii = geno[uii];
-    maskptr = masks;
-    basemask = masks[uii];
-    if (!basemask) {
-      for (ujj = 0; ujj < uii; ujj++) {
-	uljj = ((*glptr++) + ulii) | (*maskptr++);
-#ifdef __LP64__
-	*dists_f += weights_f[(uint16_t)uljj] + weights1[(uint16_t)(uljj >> 16)] + weights2[(uint16_t)(uljj >> 32)] + weights3[uljj >> 48];
-#else
-	*dists_f += weights_f[(uint16_t)uljj] + weights1[uljj >> 16];
-#endif
-	dists_f++;
-      }
-    } else {
-      for (ujj = 0; ujj < uii; ujj++) {
-        uljj = ((*glptr++) + ulii) | ((*maskptr++) | basemask);
-#ifdef __LP64__
-	*dists_f += weights_f[(uint16_t)uljj] + weights1[(uint16_t)(uljj >> 16)] + weights2[(uint16_t)(uljj >> 32)] + weights3[uljj >> 48];
-#else
-	*dists_f += weights_f[(uint16_t)uljj] + weights1[uljj >> 16];
-#endif
-	dists_f++;
-      }
+THREAD_RET_TYPE calc_wt_rel_thread(void* arg) {
+  // Per-thread worker: arg is the thread index; each pass calls incr_dists_r() on this thread's slice of g_rel_dists.  This needs more work.
+  uintptr_t tidx = (uintptr_t)arg;
+  uintptr_t ulii = g_thread_start[tidx];  // first index handled by this thread
+  uintptr_t uljj = g_thread_start[0];  // first index handled by thread 0
+  uintptr_t offset = (((uint64_t)ulii) * (ulii - 1) - ((uint64_t)uljj) * (uljj - 1)) / 2;  // ulii-choose-2 minus uljj-choose-2
+  double* rel_ptr = &(g_rel_dists[offset]);
+  uintptr_t* geno_ptr = (uintptr_t*)g_geno;
+  uintptr_t* masks_ptr = g_masks;
+  uintptr_t* mmasks_ptr = g_mmasks;
+  uint32_t* missing_ptr = &(g_missing_dbl_excluded[offset]);  // same offset as rel_ptr
+  double* subset_weights_ptr = g_subset_weights;
+  uint32_t end_idx = g_thread_start[tidx + 1];  // one past this thread's last index
+  uint32_t is_last_block;
+  while (1) {
+    is_last_block = g_is_last_thread_block;  // sampled before the work so the exit test below uses a consistent value
+    incr_dists_r(rel_ptr, geno_ptr, masks_ptr, (uint32_t)tidx, subset_weights_ptr);
+    if (is_last_block || ((g_thread_spawn_ct % 3) == 2)) {  // missing-count pass runs only on the last block or when spawn count is 2 mod 3
+      incr_dists_rm(missing_ptr, mmasks_ptr, ulii, end_idx);
+    }
+    if ((!tidx) || is_last_block) {  // thread 0 returns after every pass; the others return only after the last block
+      THREAD_RETURN;
     }
+    THREAD_BLOCK_FINISH(tidx);  // NOTE(review): presumably signals completion and waits for the next block -- confirm in the macro definition
   }
 }
 
@@ -4477,6 +4221,7 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
   int32_t write_ibs_matrix = dist_calc_type & DISTANCE_IBS;
   int32_t write_1mibs_matrix = dist_calc_type & DISTANCE_1_MINUS_IBS;
   int32_t retval = 0;
+  unsigned char overflow_buf[262144];
   double dxx;
   double dyy;
   double* dist_ptr;
@@ -4824,11 +4569,11 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
 	  sprintf(outname_end, ".dist.gz");
 	}
 	if (shape == DISTANCE_SQ) {
-	  parallel_compress(outname, 0, distance_d_write_sq_emitn);
+	  parallel_compress(outname, overflow_buf, 0, distance_d_write_sq_emitn);
 	} else if (shape == DISTANCE_SQ0) {
-	  parallel_compress(outname, 0, distance_d_write_sq0_emitn);
+	  parallel_compress(outname, overflow_buf, 0, distance_d_write_sq0_emitn);
 	} else {
-	  parallel_compress(outname, 0, distance_d_write_tri_emitn);
+	  parallel_compress(outname, overflow_buf, 0, distance_d_write_tri_emitn);
 	}
       } else {
 	if (parallel_tot > 1) {
@@ -4837,11 +4582,11 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
 	  sprintf(outname_end, ".dist");
 	}
 	if (shape == DISTANCE_SQ) {
-	  retval = write_uncompressed(outname, 0, distance_d_write_sq_emitn);
+	  retval = write_uncompressed(outname, overflow_buf, 0, distance_d_write_sq_emitn);
 	} else if (shape == DISTANCE_SQ0) {
-	  retval = write_uncompressed(outname, 0, distance_d_write_sq0_emitn);
+	  retval = write_uncompressed(outname, overflow_buf, 0, distance_d_write_sq0_emitn);
 	} else {
-	  retval = write_uncompressed(outname, 0, distance_d_write_tri_emitn);
+	  retval = write_uncompressed(outname, overflow_buf, 0, distance_d_write_tri_emitn);
 	}
 	if (retval) {
 	  goto distance_d_write_ret_1;
@@ -4862,11 +4607,11 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
 	  sprintf(outname_end, ".mdist.gz");
 	}
 	if (shape == DISTANCE_SQ) {
-	  parallel_compress(outname, 0, distance_d_write_1mibs_sq_emitn);
+	  parallel_compress(outname, overflow_buf, 0, distance_d_write_1mibs_sq_emitn);
 	} else if (shape == DISTANCE_SQ0) {
-	  parallel_compress(outname, 0, distance_d_write_1mibs_sq0_emitn);
+	  parallel_compress(outname, overflow_buf, 0, distance_d_write_1mibs_sq0_emitn);
 	} else {
-	  parallel_compress(outname, 0, distance_d_write_1mibs_tri_emitn);
+	  parallel_compress(outname, overflow_buf, 0, distance_d_write_1mibs_tri_emitn);
 	}
       } else {
 	if (parallel_tot > 1) {
@@ -4875,11 +4620,11 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
 	  sprintf(outname_end, ".mdist");
 	}
 	if (shape == DISTANCE_SQ) {
-	  retval = write_uncompressed(outname, 0, distance_d_write_1mibs_sq_emitn);
+	  retval = write_uncompressed(outname, overflow_buf, 0, distance_d_write_1mibs_sq_emitn);
 	} else if (shape == DISTANCE_SQ0) {
-	  retval = write_uncompressed(outname, 0, distance_d_write_1mibs_sq0_emitn);
+	  retval = write_uncompressed(outname, overflow_buf, 0, distance_d_write_1mibs_sq0_emitn);
 	} else {
-	  retval = write_uncompressed(outname, 0, distance_d_write_1mibs_tri_emitn);
+	  retval = write_uncompressed(outname, overflow_buf, 0, distance_d_write_1mibs_tri_emitn);
 	}
 	if (retval) {
 	  goto distance_d_write_ret_1;
@@ -4902,11 +4647,11 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
 	  sprintf(outname_end, ".mibs.gz");
 	}
 	if (shape == DISTANCE_SQ) {
-	  parallel_compress(outname, 0, distance_d_write_ibs_sq_emitn);
+	  parallel_compress(outname, overflow_buf, 0, distance_d_write_ibs_sq_emitn);
 	} else if (shape == DISTANCE_SQ0) {
-	  parallel_compress(outname, 0, distance_d_write_ibs_sq0_emitn);
+	  parallel_compress(outname, overflow_buf, 0, distance_d_write_ibs_sq0_emitn);
 	} else {
-	  parallel_compress(outname, 0, distance_d_write_ibs_tri_emitn);
+	  parallel_compress(outname, overflow_buf, 0, distance_d_write_ibs_tri_emitn);
 	}
       } else {
 	if (parallel_tot > 1) {
@@ -4915,11 +4660,11 @@ int32_t distance_d_write(FILE** outfile_ptr, FILE** outfile2_ptr, FILE** outfile
 	  sprintf(outname_end, ".mibs");
 	}
 	if (shape == DISTANCE_SQ) {
-	  retval = write_uncompressed(outname, 0, distance_d_write_ibs_sq_emitn);
+	  retval = write_uncompressed(outname, overflow_buf, 0, distance_d_write_ibs_sq_emitn);
 	} else if (shape == DISTANCE_SQ0) {
-	  retval = write_uncompressed(outname, 0, distance_d_write_ibs_sq0_emitn);
+	  retval = write_uncompressed(outname, overflow_buf, 0, distance_d_write_ibs_sq0_emitn);
 	} else {
-	  retval = write_uncompressed(outname, 0, distance_d_write_ibs_tri_emitn);
+	  retval = write_uncompressed(outname, overflow_buf, 0, distance_d_write_ibs_tri_emitn);
 	}
 	if (retval) {
 	  goto distance_d_write_ret_1;
@@ -5239,6 +4984,7 @@ int32_t calc_genome(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uin
   int32_t missing_ct_buf[BITCT];
   double set_allele_freq_buf[GENOME_MULTIPLEX];
   uint32_t nchrobs_buf[GENOME_MULTIPLEX];
+  unsigned char* overflow_buf;
   unsigned char* gptr;
   char* cptr;
   uintptr_t* geno;
@@ -5337,7 +5083,8 @@ int32_t calc_genome(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uin
       wkspace_alloc_ul_checked(&masks, sample_ct * (GENOME_MULTIPLEX / 4)) ||
       wkspace_alloc_ul_checked(&mmasks, sample_ct * sizeof(intptr_t)) ||
       wkspace_alloc_c_checked(&g_cg_fam1, plink_maxfid + 1) ||
-      wkspace_alloc_c_checked(&g_cg_fam2, plink_maxfid + 1)) {
+      wkspace_alloc_c_checked(&g_cg_fam2, plink_maxfid + 1) ||
+      wkspace_alloc_uc_checked(&overflow_buf, 262144)) {
     goto calc_genome_ret_NOMEM;
   }
 
@@ -5719,14 +5466,14 @@ int32_t calc_genome(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uin
     } else {
       strcpy(outname_end, ".genome.gz");
     }
-    parallel_compress(outname, 0, calc_genome_emitn);
+    parallel_compress(outname, overflow_buf, 0, calc_genome_emitn);
   } else {
     if (parallel_tot > 1) {
       sprintf(outname_end, ".genome.%d", parallel_idx + 1);
     } else {
       strcpy(outname_end, ".genome");
     }
-    retval = write_uncompressed(outname, 0, calc_genome_emitn);
+    retval = write_uncompressed(outname, overflow_buf, 0, calc_genome_emitn);
     if (retval) {
       goto calc_genome_ret_1;
     }
@@ -6119,6 +5866,7 @@ int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, ch
   uint32_t rel_calc_type = relip->modifier & REL_CALC_MASK;
   uintptr_t* compact_rel_table;
   uintptr_t* rtptr;
+  unsigned char* overflow_buf;
   char* bufptr;
   uint64_t ullii;
   uint64_t ulljj;
@@ -6185,7 +5933,8 @@ int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, ch
     goto rel_cutoff_batch_ret_NOMEM;
   }
   fill_ulong_zero(compact_rel_table, tot_words);
-  if (wkspace_alloc_i_checked(&rel_ct_arr, sample_ct * sizeof(int32_t))) {
+  if (wkspace_alloc_i_checked(&rel_ct_arr, sample_ct * sizeof(int32_t)) ||
+      wkspace_alloc_uc_checked(&overflow_buf, 262144)) {
     goto rel_cutoff_batch_ret_NOMEM;
   }
   fill_int_zero(rel_ct_arr, sample_ct);
@@ -6539,10 +6288,10 @@ int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, ch
       if (load_grm_bin) {
 	if (rel_calc_type & REL_CALC_GZ) {
 	  memcpy(outname_end, ".grm.gz", 8);
-	  parallel_compress(outname, 0, rel_cutoff_batch_rbin_emitn);
+	  parallel_compress(outname, overflow_buf, 0, rel_cutoff_batch_rbin_emitn);
 	} else {
 	  memcpy(outname_end, ".grm", 5);
-	  retval = write_uncompressed(outname, 0, rel_cutoff_batch_rbin_emitn);
+	  retval = write_uncompressed(outname, overflow_buf, 0, rel_cutoff_batch_rbin_emitn);
 	  if (retval) {
 	    goto rel_cutoff_batch_ret_1;
 	  }
@@ -6550,10 +6299,10 @@ int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, ch
       } else {
 	if (rel_calc_type & REL_CALC_GZ) {
 	  memcpy(outname_end, ".grm.gz", 8);
-	  parallel_compress(outname, 0, rel_cutoff_batch_emitn);
+	  parallel_compress(outname, overflow_buf, 0, rel_cutoff_batch_emitn);
 	} else {
 	  memcpy(outname_end, ".grm", 5);
-	  retval = write_uncompressed(outname, 0, rel_cutoff_batch_emitn);
+	  retval = write_uncompressed(outname, overflow_buf, 0, rel_cutoff_batch_emitn);
 	  if (retval) {
 	    goto rel_cutoff_batch_ret_1;
 	  }
@@ -6879,7 +6628,197 @@ uint32_t calc_rel_grm_emitn(uint32_t overflow_ct, unsigned char* readbuf) {
   return (uintptr_t)(((unsigned char*)sptr_cur) - readbuf);
 }
 
-int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_tot, uint64_t calculation_type, Rel_info* relip, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, uint32_t marker_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* sample_ids, uintptr_t max_sample_id_len, double* set_allele_freqs, double** rel_ibc_ptr,  [...]
+// Reads up to block_max_size non-excluded markers (capped at the number left
+// before marker_ct) from bedfile into readbuf, unfiltered_sample_ct4 bytes each.
+// Returns 0 and advances the cursors, or RET_READ_FAIL with outputs untouched.
+uint32_t block_load(FILE* bedfile, int32_t bed_offset, uintptr_t* marker_exclude, uint32_t marker_ct, uint32_t block_max_size, uintptr_t unfiltered_sample_ct4, unsigned char* readbuf, uintptr_t* marker_uidx_ptr, uintptr_t* marker_idx_ptr, uint32_t* block_size_ptr) {
+  uintptr_t cur_uidx = *marker_uidx_ptr;
+  const uintptr_t first_idx = *marker_idx_ptr;
+  uint32_t loaded;
+  if (block_max_size > marker_ct - first_idx) {
+    block_max_size = marker_ct - first_idx;
+  }
+  for (loaded = 0; loaded < block_max_size; loaded++, cur_uidx++) {
+    if (IS_SET(marker_exclude, cur_uidx)) {
+      // excluded: move to the index from next_unset_ul_unsafe() and seek there
+      cur_uidx = next_unset_ul_unsafe(marker_exclude, cur_uidx);
+      if (fseeko(bedfile, bed_offset + ((uint64_t)cur_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
+        return RET_READ_FAIL;
+      }
+    }
+    if (fread(&(readbuf[loaded * unfiltered_sample_ct4]), 1, unfiltered_sample_ct4, bedfile) < unfiltered_sample_ct4) {
+      return RET_READ_FAIL;
+    }
+  }
+  *marker_uidx_ptr = cur_uidx;
+  *marker_idx_ptr = first_idx + loaded;
+  *block_size_ptr = loaded;
+  return 0;
+}
+
+// Copies set-allele frequencies for the next block of markers (at most
+// block_max_size, and no more than marker_ct - marker_idx) into
+// set_allele_freq_buf[], storing 1 - freq for markers set in marker_reverse.
+void copy_set_allele_freqs(uintptr_t marker_uidx, uintptr_t* marker_exclude, uint32_t block_max_size, uintptr_t marker_idx, uint32_t marker_ct, uintptr_t* marker_reverse, double* set_allele_freqs, double* set_allele_freq_buf) {
+  uint32_t buf_idx;
+  if (block_max_size > marker_ct - marker_idx) {
+    block_max_size = marker_ct - marker_idx;
+  }
+  for (buf_idx = 0; buf_idx < block_max_size; buf_idx++, marker_uidx++) {
+    next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
+    if (marker_reverse && (IS_SET(marker_reverse, marker_uidx))) {
+      set_allele_freq_buf[buf_idx] = 1.0 - set_allele_freqs[marker_uidx];
+    } else {
+      set_allele_freq_buf[buf_idx] = set_allele_freqs[marker_uidx];
+    }
+  }
+}
+
+// Loads --distance-wts weights.  After an optional header line, each non-blank
+// line is "<variant ID> <weight>"; IDs not found in the ID hash table are
+// skipped, while duplicate IDs and negative/infinite/unparseable weights fail.
+// On success *marker_exclude_ptr (rewritten only if the variant set changed)
+// and *marker_ct_ptr describe exactly the variants with nonzero weight, and
+// *main_weights_ptr holds those weights in order.  Returns 0 or a RET_* code.
+int32_t load_distance_wts(char* distance_wts_fname, uintptr_t unfiltered_marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t noheader, uint32_t conditional_alloc_exclude, uintptr_t** marker_exclude_ptr, uint32_t* marker_ct_ptr, double** main_weights_ptr) {
+  FILE* infile = NULL;
+  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t line_idx = 0;
+  uintptr_t topsize = 0;
+  // special case: weight-0 assignment effectively doesn't exist, but we still
+  // want to check for repeated IDs there.
+  uint32_t zcount = 0;
+  int32_t retval = 0;
+  unsigned char* wkspace_mark;
+  uintptr_t* marker_include;
+  double* main_weights_tmp;
+  double* dptr;
+  char* bufptr;
+  uint32_t* marker_id_htable;
+  double dxx;
+  uint32_t marker_id_htable_size;
+  uint32_t marker_uidx;
+  uint32_t marker_idx;
+  uint32_t idlen;
+  uint32_t marker_ct;
+  marker_include = (uintptr_t*)top_alloc(&topsize, unfiltered_marker_ctl * sizeof(intptr_t));
+  if (!marker_include) {
+    goto load_distance_wts_ret_NOMEM;
+  }
+  fill_ulong_zero(marker_include, unfiltered_marker_ctl);
+  main_weights_tmp = (double*)top_alloc(&topsize, unfiltered_marker_ct * sizeof(double));
+  if (!main_weights_tmp) {
+    goto load_distance_wts_ret_NOMEM;
+  }
+  wkspace_left -= topsize;  // NOTE(review): presumably keeps wkspace allocations off the top_alloc'd buffers -- confirm
+  wkspace_mark = wkspace_base;
+  retval = alloc_and_populate_id_htable(unfiltered_marker_ct, *marker_exclude_ptr, *marker_ct_ptr, marker_ids, max_marker_id_len, 0, &marker_id_htable, &marker_id_htable_size);
+  wkspace_left += topsize;
+  if (retval) {
+    goto load_distance_wts_ret_1;
+  }
+  if (fopen_checked(&infile, distance_wts_fname, "r")) {
+    goto load_distance_wts_ret_OPEN_FAIL;
+  }
+  tbuf[MAXLINELEN - 1] = ' ';  // sentinel: becomes '\0' only if fgets fills the whole buffer
+  while (fgets(tbuf, MAXLINELEN, infile)) {
+    line_idx++;
+    if (!tbuf[MAXLINELEN - 1]) {
+      LOGPREPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, distance_wts_fname);
+      goto load_distance_wts_ret_INVALID_FORMAT_2;
+    }
+    bufptr = skip_initial_spaces(tbuf);
+    if (is_eoln_kns(*bufptr)) {
+      continue;
+    }
+    if (!noheader) {
+      noheader = 1;  // first non-blank line is the header; skip it exactly once
+      continue;
+    }
+    // variant ID in first column, weight in second
+    idlen = strlen_se(bufptr);
+    marker_uidx = id_htable_find(bufptr, idlen, marker_id_htable, marker_id_htable_size, marker_ids, max_marker_id_len);
+    if (marker_uidx == 0xffffffffU) {
+      continue;
+    }
+    if (is_set(marker_include, marker_uidx)) {
+      bufptr[idlen] = '\0';
+      LOGPREPRINTFWW("Error: Duplicate variant ID '%s' in --distance-wts file.\n", bufptr);
+      goto load_distance_wts_ret_INVALID_FORMAT_2;
+    }
+    set_bit(marker_include, marker_uidx);
+    bufptr = skip_initial_spaces(&(bufptr[idlen]));
+    if (is_eoln_kns(*bufptr)) {
+      sprintf(logbuf, "Error: Line %" PRIuPTR " of --distance-wts file has fewer tokens than expected.\n", line_idx);
+      goto load_distance_wts_ret_INVALID_FORMAT_2;
+    }
+    if (scan_double(bufptr, &dxx) || (!((dxx >= 0.0) && (dxx != INFINITY)))) {
+      goto load_distance_wts_ret_INVALID_WEIGHT;
+    }
+    zcount += (dxx == 0.0);
+    main_weights_tmp[marker_uidx] = dxx;
+  }
+  if (!feof(infile)) {
+    goto load_distance_wts_ret_READ_FAIL;
+  }
+  wkspace_reset(wkspace_mark);
+  // Drop weight-0 variants from the include mask now that duplicate checking is
+  // done, so the exclude mask, the count and the weight array all agree.
+  for (marker_uidx = 0; zcount && (marker_uidx < unfiltered_marker_ct); marker_uidx++) {
+    if ((IS_SET(marker_include, marker_uidx)) && (main_weights_tmp[marker_uidx] == 0.0)) {
+      marker_include[marker_uidx / BITCT] &= ~(((uintptr_t)1) << (marker_uidx % BITCT));
+    }
+  }
+  marker_ct = popcount_longs(marker_include, unfiltered_marker_ctl);
+  if (!marker_ct) {
+    logprint("Error: No valid nonzero entries in --distance-wts file.\n");
+    goto load_distance_wts_ret_INVALID_FORMAT;
+  }
+  wkspace_left -= topsize;
+  if (marker_ct != (*marker_ct_ptr)) {
+    if (conditional_alloc_exclude) {
+      if (wkspace_alloc_ul_checked(marker_exclude_ptr, unfiltered_marker_ctl * sizeof(intptr_t))) {
+	goto load_distance_wts_ret_NOMEM2;
+      }
+    }
+    bitfield_exclude_to_include(marker_include, *marker_exclude_ptr, unfiltered_marker_ct);
+    *marker_ct_ptr = marker_ct;
+  }
+  if (wkspace_alloc_d_checked(main_weights_ptr, marker_ct * sizeof(double))) {
+    goto load_distance_wts_ret_NOMEM2;
+  }
+  wkspace_left += topsize;
+  dptr = *main_weights_ptr;
+  for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
+    next_set_unsafe_ck(marker_include, &marker_uidx);
+    *dptr++ = main_weights_tmp[marker_uidx];
+  }
+  while (0) {
+  load_distance_wts_ret_NOMEM2:
+    wkspace_left += topsize;
+  load_distance_wts_ret_NOMEM:
+    retval = RET_NOMEM;
+    break;
+  load_distance_wts_ret_OPEN_FAIL:
+    retval = RET_OPEN_FAIL;
+    break;
+  load_distance_wts_ret_READ_FAIL:
+    retval = RET_READ_FAIL;
+    break;
+  load_distance_wts_ret_INVALID_WEIGHT:
+    sprintf(logbuf, "Error: Invalid weight on line %" PRIuPTR " of --distance-wts file.\n", line_idx);
+  load_distance_wts_ret_INVALID_FORMAT_2:
+    logprintb();
+  load_distance_wts_ret_INVALID_FORMAT:
+    retval = RET_INVALID_FORMAT;
+    break;
+  }
+ load_distance_wts_ret_1:
+  fclose_cond(infile);
+  return retval;
+}
+
+int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_tot, uint64_t calculation_type, Rel_info* relip, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, char* distance_wts_fname, uint32_t distance_wts_noheader, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t* marker_reverse, uint32_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_e [...]
   unsigned char* wkspace_mark = wkspace_base;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
   uintptr_t sample_ct = unfiltered_sample_ct - (*sample_exclude_ct_ptr);
@@ -6887,6 +6826,7 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
   uintptr_t marker_idx = 0;
   FILE* outfile = NULL;
   FILE* out_bin_nfile = NULL;
+  uintptr_t* marker_exclude = marker_exclude_orig;
   uint32_t rel_calc_type = relip->modifier & REL_CALC_MASK;
   int32_t ibc_type = relip->ibc_type;
   int32_t retval = 0;
@@ -6899,19 +6839,21 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
   double* dptr3 = NULL;
   double* dptr4 = NULL;
   double* rel_dists = NULL;
-  uint32_t chrom_fo_idx = 0;
+  double* main_weights = NULL;
+  double* main_weights_ptr = NULL;
   double* dptr2;
   double set_allele_freq_buf[MULTIPLEX_DIST];
   char wbuf[96];
   uint64_t start_offset;
   uint64_t hundredth;
+  unsigned char* overflow_buf;
   char* wptr;
   char* fam_id;
   char* sample_id;
   uintptr_t* geno;
   uintptr_t* masks;
   uintptr_t* mmasks;
-  double* weights;
+  double* subset_weights;
   double* rel_ibc;
   uint32_t* mdeptr;
   uint32_t* sample_missing_unwt;
@@ -6937,10 +6879,12 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
   uint32_t* giptr;
   uint32_t* giptr2;
   uintptr_t* glptr2;
-  if (is_set(chrom_info_ptr->haploid_mask, 0)) {
-    logprint("Error: --make-rel/--make-grm-... cannot be used on haploid genomes.\n");
-    goto calc_rel_ret_INVALID_CMDLINE;
+  if (distance_wts_fname) {
+    logprint("Error: --make-{rel,grm-gz,grm-bin} + --distance-wts is currently under\ndevelopment.\n");
+    retval = RET_INVALID_CMDLINE;
+    goto calc_rel_ret_1;
   }
+
   // timing results on the NIH 512-core machine suggest that it's
   // counterproductive to make thread count exceed about n/64
   if (dist_thread_ct > sample_ct / 64) {
@@ -6990,6 +6933,7 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
     fill_double_zero(rel_dists, llxx);
   }
   wkspace_mark = wkspace_base;
+  // stack allocations after this point are freed normally
   if (rel_req && (!g_missing_dbl_excluded)) {
     if (wkspace_alloc_ui_checked(&g_missing_dbl_excluded, llxx * sizeof(int32_t))) {
       goto calc_rel_ret_NOMEM;
@@ -7003,23 +6947,27 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
       wkspace_alloc_ul_checked(&mmasks, sample_ct * sizeof(intptr_t)) ||
       wkspace_alloc_uc_checked(&gptr, MULTIPLEX_REL * unfiltered_sample_ct4) ||
       wkspace_alloc_ul_checked(&masks, sample_ct * sizeof(intptr_t)) ||
-      wkspace_alloc_d_checked(&weights, 2048 * BITCT * sizeof(double))) {
+      wkspace_alloc_d_checked(&subset_weights, 2048 * BITCT * sizeof(double)) ||
+      wkspace_alloc_uc_checked(&overflow_buf, 262144)) {
     goto calc_rel_ret_NOMEM;
   }
   g_geno = (unsigned char*)geno;
   g_masks = masks;
   g_mmasks = mmasks;
-  g_weights = weights;
+  g_subset_weights = subset_weights;
 
   // Exclude markers on non-autosomal chromosomes for now.
-  uii = count_non_autosomal_markers(chrom_info_ptr, marker_exclude, 1, 1);
-  if (uii) {
-    if (uii == marker_ct) {
-      logprint("Error: No autosomal variants for relationship matrix calculation.\n");
-      goto calc_rel_ret_INVALID_CMDLINE;
+  retval = conditional_allocate_non_autosomal_markers(chrom_info_ptr, unfiltered_marker_ct, marker_exclude_orig, marker_ct, 1, 1, "relationship matrix calc", &marker_exclude, &uii);
+  if (retval) {
+    goto calc_rel_ret_1;
+  }
+  marker_ct -= uii;
+
+  if (distance_wts_fname) {
+    retval = load_distance_wts(distance_wts_fname, unfiltered_marker_ct, marker_ids, max_marker_id_len, distance_wts_noheader, (marker_exclude == marker_exclude_orig), &marker_exclude, &marker_ct, &main_weights);
+    if (retval) {
+      goto calc_rel_ret_1;
     }
-    LOGPRINTF("Excluding %u variant%s on non-autosomes from relationship matrix calc.\n", uii, (uii == 1)? "" : "s");
-    marker_ct -= uii;
   }
 
   // See comments at the beginning of this file, and those in the main
@@ -7028,7 +6976,11 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
   // each marker to 3 bits and use + instead of XOR to distinguish the
   // cases.
   do {
-    retval = block_load_autosomal(bedfile, bed_offset, marker_exclude, marker_ct, MULTIPLEX_REL, unfiltered_sample_ct4, chrom_info_ptr, set_allele_freqs, NULL, gptr, &chrom_fo_idx, &marker_uidx, &marker_idx, &cur_markers_loaded, marker_reverse, set_allele_freq_buf, NULL, NULL);
+    copy_set_allele_freqs(marker_uidx, marker_exclude, MULTIPLEX_REL, marker_idx, marker_ct, marker_reverse, set_allele_freqs, set_allele_freq_buf);
+    if (main_weights) {
+      main_weights_ptr = &(main_weights[marker_idx]);
+    }
+    retval = block_load(bedfile, bed_offset, marker_exclude, marker_ct, MULTIPLEX_REL, unfiltered_sample_ct4, gptr, &marker_uidx, &marker_idx, &cur_markers_loaded);
     if (retval) {
       goto calc_rel_ret_1;
     }
@@ -7074,18 +7026,25 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
       }
       if (calculation_type & CALC_IBC) {
 	for (uii = 0; uii < 3; uii++) {
-	  update_rel_ibc(&(rel_ibc[uii * sample_ct]), geno, &(set_allele_freq_buf[win_marker_idx]), uii, sample_ct, ukk);
+	  update_rel_ibc(&(rel_ibc[uii * sample_ct]), geno, &(set_allele_freq_buf[win_marker_idx]), main_weights_ptr? (&(main_weights_ptr[win_marker_idx])) : NULL, uii, sample_ct, ukk);
 	}
       } else {
-	update_rel_ibc(rel_ibc, geno, &(set_allele_freq_buf[win_marker_idx]), ibc_type, sample_ct, ukk);
+	update_rel_ibc(rel_ibc, geno, &(set_allele_freq_buf[win_marker_idx]), main_weights_ptr? (&(main_weights_ptr[win_marker_idx])) : NULL, ibc_type, sample_ct, ukk);
       }
       if (rel_req) {
-	fill_weights_r(weights, &(set_allele_freq_buf[win_marker_idx]), (ibc_type != -1));
-	if (spawn_threads2(threads, &calc_rel_thread, dist_thread_ct, ujj)) {
-	  goto calc_rel_ret_THREAD_CREATE_FAIL;
-	}
+	fill_subset_weights_r(subset_weights, &(set_allele_freq_buf[win_marker_idx]), main_weights_ptr? (&(main_weights_ptr[win_marker_idx])) : NULL, (ibc_type != -1));
 	ulii = 0;
-	calc_rel_thread((void*)ulii);
+	if (!main_weights_ptr) {
+	  if (spawn_threads2(threads, &calc_rel_thread, dist_thread_ct, ujj)) {
+	    goto calc_rel_ret_THREAD_CREATE_FAIL;
+	  }
+	  calc_rel_thread((void*)ulii);
+	} else {
+	  if (spawn_threads2(threads, &calc_wt_rel_thread, dist_thread_ct, ujj)) {
+	    goto calc_rel_ret_THREAD_CREATE_FAIL;
+	  }
+	  calc_wt_rel_thread((void*)ulii);
+	}
 	join_threads2(threads, dist_thread_ct, ujj);
       }
     }
@@ -7354,13 +7313,13 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
 	  } else {
 	    strcpy(outname_end, ".grm.gz");
 	  }
-	  parallel_compress(outname, 0, calc_rel_grm_emitn);
+	  parallel_compress(outname, overflow_buf, 0, calc_rel_grm_emitn);
 	} else {
 	  strcpy(outname_end, ".grm");
 	  if (parallel_tot > 1) {
 	    sprintf(&(outname_end[4]), ".%u", parallel_idx + 1);
 	  }
-	  retval = write_uncompressed(outname, 0, calc_rel_grm_emitn);
+	  retval = write_uncompressed(outname, overflow_buf, 0, calc_rel_grm_emitn);
 	  if (retval) {
 	    goto calc_rel_ret_1;
 	  }
@@ -7380,9 +7339,9 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
 	}
 	if (rel_shape == REL_CALC_TRI) {
 	  if (rel_calc_type & REL_CALC_GZ) {
-	    parallel_compress(outname, 0, calc_rel_tri_emitn);
+	    parallel_compress(outname, overflow_buf, 0, calc_rel_tri_emitn);
 	  } else {
-	    retval = write_uncompressed(outname, 0, calc_rel_tri_emitn);
+	    retval = write_uncompressed(outname, overflow_buf, 0, calc_rel_tri_emitn);
 	    if (retval) {
 	      goto calc_rel_ret_1;
 	    }
@@ -7406,9 +7365,9 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
 	  }
 	  g_cr_min_sample = min_sample;
 	  if (rel_calc_type & REL_CALC_GZ) {
-	    parallel_compress(outname, 0, calc_rel_sq0_emitn);
+	    parallel_compress(outname, overflow_buf, 0, calc_rel_sq0_emitn);
 	  } else {
-	    retval = write_uncompressed(outname, 0, calc_rel_sq0_emitn);
+	    retval = write_uncompressed(outname, overflow_buf, 0, calc_rel_sq0_emitn);
 	    if (retval) {
 	      goto calc_rel_ret_1;
 	    }
@@ -7416,9 +7375,9 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
 	} else {
 	  g_cr_min_sample = min_sample;
 	  if (rel_calc_type & REL_CALC_GZ) {
-	    parallel_compress(outname, 0, calc_rel_sq_emitn);
+	    parallel_compress(outname, overflow_buf, 0, calc_rel_sq_emitn);
 	  } else {
-	    retval = write_uncompressed(outname, 0, calc_rel_sq_emitn);
+	    retval = write_uncompressed(outname, overflow_buf, 0, calc_rel_sq_emitn);
 	    if (retval) {
 	      goto calc_rel_ret_1;
 	    }
@@ -7470,9 +7429,6 @@ int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_to
   calc_rel_ret_WRITE_FAIL:
     retval = RET_WRITE_FAIL;
     break;
-  calc_rel_ret_INVALID_CMDLINE:
-    retval = RET_INVALID_CMDLINE;
-    break;
   calc_rel_ret_THREAD_CREATE_FAIL:
     retval = RET_THREAD_CREATE_FAIL;
     break;
@@ -7937,13 +7893,13 @@ int32_t calc_pca(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outna
 }
 #endif
 
-int32_t calc_ibm(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uintptr_t* marker_exclude, uint32_t marker_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, Chrom_info* chrom_info_ptr) {
+int32_t calc_ibm(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uint32_t marker_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, Chrom_info* chrom_info_ptr) {
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
   uintptr_t marker_uidx = 0;
   uintptr_t marker_idx = 0;
-  uint32_t chrom_fo_idx = 0;
   uint32_t dist_thread_ct = g_thread_ct;
   int32_t retval = 0;
+  uintptr_t* marker_exclude = marker_exclude_orig;
   uint32_t* giptr = NULL;
   unsigned char* wkspace_mark;
   unsigned char* bedbuf;
@@ -7959,12 +7915,7 @@ int32_t calc_ibm(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uintpt
   uint32_t umm;
   uint32_t unn;
   uintptr_t* glptr;
-  uint32_t marker_ct_autosomal;
   int64_t llxx;
-  if (is_set(chrom_info_ptr->haploid_mask, 0)) {
-    logprint("Error: '--cluster missing' cannot currently be used on haploid genomes.\n");
-    goto calc_ibm_ret_INVALID_CMDLINE;
-  }
   g_sample_ct = sample_ct;
   if (dist_thread_ct > sample_ct / 32) {
     dist_thread_ct = sample_ct / 32;
@@ -7990,21 +7941,20 @@ int32_t calc_ibm(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uintpt
   }
   g_mmasks = mmasks;
   fseeko(bedfile, bed_offset, SEEK_SET);
-  uii = count_non_autosomal_markers(chrom_info_ptr, marker_exclude, 1, 1);
-  marker_ct_autosomal = marker_ct - uii;
-  if (uii) {
-    LOGPRINTF("Excluding %u variant%s on non-autosomes from IBM calculation.\n", uii, (uii == 1)? "" : "s");
-  }
-  is_last_block = (marker_idx == marker_ct_autosomal);
-  while (!is_last_block) {
-    retval = block_load_autosomal(bedfile, bed_offset, marker_exclude, marker_ct_autosomal, MULTIPLEX_DIST, unfiltered_sample_ct4, chrom_info_ptr, NULL, NULL, bedbuf, &chrom_fo_idx, &marker_uidx, &marker_idx, &ujj, NULL, NULL, NULL, NULL);
+  retval = conditional_allocate_non_autosomal_markers(chrom_info_ptr, unfiltered_marker_ct, marker_exclude_orig, marker_ct, 1, 1, "IBM calculation", &marker_exclude, &uii);
+  if (retval) {
+    goto calc_ibm_ret_1;
+  }
+  marker_ct -= uii;
+  do {
+    retval = block_load(bedfile, bed_offset, marker_exclude, marker_ct, MULTIPLEX_DIST, unfiltered_sample_ct4, bedbuf, &marker_uidx, &marker_idx, &ujj);
     if (retval) {
       goto calc_ibm_ret_1;
     }
     if (ujj < MULTIPLEX_DIST) {
       memset(&(bedbuf[ujj * unfiltered_sample_ct4]), 0, (MULTIPLEX_DIST - ujj) * unfiltered_sample_ct4);
     }
-    is_last_block = (marker_idx == marker_ct_autosomal);
+    is_last_block = (marker_idx == marker_ct);
     for (ukk = 0; ukk < ujj; ukk += BITCT) {
       glptr = mmasks;
       giptr = sample_missing_unwt;
@@ -8044,43 +7994,43 @@ int32_t calc_ibm(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uintpt
 
     printf("\r%" PRIuPTR " markers complete.", marker_idx);
     fflush(stdout);
-  }
+  } while (!is_last_block);
   putchar('\r');
   wkspace_reset(wkspace_mark);
   while (0) {
   calc_ibm_ret_NOMEM:
     retval = RET_NOMEM;
     break;
-  calc_ibm_ret_INVALID_CMDLINE:
-    retval = RET_INVALID_CMDLINE;
-    break;
   calc_ibm_ret_THREAD_CREATE_FAIL:
     retval = RET_THREAD_CREATE_FAIL;
     break;
   }
  calc_ibm_ret_1:
+  // caller will free memory if there was an error
   return retval;
 }
 
-int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_tot, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint64_t calculation_type, uint32_t dist_calc_type, uintptr_t* marker_exclude, uint32_t marker_ct, double* set_allele_freqs, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uintptr_t max_sample_id_len, Chrom_info* chrom_info_ptr, uint32_t wt_needed, uint32_t marker_weight_sum, uint3 [...]
+int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_tot, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, char* read_dists_fname, char* distance_wts_fname, double distance_exp, uint64_t calculation_type, uint32_t dist_calc_type, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uint32_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, double* set_allele_freqs, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exc [...]
+  // if calculation_type == 0, this must perform the basic unweighted
+  // computation and not write to disk.
   FILE* outfile = NULL;
   FILE* outfile2 = NULL;
   FILE* outfile3 = NULL;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
   uint64_t dists_alloc = 0;
-  double marker_weight_sum_d = (double)marker_weight_sum;
+  uint32_t missing_wt_needed = ((calculation_type & CALC_DISTANCE) || ((!read_dists_fname) && (calculation_type & (CALC_IBS_TEST | CALC_GROUPDIST | CALC_REGRESS_DISTANCE)))) && (!(dist_calc_type & DISTANCE_FLAT_MISSING));
   uint32_t unwt_needed = 0;
-  uintptr_t marker_uidx = 0;
-  uintptr_t marker_idx = 0;
-  uint32_t chrom_fo_idx = 0;
+  uint32_t marker_weight_sum = 0;
   int32_t retval = 0;
-  uint32_t exp0 = (exponent == 0.0);
+  uintptr_t* marker_exclude = marker_exclude_orig;
+  uint32_t* dist_missing_wts_i = NULL;
   uint32_t* sample_missing = NULL;
   uint32_t* sample_missing_unwt = NULL;
   uint32_t* giptr = NULL;
   uint32_t* giptr2 = NULL;
   char* writebuf = NULL;
-  double* weights = NULL;
+  double* main_weights = NULL;
+  double* subset_weights = NULL;
   uint32_t dist_thread_ct = g_thread_ct;
   double set_allele_freq_buf[MULTIPLEX_DIST];
   uint32_t wtbuf[MULTIPLEX_DIST];
@@ -8093,6 +8043,8 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
   unsigned char* gptr;
   uintptr_t sample_uidx;
   uintptr_t sample_idx;
+  uintptr_t marker_uidx;
+  uintptr_t marker_idx;
   uintptr_t ulii;
   uintptr_t uljj;
   uintptr_t ulkk;
@@ -8109,17 +8061,13 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
   uintptr_t* glptr;
   uintptr_t* glptr2;
   uintptr_t* glptr3;
+  double* dist_missing_wts;
   double* dptr2;
+  double marker_weight_sum_d;
   double dxx;
   double dyy;
-  uint32_t marker_ct_autosomal;
   uint32_t multiplex;
-  uint32_t chrom_end;
   int64_t llxx;
-  if (is_set(chrom_info_ptr->haploid_mask, 0)) {
-    logprint("Error: --distance/--ibs-matrix/--distance-matrix cannot be used on haploid\ngenomes.\n");
-    goto calc_distance_ret_INVALID_CMDLINE;
-  }
   g_sample_ct = sample_ct;
   if (dist_thread_ct > sample_ct / 32) {
     dist_thread_ct = sample_ct / 32;
@@ -8154,8 +8102,9 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
   if (wkspace_alloc_d_checked(&g_dists, dists_alloc + CACHELINE)) {
     goto calc_distance_ret_NOMEM;
   }
+  // stack allocations before this point must be freed by the caller.
   wkspace_mark = wkspace_base;
-  if (wt_needed) {
+  if (missing_wt_needed) {
     if (wkspace_alloc_ui_checked(&g_missing_tot_weights, llxx * sizeof(int32_t)) ||
         wkspace_alloc_ui_checked(&sample_missing, sample_ct * sizeof(int32_t))) {
       goto calc_distance_ret_NOMEM;
@@ -8167,12 +8116,31 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
     g_missing_tot_weights = NULL;
   }
 
-  if (exp0) {
+  ujj = distance_wts_fname || (distance_exp != 0.0); // special weights?
+  if (!ujj) {
     g_idists = (int32_t*)((char*)wkspace_mark - CACHEALIGN(llxx * sizeof(int32_t)));
     fill_int_zero(g_idists, llxx);
-    masks = (uintptr_t*)wkspace_alloc(sample_ct * (MULTIPLEX_2DIST / 8));
   } else {
     fill_double_zero(g_dists, llxx);
+  }
+
+  retval = conditional_allocate_non_autosomal_markers(chrom_info_ptr, unfiltered_marker_ct, marker_exclude_orig, marker_ct, 1, 1, "distance matrix calc", &marker_exclude, &uii);
+  if (retval) {
+    goto calc_distance_ret_1;
+  }
+  marker_ct -= uii;
+
+  if (distance_wts_fname) {
+    retval = load_distance_wts(distance_wts_fname, unfiltered_marker_ct, marker_ids, max_marker_id_len, dist_calc_type & DISTANCE_WTS_NOHEADER, (marker_exclude == marker_exclude_orig), &marker_exclude, &marker_ct, &main_weights);
+    if (retval) {
+      goto calc_distance_ret_1;
+    }
+  }
+
+  // stack allocations past this point are freed BEFORE results are written.
+  if (!ujj) {
+    masks = (uintptr_t*)wkspace_alloc(sample_ct * (MULTIPLEX_2DIST / 8));
+  } else {
     masks = (uintptr_t*)wkspace_alloc(sample_ct * sizeof(intptr_t));
   }
   if (!masks) {
@@ -8182,7 +8150,77 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
     goto calc_distance_ret_NOMEM;
   }
 
-  if (exp0) {
+  // Load or compute nonuniform marker weighting scheme.
+  if (distance_exp != 0.0) {
+    if (wkspace_alloc_d_checked(&main_weights, marker_ct * sizeof(double))) {
+      goto calc_distance_ret_NOMEM;
+    }
+    for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
+      next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
+      dxx = set_allele_freqs[marker_uidx];
+      dyy = 2 * dxx * (1.0 - dxx);
+      if (dyy != 0.0) {
+	dyy = pow(dyy, -distance_exp);
+      }
+      main_weights[marker_idx] = dyy;
+    }
+  }
+  // Now compute missing observation weights.  (Note that these are usually not
+  // the same as the raw marker weights: for instance, a missing observation at
+  // a MAF-0 marker has no weight at all.)
+  if (missing_wt_needed) {
+    // hack: overwrite dist_missing_wts while populating dist_missing_wts_i.
+    // CACHELINE padding added to reduce risk of an aliasing problem.
+    if (wkspace_alloc_ui_checked(&dist_missing_wts_i, CACHELINE) ||
+        wkspace_alloc_d_checked(&dist_missing_wts, marker_ct * sizeof(double))) {
+      goto calc_distance_ret_NOMEM;
+    }
+    dyy = 0.0; // raw weight sum
+    for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
+      next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
+      // assume HWE, compute expected contribution to distance statistic:
+      //   expected minor allele obs: 2 * maf
+      //   P(0 copies) = (1 - maf) * (1 - maf)
+      //   P(1 copy)   = 2 * maf * (1 - maf)
+      //   P(2 copies) = maf * maf
+      //   frequency of distance-1 pairs:
+      //       freq[0-1 pair] + freq[1-2 pair]
+      //     =   2 * (1 - maf) * (1 - maf) * 2 * maf * (1 - maf)
+      //       + 2 * 2 * maf * (1 - maf) * maf * maf
+      //     =   4 * maf * (1 - maf) * (maf * maf + (1 - maf) * (1 - maf))
+      //     =   4 * maf * (1 - maf) * (2 * maf * maf - 2 * maf + 1)
+      //   frequency of distance-2 pairs:
+      //     2 * (1 - maf) * (1 - maf) * maf * maf
+      //   expected distance:
+      //       4 * maf * (1 - maf) * (2 * maf * maf - 2 * maf + 1
+      //                              + maf * (1 - maf))
+      //     = 4 * maf * (1 - maf) * (maf * maf - maf + 1)
+      //     constant factor doesn't matter here
+      dxx = set_allele_freqs[marker_uidx];
+      if ((dxx != 0.0) && (dxx != 1.0)) {
+	dxx = dxx * (1.0 - dxx) * (dxx * dxx - dxx + 1);
+	if (main_weights) {
+	  dxx *= main_weights[marker_idx];
+	}
+      }
+      dist_missing_wts[marker_idx] = dxx;
+      dyy += dxx;
+    }
+
+    // now normalize to sum to just under 2^32.  (switch to 2^64 if/when 32-bit
+    // performance becomes less important than accuracy on 50+ million marker
+    // sets.)
+    // subtract marker_ct to guard against rounding-driven overflow
+    dyy = (4294967296.0 - ((double)((intptr_t)marker_ct))) / dyy;
+    for (marker_idx = 0; marker_idx < marker_ct; marker_idx++) {
+      uii = (uint32_t)(dist_missing_wts[marker_idx] * dyy + 0.5);
+      marker_weight_sum += uii;
+      dist_missing_wts_i[marker_idx] = uii;
+    }
+  }
+  marker_weight_sum_d = (double)marker_weight_sum;
+
+  if (!main_weights) {
     multiplex = MULTIPLEX_DIST;
     geno = (uintptr_t*)wkspace_alloc(sample_ct * (MULTIPLEX_2DIST / 8));
   } else {
@@ -8199,27 +8237,23 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
   if (wkspace_alloc_uc_checked(&bedbuf, multiplex * unfiltered_sample_ct4)) {
     goto calc_distance_ret_NOMEM;
   }
-  if (!exp0) {
+  if (main_weights) {
 #ifdef __LP64__
-    if (wkspace_alloc_d_checked(&weights, 45056 * sizeof(double))) {
+    if (wkspace_alloc_d_checked(&subset_weights, 45056 * sizeof(double))) {
       goto calc_distance_ret_NOMEM;
     }
 #else
-    if (wkspace_alloc_d_checked(&weights, 32768 * sizeof(double))) {
+    if (wkspace_alloc_d_checked(&subset_weights, 32768 * sizeof(double))) {
       goto calc_distance_ret_NOMEM;
     }
-    g_weights_i = wtbuf;
+    g_subset_weights_i = wtbuf;
 #endif
-    g_weights = weights;
+    g_subset_weights = subset_weights;
   }
   fseeko(bedfile, bed_offset, SEEK_SET);
-  uii = count_non_autosomal_markers(chrom_info_ptr, marker_exclude, 1, 1);
-  marker_ct_autosomal = marker_ct - uii;
-  if (uii) {
-    LOGPRINTF("Excluding %u variant%s on non-autosomes from distance matrix calc.\n", uii, (uii == 1)? "" : "s");
-  }
-  is_last_block = (marker_idx == marker_ct_autosomal);
-  while (!is_last_block) {
+  marker_uidx = 0;
+  marker_idx = 0;
+  do {
     for (ujj = 0; ujj < multiplex; ujj++) {
       set_allele_freq_buf[ujj] = 0.5;
     }
@@ -8268,13 +8302,21 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
     // See the comments at the beginning of this file for discussion of
     // the zero exponent special case.
 
-    retval = block_load_autosomal(bedfile, bed_offset, marker_exclude, marker_ct_autosomal, multiplex, unfiltered_sample_ct4, chrom_info_ptr, set_allele_freqs, marker_weights_i, bedbuf, &chrom_fo_idx, &marker_uidx, &marker_idx, &ujj, NULL, set_allele_freq_buf, NULL, wt_needed? wtbuf : NULL);
+    copy_set_allele_freqs(marker_uidx, marker_exclude, multiplex, marker_idx, marker_ct, NULL, set_allele_freqs, set_allele_freq_buf);
+    if (missing_wt_needed) {
+      uii = marker_ct - marker_idx;
+      if (uii > multiplex) {
+	uii = multiplex;
+      }
+      memcpy(wtbuf, &(dist_missing_wts_i[marker_idx]), uii * sizeof(int32_t));
+    }
+    retval = block_load(bedfile, bed_offset, marker_exclude, marker_ct, multiplex, unfiltered_sample_ct4, bedbuf, &marker_uidx, &marker_idx, &ujj);
     if (retval) {
       goto calc_distance_ret_1;
     }
     if (ujj < multiplex) {
       memset(&(bedbuf[ujj * unfiltered_sample_ct4]), 0, (multiplex - ujj) * unfiltered_sample_ct4);
-      if (exp0) {
+      if (!main_weights) {
 	fill_ulong_zero(geno, sample_ct * (MULTIPLEX_2DIST / BITCT));
 	fill_ulong_zero(masks, sample_ct * (MULTIPLEX_2DIST / BITCT));
       } else {
@@ -8282,13 +8324,13 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
 	fill_ulong_zero(masks, sample_ct);
       }
     }
-    is_last_block = (marker_idx == marker_ct_autosomal);
-    if (exp0) {
+    is_last_block = (marker_idx == marker_ct);
+    if (!main_weights) {
       for (ukk = 0; ukk < ujj; ukk += BITCT) {
 	glptr = &(geno[ukk / BITCT2]);
 	glptr2 = &(masks[ukk / BITCT2]);
 	glptr3 = mmasks;
-	if (wt_needed) {
+	if (missing_wt_needed) {
 	  giptr = sample_missing;
 	}
 	if (unwt_needed) {
@@ -8305,7 +8347,7 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
 	      ulii |= uljj << (umm * 2);
 	      if (uljj == 1) {
 		ulkk |= ONELU << umm;
-		if (wt_needed) {
+		if (missing_wt_needed) {
 		  *giptr += wtbuf[umm + ukk];
 		}
 		if (unwt_needed) {
@@ -8328,7 +8370,7 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
 	      ulii |= uljj << (umm * 2);
 	      if (uljj == 1) {
 		ulkk |= ONELU << umm;
-		if (wt_needed) {
+		if (missing_wt_needed) {
 		  *giptr += wtbuf[umm + ukk + BITCT2];
 		}
 		if (unwt_needed) {
@@ -8343,7 +8385,7 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
 	    *glptr3++ |= ulkk << BITCT2;
 	    glptr = &(glptr[(MULTIPLEX_2DIST / BITCT) - 1]);
 	    glptr2 = &(glptr2[(MULTIPLEX_2DIST / BITCT) - 1]);
-	    if (wt_needed) {
+	    if (missing_wt_needed) {
 	      giptr++;
 	    }
 	    if (unwt_needed) {
@@ -8352,8 +8394,8 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
 	  }
 	}
 
-	if (wt_needed) {
-	  g_weights_i = &(wtbuf[ukk]);
+	if (missing_wt_needed) {
+	  g_subset_weights_i = &(wtbuf[ukk]);
 	}
 	uii = is_last_block && (ukk + BITCT >= ujj);
 	if (spawn_threads2(threads, &calc_ibs_thread, dist_thread_ct, uii)) {
@@ -8400,7 +8442,7 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
 	    giptr3++;
 	  }
 	}
-	fill_weights(weights, &(set_allele_freq_buf[ukk]), exponent);
+	fill_subset_weights(subset_weights, &(main_weights[marker_idx - ujj + ukk]));
 	uii = is_last_block && (ukk + (MULTIPLEX_DIST_EXP / 3) >= ujj);
 	if (spawn_threads2(threads, &calc_wdist_thread, dist_thread_ct, uii)) {
 	  goto calc_distance_ret_THREAD_CREATE_FAIL;
@@ -8412,7 +8454,7 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
     }
     printf("\r%" PRIuPTR " markers complete.", marker_idx);
     fflush(stdout);
-  }
+  } while (!is_last_block);
   putchar('\r');
   logprint("Distance matrix calculation complete.\n");
   wkspace_reset(masks);
@@ -8432,7 +8474,7 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
     // parallel_tot must be 1 for --distance-matrix
     for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
       giptr2 = sample_missing_unwt;
-      uii = marker_ct_autosomal - giptr2[sample_idx];
+      uii = marker_ct - giptr2[sample_idx];
       wptr = writebuf;
       for (ujj = 0; ujj < sample_idx; ujj++) {
 	wptr = double_g_writex(wptr, ((double)(*iptr++)) / (2 * (uii - (*giptr2++) + (*giptr++))), ' ');
@@ -8482,7 +8524,7 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
     pct = 1;
     for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
       giptr2 = sample_missing_unwt;
-      uii = marker_ct_autosomal - giptr2[sample_idx];
+      uii = marker_ct - giptr2[sample_idx];
       wptr = writebuf;
       for (ujj = 0; ujj < sample_idx; ujj++) {
 	wptr = double_g_writex(wptr, 1.0 - (((double)(*iptr++)) / (2 * (uii - (*giptr2++) + (*giptr++)))), ' ');
@@ -8514,12 +8556,12 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
     }
     outname_end[5] = '\0';
     LOGPRINTFWW("IBS matrix written to %s , and IDs to %s.id .\n", outname, outname);
   }
   tstc = g_thread_start[dist_thread_ct];
-  if (wt_needed) {
+  if (missing_wt_needed) {
     giptr = g_missing_tot_weights;
     dptr2 = g_dists;
-    if (exp0) {
+    if (!main_weights) {
       iptr = g_idists;
       for (sample_idx = g_thread_start[0]; sample_idx < tstc; sample_idx++) {
 	giptr2 = sample_missing;
@@ -8541,13 +8583,13 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
   } else if (dist_calc_type & DISTANCE_FLAT_MISSING) {
     dptr2 = g_dists;
     giptr = g_missing_dbl_excluded;
-    if (exp0) {
+    if (!main_weights) {
       iptr = g_idists;
       if (dist_calc_type & DISTANCE_CLUSTER) {
 	// save as IBS
         for (sample_idx = g_thread_start[0]; sample_idx < tstc; sample_idx++) {
 	  giptr2 = sample_missing_unwt;
-	  uii = marker_ct_autosomal - giptr2[sample_idx];
+	  uii = marker_ct - giptr2[sample_idx];
 	  for (ujj = 0; ujj < sample_idx; ujj++) {
 	    *dptr2++ = 1.0 - (((double)(*iptr++)) / (2 * (uii - (*giptr2++) + (*giptr++))));
 	  }
@@ -8555,18 +8597,18 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
       } else {
 	for (sample_idx = g_thread_start[0]; sample_idx < tstc; sample_idx++) {
 	  giptr2 = sample_missing_unwt;
-	  uii = marker_ct_autosomal - giptr2[sample_idx];
+	  uii = marker_ct - giptr2[sample_idx];
 	  for (ujj = 0; ujj < sample_idx; ujj++) {
-	    *dptr2++ = (((double)marker_ct_autosomal) / (uii - (*giptr2++) + (*giptr++))) * (*iptr++);
+	    *dptr2++ = (((double)marker_ct) / (uii - (*giptr2++) + (*giptr++))) * (*iptr++);
 	  }
 	}
       }
     } else {
       for (sample_idx = g_thread_start[0]; sample_idx < tstc; sample_idx++) {
 	giptr2 = sample_missing_unwt;
-	uii = marker_ct_autosomal - giptr2[sample_idx];
+	uii = marker_ct - giptr2[sample_idx];
 	for (ujj = 0; ujj < sample_idx; ujj++) {
-	  *dptr2 *= ((double)marker_ct_autosomal) / (uii - (*giptr2++) + (*giptr++));
+	  *dptr2 *= ((double)marker_ct) / (uii - (*giptr2++) + (*giptr++));
 	  dptr2++;
 	}
       }
@@ -8574,18 +8616,16 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
   }
 
   if (calculation_type & (CALC_DISTANCE | CALC_IBS_TEST)) {
-    if ((exponent == 0.0) || (!(dist_calc_type & (DISTANCE_IBS | DISTANCE_1_MINUS_IBS)))) {
-      g_half_marker_ct_recip = 0.5 / (double)marker_ct_autosomal;
+    if ((distance_exp == 0.0) || (!(dist_calc_type & (DISTANCE_IBS | DISTANCE_1_MINUS_IBS)))) {
+      g_half_marker_ct_recip = 0.5 / (double)marker_ct;
     } else {
       dyy = 0.0;
       marker_uidx = 0;
-      chrom_fo_idx = 0xffffffffU;
-      chrom_end = 0;
-      for (marker_idx = 0; marker_idx < marker_ct_autosomal; marker_uidx++, marker_idx++) {
-	marker_uidx = next_autosomal_unsafe(marker_exclude, marker_uidx, chrom_info_ptr, &chrom_end, &chrom_fo_idx);
+      for (marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
+	next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
 	dxx = set_allele_freqs[marker_uidx];
 	if ((dxx > 0.0) && (dxx < 1.0)) {
-	  dyy += pow(2 * dxx * (1.0 - dxx), -exponent);
+	  dyy += pow(2 * dxx * (1.0 - dxx), -distance_exp);
 	} else {
 	  dyy += 1.0;
 	}
@@ -8618,9 +8658,6 @@ int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parall
   calc_distance_ret_WRITE_FAIL:
     retval = RET_WRITE_FAIL;
     break;
-  calc_distance_ret_INVALID_CMDLINE:
-    retval = RET_INVALID_CMDLINE;
-    break;
   calc_distance_ret_THREAD_CREATE_FAIL:
     retval = RET_THREAD_CREATE_FAIL;
     break;
@@ -8919,7 +8956,7 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
     // calculate entire distance matrix, or use already-calculated matrix in
     // memory
     if (!g_dists) {
-      retval = calc_distance(threads, 0, 1, bedfile, bed_offset, outname, outname_end, 0, DISTANCE_FLAT_MISSING | DISTANCE_CLUSTER, marker_exclude, marker_ct, set_allele_freqs, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, chrom_info_ptr, 0, 0, NULL, 0.0);
+      retval = calc_distance(threads, 0, 1, bedfile, bed_offset, outname, outname_end, NULL, NULL, 0.0, 0, DISTANCE_FLAT_MISSING | DISTANCE_CLUSTER, unfiltered_marker_ct, marker_exclude, marker_ct, NULL, 0, set_allele_freqs, unfiltered_sample_ct, sample_exclude, sample_ct, sample_ids, max_sample_id_len, chrom_info_ptr);
       if (retval) {
         goto calc_cluster_neighbor_ret_1;
       }
@@ -9057,7 +9094,7 @@ int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_o
   }
   if (cluster_missing || ibm_constraint) {
     if (!g_missing_dbl_excluded) {
-      retval = calc_ibm(threads, bedfile, bed_offset, marker_exclude, marker_ct, unfiltered_sample_ct, sample_exclude, sample_ct, chrom_info_ptr);
+      retval = calc_ibm(threads, bedfile, bed_offset, unfiltered_marker_ct, marker_exclude, marker_ct, unfiltered_sample_ct, sample_exclude, sample_ct, chrom_info_ptr);
       if (retval) {
         goto calc_cluster_neighbor_ret_1;
       }
diff --git a/plink_calc.h b/plink_calc.h
index 1dc3abb..6e86302 100644
--- a/plink_calc.h
+++ b/plink_calc.h
@@ -61,13 +61,13 @@ int32_t calc_genome(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uin
 
 int32_t rel_cutoff_batch(uint32_t load_grm_bin, char* grmname, char* outname, char* outname_end, Rel_info* relip);
 
-int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_tot, uint64_t calculation_type, Rel_info* relip, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_reverse, uint32_t marker_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* sample_ids, uintptr_t max_sample_id_len, double* set_allele_freqs, double** rel_ibc_ptr,  [...]
+int32_t calc_rel(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_tot, uint64_t calculation_type, Rel_info* relip, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, char* distance_wts_fname, uint32_t distance_wts_noheader, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t* marker_reverse, uint32_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_e [...]
 
 #ifndef NOLAPACK
 int32_t calc_pca(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint64_t calculation_type, Rel_info* relip, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, uintptr_t* pca_sample_exclude, uintptr_t pca_sample_ct, char* sample_ids, uintptr_t max_sample_id_len, double* set_allele_freq [...]
 #endif
 
-int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_tot, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint64_t calculation_type, uint32_t dist_calc_type, uintptr_t* marker_exclude, uint32_t marker_ct, double* set_allele_freqs, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uintptr_t max_sample_id_len, Chrom_info* chrom_info_ptr, uint32_t wt_needed, uint32_t marker_weight_sum, uint3 [...]
+int32_t calc_distance(pthread_t* threads, uint32_t parallel_idx, uint32_t parallel_tot, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, char* read_dists_fname, char* distance_wts_fname, double distance_exp, uint64_t calculation_type, uint32_t dist_calc_type, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uint32_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, double* set_allele_freqs, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exc [...]
 
 int32_t calc_cluster_neighbor(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, uint32_t marker_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, char* read_dists_fname, char* read_dists_id_fname, char* read_genome_fname, char* outname, char* ou [...]
 
diff --git a/plink_cluster.c b/plink_cluster.c
index 7d23997..305e817 100644
--- a/plink_cluster.c
+++ b/plink_cluster.c
@@ -600,6 +600,11 @@ void fill_unfiltered_sample_to_cluster(uintptr_t unfiltered_sample_ct, uintptr_t
 }
 
 int32_t fill_sample_to_cluster(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, uintptr_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts, uint32_t* sample_to_cluster, uint32_t* late_clidx_to_sample_uidx) {
+  // If late_clidx_to_sample_uidx is not NULL, all samples not in a loaded
+  // cluster are given their own cluster, and late_clidx_to_sample_uidx is
+  // filled with the cluster index -> sample uidx mapping.
+  // (Yes, this is a strange interface; it may be switched to filtered sample
+  // indexes later.)
   unsigned char* wkspace_mark = wkspace_base;
   uint32_t* cluster_map_pos = cluster_map;
   int32_t retval = 0;
@@ -619,7 +624,7 @@ int32_t fill_sample_to_cluster(uintptr_t unfiltered_sample_ct, uintptr_t* sample
       sample_to_cluster[uidx_to_idx[*cluster_map_pos]] = cluster_idx;
     } while (++cluster_map_pos < cluster_end_ptr);
   }
-  if (cluster_starts[cluster_ct] < sample_ct) {
+  if (late_clidx_to_sample_uidx && (cluster_starts[cluster_ct] < sample_ct)) {
     sample_uidx = 0;
     for (sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
       sample_uidx = next_unset_unsafe(sample_exclude, sample_uidx);
diff --git a/plink_common.c b/plink_common.c
index 5a6680c..7d15185 100644
--- a/plink_common.c
+++ b/plink_common.c
@@ -568,6 +568,25 @@ int32_t read_tokens(FILE* infile, char* buf, uintptr_t half_bufsize, uintptr_t t
   }
 }
 
+int32_t gzputs_w4(gzFile gz_outfile, const char* ss) {
+  // gzFile counterpart of fputs_w4(): right-justifies ss in a width-4 column
+  // without calling strlen().  Returns -1 if any gzputs()/gzputc() call fails.
+  if (ss[1]) {
+    if (!ss[2]) {
+      // two characters: pad with two spaces
+      if (gzputs(gz_outfile, "  ") == -1) {
+        return -1;
+      }
+    } else if ((!ss[3]) && (gzputc(gz_outfile, ' ') == -1)) {
+      // three characters: the single pad space could not be written
+      return -1;
+    }
+    return gzputs(gz_outfile, ss);
+  }
+  // single character: three pad spaces, then the character itself
+  return (gzputs(gz_outfile, "   ") == -1)? -1 : gzputc(gz_outfile, ss[0]);
+}
+
 int32_t get_next_noncomment(FILE* fptr, char** lptr_ptr, uintptr_t* line_idx_ptr) {
   char* lptr;
   do {
@@ -7633,6 +7652,54 @@ uint32_t count_non_autosomal_markers(Chrom_info* chrom_info_ptr, uintptr_t* mark
   return ct;
 }
 
+int32_t conditional_allocate_non_autosomal_markers(Chrom_info* chrom_info_ptr, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uint32_t marker_ct, uint32_t count_x, uint32_t count_mt, const char* calc_descrip, uintptr_t** marker_exclude_ptr, uint32_t* newly_excluded_ct_ptr) { // builds an exclusion mask that additionally drops X (if count_x), Y, and MT (if count_mt) variants
+  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT; // word count of a unfiltered_marker_ct-bit array, rounded up
+  int32_t x_code = chrom_info_ptr->x_code; // a code of -1 is treated below as "chromosome not present"
+  int32_t y_code = chrom_info_ptr->y_code;
+  int32_t mt_code = chrom_info_ptr->mt_code;
+  uint32_t x_ct = 0;
+  uint32_t y_ct = 0;
+  uint32_t mt_ct = 0;
+  if (is_set(chrom_info_ptr->haploid_mask, 0)) { // NOTE(review): bit 0 of haploid_mask presumably flags an all-haploid genome -- confirm
+    *newly_excluded_ct_ptr = marker_ct; // every variant counts as newly excluded, so the error branch below is taken
+  } else {
+    if (count_x && (x_code != -1)) {
+      x_ct = count_chrom_markers(chrom_info_ptr, x_code, marker_exclude_orig); // presumably the number of not-yet-excluded X variants -- verify against count_chrom_markers()
+    }
+    if (y_code != -1) { // Y is counted unconditionally; there is no count_y switch
+      y_ct = count_chrom_markers(chrom_info_ptr, y_code, marker_exclude_orig);
+    }
+    if (count_mt && (mt_code != -1)) {
+      mt_ct = count_chrom_markers(chrom_info_ptr, mt_code, marker_exclude_orig);
+    }
+    *newly_excluded_ct_ptr = x_ct + y_ct + mt_ct;
+  }
+  if (*newly_excluded_ct_ptr) { // the exclusion is logged before the "nothing left" check, so both messages can appear
+    LOGPRINTF("Excluding %u variant%s on non-autosomes from %s.\n", *newly_excluded_ct_ptr, (*newly_excluded_ct_ptr == 1)? "" : "s", calc_descrip);
+  }
+  if (*newly_excluded_ct_ptr == marker_ct) { // also true when marker_ct is 0
+    logprint("Error: No variants remaining.\n");
+    return RET_INVALID_CMDLINE;
+  }
+  if (!(*newly_excluded_ct_ptr)) { // nothing to exclude: return without allocating; *marker_exclude_ptr is not written
+    return 0;
+  }
+  if (wkspace_alloc_ul_checked(marker_exclude_ptr, unfiltered_marker_ctl * sizeof(intptr_t))) { // the only allocation in this function
+    return RET_NOMEM;
+  }
+  memcpy(*marker_exclude_ptr, marker_exclude_orig, unfiltered_marker_ctl * sizeof(intptr_t)); // start from a copy of the caller's mask; marker_exclude_orig is not modified
+  if (x_ct) {
+    fill_bits(*marker_exclude_ptr, chrom_info_ptr->chrom_start[(uint32_t)x_code], chrom_info_ptr->chrom_end[(uint32_t)x_code] - chrom_info_ptr->chrom_start[(uint32_t)x_code]); // presumably sets every bit in X's [chrom_start, chrom_end) range -- confirm fill_bits() semantics
+  }
+  if (y_ct) {
+    fill_bits(*marker_exclude_ptr, chrom_info_ptr->chrom_start[(uint32_t)y_code], chrom_info_ptr->chrom_end[(uint32_t)y_code] - chrom_info_ptr->chrom_start[(uint32_t)y_code]);
+  }
+  if (mt_ct) {
+    fill_bits(*marker_exclude_ptr, chrom_info_ptr->chrom_start[(uint32_t)mt_code], chrom_info_ptr->chrom_end[(uint32_t)mt_code] - chrom_info_ptr->chrom_start[(uint32_t)mt_code]);
+  }
+  return 0; // success with a newly allocated mask in *marker_exclude_ptr
+}
+
 uint32_t get_max_chrom_size(Chrom_info* chrom_info_ptr, uintptr_t* marker_exclude, uint32_t* last_chrom_fo_idx_ptr) {
   uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
   uint32_t max_chrom_size = 0;
@@ -7683,21 +7750,6 @@ void count_genders(uintptr_t* sex_nm, uintptr_t* sex_male, uintptr_t unfiltered_
   *unk_ct_ptr = unk_ct;
 }
 
-double calc_wt_mean_maf(double exponent, double maf) {
-  // assume Hardy-Weinberg equilibrium
-  // homozygote frequencies: maf^2, (1-maf)^2
-  // heterozygote frequency: 2maf(1-maf)
-  double ll_freq = maf * maf;
-  double lh_freq = 2 * maf * (1.0 - maf);
-  double hh_freq = (1.0 - maf) * (1.0 - maf);
-  double weight;
-  if (lh_freq == 0.0) {
-    return 0.0;
-  }
-  weight = pow(lh_freq, -exponent);
-  return (lh_freq * (ll_freq + lh_freq) + 2 * ll_freq * hh_freq) * weight;
-}
-
 void reverse_loadbuf(unsigned char* loadbuf, uintptr_t unfiltered_sample_ct) {
   uintptr_t sample_bidx = 0;
   unsigned char* loadbuf_end = &(loadbuf[(unfiltered_sample_ct + 3) / 4]);
@@ -7920,77 +7972,6 @@ uint32_t load_and_split(FILE* bedfile, uintptr_t* rawbuf, uint32_t unfiltered_sa
   }
 }
 
-uint32_t block_load_autosomal(FILE* bedfile, int32_t bed_offset, uintptr_t* marker_exclude, uint32_t marker_ct_autosomal, uint32_t block_max_size, uintptr_t unfiltered_sample_ct4, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uint32_t* marker_weights, unsigned char* readbuf, uint32_t* chrom_fo_idx_ptr, uintptr_t* marker_uidx_ptr, uintptr_t* marker_idx_ptr, uint32_t* block_size_ptr, uintptr_t* marker_reverse, double* set_allele_freq_buf, float* set_allele_freq_buf_fl, uint32_t* wtbuf) {
-  uintptr_t marker_uidx = *marker_uidx_ptr;
-  uintptr_t marker_idx = *marker_idx_ptr;
-  uint32_t chrom_fo_idx = *chrom_fo_idx_ptr;
-  uint32_t chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1];
-  uint32_t markers_read = 0;
-  uint32_t autosome_ct = chrom_info_ptr->autosome_ct;
-  uint32_t xy_code = (uint32_t)chrom_info_ptr->xy_code;
-  uint32_t max_code = chrom_info_ptr->max_code;
-  uint32_t cur_chrom;
-  uint32_t is_x;
-  uint32_t is_y;
-  uint32_t is_mt;
-  uint32_t is_haploid;
-
-  if (block_max_size > marker_ct_autosomal - marker_idx) {
-    block_max_size = marker_ct_autosomal - marker_idx;
-  }
-  while (markers_read < block_max_size) {
-    if (IS_SET(marker_exclude, marker_uidx)) {
-      marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
-      if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
-	return RET_READ_FAIL;
-      }
-    }
-    if (marker_uidx >= chrom_end) {
-      while (1) {
-	chrom_fo_idx++;
-	refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
-	cur_chrom = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
-	if ((cur_chrom <= autosome_ct) || (cur_chrom == xy_code) || (cur_chrom > max_code)) {
-	  // for now, unplaced chromosomes are all "autosomal"
-	  break;
-	}
-	marker_uidx = next_unset_ul_unsafe(marker_exclude, chrom_end);
-	if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
-	  return RET_READ_FAIL;
-	}
-      }
-    }
-    if (fread(&(readbuf[markers_read * unfiltered_sample_ct4]), 1, unfiltered_sample_ct4, bedfile) < unfiltered_sample_ct4) {
-      return RET_READ_FAIL;
-    }
-    if (set_allele_freq_buf) {
-      if ((!marker_reverse) || (!IS_SET(marker_reverse, marker_uidx))) {
-        set_allele_freq_buf[markers_read] = set_allele_freqs[marker_uidx];
-      } else {
-        set_allele_freq_buf[markers_read] = 1.0 - set_allele_freqs[marker_uidx];
-      }
-    } else if (set_allele_freq_buf_fl) {
-      if (!IS_SET(marker_reverse, marker_uidx)) {
-        set_allele_freq_buf_fl[markers_read] = (float)set_allele_freqs[marker_uidx];
-      } else {
-        set_allele_freq_buf_fl[markers_read] = 1.0 - ((float)set_allele_freqs[marker_uidx]);
-      }
-    }
-    if (wtbuf) {
-      wtbuf[markers_read] = marker_weights[marker_idx];
-    }
-    markers_read++;
-    marker_idx++;
-    marker_uidx++;
-  }
-
-  *chrom_fo_idx_ptr = chrom_fo_idx;
-  *marker_uidx_ptr = marker_uidx;
-  *marker_idx_ptr = marker_idx;
-  *block_size_ptr = markers_read;
-  return 0;
-}
-
 void vec_include_init(uintptr_t unfiltered_sample_ct, uintptr_t* new_include2, uintptr_t* old_include) {
   uint32_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
   uintptr_t ulii;
diff --git a/plink_common.h b/plink_common.h
index f8848e1..315f56e 100644
--- a/plink_common.h
+++ b/plink_common.h
@@ -20,12 +20,6 @@
 #define PROG_NAME_STR "plink"
 #define PROG_NAME_CAPS "PLINK"
 
-#ifdef STABLE_BUILD
-  #define UNSTABLE goto main_unstable_disabled
-#else
-  #define UNSTABLE
-#endif
-
 #ifdef _WIN32
   // needed for MEMORYSTATUSEX
   #ifndef _WIN64
@@ -45,6 +39,7 @@
   #define pthread_t HANDLE
   #define THREAD_RET_TYPE unsigned __stdcall
   #define THREAD_RETURN return 0
+  #define EOLN_STR "\r\n"
 #else
   #include <pthread.h>
   #define THREAD_RET_TYPE void*
@@ -54,6 +49,7 @@
       #define PRId64 "lld"
     #endif
   #endif
+  #define EOLN_STR "\n"
 #endif
 
 #ifdef __APPLE__
@@ -223,6 +219,9 @@
 #define MISC_SPLIT_MERGE_NOFAIL 0x400000000LLU
 #define MISC_REAL_REF_ALLELES 0x800000000LLU
 #define MISC_RPLUGIN_DEBUG 0x1000000000LLU
+#define MISC_MISSING_GZ 0x2000000000LLU
+#define MISC_FREQ_GZ 0x4000000000LLU
+#define MISC_HET_GZ 0x8000000000LLU
 
 // assume for now that .bed must always be accompanied by both .bim and .fam
 #define FILTER_ALL_REQ 1LLU
@@ -305,9 +304,10 @@
 #define CALC_WRITE_VAR_RANGES 0x40000000000000LLU
 #define CALC_DUPVAR 0x80000000000000LLU
 #define CALC_RPLUGIN 0x100000000000000LLU
+#define CALC_DFAM 0x200000000000000LLU
 #define CALC_ONLY_BIM (CALC_WRITE_SET | CALC_WRITE_SNPLIST | CALC_WRITE_VAR_RANGES | CALC_LIST_23_INDELS | CALC_MAKE_BIM | CALC_DUPVAR)
 #define CALC_ONLY_FAM (CALC_MAKE_PERM_PHENO | CALC_WRITE_COVAR | CALC_MAKE_FAM)
-// only room for 7 more basic commands before we need to switch from a single
+// only room for 6 more basic commands before we need to switch from a single
 // uint64_t to uintptr_t*/is_set()/etc.
 
 // necessary to patch heterozygous haploids/female Y chromosome genotypes
@@ -370,6 +370,7 @@
 #define DISTANCE_TYPEMASK 0xe0
 #define DISTANCE_FLAT_MISSING 0x100
 #define DISTANCE_CLUSTER 0x200
+#define DISTANCE_WTS_NOHEADER 0x400
 
 #define RECODE_01 1
 #define RECODE_12 2
@@ -400,6 +401,7 @@
 #define RECODE_FID 0x2000000
 #define RECODE_IID 0x4000000
 #define RECODE_INCLUDE_ALT 0x8000000
+#define RECODE_BGZ 0x10000000
 
 #define GENOME_OUTPUT_GZ 1
 #define GENOME_REL_CHECK 2
@@ -433,6 +435,7 @@
 #define HWE_MIDP 1
 #define HWE_THRESH_MIDP 2
 #define HWE_THRESH_ALL 4
+#define HWE_GZ 8
 
 #define MENDEL_FILTER 1
 #define MENDEL_FILTER_VAR_FIRST 2
@@ -802,6 +805,12 @@ void wordwrap(char* ss, uint32_t suffix_len);
 // 5 = length of "done." suffix, which is commonly used
 #define LOGPRINTFWW5(...) sprintf(logbuf, __VA_ARGS__); wordwrap(logbuf, 5); logprintb();
 
+#ifdef STABLE_BUILD // UNSTABLE(val) copies val into logbuf at offset 9 and jumps to main_unstable_disabled
+  #define UNSTABLE(val) do { sptr = strcpya(&(logbuf[9]), val); goto main_unstable_disabled; } while (0) // one statement, so it stays conditional under an unbraced if
+#else
+  #define UNSTABLE(val)
+#endif
+
 int32_t fopen_checked(FILE** target_ptr, const char* fname, const char* mode);
 
 static inline int32_t putc_checked(int32_t ii, FILE* outfile) {
@@ -843,12 +852,51 @@ static inline int32_t fclose_null(FILE** fptr_ptr) {
 
 int32_t gzopen_checked(gzFile* target_ptr, const char* fname, const char* mode);
 
+static inline int32_t gzclose_null(gzFile* gzf_ptr) {
+  const int32_t close_failed = (gzclose(*gzf_ptr) != Z_OK); // 1 unless gzclose() reported Z_OK
+  *gzf_ptr = NULL; // handle is cleared whether or not the close succeeded
+  return close_failed;
+}
+
 static inline void gzclose_cond(gzFile gz_infile) {
   if (gz_infile) {
     gzclose(gz_infile);
   }
 }
 
+static inline int32_t flexwrite_checked(const void* buf, size_t len, uint32_t output_gz, FILE* outfile, gzFile gz_outfile) {
+  // writes len bytes of buf to outfile (output_gz == 0) or to gz_outfile (otherwise); nonzero return = failure
+  if (!output_gz) {
+    return fwrite_checked(buf, len, outfile);
+  }
+  return len && (!gzwrite(gz_outfile, buf, len)); // gzwrite() returns 0 for an empty request too; that is not an error
+}
+
+static inline int32_t flexputc_checked(int32_t ii, uint32_t output_gz, FILE* outfile, gzFile gz_outfile) {
+  // writes one character to outfile, or to gz_outfile when output_gz is set; nonzero return = failure
+  if (output_gz) {
+    return (gzputc(gz_outfile, ii) == -1);
+  }
+  putc(ii, outfile);
+  return ferror(outfile);
+}
+
+static inline int32_t flexputs_checked(const char* ss, uint32_t output_gz, FILE* outfile, gzFile gz_outfile) {
+  // writes the string ss to outfile, or to gz_outfile when output_gz is set; nonzero return = failure
+  if (output_gz) {
+    return (gzputs(gz_outfile, ss) == -1);
+  }
+  return fputs_checked(ss, outfile);
+}
+
+static inline int32_t flexclose_null(uint32_t output_gz, FILE** fptr_ptr, gzFile* gzf_ptr) {
+  // closes whichever handle output_gz selects; the other handle is not touched
+  if (output_gz) {
+    return gzclose_null(gzf_ptr);
+  }
+  return fclose_null(fptr_ptr);
+}
+
 static inline int32_t bed_suffix_conflict(uint64_t calculation_type, uint32_t recode_modifier) {
   return (calculation_type & CALC_MAKE_BED) || ((calculation_type & CALC_RECODE) && (recode_modifier & (RECODE_LGEN | RECODE_LGEN_REF | RECODE_RLIST)));
 }
@@ -1101,6 +1149,17 @@ static inline char* strcpyax(char* target, const void* source, char extra_char)
   return &(target[slen + 1]);
 }
 
+static inline void append_binary_eoln(char** target_ptr) {
+  // writes the platform line terminator ("\r\n" under _WIN32, "\n" otherwise)
+  // at *target_ptr and advances *target_ptr past it; no null byte is appended
+  char* write_pos = *target_ptr;
+#ifdef _WIN32
+  *write_pos++ = '\r';
+#endif
+  *write_pos++ = '\n';
+  *target_ptr = write_pos;
+}
+
 static inline void fputs_w4(char* ss, FILE* outfile) {
   // for efficient handling of width-4 allele columns; don't want to call
   // strlen() since that's redundant with fputs
@@ -1118,6 +1177,8 @@ static inline void fputs_w4(char* ss, FILE* outfile) {
   }
 }
 
+int32_t gzputs_w4(gzFile gz_outfile, const char* ss);
+
 int32_t get_next_noncomment(FILE* fptr, char** lptr_ptr, uintptr_t* line_idx_ptr);
 
 int32_t get_next_noncomment_excl(FILE* fptr, char** lptr_ptr, uintptr_t* line_idx_ptr, uintptr_t* marker_exclude, uintptr_t* marker_uidx_ptr);
@@ -1553,15 +1614,15 @@ static inline void prev_unset_unsafe_ck(uintptr_t* bit_arr, uint32_t* loc_ptr) {
 
 // These functions seem to optimize better than memset(arr, 0, x) under gcc.
 static inline void fill_long_zero(intptr_t* larr, size_t size) {
-  intptr_t* lptr = &(larr[size]);
-  while (larr < lptr) {
+  size_t ulii;
+  for (ulii = 0; ulii < size; ulii++) {
     *larr++ = 0;
   }
 }
 
 static inline void fill_ulong_zero(uintptr_t* ularr, size_t size) {
-  uintptr_t* ulptr = &(ularr[size]);
-  while (ularr < ulptr) {
+  size_t ulii;
+  for (ulii = 0; ulii < size; ulii++) {
     *ularr++ = 0;
   }
 }
@@ -1577,15 +1638,15 @@ static inline void fill_ull_zero(uint64_t* ullarr, size_t size) {
 #endif
 
 static inline void fill_long_one(intptr_t* larr, size_t size) {
-  intptr_t* lptr = &(larr[size]);
-  while (larr < lptr) {
+  size_t ulii;
+  for (ulii = 0; ulii < size; ulii++) {
     *larr++ = -1;
   }
 }
 
 static inline void fill_ulong_one(uintptr_t* ularr, size_t size) {
-  uintptr_t* ulptr = &(ularr[size]);
-  while (ularr < ulptr) {
+  size_t ulii;
+  for (ulii = 0; ulii < size; ulii++) {
     *ularr++ = ~ZEROLU;
   }
 }
@@ -1601,59 +1662,43 @@ static inline void fill_ull_one(uint64_t* ullarr, size_t size) {
 #endif
 
 static inline void fill_int_zero(int32_t* iarr, size_t size) {
-#ifdef __LP64__
-  fill_long_zero((intptr_t*)iarr, size >> 1);
-  if (size & 1) {
-    iarr[size - 1] = 0;
+  size_t ulii;
+  for (ulii = 0; ulii < size; ulii++) {
+    *iarr++ = 0;
   }
-#else
-  fill_long_zero((intptr_t*)iarr, size);
-#endif
 }
 
 static inline void fill_int_one(int32_t* iarr, size_t size) {
-#ifdef __LP64__
-  fill_long_one((intptr_t*)iarr, size >> 1);
-  if (size & 1) {
-    iarr[size - 1] = -1;
+  size_t ulii;
+  for (ulii = 0; ulii < size; ulii++) {
+    *iarr++ = -1;
   }
-#else
-  fill_long_one((intptr_t*)iarr, size);
-#endif
 }
 
 static inline void fill_uint_zero(uint32_t* uiarr, size_t size) {
-#ifdef __LP64__
-  fill_long_zero((intptr_t*)uiarr, size >> 1);
-  if (size & 1) {
-    uiarr[size - 1] = 0;
+  size_t ulii;
+  for (ulii = 0; ulii < size; ulii++) {
+    *uiarr++ = 0;
   }
-#else
-  fill_long_zero((intptr_t*)uiarr, size);
-#endif
 }
 
 static inline void fill_uint_one(uint32_t* uiarr, size_t size) {
-#ifdef __LP64__
-  fill_ulong_one((uintptr_t*)uiarr, size >> 1);
-  if (size & 1) {
-    uiarr[size - 1] = ~0U;
+  size_t ulii;
+  for (ulii = 0; ulii < size; ulii++) {
+    *uiarr++ = ~0U;
   }
-#else
-  fill_ulong_one((uintptr_t*)uiarr, size);
-#endif
 }
 
 static inline void fill_float_zero(float* farr, size_t size) {
-  float* fptr = &(farr[size]);
-  while (farr < fptr) {
+  size_t ulii;
+  for (ulii = 0; ulii < size; ulii++) {
     *farr++ = 0.0;
   }
 }
 
 static inline void fill_double_zero(double* darr, size_t size) {
-  double* dptr = &(darr[size]);
-  while (darr < dptr) {
+  size_t ulii;
+  for (ulii = 0; ulii < size; ulii++) {
     *darr++ = 0.0;
   }
 }
@@ -1861,6 +1906,9 @@ static inline int32_t chrom_exists(Chrom_info* chrom_info_ptr, uint32_t chrom_id
 
 int32_t resolve_or_add_chrom_name(Chrom_info* chrom_info_ptr, char* bufptr, int32_t* chrom_idx_ptr, uintptr_t line_idx, const char* file_descrip);
 
+// no need for this; code is simpler if we just create a copy of marker_exclude
+// with all non-autosomal loci removed
+/*
 static inline uintptr_t next_autosomal_unsafe(uintptr_t* marker_exclude, uintptr_t marker_uidx, Chrom_info* chrom_info_ptr, uint32_t* chrom_end_ptr, uint32_t* chrom_fo_idx_ptr) {
   // assumes we are at an autosomal marker if marker_uidx < *chrom_end_ptr
   next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
@@ -1881,6 +1929,7 @@ static inline uintptr_t next_autosomal_unsafe(uintptr_t* marker_exclude, uintptr
     marker_uidx = next_unset_ul_unsafe(marker_exclude, *chrom_end_ptr);
   }
 }
+*/
 
 void refresh_chrom_info(Chrom_info* chrom_info_ptr, uintptr_t marker_uidx, uint32_t* chrom_end_ptr, uint32_t* chrom_fo_idx_ptr, uint32_t* is_x_ptr, uint32_t* is_y_ptr, uint32_t* is_mt_ptr, uint32_t* is_haploid_ptr);
 
@@ -2097,12 +2146,12 @@ static inline uint32_t count_chrom_markers(Chrom_info* chrom_info_ptr, uint32_t
 
 uint32_t count_non_autosomal_markers(Chrom_info* chrom_info_ptr, uintptr_t* marker_exclude, uint32_t count_x, uint32_t count_mt);
 
+int32_t conditional_allocate_non_autosomal_markers(Chrom_info* chrom_info_ptr, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uint32_t marker_ct, uint32_t count_x, uint32_t count_mt, const char* calc_descrip, uintptr_t** marker_exclude_ptr, uint32_t* newly_excluded_ct_ptr);
+
 uint32_t get_max_chrom_size(Chrom_info* chrom_info_ptr, uintptr_t* marker_exclude, uint32_t* last_chrom_fo_idx_ptr);
 
 void count_genders(uintptr_t* sex_nm, uintptr_t* sex_male, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uint32_t* male_ct_ptr, uint32_t* female_ct_ptr, uint32_t* unk_ct_ptr);
 
-double calc_wt_mean_maf(double exponent, double maf);
-
 void reverse_loadbuf(unsigned char* loadbuf, uintptr_t unfiltered_sample_ct);
 
 void collapse_copy_2bitarr(uintptr_t* rawbuf, uintptr_t* mainbuf, uint32_t unfiltered_sample_ct, uint32_t sample_ct, uintptr_t* sample_exclude);
@@ -2141,8 +2190,6 @@ uint32_t load_and_collapse_incl(FILE* bedfile, uintptr_t* rawbuf, uint32_t unfil
 
 uint32_t load_and_split(FILE* bedfile, uintptr_t* rawbuf, uint32_t unfiltered_sample_ct, uintptr_t* casebuf, uintptr_t* ctrlbuf, uintptr_t* pheno_nm, uintptr_t* pheno_c);
 
-uint32_t block_load_autosomal(FILE* bedfile, int32_t bed_offset, uintptr_t* marker_exclude, uint32_t marker_ct_autosomal, uint32_t block_max_size, uintptr_t unfiltered_sample_ct4, Chrom_info* chrom_info_ptr, double* set_allele_freqs, uint32_t* marker_weights, unsigned char* readbuf, uint32_t* chrom_fo_idx_ptr, uintptr_t* marker_uidx_ptr, uintptr_t* marker_idx_ptr, uint32_t* block_size_ptr, uintptr_t* marker_reverse, double* set_allele_freq_buf, float* set_allele_freq_buf_fl, uint32_t* wtbuf);
-
 void vec_include_init(uintptr_t unfiltered_sample_ct, uintptr_t* new_include2, uintptr_t* old_include);
 
 void exclude_to_vec_include(uintptr_t unfiltered_sample_ct, uintptr_t* include_vec, uintptr_t* exclude_arr);
diff --git a/plink_data.c b/plink_data.c
index 228a7d7..13d1f44 100644
--- a/plink_data.c
+++ b/plink_data.c
@@ -14,6 +14,7 @@
 #include <sys/types.h>
 #include "plink_family.h"
 #include "plink_set.h"
+#include "bgzf.h"
 
 #define PHENO_EPSILON 0.000030517578125
 
@@ -566,7 +567,10 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
   uint32_t cur_pos;
   char cc;
   fill_ulong_zero(loaded_chrom_mask, CHROM_MASK_WORDS);
-  fill_ulong_zero((uintptr_t*)(&insert_buf), 4);
+  insert_buf[0] = NULL;
+  insert_buf[1] = NULL;
+  insert_buf[2] = NULL;
+  insert_buf[3] = NULL;
   if (sf_ct) {
     sf_start_idxs = (uint32_t*)malloc((MAX_POSSIBLE_CHROM + 1) * sizeof(int32_t));
     if (!sf_start_idxs) {
@@ -584,7 +588,11 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
     }
   }
   fill_uint_zero(missing_template_seg_len, 5);
-  fill_ulong_zero((uintptr_t*)(&missing_template_seg), 5);
+  missing_template_seg[0] = NULL;
+  missing_template_seg[1] = NULL;
+  missing_template_seg[2] = NULL;
+  missing_template_seg[3] = NULL;
+  missing_template_seg[4] = NULL;
   if (missing_mid_template) {
     if (!missing_marker_id_match) {
       missing_marker_id_match = &(g_one_char_strs[92]); // '.'
@@ -1045,6 +1053,11 @@ int32_t load_bim(char* bimname, uint32_t* map_cols_ptr, uintptr_t* unfiltered_ma
     if (snps_only) {
       max_marker_allele_len = 2;
     }
+    if (max_marker_allele_len > 500000000) {
+      // guard against overflows
+      logprint("Error: Alleles are limited to 500 million characters.\n");
+      goto load_bim_ret_INVALID_FORMAT;
+    }
     *max_marker_allele_len_ptr = max_marker_allele_len;
     marker_allele_ptrs = (char**)wkspace_alloc(unfiltered_marker_ct * 2 * sizeof(intptr_t));
     if (!marker_allele_ptrs) {
@@ -4026,6 +4039,7 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
   uint32_t shiftval;
   uint32_t bgen_compressed;
   uint32_t bgen_multichar_alleles;
+  uint32_t identical_alleles;
   uint32_t uii;
   uint32_t ujj;
   uint32_t ukk;
@@ -4415,20 +4429,34 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
       if (putc_checked(' ', outfile_bim)) {
 	goto oxford_to_bed_ret_WRITE_FAIL;
       }
-      bufptr = bufptr2;
-      bufptr2 = next_token_mult(bufptr, 2);
-      if (no_more_tokens_kns(bufptr2)) {
+      bufptr = next_token(bufptr2);
+      bufptr3 = next_token(bufptr);
+      if (no_more_tokens_kns(bufptr3)) {
 	goto oxford_to_bed_ret_MISSING_TOKENS_GEN;
       }
-      bufptr2 = token_endnn(bufptr2);
-      fwrite(bufptr, 1, bufptr2 - bufptr, outfile_bim);
+      // bufptr2 = pos
+      // bufptr  = allele 1
+      // bufptr3 = allele 2
+      bufptr4 = token_endnn(bufptr3);
+      uii = (uintptr_t)(bufptr4 - bufptr3);
+      identical_alleles = (strlen_se(bufptr) == uii) && (!memcmp(bufptr, bufptr3, uii));
+      if (identical_alleles) {
+	// we treat identical A1 and A2 as a special case, since naive handling
+	// prevents e.g. later data merge.
+	// maybe add a warning?
+	fwrite(bufptr2, 1, strlen_se(bufptr2), outfile_bim);
+        fputs(" 0 ", outfile_bim);
+	fwrite(bufptr3, 1, bufptr4 - bufptr3, outfile_bim);        
+      } else {
+	fwrite(bufptr2, 1, bufptr4 - bufptr2, outfile_bim);
+      }
       if (putc_checked('\n', outfile_bim)) {
 	goto oxford_to_bed_ret_WRITE_FAIL;
       }
       cur_word = 0;
       shiftval = 0;
       ulptr = writebuf;
-      bufptr = skip_initial_spaces(&(bufptr2[1]));
+      bufptr = skip_initial_spaces(&(bufptr4[1]));
       for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
 	if (is_eoln_kns(*bufptr)) {
 	  goto oxford_to_bed_ret_MISSING_TOKENS_GEN;
@@ -4555,6 +4583,16 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
       if (shiftval) {
 	*ulptr++ = cur_word;
       }
+      if (identical_alleles) {
+	// keep missing calls, but convert hom/het A1 to hom A2.
+	for (ulptr = writebuf; ulptr < (&(writebuf[sample_ctl2])); ulptr++) {
+	  ulii = *ulptr;
+	  *ulptr = ((~ulii) << 1) | ulii | FIVEMASK;
+	}
+	if (sample_ct % 4) {
+	  writebuf[sample_ctl2 - 1] &= (ONELU << (2 * (sample_ct % BITCT2))) - ONELU;
+	}
+      }
       if (fwrite_checked(writebuf, sample_ct4, outfile)) {
 	goto oxford_to_bed_ret_WRITE_FAIL;
       }
@@ -4580,9 +4618,9 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
     }
     loadbuf = (char*)wkspace_base;
     loadbuf_size = wkspace_left;
-    if (loadbuf_size > MAXLINEBUFLEN / 2) {
+    if (loadbuf_size > MAXLINEBUFLEN) {
       // halve the limit since there are two alleles
-      loadbuf_size = MAXLINEBUFLEN / 2;
+      loadbuf_size = MAXLINEBUFLEN;
     } else if (loadbuf_size < 3 * 65536) {
       goto oxford_to_bed_ret_NOMEM;
     }
@@ -4738,8 +4776,8 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
         fwrite(bufptr, 1, usjj, outfile_bim);
 	bufptr = uint32_writex(&(tbuf[3]), uint_arr[0], ' ');
 	fwrite(tbuf, 1, bufptr - tbuf, outfile_bim);
-        if (uint_arr[1] >= loadbuf_size) {
-	  if (loadbuf_size < MAXLINEBUFLEN / 2) {
+        if (uint_arr[1] >= loadbuf_size / 2) {
+	  if (loadbuf_size < MAXLINEBUFLEN) {
 	    goto oxford_to_bed_ret_NOMEM;
 	  }
 	  logprint("Error: Excessively long allele in .bgen file.\n");
@@ -4749,25 +4787,31 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
 	  goto oxford_to_bed_ret_READ_FAIL;
 	}
 	loadbuf[uint_arr[1]] = ' ';
-        if (fwrite_checked(loadbuf, uint_arr[1] + 1, outfile_bim)) {
-	  goto oxford_to_bed_ret_WRITE_FAIL;
-	}
 	if (fread(&uii, 1, 4, infile) < 4) {
 	  goto oxford_to_bed_ret_READ_FAIL;
 	}
-        if (uii >= loadbuf_size) {
-	  if (loadbuf_size < MAXLINEBUFLEN / 2) {
+        if (uii >= loadbuf_size / 2) {
+	  if (loadbuf_size < MAXLINEBUFLEN) {
 	    goto oxford_to_bed_ret_NOMEM;
 	  }
 	  logprint("Error: Excessively long allele in .bgen file.\n");
 	  goto oxford_to_bed_ret_INVALID_FORMAT;
 	}
-        if (fread(loadbuf, 1, uii, infile) < uii) {
+	bufptr = &(loadbuf[uint_arr[1] + 1]);
+        if (fread(bufptr, 1, uii, infile) < uii) {
 	  goto oxford_to_bed_ret_READ_FAIL;
 	}
-	loadbuf[uii] = '\n';
-        if (fwrite_checked(loadbuf, uii + 1, outfile_bim)) {
-	  goto oxford_to_bed_ret_WRITE_FAIL;
+	bufptr[uii] = '\n';
+	identical_alleles = (uii == uint_arr[1]) && (!memcmp(loadbuf, bufptr, uii));
+	if (!identical_alleles) {
+	  if (fwrite_checked(loadbuf, uint_arr[1] + uii + 2, outfile_bim)) {
+	    goto oxford_to_bed_ret_WRITE_FAIL;
+	  }
+	} else {
+	  fputs("0 ", outfile_bim);
+	  if (fwrite_checked(bufptr, uii + 1, outfile_bim)) {
+	    goto oxford_to_bed_ret_WRITE_FAIL;
+	  }
 	}
       } else {
 	uii = 0;
@@ -4834,7 +4878,12 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
 	fwrite(&(loadbuf[uii + 2]), 1, ukk, outfile_bim);
 	memcpy(&ujj, &(loadbuf[2 * uii + 3]), 4);
 	bufptr = uint32_writex(&(tbuf[3]), ujj, ' ');
-	*bufptr++ = loadbuf[2 * uii + 7];
+	identical_alleles = (loadbuf[2 * uii + 7] == loadbuf[2 * uii + 8]);
+	if (!identical_alleles) {
+	  *bufptr++ = loadbuf[2 * uii + 7];
+	} else {
+	  *bufptr++ = '0';
+	}
 	*bufptr++ = ' ';
 	*bufptr++ = loadbuf[2 * uii + 8];
 	*bufptr++ = '\n';
@@ -4942,6 +4991,15 @@ int32_t oxford_to_bed(char* genname, char* samplename, char* outname, char* outn
       if (shiftval) {
 	*ulptr++ = cur_word;
       }
+      if (identical_alleles) {
+	for (ulptr = writebuf; ulptr < (&(writebuf[sample_ctl2])); ulptr++) {
+	  ulii = *ulptr;
+	  *ulptr = ((~ulii) << 1) | ulii | FIVEMASK;
+	}
+	if (sample_ct % 4) {
+	  writebuf[sample_ctl2 - 1] &= (ONELU << (2 * (sample_ct % BITCT2))) - ONELU;
+	}
+      }
       if (fwrite_checked(writebuf, sample_ct4, outfile)) {
 	goto oxford_to_bed_ret_WRITE_FAIL;
       }
@@ -5583,17 +5641,19 @@ int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char
 	    ukk = map_reverse[umm++];
 	    if ((ukk >= marker_start) && (ukk < marker_end)) {
 	      ucc = 1;
-	      if (!strcmp(aptr1, marker_allele_ptrs[2 * ukk + 1])) {
-		if ((alen1 == alen2) && (!memcmp(aptr1, aptr2, alen1))) {
-		  ucc = 3;
-		} else if (!strcmp(aptr2, marker_allele_ptrs[2 * ukk])) {
-		  ucc = 2;
-		}
-	      } else if (!strcmp(aptr1, marker_allele_ptrs[2 * ukk])) {
-		if ((alen1 == alen2) && (!memcmp(aptr1, aptr2, alen1))) {
-		  ucc = 0;
-		} else if (!strcmp(aptr2, marker_allele_ptrs[2 * ukk + 1])) {
-		  ucc = 2;
+	      if ((*aptr1 != missing_geno) || (alen1 != 1)) {
+		if (!strcmp(aptr1, marker_allele_ptrs[2 * ukk + 1])) {
+		  if ((alen1 == alen2) && (!memcmp(aptr1, aptr2, alen1))) {
+		    ucc = 3;
+		  } else if (!strcmp(aptr2, marker_allele_ptrs[2 * ukk])) {
+		    ucc = 2;
+		  }
+		} else if (!strcmp(aptr1, marker_allele_ptrs[2 * ukk])) {
+		  if ((alen1 == alen2) && (!memcmp(aptr1, aptr2, alen1))) {
+		    ucc = 0;
+		  } else if (!strcmp(aptr2, marker_allele_ptrs[2 * ukk + 1])) {
+		    ucc = 2;
+		  }
 		}
 	      }
 	      wbufptr[(ukk - marker_start) * sample_ct4] |= ucc << ii_shift;
@@ -5616,17 +5676,19 @@ int32_t ped_to_bed_multichar_allele(FILE** pedfile_ptr, FILE** outfile_ptr, char
 	      continue;
 	    }
 	    ucc = 1;
-	    if (!strcmp(aptr1, marker_allele_ptrs[2 * marker_idx + 1])) {
-	      if ((alen1 == alen2) && (!memcmp(aptr1, aptr2, alen1))) {
-		ucc = 3;
-	      } else if (!strcmp(aptr2, marker_allele_ptrs[2 * marker_idx])) {
-		ucc = 2;
-	      }
-	    } else if (!strcmp(aptr1, marker_allele_ptrs[2 * marker_idx])) {
-	      if ((alen1 == alen2) && (!memcmp(aptr1, aptr2, alen1))) {
-		ucc = 0;
-	      } else if (!strcmp(aptr2, marker_allele_ptrs[2 * marker_idx + 1])) {
-		ucc = 2;
+	    if ((*aptr1 != missing_geno) || (alen1 != 1)) {
+	      if (!strcmp(aptr1, marker_allele_ptrs[2 * marker_idx + 1])) {
+		if ((alen1 == alen2) && (!memcmp(aptr1, aptr2, alen1))) {
+		  ucc = 3;
+		} else if (!strcmp(aptr2, marker_allele_ptrs[2 * marker_idx])) {
+		  ucc = 2;
+		}
+	      } else if (!strcmp(aptr1, marker_allele_ptrs[2 * marker_idx])) {
+		if ((alen1 == alen2) && (!memcmp(aptr1, aptr2, alen1))) {
+		  ucc = 0;
+		} else if (!strcmp(aptr2, marker_allele_ptrs[2 * marker_idx + 1])) {
+		  ucc = 2;
+		}
 	      }
 	    }
 	    *wbufptr |= ucc << ii_shift;
@@ -7260,7 +7322,10 @@ int32_t transposed_to_bed(char* tpedname, char* tfamname, char* outname, char* o
       goto transposed_to_bed_ret_WRITE_FAIL;
     }
     cptr2 = cptr4;
-    fill_ulong_zero((uintptr_t*)alleles, 4);
+    alleles[0] = NULL;
+    alleles[1] = NULL;
+    alleles[2] = NULL;
+    alleles[3] = NULL;
     fill_uint_zero(allele_cts, 4);
     for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
       cptr2 = skip_initial_spaces(cptr2);
@@ -7806,8 +7871,11 @@ int32_t vcf_sample_line(char* outname, char* outname_end, int32_t missing_pheno,
 	}
       }
       if (memchr(&(bufptr3[1]), (unsigned char)id_delim, (uintptr_t)(bufptr2 - &(bufptr3[1])))) {
-        sprintf(logbuf, "Error: Multiple instances of '%c' in sample ID.\n", id_delim);
-        goto vcf_sample_line_ret_INVALID_FORMAT_2;
+        LOGPRINTF("Error: Multiple instances of '%c' in sample ID.\n", id_delim);
+	if (id_delim == '_') {
+	  logprint("If you do not want '_' to be treated as a FID/IID delimiter, use --double-id or\n--const-fid to choose a different method of converting VCF sample IDs to PLINK\nIDs, or --id-delim to change the FID/IID delimiter.\n");
+	}
+        goto vcf_sample_line_ret_INVALID_FORMAT;
       }
       wptr = memcpyax(tbuf, bufptr, (uintptr_t)(bufptr3 - bufptr), '\t');
       bufptr3++;
@@ -8928,6 +8996,12 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
     }
     goto bcf_to_bed_ret_INVALID_FORMAT_2;
   }
+  if (((unsigned char)(tbuf[4])) > 2) {
+    // defend against 0x82-0x87 being given a meaning in 8-bit int vectors,
+    // etc.
+    LOGPREPRINTFWW("Error: %s appears to be formatted as BCFv2.%u; this PLINK build only supports v2.0-2.2. You may need to obtain an updated version of PLINK.\n", bcfname, ((unsigned char)(tbuf[4])));
+    goto bcf_to_bed_ret_INVALID_FORMAT_2;
+  }
   if (gzread(gz_infile, &header_size, 4) < 4) {
     goto bcf_to_bed_ret_READ_OR_FORMAT_FAIL;
   }
@@ -9341,11 +9415,12 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
       ucptr = (unsigned char*)loadbuf;
       if (ujj == 2) {
 	for (sample_idx = 0; sample_idx < sample_ct; sample_idx++, ucptr++) {
-	  ulii = *ucptr++;
+	  // discard all phase bits for now
+	  // missing = 0x80 or 0x81
+	  ulii = (*ucptr++) & 0x7e;
 	  if (ulii) {
-	    // discard all phase bits for now
 	    ulii = ((ulii / 2) - 1) * sample_ctv2;
-	    uljj = *ucptr;
+	    uljj = (*ucptr) & 0x7e;
 	    if (uljj) {
 	      set_bit(&(base_bitfields[ulii]), sample_idx * 2);
 	      base_bitfields[((uljj / 2) - 1) * sample_ctv2 + sample_idx / BITCT2] += ONELU << (2 * (sample_idx % BITCT2));
@@ -9357,7 +9432,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
 	}
       } else if (ujj == 1) {
 	for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
-	  ulii = *ucptr++;
+	  ulii = (*ucptr++) & 0x7e;
 	  if (ulii) {
 	    set_bit(&(base_bitfields[((ulii / 2) - 1) * sample_ctv2]), sample_idx * 2 + 1);
 	  }
@@ -9367,10 +9442,10 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
 	  if (ucptr[2]) {
 	    ucptr = &(ucptr[ujj]);
 	  } else {
-	    ulii = *ucptr++;
+	    ulii = (*ucptr++) & 0x7e;
 	    if (ulii) {
 	      ulii = ((ulii / 2) - 1) * sample_ctv2;
-	      uljj = *ucptr;
+	      uljj = (*ucptr) & 0x7e;
 	      if (uljj) {
 		set_bit(&(base_bitfields[ulii]), sample_idx * 2);
 		base_bitfields[((uljj / 2) - 1) * sample_ctv2 + sample_idx / BITCT2] += ONELU << (2 * (sample_idx % BITCT2));
@@ -9387,10 +9462,10 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
       // bleah, this should totally use templates instead of cut-and-paste
       if (ujj == 2) {
 	for (sample_idx = 0; sample_idx < sample_ct; sample_idx++, ui16ptr++) {
-	  ulii = *ui16ptr++;
+	  ulii = (*ui16ptr++) & 0x7ffe;
 	  if (ulii) {
 	    ulii = ((ulii / 2) - 1) * sample_ctv2;
-            uljj = *ui16ptr;
+            uljj = (*ui16ptr) & 0x7ffe;
 	    if (uljj) {
 	      set_bit(&(base_bitfields[ulii]), sample_idx * 2);
 	      base_bitfields[((uljj / 2) - 1) * sample_ctv2 + sample_idx / BITCT2] += ONELU << (2 * (sample_idx % BITCT2));
@@ -9401,7 +9476,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
 	}
       } else if (ujj == 1) {
 	for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
-	  ulii = *ui16ptr++;
+	  ulii = (*ui16ptr++) & 0x7ffe;
 	  if (ulii) {
 	    set_bit(&(base_bitfields[((ulii / 2) - 1) * sample_ctv2]), sample_idx * 2 + 1);
 	  }
@@ -9411,10 +9486,10 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
 	  if (ui16ptr[2]) {
 	    ui16ptr = &(ui16ptr[ujj]);
 	  } else {
-	    ulii = *ui16ptr++;
+	    ulii = (*ui16ptr++) & 0x7ffe;
 	    if (ulii) {
 	      ulii = ((ulii / 2) - 1) * sample_ctv2;
-              uljj = *ui16ptr;
+              uljj = (*ui16ptr) & 0x7ffe;
 	      if (uljj) {
 		set_bit(&(base_bitfields[ulii]), sample_idx * 2);
 		base_bitfields[((uljj / 2) - 1) * sample_ctv2 + sample_idx / BITCT2] += ONELU << (2 * (sample_idx % BITCT2));
@@ -9430,10 +9505,10 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
       uiptr = (uint32_t*)loadbuf;
       if (ujj == 2) {
 	for (sample_idx = 0; sample_idx < sample_ct; sample_idx++, uiptr++) {
-	  ulii = *uiptr++;
+	  ulii = (*uiptr++) & 0x7ffffffe;
 	  if (ulii) {
 	    ulii = ((ulii / 2) - 1) * sample_ctv2;
-            uljj = *uiptr;
+            uljj = (*uiptr) & 0x7ffffffe;
 	    if (uljj) {
 	      set_bit(&(base_bitfields[ulii]), sample_idx * 2);
 	      base_bitfields[((uljj / 2) - 1) * sample_ctv2 + sample_idx / BITCT2] += ONELU << (2 * (sample_idx % BITCT2));
@@ -9444,7 +9519,7 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
 	}
       } else if (ujj == 1) {
 	for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
-	  ulii = *uiptr++;
+	  ulii = (*uiptr++) & 0x7ffffffe;
 	  if (ulii) {
 	    set_bit(&(base_bitfields[((ulii / 2) - 1) * sample_ctv2]), sample_idx * 2 + 1);
 	  }
@@ -9454,10 +9529,10 @@ int32_t bcf_to_bed(char* bcfname, char* outname, char* outname_end, int32_t miss
 	  if (uiptr[2]) {
 	    uiptr = &(uiptr[ujj]);
 	  } else {
-	    ulii = *uiptr++;
+	    ulii = (*uiptr++) & 0x7ffffffe;
 	    if (ulii) {
 	      ulii = ((ulii / 2) - 1) * sample_ctv2;
-              uljj = *uiptr;
+              uljj = (*uiptr) & 0x7ffffffe;
 	      if (uljj) {
 		set_bit(&(base_bitfields[ulii]), sample_idx * 2);
 		base_bitfields[((uljj / 2) - 1) * sample_ctv2 + sample_idx / BITCT2] += ONELU << (2 * (sample_idx % BITCT2));
@@ -11545,9 +11620,34 @@ uint32_t valid_vcf_allele_code(const char* allele_code) {
   return 1;
 }
 
+// Sends len bytes of buf to bgz_outfile via bgzf_write() when output_bgz is nonzero (result: 1 iff bgzf_write() returned < 0); otherwise forwards to fwrite_checked() on outfile.
+int32_t flexbwrite_checked(const void* buf, size_t len, uint32_t output_bgz, FILE* outfile, BGZF* bgz_outfile) {
+  if (output_bgz) {
+    return (bgzf_write(bgz_outfile, buf, len) < 0);
+  }
+  return fwrite_checked(buf, len, outfile);
+}
+
+// String counterpart: with output_bgz nonzero, writes strlen(buf) bytes to bgz_outfile (result: 1 iff bgzf_write() returned < 0); otherwise forwards to fputs_checked() on outfile.
+int32_t flexbputs_checked(const char* buf, uint32_t output_bgz, FILE* outfile, BGZF* bgz_outfile) {
+  if (output_bgz) {
+    return (bgzf_write(bgz_outfile, buf, strlen(buf)) < 0);
+  }
+  return fputs_checked(buf, outfile);
+}
+
+// Single-byte counterpart: with output_bgz nonzero, writes ucc to bgz_outfile (result: 1 iff bgzf_write() returned < 0); otherwise forwards to putc_checked() on outfile.
+int32_t flexbputc_checked(unsigned char ucc, uint32_t output_bgz, FILE* outfile, BGZF* bgz_outfile) {
+  if (output_bgz) {
+    return (bgzf_write(bgz_outfile, &ucc, 1) < 0);
+  }
+  return putc_checked(ucc, outfile);
+}
+
 int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, char* recode_allele_name, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* marker_ids, uintptr_t max_marker_id_len, double* marker_cms, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uint32_t* marker_pos, uintptr_t* marker_reverse, char* sample_ids,  [...]
   FILE* outfile = NULL;
   FILE* outfile2 = NULL;
+  BGZF* bgz_outfile = NULL;
   uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
   uintptr_t sample_ctv2 = 2 * ((sample_ct + (BITCT - 1)) / BITCT);
@@ -11569,6 +11669,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
   uint32_t vcf_not_fid = (recode_modifier & RECODE_VCF) && (!(recode_modifier & RECODE_FID));
   uint32_t vcf_not_iid = (recode_modifier & RECODE_VCF) && (!(recode_modifier & RECODE_IID));
   uint32_t vcf_two_ids = vcf_not_fid && vcf_not_iid;
+  uint32_t output_bgz = (recode_modifier / RECODE_BGZ) & 1;
   uint32_t recode_012 = recode_modifier & (RECODE_01 | RECODE_12);
   uint32_t set_hh_missing = (misc_flags / MISC_SET_HH_MISSING) & 1;
   uint32_t real_ref_alleles = (misc_flags / MISC_REAL_REF_ALLELES) & 1;
@@ -12290,21 +12391,32 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       }
     }
   } else if (recode_modifier & RECODE_VCF) {
-    strcpy(outname_end, ".vcf");
-    if (fopen_checked(&outfile, outname, "w")) {
-      goto recode_ret_OPEN_FAIL;
-    }
-    if (fputs_checked(
-"##fileformat=VCFv4.2\n"
-"##fileDate=", outfile)) {
-      goto recode_ret_WRITE_FAIL;
+    if (!output_bgz) {
+      memcpy(outname_end, ".vcf", 5);
+      if (fopen_checked(&outfile, outname, "w")) {
+	goto recode_ret_OPEN_FAIL;
+      }
+    } else {
+      memcpy(outname_end, ".vcf.gz", 7);
+      bgz_outfile = bgzf_open(outname, "w");
+      if (!bgz_outfile) {
+	goto recode_ret_OPEN_FAIL;
+      }
+#ifndef _WIN32
+      if (g_thread_ct > 1) {
+	bgzf_mt(bgz_outfile, g_thread_ct, 128);
+      }
+#endif
     }
+    wbufptr = memcpya(tbuf, "##fileformat=VCFv4.2\n##fileDate=", 32);
     time(&rawtime);
     loctime = localtime(&rawtime);
-    strftime(tbuf, MAXLINELEN, "%Y%m%d", loctime);
-    fputs(tbuf, outfile);
-    fputs("\n##source=PLINKv1.90\n", outfile);
+    wbufptr += strftime(wbufptr, MAXLINELEN, "%Y%m%d", loctime);
+    wbufptr = memcpya(wbufptr, "\n##source=PLINKv1.90\n", 21);
     uii = 0; // '0' written already?
+    if (flexbwrite_checked(tbuf, wbufptr - tbuf, output_bgz, outfile, bgz_outfile)) {
+      goto recode_ret_WRITE_FAIL;
+    }
     memcpy(tbuf, "##contig=<ID=", 13);
     for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++) {
       chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
@@ -12332,15 +12444,21 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	}
       }
       cptr = memcpya(cptr, ">\n", 2);
-      fwrite(tbuf, 1, cptr - tbuf, outfile);
+      if (flexbwrite_checked(tbuf, cptr - tbuf, output_bgz, outfile, bgz_outfile)) {
+	goto recode_ret_WRITE_FAIL;
+      }
     }
     if (!real_ref_alleles) {
-      fputs("##INFO=<ID=PR,Number=0,Type=Flag,Description=\"Provisional reference allele, may not be based on real reference genome\"\n", outfile);
+      if (flexbputs_checked("##INFO=<ID=PR,Number=0,Type=Flag,Description=\"Provisional reference allele, may not be based on real reference genome\">\n", output_bgz, outfile, bgz_outfile)) {
+	goto recode_ret_WRITE_FAIL;
+      }
     }
-    fputs("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n", outfile);
     // todo: include PEDIGREE in header, and make --vcf be able to read it?
-    // Can't find a specification for how this should be done...
-    fputs("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", outfile);
+    if (flexbputs_checked(
+"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
+"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", output_bgz, outfile, bgz_outfile)) {
+      goto recode_ret_WRITE_FAIL;
+    }
     chrom_fo_idx = 0;
     refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
     chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
@@ -12350,9 +12468,13 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
       next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx);
       cptr = &(sample_ids[sample_uidx * max_sample_id_len]);
       ulii = strlen_se(cptr);
-      putc('\t', outfile);
+      if (flexbputc_checked('\t', output_bgz, outfile, bgz_outfile)) {
+	goto recode_ret_WRITE_FAIL;
+      }
       if (vcf_not_iid) {
-	fwrite(cptr, 1, ulii, outfile);
+	if (flexbwrite_checked(cptr, ulii, output_bgz, outfile, bgz_outfile)) {
+	  goto recode_ret_WRITE_FAIL;
+	}
 	if (vcf_two_ids) {
 	  if (!shiftval) {
 	    if (strchr(cptr, '_')) {
@@ -12360,14 +12482,18 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	      logprint("Warning: Underscore(s) present in sample IDs.\n");
 	    }
 	  }
-	  putc('_', outfile);
+	  if (flexbputc_checked('_', output_bgz, outfile, bgz_outfile)) {
+	    goto recode_ret_WRITE_FAIL;
+	  }
 	}
       }
       if (vcf_not_fid) {
-	fputs(&(cptr[ulii + 1]), outfile);
+	if (flexbputs_checked(&(cptr[ulii + 1]), output_bgz, outfile, bgz_outfile)) {
+	  goto recode_ret_WRITE_FAIL;
+	}
       }
     }
-    LOGPRINTFWW5("--recode vcf%s to %s ... ", vcf_not_iid? (vcf_not_fid? "" : "-fid") : "-iid", outname);
+    LOGPRINTFWW5("--recode vcf%s%s to %s ... ", vcf_not_iid? (vcf_not_fid? "" : "-fid") : "-iid", output_bgz? " bgz" : "", outname);
     fputs("0%", stdout);
     fflush(stdout);
     tbuf[0] = '\n';
@@ -12399,19 +12525,25 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	*wbufptr++ = '\t';
 	wbufptr = uint32_writex(wbufptr, marker_pos[marker_uidx], '\t');
 	wbufptr = strcpyax(wbufptr, &(marker_ids[marker_uidx * max_marker_id_len]), '\t');
-	if (fwrite_checked(tbuf, wbufptr - tbuf, outfile)) {
+	if (flexbwrite_checked(tbuf, wbufptr - tbuf, output_bgz, outfile, bgz_outfile)) {
 	  goto recode_ret_WRITE_FAIL;
 	}
 	cptr = mk_allele_ptrs[2 * marker_uidx + 1];
 	if (cptr == missing_geno_ptr) {
-	  putc('N', outfile);
+	  if (flexbputc_checked('N', output_bgz, outfile, bgz_outfile)) {
+	    goto recode_ret_WRITE_FAIL;
+	  }
 	} else {
           if ((!invalid_allele_code_seen) && (!valid_vcf_allele_code(cptr))) {
             invalid_allele_code_seen = 1;
 	  }
-	  fputs(cptr, outfile);
+	  if (flexbputs_checked(cptr, output_bgz, outfile, bgz_outfile)) {
+	    goto recode_ret_WRITE_FAIL;
+	  }
+	}
+	if (flexbputc_checked('\t', output_bgz, outfile, bgz_outfile)) {
+	  goto recode_ret_WRITE_FAIL;
 	}
-	putc('\t', outfile);
 
 	if (load_and_collapse(bedfile, (uintptr_t*)loadbuf, unfiltered_sample_ct, loadbuf_collapsed, sample_ct, sample_exclude, final_mask, IS_SET(marker_reverse, marker_uidx))) {
 	  goto recode_ret_READ_FAIL;
@@ -12428,19 +12560,22 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	  // if ALT allele is not actually present in immediate dataset, VCF
 	  // spec actually requires '.'
 	  if (!is_monomorphic_a2(loadbuf_collapsed, sample_ct)) {
-	    fputs(cptr, outfile);
+	    if (flexbputs_checked(cptr, output_bgz, outfile, bgz_outfile)) {
+	      goto recode_ret_WRITE_FAIL;
+	    }
 	  } else {
-	    putc('.', outfile);
+	    if (flexbputc_checked('.', output_bgz, outfile, bgz_outfile)) {
+	      goto recode_ret_WRITE_FAIL;
+	    }
 	  }
 	} else {
-	  putc('.', outfile);
+	  if (flexbputc_checked('.', output_bgz, outfile, bgz_outfile)) {
+	    goto recode_ret_WRITE_FAIL;
+	  }
 	}
-	if (!real_ref_alleles) {
-	  fputs("\t.\t.\tPR\tGT", outfile);
-	} else {
-	  fputs("\t.\t.\t.\tGT", outfile);
+	if (flexbputs_checked(real_ref_alleles? "\t.\t.\t.\tGT" : "\t.\t.\tPR\tGT", output_bgz, outfile, bgz_outfile)) {
+	  goto recode_ret_WRITE_FAIL;
 	}
-
 	wbufptr = writebuf;
 	ulptr = loadbuf_collapsed;
 	ulptr_end = &(loadbuf_collapsed[sample_ct / BITCT2]);
@@ -12482,7 +12617,7 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	    shiftmax = sample_ct % BITCT2;
 	  }
 	}
-	if (fwrite_checked(writebuf, wbufptr - writebuf, outfile)) {
+	if (flexbwrite_checked(writebuf, wbufptr - writebuf, output_bgz, outfile, bgz_outfile)) {
 	  goto recode_ret_WRITE_FAIL;
 	}
       }
@@ -12494,9 +12629,16 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
 	fflush(stdout);
       }
     }
-    if (putc_checked('\n', outfile)) {
+    if (flexbputc_checked('\n', output_bgz, outfile, bgz_outfile)) {
       goto recode_ret_WRITE_FAIL;
     }
+    if (output_bgz) {
+      if (bgzf_close(bgz_outfile)) {
+	bgz_outfile = NULL;
+	goto recode_ret_WRITE_FAIL;
+      }
+      bgz_outfile = NULL;
+    }
   } else if (recode_modifier & RECODE_OXFORD) {
     memcpy(outname_end, ".gen", 5);
     if (fopen_checked(&outfile, outname, "w")) {
@@ -13795,6 +13937,9 @@ int32_t recode(uint32_t recode_modifier, FILE* bedfile, uintptr_t bed_offset, ch
   wkspace_reset(wkspace_mark);
   fclose_cond(outfile2);
   fclose_cond(outfile);
+  if (bgz_outfile) {
+    bgzf_close(bgz_outfile);
+  }
   return retval;
 }
 
@@ -14575,15 +14720,18 @@ static inline uint32_t merge_post_msort_update_maps(char* marker_ids, uintptr_t
   uint32_t chrom_idx;
   uint32_t chrom_read_end_idx;
   int64_t llxx;
+  uint32_t unplaced;
   uint32_t prev_bp;
   uint32_t cur_bp;
   uint32_t presort_idx;
   for (chrom_idx = 0; chrom_idx < chrom_ct; chrom_idx++) {
-    if (!IS_SET(chrom_mask, chrom_id[chrom_idx])) {
+    unplaced = chrom_id[chrom_idx]; // initially chromosome code
+    if (!IS_SET(chrom_mask, unplaced)) {
       read_pos = chrom_start[chrom_idx + 1];
       chrom_start[chrom_idx + 1] = write_pos;
       continue;
     }
+    unplaced = (unplaced == 0) || (chrom_info_ptr->zero_extra_chroms && (unplaced > chrom_info_ptr->max_code));
     chrom_read_end_idx = chrom_start[chrom_idx + 1];
     // ll_buf has base-pair positions in high 32 bits, and pre-sort indices in
     // low 32 bits.
@@ -14596,21 +14744,19 @@ static inline uint32_t merge_post_msort_update_maps(char* marker_ids, uintptr_t
       llxx = ll_buf[read_pos];
       presort_idx = (uint32_t)llxx;
       cur_bp = (uint32_t)(llxx >> 32);
-      if (prev_bp == cur_bp) {
+      // do not merge chr 0 (unplaced).
+      if ((prev_bp == cur_bp) && (!unplaced)) {
 	if (merge_equal_pos && merge_alleles(marker_allele_ptrs, ((uint32_t)ll_buf[read_pos - 1]), presort_idx)) {
 	  LOGPRINTFWW("Error: --merge-equal-pos failure.  Variants '%s' and '%s' have the same position, but do not share the same alleles.\n", &(marker_ids[max_marker_id_len * presort_idx]), &(marker_ids[max_marker_id_len * ((uint32_t)ll_buf[read_pos - 1])]));
 	  return 1;
 	}
-	if (prev_bp) {
-	  // no warning if prev_bp is 0
-	  LOGPREPRINTFWW("Warning: Variants '%s' and '%s' have the same position.\n", &(marker_ids[max_marker_id_len * presort_idx]), &(marker_ids[max_marker_id_len * ((uint32_t)ll_buf[read_pos - 1])]));
-	  if (position_warning_ct < 3) {
-	    logprintb();          
-	  } else {
-	    logstr(logbuf);
-	  }
-	  position_warning_ct++;
+	LOGPREPRINTFWW("Warning: Variants '%s' and '%s' have the same position.\n", &(marker_ids[max_marker_id_len * presort_idx]), &(marker_ids[max_marker_id_len * ((uint32_t)ll_buf[read_pos - 1])]));
+	if (position_warning_ct < 3) {
+	  logprintb();
+	} else {
+	  logstr(logbuf);
 	}
+	position_warning_ct++;
 	if (merge_equal_pos) {
 	  marker_map[presort_idx] = write_pos - 1;
 	  continue;
@@ -15357,6 +15503,7 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
   uint32_t orig_idx = 0;
   uint32_t cur_marker_ct = 0;
   uint32_t tot_marker_ct = 0;
+  int32_t retval = 0;
   uint32_t* map_reverse = NULL;
   uintptr_t* reversed = NULL;
   char* bim_loadbuf = NULL;
@@ -15410,7 +15557,6 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
   unsigned char* ubufptr;
   char cc;
   unsigned char ucc;
-  int32_t retval;
   if (wkspace_alloc_ui_checked(&chrom_start, (MAX_POSSIBLE_CHROM + 1) * sizeof(int32_t)) ||
       wkspace_alloc_ui_checked(&chrom_id, MAX_POSSIBLE_CHROM * sizeof(int32_t))) {
     goto merge_datasets_ret_NOMEM;
@@ -15573,7 +15719,7 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
       LOGPRINTFWW("%u %s loaded from %s.\n", max_cur_sample_ct, species_str(max_cur_sample_ct), mergelist_fam[0]);
       LOGPRINTFWW("%u %s to be merged from %s.\n", cur_sample_ct, species_str(cur_sample_ct), mergelist_fam[1]);
       uii = ullxx - max_cur_sample_ct;
-      LOGPRINTF("Of these, %u are new, while %u are present in the base dataset.\n", uii, cur_sample_ct - uii);
+      LOGPRINTF("Of these, %u %s new, while %u %s present in the base dataset.\n", uii, (uii == 1)? "is" : "are", cur_sample_ct - uii, (cur_sample_ct - uii == 1)? "is" : "are");
     }
     if (cur_sample_ct > max_cur_sample_ct) {
       max_cur_sample_ct = cur_sample_ct;
@@ -15781,12 +15927,16 @@ int32_t merge_datasets(char* bedname, char* bimname, char* famname, char* outnam
     }
     if (!merge_list) {
       if (!mlpos) {
-	uii = cur_marker_ct;
+	uii = ullxx;
       } else {
 	LOGPRINTFWW("%u marker%s loaded from %s.\n", uii, (uii == 1)? "" : "s", mergelist_bim[0]);
 	LOGPRINTFWW("%u marker%s to be merged from %s.\n", cur_marker_ct, (cur_marker_ct == 1)? "" : "s", mergelist_bim[1]);
+	// bugfix: don't underflow when a single file has duplicate IDs (e.g.
+	// '.').
+	// Merging should fail anyway in that case, but we should not embarrass
+	// ourselves by printing inaccurate numbers here.
 	uii = ullxx - uii;
-	LOGPRINTF("Of these, %u are new, while %u are present in the base dataset.\n", uii, cur_marker_ct - uii);
+	LOGPRINTF("Of these, %u %s new, while %u %s present in the base dataset.\n", uii, (uii == 1)? "is" : "are", cur_marker_ct - uii, (cur_marker_ct - uii == 1)? "is" : "are");
       }
     }
     if (!mergelist_fam[mlpos]) {
diff --git a/plink_dosage.c b/plink_dosage.c
index b1d9041..4bf4505 100644
--- a/plink_dosage.c
+++ b/plink_dosage.c
@@ -7,6 +7,7 @@
 #include "plink_glm.h"
 #include "plink_matrix.h"
 #include "plink_misc.h"
+#include "pigz.h"
 
 void dosage_init(Dosage_info* doip) {
   doip->fname = NULL;
@@ -422,6 +423,10 @@ int32_t dosage_load_score_files(Score_info* sc_ip, char* outname, char* outname_
       score_qrange_bounds[2 * ulii + 1] = ubound;
       ulii++;
     }
+    if (ulii != qrange_ct) {
+      // catches /dev/stdin redirection
+      goto dosage_load_score_files_ret_READ_FAIL;
+    }
     if (fclose_null(&infile)) {
       goto dosage_load_score_files_ret_READ_FAIL;
     }
@@ -461,9 +466,8 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
   // there's no long-term maintenance problem
   FILE* phenofile = NULL;
   FILE* infile = NULL;
-  FILE* outfile = NULL;
+  FILE* profile_outfile = NULL;
   gzFile* gz_infiles = NULL;
-  gzFile gz_outfile = NULL;
   char* marker_ids = NULL;
   char* sample_ids = NULL;
   char* paternal_ids = NULL;
@@ -478,6 +482,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
   char** score_allele_codes = NULL;
   char* a1_ptr = NULL;
   char* a2_ptr = NULL;
+  char* pzwritep = NULL;
   uintptr_t* marker_exclude = NULL;
   uintptr_t* sample_exclude = NULL;
   uintptr_t* sex_nm = NULL;
@@ -606,7 +611,9 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
   __CLPK_integer dgels_lwork;
 #endif
   char missing_pheno_str[32];
+  Pigz_state ps;
   unsigned char* wkspace_mark;
+  unsigned char* overflow_buf;
   char* fnames;
   char* loadbuf;
   char* bufptr;
@@ -663,6 +670,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
   uint32_t uii;
   uint32_t ukk;
   int32_t ii;
+  pzwrite_init_null(&ps);
   if (load_map) {
     retval = load_bim(mapname, &map_cols, &unfiltered_marker_ct, &marker_exclude_ct, &max_marker_id_len, &marker_exclude, NULL, NULL, NULL, &ulii, &marker_ids, NULL, 0, NULL, chrom_info_ptr, NULL, &marker_pos, misc_flags, filter_flags, marker_pos_start, marker_pos_end, snp_window_size, markername_from, markername_to, markername_snp, snps_range_list_ptr, &map_is_unsorted, do_glm || min_bp_space || (misc_flags & (MISC_EXTRACT_RANGE | MISC_EXCLUDE_RANGE)), 0, 0, NULL, ".map file", NULL);
     if (retval) {
@@ -796,7 +804,7 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
       }
       if (excludename) {
 	if (!(misc_flags & MISC_EXCLUDE_RANGE)) {
-	  retval = extract_exclude_flag_norange(extractname, marker_id_htable, marker_id_htable_size, 1, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct);
+	  retval = extract_exclude_flag_norange(excludename, marker_id_htable, marker_id_htable_size, 1, marker_ids, max_marker_id_len, unfiltered_marker_ct, marker_exclude, &marker_exclude_ct);
 	  if (retval) {
 	    goto plink1_dosage_ret_1;
 	  }
@@ -1330,7 +1338,8 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
     memcpy(fnames, doip->fname, uii);
     infile_ct = 1;
   }
-  if (wkspace_alloc_ui_checked(&file_icts, max_batch_size * sizeof(int32_t)) ||
+  if (wkspace_alloc_uc_checked(&overflow_buf, 2 * PIGZ_BLOCK_SIZE) ||
+      wkspace_alloc_ui_checked(&file_icts, max_batch_size * sizeof(int32_t)) ||
       wkspace_alloc_ul_checked(&line_idx_arr, max_batch_size * sizeof(intptr_t)) ||
       wkspace_alloc_ul_checked(&batch_samples, sample_ctl * sizeof(intptr_t)) ||
       wkspace_alloc_ul_checked(&cur_samples, sample_ctl * sizeof(intptr_t)) ||
@@ -1463,7 +1472,8 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
     }
     bufptr = memcpya(bufptr, "  A1  A2     FRQ    INFO    ", 28);
     bufptr = memcpya(bufptr, pheno_c? "  OR" : "BETA", 4);
-    bufptr = memcpya(bufptr, "      SE       P\n", 17);
+    bufptr = memcpya(bufptr, "      SE       P", 16);
+    append_binary_eoln(&bufptr);
     bufptr2 = memcpyb(outname_end, ".assoc.dosage", 14);
   } else if (count_occur) {
     // could just use a uint32_t array if .map provided
@@ -1485,56 +1495,29 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
   }
   if (output_gz) {
     memcpy(bufptr2, ".gz", 4);
-    if (gzopen_checked(&gz_outfile, outname, "wb")) {
-      goto plink1_dosage_ret_OPEN_FAIL;
-    }
+  }
+  if (flex_pzwrite_init(output_gz, outname, overflow_buf, 0, &ps)) {
+    goto plink1_dosage_ret_OPEN_FAIL;
+  }
+  pzwritep = (char*)overflow_buf;
+
+  if (!do_score) {
     if (do_glm) {
-      if (!gzwrite(gz_outfile, tbuf, bufptr - tbuf)) {
-	goto plink1_dosage_ret_WRITE_FAIL;
-      }
-    } else {
-      if (gzputs(gz_outfile, "SNP A1 A2 ") == -1) {
-	goto plink1_dosage_ret_WRITE_FAIL;
-      }
+      pzwritep = memcpya(pzwritep, tbuf, bufptr - tbuf);
+    } else if (!count_occur) {
+      pzwritep = memcpya(pzwritep, "SNP A1 A2 ", 10);
       for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
 	next_unset_unsafe_ck(sample_exclude, &sample_uidx);
 	bufptr = &(sample_ids[sample_uidx * max_sample_id_len]);
 	bufptr2 = strchr(bufptr, '\t');
-	*bufptr2 = ' ';
-        if (gzputs(gz_outfile, bufptr) == -1) {
-	  goto plink1_dosage_ret_WRITE_FAIL;
-	}
-	if (gzputc(gz_outfile, ' ') == -1) {
+	pzwritep = memcpya(pzwritep, bufptr, bufptr2 - bufptr);
+	*pzwritep++ = ' ';
+	pzwritep = strcpyax(pzwritep, &(bufptr2[1]), ' ');
+	if (flex_pzwrite(&ps, &pzwritep)) {
 	  goto plink1_dosage_ret_WRITE_FAIL;
 	}
-	*bufptr2 = '\t';
-      }
-      if (gzputc(gz_outfile, '\n') == -1) {
-	goto plink1_dosage_ret_WRITE_FAIL;
-      }
-    }
-  } else if (!do_score) {
-    if (fopen_checked(&outfile, outname, "w")) {
-      goto plink1_dosage_ret_OPEN_FAIL;
-    }
-    if (do_glm) {
-      if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
-	goto plink1_dosage_ret_WRITE_FAIL;
-      }
-    } else if (!count_occur) {
-      fputs("SNP A1 A2 ", outfile);
-      for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
-	next_unset_unsafe_ck(sample_exclude, &sample_uidx);
-	bufptr = &(sample_ids[sample_uidx * max_sample_id_len]);
-	bufptr2 = strchr(bufptr, '\t');
-	*bufptr2 = ' ';
-        fputs(bufptr, outfile);
-	putc(' ', outfile);
-	*bufptr2 = '\t';
-      }
-      if (putc_checked('\n', outfile)) {
-	goto plink1_dosage_ret_WRITE_FAIL;
       }
+      append_binary_eoln(&pzwritep);
     }
   }
   wkspace_mark = wkspace_base;
@@ -1923,92 +1906,52 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	  }
 #endif
 	  if (load_map) {
-	    bufptr = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_idx)));
-	    *bufptr++ = ' ';
-	    bufptr = fw_strcpyn(11, cur_marker_id_len, cur_marker_id_buf, bufptr);
-            bufptr = memseta(bufptr, 32, 2);
-            bufptr = uint32_writew10(bufptr, marker_pos[marker_idx]);
+	    pzwritep = width_force(4, pzwritep, chrom_name_write(pzwritep, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_idx)));
+	    *pzwritep++ = ' ';
+	    pzwritep = fw_strcpyn(11, cur_marker_id_len, cur_marker_id_buf, pzwritep);
+            pzwritep = memseta(pzwritep, 32, 2);
+            pzwritep = uint32_writew10(pzwritep, marker_pos[marker_idx]);
 	  } else {
-	    tbuf[0] = ' ';
-	    bufptr = fw_strcpyn(11, cur_marker_id_len, cur_marker_id_buf, &(tbuf[1]));
+	    *pzwritep++ = ' ';
+	    pzwritep = fw_strcpyn(11, cur_marker_id_len, cur_marker_id_buf, pzwritep);
 	  }
-	  *bufptr++ = ' ';
-	  *bufptr = '\0';
-          if (output_gz) {
-	    if (gzputs(gz_outfile, tbuf) == -1) {
-	      goto plink1_dosage_ret_WRITE_FAIL;
-	    }
-	    if (a1_len < 3) {
-	      if (gzputc(gz_outfile, ' ') == -1) {
-		goto plink1_dosage_ret_WRITE_FAIL;
-	      }
-	      if (a1_len == 1) {
-		if (gzputc(gz_outfile, ' ') == -1) {
-		  goto plink1_dosage_ret_WRITE_FAIL;
-		}
-	      }
-	    }
-	    if (gzputs(gz_outfile, a1_ptr) == -1) {
-	      goto plink1_dosage_ret_WRITE_FAIL;
-	    }
-	    if (gzputc(gz_outfile, ' ') == -1) {
-	      goto plink1_dosage_ret_WRITE_FAIL;
-	    }
-	    if (a2_len < 3) {
-	      if (gzputc(gz_outfile, ' ') == -1) {
-		goto plink1_dosage_ret_WRITE_FAIL;
-	      }
-	      if (a2_len == 1) {
-		if (gzputc(gz_outfile, ' ') == -1) {
-		  goto plink1_dosage_ret_WRITE_FAIL;
-		}
-	      }
-	    }
-	    if (gzputs(gz_outfile, a2_ptr) == -1) {
-	      goto plink1_dosage_ret_WRITE_FAIL;
-	    }
-	  } else {
-            fputs(tbuf, outfile);
-	    if (a1_len < 3) {
-	      putc(' ', outfile);
-	      if (a1_len == 1) {
-		putc(' ', outfile);
-	      }
+	  *pzwritep++ = ' ';
+	  if (a1_len < 3) {
+	    *pzwritep++ = ' ';
+	    if (a1_len == 1) {
+	      *pzwritep++ = ' ';
 	    }
-	    fputs(a1_ptr, outfile);
-	    putc(' ', outfile);
-	    if (a2_len < 3) {
-	      putc(' ', outfile);
-	      if (a2_len == 1) {
-		putc(' ', outfile);
-	      }
+	  }
+	  if (flex_pzputs_allele(&ps, &pzwritep, a1_ptr, a1_len)) {
+	    goto plink1_dosage_ret_WRITE_FAIL;
+	  }
+	  *pzwritep++ = ' ';
+	  if (a2_len < 3) {
+            *pzwritep++ = ' ';
+	    if (a2_len == 1) {
+	      *pzwritep++ = ' ';
 	    }
-	    fputs(a2_ptr, outfile);
-          }
-	  bufptr = tbuf;
-	  *bufptr++ = ' ';
-          bufptr = double_f_writew74(bufptr, dzz);
-	  *bufptr++ = ' ';
-	  bufptr = double_f_writew74(bufptr, rsq);
-	  *bufptr++ = ' ';
+	  }
+	  if (flex_pzputs_allele(&ps, &pzwritep, a2_ptr, a2_len)) {
+	    goto plink1_dosage_ret_WRITE_FAIL;
+	  }
+	  *pzwritep++ = ' ';
+          pzwritep = double_f_writew74(pzwritep, dzz);
+	  *pzwritep++ = ' ';
+	  pzwritep = double_f_writew74(pzwritep, rsq);
+	  *pzwritep++ = ' ';
 	  if (is_valid) {
-	    bufptr = double_f_writew74(bufptr, pheno_c? exp(beta * 0.5) : (beta * 0.5));
-	    *bufptr++ = ' ';
-	    bufptr = double_f_writew74(bufptr, se * 0.5);
-	    *bufptr++ = ' ';
-	    bufptr = double_g_writewx4(bufptr, MAXV(pval, output_min_p), 7);
-	    bufptr = memcpya(bufptr, "\n", 2);
+	    pzwritep = double_f_writew74(pzwritep, pheno_c? exp(beta * 0.5) : (beta * 0.5));
+	    *pzwritep++ = ' ';
+	    pzwritep = double_f_writew74(pzwritep, se * 0.5);
+	    *pzwritep++ = ' ';
+	    pzwritep = double_g_writewx4(pzwritep, MAXV(pval, output_min_p), 7);
 	  } else {
-	    bufptr = memcpya(bufptr, "     NA      NA      NA\n", 25);
+	    pzwritep = memcpya(pzwritep, "     NA      NA      NA", 23);
 	  }
-	  if (output_gz) {
-	    if (gzputs(gz_outfile, tbuf) == -1) {
-	      goto plink1_dosage_ret_WRITE_FAIL;
-	    }
-	  } else {
-	    if (fputs_checked(tbuf, outfile)) {
-	      goto plink1_dosage_ret_WRITE_FAIL;
-	    }
+	  append_binary_eoln(&pzwritep);
+	  if (flex_pzwrite(&ps, &pzwritep)) {
+	    goto plink1_dosage_ret_WRITE_FAIL;
 	  }
 	} else if (do_score) {
 	  sample_valid_ct = popcount_longs(cur_samples, sample_ctl);
@@ -2055,32 +1998,17 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	    }
 	  } while (++qrange_idx < qrange_ct);
 	} else if (!count_occur) {
-	  if (output_gz) {
-	    if (gzputs(gz_outfile, cur_marker_id_buf) == -1) {
-	      goto plink1_dosage_ret_WRITE_FAIL;
-	    }
-	    if (gzputc(gz_outfile, ' ') == -1) {
-	      goto plink1_dosage_ret_WRITE_FAIL;
-	    }
-	    if (gzputs(gz_outfile, a1_ptr) == -1) {
-	      goto plink1_dosage_ret_WRITE_FAIL;
-	    }
-	    if (gzputc(gz_outfile, ' ') == -1) {
-	      goto plink1_dosage_ret_WRITE_FAIL;
-	    }
-	    if (gzputs(gz_outfile, a2_ptr) == -1) {
-	      goto plink1_dosage_ret_WRITE_FAIL;
-	    }
-	    if (gzputc(gz_outfile, ' ') == -1) {
-	      goto plink1_dosage_ret_WRITE_FAIL;
-	    }
-	  } else {
-	    fputs(cur_marker_id_buf, outfile);
-	    putc(' ', outfile);
-	    fputs(a1_ptr, outfile);
-	    putc(' ', outfile);
-	    fputs(a2_ptr, outfile);
-	    putc(' ', outfile);
+	  pzwritep = strcpyax(pzwritep, cur_marker_id_buf, ' ');
+	  if (flex_pzputs_allele(&ps, &pzwritep, a1_ptr, a1_len)) {
+	    goto plink1_dosage_ret_WRITE_FAIL;
+	  }
+	  *pzwritep++ = ' ';
+	  if (flex_pzputs_allele(&ps, &pzwritep, a2_ptr, a2_len)) {
+	    goto plink1_dosage_ret_WRITE_FAIL;
+	  }
+	  *pzwritep++ = ' ';
+	  if (flex_pzwrite(&ps, &pzwritep)) {
+	    goto plink1_dosage_ret_WRITE_FAIL;
 	  }
 	  ulii = 0;
 	  // could make output format independent of input format (other than
@@ -2091,94 +2019,67 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	    sample_idx = 0;
 	    do {
 	      ulii += MAXLINELEN / 16;
-	      bufptr = tbuf;
 	      if (ulii > sample_ct) {
 		ulii = sample_ct;
 	      }
 	      for (; sample_idx < ulii; sample_idx++) {
 		if (!is_set(cur_samples, sample_idx)) {
-		  bufptr = memcpyl3a(bufptr, "NA ");
+		  pzwritep = memcpyl3a(pzwritep, "NA ");
 		} else {
-		  bufptr = double_g_writex(bufptr, 2 * cur_dosages[sample_idx], ' ');
+		  pzwritep = double_g_writex(pzwritep, 2 * cur_dosages[sample_idx], ' ');
 		}
 	      }
-	      if (output_gz) {
-		if (!gzwrite(gz_outfile, tbuf, bufptr - tbuf)) {
-		  goto plink1_dosage_ret_WRITE_FAIL;
-		}
-	      } else {
-		if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
-		  goto plink1_dosage_ret_WRITE_FAIL;
-		}
+	      if (flex_pzwrite(&ps, &pzwritep)) {
+		goto plink1_dosage_ret_WRITE_FAIL;
 	      }
 	    } while (ulii < sample_ct);
 	  } else if (format_val == 2) {
 	    sample_idx = 0;
 	    do {
 	      ulii += MAXLINELEN / 32;
-	      bufptr = tbuf;
 	      if (ulii > sample_ct) {
 		ulii = sample_ct;
 	      }
 	      for (; sample_idx < ulii; sample_idx++) {
 		if (!is_set(cur_samples, sample_idx)) {
-		  bufptr = memcpya(bufptr, "NA NA ", 6);
+		  pzwritep = memcpya(pzwritep, "NA NA ", 6);
 		} else {
-		  bufptr = double_g_writex(bufptr, cur_dosages[sample_idx], ' ');
-		  bufptr = double_g_writex(bufptr, cur_dosages2[sample_idx], ' ');
+		  pzwritep = double_g_writex(pzwritep, cur_dosages[sample_idx], ' ');
+		  pzwritep = double_g_writex(pzwritep, cur_dosages2[sample_idx], ' ');
 		}
 	      }
-	      if (output_gz) {
-		if (!gzwrite(gz_outfile, tbuf, bufptr - tbuf)) {
-		  goto plink1_dosage_ret_WRITE_FAIL;
-		}
-	      } else {
-		if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
-		  goto plink1_dosage_ret_WRITE_FAIL;
-		}
+	      if (flex_pzwrite(&ps, &pzwritep)) {
+		goto plink1_dosage_ret_WRITE_FAIL;
 	      }
 	    } while (ulii < sample_ct);
 	  } else {
 	    sample_idx = 0;
 	    do {
 	      ulii += MAXLINELEN / 48;
-	      bufptr = tbuf;
 	      if (ulii > sample_ct) {
 		ulii = sample_ct;
 	      }
 	      for (; sample_idx < ulii; sample_idx++) {
 		if (!is_set(cur_samples, sample_idx)) {
-		  bufptr = memcpya(bufptr, "NA NA NA ", 9);
+		  pzwritep = memcpya(pzwritep, "NA NA NA ", 9);
 		} else {
 		  dxx = cur_dosages[sample_idx];
-		  bufptr = double_g_writex(bufptr, dxx, ' ');
+		  pzwritep = double_g_writex(pzwritep, dxx, ' ');
 		  dyy = cur_dosages2[sample_idx];
-		  bufptr = double_g_writex(bufptr, dyy, ' ');
+		  pzwritep = double_g_writex(pzwritep, dyy, ' ');
 		  dxx = 1.0 - dxx - dyy;
 		  if (fabs(dxx) < SMALL_EPSILON) {
 		    dxx = 0.0;
 		  }
-		  bufptr = double_g_writex(bufptr, dxx, ' ');
+		  pzwritep = double_g_writex(pzwritep, dxx, ' ');
 		}
 	      }
-	      if (output_gz) {
-		if (!gzwrite(gz_outfile, tbuf, bufptr - tbuf)) {
-		  goto plink1_dosage_ret_WRITE_FAIL;
-		}
-	      } else {
-		if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
-		  goto plink1_dosage_ret_WRITE_FAIL;
-		}
+	      if (flex_pzwrite(&ps, &pzwritep)) {
+		goto plink1_dosage_ret_WRITE_FAIL;
 	      }
 	    } while (ulii < sample_ct);
 	  }
-	  if (output_gz) {
-	    if (gzputc(gz_outfile, '\n') == -1) {
-	      goto plink1_dosage_ret_WRITE_FAIL;
-	    }
-	  } else {
-	    putc('\n', outfile);
-	  }
+	  append_binary_eoln(&pzwritep);
 	}
       }
       if (a1_ptr) {
@@ -2211,11 +2112,13 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
       } else {
 	memcpy(outname_end, ".profile", 9);
       }
-      if (fopen_checked(&outfile, outname, "w")) {
+      // --score output is not affected by 'gz' in PLINK 1.07; retain that for
+      // backward compatibility.
+      if (fopen_checked(&profile_outfile, outname, "w")) {
 	goto plink1_dosage_ret_OPEN_FAIL;
       }
       sprintf(tbuf, "%%%us %%%us  PHENO%s %s\n", plink_maxfid, plink_maxiid, dosage_score_cnt? "    CNT" : "", score_report_average? "   SCORE" : "SCORESUM");
-      fprintf(outfile, tbuf, "FID", "IID");
+      fprintf(profile_outfile, tbuf, "FID", "IID");
       uii = score_range_obs_cts[qrange_idx];
       uiptr = &(score_miss_cts[sample_ct * qrange_idx]);
       dxx = score_bases[qrange_idx];
@@ -2254,11 +2157,11 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
 	}
 	bufptr = width_force(8, bufptr, double_g_write(bufptr, dyy));
 	*bufptr++ = '\n';
-        if (fwrite_checked(tbuf, bufptr - tbuf, outfile)) {
+        if (fwrite_checked(tbuf, bufptr - tbuf, profile_outfile)) {
 	  goto plink1_dosage_ret_WRITE_FAIL;
 	}
       }
-      if (fclose_null(&outfile)) {
+      if (fclose_null(&profile_outfile)) {
 	goto plink1_dosage_ret_WRITE_FAIL;
       }
       LOGPRINTFWW("--score: Results written to %s .\n", outname);
@@ -2290,27 +2193,16 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
       for (ulii = 0; ulii < distinct_id_ct; ulii++) {
 	bufptr2 = &(bufptr[ulii * max_occur_id_len]);
 	slen = strlen(bufptr2);
-	bufptr3 = memcpyax(tbuf, bufptr2, slen, ' ');
-	bufptr3 = uint32_write(bufptr3, *((uint32_t*)(&(bufptr2[slen + 1]))));
-	memcpy(bufptr3, "\n", 2);
-	if (output_gz) {
-	  if (gzputs(gz_outfile, tbuf) == -1) {
-	    goto plink1_dosage_ret_WRITE_FAIL;
-	  }
-	} else {
-	  fputs(tbuf, outfile);
+	pzwritep = memcpyax(pzwritep, bufptr2, slen, ' ');
+	pzwritep = uint32_write(pzwritep, *((uint32_t*)(&(bufptr2[slen + 1]))));
+	append_binary_eoln(&pzwritep);
+	if (flex_pzwrite(&ps, &pzwritep)) {
+	  goto plink1_dosage_ret_WRITE_FAIL;
 	}
       }
     }
-    if (output_gz) {
-      if (gzclose(gz_outfile) != Z_OK) {
-	goto plink1_dosage_ret_WRITE_FAIL;
-      }
-      gz_outfile = NULL;
-    } else {
-      if (fclose_null(&outfile)) {
-	goto plink1_dosage_ret_WRITE_FAIL;
-      }
+    if (flex_pzwrite_close_null(&ps, pzwritep)) {
+      goto plink1_dosage_ret_WRITE_FAIL;
     }
     LOGPRINTFWW("--%sdosage%s: Results saved to %s .\n", (do_glm || count_occur)? "" : "write-", count_occur? " occur" : "", outname);
     if (count_occur) {
@@ -2369,8 +2261,8 @@ int32_t plink1_dosage(Dosage_info* doip, char* famname, char* mapname, char* out
   free_cond(pheno_d);
   fclose_cond(phenofile);
   fclose_cond(infile);
-  fclose_cond(outfile);
-  gzclose_cond(gz_outfile);
+  fclose_cond(profile_outfile);
+  flex_pzwrite_close_cond(&ps, pzwritep);
   if (a1_ptr && a1_ptr[1]) {
     free(a1_ptr);
   }
diff --git a/plink_family.c b/plink_family.c
index eb764d3..fe36eda 100644
--- a/plink_family.c
+++ b/plink_family.c
@@ -1,6 +1,7 @@
 #include "plink_common.h"
 
 #include "plink_assoc.h"
+#include "plink_cluster.h"
 #include "plink_family.h"
 #include "plink_stats.h"
 
@@ -11,6 +12,8 @@ void family_init(Family_info* fam_ip) {
   fam_ip->mendel_modifier = 0;
   fam_ip->tdt_modifier = 0;
   fam_ip->tdt_mperm_val = 0;
+  fam_ip->dfam_modifier = 0;
+  fam_ip->dfam_mperm_val = 0;
   fam_ip->qfam_modifier = 0;
   fam_ip->qfam_mperm_val = 0;
 }
@@ -60,6 +63,9 @@ const uint32_t mendel_error_table_x[] =
  0x5000001, 0, 0x2010101, 0};
 
 int32_t get_trios_and_families(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, uintptr_t* founder_info, uintptr_t* sex_nm, uintptr_t* sex_male, char* sample_ids, uintptr_t max_sample_id_len, char* paternal_ids, uintptr_t max_paternal_id_len, char* maternal_ids, uintptr_t max_maternal_id_len, char** fids_ptr, uintptr_t* max_fid_len_ptr, char** iids_ptr, uintptr_t* max_iid_len_ptr, uint64_t** family_list_ptr, uint32_t* family_ct_ptr, uint64_t** trio_list_ptr [...]
+  // This mirrors linkRelateds() in genedrop.cpp, and parseTrios() in trio.cpp,
+  // in PLINK 1.07.
+  //
   // family_list has paternal indices in low 32 bits, maternal indices in high
   // 32, sorted in child ID order.
   // trio_list has child IDs in low 32 bits, family_list indices in high 32
@@ -78,7 +84,8 @@ int32_t get_trios_and_families(uintptr_t unfiltered_sample_ct, uintptr_t* sample
   // PLINK 1.07 enforces <= 1 father and <= 1 mother per sample (and ambiguous
   // sex parents are not permitted), but the IDs CAN be reversed in the .fam
   // with no adverse consequences.  For backward compatibility, we replicate
-  // this.  (Todo: report a warning exactly once when this happens.)
+  // this.  (Possible todo: report a warning exactly once when this happens.)
+  // It won't be replicated in PLINK 2.0.
   unsigned char* wkspace_mark = wkspace_base;
   uint64_t* edge_list = NULL;
   uint32_t* toposort_queue = NULL;
@@ -1065,18 +1072,31 @@ int32_t mendel_error_scan(Family_info* fam_ip, FILE* bedfile, uintptr_t bed_offs
     for (uii = 0; uii < family_ct; uii++) {
       family_code = family_list[uii];
       ujj = (uint32_t)family_code; // paternal uidx
+      ukk = (uint32_t)(family_code >> 32); // maternal uidx
       if (ujj < unfiltered_sample_ct) {
 	// bleah, fids[] isn't in right order for this lookup
 	cptr = &(sample_ids[ujj * max_sample_id_len]);
 	wptr = fw_strcpyn(plink_maxfid, (uintptr_t)(((char*)memchr(cptr, '\t', max_sample_id_len)) - cptr), cptr, tbuf);
       } else {
-	wptr = memseta(tbuf, 32, plink_maxfid - 1);
-	*wptr++ = '0';
+	cptr = &(sample_ids[ukk * max_sample_id_len]);
+	wptr = fw_strcpyn(plink_maxfid, (uintptr_t)(((char*)memchr(cptr, '\t', max_sample_id_len)) - cptr), cptr, tbuf);
+	// wptr = memseta(tbuf, 32, plink_maxfid - 1);
+	// *wptr++ = '0';
       }
       *wptr++ = ' ';
-      wptr = fw_strcpy(plink_maxiid, &(iids[ujj * max_iid_len]), wptr);
+      if (ujj != unfiltered_sample_ct) {
+        wptr = fw_strcpy(plink_maxiid, &(iids[ujj * max_iid_len]), wptr);
+      } else {
+	wptr = memseta(wptr, 32, plink_maxiid - 1);
+	*wptr++ = '0';
+      }
       *wptr++ = ' ';
-      wptr = fw_strcpy(plink_maxiid, &(iids[((uintptr_t)(family_code >> 32)) * max_iid_len]), wptr);
+      if (ukk != unfiltered_sample_ct) {
+        wptr = fw_strcpy(plink_maxiid, &(iids[ukk * max_iid_len]), wptr);
+      } else {
+	wptr = memseta(wptr, 32, plink_maxiid - 1);
+	*wptr++ = '0';
+      }
       *wptr++ = ' ';
       wptr = uint32_writew6x(wptr, child_cts[uii], ' ');
       if (family_error_cts[uii * 3] < 10000) {
@@ -1704,7 +1724,7 @@ int32_t populate_pedigree_rel_info(Pedigree_rel_info* pri_ptr, uintptr_t unfilte
   return 0;
 }
 
-int32_t tdt_poo(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct_ax, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* sample_male_include2, uint32_t* trio_nuclear_lookup, uint32_t family_ct, uint32_t [...]
+int32_t tdt_poo(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct_ax, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* sample_male_include2, uint32_t* trio_nuclear_lookup, uint32_t family_ct, Aperm_in [...]
   FILE* outfile = NULL;
   uint64_t mendel_error_ct = 0;
   double pat_a2transmit_recip = 0.0;
@@ -1918,7 +1938,7 @@ int32_t tdt_poo(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* o
 	if (pct < 100) {
 	  printf("\b\b%" PRIuPTR "%%", pct);
 	  fflush(stdout);
-	  pct_thresh = ((++pct) * ((uint64_t)markers_done)) / 100;
+	  pct_thresh = ((++pct) * ((uint64_t)marker_ct_ax)) / 100;
 	}
       }
       if (++marker_uidx == chrom_end) {
@@ -1956,7 +1976,7 @@ int32_t tdt_poo(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* o
   return retval;
 }
 
-int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double ci_size, double ci_zt, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, uintptr_t unfilter [...]
+int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double ci_size, double ci_zt, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, uintptr_t unfilter [...]
   unsigned char* wkspace_mark = wkspace_base;
   FILE* outfile = NULL;
   char* textbuf = tbuf;
@@ -1977,6 +1997,7 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
   uint32_t is_exact = fam_ip->tdt_modifier & TDT_EXACT;
   uint32_t is_midp = fam_ip->tdt_modifier & TDT_MIDP;
   uint32_t poo_test = fam_ip->tdt_modifier & TDT_POO;
+  // uint32_t perm_count = fam_ip->tdt_modifier & TDT_PERM_COUNT;
   uint32_t case_trio_ct = 0;
   uint32_t is_discordant = 0;
   uint32_t discord_exists = 0;
@@ -2182,7 +2203,7 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
     }
   }
   if (poo_test) {
-    retval = tdt_poo(threads, bedfile, bed_offset, outname, outname_end, output_min_p, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, marker_allele_ptrs, max_marker_allele_len, marker_reverse, unfiltered_sample_ct, sample_male_include2, trio_nuclear_lookup, family_ct, mperm_save, sample_ids, max_sample_id_len, chrom_info_ptr, hh_exists, fam_ip, loadbuf, workbuf, textbuf, orig_chisq, trio_error_lookup, trio_ct);
+    retval = tdt_poo(threads, bedfile, bed_offset, outname, outname_end, output_min_p, unfiltered_marker_ct, marker_exclude, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, marker_allele_ptrs, max_marker_allele_len, marker_reverse, unfiltered_sample_ct, sample_male_include2, trio_nuclear_lookup, family_ct, apip, mperm_save, sample_ids, max_sample_id_len, chrom_info_ptr, hh_exists, fam_ip, loadbuf, workbuf, textbuf, orig_chisq, trio_error_lookup, trio_ct);
     if (retval) {
       goto tdt_ret_1;
     }
@@ -2445,7 +2466,7 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
         if (pct < 100) {
 	  printf("\b\b%" PRIuPTR "%%", pct);
           fflush(stdout);
-          pct_thresh = ((++pct) * ((uint64_t)markers_done)) / 100;
+          pct_thresh = ((++pct) * ((uint64_t)marker_ct)) / 100;
 	}
       }
       if (++marker_uidx == chrom_end) {
@@ -2507,14 +2528,17 @@ int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outna
   return retval;
 }
 
-int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, uintptr_t* pheno_nm, double* pheno_d, uintptr_t* founder_info, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t max_fid_len, char* paternal_ids, uintptr_t max_paternal_id_len, char* maternal_ids, uintptr_t max_maternal_id_len, uint64_t* family_list, uint64_t* trio_list, uint32_t family_ct, uintptr_t trio_ct, uint32_t test_type, uintptr_t** lm_eligible_ptr, uintptr_t** lm_withi [...]
-  // on top of get_trios_and_families()'s return values, we need the following
-  // information for the main qfam() loop:
+int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, uintptr_t* pheno_nm, uintptr_t* founder_info, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t max_fid_len, char* paternal_ids, uintptr_t max_paternal_id_len, char* maternal_ids, uintptr_t max_maternal_id_len, uint64_t* family_list, uint64_t* trio_list, uint32_t family_ct, uintptr_t trio_ct, uint32_t test_type, uintptr_t** lm_eligible_ptr, uintptr_t** lm_within2_founder_ptr, u [...]
+  // On top of get_trios_and_families()'s return values, we need the following
+  // information for the main dfam() and qfam() loops:
   // 1. sample idx -> family/sibship idx array
   // 2. fs_starts[]/fs_contents[] arrays describing family/sibship idx ->
   //    sample idxs mapping.
-  // we may as well sort size-1 sibships/singleton founders to the end; this
+  // We may as well sort size-1 sibships/singleton founders to the end; this
   // lets us get away with a smaller fs_starts[] array and a faster loop.
+  // There is also some qfam-specific initialization here (e.g. a divorcee with
+  // children from two different spouses may be excluded from the linear
+  // model).  test_type is zero for dfam and nonzero for qfam.
   uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
   uintptr_t sample_ctl = (sample_ct + (BITCT - 1)) / BITCT;
   uintptr_t max_merged_id_len = max_fid_len + max_paternal_id_len + max_maternal_id_len + sizeof(int32_t);
@@ -2522,6 +2546,7 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
   uintptr_t topsize = 0;
   uintptr_t* tmp_within2_founder = NULL;
   uintptr_t* lm_within2_founder = NULL;
+  uintptr_t* lm_eligible = NULL;
   uint32_t is_within2 = (test_type == QFAM_WITHIN2);
   uint32_t family_idx = 0;
   uint32_t fssc_idx = 0;
@@ -2530,7 +2555,6 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
   char* bufptr;
   char* bufptr2;
   char* bufptr3;
-  uintptr_t* lm_eligible;
   uintptr_t* not_in_family;
   uintptr_t* ulptr;
   uintptr_t* ulptr2;
@@ -2550,20 +2574,34 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
   uint32_t ujj;
   uint32_t ukk;
   uint32_t umm;
-  if (is_within2) {
-    if (wkspace_alloc_ul_checked(&lm_within2_founder, sample_ctl * sizeof(intptr_t))) {
-      goto get_sibship_info_ret_NOMEM2;
+  if (test_type) {
+    if (is_within2) {
+      if (wkspace_alloc_ul_checked(&lm_within2_founder, sample_ctl * sizeof(intptr_t))) {
+	goto get_sibship_info_ret_NOMEM2;
+      }
+    }
+    if (wkspace_alloc_ul_checked(&lm_eligible, sample_ctl * sizeof(intptr_t))) {
+      goto get_sibship_info_ret_NOMEM;
     }
   }
-  if (wkspace_alloc_ul_checked(&lm_eligible, sample_ctl * sizeof(intptr_t)) ||
-      // shrink later
-      wkspace_alloc_ui_checked(&fss_contents, (sample_ct + 2 * family_ct) * sizeof(int32_t))) {
-    goto get_sibship_info_ret_NOMEM;
-  }
-  // this is the equivalent of PLINK 1.07's family pointers
-  sample_to_fss_idx = (uint32_t*)top_alloc(&topsize, sample_ct * sizeof(int32_t));
-  if (!sample_to_fss_idx) {
-    goto get_sibship_info_ret_NOMEM;
+  if (test_type) {
+    // shrink later
+    if (wkspace_alloc_ui_checked(&fss_contents, (sample_ct + 2 * family_ct) * sizeof(int32_t))) {
+      goto get_sibship_info_ret_NOMEM;
+    }
+    // this is the equivalent of PLINK 1.07's family pointers
+    sample_to_fss_idx = (uint32_t*)top_alloc(&topsize, sample_ct * sizeof(int32_t));
+    if (!sample_to_fss_idx) {
+      goto get_sibship_info_ret_NOMEM;
+    }
+  } else {
+    if (wkspace_alloc_ui_checked(&sample_to_fss_idx, sample_ct * sizeof(int32_t))) {
+      goto get_sibship_info_ret_NOMEM;
+    }
+    // shrink later
+    if (wkspace_alloc_ui_checked(&fss_contents, (sample_ct + 2 * family_ct) * sizeof(int32_t))) {
+      goto get_sibship_info_ret_NOMEM;
+    }
   }
   topsize_bak = topsize;
   not_in_family = (uintptr_t*)top_alloc(&topsize, unfiltered_sample_ctl * sizeof(intptr_t));
@@ -2605,13 +2643,15 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
   if (family_ct) {
     while (1) {
       ullii = family_list[family_idx];
+      // uii, ukk = unfiltered idxs of parents
+      // ujj, umm = filtered idxs
       uii = (uint32_t)ullii;
       ujj = sample_uidx_to_idx[uii];
       fss_contents[fssc_idx++] = ujj;
       ukk = (uint32_t)(ullii >> 32);
       umm = sample_uidx_to_idx[ukk];
       if (is_within2) {
-	if (is_set(pheno_nm, uii) && is_set(pheno_nm, ukk) && (pheno_d[uii] != pheno_d[ukk])) {
+	if (is_set(pheno_nm, uii) && is_set(pheno_nm, ukk)) {
 	  set_bit(tmp_within2_founder, uii);
 	  set_bit(tmp_within2_founder, ukk);
 	}
@@ -2661,10 +2701,12 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
     collapse_copy_bitarr(unfiltered_sample_ct, tmp_within2_founder, sample_exclude, sample_ct, lm_within2_founder);
   }
   bitfield_andnot_reversed_args(ulptr, pheno_nm, unfiltered_sample_ctl);
-  if (test_type == QFAM_WITHIN1) {
-    bitfield_andnot(ulptr, founder_info, unfiltered_sample_ctl);
+  if (test_type) {
+    if (test_type == QFAM_WITHIN1) {
+      bitfield_andnot(ulptr, founder_info, unfiltered_sample_ctl);
+    }
+    collapse_copy_bitarr(unfiltered_sample_ct, ulptr, sample_exclude, sample_ct, lm_eligible);
   }
-  collapse_copy_bitarr(unfiltered_sample_ct, ulptr, sample_exclude, sample_ct, lm_eligible);
   topsize = ulii;
 
   memcpy(ulptr, not_in_family, unfiltered_sample_ctl * sizeof(intptr_t));
@@ -2738,33 +2780,38 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
   *fs_ct_ptr = family_idx;
   fs_starts[family_idx] = fssc_idx;
   wkspace_shrink_top(fs_starts, (family_idx + 1) * sizeof(int32_t));
-  // now iterate through not_in_family
-  ulii = popcount_longs(not_in_family, unfiltered_sample_ctl);
-  for (sample_uidx = 0, sample_idx = 0; sample_idx < ulii; sample_uidx++, sample_idx++) {
-    next_set_ul_unsafe_ck(not_in_family, &sample_uidx);
-    ujj = sample_uidx_to_idx[sample_uidx];
-    fss_contents[fssc_idx++] = ujj;
-    sample_to_fss_idx[ujj] = family_idx + sample_idx;
-  }
-  *singleton_ct_ptr = ulii;
-  // finally, collapse sample_to_fss_idx to sample_lm_to_fss_idx
-  topsize = topsize_bak;
-  wkspace_left -= topsize;
-  ulii = popcount_longs(lm_eligible, sample_ctl);
-  if (wkspace_alloc_ui_checked(&sample_lm_to_fss_idx, ulii * sizeof(int32_t))) {
-    goto get_sibship_info_ret_NOMEM2;
-  }
-  wkspace_left += topsize;
-  for (sample_uidx = 0, sample_idx = 0; sample_idx < ulii; sample_uidx++, sample_idx++) {
-    next_set_ul_unsafe_ck(lm_eligible, &sample_uidx);
-    sample_lm_to_fss_idx[sample_idx] = sample_to_fss_idx[sample_uidx];
+  if (test_type) {
+    // for qfam, save singletons, and collapse sample_to_fss_idx to
+    // sample_lm_to_fss_idx
+    ulii = popcount_longs(not_in_family, unfiltered_sample_ctl);
+    for (sample_uidx = 0, sample_idx = 0; sample_idx < ulii; sample_uidx++, sample_idx++) {
+      next_set_ul_unsafe_ck(not_in_family, &sample_uidx);
+      ujj = sample_uidx_to_idx[sample_uidx];
+      fss_contents[fssc_idx++] = ujj;
+      sample_to_fss_idx[ujj] = family_idx + sample_idx;
+    }
+    *singleton_ct_ptr = ulii;
+    topsize = topsize_bak;
+    wkspace_left -= topsize;
+    ulii = popcount_longs(lm_eligible, sample_ctl);
+    if (wkspace_alloc_ui_checked(&sample_lm_to_fss_idx, ulii * sizeof(int32_t))) {
+      goto get_sibship_info_ret_NOMEM2;
+    }
+    wkspace_left += topsize;
+    for (sample_uidx = 0, sample_idx = 0; sample_idx < ulii; sample_uidx++, sample_idx++) {
+      next_set_ul_unsafe_ck(lm_eligible, &sample_uidx);
+      sample_lm_to_fss_idx[sample_idx] = sample_to_fss_idx[sample_uidx];
+    }
+    *lm_eligible_ptr = lm_eligible;
+    *lm_within2_founder_ptr = lm_within2_founder;
+    *sample_lm_to_fss_idx_ptr = sample_lm_to_fss_idx;
+    *lm_ct_ptr = ulii;
+  } else {
+    // return sample_to_fss_idx in place of sample_lm_to_fss_idx
+    *sample_lm_to_fss_idx_ptr = sample_to_fss_idx;
   }
-  *lm_eligible_ptr = lm_eligible;
-  *lm_within2_founder_ptr = lm_within2_founder;
   *fs_starts_ptr = fs_starts;
   *fss_contents_ptr = fss_contents;
-  *sample_lm_to_fss_idx_ptr = sample_lm_to_fss_idx;
-  *lm_ct_ptr = ulii;
   // topsize = 0;
 
   while (0) {
@@ -2777,6 +2824,969 @@ int32_t get_sibship_info(uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclu
   return retval;
 }
 
+// multithread globals
+/* disabled for now; the commented-out dfam() body below still refers to these
+static double* g_maxt_extreme_stat;
+static double* g_maxt_thread_results;
+static double* g_mperm_save_all;
+static uintptr_t* g_pheno_c;
+*/
+
+static uintptr_t* g_loadbuf; // genotype block buffer; commented-out dfam() sizes it MODEL_BLOCKSIZE * dfam_sample_ctl2 words
+static uintptr_t* g_lm_eligible; // presumably mirrors get_sibship_info()'s lm_eligible output -- TODO confirm
+static uintptr_t* g_lm_within2_founder; // presumably mirrors get_sibship_info()'s lm_within2_founder output -- TODO confirm
+static uintptr_t* g_qfam_flip; // NOTE(review): g_qfam_* users are not visible in this section
+static uintptr_t* g_nm_fss;
+static uintptr_t* g_nm_lm;
+static uint32_t* g_qfam_permute;
+static uint32_t* g_permute_edit;
+static uint32_t* g_perm_2success_ct;
+static uint32_t* g_perm_attempt_ct; // one entry per marker; commented-out dfam() initializes each to apip->max
+static uint32_t* g_fs_starts;
+static uint32_t* g_fss_contents;
+static uint32_t* g_sample_lm_to_fss_idx;
+static unsigned char* g_perm_adapt_stop; // one byte per marker; commented-out dfam() skips markers whose byte is nonzero
+static uint32_t g_adapt_m_table[MODEL_BLOCKSIZE]; // block position -> marker index, filled while a block is loaded
+static double* g_orig_stat; // one double per marker in commented-out dfam()
+static double* g_pheno_d2;
+static double* g_qfam_b;
+static double* g_qfam_w;
+static double* g_beta_sum;
+static double* g_beta_ssq;
+static uint32_t* g_beta_fail_cts;
+static uintptr_t g_cur_perm_ct;
+static double g_qt_sum_all;
+static double g_qt_ssq_all;
+static uint32_t g_test_type;
+static uint32_t g_qfam_thread_ct;
+static uint32_t g_fs_ct;
+static uint32_t g_singleton_ct;
+static uint32_t g_lm_ct;
+static uint32_t g_family_ct;
+static uint32_t g_block_size;
+static uint32_t g_perms_done;
+static uint32_t g_first_adapt_check; // commented-out dfam(): the larger of apip->min and apip->init_interval
+static double g_adaptive_intercept; // commented-out dfam(): apip->init_interval
+static double g_adaptive_slope; // commented-out dfam(): apip->interval_slope
+static double g_aperm_alpha; // commented-out dfam(): apip->alpha
+static double g_adaptive_ci_zt; // commented-out dfam(): ltqnorm(1 - apip->beta / (2 * marker_ct))
+
+// Parental A1 allele count, indexed by paternal_geno * 4 + maternal_geno (2-bit
+// codes).  Tried encoding this in a single 32-bit integer, but that appears to be slower.
+const uint8_t dfam_allele_ct_table[] =
+{0, 0, 3, 0,  // paternal code 0 (hom A1): nonzero only with a het mother
+ 0, 0, 0, 0,  // paternal code 1 (missing): always 0
+ 3, 0, 2, 1,  // paternal code 2 (het): 0 only when the mother is missing
+ 0, 0, 1, 0}; // paternal code 3 (hom A2): nonzero only with a het mother
+
+// Accumulates one sibship's case A1 count, (observed - expected), variance and
+// expectation into the four output sums.  Does nothing when cur_ctrl_ct is 0.
+void dfam_sibship_calc(uint32_t cur_case_ct, uint32_t case_hom_a1_ct, uint32_t case_het_ct, uint32_t cur_ctrl_ct, uint32_t ctrl_hom_a1_ct, uint32_t ctrl_het_ct, uint32_t* total_a1_count_ptr, double* numer_ptr, double* denom_ptr, double* total_expected_ptr) {
+  if (!cur_ctrl_ct) {
+    return; // no controls to compare against; outputs are left untouched
+  }
+  uint32_t hom_a1_ct = case_hom_a1_ct + ctrl_hom_a1_ct;
+  uint32_t het_ct = case_het_ct + ctrl_het_ct;
+  uint32_t total_ct = cur_case_ct + cur_ctrl_ct;
+  uint32_t case_a1_ct = 2 * case_hom_a1_ct + case_het_ct;
+  *total_a1_count_ptr += case_a1_ct;
+  if (((!hom_a1_ct) && (!het_ct)) || (het_ct == total_ct) || (hom_a1_ct == total_ct)) {
+    *total_expected_ptr += (double)((int32_t)case_a1_ct);
+    return; // all genotypes identical: expected == observed, zero variance
+  }
+  double hom_a1_ctd = (double)((int32_t)hom_a1_ct);
+  double het_ctd = (double)((int32_t)het_ct);
+  double case_ctd = (double)((int32_t)cur_case_ct);
+  double ctrl_ctd = (double)((int32_t)cur_ctrl_ct);
+  double total_ctd = (double)((int32_t)total_ct);
+  double case_proportion = case_ctd * (1.0 / total_ctd);
+  double case_ctrl_div_xxxm1 = case_proportion * ctrl_ctd / (total_ctd * (total_ctd - 1));
+  double case_var_hom_a1 = case_ctrl_div_xxxm1 * hom_a1_ctd * (total_ctd - hom_a1_ctd);
+  double case_var_het = case_ctrl_div_xxxm1 * het_ctd * (total_ctd - het_ctd);
+  double case_neg_covar = case_ctrl_div_xxxm1 * hom_a1_ctd * het_ctd; // = -Cov(hom, het), a nonnegative value
+  double case_expected_a1_ct = 2 * (case_proportion * hom_a1_ctd) + case_proportion * het_ctd;
+  // Var(2*hom + het) = 4*Var(hom) + Var(het) + 4*Cov(hom, het); Cov is -case_neg_covar
+  double case_var_a1_ct = 4 * (case_var_hom_a1 - case_neg_covar) + case_var_het;
+  *numer_ptr += (double)((int32_t)case_a1_ct) - case_expected_a1_ct;
+  *denom_ptr += case_var_a1_ct;
+  *total_expected_ptr += case_expected_a1_ct;
+}
+
+int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct_orig, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude,  [...]
+  logprint("Error: --dfam is currently under development.\n");
+  return RET_CALC_NOT_YET_SUPPORTED;
+  /*
+  unsigned char* wkspace_mark = wkspace_base;
+  FILE* outfile = NULL;
+  FILE* outfile_msa = NULL;
+  char* textbuf = tbuf;
+  uintptr_t marker_ct_orig_autosomal = marker_ct_orig;
+  uintptr_t unfiltered_marker_ctl = (unfiltered_marker_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
+  uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
+  uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
+  uintptr_t unfiltered_sample_ctp1l2 = 1 + (unfiltered_sample_ct / BITCT2);
+  uintptr_t final_mask = get_final_mask(unfiltered_sample_ct);
+  uintptr_t* marker_exclude_orig_autosomal = marker_exclude_orig;
+  uintptr_t* founder_pnm = NULL;
+  double* orig_chisq = NULL;
+  uint32_t unfiltered_sample_ctl2m1 = (unfiltered_sample_ct - 1) / BITCT2;
+  uint32_t multigen = (fam_ip->mendel_modifier / MENDEL_MULTIGEN) & 1;
+  uint32_t is_set_test = fam_ip->dfam_modifier & DFAM_SET_TEST;
+  uint32_t perm_adapt_nst = (fam_ip->dfam_modifier & DFAM_PERM) && (!is_set_test);
+  uint32_t perm_maxt_nst = (fam_ip->dfam_modifier & DFAM_MPERM) && (!is_set_test);
+  uint32_t do_perms = fam_ip->dfam_modifier & (DFAM_PERM | DFAM_MPERM);
+  uint32_t do_perms_nst = do_perms && (!is_set_test);
+  uint32_t perm_count = fam_ip->dfam_modifier & DFAM_PERM_COUNT;
+  uint32_t fill_orig_chisq = do_perms || mtest_adjust;
+  uint32_t no_unrelateds = (fam_ip->dfam_modifier & DFAM_NO_UNRELATEDS) || (within_cmdflag && (!cluster_ct));
+  uint32_t family_all_case_children_ct = 0;
+  uint32_t family_mixed_ct = 0;
+  uint32_t sibship_mixed_ct = 0;
+  uint32_t unrelated_cluster_ct = 0;
+  uint32_t pct = 0;
+  uint32_t max_thread_ct = g_thread_ct;
+  uint32_t perm_pass_idx = 0;
+  uint32_t perms_total = 0;
+  uint32_t perms_done = 0;
+  int32_t retval = 0;
+  uintptr_t* pheno_nm;
+  uintptr_t* dfam_pheno_c;
+  uintptr_t* loadbuf_raw;
+  uintptr_t* loadbuf_ptr;
+  uintptr_t* workbuf;
+  uintptr_t* marker_exclude;
+  uintptr_t* dfam_sample_exclude;
+  double* maxt_extreme_stat = NULL;
+  uint32_t mu_table[MODEL_BLOCKSIZE];
+  char* outname_end2;
+  char* wptr_start;
+  char* wptr;
+  uint64_t* family_list;
+  uint64_t* trio_list;
+  uint32_t* trio_error_lookup;
+  uint32_t* fs_starts;
+  uint32_t* fss_contents;
+  uint32_t* sample_to_fss_idx;
+  uint32_t* dfam_iteration_order;
+  uint32_t* idx_to_uidx;
+  uint32_t* uidx_to_idx;
+  uint32_t* sample_to_cluster;
+  uint32_t* cluster_ctrl_case_cts;
+  uint32_t* cluster_write_idxs;
+  uint32_t* cur_dfam_ptr;
+  uintptr_t marker_ct;
+  uintptr_t marker_uidx; // loading
+  uintptr_t marker_uidx2; // writing
+  uintptr_t trio_ct;
+  uintptr_t max_fid_len;
+  uintptr_t ulii;
+  double numer;
+  double denom;
+  double total_expected;
+  double case_proportion;
+  double case_expected_a1_ct;
+  double case_var_a1_ct;
+  double dxx;
+  uint32_t family_ct;
+  uint32_t fs_ct;
+  uint32_t sample_uidx;
+  uint32_t sample_idx;
+  uint32_t fs_idx;
+  uint32_t fssc_start;
+  uint32_t fssc_end;
+  uint32_t fssc_idx;
+  uint32_t unrelated_cluster_idx;
+  uint32_t write_idx;
+  uint32_t cur_ctrl_ct;
+  uint32_t cur_case_ct;
+  uint32_t dfam_sample_ct;
+  uint32_t dfam_sample_ctl2;
+  uint32_t chrom_fo_idx;
+  uint32_t chrom_idx;
+  uint32_t chrom_end;
+  uint32_t block_size;
+  uint32_t block_end;
+  uint32_t marker_bidx;
+  uint32_t marker_unstopped_ct;
+  uint32_t loop_end;
+  uint32_t marker_idx;
+  uint32_t marker_idx2;
+  uint32_t paternal_id;
+  uint32_t maternal_id;
+  uint32_t paternal_geno;
+  uint32_t maternal_geno;
+  uint32_t sibling_ct;
+  uint32_t parental_a1_ct;
+  uint32_t sib_idx;
+  uint32_t cur_geno;
+  uint32_t case_a1_ct;
+  uint32_t quad_denom;
+  uint32_t total_count;
+  uint32_t twice_total_expected;
+  uint32_t case_hom_a1_ct;
+  uint32_t case_het_ct;
+  uint32_t ctrl_hom_a1_ct;
+  uint32_t ctrl_het_ct;
+  uint32_t hom_a1_ct;
+  uint32_t het_ct;
+  uint32_t uii;
+  uint32_t ujj;
+  int32_t twice_numer;
+  uii = count_non_autosomal_markers(chrom_info_ptr, marker_exclude_orig, 1, 1);
+  if (uii) {
+    LOGPRINTF("Excluding %u X/MT/haploid variant%s from DFAM test.\n", uii, (uii == 1)? "" : "s");
+    if (uii == marker_ct_orig_autosomal) {
+      logprint("Error: No variants remaining for DFAM analysis.\n");
+      goto dfam_ret_INVALID_CMDLINE;
+    }
+    marker_ct_orig_autosomal -= uii;
+    if (wkspace_alloc_ul_checked(&marker_exclude_orig_autosomal, unfiltered_marker_ctl * sizeof(intptr_t))) {
+      goto dfam_ret_NOMEM;
+    }
+    memcpy(marker_exclude_orig_autosomal, marker_exclude_orig, unfiltered_marker_ctl * sizeof(intptr_t));
+    for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++) {
+      chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
+      if (is_set(chrom_info_ptr->haploid_mask, chrom_idx) || ((int32_t)chrom_idx == chrom_info_ptr->mt_code)) {
+	uii = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx];
+	fill_bits(marker_exclude_orig_autosomal, uii, chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1] - uii);
+      }
+    }
+  } else if (is_set(chrom_info_ptr->haploid_mask, 0)) {
+    logprint("Error: DFAM test does not support haploid data.\n");
+    goto dfam_ret_INVALID_CMDLINE;
+  }
+  uii = popcount_longs_exclude(pheno_c, sample_exclude, unfiltered_sample_ct);
+  if (!uii) {
+    logprint("Error: DFAM test requires at least one case.\n");
+    goto dfam_ret_INVALID_CMDLINE;
+  }
+  marker_exclude = marker_exclude_orig_autosomal;
+  marker_ct = marker_ct_orig_autosomal;
+
+  // PLINK 1.07 treats missing phenotypes as controls here
+  if (wkspace_alloc_ul_checked(&pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t))) {
+    goto dfam_ret_NOMEM;
+  }
+  bitfield_exclude_to_include(sample_exclude, pheno_nm, unfiltered_sample_ct);
+  if (is_set_test) {
+    if (wkspace_alloc_ul_checked(&founder_pnm, unfiltered_sample_ctl * sizeof(intptr_t))) {
+      goto dfam_ret_NOMEM;
+    }
+    memcpy(founder_pnm, pheno_nm, unfiltered_sample_ctl * sizeof(intptr_t));
+    bitfield_and(founder_pnm, founder_info, unfiltered_sample_ctl);
+    if (extract_set_union_unfiltered(sip, NULL, unfiltered_marker_ct, marker_exclude_orig_autosomal, &marker_exclude, &marker_ct)) {
+      goto dfam_ret_NOMEM;
+    }
+  }
+
+  // no --mendel-duos support for now
+  retval = get_trios_and_families(unfiltered_sample_ct, sample_exclude, sample_ct, founder_info, sex_nm, sex_male, sample_ids, max_sample_id_len, paternal_ids, max_paternal_id_len, maternal_ids, max_maternal_id_len, NULL, &max_fid_len, NULL, NULL, &family_list, &family_ct, &trio_list, &trio_ct, &trio_error_lookup, 0, multigen);
+  if (retval) {
+    goto dfam_ret_1;
+  }
+#ifdef __LP64__
+  if ((12 * sample_ct + 2 * family_ct) > 0xffffffffLLU) {
+    logprint("Error: Too many samples and families for DFAM test.\n");
+    goto dfam_ret_INVALID_CMDLINE;
+  }
+#endif
+  if (get_sibship_info(unfiltered_sample_ct, sample_exclude, sample_ct, pheno_nm, founder_info, sample_ids, max_sample_id_len, max_fid_len, paternal_ids, max_paternal_id_len, maternal_ids, max_maternal_id_len, family_list, trio_list, family_ct, trio_ct, 0, NULL, NULL, &fs_starts, &fss_contents, &sample_to_fss_idx, &fs_ct, NULL, NULL)) {
+    goto dfam_ret_NOMEM;
+  }
+  // Prepare final family, sibship, and unrelated cluster data structures.
+  // * Families with at least one affected child are processed using regular
+  //   TDT logic when possible; however, when both parents have homozygous
+  //   calls, or they aren't both genotyped, we fall back on sibship logic.
+  //   (Families with no affected children are entirely excluded from the
+  //   test.)
+  // * Only sibships with at least one affected child and one unaffected child
+  //   are considered.  (I.e. the sibship fallback never applies to families
+  //   with only affected children.)
+  // * Only unrelated clusters with at least one affected and one unaffected
+  //   member are considered.
+  // The data structures are optimized for the permutation test, since the
+  // computation is nearly I/O-bound without it.  Phenotypes are permuted
+  // within each sibship/unrelated cluster, while transmitted alleles are
+  // permuted in case-containing families.
+  if (wkspace_alloc_ul_checked(&dfam_sample_exclude, unfiltered_sample_ctl * sizeof(intptr_t)) ||
+      // shrink this later
+      wkspace_alloc_ui_checked(&dfam_iteration_order, (sample_ct + (sample_ct / 2)) * sizeof(int32_t)) ||
+      wkspace_alloc_ui_checked(&idx_to_uidx, sample_ct * sizeof(int32_t))) {
+    goto dfam_ret_NOMEM;
+  }
+  fill_all_bits(dfam_sample_exclude, unfiltered_sample_ct);
+  fill_idx_to_uidx(sample_exclude, unfiltered_sample_ct, sample_ct, idx_to_uidx);
+  cur_dfam_ptr = dfam_iteration_order;
+  for (fs_idx = 0; fs_idx < family_ct; fs_idx++) {
+    // Scan for families with only case children.
+    fssc_start = fs_starts[fs_idx] + 2;
+    fssc_end = fs_starts[fs_idx + 1];
+    cur_case_ct = 0;
+    for (fssc_idx = fssc_start; fssc_idx < fssc_end; fssc_idx++) {
+      cur_case_ct += is_set(pheno_c, idx_to_uidx[fss_contents[fssc_idx]]);
+    }
+    if (cur_case_ct == fssc_end - fssc_start) {
+      family_all_case_children_ct++;
+      // Could point to fss_contents, but I assume it's a better idea to
+      // optimize the inner loop for data locality and linear access.
+      // These family entries are temporarily stored as:
+      // [0-1]: parent uidxs
+      // [2]: number of children
+      // [3...]: child uidxs
+      // We collapse the indexes again later.
+      sample_uidx = idx_to_uidx[fss_contents[fssc_start - 2]];
+      clear_bit(dfam_sample_exclude, sample_uidx);
+      *cur_dfam_ptr++ = sample_uidx;
+
+      sample_uidx = idx_to_uidx[fss_contents[fssc_start - 1]];
+      clear_bit(dfam_sample_exclude, sample_uidx);
+      *cur_dfam_ptr++ = sample_uidx;
+
+      *cur_dfam_ptr++ = cur_case_ct;
+      for (fssc_idx = fssc_start; fssc_idx < fssc_end; fssc_idx++) {
+	sample_uidx = idx_to_uidx[fss_contents[fssc_idx]];
+	clear_bit(dfam_sample_exclude, sample_uidx);
+	*cur_dfam_ptr++ = sample_uidx;
+      }
+    }
+  }
+  for (fs_idx = 0; fs_idx < family_ct; fs_idx++) {
+    // Scan for families with at least one case and one control child.
+    fssc_start = fs_starts[fs_idx] + 2;
+    fssc_end = fs_starts[fs_idx + 1];
+    cur_case_ct = 0;
+    for (fssc_idx = fssc_start; fssc_idx < fssc_end; fssc_idx++) {
+      cur_case_ct += is_set(pheno_c, idx_to_uidx[fss_contents[fssc_idx]]);
+    }
+    if (cur_case_ct && (cur_case_ct != fssc_end - fssc_start)) {
+      family_mixed_ct++;
+      sample_uidx = idx_to_uidx[fss_contents[fssc_start - 2]];
+      clear_bit(dfam_sample_exclude, sample_uidx);
+      *cur_dfam_ptr++ = sample_uidx;
+
+      sample_uidx = idx_to_uidx[fss_contents[fssc_start - 1]];
+      clear_bit(dfam_sample_exclude, sample_uidx);
+      *cur_dfam_ptr++ = sample_uidx;
+
+      *cur_dfam_ptr++ = fssc_end - fssc_start;
+      for (fssc_idx = fssc_start; fssc_idx < fssc_end; fssc_idx++) {
+	sample_uidx = idx_to_uidx[fss_contents[fssc_idx]];
+	clear_bit(dfam_sample_exclude, sample_uidx);
+	*cur_dfam_ptr++ = sample_uidx;
+      }
+    }
+  }
+  for (; fs_idx < fs_ct; fs_idx++) {
+    // Scan for sibships with at least one case and one control.
+    fssc_start = fs_starts[fs_idx];
+    fssc_end = fs_starts[fs_idx + 1];
+    cur_case_ct = 0;
+    for (fssc_idx = fssc_start; fssc_idx < fssc_end; fssc_idx++) {
+      cur_case_ct += is_set(pheno_c, idx_to_uidx[fss_contents[fssc_idx]]);
+    }
+    if (cur_case_ct && (cur_case_ct != fssc_end - fssc_start)) {
+      sibship_mixed_ct++;
+      // [0]: sibling ct
+      // [1...]: member uidxs
+      *cur_dfam_ptr++ = fssc_end - fssc_start;
+      for (fssc_idx = fssc_start; fssc_idx < fssc_end; fssc_idx++) {
+	sample_uidx = idx_to_uidx[fss_contents[fssc_idx]];
+	clear_bit(dfam_sample_exclude, sample_uidx);
+	*cur_dfam_ptr++ = sample_uidx;
+      }
+    }
+  }
+  if (!no_unrelateds) {
+    if (wkspace_alloc_ui_checked(&sample_to_cluster, sample_ct * sizeof(int32_t))) {
+      goto dfam_ret_NOMEM;
+    }
+    // --within on an empty file actually causes --dfam to behave differently
+    // than no --within at all in PLINK 1.07.  Replicate this for now.
+    if (within_cmdflag) {
+      if (fill_sample_to_cluster(unfiltered_sample_ct, sample_exclude, sample_ct, cluster_ct, cluster_map, cluster_starts, sample_to_cluster, NULL)) {
+	goto dfam_ret_NOMEM;
+      }
+    } else {
+      // Start everyone in the same cluster.
+      fill_uint_zero(sample_to_cluster, sample_ct);
+      cluster_ct = 1;
+    }
+    for (sample_idx = 0; sample_idx < sample_ct; sample_idx++) {
+      // Remove families and sibships.
+      if (sample_to_fss_idx[sample_idx] != 0xffffffffU) {
+	sample_to_cluster[sample_idx] = 0xffffffffU;
+      }
+    }
+
+    if (wkspace_alloc_ui_checked(&cluster_ctrl_case_cts, cluster_ct * 2 * sizeof(int32_t)) ||
+        wkspace_alloc_ui_checked(&cluster_write_idxs, cluster_ct * sizeof(int32_t))) {
+      goto dfam_ret_NOMEM;
+    }
+    fill_uint_zero(cluster_ctrl_case_cts, 2 * cluster_ct);
+    for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
+      unrelated_cluster_idx = sample_to_cluster[sample_idx];
+      if (unrelated_cluster_idx != 0xffffffffU) {
+	cluster_ctrl_case_cts[2 * unrelated_cluster_idx + is_set(pheno_c, sample_uidx)] += 1;
+      }
+    }
+    // Construct reduced clusters -> samples map.
+    write_idx = 0;
+    for (unrelated_cluster_idx = 0; unrelated_cluster_idx < cluster_ct; unrelated_cluster_idx++) {
+      cur_ctrl_ct = cluster_ctrl_case_cts[2 * unrelated_cluster_idx];
+      cur_case_ct = cluster_ctrl_case_cts[2 * unrelated_cluster_idx + 1];
+      if (cur_ctrl_ct && cur_case_ct) {
+	unrelated_cluster_ct++;
+	cur_dfam_ptr[write_idx++] = cur_ctrl_ct + cur_case_ct;
+	cluster_write_idxs[unrelated_cluster_idx] = write_idx;
+	write_idx += cur_ctrl_ct + cur_case_ct;
+      }
+    }
+    for (sample_uidx = 0, sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
+      next_unset_unsafe_ck(sample_exclude, &sample_uidx);
+      unrelated_cluster_idx = sample_to_cluster[sample_idx];
+      if (unrelated_cluster_idx != 0xffffffffU) {
+        cur_ctrl_ct = cluster_ctrl_case_cts[2 * unrelated_cluster_idx];
+	cur_case_ct = cluster_ctrl_case_cts[2 * unrelated_cluster_idx + 1];
+	if (cur_ctrl_ct && cur_case_ct) {
+	  uii = cluster_write_idxs[unrelated_cluster_idx];
+	  cur_dfam_ptr[uii] = sample_uidx;
+	  clear_bit(dfam_sample_exclude, sample_uidx);
+	  cluster_write_idxs[unrelated_cluster_idx] = uii + 1;
+	}
+      }
+    }
+    cur_dfam_ptr = &(cur_dfam_ptr[write_idx]);
+  }
+  wkspace_reset((unsigned char*)idx_to_uidx);
+  wkspace_shrink_top(dfam_iteration_order, (cur_dfam_ptr - dfam_iteration_order) * sizeof(int32_t));
+  if (do_perms) {
+    logprint("Error: --dfam permutation tests are currently under development.\n");
+    retval = RET_CALC_NOT_YET_SUPPORTED;
+    goto dfam_ret_1;
+  }
+  if (mtest_adjust || do_perms) {
+    if (wkspace_alloc_d_checked(&orig_chisq, marker_ct * sizeof(double))) {
+      goto dfam_ret_NOMEM;
+    }
+  }
+  dfam_sample_ct = unfiltered_sample_ct - popcount_longs(dfam_sample_exclude, unfiltered_sample_ctl);
+  dfam_sample_ctl2 = (dfam_sample_ct + (BITCT2 - 1)) / BITCT2;
+  if (wkspace_alloc_ui_checked(&uidx_to_idx, unfiltered_sample_ct * sizeof(int32_t))) {
+    goto dfam_ret_NOMEM;
+  }
+  fill_uidx_to_idx(dfam_sample_exclude, unfiltered_sample_ct, dfam_sample_ct, uidx_to_idx);
+  cur_dfam_ptr = dfam_iteration_order;
+  uii = family_all_case_children_ct + family_mixed_ct;
+  for (fs_idx = 0; fs_idx < uii; fs_idx++) {
+    *cur_dfam_ptr = uidx_to_idx[*cur_dfam_ptr];
+    cur_dfam_ptr++;
+    *cur_dfam_ptr = uidx_to_idx[*cur_dfam_ptr];
+    cur_dfam_ptr++;
+    sibling_ct = *cur_dfam_ptr++;
+    for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+      *cur_dfam_ptr = uidx_to_idx[*cur_dfam_ptr];
+      cur_dfam_ptr++;
+    }
+  }
+  uii = sibship_mixed_ct + unrelated_cluster_ct;
+  for (fs_idx = 0; fs_idx < uii; fs_idx++) {
+    sibling_ct = *cur_dfam_ptr++;
+    for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+      *cur_dfam_ptr = uidx_to_idx[*cur_dfam_ptr];
+      cur_dfam_ptr++;
+    }
+  }
+  // DEBUG
+  printf("%u %u %u %u\n", family_all_case_children_ct, family_mixed_ct, sibship_mixed_ct, unrelated_cluster_ct);
+  wkspace_reset((unsigned char*)uidx_to_idx);
+  if (wkspace_alloc_ul_checked(&dfam_pheno_c, dfam_sample_ctl2 * sizeof(intptr_t)) ||
+      wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
+      wkspace_alloc_ul_checked(&workbuf, unfiltered_sample_ctp1l2 * sizeof(intptr_t)) ||
+      wkspace_alloc_ul_checked(&g_loadbuf, MODEL_BLOCKSIZE * dfam_sample_ctl2 * sizeof(intptr_t))) {
+    goto dfam_ret_NOMEM;
+  }
+  collapse_copy_bitarr(sample_ct, pheno_c, dfam_sample_exclude, dfam_sample_ct, dfam_pheno_c);
+  g_pheno_c = dfam_pheno_c;
+  loadbuf_raw[unfiltered_sample_ctl2 - 1] = 0;
+  workbuf[unfiltered_sample_ctp1l2 - 1] = 0;
+  for (ulii = 1; ulii <= MODEL_BLOCKSIZE; ulii++) {
+    // defensive
+    g_loadbuf[dfam_sample_ctl2 * ulii - 1] = 0;
+  }
+  // no X/haploid/MT, so no haploid filters
+
+  if (fill_orig_chisq) {
+    if (wkspace_alloc_d_checked(&g_orig_stat, marker_ct * sizeof(double))) {
+      goto dfam_ret_NOMEM;
+    }
+  }
+
+  ulii = 2 * max_marker_allele_len + plink_maxsnp + MAX_ID_LEN + 256;
+  if (ulii > MAXLINELEN) {
+    if (wkspace_alloc_c_checked(&textbuf, ulii)) {
+      goto dfam_ret_NOMEM;
+    }
+  }
+
+  // permutation test boilerplate mostly copied from qassoc() in plink_assoc.c,
+  // since it's also restricted to autosomes
+  g_mperm_save_all = NULL;
+  if (perm_maxt_nst) {
+    perms_total = fam_ip->dfam_mperm_val;
+    if (wkspace_alloc_d_checked(&maxt_extreme_stat, perms_total * sizeof(double))) {
+      goto dfam_ret_NOMEM;
+    }
+    g_maxt_extreme_stat = maxt_extreme_stat;
+    fill_double_zero(maxt_extreme_stat, perms_total);
+    if (mperm_save & MPERM_DUMP_ALL) {
+      memcpy(outname_end, ".mperm.dump.all", 16);
+      if (fopen_checked(&outfile_msa, outname, "w")) {
+        goto dfam_ret_OPEN_FAIL;
+      }
+      if (putc_checked('0', outfile_msa)) {
+	goto dfam_ret_WRITE_FAIL;
+      }
+      LOGPRINTF("Dumping all permutation chi-square values to %s .\n", outname);
+    }
+  } else {
+    mperm_save = 0;
+    if (perm_adapt_nst) {
+      g_aperm_alpha = apip->alpha;
+      perms_total = apip->max;
+      if (wkspace_alloc_ui_checked(&g_perm_attempt_ct, marker_ct * sizeof(int32_t)) ||
+          wkspace_alloc_uc_checked(&g_perm_adapt_stop, marker_ct)) {
+        goto dfam_ret_NOMEM;
+      }
+      ujj = apip->max;
+      for (uii = 0; uii < marker_ct; uii++) {
+	g_perm_attempt_ct[uii] = ujj;
+      }
+      fill_ulong_zero((uintptr_t*)g_perm_adapt_stop, (marker_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
+      g_adaptive_ci_zt = ltqnorm(1 - apip->beta / (2.0 * ((intptr_t)marker_ct)));
+      if (apip->min < apip->init_interval) {
+        g_first_adapt_check = (int32_t)(apip->init_interval);
+      } else {
+	g_first_adapt_check = apip->min;
+      }
+      g_adaptive_intercept = apip->init_interval;
+      g_adaptive_slope = apip->interval_slope;
+    }
+  }
+
+  outname_end2 = memcpyb(outname_end, ".dfam", 6);
+  if (fopen_checked(&outfile, outname, "w")) {
+    goto dfam_ret_OPEN_FAIL;
+  }
+  LOGPRINTFWW5("Writing --dfam results to %s ... ", outname);
+  fflush(stdout);
+  sprintf(textbuf, " CHR %%%us   A1   A2      OBS      EXP        CHISQ            P \n", plink_maxsnp);
+  fprintf(outfile, textbuf, "SNP");
+  loop_end = marker_ct / 100;
+  marker_unstopped_ct = marker_ct;
+
+  if (do_perms) {
+    if (fam_ip->dfam_modifier & DFAM_PERM) {
+      if (perm_batch_size > apip->max) {
+        perm_batch_size = apip->max;
+      }
+    } else {
+      if (perm_batch_size > fam_ip->dfam_mperm_val) {
+        perm_batch_size = fam_ip->dfam_mperm_val;
+      }
+    }
+  }
+  
+  fputs("0%", stdout);
+  fflush(stdout);
+  // ----- begin main loop -----
+ dfam_more_perms:
+  if (do_perms_nst) {
+    if (!perm_pass_idx) {
+      // ...
+    }
+  }
+  chrom_fo_idx = 0xffffffffU;
+  marker_uidx = next_unset_unsafe(marker_exclude, 0);
+  if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
+    goto dfam_ret_READ_FAIL;
+  }
+  marker_idx = 0;
+  marker_idx2 = 0;
+  chrom_end = 0;
+  do {
+    // since X/haploid/MT is not supported, ignore chromosome boundaries in
+    // this loop
+    block_size = 0;
+    block_end = marker_unstopped_ct - marker_idx;
+    if (block_end > MODEL_BLOCKSIZE) {
+      block_end = MODEL_BLOCKSIZE;
+    }
+    do {
+      if (perm_adapt_nst && g_perm_adapt_stop[marker_idx2]) {
+        do {
+	  marker_uidx++;
+	  next_unset_ul_unsafe_ck(marker_exclude, &marker_uidx);
+	  marker_idx2++;
+	} while (g_perm_adapt_stop[marker_idx2]);
+	if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
+	  goto dfam_ret_READ_FAIL;
+	}
+      }
+      if (load_raw2(bedfile, loadbuf_raw, unfiltered_sample_ct4, unfiltered_sample_ctl2m1, final_mask)) {
+	goto dfam_ret_READ_FAIL;
+      }
+      if (IS_SET(marker_reverse, marker_uidx)) {
+	reverse_loadbuf((unsigned char*)loadbuf_raw, unfiltered_sample_ct);
+      }
+      erase_mendel_errors(unfiltered_sample_ct, loadbuf_raw, workbuf, trio_error_lookup, trio_ct, multigen);
+      collapse_copy_2bitarr(loadbuf_raw, &(g_loadbuf[block_size * dfam_sample_ctl2]), unfiltered_sample_ct, dfam_sample_ct, dfam_sample_exclude);
+      if (perm_adapt_nst) {
+	g_adapt_m_table[block_size] = marker_idx2++;
+      }
+      mu_table[block_size++] = marker_uidx;
+      if (marker_idx + block_size == marker_unstopped_ct) {
+	break;
+      }
+      marker_uidx++;
+      if (IS_SET(marker_exclude, marker_uidx)) {
+	marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
+	if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
+	  goto dfam_ret_READ_FAIL;
+	}
+      }
+    } while (block_size < block_end);
+    if (!perm_pass_idx) {
+      // Calculate original chi-square values and write to disk:
+      // 1. Iterate through nuclear families with only case children.  If both
+      //    parents are not heterozygous, either parent has a missing call, or
+      //    all children have missing calls, skip.  Otherwise,
+      //      twice_numer          += 2 * [A1 allele count among kids] -
+      //                              ([# of kids] * [parental A1 allele ct])
+      //      quad_denom           += # of het parents
+      //      total_count          += [A1 allele count among kids]
+      //      twice_total_expected += [# of kids] * [parental A1 allele ct]
+      // 2. Iterate through nuclear families with at least one case and at
+      //    least one control child.  If all case children have missing calls,
+      //    skip.  Otherwise, if both parents are not heterozygous, or either
+      //    parent has a missing call, handle the children as in step 3.
+      //    Otherwise,
+      //      twice_numer          += 2 * [A1 allele count among case kids] -
+      //                              ([# of case kids] * [parental A1 ct])
+      //      quad_denom           += # of het parents
+      //      total_count          += [A1 allele count among case kids]
+      //      twice_total_expected += [# of case kids] * [parental A1 ct]
+      // 3. Iterate through sibships.  If all case siblings, or all control
+      //    siblings, have missing genotypes, skip.  Otherwise (see lines
+      //    420-456 of PLINK 1.07 dfam.cpp),
+      //      case_expected_hom_a1 := [case sib ct] * [sib hom A1 ct] /
+      //                              [sib ct]
+      //      case_expected_het    := [case sib ct] * [sib het ct] / [sib ct]
+      //      case_var_hom_a1      := ([case sib ct] * [ctrl sib ct] *
+      //                               [sib hom A1 ct] * [sib non-hom-A1]) /
+      //                              ([sib ct] * [sib ct] * ([sib ct - 1]))
+      //      case_var_het         := ([case sib ct] * [ctrl sib ct] *
+      //                               [sib het ct] * [sib non-het]) /
+      //                              ([sib ct] * [sib ct] * ([sib ct - 1]))
+      //      case_neg_covar       := ([case sib ct] * [ctrl sib ct] *
+      //      (between case hom a1     [sib hom A1 ct] * [sib het ct]) /
+      //       and case het cts)      ([sib ct] * [sib ct] * ([sib ct] - 1))
+      //      case_expected_a1_ct  := 2 * case_expected_hom_a1 +
+      //                              case_expected_het
+      //      case_var_a1_ct       := 4 * case_var_hom_a1 + case_var_het +
+      //                              4 * case_neg_covar
+      //      numer          += case_a1_ct - case_expected_a1_ct
+      //      denom          += case_var_a1_ct
+      //      total_count    += case_a1_ct
+      //      total_expected += case_expected_a1_ct
+      //    Shortcut when all genotypes are identical (this is common):
+      //      total_count    += case_a1_ct
+      //      total_expected += case_a1_ct
+      //      We could entirely skip this instead, but that would lead to a
+      //      different output file than 1.07.
+      // 4. Iterate through clusters of unrelateds.  If all genotypes are
+      //    missing or identical, skip.  Otherwise (see lines 557-571 of
+      //    dfam.cpp),
+      //      case_expected_a1_ct := [case ct] * [A1 ct] / [cluster size]
+      //      case_var_a1_ct      := ([case ct] * [ctrl ct]
+      //                              [A1 ct] * [A2 ct]) /
+      //                             (([clst size]^2) * ([clst size] - 1))
+      //      numer          += case_a1_ct - case_expected_a1_ct
+      //      denom          += case_var_a1_ct
+      //      total_count    += case_a1_ct
+      //      total_expected += case_expected_a1_ct
+      for (marker_bidx = 0; marker_bidx < block_size; marker_bidx++) {
+	marker_uidx2 = mu_table[marker_bidx];
+	// marker_idx_to_uidx[marker_idx + marker_bidx] = marker_uidx2;
+	loadbuf_ptr = &(g_loadbuf[marker_bidx * dfam_sample_ctl2]);
+	cur_dfam_ptr = dfam_iteration_order;
+	twice_numer = 0;
+	quad_denom = 0;
+	total_count = 0;
+	numer = 0.0;
+	denom = 0.0;
+	twice_total_expected = 0;
+	total_expected = 0;
+	for (fs_idx = 0; fs_idx < family_all_case_children_ct; fs_idx++) {
+          paternal_id = *cur_dfam_ptr++;
+	  maternal_id = *cur_dfam_ptr++;
+	  sibling_ct = *cur_dfam_ptr++;
+	  paternal_geno = (loadbuf_ptr[paternal_id / BITCT2] >> (2 * (paternal_id % BITCT2))) & 3;
+	  maternal_geno = (loadbuf_ptr[maternal_id / BITCT2] >> (2 * (maternal_id % BITCT2))) & 3;
+	  parental_a1_ct = dfam_allele_ct_table[paternal_geno * 4 + maternal_geno];
+	  if (!parental_a1_ct) {
+	    cur_dfam_ptr = &(cur_dfam_ptr[sibling_ct]);
+	    continue;
+	  }
+	  cur_case_ct = 0;
+	  case_a1_ct = 0;
+          for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+            sample_idx = *cur_dfam_ptr++;
+	    cur_geno = (loadbuf_ptr[sample_idx / BITCT2] >> (2 * (sample_idx % BITCT2))) & 3;
+	    if (cur_geno == 1) {
+	      continue;
+	    }
+            cur_case_ct++;
+	    case_a1_ct += (4 - cur_geno) / 2;
+	  }
+	  if (cur_case_ct) {
+	    twice_numer += (int32_t)(2 * case_a1_ct) - (int32_t)(cur_case_ct * parental_a1_ct);
+	    quad_denom += 2 - (parental_a1_ct & 1);
+	    total_count += case_a1_ct;
+	    twice_total_expected += cur_case_ct * parental_a1_ct;
+	  }
+	}
+	for (fs_idx = 0; fs_idx < family_mixed_ct; fs_idx++) {
+          paternal_id = *cur_dfam_ptr++;
+	  maternal_id = *cur_dfam_ptr++;
+	  sibling_ct = *cur_dfam_ptr++;
+	  paternal_geno = (loadbuf_ptr[paternal_id / BITCT2] >> (2 * (paternal_id % BITCT2))) & 3;
+	  maternal_geno = (loadbuf_ptr[maternal_id / BITCT2] >> (2 * (maternal_id % BITCT2))) & 3;
+	  parental_a1_ct = dfam_allele_ct_table[paternal_geno * 4 + maternal_geno];
+	  cur_case_ct = 0;
+	  cur_ctrl_ct = 0;
+	  case_hom_a1_ct = 0;
+	  case_het_ct = 0;
+	  ctrl_hom_a1_ct = 0;
+	  ctrl_het_ct = 0;
+          for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+            sample_idx = *cur_dfam_ptr++;
+	    cur_geno = (loadbuf_ptr[sample_idx / BITCT2] >> (2 * (sample_idx % BITCT2))) & 3;
+	    if (cur_geno == 1) {
+	      continue;
+	    }
+	    if (IS_SET(dfam_pheno_c, sample_idx)) {
+	      cur_case_ct++;
+	      if (cur_geno != 3) {
+		if (cur_geno == 2) {
+		  case_het_ct++;
+		} else {
+		  case_hom_a1_ct++;
+		}
+	      }
+	    } else {
+	      cur_ctrl_ct++;
+	      if (cur_geno != 3) {
+		if (cur_geno == 2) {
+                  ctrl_het_ct++;
+		} else {
+		  ctrl_hom_a1_ct++;
+		}
+	      }
+	    }
+	  }
+	  if (!cur_case_ct) {
+	    continue;
+	  }
+          if (!parental_a1_ct) {
+	    dfam_sibship_calc(cur_case_ct, case_hom_a1_ct, case_het_ct, cur_ctrl_ct, ctrl_hom_a1_ct, ctrl_het_ct, &total_count, &numer, &denom, &total_expected);
+	  } else {
+	    case_a1_ct = 2 * case_hom_a1_ct + case_het_ct;
+	    twice_numer += (int32_t)(2 * case_a1_ct) - (int32_t)(cur_case_ct * parental_a1_ct);
+	    quad_denom += 2 - (parental_a1_ct & 1);
+	    total_count += case_a1_ct;
+	    twice_total_expected += cur_case_ct * parental_a1_ct;
+	  }
+	}
+	numer += 0.5 * ((double)twice_numer);
+	denom += 0.25 * ((double)quad_denom);
+	total_expected += 0.5 * ((double)twice_total_expected);
+	for (fs_idx = 0; fs_idx < sibship_mixed_ct; fs_idx++) {
+	  sibling_ct = *cur_dfam_ptr++;
+	  cur_case_ct = 0;
+	  cur_ctrl_ct = 0;
+	  case_hom_a1_ct = 0;
+	  case_het_ct = 0;
+	  ctrl_hom_a1_ct = 0;
+	  ctrl_het_ct = 0;
+          for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+            sample_idx = *cur_dfam_ptr++;
+	    cur_geno = (loadbuf_ptr[sample_idx / BITCT2] >> (2 * (sample_idx % BITCT2))) & 3;
+	    if (cur_geno == 1) {
+	      continue;
+	    }
+	    if (IS_SET(dfam_pheno_c, sample_idx)) {
+	      cur_case_ct++;
+	      if (cur_geno != 3) {
+		if (cur_geno == 2) {
+		  case_het_ct++;
+		} else {
+		  case_hom_a1_ct++;
+		}
+	      }
+	    } else {
+	      cur_ctrl_ct++;
+	      if (cur_geno != 3) {
+		if (cur_geno == 2) {
+                  ctrl_het_ct++;
+		} else {
+		  ctrl_hom_a1_ct++;
+		}
+	      }
+	    }
+	  }
+	  if (!cur_case_ct) {
+	    continue;
+	  }
+	  dfam_sibship_calc(cur_case_ct, case_hom_a1_ct, case_het_ct, cur_ctrl_ct, ctrl_hom_a1_ct, ctrl_het_ct, &total_count, &numer, &denom, &total_expected);
+	}
+	for (unrelated_cluster_idx = 0; unrelated_cluster_idx < unrelated_cluster_ct; unrelated_cluster_idx++) {
+	  sibling_ct = *cur_dfam_ptr++; // not actually siblings
+	  cur_case_ct = 0;
+	  cur_ctrl_ct = 0;
+	  case_hom_a1_ct = 0;
+	  case_het_ct = 0;
+	  ctrl_hom_a1_ct = 0;
+	  ctrl_het_ct = 0;
+          for (sib_idx = 0; sib_idx < sibling_ct; sib_idx++) {
+            sample_idx = *cur_dfam_ptr++;
+	    cur_geno = (loadbuf_ptr[sample_idx / BITCT2] >> (2 * (sample_idx % BITCT2))) & 3;
+	    if (cur_geno == 1) {
+	      continue;
+	    }
+	    if (IS_SET(dfam_pheno_c, sample_idx)) {
+	      cur_case_ct++;
+	      if (cur_geno != 3) {
+		if (cur_geno == 2) {
+		  case_het_ct++;
+		} else {
+		  case_hom_a1_ct++;
+		}
+	      }
+	    } else {
+	      cur_ctrl_ct++;
+	      if (cur_geno != 3) {
+		if (cur_geno == 2) {
+		  ctrl_het_ct++;
+		} else {
+		  ctrl_hom_a1_ct++;
+		}
+	      }
+	    }
+	  }
+	  case_a1_ct = 2 * case_hom_a1_ct + case_het_ct;
+	  hom_a1_ct = case_hom_a1_ct + ctrl_hom_a1_ct;
+	  het_ct = case_het_ct + ctrl_het_ct;
+	  uii = cur_case_ct + cur_ctrl_ct;
+	  if ((uii <= 1) || ((!hom_a1_ct) && (!het_ct)) || (hom_a1_ct == uii) || (het_ct == uii)) {
+	    continue;
+	  }
+	  total_count += case_a1_ct;
+	  if ((!cur_case_ct) || (!cur_ctrl_ct)) {
+	    total_expected += (double)((int32_t)case_a1_ct);
+	    continue;
+	  }
+	  dxx = ((double)((int32_t)uii));
+	  case_proportion = ((double)((int32_t)cur_case_ct)) / dxx;
+	  ujj = 2 * hom_a1_ct + het_ct;
+	  case_expected_a1_ct = case_proportion * ((double)((int32_t)ujj));
+	  case_var_a1_ct = case_expected_a1_ct * cur_ctrl_ct * ((double)((int32_t)(2 * uii - ujj))) / (dxx * (dxx - 1));
+          numer += case_a1_ct - case_expected_a1_ct;
+	  denom += case_var_a1_ct;
+	  total_expected += case_expected_a1_ct;
+	}
+	printf("%g %g %u %g\n", numer, denom, total_count, total_expected);
+	if (marker_bidx == 2) {
+	  exit(1);
+	}
+      }
+    }
+    marker_idx += block_size;
+    if ((!perm_pass_idx) && (marker_idx >= loop_end)) {
+      if (marker_idx < marker_unstopped_ct) {
+	if (pct >= 10) {
+	  putchar('\b');
+	}
+        pct = (marker_idx * 100LLU) / marker_unstopped_ct;
+	printf("\b\b%u%%", pct);
+	fflush(stdout);
+	loop_end = (((uint64_t)pct + 1LLU) * marker_unstopped_ct) / 100;
+      }
+    }
+  } while (marker_idx < marker_unstopped_ct);
+  if (!perm_pass_idx) {
+    if (pct >= 10) {
+      putchar('\b');
+    }
+    fputs("\b\b", stdout);
+    logprint("done.\n");
+    if (do_perms_nst) {
+      // wkspace_reset();
+    }
+    if (fclose_null(&outfile)) {
+      goto dfam_ret_WRITE_FAIL;
+    }
+    if (!is_set_test) {
+      if (wkspace_alloc_ui_checked(&idx_to_uidx, marker_ct * sizeof(int32_t))) {
+	goto dfam_ret_NOMEM;
+      }
+      fill_idx_to_uidx(marker_exclude, unfiltered_marker_ct, marker_ct, idx_to_uidx);
+      retval = multcomp(outname, outname_end, idx_to_uidx, marker_ct, marker_ids, max_marker_id_len, plink_maxsnp, chrom_info_ptr, orig_chisq, pfilter, output_min_p, mtest_adjust, 0, adjust_lambda, NULL, NULL);
+      if (retval) {
+	goto dfam_ret_1;
+      }
+      wkspace_reset(idx_to_uidx);
+      // if (mperm_save & MPERM_DUMP_ALL) { ...
+    } else {
+      // retval = dfam_set_test(threads, bedfile, bed_offset, outname, outname_end, ...);
+      if (retval) {
+        goto dfam_ret_1;
+      }
+    }
+  }
+  if (do_perms_nst) {
+    // if (mperm_save & MPERM_DUMP_ALL) { ...
+    // wkspace_reset();
+    if (perms_done < perms_total) {
+    }
+  }
+  // ...
+  
+  while (0) {
+  dfam_ret_NOMEM:
+    retval = RET_NOMEM;
+    break;
+  dfam_ret_OPEN_FAIL:
+    retval = RET_OPEN_FAIL;
+    break;
+  dfam_ret_READ_FAIL:
+    retval = RET_READ_FAIL;
+    break;
+  dfam_ret_WRITE_FAIL:
+    retval = RET_WRITE_FAIL;
+    break;
+  dfam_ret_INVALID_CMDLINE:
+    retval = RET_INVALID_CMDLINE;
+    break;
+  }
+ dfam_ret_1:
+  wkspace_reset(wkspace_mark);
+  fclose_cond(outfile);
+  fclose_cond(outfile_msa);
+  return retval;
+  */
+}
+
 void uint32_permute(uint32_t* perm_arr, uint32_t* precomputed_mods, sfmt_t* sfmtp, uint32_t ct) {
   // Sets perm_arr[0..(ct-1)] to a random permutation of 0..(ct-1).  Assumes
   // ct >= 2.
@@ -2871,6 +3881,7 @@ void qfam_compute_bw(uintptr_t* loadbuf, uintptr_t sample_ct, uint32_t* fs_start
     } else {
       clear_bit(nm_fss, cur_idx);
     }
+    cur_start = cur_end;
   }
   for (; cur_idx < fss_ct; cur_idx++) {
     sample_uidx = *fss_ptr++;
@@ -3021,43 +4032,6 @@ static inline uint32_t qfam_regress(uint32_t test_type, uint32_t nind, uint32_t
   return 0;
 }
 
-// multithread globals
-static uintptr_t* g_loadbuf;
-static uintptr_t* g_lm_eligible;
-static uintptr_t* g_lm_within2_founder;
-static uintptr_t* g_qfam_flip;
-static uintptr_t* g_nm_fss;
-static uintptr_t* g_nm_lm;
-static uint32_t* g_qfam_permute;
-static uint32_t* g_permute_edit;
-static uint32_t* g_perm_2success_ct;
-static uint32_t* g_perm_attempt_ct;
-static uint32_t* g_fs_starts;
-static uint32_t* g_fss_contents;
-static uint32_t* g_sample_lm_to_fss_idx;
-static unsigned char* g_perm_adapt_stop;
-static uint32_t g_adapt_m_table[MODEL_BLOCKSIZE];
-static double* g_orig_stat;
-static double* g_pheno_d2;
-static double* g_qfam_b;
-static double* g_qfam_w;
-static uintptr_t g_cur_perm_ct;
-static double g_qt_sum_all;
-static double g_qt_ssq_all;
-static uint32_t g_test_type;
-static uint32_t g_qfam_thread_ct;
-static uint32_t g_fs_ct;
-static uint32_t g_singleton_ct;
-static uint32_t g_lm_ct;
-static uint32_t g_family_ct;
-static uint32_t g_block_size;
-static uint32_t g_perms_done;
-static uint32_t g_first_adapt_check;
-static double g_adaptive_intercept;
-static double g_adaptive_slope;
-static double g_aperm_alpha;
-static double g_adaptive_ci_zt;
-
 THREAD_RET_TYPE qfam_thread(void* arg) {
   uintptr_t tidx = (uintptr_t)arg;
   uint32_t qfam_thread_ct = g_qfam_thread_ct;
@@ -3077,6 +4051,8 @@ THREAD_RET_TYPE qfam_thread(void* arg) {
   double* qfam_b = &(g_qfam_b[tidx * CACHEALIGN32_DBL(fss_ct)]);
   double* qfam_w = &(g_qfam_w[tidx * CACHEALIGN32_DBL(lm_ct)]);
   double* pheno_d2 = g_pheno_d2;
+  double* beta_sum = g_beta_sum;
+  double* beta_ssq = g_beta_ssq;
   uint32_t* qfam_permute = only_within? NULL : g_qfam_permute;
   uint32_t* permute_edit_buf = only_within? NULL : (&(g_permute_edit[tidx * CACHEALIGN32_INT32(fss_ct)]));
   uint32_t* perm_2success_ct = g_perm_2success_ct;
@@ -3085,6 +4061,7 @@ THREAD_RET_TYPE qfam_thread(void* arg) {
   uint32_t* fss_contents = g_fss_contents;
   uint32_t* sample_lm_to_fss_idx = g_sample_lm_to_fss_idx;
   uint32_t* perm_ptr = NULL;
+  uint32_t* beta_fail_cts = g_beta_fail_cts;
   uintptr_t cur_perm_ct = g_cur_perm_ct;
   uintptr_t sample_ct = g_sample_ct;
   uintptr_t sample_ctl2 = (sample_ct + (BITCT2 - 1)) / BITCT2;
@@ -3111,6 +4088,8 @@ THREAD_RET_TYPE qfam_thread(void* arg) {
   double qt_ssq;
   double nind_recip;
   double beta;
+  double cur_beta_sum;
+  double cur_beta_ssq;
   double tstat;
   double pval;
   double dxx;
@@ -3122,6 +4101,7 @@ THREAD_RET_TYPE qfam_thread(void* arg) {
   uint32_t success_2start;
   uint32_t success_2incr;
   uint32_t next_adapt_check;
+  uint32_t cur_beta_fail_cts;
   uint32_t cur_fss_ct;
   uint32_t nind;
   uint32_t orig_fss_idx;
@@ -3159,6 +4139,9 @@ THREAD_RET_TYPE qfam_thread(void* arg) {
       if (only_within) {
 	flip_precalc(lm_ct, qfam_w, pheno_d2, nm_lm, &geno_sum, &geno_ssq, &qt_g_prod);
       }
+      cur_beta_sum = 0.0;
+      cur_beta_ssq = 0.0;
+      cur_beta_fail_cts = 0;
 
       for (pidx = 0; pidx < cur_perm_ct;) {
 	if (!only_within) {
@@ -3189,6 +4172,8 @@ THREAD_RET_TYPE qfam_thread(void* arg) {
 	  }
 	}
 	if (!qfam_regress(test_type, nind, lm_ct, sample_lm_to_fss_idx, nm_lm, pheno_d2, qfam_b, qfam_w, perm_ptr, &(qfam_flip[pidx * flip_ctl]), nind_recip, qt_sum, qt_ssq, geno_sum, geno_ssq, qt_g_prod, &beta, &tstat)) {
+	  cur_beta_sum += beta;
+	  cur_beta_ssq += beta * beta;
 	  tstat = fabs(tstat);
 	  if (tstat > stat_high) {
 	    success_2incr += 2;
@@ -3198,6 +4183,7 @@ THREAD_RET_TYPE qfam_thread(void* arg) {
 	} else {
 	  // conservative handling of permutation regression failure
 	  success_2incr += 2;
+	  cur_beta_fail_cts++;
 	}
 	if (++pidx == next_adapt_check - pidx_offset) {
 	  // won't ever get here with fixed number of permutations
@@ -3217,6 +4203,13 @@ THREAD_RET_TYPE qfam_thread(void* arg) {
 	}
       }
       perm_2success_ct[marker_idx] += success_2incr;
+      if (beta_sum) {
+	beta_sum[marker_idx] += cur_beta_sum;
+	beta_ssq[marker_idx] += cur_beta_ssq;
+	if (cur_beta_fail_cts) {
+	  beta_fail_cts[marker_idx] += cur_beta_fail_cts;
+	}
+      }
     }
   qfam_thread_skip_all:
     if ((!tidx) || g_is_last_thread_block) {
@@ -3241,6 +4234,7 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
   double geno_sum = 0.0;
   double geno_ssq = 0.0;
   double qt_g_prod = 0.0;
+  double* orig_beta = NULL;
   char* chrom_name_ptr = NULL;
   uint32_t unfiltered_sample_ctl2m1 = (unfiltered_sample_ct - 1) / BITCT2;
   uint32_t test_type = fam_ip->qfam_modifier & QFAM_TEST;
@@ -3248,6 +4242,7 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
   uint32_t multigen = (fam_ip->mendel_modifier / MENDEL_MULTIGEN) & 1;
   uint32_t only_within = (test_type & (QFAM_WITHIN1 | QFAM_WITHIN2))? 1 : 0;
   uint32_t perm_count = fam_ip->qfam_modifier & QFAM_PERM_COUNT;
+  uint32_t emp_se = fam_ip->qfam_modifier & QFAM_EMP_SE;
   uint32_t perms_done = 0;
   uint32_t chrom_idx = 0;
   uint32_t qfam_thread_ct = g_thread_ct;
@@ -3363,7 +4358,7 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
     goto qfam_ret_INVALID_CMDLINE;
   }
 #endif
-  if (get_sibship_info(unfiltered_sample_ct, sample_exclude, sample_ct, pheno_nm, pheno_d, founder_info, sample_ids, max_sample_id_len, max_fid_len, paternal_ids, max_paternal_id_len, maternal_ids, max_maternal_id_len, family_list, trio_list, family_ct, trio_ct, test_type, &lm_eligible, &lm_within2_founder, &fs_starts, &fss_contents, &sample_lm_to_fss_idx, &fs_ct, &lm_ct, &singleton_ct)) {
+  if (get_sibship_info(unfiltered_sample_ct, sample_exclude, sample_ct, pheno_nm, founder_info, sample_ids, max_sample_id_len, max_fid_len, paternal_ids, max_paternal_id_len, maternal_ids, max_maternal_id_len, family_list, trio_list, family_ct, trio_ct, test_type, &lm_eligible, &lm_within2_founder, &fs_starts, &fss_contents, &sample_lm_to_fss_idx, &fs_ct, &lm_ct, &singleton_ct)) {
     goto qfam_ret_NOMEM;
   }
   fss_ct = fs_ct + singleton_ct;
@@ -3446,6 +4441,21 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
       goto qfam_ret_NOMEM;
     }
   }
+  if (emp_se) {  // emp-se requested: set up per-marker beta accumulators
+    if (wkspace_alloc_d_checked(&orig_beta, marker_ct * sizeof(double)) ||
+        wkspace_alloc_d_checked(&g_beta_sum, marker_ct * sizeof(double)) ||
+        wkspace_alloc_d_checked(&g_beta_ssq, marker_ct * sizeof(double)) ||
+        wkspace_alloc_ui_checked(&g_beta_fail_cts, marker_ct * sizeof(int32_t))) {  // uint32_t counters, so size by int32_t rather than double
+      goto qfam_ret_NOMEM;
+    }
+    fill_double_zero(g_beta_sum, marker_ct);
+    fill_double_zero(g_beta_ssq, marker_ct);
+    fill_uint_zero(g_beta_fail_cts, marker_ct);
+  } else {  // qfam_thread tests g_beta_sum for NULL before accumulating
+    g_beta_sum = NULL;
+    g_beta_ssq = NULL;
+    g_beta_fail_cts = NULL;
+  }
   if (wkspace_alloc_ul_checked(&g_loadbuf, MODEL_BLOCKSIZE * sample_ctl2 * sizeof(intptr_t)) ||
       wkspace_alloc_d_checked(&g_orig_stat, marker_ct * sizeof(double)) ||
       wkspace_alloc_ul_checked(&g_qfam_flip, perm_batch_size * flip_ctl * sizeof(intptr_t)) ||
@@ -3629,6 +4639,9 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
 	    // do not apply --output-min-p since only the empirical p-value is
 	    // supposed to be postprocessed here, not this one
 	    bufptr = double_g_writewx4x(bufptr, calc_tprob(tstat, nind - 2), 12, '\n');
+	    if (emp_se) {
+	      orig_beta[marker_idx_base + block_idx] = beta;
+	    }
 	    *orig_stat_ptr++ = fabs(tstat);
 	  } else {
 	    bufptr = memcpya(bufptr, "        NA           NA           NA\n", 37);
@@ -3688,7 +4701,7 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
   if (fopen_checked(&outfile, outname, "w")) {
     goto qfam_ret_OPEN_FAIL;
   }
-  sprintf(tbuf, " CHR %%%us         EMP1           NP \n", plink_maxsnp);
+  sprintf(tbuf, emp_se? " CHR %%%us         BETA     EMP_BETA       EMP_SE         EMP1           NP \n" : " CHR %%%us         EMP1           NP \n", plink_maxsnp);
   fprintf(outfile, tbuf, "SNP");
   chrom_fo_idx = 0xffffffffU;
   chrom_end = 0;
@@ -3711,6 +4724,9 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
     bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), bufptr);
     *bufptr++ = ' ';
     if (g_orig_stat[marker_idx] == -9) {
+      if (emp_se) {
+	bufptr = memcpya(bufptr, "          NA           NA           NA ", 39);
+      }
       bufptr = memcpya(bufptr, "          NA           NA\n", 26);
     } else {
       uii = g_perm_2success_ct[marker_idx];
@@ -3719,6 +4735,18 @@ int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outn
       } else {
 	ujj = perms_total;
       }
+      if (emp_se) {  // BETA, EMP_BETA and EMP_SE columns precede EMP1/NP
+        bufptr = double_g_writewx4x(bufptr, orig_beta[marker_idx], 12, ' ');
+        ukk = ujj - g_beta_fail_cts[marker_idx];  // permutations whose regression succeeded
+        if (ukk <= 1) {  // mean/SE need at least two betas: write NA in BOTH columns
+          bufptr = memcpya(bufptr, "          NA           NA ", 26);
+        } else {
+          dxx = g_beta_sum[marker_idx] / ((double)((int32_t)ukk));  // EMP_BETA: mean permuted beta
+          bufptr = double_g_writewx4x(bufptr, dxx, 12, ' ');
+          dxx = sqrt((g_beta_ssq[marker_idx] - g_beta_sum[marker_idx] * dxx) / ((double)((int32_t)(ukk - 1))));  // EMP_SE: sample standard deviation
+          bufptr = double_g_writewx4x(bufptr, dxx, 12, ' ');
+        }
+      }
       if (!perm_count) {
         dxx = ((double)(uii + 2)) / ((double)(2 * (ujj + 1)));
       } else {
diff --git a/plink_family.h b/plink_family.h
index 2434199..1558046 100644
--- a/plink_family.h
+++ b/plink_family.h
@@ -1,16 +1,25 @@
 #ifndef __PLINK_FAMILY_H__
 #define __PLINK_FAMILY_H__
 
+#include "plink_set.h"
+
 #define TDT_EXACT 1
 #define TDT_MIDP 2
 #define TDT_POO 4
 #define TDT_PERM 8
 #define TDT_MPERM 0x10
-#define TDT_PARENPERM1 0x20
-#define TDT_PARENPERM2 0x40
-#define TDT_POOPERM_PAT 0x80
-#define TDT_POOPERM_MAT 0x100
-#define TDT_SET_TEST 0x200
+#define TDT_PERM_COUNT 0x20
+#define TDT_PARENPERM1 0x40
+#define TDT_PARENPERM2 0x80
+#define TDT_POOPERM_PAT 0x100
+#define TDT_POOPERM_MAT 0x200
+#define TDT_SET_TEST 0x400
+
+#define DFAM_NO_UNRELATEDS 1
+#define DFAM_PERM 2
+#define DFAM_MPERM 4
+#define DFAM_PERM_COUNT 8
+#define DFAM_SET_TEST 0x10
 
 #define QFAM_WITHIN1 1
 #define QFAM_WITHIN2 2
@@ -20,6 +29,7 @@
 #define QFAM_PERM 0x10
 #define QFAM_MPERM 0x20
 #define QFAM_PERM_COUNT 0x40
+#define QFAM_EMP_SE 0x80
 
 extern const uint32_t mendel_error_table[];
 extern const uint32_t mendel_error_table_x[];
@@ -31,6 +41,8 @@ typedef struct {
   uint32_t mendel_modifier;
   uint32_t tdt_modifier;
   uint32_t tdt_mperm_val;
+  uint32_t dfam_modifier;
+  uint32_t dfam_mperm_val;
   uint32_t qfam_modifier;
   uint32_t qfam_mperm_val;
 } Family_info;
@@ -70,7 +82,9 @@ typedef struct {
 
 int32_t populate_pedigree_rel_info(Pedigree_rel_info* pri_ptr, uintptr_t unfiltered_sample_ct, char* sample_ids, uintptr_t max_sample_id_len, char* paternal_ids, uintptr_t max_paternal_id_len, char* maternal_ids, uintptr_t max_maternal_id_len, uintptr_t* founder_info);
 
-int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double ci_size, double ci_zt, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, uintptr_t unfilter [...]
+int32_t tdt(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double ci_size, double ci_zt, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, uintptr_t unfilter [...]
+
+int32_t dfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double pfilter, double output_min_p, uint32_t mtest_adjust, double adjust_lambda, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude_orig, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintp [...]
 
 int32_t qfam(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, uint32_t* marker_pos, char** marker_allele_ptrs, uintptr_t* marker_reverse, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, Aperm_info* apip, uintptr_t* pheno_nm, double* pheno_d, uintptr_t* founder_info, u [...]
 
diff --git a/plink_filter.c b/plink_filter.c
index bcd2310..b17c998 100644
--- a/plink_filter.c
+++ b/plink_filter.c
@@ -3,6 +3,8 @@
 #include "plink_filter.h"
 #include "plink_stats.h"
 
+#include "pigz.h"
+
 void oblig_missing_init(Oblig_missing_info* om_ip) {
   om_ip->cluster_ct = 0;
   om_ip->entry_ct = 0;
@@ -1037,6 +1039,72 @@ int32_t random_thin_markers_ct(uint32_t thin_keep_ct, uintptr_t unfiltered_marke
   return retval;
 }
 
+// --thin-indiv: excludes each remaining sample whose 32-bit random draw is
+// >= thin_keep_prob * 2^32.  Returns 1 (error logged) if none survive, else 0.
+uint32_t random_thin_samples(double thin_keep_prob, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr) {
+  const uint32_t keep_thresh = (uint32_t)(thin_keep_prob * 4294967296.0 + 0.5);
+  const uint32_t orig_ct = unfiltered_sample_ct - *sample_exclude_ct_ptr;
+  uint32_t visited_ct = 0, dropped_ct = 0, cur_uidx = 0;
+  uint32_t run_end;
+  while (visited_ct < orig_ct) {
+    cur_uidx = next_unset_unsafe(sample_exclude, cur_uidx);
+    run_end = next_set(sample_exclude, cur_uidx, unfiltered_sample_ct);
+    visited_ct += run_end - cur_uidx;
+    do {
+      if (sfmt_genrand_uint32(&sfmt) >= keep_thresh) {
+        SET_BIT(sample_exclude, cur_uidx);
+        dropped_ct++;
+      }
+    } while (++cur_uidx < run_end);
+  }
+  if (dropped_ct == orig_ct) {
+    LOGPRINTF("Error: All %s removed by --thin-indiv. Try a higher probability.\n", g_species_plural);
+    return 1;
+  }
+  LOGPRINTF("--thin-indiv: %u %s removed (%u remaining).\n", dropped_ct, (dropped_ct == 1)? g_species_singular : g_species_plural, orig_ct - dropped_ct);
+  *sample_exclude_ct_ptr += dropped_ct;
+  return 0;
+}
+
+// --thin-indiv-count: fills a scratch bitfield via generate_perm1_interleaved(),
+// excludes every remaining sample whose bit is set there, and records
+// unfiltered_sample_ct - thin_keep_ct as the exclusion count.  Returns 0 or RET_*.
+int32_t random_thin_samples_ct(uint32_t thin_keep_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr) {
+  unsigned char* wkspace_mark = wkspace_base;
+  const uint32_t remaining_ct = unfiltered_sample_ct - *sample_exclude_ct_ptr;
+  const uintptr_t mask_word_ct = (remaining_ct + (BITCT - 1)) / BITCT;
+  uint32_t cur_uidx = 0;
+  uint32_t cur_idx = 0;
+  int32_t retval = 0;
+  uintptr_t* drop_mask;
+  if (thin_keep_ct > remaining_ct) {
+    LOGPRINTF("Error: --thin-indiv-count parameter exceeds number of remaining %s.\n", g_species_plural);
+    retval = RET_INVALID_CMDLINE;
+    goto random_thin_samples_ct_ret_1;
+  }
+  if (wkspace_alloc_ul_checked(&drop_mask, mask_word_ct * sizeof(intptr_t))) {
+    retval = RET_NOMEM;
+    goto random_thin_samples_ct_ret_1;
+  }
+  // request remaining_ct - thin_keep_ct removals (the count logged below)
+  generate_perm1_interleaved(remaining_ct, remaining_ct - thin_keep_ct, 0, 1, drop_mask);
+  // cur_idx indexes the mask; cur_uidx tracks the matching unfiltered position
+  while (cur_idx < remaining_ct) {
+    next_unset_unsafe_ck(sample_exclude, &cur_uidx);
+    if (is_set(drop_mask, cur_idx)) {
+      set_bit(sample_exclude, cur_uidx);
+    }
+    cur_uidx++;
+    cur_idx++;
+  }
+  LOGPRINTF("--thin-indiv-count: %u %s removed (%u remaining).\n", remaining_ct - thin_keep_ct, (remaining_ct - thin_keep_ct == 1)? g_species_singular : g_species_plural, thin_keep_ct);
+  *sample_exclude_ct_ptr = unfiltered_sample_ct - thin_keep_ct;
+ random_thin_samples_ct_ret_1:
+  wkspace_reset(wkspace_mark);
+  return retval;
+}
+
+
 int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, char* sorted_sample_ids, uintptr_t sorted_sample_ct, uintptr_t max_sample_id_len, uint32_t* sample_id_map, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sex_male, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip) {
   // 1. load and validate cluster file
   // 2. load marker file, sort by uidx
@@ -2109,7 +2177,7 @@ static inline void haploid_single_marker_freqs(uintptr_t unfiltered_sample_ct, u
   *hethap_incr_ptr = hethap_incr;
 }
 
-int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_exclude_ct, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* founder_info, int32_t nonfounders, int32_t maf_succ, double* set_allele_freqs, uintptr_t bed_offset, uint32_t hwe_needed, uint32_t hwe_all, uin [...]
+int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_exclude_ct, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* founder_info, int32_t nonfounders, int32_t maf_succ, double* set_allele_freqs, uintptr_t bed_offset, uint32_t hwe_needed, uint32_t hwe_all, uin [...]
   FILE* hhfile = NULL;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
   uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + BITCT - 1) / BITCT;
@@ -2170,7 +2238,6 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
   uintptr_t* geno_excl_bitfield = NULL;
   uintptr_t* ac_excl_bitfield = NULL;
   uint64_t* om_entry_ptr = NULL;
-  double* marker_weights = NULL;
   uint32_t sample_nonmale_ct = 0;
   uint32_t sample_f_nonmale_ct = 0;
   uint32_t sample_f_ctl_nonmale_ct = 0;
@@ -2202,20 +2269,6 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
   uint32_t ujj;
   double maf;
   double cur_genotyping_rate;
-  if (wt_needed) {
-    // this is a pretty ugly hack... but no worse than what preceded it, I
-    // suppose
-    marker_weights = (double*)top_alloc(topsize_ptr, CACHEALIGN(unfiltered_marker_ct * sizeof(double)));
-    if (!marker_weights) {
-      goto calc_freqs_and_hwe_ret_NOMEM;
-    }
-    wkspace_left -= *topsize_ptr;
-    *marker_weights_ptr = marker_weights;
-    for (marker_uidx = 0; marker_uidx < unfiltered_marker_ct; marker_uidx++) {
-      marker_weights[marker_uidx] = -1.0;
-    }
-  }
-
   if (!hwe_needed) {
     *hwe_lls_ptr = (int32_t*)wkspace_base;
   } else {
@@ -2629,9 +2682,6 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
 	  maf = ((double)ujj) / ((double)uii);
 	}
 	set_allele_freqs[marker_uidx] = maf;
-	if (wt_needed) {
-	  marker_weights[marker_uidx] = calc_wt_mean_maf(exponent, maf);
-	}
       }
       nonmissing_rate_tot += cur_genotyping_rate;
       if (geno_excl_bitfield && (cur_genotyping_rate < geno_thresh)) {
@@ -2678,9 +2728,8 @@ int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uint
   return retval;
 }
 
-int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t plink_maxfid, uint32_t plink_maxiid, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uintptr_t* sample_exclude, uintptr_t* pheno_nm, uintptr_t* sex_male, uint32_t sampl [...]
+int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uint32_t plink_maxfid, uint32_t plink_maxiid, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uintptr_t* sample_exclude, uintptr_t* pheno_nm, uintptr_t* sex_ [...]
   unsigned char* wkspace_mark = wkspace_base;
-  FILE* outfile = NULL;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
   uintptr_t unfiltered_sample_ct2l = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
   uintptr_t unfiltered_sample_ctv2 = (unfiltered_sample_ct2l + 1) & (~1);
@@ -2688,6 +2737,7 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
   uintptr_t* sample_male_include2 = NULL;
   uint64_t* om_entry_ptr = NULL;
   uintptr_t* cur_omidxs = NULL;
+  char* pzwritep = NULL;
   uint32_t* sample_to_cluster = NULL;
   uint32_t* missing_ct_by_cluster = NULL;
   uint32_t* oblig_missing_ct_by_cluster = NULL;
@@ -2705,6 +2755,7 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
   uint32_t om_cluster_ct = 0;
   uint32_t om_cluster_ctl = 0;
   int32_t retval = 0;
+  Pigz_state ps;
   uintptr_t* loadbuf;
   uintptr_t* sample_include2;
   uintptr_t* cur_nm;
@@ -2712,7 +2763,7 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
   uintptr_t* lptr2;
   uint32_t* missing_cts;
   uint32_t* cur_cluster_sizes;
-  char* wptr;
+  unsigned char* overflow_buf;
   char* cptr;
   char* cptr2;
   uintptr_t marker_ct_nony;
@@ -2734,7 +2785,9 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
   uint32_t ukk;
   uint32_t umm;
   uint32_t unn;
-  if (wkspace_alloc_ui_checked(&missing_cts, unfiltered_sample_ct * sizeof(int32_t)) ||
+  pzwrite_init_null(&ps);
+  if (wkspace_alloc_uc_checked(&overflow_buf, PIGZ_BLOCK_SIZE + MAXLINELEN) ||
+      wkspace_alloc_ui_checked(&missing_cts, unfiltered_sample_ct * sizeof(int32_t)) ||
       wkspace_alloc_ul_checked(&loadbuf, unfiltered_sample_ctv2 * sizeof(intptr_t)) ||
       wkspace_alloc_ul_checked(&sample_include2, unfiltered_sample_ctv2 * sizeof(intptr_t)) ||
       wkspace_alloc_ul_checked(&sample_male_include2, unfiltered_sample_ctv2 * sizeof(intptr_t))) {
@@ -2753,10 +2806,12 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
   if (fseeko(bedfile, bed_offset, SEEK_SET)) {
     goto write_missingness_reports_ret_READ_FAIL;
   }
-  memcpy(outname_end, ".lmiss", 7);
-  if (fopen_checked(&outfile, outname, "w")) {
-    goto write_missingness_reports_ret_WRITE_FAIL;
+  memcpy(outname_end, output_gz? ".lmiss.gz" : ".lmiss", output_gz? 10 : 7);
+  if (flex_pzwrite_init(output_gz, outname, overflow_buf, 0, &ps)) {
+    goto write_missingness_reports_ret_OPEN_FAIL;
   }
+  pzwritep = (char*)overflow_buf;
+
   if (om_ip->entry_ct) {
     om_entry_ptr = om_ip->entries;
     om_cluster_ref_cts = om_ip->cluster_ref_cts;
@@ -2789,7 +2844,7 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
   }
   ujj = unfiltered_sample_ct2l * BITCT2;
   if (!cluster_ct) {
-    sprintf(tbuf, " CHR %%%us   N_MISS   N_GENO   F_MISS\n", plink_maxsnp);
+    sprintf(tbuf, " CHR %%%us   N_MISS   N_GENO   F_MISS" EOLN_STR, plink_maxsnp);
   } else {
     if (wkspace_alloc_ui_checked(&sample_to_cluster, unfiltered_sample_ct * sizeof(int32_t)) ||
         wkspace_alloc_ui_checked(&missing_ct_by_cluster, cluster_ct * sizeof(int32_t)) ||
@@ -2815,9 +2870,10 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
 	}
       }
     }
-    sprintf(tbuf, " CHR %%%us       CLST   N_MISS   N_CLST   N_GENO   F_MISS\n", plink_maxsnp);
+    sprintf(tbuf, " CHR %%%us       CLST   N_MISS   N_CLST   N_GENO   F_MISS" EOLN_STR, plink_maxsnp);
   }
-  fprintf(outfile, tbuf, "SNP");
+
+  pzwritep += sprintf(pzwritep, tbuf, "SNP");
   for (chrom_fo_idx = 0; chrom_fo_idx < chrom_info_ptr->chrom_ct; chrom_fo_idx++) {
     chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
     chrom_end = chrom_info_ptr->chrom_file_order_marker_idx[chrom_fo_idx + 1];
@@ -2879,10 +2935,12 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
 	      ulii &= ulii - 1;
 	    }
 	  }
-	  wptr = uint32_writew8x(cptr2, ukk - oblig_ct, ' ');
-          wptr = uint32_writew8x(wptr, cur_tot - oblig_ct, ' ');
-	  wptr = double_g_writewx4x(wptr, ((double)((int32_t)(ukk - oblig_ct))) / ((double)((int32_t)(cur_tot - oblig_ct))), 8, '\n');
-          if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+          pzwritep = memcpya(pzwritep, tbuf, cptr2 - tbuf);
+	  pzwritep = uint32_writew8x(pzwritep, ukk - oblig_ct, ' ');
+          pzwritep = uint32_writew8x(pzwritep, cur_tot - oblig_ct, ' ');
+	  pzwritep = double_g_writewx4(pzwritep, ((double)((int32_t)(ukk - oblig_ct))) / ((double)((int32_t)(cur_tot - oblig_ct))), 8);
+          append_binary_eoln(&pzwritep);
+	  if (flex_pzwrite(&ps, &pzwritep)) {
 	    goto write_missingness_reports_ret_WRITE_FAIL;
 	  }
 	} else {
@@ -2926,16 +2984,18 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
 	    }
 	  }
 	  for (clidx = 0; clidx < cluster_ct; clidx++) {
-            wptr = fw_strcpy(10, &(cluster_ids[clidx * max_cluster_id_len]), cptr2);
-	    *wptr++ = ' ';
+            pzwritep = memcpya(pzwritep, tbuf, cptr2 - tbuf);
+            pzwritep = fw_strcpy(10, &(cluster_ids[clidx * max_cluster_id_len]), pzwritep);
+	    *pzwritep++ = ' ';
 	    uii = missing_ct_by_cluster[clidx];
-            wptr = uint32_writew8x(wptr, uii, ' ');
+            pzwritep = uint32_writew8x(pzwritep, uii, ' ');
 	    umm = cur_cluster_sizes[clidx];
-	    wptr = uint32_writew8x(wptr, umm, ' ');
+	    pzwritep = uint32_writew8x(pzwritep, umm, ' ');
 	    umm -= oblig_missing_ct_by_cluster[clidx];
-	    wptr = uint32_writew8x(wptr, umm, ' ');
-            wptr = double_g_writewx4x(wptr, ((double)((int32_t)uii)) / ((double)((int32_t)umm)), 8, '\n');
-	    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	    pzwritep = uint32_writew8x(pzwritep, umm, ' ');
+            pzwritep = double_g_writewx4(pzwritep, ((double)((int32_t)uii)) / ((double)((int32_t)umm)), 8);
+	    append_binary_eoln(&pzwritep);
+	    if (flex_pzwrite(&ps, &pzwritep)) {
 	      goto write_missingness_reports_ret_WRITE_FAIL;
 	    }
 	  }
@@ -2952,15 +3012,16 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
       } while (marker_uidx < chrom_end);
     }
   }
-  if (fclose_null(&outfile)) {
+  if (flex_pzwrite_close_null(&ps, pzwritep)) {
     goto write_missingness_reports_ret_WRITE_FAIL;
   }
   outname_end[1] = 'i';
-  if (fopen_checked(&outfile, outname, "w")) {
-    goto write_missingness_reports_ret_WRITE_FAIL;
+  if (flex_pzwrite_init(output_gz, outname, overflow_buf, 0, &ps)) {
+    goto write_missingness_reports_ret_OPEN_FAIL;
   }
-  sprintf(tbuf, "%%%us %%%us MISS_PHENO   N_MISS   N_GENO   F_MISS\n", plink_maxfid, plink_maxiid);
-  fprintf(outfile, tbuf, "FID", "IID");
+  pzwritep = (char*)overflow_buf;
+  sprintf(tbuf, "%%%us %%%us MISS_PHENO   N_MISS   N_GENO   F_MISS" EOLN_STR, plink_maxfid, plink_maxiid);
+  pzwritep += sprintf(pzwritep, tbuf, "FID", "IID");
   do {
     sample_uidx = next_unset_unsafe(sample_exclude, sample_uidx);
     sample_uidx_stop = next_set(sample_exclude, sample_uidx, unfiltered_sample_ct);
@@ -2969,12 +3030,12 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
       cptr = &(sample_ids[sample_uidx * max_sample_id_len]);
       cptr2 = (char*)memchr(cptr, '\t', max_sample_id_len);
       slen = (uintptr_t)(cptr2 - cptr);
-      wptr = memseta(tbuf, 32, plink_maxfid - slen);
-      wptr = memcpyax(wptr, cptr, slen, ' ');
-      wptr = fw_strcpy(plink_maxiid, &(cptr2[1]), wptr);
-      wptr = memseta(wptr, 32, 10);
-      *wptr++ = 'Y' - (is_set(pheno_nm, sample_uidx) * 11);
-      *wptr++ = ' ';
+      pzwritep = memseta(pzwritep, 32, plink_maxfid - slen);
+      pzwritep = memcpyax(pzwritep, cptr, slen, ' ');
+      pzwritep = fw_strcpy(plink_maxiid, &(cptr2[1]), pzwritep);
+      pzwritep = memseta(pzwritep, 32, 10);
+      *pzwritep++ = 'Y' - (is_set(pheno_nm, sample_uidx) * 11);
+      *pzwritep++ = ' ';
       uii = missing_cts[sample_uidx];
       ukk = is_set(sex_male, sample_uidx);
       ujj = marker_ct_nony + (ukk * marker_ct_y);
@@ -2986,23 +3047,28 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
 	  ujj -= umm;
 	}
       }
-      wptr = uint32_writew8x(wptr, uii, ' ');
-      wptr = uint32_writew8x(wptr, ujj, ' ');
-      wptr = double_g_writewx4x(wptr, ((double)((int32_t)uii)) / ((double)((int32_t)ujj)), 8, '\n');
-      if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+      pzwritep = uint32_writew8x(pzwritep, uii, ' ');
+      pzwritep = uint32_writew8x(pzwritep, ujj, ' ');
+      pzwritep = double_g_writewx4(pzwritep, ((double)((int32_t)uii)) / ((double)((int32_t)ujj)), 8);
+      append_binary_eoln(&pzwritep);
+      if (flex_pzwrite(&ps, &pzwritep)) {
 	goto write_missingness_reports_ret_WRITE_FAIL;
       }
     } while (++sample_uidx < sample_uidx_stop);
   } while (sample_idx < sample_ct);
-  if (fclose_null(&outfile)) {
+
+  if (flex_pzwrite_close_null(&ps, pzwritep)) {
     goto write_missingness_reports_ret_WRITE_FAIL;
   }
   *outname_end = '\0';
-  LOGPRINTFWW("--missing: Sample missing data report written to %s.imiss, and variant-based %smissing data report written to %s.lmiss.\n", outname, cluster_ct? "cluster-stratified " : "", outname);
+  LOGPRINTFWW("--missing: Sample missing data report written to %s.imiss%s, and variant-based %smissing data report written to %s.lmiss%s.\n", outname, output_gz? ".gz" : "", cluster_ct? "cluster-stratified " : "", outname, output_gz? ".gz" : "");
   while (0) {
   write_missingness_reports_ret_NOMEM:
     retval = RET_NOMEM;
     break;
+  write_missingness_reports_ret_OPEN_FAIL:
+    retval = RET_OPEN_FAIL;
+    break;
   write_missingness_reports_ret_READ_FAIL:
     retval = RET_READ_FAIL;
     break;
@@ -3011,45 +3077,53 @@ int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* out
     break;
   }
   wkspace_reset(wkspace_mark);
-  fclose_cond(outfile);
+  flex_pzwrite_close_cond(&ps, pzwritep);
   return retval;
 }
 
-int32_t hardy_report_write_line(FILE* outfile, char* prefix_buf, uint32_t prefix_len, uint32_t reverse, uint32_t ll_ct, uint32_t lh_ct, uint32_t hh_ct, char* midbuf_ptr, double pval, double output_min_p) {
+int32_t hardy_report_write_line(Pigz_state* ps_ptr, char** pzwritep_ptr, char* prefix_buf, uint32_t prefix_len, uint32_t reverse, uint32_t ll_ct, uint32_t lh_ct, uint32_t hh_ct, char* midbuf_ptr, double pval, double output_min_p) {
+  char* pzwritep = *pzwritep_ptr; // work on a local copy of the caller-owned write cursor
   char wbuf[48];
   char* cptr;
   uint32_t denom;
   double drecip;
   double minor_freq;
-  fwrite(prefix_buf, 1, prefix_len, outfile);
+  pzwritep = memcpya(pzwritep, prefix_buf, prefix_len); // prefix is copied to the cursor instead of being written with fwrite
   if (reverse) {
     cptr = uint32_write(uint32_writex(uint32_writex(wbuf, hh_ct, '/'), lh_ct, '/'), ll_ct);
   } else {
     cptr = uint32_write(uint32_writex(uint32_writex(wbuf, ll_ct, '/'), lh_ct, '/'), hh_ct);
   }
-  cptr = fw_strcpyn(20, cptr - wbuf, wbuf, midbuf_ptr);
-  *cptr++ = ' ';
+  pzwritep = fw_strcpyn(20, cptr - wbuf, wbuf, pzwritep); // count string built in wbuf goes to the cursor; midbuf_ptr is no longer referenced by the new body
+  *pzwritep++ = ' ';
   denom = (ll_ct + lh_ct + hh_ct) * 2;
   if (denom) {
     drecip = 1.0 / ((double)denom);
     minor_freq = (2 * ll_ct + lh_ct) * drecip;
-    cptr = double_g_writewx4x(double_g_writewx4x(double_g_writewx4x(cptr, (lh_ct * 2) * drecip, 8, ' '), minor_freq * (2 * hh_ct + lh_ct) * drecip * 2, 8, ' '), MAXV(pval, output_min_p), 12, '\n');
+    pzwritep = double_g_writewx4(double_g_writewx4x(double_g_writewx4x(pzwritep, (lh_ct * 2) * drecip, 8, ' '), minor_freq * (2 * hh_ct + lh_ct) * drecip * 2, 8, ' '), MAXV(pval, output_min_p), 12); // the trailing newline argument of the old call is dropped here
   } else {
-    cptr = memcpya(cptr, "     nan      nan           NA\n", 31);
+    pzwritep = memcpya(pzwritep, "     nan      nan           NA", 30); // 30 bytes: same text as before without the trailing newline
   }
-  return fwrite_checked(midbuf_ptr, (cptr - midbuf_ptr), outfile);
+  append_binary_eoln(&pzwritep); // line ending is now appended once, after either branch
+  if (flex_pzwrite(ps_ptr, &pzwritep)) { // nonzero result is reported to the caller as failure
+    return 1;
+  }
+  *pzwritep_ptr = pzwritep; // on success, hand the updated cursor back to the caller
+  return 0;
 }
 
 int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, int32_t* hwe_lls, int32_t* hwe_lhs, int32_t* hwe_hhs, uint32_t hwe_modifier, uint32_t nonfounders, int32_t* hwe_ll_cases, int32_t* hwe_lh_cases, int32_t* hwe_hh_cases, int [...]
-  FILE* outfile = NULL;
   unsigned char* wkspace_mark = wkspace_base;
+  char* pzwritep = NULL;
   uintptr_t marker_ct = unfiltered_marker_ct - marker_exclude_ct;
   uintptr_t marker_uidx = 0;
   uintptr_t marker_idx = 0;
   uint32_t hwe_midp = hwe_modifier & HWE_MIDP;
+  uint32_t output_gz = (hwe_modifier / HWE_GZ) & 1;
   int32_t retval = 0;
   uint32_t skip_chrom = 0;
   uint32_t pct = 0;
+  Pigz_state ps;
   uint32_t prefix_len;
   uint32_t loop_end;
   uint32_t uii;
@@ -3062,6 +3136,7 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
   uint32_t chrom_end;
   uint32_t reverse;
   double* p_values;
+  unsigned char* overflow_buf;
   char* writebuf;
   char* cptr0;
   char* cptr;
@@ -3069,13 +3144,15 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
   char* cptr3;
   char* cptr4;
   char* cptr5;
+  pzwrite_init_null(&ps);
   if (pheno_nm_ct) {
     report_type = pheno_c? 0 : 1;
   } else {
     report_type = 2;
   }
   uii = report_type? 1 : 3;
-  if (wkspace_alloc_d_checked(&p_values, uii * marker_ct * sizeof(double)) ||
+  if (wkspace_alloc_uc_checked(&overflow_buf, PIGZ_BLOCK_SIZE + 2 * max_marker_allele_len + MAXLINELEN) ||
+      wkspace_alloc_d_checked(&p_values, uii * marker_ct * sizeof(double)) ||
       wkspace_alloc_c_checked(&writebuf, 2 * max_marker_allele_len + MAXLINELEN)) {
     goto hardy_report_ret_NOMEM;
   }
@@ -3097,16 +3174,18 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
   marker_uidx = 0;
   marker_idx = 0;
 
-  memcpy(outname_end, ".hwe", 5);
-  if (fopen_checked(&outfile, outname, "w")) {
+  memcpy(outname_end, output_gz? ".hwe.gz" : ".hwe", output_gz? 8 : 5);
+  if (flex_pzwrite_init(output_gz, outname, overflow_buf, 0, &ps)) {
     goto hardy_report_ret_OPEN_FAIL;
   }
+  pzwritep = (char*)overflow_buf;
+
   LOGPRINTFWW5("--hardy: Writing Hardy-Weinberg report (%s) to %s ... ", nonfounders? "all samples" : "founders only", outname);
   fputs("0%", stdout);
   fflush(stdout);
-  sprintf(writebuf, " CHR %%%us     TEST   A1   A2                 GENO   O(HET)   E(HET)            P \n", plink_maxsnp);
-  fprintf(outfile, writebuf, "SNP");
- 
+  sprintf(writebuf, " CHR %%%us     TEST   A1   A2                 GENO   O(HET)   E(HET)            P " EOLN_STR, plink_maxsnp);
+  pzwritep += sprintf(pzwritep, writebuf, "SNP");
+
   chrom_fo_idx = 0;
   refresh_chrom_info(chrom_info_ptr, marker_uidx, &chrom_end, &chrom_fo_idx, &is_x, &is_y, &is_mt, &is_haploid);
   skip_chrom = (is_haploid && (!is_x)) || is_mt;
@@ -3152,7 +3231,7 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
 	cptr5 = fw_strcpy(4, cptr4, &(cptr5[1]));
 	*cptr5 = ' ';
 	prefix_len = 1 + (cptr5 - writebuf);
-	if (hardy_report_write_line(outfile, writebuf, prefix_len, reverse, hwe_ll_allfs[marker_uidx], hwe_lh_allfs[marker_uidx], hwe_hh_allfs[marker_uidx], cptr2, p_values[marker_idx], output_min_p)) {
+	if (hardy_report_write_line(&ps, &pzwritep, writebuf, prefix_len, reverse, hwe_ll_allfs[marker_uidx], hwe_lh_allfs[marker_uidx], hwe_hh_allfs[marker_uidx], cptr2, p_values[marker_idx], output_min_p)) {
 	  goto hardy_report_ret_WRITE_FAIL;
 	}
       }
@@ -3195,17 +3274,17 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
 	cptr5 = fw_strcpy(4, cptr4, &(cptr5[1]));
 	*cptr5 = ' ';
 	prefix_len = 1 + (cptr5 - writebuf);
-	if (hardy_report_write_line(outfile, writebuf, prefix_len, reverse, hwe_ll_allfs[marker_uidx], hwe_lh_allfs[marker_uidx], hwe_hh_allfs[marker_uidx], cptr2, p_values[3 * marker_idx], output_min_p)) {
+	if (hardy_report_write_line(&ps, &pzwritep, writebuf, prefix_len, reverse, hwe_ll_allfs[marker_uidx], hwe_lh_allfs[marker_uidx], hwe_hh_allfs[marker_uidx], cptr2, p_values[3 * marker_idx], output_min_p)) {
 	  goto hardy_report_ret_WRITE_FAIL;
 	}
 
 	memcpy(&(cptr0[7 + plink_maxsnp]), "FF", 2);
-	if (hardy_report_write_line(outfile, writebuf, prefix_len, reverse, hwe_ll_cases[marker_uidx], hwe_lh_cases[marker_uidx], hwe_hh_cases[marker_uidx], cptr2, p_values[3 * marker_idx + 1], output_min_p)) {
+	if (hardy_report_write_line(&ps, &pzwritep, writebuf, prefix_len, reverse, hwe_ll_cases[marker_uidx], hwe_lh_cases[marker_uidx], hwe_hh_cases[marker_uidx], cptr2, p_values[3 * marker_idx + 1], output_min_p)) {
 	  goto hardy_report_ret_WRITE_FAIL;
 	}
 
 	memcpy(&(cptr0[4 + plink_maxsnp]), "UN", 2);
-	if (hardy_report_write_line(outfile, writebuf, prefix_len, reverse, hwe_lls[marker_uidx], hwe_lhs[marker_uidx], hwe_hhs[marker_uidx], cptr2, p_values[3 * marker_idx + 2], output_min_p)) {
+	if (hardy_report_write_line(&ps, &pzwritep, writebuf, prefix_len, reverse, hwe_lls[marker_uidx], hwe_lhs[marker_uidx], hwe_hhs[marker_uidx], cptr2, p_values[3 * marker_idx + 2], output_min_p)) {
 	  goto hardy_report_ret_WRITE_FAIL;
 	}
       }
@@ -3232,7 +3311,7 @@ int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uint
     retval = RET_WRITE_FAIL;
     break;
   }
-  fclose_cond(outfile);
+  flex_pzwrite_close_cond(&ps, pzwritep);
   wkspace_reset(wkspace_mark);
   return retval;
 }
diff --git a/plink_filter.h b/plink_filter.h
index 9c79b7b..9c5194d 100644
--- a/plink_filter.h
+++ b/plink_filter.h
@@ -31,6 +31,10 @@ uint32_t random_thin_markers(double thin_keep_prob, uintptr_t unfiltered_marker_
 
 int32_t random_thin_markers_ct(uint32_t thin_keep_ct, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t* marker_exclude_ct_ptr);
 
+uint32_t random_thin_samples(double thin_keep_prob, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr);
+
+int32_t random_thin_samples_ct(uint32_t thin_keep_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr);
+
 int32_t load_oblig_missing(FILE* bedfile, uintptr_t bed_offset, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, char* sorted_sample_ids, uintptr_t sorted_sample_ct, uintptr_t max_sample_id_len, uint32_t* sample_id_map, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sex_male, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip);
 
 int32_t filter_samples_file(char* filtername, char* sorted_sample_ids, uintptr_t sorted_ids_len, uintptr_t max_sample_id_len, uint32_t* id_map, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* filtervals_flattened, uint32_t mfilter_col);
@@ -39,9 +43,9 @@ void filter_samples_bitfields(uintptr_t unfiltered_sample_ct, uintptr_t* sample_
 
 int32_t mind_filter(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, double mind_thresh, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* sample_exclude_ct_ptr, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* sex_male, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip);
 
-int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_exclude_ct, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* founder_info, int32_t nonfounders, int32_t maf_succ, double* set_allele_freqs, uintptr_t bed_offset, uint32_t hwe_needed, uint32_t hwe_all, uin [...]
+int32_t calc_freqs_and_hwe(FILE* bedfile, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_exclude_ct, char* sample_ids, uintptr_t max_sample_id_len, uintptr_t* founder_info, int32_t nonfounders, int32_t maf_succ, double* set_allele_freqs, uintptr_t bed_offset, uint32_t hwe_needed, uint32_t hwe_all, uin [...]
 
-int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t plink_maxfid, uint32_t plink_maxiid, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uintptr_t* sample_exclude, uintptr_t* pheno_nm, uintptr_t* sex_male, uint32_t sampl [...]
+int32_t write_missingness_reports(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uint32_t plink_maxfid, uint32_t plink_maxiid, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, Chrom_info* chrom_info_ptr, Oblig_missing_info* om_ip, char* marker_ids, uintptr_t max_marker_id_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uintptr_t* sample_exclude, uintptr_t* pheno_nm, uintptr_t* sex_ [...]
 
 int32_t hardy_report(char* outname, char* outname_end, double output_min_p, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t plink_maxsnp, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t* marker_reverse, int32_t* hwe_lls, int32_t* hwe_lhs, int32_t* hwe_hhs, uint32_t hwe_modifier, uint32_t nonfounders, int32_t* hwe_ll_cases, int32_t* hwe_lh_cases, int32_t* hwe_hh_cases, int [...]
 
diff --git a/plink_glm.c b/plink_glm.c
index 7d93bdb..27759b0 100644
--- a/plink_glm.c
+++ b/plink_glm.c
@@ -3953,12 +3953,13 @@ int32_t glm_common_init(FILE* bedfile, uintptr_t bed_offset, uint32_t glm_modifi
 	}
       }
     }
+    np_base = param_ct_max - np_diploid;
     if (!sex_covar_everywhere) {
       np_sex = popcount_bit_idx(active_params, sex_start_idx, param_raw_ct_max);
-    }
-    np_base = param_ct_max - np_diploid - np_sex;
-    if (!np_sex) {
-      variation_in_sex = 0;
+      np_base -= np_sex;
+      if (!np_sex) {
+	variation_in_sex = 0;
+      }
     }
   } else {
     fill_all_bits(active_params, param_raw_ct_max);
@@ -3967,13 +3968,6 @@ int32_t glm_common_init(FILE* bedfile, uintptr_t bed_offset, uint32_t glm_modifi
     np_diploid = np_diploid_raw;
     np_sex = np_sex_raw;
   }
-  if (sample_valid_ct <= param_ct_max) {
-    logprint("Warning: Skipping --linear since # variables >= # samples.\n");
-    if (pheno_nm_ct > param_ct_max) {
-      logprint("(Check your covariates--all samples with at least one missing covariate are\nexcluded from this analysis.)\n");
-    }
-    goto glm_common_init_ret_1;
-  }
   // parameter sequence:
   // 1. intercept
   // 2. allelic dosage
@@ -4698,6 +4692,13 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
   if (retval) {
     goto glm_linear_assoc_ret_1;
   }
+  if (sample_valid_ct <= param_ct_max) {
+    logprint("Warning: Skipping --linear since # variables >= # samples.\n");
+    if (pheno_nm_ct > param_ct_max) {
+      logprint("(Check your covariates--all samples with at least one missing covariate are\nexcluded from this analysis.)\n");
+    }
+    goto glm_linear_assoc_ret_1;
+  }
   sample_valid_ctv2 = 2 * ((sample_valid_ct + BITCT - 1) / BITCT);
   final_mask = get_final_mask(sample_valid_ct);
   param_ctx_max_m1 = param_ctx_max - 1;
@@ -4714,6 +4715,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
     // use this array to track regression failures even in max(T) case
     fill_ulong_zero((uintptr_t*)g_perm_adapt_stop, (marker_initial_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
   } else {
+    g_perm_adapt_stop = NULL;
     ulii = (marker_initial_ct + (BITCT - 1)) / BITCT;
     if (wkspace_alloc_ul_checked(&regression_skip, ulii * sizeof(intptr_t))) {
       goto glm_linear_assoc_ret_NOMEM;
@@ -5219,12 +5221,15 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
       } else if ((!g_min_ploidy_1) || (!genotypic_or_hethom)) {
 	cur_param_ct = np_base + np_diploid;
 	if (constraint_ct_max) {
-          cur_constraint_ct = popcount_bit_idx(g_joint_test_params, 0, constraint_ct_max - np_sex);
+	  // bugfix: the last argument was previously (constraint_ct_max - np_sex)
+          cur_constraint_ct = popcount_bit_idx(g_joint_test_params, 0, cur_param_ct);
 	} else {
 	  cur_constraint_ct = 0;
 	}
 	cur_param_names = param_names;
       } else {
+	// TODO: is this branch still reachable with forced --xchr-model 0?  Should
+	// it be reachable?
 	cur_param_ct = np_base;
 	cur_constraint_ct = 0;
 	if (constraint_ct_max) {
@@ -5451,7 +5456,7 @@ int32_t glm_linear_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offset
 		goto glm_linear_assoc_ret_WRITE_FAIL;
 	      }
 	    }
-	  } else if (orig_pvals) {
+	  } else if (orig_pvals && constraint_ct_max) {
 	    orig_pvals[marker_idx3] = -9;
 	  }
 	} else {
@@ -6195,6 +6200,10 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
   uint32_t ujj;
   uint32_t ukk;
   numbuf[0] = ' ';
+  if (pheno_nm_ct < 2) {
+    logprint("Warning: Skipping --logistic since less than two phenotypes are present.\n");
+    goto glm_logistic_assoc_ret_1;
+  }
   if ((chrom_info_ptr->mt_code != -1) && is_set(chrom_info_ptr->chrom_mask, chrom_info_ptr->mt_code)) {
     hh_or_mt_exists |= NXMHH_EXISTS;
   }
@@ -6221,6 +6230,13 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
   if (retval) {
     goto glm_logistic_assoc_ret_1;
   }
+  if (sample_valid_ct <= param_ct_max) {
+    logprint("Warning: Skipping --logistic since # variables >= # samples.\n");
+    if (pheno_nm_ct > param_ct_max) {
+      logprint("(Check your covariates--all samples with at least one missing covariate are\nexcluded from this analysis.)\n");
+    }
+    goto glm_logistic_assoc_ret_1;
+  }
   sample_valid_cta4 = (sample_valid_ct + 3) & (~3);
   sample_valid_ctv2 = 2 * ((sample_valid_ct + BITCT - 1) / BITCT);
   final_mask = get_final_mask(sample_valid_ct);
@@ -6239,6 +6255,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
     // use this array to track regression failures even in max(T) case
     fill_ulong_zero((uintptr_t*)g_perm_adapt_stop, (marker_initial_ct + sizeof(intptr_t) - 1) / sizeof(intptr_t));
   } else {
+    g_perm_adapt_stop = NULL;
     ulii = (marker_initial_ct + (BITCT - 1)) / BITCT;
     if (wkspace_alloc_ul_checked(&regression_skip, ulii * sizeof(intptr_t))) {
       goto glm_logistic_assoc_ret_NOMEM;
@@ -6867,7 +6884,7 @@ int32_t glm_logistic_assoc(pthread_t* threads, FILE* bedfile, uintptr_t bed_offs
 		goto glm_logistic_assoc_ret_WRITE_FAIL;
 	      }
 	    }
-	  } else if (orig_pvals) {
+	  } else if (orig_pvals && constraint_ct_max) {
 	    orig_pvals[marker_idx3] = -9;
 	  }
 	} else {
diff --git a/plink_help.c b/plink_help.c
index 62e26f3..dc1aaf7 100644
--- a/plink_help.c
+++ b/plink_help.c
@@ -382,7 +382,7 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "  --recode <01 | 12> <23 | A{-transpose} | AD | beagle{-nomap} | bimbam{-1chr}\n"
 "           | compound-genotypes | fastphase{-1chr} | HV{-1chr} | lgen{-ref} |\n"
 "           list | oxford | rlist | structure | transpose | vcf | vcf-fid |\n"
-"           vcf-iid> <tab | tabx | spacex> <include-alt>\n"
+"           vcf-iid> <tab | tabx | spacex | bgz> <include-alt>\n"
 "    Create a new text fileset with all filters applied.  By default, the\n"
 "    fileset consists of a .ped and a .map file, readable with --file.\n"
 "    * The '12' modifier causes A1 (usually minor) alleles to be coded as '1'\n"
@@ -422,6 +422,7 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "      'vcf-fid' and 'vcf-iid' cause family IDs or within-family IDs\n"
 "      respectively to be used for the sample IDs in the last header row, while\n"
 "      'vcf' merges both IDs and puts an underscore between them.\n"
+"      If the 'bgz' modifier is added, the VCF file is block-gzipped.\n"
 "      The A2 allele is saved as the reference and normally flagged as not based\n"
 "      on a real reference genome ('PR' INFO field value).  When it is important\n"
 "      for reference alleles to be correct, you'll also want to include\n"
@@ -498,8 +499,8 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "      from the report.\n\n"
 	       );
     help_print("freq\tfreqx\tfrqx\tcounts", &help_ctrl, 1,
-"  --freq <counts>\n"
-"  --freqx\n"
+"  --freq <counts> <gz>\n"
+"  --freqx <gz>\n"
 "    --freq generates a basic allele frequency (or count, if the 'counts'\n"
 "    modifier is present) report.  This can be combined with --within/--family\n"
 "    to produce a cluster-stratified allele frequency/count report instead.\n"
@@ -507,16 +508,17 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "    with --read-freq.\n\n"
 		);
     help_print("missing", &help_ctrl, 1,
-"  --missing\n"
+"  --missing <gz>\n"
 "    Generate sample- and variant-based missing data reports.  If clusters are\n"
-"    defined, the variant-based report is cluster-stratified.\n\n"
+"    defined, the variant-based report is cluster-stratified.  'gz' causes the\n"
+"    output files to be gzipped.\n\n"
 	       );
     help_print("test-mishap", &help_ctrl, 1,
 "  --test-mishap\n"
 "    Check for association between missing calls and flanking haplotypes.\n\n"
                );
     help_print("hardy\thardy2", &help_ctrl, 1,
-"  --hardy <midp>\n"
+"  --hardy <midp> <gz>\n"
 "    Generate a Hardy-Weinberg exact test p-value report.  (This does NOT\n"
 "    simultaneously filter on the p-value any more; use --hwe for that.)  With\n"
 "    the 'midp' modifier, the test applies the mid-p adjustment described in\n"
@@ -528,7 +530,7 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "    Generate a Mendel error report.\n\n"
 	       );
     help_print("het\tibc", &help_ctrl, 1,
-"  --het <small-sample>\n"
+"  --het <small-sample> <gz>\n"
 "  --ibc\n"
 "    Estimate inbreeding coefficients.  --het reports method-of-moments\n"
 "    estimates, while --ibc calculates all three values described in Yang J, Lee\n"
@@ -1071,7 +1073,7 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 #endif
 #endif
     help_print("tdt\tpoo\tperm\tmperm\tparentdt1\tparentdt2\tpat\tmat\tset-test", &help_ctrl, 1,
-"  --tdt <exact | exact-midp | poo> <perm | mperm=[value]>\n"
+"  --tdt <exact | exact-midp | poo> <perm | mperm=[value]> <perm-count>\n"
 "        <parentdt1 | parentdt2 | pat | mat> <set-test>\n"
 "    Report transmission disequilibrium test statistics, given case/control\n"
 "    phenotypes and pedigree information.\n"
@@ -1093,17 +1095,27 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "      parent-of-origin test Z score; 'pat'/'mat' cause paternal or maternal TDT\n"
 "      chi-square statistics, respectively, to be considered instead.\n\n"
 	       );
+#ifndef STABLE_BUILD
+    help_print("dfam", &help_ctrl, 1,
+"  --dfam <no-unrelateds> <perm | mperm=[value]> <perm-count> <set-test>\n"
+"    Sib-TDT-based association test.  By default, clusters of unrelated\n"
+"    individuals are included in the test; the 'no-unrelateds' modifier removes\n"
+"    this component, leaving the original sib-TDT.\n\n"
+	       );
+#endif
     help_print("qfam\tqfam-between\tqfam-parents\tqfam-total", &help_ctrl, 1,
-"  --qfam <perm | mperm=[value]> <perm-count>\n"
-"  --qfam-parents <perm | mperm=[value]> <perm-count>\n"
-"  --qfam-between <perm | mperm=[value]> <perm-count>\n"
-"  --qfam-total <perm | mperm=[value]> <perm-count>\n"
+"  --qfam <perm | mperm=[value]> <perm-count> <emp-se>\n"
+"  --qfam-parents <perm | mperm=[value]> <perm-count> <emp-se>\n"
+"  --qfam-between <perm | mperm=[value]> <perm-count> <emp-se>\n"
+"  --qfam-total <perm | mperm=[value]> <perm-count> <emp-se>\n"
 "    QFAM family-based association test for quantitative traits.\n"
 "    * A Mendel error check is performed before the main tests; offending\n"
 "      genotypes are treated as missing by this analysis.\n"
 "    * This procedure requires permutation.  'perm' and 'perm-count' have the\n"
 "      usual meanings.  However, 'mperm=[value]' just specifies a fixed number\n"
-"      of permutations; the method does not support a proper max(T) test.\n\n"
+"      of permutations; the method does not support a proper max(T) test.\n"
+"    * The 'emp-se' modifier adds BETA and EMP_SE (empirical standard error for\n"
+"      beta) fields to the .perm output file.\n\n"
 	       );
     help_print("annotate", &help_ctrl, 1,
 "  --annotate [PLINK report] <attrib=[file]> <ranges=[file]> <filter=[file]>\n"
@@ -1240,10 +1252,10 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 #if defined __cplusplus && !defined _WIN32 && !defined STABLE_BUILD
     help_print("R\tR-debug", &help_ctrl, 1,
 "  --R [R script file] <debug>\n"
-"    Connect to a Rserve background process, and execute the Rplink function\n"
-"    defined in the input file.  (Unless the 'debug' modifier is present; in\n"
-"    that case, the R commands that PLINK would have tried to execute are logged\n"
-"    to a file.)\n\n"
+"    Connect to a Rserve (preferably version 1.7 or later) background process,\n"
+"    and execute the Rplink function defined in the input file.  (Unless the\n"
+"    'debug' modifier is present; in that case, the R commands that PLINK would\n"
+"    have tried to execute are logged to a file.)\n\n"
 	       );
 #endif
     /*
@@ -1408,7 +1420,7 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "                     using the values in the main input fileset.\n"
 "  --all-pheno      : For basic association tests, loop through all phenotypes\n"
 "                     in --pheno file.\n"
-"  --mpheno [col]   : Specify phenotype column number in --pheno file.\n"
+"  --mpheno [n]     : Load phenotype from column (n+2) in --pheno file.\n"
 "  --pheno-name [c] : If --pheno file has a header row, use column with the\n"
 "                     given name.\n"
 "  --pheno-merge    : When the main input fileset contains an phenotype value\n"
@@ -1551,6 +1563,10 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "  --bp-space [bps] : Remove variants so that each pair is no closer than the\n"
 "                     given bp distance.  (Equivalent to VCFtools --thin.)\n"
 	       );
+    help_print("thin-indiv\tthin-indiv-count\tmax-indv", &help_ctrl, 0,
+"  --thin-indiv [p]         : Randomly remove samples, retaining with prob. p.\n"
+"  --thin-indiv-count [n]   : Randomly remove samples until n of them remain.\n"
+	       );
     help_print("filter\tmfilter", &help_ctrl, 0,
 "  --filter [f] [val(s)...] : Exclude all samples without a 3rd column entry in\n"
 "                             the given file matching one of the given\n"
@@ -1777,8 +1793,9 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "                       7 = report mismatching nonmissing calls without merging\n"
 	       );
     help_print("merge\tbmerge\tmerge-list\tmerge-mode\tmerge-equal-pos", &help_ctrl, 0,
-"  --merge-equal-pos  : Merge variants with different names but identical\n"
-"                       positions.\n"
+"  --merge-equal-pos  : With --merge/--bmerge/--merge-list, merge variants with\n"
+"                       different names but identical positions.  (Exception:\n"
+"                       same-position chromosome code 0 variants aren't merged.)\n"
 	       );
     help_print("mendel-duos\tmendel-multigen\tme\tmendel\ttdt\tset-me-missing", &help_ctrl, 0,
 "  --mendel-duos      : Make Mendel error checks consider samples with only one\n"
@@ -1820,12 +1837,17 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 "                               informative pairs] ratios to be larger than this\n"
 "                               value (default 0.95).\n"
 	       );
-    help_print("distance-exp\texponent\tdistance", &help_ctrl, 0,
-"  --distance-exp [x] : When computing genomic distances, assign each variant a\n"
-"                       weight of (2q(1-q))^{-x}, where q is the inferred MAF.\n"
-"                       (Use --read-freq if you want to explicitly specify some\n"
-"                       or all of the MAFs.)\n"
+    help_print("distance-wts\tdistance-exp\texponent\tdistance", &help_ctrl, 0,
+"  --distance-wts exp=[x]        : When computing genomic distances, assign each\n"
+"                                  variant a weight of (2q(1-q))^{-x}, where q\n"
+"                                  is the loaded or inferred MAF.\n"
 	       );
+#ifndef STABLE_BUILD
+    help_print("distance-wts\tdistance\tmake-grm-gz\tmake-grm-bin", &help_ctrl, 0,
+"  --distance-wts [f] <noheader> : When computing genomic distances, assign each\n"
+"                                  variant the weight specified in the file.\n"
+	       );
+#endif
     help_print("read-dists\tload-dists\tibs-test\tgroupdist\tregress-distance\tcluster\tneighbour\tneighbor", &help_ctrl, 0,
 "  --read-dists [dist file] {id file} : Load a triangular binary distance matrix\n"
 "                                       instead of recalculating from scratch.\n"
@@ -2148,7 +2170,11 @@ int32_t disp_help(uint32_t param_ct, char** argv) {
 #endif
     if (!param_ct) {
       fputs(
-"\nFor further documentation and support, consult the main webpage\n"
+"\nPrimary methods paper:\n"
+"Chang CC, Chow CC, Tellier LCAM, Vattikuti S, Purcell SM, Lee JJ (2015)\n"
+"Second-generation PLINK: rising to the challenge of larger and richer datasets.\n"
+"GigaScience, 4.\n\n"
+"For further documentation and support, consult the main webpage\n"
 "(https://www.cog-genomics.org/plink2 ) and/or the mailing list\n"
 "(https://groups.google.com/d/forum/plink2-users ).\n"
 , stdout);
diff --git a/plink_ld.c b/plink_ld.c
index 2c22be4..fb97c82 100644
--- a/plink_ld.c
+++ b/plink_ld.c
@@ -2156,11 +2156,11 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
   int32_t retval = 0;
   unsigned char* wkspace_mark2;
   uintptr_t* ulptr;
+  unsigned char* overflow_buf;
   uint64_t tests_completed;
   uintptr_t thread_workload;
   uintptr_t cur_idx2_block_size;
   uintptr_t marker_idx2_end;
-  uintptr_t marker_uidx1_tmp;
   uintptr_t block_idx1;
   uintptr_t marker_uidx2;
   uintptr_t marker_idx2;
@@ -2173,6 +2173,9 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
   uint32_t chrom_end;
   uint32_t is_last_block;
 
+  if (wkspace_alloc_uc_checked(&overflow_buf, 262144)) {
+    goto ld_report_matrix_ret_NOMEM;
+  }
   if (output_single_prec) {
     // force divisibility by 16 instead (cacheline = 64 bytes, float = 4)
     marker_ctm8 = (marker_ctm8 + 8) & (~15);
@@ -2322,26 +2325,26 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
       }
     }
     g_ld_idx1_block_size = idx1_block_size;
-    marker_uidx1_tmp = marker_uidx1;
+    // marker_uidx1_tmp = marker_uidx1;
     if (fseeko(bedfile, bed_offset + (marker_uidx1 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
       goto ld_report_matrix_ret_READ_FAIL;
     }
     chrom_end = 0;
-    for (block_idx1 = 0; block_idx1 < idx1_block_size; marker_uidx1_tmp++, block_idx1++) {
-      if (IS_SET(marker_exclude, marker_uidx1_tmp)) {
-        marker_uidx1_tmp = next_unset_ul_unsafe(marker_exclude, marker_uidx1_tmp);
-        if (fseeko(bedfile, bed_offset + (marker_uidx1_tmp * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
+    for (block_idx1 = 0; block_idx1 < idx1_block_size; marker_uidx1++, block_idx1++) {
+      if (IS_SET(marker_exclude, marker_uidx1)) {
+        marker_uidx1 = next_unset_ul_unsafe(marker_exclude, marker_uidx1);
+        if (fseeko(bedfile, bed_offset + (marker_uidx1 * ((uint64_t)unfiltered_sample_ct4)), SEEK_SET)) {
 	  goto ld_report_matrix_ret_READ_FAIL;
 	}
       }
-      if (marker_uidx1_tmp >= chrom_end) {
-        chrom_fo_idx = get_marker_chrom_fo_idx(chrom_info_ptr, marker_uidx1_tmp);
+      if (marker_uidx1 >= chrom_end) {
+        chrom_fo_idx = get_marker_chrom_fo_idx(chrom_info_ptr, marker_uidx1);
         chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
         is_haploid = IS_SET(chrom_info_ptr->haploid_mask, chrom_idx);
 	is_x = (((int32_t)chrom_idx) == chrom_info_ptr->x_code);
 	is_y = (((int32_t)chrom_idx) == chrom_info_ptr->y_code);
       }
-      if (load_and_collapse_incl(bedfile, loadbuf, unfiltered_sample_ct, &(g_ld_geno1[block_idx1 * founder_ct_192_long]), founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx1_tmp))) {
+      if (load_and_collapse_incl(bedfile, loadbuf, unfiltered_sample_ct, &(g_ld_geno1[block_idx1 * founder_ct_192_long]), founder_ct, founder_info, final_mask, IS_SET(marker_reverse, marker_uidx1))) {
 	goto ld_report_matrix_ret_READ_FAIL;
       }
       if (is_haploid && hh_exists) {
@@ -2477,9 +2480,9 @@ int32_t ld_report_matrix(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
 	g_ld_idx2_block_size = marker_idx1 + 1;
       }
       if (output_gz) {
-        parallel_compress(outname, not_first_write, ld_matrix_emitn);
+        parallel_compress(outname, overflow_buf, not_first_write, ld_matrix_emitn);
       } else {
-        write_uncompressed(outname, not_first_write, ld_matrix_emitn);
+        write_uncompressed(outname, overflow_buf, not_first_write, ld_matrix_emitn);
       }
       not_first_write = 1;
     }
@@ -3048,7 +3051,7 @@ static void two_locus_count_table_zmiss1(uintptr_t* lptr1, uintptr_t* lptr2, uin
   counts_3x3[1] = popcount_longs_intersect(lptr1, &(lptr2[sample_ctv3]), sample_ctv3);
   if (!is_zmiss2) {
     counts_3x3[2] = popcount_longs_intersect(lptr1, &(lptr2[2 * sample_ctv3]), sample_ctv3);
-    counts_3x3[5] = popcount_longs_intersect(&(lptr1[2 * sample_ctv3]), &(lptr2[2 * sample_ctv3]), sample_ctv3);
+    counts_3x3[5] = popcount_longs_intersect(&(lptr1[sample_ctv3]), &(lptr2[2 * sample_ctv3]), sample_ctv3);
   }
   lptr1 = &(lptr1[sample_ctv3]);
   counts_3x3[3] = popcount_longs_intersect(lptr1, lptr2, sample_ctv3);
@@ -5182,6 +5185,7 @@ int32_t ld_report_dprime(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
   uintptr_t* dummy_nm;
   uintptr_t* ulptr;
   uint32_t* uiptr;
+  unsigned char* overflow_buf;
   unsigned char* wkspace_mark2;
   uintptr_t thread_workload;
   uintptr_t idx1_block_size;
@@ -5202,7 +5206,8 @@ int32_t ld_report_dprime(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
   uint32_t cur_marker_pos;
   uint32_t is_last_block;
   uint32_t uii;
-  if (wkspace_alloc_ul_checked(&loadbuf, founder_ctl * 2 * sizeof(intptr_t)) ||
+  if (wkspace_alloc_uc_checked(&overflow_buf, 262144) ||
+      wkspace_alloc_ul_checked(&loadbuf, founder_ctl * 2 * sizeof(intptr_t)) ||
       wkspace_alloc_ul_checked(&dummy_nm, founder_ctl * sizeof(intptr_t))) {
     goto ld_report_dprime_ret_NOMEM;
   }
@@ -5488,9 +5493,9 @@ int32_t ld_report_dprime(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uintp
     g_ld_idx2_block_start = 0;
     g_ld_block_idx2 = 0;
     if (output_gz) {
-      parallel_compress(outname, not_first_write, ld_regular_emitn);
+      parallel_compress(outname, overflow_buf, not_first_write, ld_regular_emitn);
     } else {
-      write_uncompressed(outname, not_first_write, ld_regular_emitn);
+      write_uncompressed(outname, overflow_buf, not_first_write, ld_regular_emitn);
     }
     not_first_write = 1;
     g_ld_is_first_block = 0;
@@ -5573,6 +5578,7 @@ int32_t ld_report_regular(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uint
   uint32_t chrom_last = 0;
   int32_t retval = 0;
   unsigned char* wkspace_mark2;
+  unsigned char* overflow_buf;
   uint32_t* id_map;
   char* sorted_ids;
   char* bufptr;
@@ -5608,6 +5614,9 @@ int32_t ld_report_regular(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uint
   uint32_t is_last_block;
   uint32_t uii;
   int32_t ii;
+  if (wkspace_alloc_uc_checked(&overflow_buf, 262144)) {
+    goto ld_report_regular_ret_NOMEM;
+  }
   if (idx1_subset) {
     if (wkspace_alloc_ul_checked(&marker_exclude_idx1, unfiltered_marker_ctl * sizeof(intptr_t))) {
       goto ld_report_regular_ret_NOMEM;
@@ -5980,9 +5989,9 @@ int32_t ld_report_regular(pthread_t* threads, Ld_info* ldip, FILE* bedfile, uint
     g_ld_idx2_block_start = 0;
     g_ld_block_idx2 = 0;
     if (output_gz) {
-      parallel_compress(outname, not_first_write, ld_regular_emitn);
+      parallel_compress(outname, overflow_buf, not_first_write, ld_regular_emitn);
     } else {
-      write_uncompressed(outname, not_first_write, ld_regular_emitn);
+      write_uncompressed(outname, overflow_buf, not_first_write, ld_regular_emitn);
     }
     not_first_write = 1;
     g_ld_is_first_block = 0;
diff --git a/plink_misc.c b/plink_misc.c
index 70a6fe4..9ec831d 100644
--- a/plink_misc.c
+++ b/plink_misc.c
@@ -3,6 +3,8 @@
 #include "plink_misc.h"
 #include "plink_stats.h"
 
+#include "pigz.h"
+
 void misc_init(Score_info* sc_ip) {
   sc_ip->fname = NULL;
   sc_ip->range_fname = NULL;
@@ -1712,20 +1714,6 @@ uint32_t calc_plink_maxsnp(uint32_t unfiltered_marker_ct, uintptr_t* marker_excl
   return plink_maxsnp;
 }
 
-double calc_wt_mean(double exponent, int32_t lhi, int32_t lli, int32_t hhi) {
-  double lcount = (double)lli + ((double)lhi * 0.5);
-  int64_t tot = lhi + lli + hhi;
-  double dtot = (double)tot;
-  int64_t subcount = lli; // avoid 32-bit integer overflow
-  double weight;
-  if ((!lhi) && ((!lli) || (!hhi))) {
-    return 0.0;
-  }
-  weight = pow(2 * lcount * (dtot - lcount) / (dtot * dtot), -exponent);
-  subcount = lhi * (subcount + hhi) + 2 * subcount * hhi;
-  return (subcount * weight * 2) / (double)(tot * tot);
-}
-
 // aptr1 = minor, aptr2 = major
 int32_t load_one_freq(uint32_t alen1, const char* aptr1, uint32_t alen2, const char* aptr2, double maf, double* set_allele_freq_ptr, char** mastrs_ptr, char missing_geno) {
   uint32_t malen0 = strlen(mastrs_ptr[0]);
@@ -1848,7 +1836,7 @@ uint32_t get_freq_file_type(char* bufptr) {
   return 0;
 }
 
-int32_t read_external_freqs(char* freqname, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, Chrom_info* chrom_info_ptr, char** marker_allele_ptrs, double* set_allele_freqs, uint32_t* nchrobs, uint32_t maf_succ, double exponent, uint32_t wt_needed, double* marker_weights) {
+int32_t read_external_freqs(char* freqname, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, Chrom_info* chrom_info_ptr, char** marker_allele_ptrs, double* set_allele_freqs, uint32_t* nchrobs, uint32_t maf_succ) {
   unsigned char* wkspace_mark = wkspace_base;
   FILE* freqfile = NULL;
   uintptr_t line_idx = 0;
@@ -1994,9 +1982,6 @@ int32_t read_external_freqs(char* freqname, uintptr_t unfiltered_marker_ct, uint
 	  if (retval) {
 	    goto read_external_freqs_ret_ALLELE_MISMATCH;
 	  }
-	  if (wt_needed) {
-	    marker_weights[marker_uidx] = calc_wt_mean_maf(exponent, set_allele_freqs[marker_uidx]);
-	  }
         }
       }
     }
@@ -2080,13 +2065,6 @@ int32_t read_external_freqs(char* freqname, uintptr_t unfiltered_marker_ct, uint
 	  if (retval) {
 	    goto read_external_freqs_ret_ALLELE_MISMATCH;
 	  }
-	  if (wt_needed) {
-	    if (c_hap_a1 || c_hap_a2) {
-	      marker_weights[marker_uidx] = calc_wt_mean_maf(exponent, set_allele_freqs[marker_uidx]);
-	    } else {
-	      marker_weights[marker_uidx] = calc_wt_mean(exponent, c_het, c_hom_a1, c_hom_a2);
-	    }
-	  }
         }
       }
     }
@@ -2134,9 +2112,6 @@ int32_t read_external_freqs(char* freqname, uintptr_t unfiltered_marker_ct, uint
 	if (retval) {
 	  goto read_external_freqs_ret_ALLELE_MISMATCH;
 	}
-	if (wt_needed) {
-	  marker_weights[marker_uidx] = calc_wt_mean_maf(exponent, set_allele_freqs[marker_uidx]);
-	}
       } else {
 	// if there aren't exactly 3 columns, this isn't a GCTA .freq file
 	bufptr = next_token(bufptr);
@@ -2356,9 +2331,10 @@ int32_t load_ax_alleles(Two_col_params* axalleles, uintptr_t unfiltered_marker_c
   return retval;
 }
 
-int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outname, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uint32_t sample_f_ct, uintptr_t* founder_info, uint32_t nonfounders, uintptr_t* sex_male, uint32_t sample_f_male_ct, uintptr_t* marker_reve [...]
+int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uint32_t sample_f_ct, uintptr_t* founder_info, uint32_t nonfounders, uintptr_t* sex_male, uint32_t s [...]
   unsigned char* wkspace_mark = wkspace_base;
-  FILE* outfile = NULL;
+  char* writebuf = tbuf;
+  char* pzwritep = NULL;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
   uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
   uint32_t* cur_cluster_map = cluster_map;
@@ -2371,10 +2347,12 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
   uint32_t cslen = 10;
   int32_t retval = 0;
   uint32_t cur_cts[4];
+  Pigz_state ps;
   uintptr_t* readbuf;
   uint32_t* uiptr;
   uint32_t* uiptr2;
   uint32_t* uiptr3;
+  unsigned char* overflow_buf;
   char* csptr;
   char* col_2_start;
   char* wptr_start;
@@ -2391,9 +2369,17 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
   uint32_t a1_obs;
   uint32_t tot_obs;
   uint32_t uii;
-  if (wkspace_alloc_ul_checked(&readbuf, unfiltered_sample_ctl2 * sizeof(intptr_t))) {
+  pzwrite_init_null(&ps);
+  uii = 2 * max_marker_allele_len + max_marker_id_len + max_cluster_id_len + 256;
+  if (wkspace_alloc_uc_checked(&overflow_buf, uii + PIGZ_BLOCK_SIZE) ||
+      wkspace_alloc_ul_checked(&readbuf, unfiltered_sample_ctl2 * sizeof(intptr_t))) {
     goto write_stratified_freqs_ret_NOMEM;
   }
+  if (uii > MAXLINELEN) {
+    if (wkspace_alloc_c_checked(&writebuf, uii)) {
+      goto write_stratified_freqs_ret_NOMEM;
+    }
+  }
   if ((sample_ct > sample_f_ct) && (!nonfounders)) {
     if (wkspace_alloc_ui_checked(&cur_cluster_starts, (cluster_ct + 1) * sizeof(int32_t)) ||
         wkspace_alloc_ui_checked(&cur_cluster_map, sample_f_ct * sizeof(int32_t))) {
@@ -2453,11 +2439,13 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
       cluster_starts_male[clidx + 1] = clmpos;
     }
   }
-  if (fopen_checked(&outfile, outname, "w")) {
+  memcpy(outname_end, output_gz? ".frq.strat.gz" : ".frq.strat", output_gz? 14 : 11);
+  if (flex_pzwrite_init(output_gz, outname, overflow_buf, 0, &ps)) {
     goto write_stratified_freqs_ret_OPEN_FAIL;
   }
-  sprintf(tbuf, " CHR %%%ds     CLST   A1   A2      MAF    MAC  NCHROBS\n", plink_maxsnp);
-  fprintf(outfile, tbuf, "SNP");
+  pzwritep = (char*)overflow_buf;
+  sprintf(tbuf, " CHR %%%us     CLST   A1   A2      MAF    MAC  NCHROBS" EOLN_STR, plink_maxsnp);
+  pzwritep += sprintf(pzwritep, tbuf, "SNP");
   if (wkspace_alloc_c_checked(&csptr, 2 * max_marker_allele_len + 16)) {
     goto write_stratified_freqs_ret_NOMEM;
   }
@@ -2477,7 +2465,7 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
     if (fseeko(bedfile, bed_offset + ((uint64_t)marker_uidx) * unfiltered_sample_ct4, SEEK_SET)) {
       goto write_stratified_freqs_ret_READ_FAIL;
     }
-    col_2_start = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, chrom_idx));
+    col_2_start = width_force(4, writebuf, chrom_name_write(writebuf, chrom_info_ptr, chrom_idx));
     *col_2_start++ = ' ';
     do {
       sptr = &(marker_ids[marker_uidx * max_marker_id_len]);
@@ -2502,8 +2490,9 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
 	uiptr = cluster_map_nonmale;
 	uiptr2 = cluster_map_male;
 	for (clidx = 0; clidx < cluster_ct; clidx++) {
-	  wptr = fw_strcpy(8, &(cluster_ids[clidx * max_cluster_id_len]), wptr_start);
-	  wptr = memcpyax(wptr, csptr, cslen, ' ');
+          pzwritep = memcpya(pzwritep, writebuf, wptr_start - writebuf);
+	  pzwritep = fw_strcpy(8, &(cluster_ids[clidx * max_cluster_id_len]), pzwritep);
+	  pzwritep = memcpya(pzwritep, csptr, cslen);
 	  fill_uint_zero(cur_cts, 4);
 	  uiptr3 = &(cluster_map_nonmale[cluster_starts_nonmale[clidx + 1]]);
 	  while (uiptr < uiptr3) {
@@ -2521,22 +2510,24 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
 	  a1_obs += cur_cts[0];
 	  tot_obs += cur_cts[0] + cur_cts[3];
 	  if (tot_obs) {
-            wptr = double_g_writewx4x(wptr, ((double)((int32_t)a1_obs)) / ((double)tot_obs), 8, ' ');
-	    wptr = uint32_writew6x(wptr, a1_obs, ' ');
-	    wptr = uint32_writew8(wptr, tot_obs);
-	    wptr = memcpya(wptr, " \n", 2);
+            pzwritep = double_g_writewx4x(pzwritep, ((double)((int32_t)a1_obs)) / ((double)tot_obs), 8, ' ');
+	    pzwritep = uint32_writew6x(pzwritep, a1_obs, ' ');
+	    pzwritep = uint32_writew8(pzwritep, tot_obs);
+	    *pzwritep++ = ' ';
 	  } else {
-	    wptr = memcpya(wptr, "       0      0        0 \n", 26);
+	    pzwritep = memcpya(pzwritep, "       0      0        0 ", 25);
 	  }
-	  if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	  append_binary_eoln(&pzwritep);
+	  if (flex_pzwrite(&ps, &pzwritep)) {
 	    goto write_stratified_freqs_ret_WRITE_FAIL;
 	  }
 	}
       } else if (is_y) {
 	uiptr = cluster_map_male;
 	for (clidx = 0; clidx < cluster_ct; clidx++) {
-	  wptr = fw_strcpy(8, &(cluster_ids[clidx * max_cluster_id_len]), wptr_start);
-	  wptr = memcpyax(wptr, csptr, cslen, ' ');
+	  pzwritep = memcpya(pzwritep, writebuf, wptr_start - writebuf);
+	  pzwritep = fw_strcpy(8, &(cluster_ids[clidx * max_cluster_id_len]), pzwritep);
+	  pzwritep = memcpya(pzwritep, csptr, cslen);
 	  fill_uint_zero(cur_cts, 4);
 	  uiptr2 = &(cluster_map_male[cluster_starts_male[clidx + 1]]);
 	  while (uiptr < uiptr2) {
@@ -2551,22 +2542,24 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
 	    tot_obs = 2 * (cur_cts[0] + cur_cts[2] + cur_cts[3]);
 	  }
 	  if (tot_obs) {
-            wptr = double_g_writewx4x(wptr, ((double)((int32_t)a1_obs)) / ((double)tot_obs), 8, ' ');
-	    wptr = uint32_writew6x(wptr, a1_obs, ' ');
-	    wptr = uint32_writew8(wptr, tot_obs);
-	    wptr = memcpya(wptr, " \n", 2);
+            pzwritep = double_g_writewx4x(pzwritep, ((double)((int32_t)a1_obs)) / ((double)tot_obs), 8, ' ');
+	    pzwritep = uint32_writew6x(pzwritep, a1_obs, ' ');
+	    pzwritep = uint32_writew8(pzwritep, tot_obs);
+	    *pzwritep++ = ' ';
 	  } else {
-	    wptr = memcpya(wptr, "       0      0        0 \n", 26);
+	    pzwritep = memcpya(pzwritep, "       0      0        0 ", 25);
 	  }
-	  if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	  append_binary_eoln(&pzwritep);
+	  if (flex_pzwrite(&ps, &pzwritep)) {
 	    goto write_stratified_freqs_ret_WRITE_FAIL;
 	  }
 	}
       } else {
         uiptr = cur_cluster_map;
 	for (clidx = 0; clidx < cluster_ct; clidx++) {
-	  wptr = fw_strcpy(8, &(cluster_ids[clidx * max_cluster_id_len]), wptr_start);
-	  wptr = memcpyax(wptr, csptr, cslen, ' ');
+	  pzwritep = memcpya(pzwritep, writebuf, wptr_start - writebuf);
+	  pzwritep = fw_strcpy(8, &(cluster_ids[clidx * max_cluster_id_len]), pzwritep);
+	  pzwritep = memcpya(pzwritep, csptr, cslen);
 	  fill_uint_zero(cur_cts, 4);
 	  uiptr2 = &(cur_cluster_map[cur_cluster_starts[clidx + 1]]);
 	  while (uiptr < uiptr2) {
@@ -2581,14 +2574,15 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
 	    tot_obs = 2 * (cur_cts[0] + cur_cts[2] + cur_cts[3]);
 	  }
 	  if (tot_obs) {
-            wptr = double_g_writewx4x(wptr, ((double)((int32_t)a1_obs)) / ((double)tot_obs), 8, ' ');
-	    wptr = uint32_writew6x(wptr, a1_obs, ' ');
-	    wptr = uint32_writew8(wptr, tot_obs);
-	    wptr = memcpya(wptr, " \n", 2);
+            pzwritep = double_g_writewx4x(pzwritep, ((double)((int32_t)a1_obs)) / ((double)tot_obs), 8, ' ');
+	    pzwritep = uint32_writew6x(pzwritep, a1_obs, ' ');
+	    pzwritep = uint32_writew8(pzwritep, tot_obs);
+	    *pzwritep++ = ' ';
 	  } else {
-	    wptr = memcpya(wptr, "       0      0        0 \n", 26);
+	    pzwritep = memcpya(pzwritep, "       0      0        0 ", 25);
 	  }
-	  if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+	  append_binary_eoln(&pzwritep);
+	  if (flex_pzwrite(&ps, &pzwritep)) {
 	    goto write_stratified_freqs_ret_WRITE_FAIL;
 	  }
 	}
@@ -2602,7 +2596,7 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
       }
     } while (marker_uidx < chrom_end);
   }
-  if (fclose_null(&outfile)) {
+  if (flex_pzwrite_close_null(&ps, pzwritep)) {
     goto write_stratified_freqs_ret_WRITE_FAIL;
   }
   LOGPRINTFWW("--freq: Cluster-stratified allele frequencies (%s) written to %s .\n", nonfounders? "all samples" : "founders only", outname);
@@ -2621,18 +2615,22 @@ int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outnam
     break;
   }
   wkspace_reset(wkspace_mark);
-  fclose_cond(outfile);
+  flex_pzwrite_close_cond(&ps, pzwritep);
   return retval;
 }
 
-int32_t write_freqs(char* outname, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, double* set_allele_freqs, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, int32_t* ll_cts, int32_t* lh_cts, int32_t* hh_cts, int32_t* hapl_cts, int32_t* haph_cts, uint32_t sample_f_ct, uint32_t sample_f_male_ct, uint32_t nonfounders, uint64_t misc_flags, uintptr_t* marker_reverse) {
-  FILE* outfile = NULL;
+int32_t write_freqs(char* outname, char* outname_end, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, double* set_allele_freqs, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, int32_t* ll_cts, int32_t* lh_cts, int32_t* hh_cts, int32_t* hapl_cts, int32_t* haph_cts, uint32_t sample_f_ct, uint32_t sample_f_male_ct, uint32_t nonfounders, uint64_t misc_flags, uintptr_t* mar [...]
+  unsigned char* wkspace_mark = wkspace_base;
+  char* pzwritep = NULL;
   uint32_t reverse = 0;
   uint32_t freq_counts = (misc_flags / MISC_FREQ_COUNTS) & 1;
   uint32_t freqx = (misc_flags / MISC_FREQX) & 1;
+  uint32_t output_gz = (misc_flags / MISC_FREQ_GZ) & 1;
   uint32_t maf_succ = (misc_flags / MISC_MAF_SUCC) & 1;
   int32_t chrom_code_end = chrom_info_ptr->max_code + 1 + chrom_info_ptr->name_ct;
   int32_t retval = 0;
+  Pigz_state ps;
+  unsigned char* overflow_buf;
   char* minor_ptr;
   char* major_ptr;
   char* bufptr;
@@ -2644,32 +2642,33 @@ int32_t write_freqs(char* outname, uint32_t plink_maxsnp, uintptr_t unfiltered_m
   uint32_t missing_ct;
   int32_t chrom_idx;
   uint32_t uii;
-  if (fopen_checked(&outfile, outname, "w")) {
-    goto write_freqs_ret_OPEN_FAIL;
+  pzwrite_init_null(&ps);
+  if (wkspace_alloc_uc_checked(&overflow_buf, PIGZ_BLOCK_SIZE + 2 * max_marker_allele_len + MAXLINELEN)) {
+    goto write_freqs_ret_NOMEM;
   }
+
+  bufptr = memcpya(outname_end, ".frq", 4);
   if (freqx) {
-    if (fputs_checked("CHR\tSNP\tA1\tA2\tC(HOM A1)\tC(HET)\tC(HOM A2)\tC(HAP A1)\tC(HAP A2)\tC(MISSING)\n", outfile)) {
-      goto write_freqs_ret_WRITE_FAIL;
-    }
-  } else if (plink_maxsnp < 5) {
-    if (freq_counts) {
-      if (fputs_checked(" CHR  SNP   A1   A2     C1     C2     G0\n", outfile)) {
-	goto write_freqs_ret_WRITE_FAIL;
-      }
-    } else {
-      if (fputs_checked(" CHR  SNP   A1   A2          MAF  NCHROBS\n", outfile)) {
-        goto write_freqs_ret_WRITE_FAIL;
-      }
-    }
+    *bufptr++ = 'x';
   } else if (freq_counts) {
-    sprintf(tbuf, " CHR %%%us   A1   A2     C1     C2     G0\n", plink_maxsnp);
-    fprintf(outfile, tbuf, "SNP");
+    bufptr = memcpya(bufptr, ".counts", 7);
+  }
+  if (!output_gz) {
+    *bufptr = '\0';
   } else {
-    sprintf(tbuf, " CHR %%%us   A1   A2          MAF  NCHROBS\n", plink_maxsnp);
-    fprintf(outfile, tbuf, "SNP");
+    memcpy(bufptr, ".gz", 4);
   }
-  if (ferror(outfile)) {
-    goto write_freqs_ret_WRITE_FAIL;
+  if (flex_pzwrite_init(output_gz, outname, overflow_buf, 0, &ps)) {
+    goto write_freqs_ret_OPEN_FAIL;
+  }
+  pzwritep = (char*)overflow_buf;
+  if (freqx) {
+    pzwritep = strcpya(pzwritep, "CHR\tSNP\tA1\tA2\tC(HOM A1)\tC(HET)\tC(HOM A2)\tC(HAP A1)\tC(HAP A2)\tC(MISSING)" EOLN_STR);
+  } else if (plink_maxsnp < 5) {
+    pzwritep = strcpya(pzwritep, freq_counts? (" CHR  SNP   A1   A2     C1     C2     G0" EOLN_STR) : (" CHR  SNP   A1   A2          MAF  NCHROBS" EOLN_STR));
+  } else {
+    sprintf(tbuf, freq_counts? (" CHR %%%us   A1   A2     C1     C2     G0" EOLN_STR) : (" CHR %%%us   A1   A2          MAF  NCHROBS" EOLN_STR), plink_maxsnp);
+    pzwritep += sprintf(pzwritep, tbuf, "SNP");
   }
   for (chrom_idx = 0; chrom_idx < chrom_code_end; chrom_idx++) {
     if (!chrom_exists(chrom_info_ptr, chrom_idx)) {
@@ -2697,67 +2696,63 @@ int32_t write_freqs(char* outname, uint32_t plink_maxsnp, uintptr_t unfiltered_m
 	  missing_ct = sample_f_ct - (ll_cts[marker_uidx] + lh_cts[marker_uidx] + hh_cts[marker_uidx]);
 	}
 	if (freqx) {
-	  bufptr = chrom_name_write(tbuf, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx));
-	  *bufptr++ = '\t';
-	  bufptr = strcpyax(bufptr, &(marker_ids[marker_uidx * max_marker_id_len]), '\t');
-	  fwrite(tbuf, 1, bufptr - tbuf, outfile);
-          fputs(minor_ptr, outfile);
-	  putc('\t', outfile);
-          fputs(major_ptr, outfile);
-	  tbuf[0] = '\t';
-          bufptr = uint32_writex(&(tbuf[1]), reverse? hh_cts[marker_uidx] : ll_cts[marker_uidx], '\t');
-	  bufptr = uint32_writex(bufptr, lh_cts[marker_uidx], '\t');
-          bufptr = uint32_writex(bufptr, reverse? ll_cts[marker_uidx] : hh_cts[marker_uidx], '\t');
-          bufptr = uint32_writex(bufptr, reverse? haph_cts[marker_uidx] : hapl_cts[marker_uidx], '\t');
-          bufptr = uint32_writex(bufptr, reverse? hapl_cts[marker_uidx] : haph_cts[marker_uidx], '\t');
-          bufptr = uint32_writex(bufptr, missing_ct, '\n');
-	  fwrite(tbuf, 1, bufptr - tbuf, outfile);
+	  pzwritep = chrom_name_write(pzwritep, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx));
+	  *pzwritep++ = '\t';
+	  pzwritep = strcpyax(pzwritep, &(marker_ids[marker_uidx * max_marker_id_len]), '\t');
+	  pzwritep = strcpyax(pzwritep, minor_ptr, '\t');
+          pzwritep = strcpyax(pzwritep, major_ptr, '\t');
+          pzwritep = uint32_writex(pzwritep, reverse? hh_cts[marker_uidx] : ll_cts[marker_uidx], '\t');
+	  pzwritep = uint32_writex(pzwritep, lh_cts[marker_uidx], '\t');
+          pzwritep = uint32_writex(pzwritep, reverse? ll_cts[marker_uidx] : hh_cts[marker_uidx], '\t');
+          pzwritep = uint32_writex(pzwritep, reverse? haph_cts[marker_uidx] : hapl_cts[marker_uidx], '\t');
+          pzwritep = uint32_writex(pzwritep, reverse? hapl_cts[marker_uidx] : haph_cts[marker_uidx], '\t');
+          pzwritep = uint32_write(pzwritep, missing_ct);
 	} else {
-	  bufptr = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx)));
-	  *bufptr++ = ' ';
-	  bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), bufptr);
-	  *bufptr++ = ' ';
-	  fwrite(tbuf, 1, bufptr - tbuf, outfile);
-	  fputs_w4(minor_ptr, outfile);
-          putc(' ', outfile);
-          fputs_w4(major_ptr, outfile);
-	  tbuf[0] = ' ';
-          bufptr = uint32_writew6x(&(tbuf[1]), 2 * ll_cts[marker_uidx] + lh_cts[marker_uidx] + hapl_cts[marker_uidx], ' ');
-	  bufptr = uint32_writew6x(bufptr, 2 * hh_cts[marker_uidx] + lh_cts[marker_uidx] + haph_cts[marker_uidx], ' ');
-	  bufptr = uint32_writew6x(bufptr, missing_ct, '\n');
-	  fwrite(tbuf, 1, bufptr - tbuf, outfile);
+	  pzwritep = width_force(4, pzwritep, chrom_name_write(pzwritep, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx)));
+	  *pzwritep++ = ' ';
+	  pzwritep = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), pzwritep);
+	  *pzwritep++ = ' ';
+	  pzwritep = fw_strcpy(4, minor_ptr, pzwritep);
+	  *pzwritep++ = ' ';
+	  pzwritep = fw_strcpy(4, major_ptr, pzwritep);
+	  *pzwritep++ = ' ';
+          pzwritep = uint32_writew6x(pzwritep, 2 * ll_cts[marker_uidx] + lh_cts[marker_uidx] + hapl_cts[marker_uidx], ' ');
+	  pzwritep = uint32_writew6x(pzwritep, 2 * hh_cts[marker_uidx] + lh_cts[marker_uidx] + haph_cts[marker_uidx], ' ');
+	  pzwritep = uint32_writew6(pzwritep, missing_ct);
 	}
       } else {
-	bufptr = width_force(4, tbuf, chrom_name_write(tbuf, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx)));
-	*bufptr++ = ' ';
-	bufptr = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), bufptr);
-        *bufptr++ = ' ';
-	fwrite(tbuf, 1, bufptr - tbuf, outfile);
-	fputs_w4(minor_ptr, outfile);
-	putc(' ', outfile);
-        fputs_w4(major_ptr, outfile);
-        tbuf[0] = ' ';
+	pzwritep = width_force(4, pzwritep, chrom_name_write(pzwritep, chrom_info_ptr, get_marker_chrom(chrom_info_ptr, marker_uidx)));
+	*pzwritep++ = ' ';
+	pzwritep = fw_strcpy(plink_maxsnp, &(marker_ids[marker_uidx * max_marker_id_len]), pzwritep);
+        *pzwritep++ = ' ';
+	pzwritep = fw_strcpy(4, minor_ptr, pzwritep);
+	*pzwritep++ = ' ';
+	pzwritep = fw_strcpy(4, major_ptr, pzwritep);
+	*pzwritep++ = ' ';
 	uii = 2 * (ll_cts[marker_uidx] + lh_cts[marker_uidx] + hh_cts[marker_uidx]) + hapl_cts[marker_uidx] + haph_cts[marker_uidx];
 	if (maf_succ || uii || (set_allele_freqs[marker_uidx] != 0.5)) {
-          bufptr = double_g_writewx4(&(tbuf[1]), 1.0 - set_allele_freqs[marker_uidx], 12);
+          pzwritep = double_g_writewx4(pzwritep, 1.0 - set_allele_freqs[marker_uidx], 12);
 	} else {
-	  bufptr = memcpya(&(tbuf[1]), "          NA", 12);
+	  pzwritep = memcpya(pzwritep, "          NA", 12);
 	}
-	*bufptr++ = ' ';
-        bufptr = uint32_writew8x(bufptr, uii, '\n');
-	fwrite(tbuf, 1, bufptr - tbuf, outfile);
+	*pzwritep++ = ' ';
+        pzwritep = uint32_writew8(pzwritep, uii);
       }
-      if (ferror(outfile)) {
+      append_binary_eoln(&pzwritep);
+      if (flex_pzwrite(&ps, &pzwritep)) {
 	goto write_freqs_ret_WRITE_FAIL;
       }
       marker_uidx = next_unset(marker_exclude, marker_uidx + 1, chrom_end);
     }
   }
-  if (fclose_null(&outfile)) {
+  if (flex_pzwrite_close_null(&ps, pzwritep)) {
     goto write_freqs_ret_WRITE_FAIL;
   }
   LOGPRINTFWW("--freq%s: Allele frequencies (%s) written to %s .\n", freqx? "x" : "", nonfounders? "all samples" : "founders only", outname);
   while (0) {
+  write_freqs_ret_NOMEM:
+    retval = RET_NOMEM;
+    break;
   write_freqs_ret_OPEN_FAIL:
     retval = RET_OPEN_FAIL;
     break;
@@ -2765,26 +2760,11 @@ int32_t write_freqs(char* outname, uint32_t plink_maxsnp, uintptr_t unfiltered_m
     retval = RET_WRITE_FAIL;
     break;
   }
-  fclose_cond(outfile);
+  flex_pzwrite_close_cond(&ps, pzwritep);
+  wkspace_reset(wkspace_mark);
   return retval;
 }
 
-void calc_marker_weights(double exponent, uint32_t unfiltered_marker_ct, uintptr_t* marker_exclude, uint32_t marker_ct, int32_t* ll_cts, int32_t* lh_cts, int32_t* hh_cts, double* marker_weights) {
-  uint32_t marker_uidx = 0;
-  uint32_t markers_done = 0;
-  uint32_t marker_uidx_stop;
-  do {
-    marker_uidx = next_unset_unsafe(marker_exclude, marker_uidx);
-    marker_uidx_stop = next_set(marker_exclude, marker_uidx, unfiltered_marker_ct);
-    markers_done += marker_uidx_stop - marker_uidx;
-    do {
-      if (marker_weights[marker_uidx] < 0.0) {
-	marker_weights[marker_uidx] = calc_wt_mean(exponent, lh_cts[marker_uidx], ll_cts[marker_uidx], hh_cts[marker_uidx]);
-      }
-    } while (++marker_uidx < marker_uidx_stop);
-  } while (markers_done < marker_ct);
-}
-
 int32_t sexcheck(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, uintptr_t* sex_nm, uintptr_t* sex_male, uint64_t misc_flags, double check_sex_fthresh, double check_sex_mthresh, uint32_t max_f_yobs, uint32_t min_m_yobs, Chrom_info* chrom [...]
   unsigned char* wkspace_mark = wkspace_base;
   FILE* outfile = NULL;
@@ -3497,12 +3477,12 @@ int32_t list_duplicate_vars(char* outname, char* outname_end, uint32_t dupvar_mo
   return retval;
 }
 
-int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, uintptr_t* founder_info, Chrom_info* chrom_info_ptr, double* set_allele_freqs) {
+int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, uintptr_t* founder_info, Chrom_info* chrom_info_ptr, double* set_allele_freqs) {
   // Same F coefficient computation as sexcheck().
   unsigned char* wkspace_mark = wkspace_base;
-  FILE* outfile = NULL;
   uintptr_t* loadbuf_f = NULL;
   uintptr_t* founder_vec11 = NULL;
+  char* pzwritep = NULL;
   uintptr_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
   uintptr_t unfiltered_sample_ctl2 = (unfiltered_sample_ct + (BITCT2 - 1)) / BITCT2;
   uintptr_t unfiltered_sample_ctl = (unfiltered_sample_ct + (BITCT - 1)) / BITCT;
@@ -3515,15 +3495,16 @@ int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
   uint32_t chrom_fo_idx = 0xffffffffU; // deliberate overflow
   uint32_t chrom_end = 0;
   int32_t retval = 0;
+  Pigz_state ps;
   uintptr_t* loadbuf_raw;
   uintptr_t* loadbuf;
   uintptr_t* lptr;
   uint32_t* het_cts;
   uint32_t* missing_cts;
   double* nei_offsets;
+  unsigned char* overflow_buf;
   char* fid_ptr;
   char* iid_ptr;
-  char* wptr;
   double dpp;
   double dtot;
   double cur_nei;
@@ -3539,11 +3520,13 @@ int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
   uintptr_t cur_word;
   uintptr_t ulii;
   uint32_t obs_ct;
+  pzwrite_init_null(&ps);
   if (is_set(chrom_info_ptr->haploid_mask, 0)) {
     logprint("Error: --het cannot be used on haploid genomes.\n");
     goto het_report_ret_INVALID_CMDLINE;
   }
-  if (wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
+  if (wkspace_alloc_uc_checked(&overflow_buf, PIGZ_BLOCK_SIZE + MAXLINELEN) ||
+      wkspace_alloc_ul_checked(&loadbuf_raw, unfiltered_sample_ctl2 * sizeof(intptr_t)) ||
       wkspace_alloc_ul_checked(&loadbuf, sample_ctl2 * sizeof(intptr_t)) ||
       wkspace_alloc_ui_checked(&het_cts, sample_ct * sizeof(int32_t)) ||
       wkspace_alloc_ui_checked(&missing_cts, sample_ct * sizeof(int32_t)) ||
@@ -3659,39 +3642,41 @@ int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
   if (!marker_ct) {
     goto het_report_ret_INVALID_CMDLINE;
   }
-  memcpy(outname_end, ".het", 5);
-  if (fopen_checked(&outfile, outname, "w")) {
+  memcpy(outname_end, output_gz? ".het.gz" : ".het", output_gz? 8 : 5);
+  if (flex_pzwrite_init(output_gz, outname, overflow_buf, 0, &ps)) {
     goto het_report_ret_OPEN_FAIL;
   }
+  pzwritep = (char*)overflow_buf;
   sprintf(tbuf, "%%%us %%%us       O(HOM)       E(HOM)        N(NM)            F\n", plink_maxfid, plink_maxiid);
-  fprintf(outfile, tbuf, "FID", "IID");
+  pzwritep += sprintf(pzwritep, tbuf, "FID", "IID");
   sample_uidx = 0;
   for (sample_idx = 0; sample_idx < sample_ct; sample_idx++, sample_uidx++) {
     next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx);
     fid_ptr = &(sample_ids[sample_uidx * max_sample_id_len]);
     iid_ptr = (char*)memchr(fid_ptr, '\t', max_sample_id_len);
-    wptr = fw_strcpyn(plink_maxfid, (uintptr_t)(iid_ptr - fid_ptr), fid_ptr, tbuf);
-    *wptr++ = ' ';
-    wptr = fw_strcpy(plink_maxiid, &(iid_ptr[1]), wptr);
-    wptr = memseta(wptr, 32, 3);
+    pzwritep = fw_strcpyn(plink_maxfid, (uintptr_t)(iid_ptr - fid_ptr), fid_ptr, pzwritep);
+    *pzwritep++ = ' ';
+    pzwritep = fw_strcpy(plink_maxiid, &(iid_ptr[1]), pzwritep);
+    pzwritep = memseta(pzwritep, 32, 3);
     obs_ct = marker_ct - missing_cts[sample_idx];
     if (obs_ct) {
-      wptr = uint32_writew10x(wptr, obs_ct - het_cts[sample_idx], ' ');
+      pzwritep = uint32_writew10x(pzwritep, obs_ct - het_cts[sample_idx], ' ');
       dee = nei_sum - nei_offsets[sample_idx];
-      wptr = double_g_writewx4(wptr, dee, 12);
-      wptr = memseta(wptr, 32, 3);
-      wptr = uint32_writew10x(wptr, obs_ct, ' ');
+      pzwritep = double_g_writewx4(pzwritep, dee, 12);
+      pzwritep = memseta(pzwritep, 32, 3);
+      pzwritep = uint32_writew10x(pzwritep, obs_ct, ' ');
       dtot = (double)((int32_t)obs_ct) - dee;
       dff = (dtot - ((double)((int32_t)(het_cts[sample_idx])))) / dtot;
-      wptr = double_g_writewx4x(wptr, dff, 12, '\n');
+      pzwritep = double_g_writewx4(pzwritep, dff, 12);
     } else {
-      wptr = memcpya(wptr, "         0            0            0          nan\n", 50);
+      pzwritep = memcpya(pzwritep, "         0            0            0          nan", 49);
     }
-    if (fwrite_checked(tbuf, wptr - tbuf, outfile)) {
+    append_binary_eoln(&pzwritep);
+    if (flex_pzwrite(&ps, &pzwritep)) {
       goto het_report_ret_WRITE_FAIL;
     }
   }
-  if (fclose_null(&outfile)) {
+  if (flex_pzwrite_close_null(&ps, pzwritep)) {
     goto het_report_ret_WRITE_FAIL;
   }
   LOGPRINTFWW("--het%s: %" PRIuPTR " variant%s scanned, report written to %s .\n", loadbuf_f? " small-sample" : "", marker_ct, (marker_ct == 1)? "" : "s", outname);
@@ -3714,7 +3699,7 @@ int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
     break;
   }
   wkspace_reset(wkspace_mark);
-  fclose_cond(outfile);
+  flex_pzwrite_close_cond(&ps, pzwritep);
   return retval;
 }
 
@@ -3852,7 +3837,7 @@ int32_t fst_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* out
   loop_end = marker_ct / 100;
   for (marker_uidx = 0, marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
     if (IS_SET(marker_exclude, marker_uidx)) {
-      marker_uidx = next_set_ul_unsafe(marker_exclude, marker_uidx);
+      marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx);
       seek_flag = 1;
     }
     if (marker_uidx >= chrom_end) {
@@ -4842,11 +4827,14 @@ int32_t meta_analysis_open_and_read_header(const char* fname, char* loadbuf, uin
     }
   }
 #ifdef __cplusplus
-  std::sort(parse_table, &(parse_table[token_ct]));
+  // suppress bogus gcc 4.4 warning, this is not performance-critical
+  qsort((int32_t*)parse_table, token_ct, sizeof(int32_t), intcmp);
+  // std::sort(parse_table, &(parse_table[token_ct]));
 #else
   qsort((int32_t*)parse_table, token_ct, sizeof(int32_t), intcmp);
 #endif
-  if (!weighted_z) {
+  // bugfix: this caused a segfault in no-map case
+  if ((!weighted_z) && (token_ct > 5)) {
     token_ct -= 2;
   }
   col_skips[0] = parse_table[0] >> 4;
@@ -5541,7 +5529,7 @@ int32_t meta_analysis(char* input_fnames, char* snpfield_search_order, char* a1f
   if (!no_allele) {
     fputs("  A1  A2", outfile);
   }
-  fputs("   N           P        P(R)      OR   OR(R)       Q       I", outfile);
+  fputs(output_beta? "   N           P        P(R)    BETA BETA(R)       Q       I" : "   N           P        P(R)      OR   OR(R)       Q       I", outfile);
   if (weighted_z) {
     fputs("  WEIGHTED_Z       P(WZ)", outfile);
   }
diff --git a/plink_misc.h b/plink_misc.h
index b33926c..5bbdfd0 100644
--- a/plink_misc.h
+++ b/plink_misc.h
@@ -63,15 +63,13 @@ void calc_plink_maxfid(uint32_t unfiltered_sample_ct, uintptr_t* sample_exclude,
 
 uint32_t calc_plink_maxsnp(uint32_t unfiltered_marker_ct, uintptr_t* marker_exclude, uint32_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len);
 
-int32_t read_external_freqs(char* freqname, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, Chrom_info* chrom_info_ptr, char** marker_allele_ptrs, double* set_allele_freqs, uint32_t* nchrobs, uint32_t maf_succ, double exponent, uint32_t wt_needed, double* marker_weights);
+int32_t read_external_freqs(char* freqname, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char* marker_ids, uintptr_t max_marker_id_len, Chrom_info* chrom_info_ptr, char** marker_allele_ptrs, double* set_allele_freqs, uint32_t* nchrobs, uint32_t maf_succ);
 
 int32_t load_ax_alleles(Two_col_params* axalleles, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_exclude_ct, char** marker_allele_ptrs, uintptr_t* max_marker_allele_len_ptr, uintptr_t* marker_reverse, char* marker_ids, uintptr_t max_marker_id_len, double* set_allele_freqs, uint32_t is_a2);
 
-int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outname, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uint32_t sample_f_ct, uintptr_t* founder_info, uint32_t nonfounders, uintptr_t* sex_male, uint32_t sample_f_male_ct, uintptr_t* marker_reve [...]
+int32_t write_stratified_freqs(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, uintptr_t unfiltered_sample_ct, uintptr_t sample_ct, uint32_t sample_f_ct, uintptr_t* founder_info, uint32_t nonfounders, uintptr_t* sex_male, uint32_t s [...]
 
-int32_t write_freqs(char* outname, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, double* set_allele_freqs, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, int32_t* ll_cts, int32_t* lh_cts, int32_t* hh_cts, int32_t* hapl_cts, int32_t* haph_cts, uint32_t sample_f_ct, uint32_t sample_f_male_ct, uint32_t nonfounders, uint64_t misc_flags, uintptr_t* marker_reverse);
-
-void calc_marker_weights(double exponent, uint32_t unfiltered_marker_ct, uintptr_t* marker_exclude, uint32_t marker_ct, int32_t* ll_cts, int32_t* lh_cts, int32_t* hh_cts, double* marker_weights);
+int32_t write_freqs(char* outname, char* outname_end, uint32_t plink_maxsnp, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, double* set_allele_freqs, Chrom_info* chrom_info_ptr, char* marker_ids, uintptr_t max_marker_id_len, char** marker_allele_ptrs, uintptr_t max_marker_allele_len, int32_t* ll_cts, int32_t* lh_cts, int32_t* hh_cts, int32_t* hapl_cts, int32_t* haph_cts, uint32_t sample_f_ct, uint32_t sample_f_male_ct, uint32_t nonfounders, uint64_t misc_flags, uintptr_t* mar [...]
 
 int32_t sexcheck(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, uintptr_t* sex_nm, uintptr_t* sex_male, uint64_t misc_flags, double check_sex_fthresh, double check_sex_mthresh, uint32_t max_f_yobs, uint32_t min_m_yobs, Chrom_info* chrom [...]
 
@@ -81,7 +79,7 @@ int32_t write_var_ranges(char* outname, char* outname_end, uintptr_t unfiltered_
 
 int32_t list_duplicate_vars(char* outname, char* outname_end, uint32_t dupvar_modifier, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t* marker_pos, Chrom_info* chrom_info_ptr, char** marker_allele_ptrs);
 
-int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, uintptr_t* founder_info, Chrom_info* chrom_info_ptr, double* set_allele_freqs);
+int32_t het_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uint32_t output_gz, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t sample_ct, char* sample_ids, uint32_t plink_maxfid, uint32_t plink_maxiid, uintptr_t max_sample_id_len, uintptr_t* founder_info, Chrom_info* chrom_info_ptr, double* set_allele_freqs);
 
 int32_t fst_report(FILE* bedfile, uintptr_t bed_offset, char* outname, char* outname_end, uintptr_t unfiltered_marker_ct, uintptr_t* marker_exclude, uintptr_t marker_ct, char* marker_ids, uintptr_t max_marker_id_len, uint32_t* marker_pos, Chrom_info* chrom_info_ptr, uintptr_t unfiltered_sample_ct, uintptr_t* sample_exclude, uintptr_t* pheno_nm, uintptr_t* pheno_c, uintptr_t cluster_ct, uint32_t* cluster_map, uint32_t* cluster_starts);
 
diff --git a/plink_set.c b/plink_set.c
index e8aea8d..25fc84b 100644
--- a/plink_set.c
+++ b/plink_set.c
@@ -1929,6 +1929,7 @@ void unpack_set_unfiltered(uintptr_t marker_ct, uintptr_t unfiltered_marker_ct,
     unpack_set_unfiltered_late_start:
       range_end = *uiptr++;
       if (range_end == marker_ct) {
+	last_uidx = unfiltered_marker_ct;
 	break;
       }
       last_uidx = jump_forward_unset_unsafe(marker_exclude, marker_uidx + 1, range_end - range_start);
diff --git a/yarn.h b/yarn.h
index 436a675..63acf76 100644
--- a/yarn.h
+++ b/yarn.h
@@ -109,6 +109,9 @@
         handler will exit (set to NULL by default for no action)
  */
 
+#ifndef __YARN_H__
+#define __YARN_H__
+
 extern const char *yarn_prefix;
 extern void (*yarn_abort)(int);
 
@@ -132,3 +135,5 @@ enum wait_op {
 void wait_for(lock *, enum wait_op, long);
 long peek_lock(lock *);
 void free_lock(lock *);
+
+#endif // __YARN_H__

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/plink1.9.git



More information about the debian-med-commit mailing list