[med-svn] [libfastahack] 01/02: Imported Upstream version 0.0+20160309

Thu Jun 23 09:12:06 UTC 2016

This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository libfastahack.

commit 56f345e677c5523ea2ecb4ee6c8ec276e9c5075c
Author: Andreas Tille <tille at debian.org>
Date:   Thu Jun 23 11:10:21 2016 +0200

    Imported Upstream version 0.0+20160309
---
 .gitignore                    |   2 +
 .travis.yml                   |  13 ++
 Fasta.cpp                     | 323 ++++++++++++++++++++++++++++++++++++++++
 Fasta.h                       |  85 +++++++++++
 FastaHack.cpp                 | 179 ++++++++++++++++++++++
 LargeFileSupport.h            |  16 ++
 Makefile                      |  42 ++++++
 README                        |  70 +++++++++
 Region.h                      |  51 +++++++
 disorder.c                    | 192 ++++++++++++++++++++++++
 disorder.h                    |  62 ++++++++
 libdisorder.LICENSE           | 339 ++++++++++++++++++++++++++++++++++++++++++
 split.cpp                     |  33 ++++
 split.h                       |  20 +++
 tests/correct.fasta           |  30 ++++
 tests/embedded_newline.fasta  |  35 +++++
 tests/mismatched_lines.fasta  |  30 ++++
 tests/trailing_newlines.fasta |  34 +++++
 18 files changed, 1556 insertions(+)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1aa62e4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.o
+fastahack
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..d6add9e
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,13 @@
+# Control file for continuous integration testing at http://travis-ci.org/
+
+language: cpp
+compiler: gcc
+sudo: required
+dist: trusty
+script: make
+os:
+  - linux
+  - osx
+compiler:
+  - gcc
+  - clang
diff --git a/Fasta.cpp b/Fasta.cpp
new file mode 100644
index 0000000..572f2d0
--- /dev/null
+++ b/Fasta.cpp
@@ -0,0 +1,323 @@
+// ***************************************************************************
+// FastaIndex.cpp (c) 2010 Erik Garrison <erik.garrison at bc.edu>
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 9 February 2010 (EG)
+// ---------------------------------------------------------------------------
+
+#include "Fasta.h"
+
+FastaIndexEntry::FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len)
+    : name(name)
+    , length(length)
+    , offset(offset)
+    , line_blen(line_blen)
+    , line_len(line_len)
+{}
+
+FastaIndexEntry::FastaIndexEntry(void) // empty constructor
+{ clear(); }
+
+FastaIndexEntry::~FastaIndexEntry(void)
+{}
+
+void FastaIndexEntry::clear(void)
+{
+    name = "";
+    length = 0;
+    offset = -1;  // no real offset will ever be below 0, so this allows us to
+                  // check if we have already recorded a real offset
+    line_blen = 0;
+    line_len = 0;
+}
+
+ostream& operator<<(ostream& output, const FastaIndexEntry& e) {
+    // just write the first component of the name, for compliance with other tools
+    output << split(e.name, ' ').at(0) << "\t" << e.length << "\t" << e.offset << "\t" <<
+        e.line_blen << "\t" << e.line_len;
+    return output;  // for multiple << operators.
+}
+
+FastaIndex::FastaIndex(void) 
+{}
+
+void FastaIndex::readIndexFile(string fname) {
+    string line;
+    long long linenum = 0;
+    indexFile.open(fname.c_str(), ifstream::in);
+    if (indexFile.is_open()) {
+        while (getline (indexFile, line)) {
+            ++linenum;
+            // the fai format defined in samtools is tab-delimited, every line being:
+            // fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len
+            vector<string> fields = split(line, '\t');
+            if (fields.size() == 5) {  // if we don't get enough fields then there is a problem with the file
+                // note that fields[0] is the sequence name
+                char* end;
+                string name = split(fields[0], " \t").at(0);  // key by first token of name
+                sequenceNames.push_back(name);
+                this->insert(make_pair(name, FastaIndexEntry(fields[0], atoi(fields[1].c_str()),
+                                                    strtoll(fields[2].c_str(), &end, 10),
+                                                    atoi(fields[3].c_str()),
+                                                    atoi(fields[4].c_str()))));
+            } else {
+                cerr << "Warning: malformed fasta index file " << fname << 
+                    "does not have enough fields @ line " << linenum << endl;
+                cerr << line << endl;
+                exit(1);
+            }
+        }
+    } else {
+        cerr << "could not open index file " << fname << endl;
+        exit(1);
+    }
+}
+
+// for consistency this should be a class method
+bool fastaIndexEntryCompare ( FastaIndexEntry a, FastaIndexEntry b) { return (a.offset<b.offset); }
+
+ostream& operator<<(ostream& output, FastaIndex& fastaIndex) {
+    vector<FastaIndexEntry> sortedIndex;
+    for(vector<string>::const_iterator it = fastaIndex.sequenceNames.begin(); it != fastaIndex.sequenceNames.end(); ++it)
+    {
+        sortedIndex.push_back(fastaIndex[*it]);
+    }
+    sort(sortedIndex.begin(), sortedIndex.end(), fastaIndexEntryCompare);
+    for( vector<FastaIndexEntry>::iterator fit = sortedIndex.begin(); fit != sortedIndex.end(); ++fit) {
+        output << *fit << endl;
+    }
+}
+
+void FastaIndex::indexReference(string refname) {
+    // overview:
+    //  for line in the reference fasta file
+    //  track byte offset from the start of the file
+    //  if line is a fasta header, take the name and dump the last sequnece to the index
+    //  if line is a sequence, add it to the current sequence
+    //cerr << "indexing fasta reference " << refname << endl;
+    string line;
+    FastaIndexEntry entry;  // an entry buffer used in processing
+    entry.clear();
+    int line_length = 0;
+    long long offset = 0;  // byte offset from start of file
+    long long line_number = 0; // current line number
+    bool mismatchedLineLengths = false; // flag to indicate if our line length changes mid-file
+                                        // this will be used to raise an error
+                                        // if we have a line length change at
+                                        // any line other than the last line in
+                                        // the sequence
+    bool emptyLine = false;  // flag to catch empty lines, which we allow for
+                             // index generation only on the last line of the sequence
+    ifstream refFile;
+    refFile.open(refname.c_str());
+    if (refFile.is_open()) {
+        while (getline(refFile, line)) {
+            ++line_number;
+            line_length = line.length();
+            if (line[0] == ';') {
+                // fasta comment, skip
+            } else if (line[0] == '+') {
+                // fastq quality header
+                getline(refFile, line);
+                line_length = line.length();
+                offset += line_length + 1;
+                // get and don't handle the quality line
+                getline(refFile, line);
+                line_length = line.length();
+            } else if (line[0] == '>' || line[0] == '@') { // fasta /fastq header
+                // if we aren't on the first entry, push the last sequence into the index
+                if (entry.name != "") {
+                    mismatchedLineLengths = false; // reset line length error tracker for every new sequence
+                    emptyLine = false;
+                    flushEntryToIndex(entry);
+                    entry.clear();
+                }
+                entry.name = line.substr(1, line_length - 1);
+            } else { // we assume we have found a sequence line
+                if (entry.offset == -1) // NB initially the offset is -1
+                    entry.offset = offset;
+                entry.length += line_length;
+                if (entry.line_len) {
+                    //entry.line_len = entry.line_len ? entry.line_len : line_length + 1;
+                    if (mismatchedLineLengths || emptyLine) {
+                        if (line_length == 0) {
+                            emptyLine = true; // flag empty lines, raise error only if this is embedded in the sequence
+                        } else {
+                            if (emptyLine) {
+                                cerr << "ERROR: embedded newline";
+                            } else {
+                                cerr << "ERROR: mismatched line lengths";
+                            }
+                            cerr << " at line " << line_number << " within sequence " << entry.name <<
+                                endl << "File not suitable for fasta index generation." << endl;
+                            exit(1);
+                        }
+                    }
+                    // this flag is set here and checked on the next line
+                    // because we may have reached the end of the sequence, in
+                    // which case a mismatched line length is OK
+                    if (entry.line_len != line_length + 1) {
+                        mismatchedLineLengths = true;
+                        if (line_length == 0) {
+                            emptyLine = true; // flag empty lines, raise error only if this is embedded in the sequence
+                        }
+                    }
+                } else {
+                    entry.line_len = line_length + 1; // first line
+                }
+                entry.line_blen = entry.line_len - 1;
+            }
+            offset += line_length + 1;
+        }
+        // we've hit the end of the fasta file!
+        // flush the last entry
+        flushEntryToIndex(entry);
+    } else {
+        cerr << "could not open reference file " << refname << " for indexing!" << endl;
+        exit(1);
+    }
+}
+
+void FastaIndex::flushEntryToIndex(FastaIndexEntry& entry) {
+    string name = split(entry.name, " \t").at(0);  // key by first token of name
+    sequenceNames.push_back(name);
+    this->insert(make_pair(name, FastaIndexEntry(entry.name, entry.length,
+                        entry.offset, entry.line_blen,
+                        entry.line_len)));
+
+}
+
+void FastaIndex::writeIndexFile(string fname) {
+    //cerr << "writing fasta index file " << fname << endl;
+    ofstream file;
+    file.open(fname.c_str()); 
+    if (file.is_open()) {
+        file << *this;
+    } else { 
+        cerr << "could not open index file " << fname << " for writing!" << endl;
+        exit(1);
+    }
+}
+
+FastaIndex::~FastaIndex(void) {
+    indexFile.close();
+}
+
+FastaIndexEntry FastaIndex::entry(string name) {
+    FastaIndex::iterator e = this->find(name);
+    if (e == this->end()) {
+        cerr << "unable to find FASTA index entry for '" << name << "'" << endl;
+        exit(1);
+    } else {
+        return e->second;
+    }
+}
+
+string FastaIndex::indexFileExtension() { return ".fai"; }
+
+/*
+FastaReference::FastaReference(string reffilename) {
+}
+*/
+
+void FastaReference::open(string reffilename) {
+    filename = reffilename;
+    if (!(file = fopen(filename.c_str(), "r"))) {
+        cerr << "could not open " << filename << endl;
+        exit(1);
+    }
+    index = new FastaIndex();
+    struct stat stFileInfo; 
+    string indexFileName = filename + index->indexFileExtension(); 
+    // if we can find an index file, use it
+    if(stat(indexFileName.c_str(), &stFileInfo) == 0) { 
+        index->readIndexFile(indexFileName);
+    } else { // otherwise, read the reference and generate the index file in the cwd
+        cerr << "index file " << indexFileName << " not found, generating..." << endl;
+        index->indexReference(filename);
+        index->writeIndexFile(indexFileName);
+    }
+}
+
+FastaReference::~FastaReference(void) {
+    if (file != NULL)
+      fclose(file);
+    if (index != NULL)
+      delete index;
+}
+
+string FastaReference::getSequence(string seqname) {
+    FastaIndexEntry entry = index->entry(seqname);
+    int newlines_in_sequence = entry.length / entry.line_blen;
+    int seqlen = newlines_in_sequence  + entry.length;
+    char* seq = (char*) calloc (seqlen + 1, sizeof(char));
+    fseek64(file, entry.offset, SEEK_SET);
+    string s;
+    if (fread(seq, sizeof(char), seqlen, file)) {
+        seq[seqlen] = '\0';
+        char* pbegin = seq;
+        char* pend = seq + (seqlen/sizeof(char));
+        pend = remove(pbegin, pend, '\n');
+        pend = remove(pbegin, pend, '\0');
+        s = seq;
+        free(seq);
+        s.resize((pend - pbegin)/sizeof(char));
+    }
+    return s;
+}
+
+// TODO cleanup; odd function.  use a map
+string FastaReference::sequenceNameStartingWith(string seqnameStart) {
+    try {
+        return (*index)[seqnameStart].name;
+    } catch (exception& e) {
+        cerr << e.what() << ": unable to find index entry for " << seqnameStart << endl;
+        exit(1);
+    }
+}
+
+string FastaReference::getTargetSubSequence(FastaRegion& target) {
+    if (target.startPos == -1) {
+        return getSequence(target.startSeq);
+    } else {
+        return getSubSequence(target.startSeq, target.startPos - 1, target.length());
+    }
+}
+
+string FastaReference::getSubSequence(string seqname, int start, int length) {
+    FastaIndexEntry entry = index->entry(seqname);
+    length = min(length, entry.length - start);
+    if (start < 0 || length < 1) {
+        //cerr << "Empty sequence" << endl;
+        return "";
+    }
+    // we have to handle newlines
+    // approach: count newlines before start
+    //           count newlines by end of read
+    //             subtracting newlines before start find count of embedded newlines
+    int newlines_before = start > 0 ? (start - 1) / entry.line_blen : 0;
+    int newlines_by_end = (start + length - 1) / entry.line_blen;
+    int newlines_inside = newlines_by_end - newlines_before;
+    int seqlen = length + newlines_inside;
+    char* seq = (char*) calloc (seqlen + 1, sizeof(char));
+    fseek64(file, (off_t) (entry.offset + newlines_before + start), SEEK_SET);
+    string s;
+    if (fread(seq, sizeof(char), (off_t) seqlen, file)) {
+        seq[seqlen] = '\0';
+        char* pbegin = seq;
+        char* pend = seq + (seqlen/sizeof(char));
+        pend = remove(pbegin, pend, '\n');
+        pend = remove(pbegin, pend, '\0');
+        s = seq;
+        free(seq);
+        s.resize((pend - pbegin)/sizeof(char));
+    }
+    return s;
+}
+
+long unsigned int FastaReference::sequenceLength(string seqname) {
+    FastaIndexEntry entry = index->entry(seqname);
+    return entry.length;
+}
+
diff --git a/Fasta.h b/Fasta.h
new file mode 100644
index 0000000..e2f44fe
--- /dev/null
+++ b/Fasta.h
@@ -0,0 +1,85 @@
+// ***************************************************************************
+// FastaIndex.h (c) 2010 Erik Garrison <erik.garrison at bc.edu>
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 5 February 2010 (EG)
+// ---------------------------------------------------------------------------
+
+#ifndef _FASTA_H
+#define _FASTA_H
+
+#include <map>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <stdint.h>
+#include <stdio.h>
+#include <algorithm>
+#include "LargeFileSupport.h"
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include "split.h"
+#include <stdlib.h>
+#include <ctype.h>
+#include <unistd.h>
+#include "Region.h"
+
+using namespace std;
+
+class FastaIndexEntry {
+    friend ostream& operator<<(ostream& output, const FastaIndexEntry& e);
+    public:
+        FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len);
+        FastaIndexEntry(void);
+        ~FastaIndexEntry(void);
+        string name;  // sequence name
+        int length;  // length of sequence
+        long long offset;  // bytes offset of sequence from start of file
+        int line_blen;  // line length in bytes, sequence characters
+        int line_len;  // line length including newline
+        void clear(void);
+};
+
+class FastaIndex : public map<string, FastaIndexEntry> {
+    friend ostream& operator<<(ostream& output, FastaIndex& i);
+    public:
+        FastaIndex(void);
+        ~FastaIndex(void);
+        vector<string> sequenceNames;
+        map<string, unsigned int> sequenceID;
+        void indexReference(string refName);
+        void readIndexFile(string fname);
+        void writeIndexFile(string fname);
+        ifstream indexFile;
+        FastaIndexEntry entry(string key);
+        void flushEntryToIndex(FastaIndexEntry& entry);
+        string indexFileExtension(void);
+};
+
+class FastaReference {
+    public:
+        void open(string reffilename);
+        bool usingmmap;
+        string filename;
+        FastaReference(void) : usingmmap(false) {
+	  file  = NULL;
+	  index = NULL;
+	}
+        ~FastaReference(void);
+        FILE* file;
+        void* filemm;
+        size_t filesize;
+        FastaIndex* index;
+        vector<FastaIndexEntry> findSequencesStartingWith(string seqnameStart);
+        string getSequence(string seqname);
+        // potentially useful for performance, investigate
+        // void getSequence(string seqname, string& sequence);
+        string getSubSequence(string seqname, int start, int length);
+        string getTargetSubSequence(FastaRegion& target);
+        string sequenceNameStartingWith(string seqnameStart);
+        unsigned int getSequenceID(string seqname);
+        long unsigned int sequenceLength(string seqname);
+};
+
+#endif
diff --git a/FastaHack.cpp b/FastaHack.cpp
new file mode 100644
index 0000000..07cc445
--- /dev/null
+++ b/FastaHack.cpp
@@ -0,0 +1,179 @@
+#include "Fasta.h"
+#include <stdlib.h>
+#include <getopt.h>
+#include "disorder.h"
+#include "Region.h"
+
+void printSummary() {
+    cerr << "usage: fastahack [options] <fasta reference>" << endl
+         << endl
+         << "options:" << endl 
+         << "    -i, --index          generate fasta index <fasta reference>.fai" << endl
+         << "    -r, --region REGION  print the specified region" << endl
+         << "    -c, --stdin          read a stream of line-delimited region specifiers on stdin" << endl
+         << "                         and print the corresponding sequence for each on stdout" << endl
+         << "    -e, --entropy        print the shannon entropy of the specified region" << endl
+         << "    -d, --dump           print the fasta file in the form 'seq_name <tab> sequence'" << endl
+         << endl
+         << "REGION is of the form <seq>, <seq>:<start>[sep]<end>, <seq1>:<start>[sep]<seq2>:<end>" << endl
+         << "where start and end are 1-based, and the region includes the end position." << endl
+         << "[sep] is \"-\" or \"..\"" << endl
+         << endl
+         << "Specifying a sequence name alone will return the entire sequence, specifying" << endl
+         << "range will return that range, and specifying a single coordinate pair, e.g." << endl
+         << "<seq>:<start> will return just that base." << endl
+         << endl
+         << "author: Erik Garrison <erik.garrison at bc.edu>" << endl;
+}
+
+
+int main (int argc, char** argv) {
+
+    string command;
+    string fastaFileName;
+    string seqname;
+    string longseqname;
+    long long start;
+    long long length;
+    bool dump = false;
+
+    bool buildIndex = false;  // flag to force index building
+    bool printEntropy = false;  // entropy printing
+    bool readRegionsFromStdin = false;
+    //bool printLength = false;
+    string region;
+
+    int c;
+
+    while (true) {
+        static struct option long_options[] =
+        {
+            /* These options set a flag. */
+            //{"verbose", no_argument,       &verbose_flag, 1},
+            //{"brief",   no_argument,       &verbose_flag, 0},
+            {"help", no_argument, 0, 'h'},
+            {"index",  no_argument, 0, 'i'},
+            //{"length",  no_argument, &printLength, true},
+            {"entropy", no_argument, 0, 'e'},
+            {"region", required_argument, 0, 'r'},
+            {"stdin", no_argument, 0, 'c'},
+            {0, 0, 0, 0}
+        };
+        /* getopt_long stores the option index here. */
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hciedr:",
+                         long_options, &option_index);
+
+      /* Detect the end of the options. */
+          if (c == -1)
+            break;
+ 
+          switch (c)
+            {
+            case 0:
+            /* If this option set a flag, do nothing else now. */
+            if (long_options[option_index].flag != 0)
+              break;
+            printf ("option %s", long_options[option_index].name);
+            if (optarg)
+              printf (" with arg %s", optarg);
+            printf ("\n");
+            break;
+
+          case 'e':
+            printEntropy = true;
+            break;
+
+          case 'c':
+            readRegionsFromStdin = true;
+            break;
+ 
+          case 'i':
+            buildIndex = true;
+            break;
+ 
+          case 'r':
+            region = optarg;
+            break;
+
+            case 'd':
+                dump = true;
+                break;
+
+          case 'h':
+            printSummary();
+            exit(0);
+            break;
+ 
+          case '?':
+            /* getopt_long already printed an error message. */
+            printSummary();
+            exit(1);
+            break;
+ 
+          default:
+            abort ();
+          }
+      }
+
+    /* Print any remaining command line arguments (not options). */
+    if (optind < argc) {
+        //cerr << "fasta file: " << argv[optind] << endl;
+        fastaFileName = argv[optind];
+    } else {
+        cerr << "please specify a fasta file" << endl;
+        printSummary();
+        exit(1);
+    }
+
+    if (buildIndex) {
+        FastaIndex* fai = new FastaIndex();
+        //cerr << "generating fasta index file for " << fastaFileName << endl;
+        fai->indexReference(fastaFileName);
+        fai->writeIndexFile((string) fastaFileName + fai->indexFileExtension());
+    }
+    
+    string sequence;  // holds sequence so we can optionally process it
+
+    FastaReference fr;
+    fr.open(fastaFileName);
+
+    if (dump) {
+        for (vector<string>::iterator s = fr.index->sequenceNames.begin(); s != fr.index->sequenceNames.end(); ++s) {
+            cout << *s << "\t" << fr.getSequence(*s) << endl;
+        }
+        return 0;
+    }
+
+    if (region != "") {
+        FastaRegion target(region);
+        sequence = fr.getTargetSubSequence(target);
+    }
+
+    if (readRegionsFromStdin) {
+        string regionstr;
+        while (getline(cin, regionstr)) {
+            FastaRegion target(regionstr);
+            if (target.startPos == -1) {
+                cout << fr.getSequence(target.startSeq) << endl;
+            } else {
+                cout << fr.getSubSequence(target.startSeq, target.startPos - 1, target.length()) << endl;
+            }
+        }
+    } else {
+        if (sequence != "") {
+            if (printEntropy) {
+                if (sequence.size() > 0) {
+                    cout << shannon_H((char*) sequence.c_str(), sequence.size()) << endl;
+                } else {
+                    cerr << "please specify a region or sequence for which to calculate the shannon entropy" << endl;
+                }
+            } else {  // if no statistical processing is requested, just print the sequence
+                cout << sequence << endl;
+            }
+        }
+    }
+
+    return 0;
+}
diff --git a/LargeFileSupport.h b/LargeFileSupport.h
new file mode 100644
index 0000000..b90e585
--- /dev/null
+++ b/LargeFileSupport.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#define _FILE_OFFSET_BITS 64
+#ifdef WIN32
+#define ftell64(a)     _ftelli64(a)
+#define fseek64(a,b,c) _fseeki64(a,b,c)
+typedef __int64 off_type;
+#elif defined(__APPLE__) || defined(__FreeBSD__)
+#define ftell64(a)     ftello(a)
+#define fseek64(a,b,c) fseeko(a,b,c)
+typedef off_t off_type;
+#else
+#define ftell64(a)     ftello(a)
+#define fseek64(a,b,c) fseeko(a,b,c)
+typedef __off64_t off_type;
+#endif
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..e397326
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,42 @@
+
+# Use ?= to allow overriding from the env or command-line
+CXX ?=		g++
+CXXFLAGS ?=	-O3
+PREFIX ?=	./stage
+STRIP_CMD ?=	strip
+INSTALL ?=	install -c
+MKDIR ?=	mkdir -p
+
+# Required flags that we shouldn't override
+CXXFLAGS +=	-D_FILE_OFFSET_BITS=64
+
+OBJS =	Fasta.o FastaHack.o split.o disorder.o
+
+all:	fastahack
+
+fastahack: $(OBJS)
+	$(CXX) $(CXXFLAGS) $(OBJS) -o fastahack
+
+FastaHack.o: Fasta.h FastaHack.cpp
+	$(CXX) $(CXXFLAGS) -c FastaHack.cpp
+
+Fasta.o: Fasta.h Fasta.cpp
+	$(CXX) $(CXXFLAGS) -c Fasta.cpp
+
+split.o: split.h split.cpp
+	$(CXX) $(CXXFLAGS) -c split.cpp
+
+disorder.o: disorder.c disorder.h
+	$(CXX) $(CXXFLAGS) -c disorder.c
+
+install: fastahack
+	$(MKDIR) $(DESTDIR)$(PREFIX)/bin
+	$(INSTALL) fastahack $(DESTDIR)$(PREFIX)/bin
+
+install-strip: install
+	$(STRIP_CMD) $(DESTDIR)$(PREFIX)/bin/fastahack
+
+clean:
+	rm -rf fastahack *.o stage
+
+.PHONY: clean
diff --git a/README b/README
new file mode 100644
index 0000000..a33f5d9
--- /dev/null
+++ b/README
@@ -0,0 +1,70 @@
+fastahack --- *fast* FASTA file indexing, subsequence and sequence extraction
+
+Author: Erik Garrison <erik.garrison at bc.edu>, Marth Lab, Boston College
+Date:   May 7, 2010
+
+
+Overview:
+
+fastahack is a small application for indexing and extracting sequences and
+subsequences from FASTA files.  The included Fasta.cpp library provides a FASTA
+reader and indexer that can be embedded into applications which would benefit
+from directly reading subsequences from FASTA files.  The library automatically
+handles index file generation and use.
+
+
+Features:
+
+ - FASTA index (.fai) generation for FASTA files
+ - Sequence extraction
+ - Subsequence extraction
+ - Sequence statistics (TODO: currently only entropy is provided)
+
+Sequence and subsequence extraction use fseek64 to provide fastest-possible
+extraction without RAM-intensive file loading operations.  This makes fastahack
+a useful tool for bioinformaticists who need to quickly extract many
+subsequences from a reference FASTA sequence.
+
+
+Notes:
+
+The index files generated by this system should be numerically equivalent to
+those generated by samtools (http://samtools.sourceforge.net/).  However, while
+samtools truncates sequence names in the index file, fastahack provides them
+completely.
+
+To simplify use, sequences can be addressed by first whitespace-separated
+field; e.g. "8 SN(Homo sapiens) GA(HG18) URI(NC_000008.9)" can be addressed
+simply as "8", provided "8" is a unique first-field name in the FASTA file.
+Thus, to extract 20bp starting at position 323202 in chromosome 8 from the
+human reference:
+
+  % fastahack -r 8:323202..20 h.sapiens.fasta
+  ACATTGTAATAGATCTCAGA
+
+Usage information is provided by running fastahack with no arguments:
+
+  % usage: fastahack [options] <fasta reference>
+  
+  options:
+      -i, --index          generate fasta index <fasta reference>.fai
+      -r, --region REGION  print the specified region
+      -c, --stdin          read a stream of line-delimited region specifiers on stdin
+                           and print the corresponding sequence for each on stdout
+      -e, --entropy        print the shannon entropy of the specified region
+  
+  REGION is of the form <seq>, <seq>:<start>..<end>, <seq1>:<start>..<seq2>:<end>
+  where start and end are 1-based, and the region includes the end position.
+  Specifying a sequence name alone will return the entire sequence, specifying
+  range will return that range, and specifying a single coordinate pair, e.g.
+  <seq>:<start> will return just that base.
+
+
+Limitations:
+
+fastahack will only generate indexes for FASTA files in which the sequences
+have self-consistent line lengths.  Trailing whitespace is allowed at the end
+of sequences, but not embedded within the sequence.  These limitations are
+necessitated by the complexity of indexing sequences whose lines change in
+length--- the use of indexes is frustrated by such inconsistencies; each change
+in line length would require a new entry in the index file.
diff --git a/Region.h b/Region.h
new file mode 100644
index 0000000..cd5fb57
--- /dev/null
+++ b/Region.h
@@ -0,0 +1,51 @@
+#ifndef FASTA_REGION_H
+#define FASTA_REGION_H
+
+#include <string>
+#include <stdlib.h>
+
+using namespace std;
+
+class FastaRegion {
+public:
+    string startSeq;
+    int startPos;
+    int stopPos;
+
+    FastaRegion(string& region) {
+        startPos = -1;
+        stopPos = -1;
+        size_t foundFirstColon = region.find(":");
+        // we only have a single string, use the whole sequence as the target
+        if (foundFirstColon == string::npos) {
+            startSeq = region;
+        } else {
+            startSeq = region.substr(0, foundFirstColon);
+            size_t foundRangeDots = region.find("..", foundFirstColon);
+	    size_t foundRangeDash = region.find("-", foundFirstColon);
+            if (foundRangeDots == string::npos && foundRangeDash == string::npos) {
+                startPos = atoi(region.substr(foundFirstColon + 1).c_str());
+                stopPos = startPos; // just print one base if we don't give an end
+            } else {
+		if (foundRangeDash == string::npos) {
+		    startPos = atoi(region.substr(foundFirstColon + 1, foundRangeDots - foundRangeDots - 1).c_str());
+		    stopPos = atoi(region.substr(foundRangeDots + 2).c_str()); // to the start of this chromosome
+		} else {
+		    startPos = atoi(region.substr(foundFirstColon + 1, foundRangeDash - foundRangeDash - 1).c_str());
+		    stopPos = atoi(region.substr(foundRangeDash + 1).c_str()); // to the start of this chromosome
+		}
+            }
+        }
+    }
+
+    int length(void) {
+        if (stopPos > 0) {
+            return stopPos - startPos + 1;
+        } else {
+            return 1;
+        }
+    }
+
+};
+
+#endif
diff --git a/disorder.c b/disorder.c
new file mode 100644
index 0000000..a5f7c35
--- /dev/null
+++ b/disorder.c
@@ -0,0 +1,192 @@
+/***************************************************************************
+ *  libdisorder: A Library for Measuring Byte Stream Entropy
+ *  Copyright (C) 2010 Michael E. Locasto
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the:
+ *       Free Software Foundation, Inc.
+ *       59 Temple Place, Suite 330
+ *       Boston, MA  02111-1307  USA
+ *
+ * $Id$
+ **************************************************************************/
+
+#include <math.h> //for log2()
+#include <stdio.h> //for NULL
+#include "disorder.h"
+
+#if defined(__FreeBSD__)
+#define        log2(x) (log((x)) * (1./M_LN2))
+#endif
+
+/** Frequecies for each byte */
+static int m_token_freqs[LIBDO_MAX_BYTES]; //frequency of each token in sample
+static float m_token_probs[LIBDO_MAX_BYTES]; //P(each token appearing)
+static int m_num_tokens = 0; //actual number of `seen' tokens, max 256 
+static float m_maxent = 0.0;
+static float m_ratio = 0.0;
+static int LIBDISORDER_INITIALIZED = 0;
+
+static void
+initialize_lib()
+{
+  int i = 0;
+  if(1==LIBDISORDER_INITIALIZED)
+    return;
+
+  m_num_tokens = 0;
+
+  for(i=0;i<LIBDO_MAX_BYTES;i++)
+  {
+    m_token_freqs[i]=0;
+    m_token_probs[i]=0.0;
+  }
+
+  LIBDISORDER_INITIALIZED = 1;
+}
+
+/**
+ * Set m_num_tokens by iterating over m_token_freq[] and maintaining
+ * a counter of each position that does not hold the value of zero.
+ */
+static void
+count_num_tokens()
+{
+  int i = 0;
+  int counter = 0;
+  for(i=0;i<LIBDO_MAX_BYTES;i++)
+  {
+    if(0!=m_token_freqs[i])
+    {
+      counter++;
+    }
+  }
+  m_num_tokens = counter;
+  return;
+}
+
+/**
+ * Sum frequencies for each token (i.e., byte values 0 through 255)
+ * We assume the `length' parameter is correct.
+ *
+ * This function is available only to functions in this file.
+ */
+static void
+get_token_frequencies(char* buf, 
+		      long long length)
+{
+  int i=0;
+  char* itr=NULL;
+  unsigned char c=0;
+
+  itr = buf;
+
+  //reset number of tokens
+  m_num_tokens = 0;
+
+  //make sure freqency and probability arrays are cleared
+  for(i=0;i<LIBDO_MAX_BYTES;i++)
+  {
+    m_token_freqs[i] = 0;
+    m_token_probs[i] = 0.0;
+  }
+
+  for(i=0;i<length;i++)
+  {
+    c = (unsigned char)*itr;
+    //assert(0<=c<LIBDO_MAX_BYTES);
+    m_token_freqs[c]++;
+    itr++;
+  }
+}
+
+/**
+ * Return entropy (in bits) of this buffer of bytes. We assume that the
+ * `length' parameter is correct. This implementation is a translation
+ * of the PHP code found here:
+ *
+ *    http://onlamp.com/pub/a/php/2005/01/06/entropy.html
+ *
+ * with a helpful hint on the `foreach' statement from here:
+ *
+ *    http://php.net/manual/en/control-structures.foreach.php
+ */
+float
+shannon_H(char* buf, 
+	  long long length)
+{
+  int i = 0;
+  float bits = 0.0;
+  char* itr=NULL; //values of itr should be zero to 255
+  unsigned char token;
+  int num_events = 0; //`length' parameter
+  float freq = 0.0; //loop variable for holding freq from m_token_freq[]
+  float entropy = 0.0; //running entropy sum
+
+  if(NULL==buf || 0==length)
+    return 0.0;
+
+  if(0==LIBDISORDER_INITIALIZED)
+    initialize_lib();
+
+  itr = buf;
+  m_maxent = 0.0;
+  m_ratio = 0.0;
+  num_events = length;
+  get_token_frequencies(itr, num_events); //modifies m_token_freqs[]
+  //set m_num_tokens by counting unique m_token_freqs entries
+  count_num_tokens(); 
+
+  if(m_num_tokens>LIBDO_MAX_BYTES)
+  {
+    //report error somehow?
+    return 0.0;
+  }
+
+  //iterate through whole m_token_freq array, but only count
+  //spots that have a registered token (i.e., freq>0)
+  for(i=0;i<LIBDO_MAX_BYTES;i++)
+  {
+    if(0!=m_token_freqs[i])
+    {
+      token = i;
+      freq = ((float)m_token_freqs[token]); 
+      m_token_probs[token] = (freq / ((float)num_events));
+      entropy += m_token_probs[token] * log2(m_token_probs[token]);
+    }
+  }
+
+  bits = -1.0 * entropy;
+  m_maxent = log2(m_num_tokens);
+  m_ratio = bits / m_maxent;
+
+  return bits;
+}
+
+int 
+get_num_tokens()
+{
+  return m_num_tokens;
+}
+
+float 
+get_max_entropy()
+{
+  return m_maxent;
+}
+
+float 
+get_entropy_ratio()
+{
+  return m_ratio;
+}
diff --git a/disorder.h b/disorder.h
new file mode 100644
index 0000000..3458774
--- /dev/null
+++ b/disorder.h
@@ -0,0 +1,62 @@
+/***************************************************************************
+ *  libdisorder: A Library for Measuring Byte Stream Entropy
+ *  Copyright (C) 2010 Michael E. Locasto
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the:
+ *       Free Software Foundation, Inc.
+ *       59 Temple Place, Suite 330
+ *       Boston, MA  02111-1307  USA
+ *
+ * $Id$
+ **************************************************************************/
+
+#ifndef __DISORDER_H_
+#define __DISORDER_H_
+
+/** Max number of bytes (i.e., tokens) */
+#define LIBDO_MAX_BYTES      256
+
+/** A convienance value for clients of this library. Feel free to change
+ * if you plan to use a larger buffer. You can also safely ignore it, as
+ * libdisorder does not use this value internally; it relies on the
+ * client-supplied `length' parameter.
+ *
+ * NB: Might become deprecated because it is potentially misleading and
+ * has zero relationship to any library internal state.
+ */
+#define LIBDO_BUFFER_LEN   16384
+
+/** 
+ * Given a pointer to an array of bytes, return a float indicating the
+ * level of entropy in bits (a number between zero and eight),
+ * assuming a space of 256 possible byte values. The second argument
+ * indicates the number of bytes in the sequence. If this sequence
+ * runs into unallocated memory, this function should fail with a
+ * SIGSEGV.
+ */
+float    shannon_H(char*, long long);
+
+/** Report the number of (unique) tokens seen. This is _not_ the
+    number of individual events seen. For example, if the library sees
+    the string `aaab', the number of events is 4 and the number of
+    tokens is 2. */
+int      get_num_tokens(void);
+
+/** Returns maximum entropy for byte distributions log2(256)=8 bits*/
+float    get_max_entropy(void);
+
+/** Returns the ratio of entropy to maxentropy */
+float    get_entropy_ratio(void);
+
+#endif
diff --git a/libdisorder.LICENSE b/libdisorder.LICENSE
new file mode 100644
index 0000000..8bbcff9
--- /dev/null
+++ b/libdisorder.LICENSE
@@ -0,0 +1,339 @@
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+

+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+

+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+

+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+

+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+

+	    How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/split.cpp b/split.cpp
new file mode 100644
index 0000000..5f1dc4e
--- /dev/null
+++ b/split.cpp
@@ -0,0 +1,33 @@
+#include "split.h"
+
+std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) {
+    std::stringstream ss(s);
+    std::string item;
+    while(std::getline(ss, item, delim)) {
+        elems.push_back(item);
+    }
+    return elems;
+}
+
+std::vector<std::string> split(const std::string &s, char delim) {
+    std::vector<std::string> elems;
+    return split(s, delim, elems);
+}
+
+std::vector<std::string> &split(const std::string &s, const std::string& delims, std::vector<std::string> &elems) {
+    char* tok;
+    char cchars [s.size()+1];
+    char* cstr = &cchars[0];
+    strcpy(cstr, s.c_str());
+    tok = strtok(cstr, delims.c_str());
+    while (tok != NULL) {
+        elems.push_back(tok);
+        tok = strtok(NULL, delims.c_str());
+    }
+    return elems;
+}
+
+std::vector<std::string> split(const std::string &s, const std::string& delims) {
+    std::vector<std::string> elems;
+    return split(s, delims, elems);
+}
diff --git a/split.h b/split.h
new file mode 100644
index 0000000..bd5525d
--- /dev/null
+++ b/split.h
@@ -0,0 +1,20 @@
+#ifndef __SPLIT_H
+#define __SPLIT_H
+
+// functions to split a string by a specific delimiter
+#include <string>
+#include <vector>
+#include <sstream>
+#include <string.h>
+
+// thanks to Evan Teran, http://stackoverflow.com/questions/236129/how-to-split-a-string/236803#236803
+
+// split a string on a single delimiter character (delim)
+std::vector<std::string>& split(const std::string &s, char delim, std::vector<std::string> &elems);
+std::vector<std::string>  split(const std::string &s, char delim);
+
+// split a string on any character found in the string of delimiters (delims)
+std::vector<std::string>& split(const std::string &s, const std::string& delims, std::vector<std::string> &elems);
+std::vector<std::string>  split(const std::string &s, const std::string& delims);
+
+#endif
diff --git a/tests/correct.fasta b/tests/correct.fasta
new file mode 100644
index 0000000..99af0c0
--- /dev/null
+++ b/tests/correct.fasta
@@ -0,0 +1,30 @@
+>1
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCTAACC
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG
+>2
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCTAACC
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAAC
+>3
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCT
diff --git a/tests/embedded_newline.fasta b/tests/embedded_newline.fasta
new file mode 100644
index 0000000..26b21e7
--- /dev/null
+++ b/tests/embedded_newline.fasta
@@ -0,0 +1,35 @@
+>1
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCTAACC
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG
+
+
+
+>2
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCTAACC
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAAC
+
+>3
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCT
diff --git a/tests/mismatched_lines.fasta b/tests/mismatched_lines.fasta
new file mode 100644
index 0000000..56a7020
--- /dev/null
+++ b/tests/mismatched_lines.fasta
@@ -0,0 +1,30 @@
+>1
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCTAACC
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG
+>2
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCTAACC
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAAC
+>3
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCT
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG
diff --git a/tests/trailing_newlines.fasta b/tests/trailing_newlines.fasta
new file mode 100644
index 0000000..377513f
--- /dev/null
+++ b/tests/trailing_newlines.fasta
@@ -0,0 +1,34 @@
+>1
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCTAACC
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG
+
+
+
+>2
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCTAACC
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAAC
+
+>3
+TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTAACCCTAACCCTAA
+CCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCTAACCCTAACCCTAACCCTAACCCTAACCCT
+AACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAAACCCTAAACCCTAACCCTAACCCTAACCCTA
+CTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCCTAACCCCTAACCCTAACCCTAACCCTA
+ACCCTAACCCTAACCCTAACCCCTAACCCTAACCCTAACCCTAACCCTCGCGGTACCCTCAGCCGGCCCG
+CCCGCCCGGGTCTGACCTGAGGAGAACTGTGCTCCGCCTTCAGAGTACCACCGAAATCTGTGCAGAGGAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCT

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/libfastahack.git