[med-svn] [Git][med-team/andi][master] 3 commits: Imported Upstream version 0.12

Mon Feb 26 14:15:50 UTC 2018

Fabian Klötzl pushed to branch master at Debian Med / andi


Commits:
e28213ab by Fabian Klötzl at 2018-02-26T15:00:32+01:00
Imported Upstream version 0.12
- - - - -
09638a2e by Fabian Klötzl at 2018-02-26T15:00:33+01:00
Merge tag 'upstream/0.12'

Upstream version 0.12

- - - - -
1012cec5 by Fabian Klötzl at 2018-02-26T15:15:26+01:00
prepare for debian/0.12-1

- - - - -


22 changed files:

- .gitignore
- .travis.yml
- Makefile.am
- andi-manual.pdf
- configure.ac
- debian/changelog
- docs/andi.1.in
- docs/manual/andi-manual.tex
- scripts/_andi
- src/andi.c
- src/dist_hack.h
- src/esa.c
- src/global.h
- src/io.c
- src/model.c
- src/process.c
- test/Makefile.am
- + test/low_homo.sh
- + test/nan.sh
- test/test_join.sh
- + test/test_process.c
- test/test_random.sh


Changes:

=====================================
.gitignore
=====================================

--- a/.gitignore
+++ b/.gitignore
@@ -58,6 +58,7 @@ test-driver
 test_esa
 test_seq
 test_fasta
+test_process
 *.trs
 
 # Coverage


=====================================
.travis.yml
=====================================
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,4 @@
 language: cpp
-os:
-  - linux
-  # - osx
 compiler:
   - gcc
   - clang
@@ -13,9 +10,9 @@ addons:
       - ubuntu-toolchain-r-test
     packages:
       - cmake
-      - g++-4.8
       - libglib2.0-dev
       - libgsl0-dev
+
 install:
   - export LIBDIVDIR="$HOME/libdivsufsort"
   - pip install --user cpp-coveralls
@@ -24,30 +21,26 @@ install:
   - cd libdivsufsort-master && mkdir build && cd build
   - cmake -DCMAKE_BUILD_TYPE="Release" -DCMAKE_INSTALL_PREFIX="$LIBDIVDIR" ..
   - make && make install
-  - if [ "${TRAVIS_OS_NAME}" = "osx" ]; then brew install gsl; fi
-  - if [ "${TRAVIS_OS_NAME}" = "osx" ]; then brew install glib; fi
-before_install:
-  - if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
 
-# - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 90
 script:
-- export DYLD_LIBRARY_PATH="$DYLD_LIBRARY_PATH:$LIBDIVDIR/lib"
+- CONFIGURE_FLAGS=""
 - export LD_LIBRARY_PATH="$LIBDIVDIR:$LIBDIVDIR/lib"
 - export LIBRARY_PATH="$LIBDIVDIR:$LIBRARY_PATH"
 - cd $TRAVIS_BUILD_DIR
 - autoreconf -fvi -Im4
 - export MYFLAGS="-fprofile-arcs -ftest-coverage -I$LIBDIVDIR/include"
-- ./configure --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS"
+- if [ "${CC}" = "clang" ]; then export CONFIGURE_FLAGS="--disable-openmp"; fi
+- ./configure $CONFIGURE_FLAGS --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS"
 - make
-- make check
+- make check || cat ./test-suite.log || exit 1
 - export MYFLAGS="-I$LIBDIVDIR/include"
-- ./configure --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS"
-- make distcheck DISTCHECK_CONFIGURE_FLAGS="LDFLAGS=\"-L$LIBDIVDIR/lib\" CFLAGS=\"-I$LIBDIVDIR/include\" CXXFLAGS=\"-I$LIBDIVDIR/include\""
+- ./configure $CONFIGURE_FLAGS --enable-unit-tests LDFLAGS="-L$LIBDIVDIR/lib" CFLAGS="$MYFLAGS" CXXFLAGS="$MYFLAGS"
+- make distcheck DISTCHECK_CONFIGURE_FLAGS="LDFLAGS=\"-L$LIBDIVDIR/lib\" CFLAGS=\"-I$LIBDIVDIR/include\" CXXFLAGS=\"-I$LIBDIVDIR/include\" $CONFIGURE_FLAGS"
 - tar xzvf andi-*.tar.gz
 - cd andi-*
-- ./configure --enable-unit-tests --without-libdivsufsort
+- ./configure $CONFIGURE_FLAGS --enable-unit-tests --without-libdivsufsort
 - make
-- make check
+- make check || cat ./test-suite.log || exit 1
 - cd ..
 after_success:
-- if [ "${TRAVIS_OS_NAME}" = "linux" -a "$CXX" = "g++-4.8" ]; then coveralls --exclude libdivsufsort-master -E '^andi-.*' --exclude libs --exclude test --gcov `which gcov-4.8` --gcov-options '\-lp'; fi
+- if [ "$CXX" = "g++" ]; then coveralls --exclude libdivsufsort-master -E '^andi-.*' --exclude libs --exclude test --gcov `which gcov-4.8` --gcov-options '\-lp'; fi


=====================================
Makefile.am
=====================================
--- a/Makefile.am
+++ b/Makefile.am
@@ -18,7 +18,8 @@ SUBDIRS+= test
 AM_TESTS_ENVIRONMENT= \
 	RANDOM_SEED='@SEED@' ; export RANDOM_SEED ;
 
-TESTS = test/test_esa test/test_seq test/test_extra.sh test/test_random.sh test/test_join.sh
+XFAIL_TESTS=
+TESTS = $(XFAIL_TESTS) test/nan.sh test/low_homo.sh test/test_esa test/test_seq test/test_extra.sh test/test_random.sh test/test_join.sh test/test_process
 
 $(TESTS): src/andi
 


=====================================
andi-manual.pdf
=====================================
Binary files a/andi-manual.pdf and b/andi-manual.pdf differ


=====================================
configure.ac
=====================================
--- a/configure.ac
+++ b/configure.ac
@@ -1,5 +1,5 @@
-AC_INIT([andi], [0.11])
-AM_INIT_AUTOMAKE([-Wall foreign ])
+AC_INIT([andi], [0.12])
+AM_INIT_AUTOMAKE([-Wall foreign])
 
 AC_CONFIG_MACRO_DIR([m4])
 
@@ -78,8 +78,7 @@ AC_ARG_WITH([seed],
 
 AC_SUBST([SEED])
 
-if test "x${try_unit_tests}" = xyes; then
-
+AS_IF([test "x${try_unit_tests}" = xyes], [
 	have_glib=yes
 	PKG_CHECK_MODULES([GLIB], [glib-2.0], [], [have_glib=no])
 
@@ -88,7 +87,7 @@ if test "x${try_unit_tests}" = xyes; then
 	fi
 
 	AX_CXX_COMPILE_STDCXX_11([],[mandatory])
-fi
+])
 
 
 # Check for various headers including those used by libdivsufsort.


=====================================
debian/changelog
=====================================
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,9 +1,10 @@
-andi (0.11-2) UNRELEASED; urgency=medium
+andi (0.12-1) UNRELEASED; urgency=medium
 
   * Team upload.
 
   [ Fabian Klötzl ]
   * Remove useless dh-autoreconf build dependency
+  * Import new upstream release
 
   [ Steffen Moeller ]
   * debian/upstream/metadata: added refs to registries


=====================================
docs/andi.1.in
=====================================
--- a/docs/andi.1.in
+++ b/docs/andi.1.in
@@ -1,9 +1,9 @@
-.TH ANDI "1" "2017-06-30" "@VERSION@" "andi manual"
+.TH ANDI "1" "2017-09-17" "@VERSION@" "andi manual"
 .SH NAME
 andi \- estimates evolutionary distance
 .SH SYNOPSIS
 .B andi
-[\fI-jlv\fR] [\fI-b INT\fR] [\fI-p FLOAT\fR] [\fI-m MODEL\fR] [\fI-t INT\fR] \fIFILES\fR...
+[\fIOPTIONS...\fR] \fIFILES\fR...
 .SH DESCRIPTION
 .TP
 \fBandi\fR estimates the evolutionary distance between closely related genomes. For this \fBandi\fR reads the input sequences from \fIFASTA\fR files and computes the pairwise anchor distance. The idea behind this is explained in a paper by Haubold et al. (2015).
@@ -11,10 +11,10 @@ andi \- estimates evolutionary distance
 The output is a symmetrical distance matrix in \fIPHYLIP\fR format, with each entry representing divergence with a positive real number. A distance of zero means that two sequences are identical, whereas other values are estimates for the nucleotide substitution rate (Jukes-Cantor corrected). For technical reasons the comparison might fail and no estimate can be computed. In such cases \fInan\fR is printed. This either means that the input sequences were too short (<200bp) or too diverse (K>0.5) for our method to work properly.
 .SH OPTIONS
 .TP
-\fB\-b\fR, \fB\-\-bootstrap\fR <INT>
+\fB\-b\fR \fIINT\fR, \fB\-\-bootstrap\fR=\fIINT\fR
 Compute multiple distance matrices, with \fIn-1\fR bootstrapped from the first. See the paper Klötzl & Haubold (2016) for a detailed explanation.
 .TP
-\fB--file-of-filenames\fR <FILE>
+\fB--file-of-filenames\fR=\fIFILE\fR
 Usually, \fBandi\fR is called with the filenames as commandline arguments. With this option the filenames may also be read from a file itself, with one name per line. Use a single dash (\fB'-'\fR) to read from stdin.
 .TP
 \fB\-j\fR, \fB\-\-join\fR
@@ -23,13 +23,16 @@ Use this mode if each of your \fIFASTA\fR files represents one assembly with num
 \fB\-l\fR, \fB\-\-low-memory\fR
 In multithreaded mode, \fBandi\fR requires memory linear to the amount of threads. The low memory mode changes this to a constant demand independent from the used number of threads. Unfortunately, this comes at a significant runtime cost.
 .TP
-\fB\-m\fR, \fB\-\-model\fR <Raw|JC|Kimura>
-Different models of nucleotide evolution are supported. By default the Jukes-Cantor correction is used.
+\fB\-m\fR \fIMODEL\fR, \fB\-\-model\fR=\fIMODEL\fr
+Set the nucleotide evolution model to one of 'Raw', 'JC', or 'Kimura'. By default the Jukes-Cantor correction is used.
 .TP
-\fB\-p\fR <FLOAT>
+\fB\-p\fR \fIFLOAT\fR
 Significance of an anchor; default: 0.025.
 .TP
-\fB\-t\fR, \fB\-\-threads\fR <INT>
+\fB--progress\fR[=\fIWHEN\fR]
+Print a progress bar. \fIWHEN\fR can be 'auto' (default if omitted), 'always', or 'never'.
+.TP
+\fB\-t\fR \fIINT\fR, \fB\-\-threads\fR=\fIINT\fR
 The number of threads to be used; by default, all available processors are used.
 .br
 Multithreading is only available if \fBandi\fR was compiled with OpenMP support.
@@ -38,7 +41,7 @@ Multithreading is only available if \fBandi\fR was compiled with OpenMP support.
 By default \fBandi\fR outputs the full names of sequences, optionally padded with spaces, if they are shorter than ten characters. Names longer than ten characters may lead to problems with downstream tools. With this switch names will be truncated.
 .TP
 \fB\-v\fR, \fB\-\-verbose\fR
-Prints additional information. Apply multiple times for extra verboseness.
+Prints additional information, including the amount of found homology. Apply multiple times for extra verboseness.
 .TP
 \fB\-h\fR, \fB\-\-help\fR
 Prints the synopsis and an explanation of available options.
@@ -46,7 +49,7 @@ Prints the synopsis and an explanation of available options.
 \fB\-\-version\fR
 Outputs version information and acknowledgments.
 .SH COPYRIGHT
-Copyright \(co 2014 - 2016 Fabian Klötzl
+Copyright \(co 2014 - 2017 Fabian Klötzl
 License GPLv3+: GNU GPL version 3 or later.
 .br
 This is free software: you are free to change and redistribute it.


=====================================
docs/manual/andi-manual.tex
=====================================
--- a/docs/manual/andi-manual.tex
+++ b/docs/manual/andi-manual.tex
@@ -106,61 +106,62 @@ This document is release under the Creative Commons Attribution Share-Alike lice
 The easiest way to install \andi is via a package manager. This also handles all dependencies for you.
 
 
-\noindent Debian and Ubuntu (since 16.04):
+\noindent Debian and Ubuntu:
 
 \begin{lstlisting}
 ~ %  sudo apt-get install andi
 \end{lstlisting}
 
-\noindent OS X with homebrew:
+\noindent macOS with homebrew:
 
 \begin{lstlisting}
 ~ %  brew install homebrew/science/andi
 \end{lstlisting}
 
-\noindent ArchLinux:
+\noindent ArchLinux AUR package with aura:
 
 \begin{lstlisting}
 ~ %  aura -A andi
 \end{lstlisting}
 
-\andi is intended to run in a \algo{Unix} commandline such as \lstinline$bash$ or \lstinline$zsh$. All examples in this document are also intended for that environment. You can verify that \andi was installed correctly by executing \lstinline$andi -h$. This should give you a list of all available options (see Section~\ref{sec:options}).
+\andi is intended to be run in a \algo{Unix} commandline such as \lstinline$bash$ or \lstinline$zsh$. All examples in this document are also intended for that environment. You can verify that \andi was installed correctly by executing \lstinline$andi -h$. This should give you a list of all available options (see Section~\ref{sec:options}).
 
 \section{Source Package} \label{sub:regular}
 
-Download the latest \href{https://github.com/EvolBioInf/andi/releases}{release} from GitHub. Please note, that \andi requires the \algo{Gnu Scientific Library} and optionally \algo{libdivsufsort}\footnote{\url{https://github.com/y-256/libdivsufsort}} for optimal performance \cite{divsufsort}. If you wish to install \andi without \algo{libdivsufsort}, consult Section~\ref{sub:wo-divsufsort}.
+To build \andi from source, download the latest \href{https://github.com/EvolBioInf/andi/releases}{release} from GitHub. Please note, that \andi requires the \algo{Gnu Scientific Library} and optionally \algo{libdivsufsort}\footnote{\url{https://github.com/y-256/libdivsufsort}} for optimal performance \cite{divsufsort}. If you wish to install \andi without \algo{libdivsufsort}, consult Section~\ref{sub:wo-divsufsort}.
 
 Once you have downloaded the package, unzip it and change into the newly created directory. 
 
 \begin{lstlisting}
-~ %  tar -xzvf andi-0.11.tar.gz
-~ %  cd andi-0.11
+~ %  tar -xzvf andi-0.12.tar.gz
+~ %  cd andi-0.12
 \end{lstlisting}
 
 \noindent Now build and install \andi.
 
 \begin{lstlisting}
-~/andi-0.11 %  ./configure
-~/andi-0.11 %  make
-~/andi-0.11 %  sudo make install
+~/andi-0.12 %  ./configure
+~/andi-0.12 %  make
+~/andi-0.12 %  sudo make install
 \end{lstlisting}
 
-\noindent This installs \andi for all users on your system. If you do not have root privileges, you will find a working copy of \andi in the \lstinline$src$ subdirectory. For the rest of this documentation, I will assume, that \andi is in your \textdollar\lstinline!PATH!.
+\noindent This installs \andi for all users on your system. If you do not have root privileges, you will find a working copy of \andi in the \lstinline$src$ subdirectory. For the rest of this documentation, it is assumed, that \andi is in your \textdollar\lstinline!PATH!.
 
 Now \andi should be ready for use. Try invoking the help.
 
 \begin{lstlisting}
-~/andi-0.11 %  andi --help
-Usage: andi [-jlv] [-b INT] [-p FLOAT] [-m MODEL] [-t INT] FILES...
-  FILES... can be any sequence of FASTA files. If no files are supplied, stdin is used instead.
+~/andi-0.12 %  Usage: andi [OPTIONS...] FILES...
+  FILES... can be any sequence of FASTA files.
+  Use '-' as file name to read from stdin.
 Options:
-  -b, --bootstrap <INT>  Print additional bootstrap matrices
-      --file-of-filenames <FILE>  Read additional filenames from FILE; one per line
+  -b, --bootstrap=INT  Print additional bootstrap matrices
+      --file-of-filenames=FILE  Read additional filenames from FILE; one per line
   -j, --join           Treat all sequences from one file as a single genome
   -l, --low-memory     Use less memory at the cost of speed
-  -m, --model <Raw|JC|Kimura>  Pick an evolutionary model; default: JC
-  -p <FLOAT>           Significance of an anchor; default: 0.025
-  -t, --threads <INT>  Set the number of threads; by default, all available processors are used
+  -m, --model=MODEL    Pick an evolutionary model of 'Raw', 'JC', 'Kimura'; default: JC
+  -p FLOAT             Significance of an anchor; default: 0.025
+      --progress=WHEN  Print a progress bar 'always', 'never', or 'auto'; default: auto
+  -t, --threads=INT    Set the number of threads; by default, all processors are used
       --truncate-names Truncate names to ten characters
   -v, --verbose        Prints additional information
   -h, --help           Display this help and exit
@@ -230,9 +231,9 @@ When the \algo{join} mode is active, the file names are used to label the indivi
 
 If not enough file names are provided, \andi will try to read sequences from the standard input stream. This behaviour can be explicitly triggered by passing a single dash (\lstinline$-$) as a file name, which is useful in pipelines.
 
-If \andi seems to take unusually long, or requires huge amounts of memory, then you might have forgotten the \algo{join} switch. This makes \andi compare each contig instead of each genome, resulting in many more comparisons! To make \andi output the number of genome it about to compare, use the \lstinline$--verbose$ switch.
+If \andi seems to take unusually long, or requires huge amounts of memory, then you might have forgotten the \algo{join} switch. This makes \andi compare each contig instead of each genome, resulting in many more comparisons! Since version 0.12 \andi produces a progressmeter on the standard error stream. \andi tries to be smart about when to show or hide the progress bar. You can manually change this behaviour using the \lstinline!--progress! option.
 
-Starting with version 0.11 \andi supports an extra way of input. Instead of passing file names directly to \andi via the commandline arguments, the files may also be read from a file itself. Using this new \lstinline$--file-of-filenames$ can work around limitations imposed be the shell.
+Starting with version 0.11 \andi supports an extra way of input. Instead of passing file names directly to \andi via the commandline arguments, the file names may also be read from a file itself. Using this new \lstinline$--file-of-filenames$ argument can work around limitations imposed be the shell.
 
 The following three snippets have the same functionality.
 
@@ -267,7 +268,7 @@ If the computation completed successfully, \andi exits with the status code 0. O
 
 \section{Options} \label{sec:options}
 
-\andi takes a small number of commandline options, of which even fewer are of interest on a day-to-day basis. If \lstinline$andi -h$ displays a \lstinline$-t$ option, then \andi was compiled with multi-threading support (implemented using \algo{OpenMP}). By default \andi uses all available processors. However, to restrict the number of threads, use \lstinline$-t$.
+\andi takes a small number of commandline options, of which even fewer are of interest on a day-to-day basis. If \lstinline$andi -h$ displays a \lstinline$-t$ option, then \andi was compiled with multi-threading support (implemented using \algo{OpenMP}). By default, \andi uses all available processors. However, to restrict the number of threads, use \lstinline$-t$.
 
 \begin{lstlisting}
 ~ %  time andi ../test/1M.1.fasta -t 1
@@ -298,13 +299,13 @@ S1        0.0000 0.1071
 S2        0.1071 0.0000
 \end{lstlisting}
 
-The original \algo{phylip} only supports distance matrices with names no longer than ten characters. However, this sometimes leads to problems with long accession numbers. Starting with version 0.11 \andi print the full name of a sequence, even if it is longer than ten characters. If your downstream tools have trouble with this, use \lstinline$--truncate-names$ to reimpose the limit.
+The original \algo{phylip} only supports distance matrices with names no longer than ten characters. However, this sometimes leads to problems with long accession numbers. Starting with version 0.11 \andi prints the full name of a sequence, even if it is longer than ten characters. If your downstream tools have trouble with this, use \lstinline$--truncate-names$ to reimpose the limit.
 
 Also new in version 0.11 is the \lstinline$--file-of-filenames$ option. See Section~\ref{sec:join} for details.
 
 \section{Example: \algo{eco29}}
 
-Here follows a real-world example of how to use \algo{andi}. It makes heavy use of the commandline and tools like \algo{Phylip}. If you prefer \algo{R}, check out this excellent \href{http://holtlab.net/2015/05/08/r-code-to-infer-tree-from-andi-output/}{blog post} by Kathryn Holt.
+Here follows a real-world example of how to use \algo{andi}. It makes heavy use of the commandline and tools like \algo{Phylip}. If you prefer \algo{R}, check out this excellent blog post by Kathryn Holt.\footnote{\url{http://holtlab.net/2015/05/08/r-code-to-infer-tree-from-andi-output/}}
 
 As a data set we use \algo{eco29}; 29 genomes of \textit{E. Coli} and \textit{Shigella}. You can download the data from here: {\small{\url{http://guanine.evolbio.mpg.de/andi/eco29.fasta.gz}}}. The genomes have an average length of 4.9~million nucleotides amounting to a total \SI{138}{\mega\byte}.
 
@@ -414,11 +415,11 @@ Some command line parameters of \andi require arguments. If these are not of the
 
 \section{Output-related Warnings}
 
-As the input sequences get more evolutionary divergent, \andi finds less anchors. With less anchors, less nucleotides are considered homologous between two sequences. If no anchors are found, comparison fails and \lstinline!nan! is printed instead. See our paper and especially Figure~2 for details.
+As the input sequences get more evolutionary divergent, \andi finds less homologous anchors. With less anchors, less nucleotides are considered homologous between two sequences. If no anchors are found, comparison fails and \lstinline!nan! is printed instead. See our paper and especially Figure~2 for details.
 
 \subsection*{NaN}
 
-No anchors were found. Your sequences are very divergent ($d>0.5$) or sprout a lot of indels that make comparison difficult.
+No homologous sections were found. Your sequences are very divergent ($d>0.5$) or sprout a lot of indels that make comparison difficult.
 
 \subsection*{Little Homology}
 
@@ -469,7 +470,7 @@ The unit tests are located in the \andi repository under the \lstinline$./test$ 
 ~/andi %   make check
 \end{lstlisting}
 
-\noindent The unit tests are also checked each time a commit is sent to the repository. This is done via \algo{TravisCI}.\footnote{\url{https://travis-ci.org/EvolBioInf/andi}} Thus, a warning is produced, when the builds fail, or the unit tests to not run successfully. Currently, the unit tests cover more than 75\% of the code. This is computed via the \algo{Travis} builds and a service called \algo{Coveralls}.\footnote{\url{https://coveralls.io/r/EvolBioInf/andi}} Unfortunately, coveralls is broken at this point in time.
+\noindent The unit tests are also checked each time a commit is sent to the repository. This is done via \algo{TravisCI}.\footnote{\url{https://travis-ci.org/EvolBioInf/andi}} Thus, a warning is produced, when the builds fail, or the unit tests did not run successfully. Currently, the unit tests cover more than 75\% of the code. This is computed via the \algo{Travis} builds and a service called \algo{Coveralls}.\footnote{\url{https://coveralls.io/r/EvolBioInf/andi}}
 
 \section{Known Issues}
 


=====================================
scripts/_andi
=====================================
--- a/scripts/_andi
+++ b/scripts/_andi
@@ -27,6 +27,7 @@ args+=(
 		Kimura\:Kimura\-two\-parameter
 	))'
 	"($info)-p+[Significance of an anchor; default\: 0.025]:float:"
+	"($info)--progress=[Show progress bar]:when:(always auto never)"
 	"($info -t --threads)"{-t+,--threads=}'[The number of threads to be used; by default, all available processors are used]:num_threads:'
 	"($info)--truncate-names[Print only the first ten characters of each name]"
 	"($info)*"{-v,--verbose}'[Prints additional information]'


=====================================
src/andi.c
=====================================
--- a/src/andi.c
+++ b/src/andi.c
@@ -45,10 +45,9 @@
 int FLAGS = 0;
 int THREADS = 1;
 long unsigned int BOOTSTRAP = 0;
-double RANDOM_ANCHOR_PROP = 0.025;
+double ANCHOR_P_VALUE = 0.025;
 gsl_rng *RNG = NULL;
 int MODEL = M_JC;
-int EXIT_CODE = EXIT_SUCCESS;
 
 void usage(int);
 void version(void);
@@ -66,6 +65,7 @@ int main(int argc, char *argv[]) {
 		{"version", no_argument, NULL, 0},
 		{"truncate-names", no_argument, NULL, 0},
 		{"file-of-filenames", required_argument, NULL, 0},
+		{"progress", optional_argument, NULL, 0},
 		{"help", no_argument, NULL, 'h'},
 		{"verbose", no_argument, NULL, 'v'},
 		{"join", no_argument, NULL, 'j'},
@@ -79,6 +79,9 @@ int main(int argc, char *argv[]) {
 	// Use all available processors by default.
 	THREADS = omp_get_num_procs();
 #endif
+
+	enum { P_AUTO, P_NEVER, P_ALWAYS } progress = P_AUTO;
+
 	struct string_vector file_names;
 	string_vector_init(&file_names);
 
@@ -105,6 +108,19 @@ int main(int argc, char *argv[]) {
 				if (strcasecmp(option_str, "file-of-filenames") == 0) {
 					read_into_string_vector(optarg, &file_names);
 				}
+				if (strcasecmp(option_str, "progress") == 0) {
+					if (!optarg || strcasecmp(optarg, "always") == 0) {
+						progress = P_ALWAYS;
+					} else if (strcasecmp(optarg, "auto") == 0) {
+						progress = P_AUTO;
+					} else if (strcasecmp(optarg, "never") == 0) {
+						progress = P_NEVER;
+					} else {
+						warnx("invalid argument to --progress '%s'. Expected "
+							  "one of 'auto', 'always', or 'never'.",
+							  optarg);
+					}
+				}
 				break;
 			}
 			case 'h': usage(EXIT_SUCCESS); break;
@@ -117,21 +133,21 @@ int main(int argc, char *argv[]) {
 				double prop = strtod(optarg, &end);
 
 				if (errno || end == optarg || *end != '\0') {
-					warnx(
+					soft_errx(
 						"Expected a floating point number for -p argument, but "
 						"'%s' was given. Skipping argument.",
 						optarg);
 					break;
 				}
 
-				if (prop < 0.0 || prop > 1.0) {
-					warnx("A probability should be a value between 0 and 1; "
-						  "Ignoring -p %f argument.",
-						  prop);
+				if (prop <= 0.0 || prop >= 1.0) {
+					soft_errx("A probability should be a value between 0 and "
+							  "1, exlusive; Ignoring -p %f argument.",
+							  prop);
 					break;
 				}
 
-				RANDOM_ANCHOR_PROP = prop;
+				ANCHOR_P_VALUE = prop;
 				break;
 			}
 			case 'l': FLAGS |= F_LOW_MEMORY; break;
@@ -172,7 +188,7 @@ int main(int argc, char *argv[]) {
 				long unsigned int bootstrap = strtoul(optarg, &end, 10);
 
 				if (errno || end == optarg || *end != '\0' || bootstrap == 0) {
-					warnx(
+					soft_errx(
 						"Expected a positive number for -b argument, but '%s' "
 						"was given. Ignoring -b argument.",
 						optarg);
@@ -191,8 +207,9 @@ int main(int argc, char *argv[]) {
 				} else if (strcasecmp(optarg, "KIMURA") == 0) {
 					MODEL = M_KIMURA;
 				} else {
-					warnx("Ignoring argument for --model. Expected Raw, JC or "
-						  "Kimura");
+					soft_errx(
+						"Ignoring argument for --model. Expected Raw, JC or "
+						"Kimura");
 				}
 				break;
 			}
@@ -217,11 +234,13 @@ int main(int argc, char *argv[]) {
 
 	size_t minfiles = FLAGS & F_JOIN ? 2 : 1;
 	if (string_vector_size(&file_names) < minfiles) {
-		// not enough files passed via arguments; read from stdin.
-		string_vector_push_back(&file_names, "-");
-		// explain to the user, why nothing is happening.
-		if (isatty(STDIN_FILENO)) {
-			warnx("Not enough file names given; expecting input via stdin.");
+		// not enough files passed via arguments
+		if (!isatty(STDIN_FILENO)) {
+			// read from stdin in pipe
+			string_vector_push_back(&file_names, "-");
+		} else {
+			// print a helpful message on './andi' without args
+			usage(EXIT_FAILURE);
 		}
 	}
 
@@ -248,11 +267,6 @@ int main(int argc, char *argv[]) {
 			 n);
 	}
 
-	if (FLAGS & F_VERBOSE) {
-		fprintf(stderr, "Comparing %zu sequences\n", n);
-		fflush(stderr);
-	}
-
 	RNG = gsl_rng_alloc(gsl_rng_default);
 	if (!RNG) {
 		err(1, "RNG allocation failed.");
@@ -293,20 +307,27 @@ int main(int argc, char *argv[]) {
 	}
 
 	if (FLAGS & F_SHORT) {
-		warnx("One of the given input sequences is shorter than a thousand "
-			  "nucleotides. This may result in inaccurate distances. Try an "
-			  "alignment instead.");
+		soft_errx(
+			"One of the given input sequences is shorter than a thousand "
+			"nucleotides. This may result in inaccurate distances. Try an "
+			"alignment instead.");
 	}
 
-	// side channel
-	EXIT_CODE = EXIT_SUCCESS;
+	// determine whether to print a progress bar
+	if (progress == P_AUTO) {
+		progress = isatty(STDERR_FILENO) ? P_ALWAYS : P_NEVER;
+	}
+	if (progress == P_ALWAYS) {
+		FLAGS |= F_PRINT_PROGRESS;
+	}
 
 	// compute distance matrix
 	calculate_distances(dsa_data(&dsa), n);
 
 	dsa_free(&dsa);
 	gsl_rng_free(RNG);
-	return EXIT_CODE;
+
+	return FLAGS & F_SOFT_ERROR ? EXIT_FAILURE : EXIT_SUCCESS;
 }
 
 /**
@@ -314,22 +335,25 @@ int main(int argc, char *argv[]) {
  */
 void usage(int status) {
 	const char str[] = {
-		"Usage: andi [-jlv] [-b INT] [-p FLOAT] [-m MODEL] [-t INT] FILES...\n"
-		"\tFILES... can be any sequence of FASTA files. If no files are "
-		"supplied, stdin is used instead.\n"
+		"Usage: andi [OPTIONS...] FILES...\n"
+		"\tFILES... can be any sequence of FASTA files.\n"
+		"\tUse '-' as file name to read from stdin.\n"
 		"Options:\n"
-		"  -b, --bootstrap <INT>  Print additional bootstrap matrices\n"
-		"      --file-of-filenames <FILE>  Read additional filenames from "
+		"  -b, --bootstrap=INT  Print additional bootstrap matrices\n"
+		"      --file-of-filenames=FILE  Read additional filenames from "
 		"FILE; one per line\n"
 		"  -j, --join           Treat all sequences from one file as a single "
 		"genome\n"
 		"  -l, --low-memory     Use less memory at the cost of speed\n"
-		"  -m, --model <Raw|JC|Kimura>  Pick an evolutionary model; default: "
+		"  -m, --model=MODEL    Pick an evolutionary model of 'Raw', 'JC', "
+		"'Kimura'; default: "
 		"JC\n"
-		"  -p <FLOAT>           Significance of an anchor; default: 0.025\n"
+		"  -p FLOAT             Significance of an anchor; default: 0.025\n"
+		"      --progress=WHEN  Print a progress bar 'always', 'never', or "
+		"'auto'; default: auto\n"
 #ifdef _OPENMP
-		"  -t, --threads <INT>  Set the number of threads; by default, all "
-		"available processors are used\n"
+		"  -t, --threads=INT    Set the number of threads; by default, all "
+		"processors are used\n"
 #endif
 		"      --truncate-names Truncate names to ten characters\n"
 		"  -v, --verbose        Prints additional information\n"


=====================================
src/dist_hack.h
=====================================
--- a/src/dist_hack.h
+++ b/src/dist_hack.h
@@ -2,9 +2,10 @@
  * @brief This file is a preprocessor hack for the two functions `distMatrix`
  * and `distMatrixLM`.
  */
+// clang-format off
 #ifdef FAST
 #define NAME distMatrix
-#define P_OUTER _Pragma("omp parallel for num_threads( THREADS)")
+#define P_OUTER _Pragma("omp parallel for num_threads( THREADS) default(none) shared(progress_counter) firstprivate( stderr, M, sequences, n, print_progress)")
 #define P_INNER
 #else
 #undef NAME
@@ -12,8 +13,9 @@
 #undef P_INNER
 #define NAME distMatrixLM
 #define P_OUTER
-#define P_INNER _Pragma("omp parallel for num_threads( THREADS)")
+#define P_INNER _Pragma("omp parallel for num_threads( THREADS) default(none) shared(progress_counter) firstprivate( stderr, M, sequences, n, print_progress, i, E, subject)")
 #endif
+// clang-format on
 
 /** @brief This function calls dist_andi for pairs of subjects and queries, and
  * thereby fills the distance matrix.
@@ -32,6 +34,14 @@
 void NAME(struct model *M, const seq_t *sequences, size_t n) {
 	size_t i;
 
+	size_t progress_counter = 0;
+	int print_progress = FLAGS & F_PRINT_PROGRESS;
+
+	if (print_progress) {
+		fprintf(stderr, "Comparing %zu sequences: %5.1f%% (%zu/%zu)", n, 0.0,
+				(size_t)0, n * n - n);
+	}
+
 	//#pragma
 	P_OUTER
 	for (i = 0; i < n; i++) {
@@ -56,15 +66,31 @@ void NAME(struct model *M, const seq_t *sequences, size_t n) {
 			size_t ql = sequences[j].len;
 
 			M(i, j) = dist_anchor(&E, sequences[j].S, ql, subject.gc);
+
+#pragma omp atomic update
+			progress_counter++;
 		}
 
-		// TODO: Provide a nicer progress indicator.
-		if (FLAGS & F_EXTRA_VERBOSE) {
+		if (print_progress) {
+			size_t local_progress_counter;
+			size_t num_comparisons = n * n - n;
+
+#pragma omp atomic read
+			local_progress_counter = progress_counter;
+
+			double progress =
+				100.0 * (double)local_progress_counter / num_comparisons;
+
 #pragma omp critical
-			fprintf(stderr, "Subject %s done.\n", sequences[i].name);
+			fprintf(stderr, "\rComparing %zu sequences: %5.1f%% (%zu/%zu)", n,
+					progress, local_progress_counter, num_comparisons);
 		}
 
 		esa_free(&E);
 		seq_subject_free(&subject);
 	}
+
+	if (print_progress) {
+		fprintf(stderr, ", done.\n");
+	}
 }


=====================================
src/esa.c
=====================================
--- a/src/esa.c
+++ b/src/esa.c
@@ -313,7 +313,9 @@ int esa_init_CLD(esa_s *C) {
 
 	const saidx_t *LCP = C->LCP;
 
-	typedef struct pair_s { saidx_t idx, lcp; } pair_t;
+	typedef struct pair_s {
+		saidx_t idx, lcp;
+	} pair_t;
 
 	pair_t *stack = malloc((C->len + 1) * sizeof(*stack));
 	CHECK_MALLOC(stack);


=====================================
src/global.h
=====================================
--- a/src/global.h
+++ b/src/global.h
@@ -26,11 +26,11 @@ extern int FLAGS;
 extern int THREADS;
 
 /**
- * The ::RANDOM_ANCHOR_PROP represents the probability with which a found
- * anchor is a random match and not homologous. Its value can be set using
- * the `-p` switch.
+ * The ::ANCHOR_P_VALUE represents the probability that an anchor will be found,
+ * if the two sequences are unrelated. I.e. it is the p-value for H_0: random
+ * sequences. Its value can be set using the `-p` switch.
  */
-extern double RANDOM_ANCHOR_PROP;
+extern double ANCHOR_P_VALUE;
 
 /**
  * The number of matrices that should be bootstrapped.
@@ -50,11 +50,6 @@ extern int MODEL;
 enum { M_RAW, M_JC, M_KIMURA };
 
 /**
- * Global exit code. Should be non-zero on error.
- */
-extern int EXIT_CODE;
-
-/**
  * This enum contains the available flags. Please note that all
  * available options are a power of 2.
  */
@@ -66,7 +61,9 @@ enum {
 	F_NON_ACGT = 8,
 	F_JOIN = 16,
 	F_LOW_MEMORY = 32,
-	F_SHORT = 64
+	F_SHORT = 64,
+	F_PRINT_PROGRESS = 128,
+	F_SOFT_ERROR = 256
 };
 
 /**
@@ -79,6 +76,26 @@ enum {
 		if (PTR == NULL) {                                                     \
 			err(errno, "Out of memory");                                       \
 		}                                                                      \
-	} while (0);
+	} while (0)
+
+/**
+ * @brief This macro is used to print a warning and make the program exit with
+ * an failure exit code, later.
+ */
+#define soft_err(...)                                                          \
+	do {                                                                       \
+		FLAGS |= F_SOFT_ERROR;                                                 \
+		warn(__VA_ARGS__);                                                     \
+	} while (0)
+
+/**
+ * @brief This macro is used to print a warning and make the program exit with
+ * an failure exit code, later.
+ */
+#define soft_errx(...)                                                         \
+	do {                                                                       \
+		FLAGS |= F_SOFT_ERROR;                                                 \
+		warnx(__VA_ARGS__);                                                    \
+	} while (0)
 
 #endif


=====================================
src/io.c
=====================================
--- a/src/io.c
+++ b/src/io.c
@@ -103,7 +103,8 @@ size_t string_vector_size(const struct string_vector *sv) {
 void read_into_string_vector(const char *file_name, struct string_vector *sv) {
 	FILE *file = strcmp(file_name, "-") ? fopen(file_name, "r") : stdin;
 	if (!file) {
-		err(errno, "%s", file_name);
+		soft_err("%s", file_name);
+		return;
 	}
 
 	while (1) {
@@ -118,7 +119,8 @@ void read_into_string_vector(const char *file_name, struct string_vector *sv) {
 		}
 
 		if (check == -1) {
-			err(errno, "%s", file_name);
+			soft_err("%s", file_name);
+			break;
 		}
 
 		char *nl = strchr(str, '\n');
@@ -137,7 +139,7 @@ void read_into_string_vector(const char *file_name, struct string_vector *sv) {
 
 	int check = fclose(file);
 	if (check != 0) {
-		err(errno, "%s", file_name);
+		soft_err("%s", file_name);
 	}
 }
 
@@ -198,7 +200,7 @@ void read_fasta(const char *file_name, dsa_t *dsa) {
 		strcmp(file_name, "-") ? open(file_name, O_RDONLY) : STDIN_FILENO;
 
 	if (file_descriptor < 0) {
-		warn("%s", file_name);
+		soft_err("%s", file_name);
 		return;
 	}
 
@@ -209,7 +211,7 @@ void read_fasta(const char *file_name, dsa_t *dsa) {
 	pfasta_file pf;
 
 	if ((l = pfasta_parse(&pf, file_descriptor)) != 0) {
-		warnx("%s: %s", file_name, pfasta_strerror(&pf));
+		soft_errx("%s: %s", file_name, pfasta_strerror(&pf));
 		goto fail;
 	}
 
@@ -225,7 +227,7 @@ void read_fasta(const char *file_name, dsa_t *dsa) {
 	}
 
 	if (l < 0) {
-		warnx("%s: %s", file_name, pfasta_strerror(&pf));
+		soft_errx("%s: %s", file_name, pfasta_strerror(&pf));
 		pfasta_seq_free(&ps);
 	}
 
@@ -276,23 +278,30 @@ void print_distances(const struct model *D, const seq_t *sequences, size_t n,
 			}
 
 			double dist = DD(i, j) = i == j ? 0.0 : estimate(&datum);
-			double coverage = model_coverage(&datum);
+
+			if (dist > 0 && dist < 0.001) {
+				use_scientific = 1;
+			}
 
 			if (isnan(dist) && warnings) {
-				EXIT_CODE = EXIT_FAILURE;
 				const char str[] = {
 					"For the two sequences '%s' and '%s' the distance "
 					"computation failed and is reported as nan. "
 					"Please refer to the documentation for further details."};
-				warnx(str, sequences[i].name, sequences[j].name);
-			} else if (dist > 0 && dist < 0.001) {
-				use_scientific = 1;
-			} else if (i < j && coverage < 0.05 && warnings) {
-				const char str[] = {
-					"For the two sequences '%s' and '%s' less than 5%% "
-					"homology were found (%f and %f, respectively)."};
-				warnx(str, sequences[i].name, sequences[j].name,
-					  model_coverage(&D(i, j)), model_coverage(&D(j, i)));
+				soft_errx(str, sequences[i].name, sequences[j].name);
+			}
+
+			if (!isnan(dist) && i < j && warnings) {
+				double coverage1 = model_coverage(&D(i, j));
+				double coverage2 = model_coverage(&D(j, i));
+
+				if (coverage1 < 0.05 || coverage2 < 0.05) {
+					const char str[] = {
+						"For the two sequences '%s' and '%s' less than 5%% "
+						"homology were found (%f and %f, respectively)."};
+					soft_errx(str, sequences[i].name, sequences[j].name,
+							  coverage1, coverage2);
+				}
 			}
 		}
 	}


=====================================
src/model.c
=====================================
--- a/src/model.c
+++ b/src/model.c
@@ -202,6 +202,26 @@ void model_count_equal(model *MM, const char *S, size_t len) {
 	MM->counts[TtoT] += local_counts[2];
 }
 
+/** @brief Convert a nucleotide to a 2bit representation.
+ *
+ * We want to map characters:
+ *  A → 0
+ *  C → 1
+ *  G → 2
+ *  T → 3
+ * The trick used below is that the three lower bits of the
+ * characters are unique. Thus, they can be used to compute the mapping
+ * above. The mapping itself is done via tricky bitwise operations.
+ *
+ * @param c - input nucleotide
+ * @returns 2bit representation.
+ */
+char nucl2bit(char c) {
+	c &= 6;
+	c ^= c >> 1;
+	return c >> 1;
+}
+
 /**
  * @brief Count the substitutions and add them to the mutation matrix.
  *
@@ -213,7 +233,7 @@ void model_count_equal(model *MM, const char *S, size_t len) {
 void model_count(model *MM, const char *S, const char *Q, size_t len) {
 	size_t local_counts[MUTCOUNTS] = {0};
 
-	for (; len--; S++, Q++) {
+	for (size_t i = 0; i < len; S++, Q++, i++) {
 		char s = *S;
 		char q = *Q;
 
@@ -222,24 +242,9 @@ void model_count(model *MM, const char *S, const char *Q, size_t len) {
 			continue;
 		}
 
-		/* We want to map characters:
-		 *  A → 0
-		 *  C → 1
-		 *  G → 2
-		 *  T → 3
-		 * The trick used below is that the three lower bits of the
-		 * characters are unique. Thus, they can be used to compute the mapping
-		 * above. The mapping itself is done via tricky bitwise operations.
-		 */
-
-		unsigned char nibble_s = s & 7;
-		unsigned char nibble_q = q & 7;
-
-		static const unsigned int mm1 = 0x20031000;
-
 		// Pick the correct two bits representing s and q.
-		unsigned char foo = (mm1 >> (4 * nibble_s)) & 0x3;
-		unsigned char baz = (mm1 >> (4 * nibble_q)) & 0x3;
+		unsigned char foo = nucl2bit(s);
+		unsigned char baz = nucl2bit(q);
 
 		/*
 		 * The mutation matrix is symmetric. For convenience we define the


=====================================
src/process.c
=====================================
--- a/src/process.c
+++ b/src/process.c
@@ -21,7 +21,7 @@
 #include <omp.h>
 #endif
 
-double shuprop(size_t x, double g, size_t l);
+double shustring_cum_prob(size_t x, double g, size_t l);
 int calculate_bootstrap(const struct model *M, const seq_t *sequences,
 						size_t n);
 
@@ -31,18 +31,17 @@ int calculate_bootstrap(const struct model *M, const seq_t *sequences,
  * Given some parameters calculate the minimum length for anchors according
  * to the distribution from Haubold et al. (2009).
  *
- * @param p - The probability with which an anchor is allowed to be random.
+ * @param p - The probability with which an anchor will be created under a
+ * random model.
  * @param g - The the relative amount of GC in the subject.
- * @param l - The length of the subject.
+ * @param l - The length of the subject (includes revcomp).
  * @returns The minimum length of an anchor.
  */
 size_t min_anchor_length(double p, double g, size_t l) {
 	size_t x = 1;
 
-	double prop = 0.0;
-	// Find smallest x with P(X > x) < p
-	for (; prop < 1 - p; x++) {
-		prop = shuprop(x, g / 2, l);
+	while (shustring_cum_prob(x, g / 2, l) < 1 - p) {
+		x++;
 	}
 
 	return x;
@@ -82,18 +81,20 @@ size_t binomial_coefficient(size_t n, size_t k) {
 
 /**
  * @brief Given `x` this function calculates the probability of a shustring
- * with a length less than `x`.
+ * with a length less or equal to `x` under a random model. This means, it is
+ * the cumulative probability.
  *
  * Let X be the longest shortest unique substring (shustring) at any position.
  * Then this function computes P{X <= x} with respect to the given parameter
- * set. See Haubold et al. (2009).
+ * set. See Haubold et al. (2009). Note that `x` includes the final mismatch.
+ * Thus, `x` is `match length + 1`.
  *
  * @param x - The maximum length of a shustring.
  * @param p - The half of the relative amount of GC in the DNA.
  * @param l - The length of the subject.
  * @returns The probability of a certain shustring length.
  */
-double shuprop(size_t x, double p, size_t l) {
+double shustring_cum_prob(size_t x, double p, size_t l) {
 	double xx = (double)x;
 	double ll = (double)l;
 	size_t k;
@@ -115,6 +116,106 @@ double shuprop(size_t x, double p, size_t l) {
 	return s;
 }
 
+typedef _Bool bool;
+#define false 0
+#define true !false
+
+/**
+ * @brief This structure captures properties of an anchor.
+ */
+struct anchor {
+	/** The position on the subject. */
+	size_t pos_S;
+	/** The position on the query. */
+	size_t pos_Q;
+	/** The length of the exact match. */
+	size_t length;
+};
+
+/**
+ * @brief This is a structure of assorted variables needed for anchor finding.
+ */
+struct context {
+	const esa_s *C;
+	const char *query;
+	size_t query_length;
+	size_t threshold;
+};
+
+/**
+ * @brief Compute the length of the longest common prefix of two strings.
+ *
+ * @param S - One string.
+ * @param Q - Another string.
+ * @param remaining - The length of one of the strings.
+ * @returns the length of the lcp.
+ */
+static inline size_t lcp(const char *S, const char *Q, size_t remaining) {
+	size_t length = 0;
+	while (length < remaining && S[length] == Q[length]) {
+		length++;
+	}
+	return length;
+}
+
+/**
+ * @brief Check whether the last anchor can be extended by a lucky anchor.
+ *
+ * Anchors are defined to be unique and of a minimum length. The uniqueness
+ * requires us to search throw the suffix array for a second appearance of the
+ * anchor. However, if a left anchor is already unique, we could be sloppy and
+ * drop the uniqueness criterion for the second anchor. This way we can skip the
+ * lookup and just compare characters directly. However, for a lucky anchor the
+ * match still has to be longer than the threshold.
+ *
+ * @param ctx - Matching context of various variables.
+ * @param last_match - The last anchor.
+ * @param this_match - Input/Output variable for the current match.
+ * @returns true iff the current match is a lucky anchor.
+ */
+static inline bool lucky_anchor(const struct context *ctx,
+								const struct anchor *last_match,
+								struct anchor *this_match) {
+
+	size_t advance = this_match->pos_Q - last_match->pos_Q;
+	size_t gap = this_match->pos_Q - last_match->pos_Q - last_match->length;
+
+	size_t try_pos_S = last_match->pos_S + advance;
+	if (try_pos_S >= (size_t)ctx->C->len || gap > ctx->threshold) {
+		return false;
+	}
+
+	this_match->pos_S = try_pos_S;
+	this_match->length =
+		lcp(ctx->query + this_match->pos_Q, ctx->C->S + try_pos_S,
+			ctx->query_length - this_match->pos_Q);
+
+	return this_match->length >= ctx->threshold;
+}
+
+/**
+ * @brief Check for a new anchor.
+ *
+ * Given the current context and starting position check if the new match is an
+ * anchor. The latter requires uniqueness and a certain minimum length.
+ *
+ * @param ctx - Matching context of various variables.
+ * @param last_match - (unused)
+ * @param this_match - Input/Output variable for the current match.
+ * @returns true iff an anchor was found.
+ */
+static inline bool anchor(const struct context *ctx,
+						  const struct anchor *last_match,
+						  struct anchor *this_match) {
+
+	lcp_inter_t inter = get_match_cached(ctx->C, ctx->query + this_match->pos_Q,
+										 ctx->query_length - this_match->pos_Q);
+
+	this_match->pos_S = ctx->C->SA[inter.i];
+	this_match->length = inter.l <= 0 ? 0 : inter.l;
+	return inter.i == inter.j && this_match->length >= ctx->threshold;
+}
+
 /**
  * @brief Divergence estimation using the anchor technique.
  *
@@ -135,83 +236,72 @@ model dist_anchor(const esa_s *C, const char *query, size_t query_length,
 				  double gc) {
 	struct model ret = {.seq_len = query_length, .counts = {0}};
 
-	lcp_inter_t inter;
-
-	size_t last_pos_Q = 0;
-	size_t last_pos_S = 0;
-	size_t last_length = 0;
-	// This variable indicates that the last anchor was the right anchor of a
-	// pair.
-	size_t last_was_right_anchor = 0;
+	struct anchor this_match = {0};
+	struct anchor last_match = {0};
+	bool last_was_right_anchor = false;
 
-	size_t this_pos_Q = 0;
-	size_t this_pos_S;
-	size_t this_length;
+	size_t threshold = min_anchor_length(ANCHOR_P_VALUE, gc, C->len);
 
-	size_t num_right_anchors = 0;
-
-	size_t threshold = min_anchor_length(RANDOM_ANCHOR_PROP, gc, C->len);
+	struct context ctx = {C, query, query_length, threshold};
 
 	// Iterate over the complete query.
-	while (this_pos_Q < query_length) {
-		inter =
-			get_match_cached(C, query + this_pos_Q, query_length - this_pos_Q);
-
-		this_length = inter.l <= 0 ? 0 : inter.l;
+	while (this_match.pos_Q < query_length) {
 
-		if (inter.i == inter.j && this_length >= threshold) {
+		// Check for lucky anchors and fall back to normal strategy.
+		if (lucky_anchor(&ctx, &last_match, &this_match) ||
+			anchor(&ctx, &last_match, &this_match)) {
 			// We have reached a new anchor.
-			this_pos_S = C->SA[inter.i];
 
+			size_t end_S = last_match.pos_S + last_match.length;
+			size_t end_Q = last_match.pos_Q + last_match.length;
 			// Check if this can be a right anchor to the last one.
-			if (this_pos_S > last_pos_S &&
-				this_pos_Q - last_pos_Q == this_pos_S - last_pos_S) {
-				num_right_anchors++;
+			if (this_match.pos_S > end_S &&
+				this_match.pos_Q - end_Q == this_match.pos_S - end_S) {
 
-				// classify nucleotides in the qanchor
-				model_count_equal(&ret, query + last_pos_Q, last_length);
+				// classify nucleotides in the left qanchor
+				model_count_equal(&ret, query + last_match.pos_Q,
+								  last_match.length);
 
 				// Count the SNPs in between.
-				model_count(&ret, C->S + last_pos_S + last_length,
-							query + last_pos_Q + last_length,
-							this_pos_Q - last_pos_Q - last_length);
-				last_was_right_anchor = 1;
+				model_count(&ret, C->S + end_S, query + end_Q,
+							this_match.pos_Q - end_Q);
+				last_was_right_anchor = true;
 			} else {
 				if (last_was_right_anchor) {
 					// If the last was a right anchor, but with the current one,
 					// we cannot extend, then add its length.
-					model_count_equal(&ret, query + last_pos_Q, last_length);
-				} else if (last_length >= threshold * 2) {
+					model_count_equal(&ret, query + last_match.pos_Q,
+									  last_match.length);
+				} else if (last_match.length >= threshold * 2) {
 					// The last anchor wasn't neither a left or right anchor.
 					// But, it was as long as an anchor pair. So still count it.
-					model_count_equal(&ret, query + last_pos_Q, last_length);
+					model_count_equal(&ret, query + last_match.pos_Q,
+									  last_match.length);
 				}
 
-				last_was_right_anchor = 0;
+				last_was_right_anchor = false;
 			}
 
 			// Cache values for later
-			last_pos_Q = this_pos_Q;
-			last_pos_S = this_pos_S;
-			last_length = this_length;
+			last_match = this_match;
 		}
 
 		// Advance
-		this_pos_Q += this_length + 1;
+		this_match.pos_Q += this_match.length + 1;
 	}
 
 	// Very special case: The sequences are identical
-	if (last_length >= query_length) {
-		model_count(&ret, C->S + last_pos_S, query, query_length);
+	if (last_match.length >= query_length) {
+		model_count_equal(&ret, query, query_length);
 		return ret;
 	}
 
 	// We might miss a few nucleotides if the last anchor was also a right
 	// anchor. The logic is the same as a few lines above.
 	if (last_was_right_anchor) {
-		model_count(&ret, C->S + last_pos_S, query + last_pos_Q, last_length);
-	} else if (last_length >= threshold * 2) {
-		model_count_equal(&ret, query + last_pos_Q, last_length);
+		model_count_equal(&ret, query + last_match.pos_Q, last_match.length);
+	} else if (last_match.length >= threshold * 2) {
+		model_count_equal(&ret, query + last_match.pos_Q, last_match.length);
 	}
 
 	return ret;
@@ -266,7 +356,7 @@ void calculate_distances(seq_t *sequences, size_t n) {
 	if (BOOTSTRAP) {
 		int res = calculate_bootstrap(M, sequences, n);
 		if (res) {
-			warnx("Bootstrapping failed.");
+			soft_errx("Bootstrapping failed.");
 		}
 	}
 


=====================================
test/Makefile.am
=====================================
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -1,5 +1,5 @@
-check_PROGRAMS = test_esa test_seq test_fasta
-dist_noinst_DATA = test_extra.sh test_random.sh test_join.sh
+check_PROGRAMS = test_esa test_seq test_fasta test_process
+dist_noinst_DATA = test_extra.sh test_random.sh test_join.sh nan.sh low_homo.sh
 
 if !BUILD_WITH_LIBDIVSUFSORT
 PSUFSORT=$(top_builddir)/opt/psufsort/libpsufsort.a
@@ -7,12 +7,19 @@ PSUFSORT=$(top_builddir)/opt/psufsort/libpsufsort.a
 DUMMY=dummy.cxx
 endif
 
-test_seq_SOURCES = test_seq.c ../src/sequence.c
+test_seq_SOURCES = test_seq.c $(top_srcdir)/src/sequence.c
 test_seq_CPPFLAGS = -I$(top_srcdir)/src -I$(top_srcdir)/opt -DDEBUG -std=gnu99
 test_seq_CFLAGS = -Wall -Wextra $(GLIB_CFLAGS) -Wno-missing-field-initializers
 test_seq_LDADD = $(GLIB_LIBS) $(top_builddir)/opt/libcompat.a
 
-test_esa_SOURCES = test_esa.c ../src/esa.c ../src/sequence.c $(top_srcdir)/src/esa.h
+test_process_SOURCES = test_process.c $(top_srcdir)/src/esa.c $(top_srcdir)/src/io.c $(top_srcdir)/src/model.c $(top_srcdir)/src/process.c $(top_srcdir)/src/sequence.c $(top_srcdir)/src/global.h
+test_process_CPPFLAGS = $(OPENMP_CFLAGS) -I$(top_srcdir)/src -I$(top_srcdir)/opt -I$(top_srcdir)/libs -DDEBUG -std=gnu99
+test_process_CFLAGS = $(OPENMP_CFLAGS) -Wall -Wextra $(GLIB_CFLAGS) -Wno-missing-field-initializers
+test_process_LDADD = $(GLIB_LIBS) $(PSUFSORT) $(top_builddir)/opt/libcompat.a $(top_builddir)/libs/libpfasta.a
+test_process_CXXFLAGS = $(OPENMP_CXXFLAGS) -Wall -Wextra
+nodist_EXTRA_test_process_SOURCES = $(DUMMY)
+
+test_esa_SOURCES = test_esa.c $(top_srcdir)/src/esa.c $(top_srcdir)/src/sequence.c $(top_srcdir)/src/esa.h
 test_esa_CPPFLAGS = $(OPENMP_CFLAGS) -I$(top_srcdir)/libs -I$(top_srcdir)/opt -I$(top_srcdir)/src -DDEBUG -std=gnu99
 test_esa_CFLAGS = $(OPENMP_CFLAGS) -Wall -Wextra $(GLIB_CFLAGS) -Wno-missing-field-initializers
 test_esa_LDADD = $(GLIB_LIBS) $(PSUFSORT) $(top_builddir)/opt/libcompat.a


=====================================
test/low_homo.sh
=====================================
--- /dev/null
+++ b/test/low_homo.sh
@@ -0,0 +1,20 @@
+#!/bin/bash -f
+
+./test/test_fasta -l 100000 > a.fa
+./test/test_fasta -l 100000 > b.fa
+./test/test_fasta -l 100 > both.fa
+
+cat both.fa a.fa | awk -vRS='>' '{if($1 == "S0")print ">"$0 > "S0.fa"}'
+cat both.fa b.fa | awk -vRS='>' '{if($1 == "S1")print ">"$0 > "S1.fa"}'
+
+# this is expected to trigger the low homology warning
+./src/andi -j S0.fa S1.fa 2>&1 | grep 'homology'
+EXIT_VAL=$?
+
+if [[ EXIT_VAL -ge 1 ]]; then
+	echo "Triggering low homology failed" >&2
+	grep '^>' a.fa b.fa both.fa
+fi
+
+rm -f a.fa b.fa both.fa S0.fa S1.fa
+exit $EXIT_VAL


=====================================
test/nan.sh
=====================================
--- /dev/null
+++ b/test/nan.sh
@@ -0,0 +1,17 @@
+#!/bin/bash -f
+
+./test/test_fasta -l 10000 > a.fa
+./test/test_fasta -l 10000 > b.fa
+
+# this is expected to trigger the nan warning
+./src/andi -j a.fa b.fa 2>&1 | grep 'nan'
+EXIT_VAL=$?
+
+
+if [[ EXIT_VAL -ge 1 ]]; then
+	echo "Triggering nan failed" >&2
+	grep '^>' a.fa b.fa both.fa
+fi
+
+rm -f a.fa b.fa
+exit $EXIT_VAL


=====================================
test/test_join.sh
=====================================
--- a/test/test_join.sh
+++ b/test/test_join.sh
@@ -16,11 +16,11 @@ rm p1.fasta p2.fasta p3.fasta;
 RES=$(./src/andi -m RAW -t 1 -j S0.fasta S1.fasta |
 	tail -n 1 |
 	awk '{print ($2 - 0.1)}' |
-	awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.01}'
+	awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.03}'
 	)
 
 if test $RES -ne 1; then
-	echo "The last test computed a distance deviating more than one percent from its intended value."
+	echo "The last test computed a distance deviating more than three percent from its intended value."
 	echo "See S0.fasta and S1.fasta for the used sequences."
 	exit 1;
 fi
@@ -38,11 +38,11 @@ rm p2.fasta p3.fasta;
 RES=$(./src/andi -m RAW -t1 -j S0.fasta S1.fasta |
         tail -n 1 |
         awk '{print ($2 - 0.1)}' |
-        awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.01}'
+        awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.03}'
         )
 
 if test $RES -ne 1; then
-        echo "The last test computed a distance deviating more than one percent from its intended value."
+        echo "The last test computed a distance deviating more than three percent from its intended value."
         echo "See S0.fasta and S1.fasta for the used sequences."
         exit 1;
 fi
@@ -62,11 +62,11 @@ rm p1.fasta p2.fasta p3.fasta;
 RES=$(./src/andi -mRAW -t 1 -j S0.fasta S1.fasta |
         tail -n 1 |
         awk '{print ($2 - 0.1)}' |
-        awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.01}'
+        awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) < 0.03}'
         )
 
 if test $RES -ne 1; then
-        echo "The last test computed a distance deviating more than one percent from its intended value."
+        echo "The last test computed a distance deviating more than three percent from its intended value."
         echo "See S0.fasta and S1.fasta for the used sequences."
         exit 1;
 fi


=====================================
test/test_process.c
=====================================
--- /dev/null
+++ b/test/test_process.c
@@ -0,0 +1,33 @@
+#include "global.h"
+#include "process.h"
+#include <glib.h>
+#include <math.h>
+
+int FLAGS = 0;
+int THREADS = 1;
+long unsigned int BOOTSTRAP = 0;
+double ANCHOR_P_VALUE = 0.025;
+gsl_rng *RNG = NULL;
+int MODEL = M_JC;
+
+double shustring_cum_prob(size_t x, double g, size_t l);
+size_t min_anchor_length(double p, double g, size_t l);
+
+void test_shustring_cum_prob() {
+	int len = 100000;
+	double gc = 0.5;
+	double p_value = 0.025;
+
+	size_t threshold = min_anchor_length(p_value, gc, len);
+
+	g_assert_cmpfloat(1 - p_value, <, shustring_cum_prob(threshold + 1, gc / 2, len));
+	g_assert_cmpfloat(1 - p_value, <=, shustring_cum_prob(threshold, gc / 2, len));
+	g_assert_cmpfloat(1 - p_value, >, shustring_cum_prob(threshold - 1, gc / 2, len));
+}
+
+int main(int argc, char *argv[]) {
+	g_test_init(&argc, &argv, NULL);
+	g_test_add_func("/process/shustring_cum_prob", test_shustring_cum_prob);
+
+	return g_test_run();
+}


=====================================
test/test_random.sh
=====================================
--- a/test/test_random.sh
+++ b/test/test_random.sh
@@ -35,9 +35,9 @@ do
 			./src/andi -t 1 |
 			tail -n 1 |
 			awk -v dist=$dist '{print $2, dist}' |
-			awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) <= 0.02 && abs($1-$2) <= 0.02 * $2}')
+			awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) <= 0.055 && abs($1-$2) <= 0.055 * $2}')
 		if test $res -ne 1; then
-			echo "The last test computed a distance deviating more than two percent from its intended value."
+			echo "The last test computed a distance deviating more than five percent from its intended value."
 			echo "See test_random.fasta for the used sequences."
 			echo "./test/test_fasta -s $SEED -l $LENGTH -d $dist"
 			head -n 1 ./test/test_random.fasta
@@ -57,9 +57,9 @@ do
 			./src/andi -m RAW -t 1 |
 			tail -n 1 |
 			awk -v dist=$dist '{print $2, dist}' |
-			awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) <= 0.02 && abs($1-$2) <= 0.02 * $2}')
+			awk 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($1-$2) <= 0.055 && abs($1-$2) <= 0.055 * $2}')
 		if test $res -ne 1; then
-			echo "The last test computed a distance deviating more than two percent from its intended value."
+			echo "The last test computed a distance deviating more than five percent from its intended value."
 			echo "See test_random.fasta for the used sequences."
 			echo "./test/test_fasta -r -s $SEED -l $LENGTH -d $dist"
 			head -n 1 ./test/test_random.fasta



View it on GitLab: https://salsa.debian.org/med-team/andi/compare/f8aef5c4864e52df519e7e356e88cd9b636584e3...1012cec549f8d12d7510385019949b78c3829938

---
View it on GitLab: https://salsa.debian.org/med-team/andi/compare/f8aef5c4864e52df519e7e356e88cd9b636584e3...1012cec549f8d12d7510385019949b78c3829938
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.alioth.debian.org/pipermail/debian-med-commit/attachments/20180226/bc702ab7/attachment-0001.html>