[med-svn] [fasttree] 01/03: Imported Upstream version 2.1.8

Roland Fehrenbacher rfehren-guest at moszumanska.debian.org
Tue Mar 31 11:01:53 UTC 2015


This is an automated email from the git hooks/post-receive script.

rfehren-guest pushed a commit to branch master
in repository fasttree.

commit 34340d0376e84ee19e8171ac464dcaa64bc72f37
Author: Roland Fehrenbacher <rf at q-leap.de>
Date:   Tue Mar 31 10:50:09 2015 +0000

    Imported Upstream version 2.1.8
---
 changelog  |  9 ++++++
 fasttree.c | 97 +++++++++++++++++++++++++++++++++++++++++++-------------------
 2 files changed, 76 insertions(+), 30 deletions(-)

diff --git a/changelog b/changelog
index e0bfeea..d6f63f2 100644
--- a/changelog
+++ b/changelog
@@ -1,3 +1,12 @@
+Version 2.1.8: March 24, 2015
+
+	To provide useful branch lengths for very wide alignments of very
+	close closely-related sequences, the minimum branch lengths were
+	dramatically decreased when compiling with USE_DOUBLE. If using ML
+	on an alignment of closely-related and long sequences, issues a
+	warning, and recommends recompiling with USE_DOUBLE (if not
+	already using USE_DOUBLE).
+
 Version 2.1.7: January 22, 2013
 
 	To avoid numerical problems that led to crashes in rare cases,
diff --git a/fasttree.c b/fasttree.c
index 12862e2..e8adc7a 100644
--- a/fasttree.c
+++ b/fasttree.c
@@ -2,15 +2,17 @@
  * FastTree -- inferring approximately-maximum-likelihood trees for large
  * multiple sequence alignments.
  *
- * Morgan N. Price, 2008-2011
+ * Morgan N. Price
  * http://www.microbesonline.org/fasttree/
  *
  * Thanks to Jim Hester of the Cleveland Clinic Foundation for
  * providing the first parallel (OpenMP) code, Siavash Mirarab of
- * UT Austin for implementing the WAG option, and Samuel Shepard
- * at the CDC for suggesting and helping with the -quote option.
+ * UT Austin for implementing the WAG option, Samuel Shepard
+ * at the CDC for suggesting and helping with the -quote option, and
+ * Aaron Darling (University of Technology, Sydney) for numerical changes
+ * for wide alignments of closely-related sequences.
  *
- *  Copyright (C) 2008-2011 The Regents of the University of California
+ *  Copyright (C) 2008-2015 The Regents of the University of California
  *  All rights reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
@@ -305,7 +307,8 @@
 
 /* By default, tries to compile with SSE instructions for greater speed.
    But if compiled with -DUSE_DOUBLE, uses double precision instead of single-precision
-   floating point (2x memroy required), and does not use SSE.
+   floating point (2x memory required), does not use SSE, and allows much shorter
+   branch lengths.
 */
 #ifdef __SSE__
 #if !defined(NO_SSE) && !defined(USE_DOUBLE)
@@ -340,7 +343,7 @@ typedef float numeric_t;
 
 #endif /* USE_SSE3 */
 
-#define FT_VERSION "2.1.7"
+#define FT_VERSION "2.1.8"
 
 char *usage =
   "  FastTree protein_alignment > tree\n"
@@ -848,14 +851,24 @@ const double LogLkUnderflow = 9.21034037197618; /* -log(LkUnderflowInv) */
 const double Log2 = 0.693147180559945;
 /* These are used to limit the optimization of branch lengths.
    Also very short branch lengths can create numerical problems.
-   In version 2.1.7., the minimum branch lengths (MLMinBranchLength and MLMinRelBranchLength)
+   In version 2.1.7, the minimum branch lengths (MLMinBranchLength and MLMinRelBranchLength)
    were increased to prevent numerical problems in rare cases.
-   If compiled with -DUSE_DOUBLE then these minimums could be decreased.
+   In version 2.1.8, to provide useful branch lengths for genome-wide alignments,
+   the minimum branch lengths were dramatically decreased if USE_DOUBLE is defined.
 */
+#ifndef USE_DOUBLE
 const double MLMinBranchLengthTolerance = 1.0e-4; /* absolute tolerance for optimizing branch lengths */
 const double MLFTolBranchLength = 0.001; /* fractional tolerance for optimizing branch lengths */
-const double MLMinBranchLength = 5.0e-4;
+const double MLMinBranchLength = 5.0e-4; /* minimum value for branch length */
 const double MLMinRelBranchLength = 2.5e-4; /* minimum of rate * length */
+const double fPostTotalTolerance = 1.0e-10; /* posterior vector must sum to at least this before rescaling */
+#else
+const double MLMinBranchLengthTolerance = 1.0e-9;
+const double MLFTolBranchLength = 0.001;
+const double MLMinBranchLength = 5.0e-9;
+const double MLMinRelBranchLength = 2.5e-9;
+const double fPostTotalTolerance = 1.0e-20;
+#endif
 
 int mlAccuracy = 1;		/* Rounds of optimization of branch lengths; 1 means do 2nd round only if close */
 double closeLogLkLimit = 5.0;	/* If partial optimization of an NNI looks like it would decrease the log likelihood
@@ -2211,21 +2224,20 @@ int main(int argc, char **argv) {
       UpdateBranchLengths(/*IN/OUT*/NJ);
       LogTree("ME_Lengths",0, fpLog, NJ, aln->names, unique, bQuote);
 
-      if(verbose>0 || fpLog) {
-	double total_len = 0;
-	int iNode;
-	for (iNode = 0; iNode < NJ->maxnode; iNode++)
-	  total_len += fabs(NJ->branchlength[iNode]);
-	if (verbose>0) {
-	  fprintf(stderr, "Total branch-length %.3f after %.2f sec\n",
-		  total_len, clockDiff(&clock_start));
-	  fflush(stderr);
-	}
-	if (fpLog) {
-	  fprintf(fpLog, "Total branch-length %.3f after %.2f sec\n",
-		  total_len, clockDiff(&clock_start));
-	  fflush(stderr);
-	}
+      double total_len = 0;
+      int iNode;
+      for (iNode = 0; iNode < NJ->maxnode; iNode++)
+	total_len += fabs(NJ->branchlength[iNode]);
+
+      if (verbose>0) {
+	fprintf(stderr, "Total branch-length %.3f after %.2f sec\n",
+		total_len, clockDiff(&clock_start));
+	fflush(stderr);
+      }
+      if (fpLog) {
+	fprintf(fpLog, "Total branch-length %.3f after %.2f sec\n",
+		total_len, clockDiff(&clock_start));
+	fflush(stderr);
       }
 
 #ifdef TRACK_MEMORY
@@ -2238,7 +2250,27 @@ int main(int argc, char **argv) {
 #endif
 
       SplitCount_t splitcount = {0,0,0,0,0.0,0.0};
+
       if (MLnniToDo > 0 || MLlen) {
+	bool warn_len = total_len/NJ->maxnode < 0.001 && MLMinBranchLengthTolerance > 1.0/aln->nPos;
+	bool warn = warn_len || (total_len/NJ->maxnode < 0.001 && aln->nPos >= 10000);
+	if (warn)
+	  fprintf(stderr, "\nWARNING! This alignment consists of closely-related and very-long sequences.\n");
+	if (warn_len)
+	  fprintf(stderr,
+		  "This version of FastTree may not report reasonable branch lengths!\n"
+#ifdef USE_DOUBLE
+		  "Consider changing MLMinBranchLengthTolerance.\n"
+#else
+		  "Consider recompiling FastTree with -DUSE_DOUBLE.\n"
+#endif
+		  "For more information, visit\n"
+		  "http://www.microbesonline.org/fasttree/#BranchLen\n\n");
+	if (warn)
+	  fprintf(stderr, "WARNING! FastTree (or other standard maximum-likelihood tools)\n"
+		  "may not be appropriate for aligments of very closely-related sequences\n"
+		  "like this one, as FastTree does not account for recombination or gene conversion\n\n");
+
 	/* Do maximum-likelihood computations */
 	/* Convert profiles to use the transition matrix */
 	distance_matrix_t *tmatAsDist = TransMatToDistanceMat(/*OPTIONAL*/NJ->transmat);
@@ -3561,14 +3593,19 @@ void PrintNJ(FILE *fp, NJ_t *NJ, char **names, uniquify_t *unique, bool bShowSup
 	fprintf(fp,")");
       }
       /* Print the branch length */
-      fprintf(fp, ":%.5f", NJ->branchlength[node]);
+#ifdef USE_DOUBLE
+#define FP_FORMAT "%.9f"
+#else
+#define FP_FORMAT "%.5f"
+#endif
+      fprintf(fp, ":" FP_FORMAT, NJ->branchlength[node]);
     } else if (end) {
       if (node == NJ->root)
 	fprintf(fp, ")");
       else if (bShowSupport)
-	fprintf(fp, ")%.3f:%.5f", NJ->support[node], NJ->branchlength[node]);
+	fprintf(fp, ")%.3f:" FP_FORMAT, NJ->support[node], NJ->branchlength[node]);
       else
-	fprintf(fp, "):%.5f", NJ->branchlength[node]);
+	fprintf(fp, "):" FP_FORMAT, NJ->branchlength[node]);
     } else {
       if (node != NJ->root && NJ->child[NJ->parent[node]].child[0] != node) fprintf(fp, ",");
       fprintf(fp, "(");
@@ -4370,7 +4407,7 @@ void NormalizeFreq(/*IN/OUT*/numeric_t *freq, distance_matrix_t *dmat) {
     for (k = 0; k < nCodes; k++)
       total_freq += freq[k];
   }
-  if (total_freq > 1e-10) {
+  if (total_freq > fPostTotalTolerance) {
     numeric_t inverse_weight = 1.0/total_freq;
     vector_multiply_by(/*IN/OUT*/freq, inverse_weight, nCodes);
   } else {
@@ -4962,7 +4999,7 @@ profile_t *PosteriorProfile(profile_t *p1, profile_t *p2,
       double fPostTot = 0;
       for (j = 0; j < 4; j++)
 	fPostTot += fPost[j];
-      assert(fPostTot > 1e-10);
+      assert(fPostTot > fPostTotalTolerance);
       double fPostInv = 1.0/fPostTot;
 #if 0 /* SSE3 is slower */
       vector_multiply_by(fPost, fPostInv, 4);
@@ -5025,7 +5062,7 @@ profile_t *PosteriorProfile(profile_t *p1, profile_t *p2,
 	fPost[j] = value >= 0 ? value : 0;
       }
       double fPostTot = vector_sum(fPost, 20);
-      assert(fPostTot > 1e-10);
+      assert(fPostTot > fPostTotalTolerance);
       double fPostInv = 1.0/fPostTot;
       vector_multiply_by(/*IN/OUT*/fPost, fPostInv, 20);
       int ch = -1;		/* the dominant character, if any */

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/fasttree.git



More information about the debian-med-commit mailing list