[med-svn] [dazzdb] 01/06: New upstream version 1.0+20171014

Fri Oct 20 04:02:32 UTC 2017

This is an automated email from the git hooks/post-receive script.

afif pushed a commit to branch master
in repository dazzdb.

commit 37db065aaaa30550bcfd55019929f682e22a4280
Author: Afif Elghraoui <afif at debian.org>
Date:   Thu Oct 19 23:08:56 2017 -0400

    New upstream version 1.0+20171014
---
 Catrack.c   |  10 ++--
 DB.c        |   3 +-
 DBdump.c    |  19 +++++++
 DBshow.c    |   2 +
 DBtrim.c    | 185 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 Makefile    |   5 +-
 README.md   |  29 ++++++----
 arrow2DB.c  |   1 +
 simulator.c |  20 ++++++-
 9 files changed, 257 insertions(+), 17 deletions(-)

diff --git a/Catrack.c b/Catrack.c
index 14d9b64..8ea6c38 100644
--- a/Catrack.c
+++ b/Catrack.c
@@ -21,12 +21,13 @@
 #define PATHSEP "/"
 #endif
 
-static char *Usage = "[-v] <path:db|dam> <track:name>";
+static char *Usage = "[-vf] <path:db|dam> <track:name>";
 
 int main(int argc, char *argv[])
 { char *prefix;
   FILE *aout, *dout;
   int   VERBOSE;
+  int   FORCE;
 
   //  Process arguments
 
@@ -38,12 +39,13 @@ int main(int argc, char *argv[])
     j = 1;
     for (i = 1; i < argc; i++)
       if (argv[i][0] == '-')
-        { ARG_FLAGS("v") }
+        { ARG_FLAGS("vf") }
       else
         argv[j++] = argv[i];
     argc = j;
 
     VERBOSE = flags['v'];
+    FORCE   = flags['f'];
 
     if (argc != 3)
       { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
@@ -65,14 +67,14 @@ int main(int argc, char *argv[])
     free(root);
 
     aout = fopen(Catenate(prefix,argv[2],".","anno"),"r");
-    if (aout != NULL)
+    if (aout != NULL && !FORCE)
       { fprintf(stderr,"%s: Track file %s%s.anno already exists!\n",Prog_Name,prefix,argv[2]);
         fclose(aout);
         exit (1);
       }
 
     dout = fopen(Catenate(prefix,argv[2],".","data"),"r");
-    if (dout != NULL)
+    if (dout != NULL && !FORCE)
       { fprintf(stderr,"%s: Track file %s%s.data already exists!\n",Prog_Name,prefix,argv[2]);
         fclose(dout);
         exit (1);
diff --git a/DB.c b/DB.c
index 3afff14..69060d0 100644
--- a/DB.c
+++ b/DB.c
@@ -1486,7 +1486,8 @@ int Load_Arrow(HITS_DB *db, int i, char *read, int ascii)
       EXIT(1);
     }
   if (Arrow_DB != db)
-    { fclose(Arrow_File);
+    { if (Arrow_File != NULL)
+        fclose(Arrow_File);
       arrow = Fopen(Catenate(db->path,"","",".arw"),"r");
       if (arrow == NULL)
         EXIT(1);
diff --git a/DBdump.c b/DBdump.c
index 6227251..faef253 100644
--- a/DBdump.c
+++ b/DBdump.c
@@ -165,6 +165,25 @@ int main(int argc, char *argv[])
     if (argc <= 1)
       { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]);
         fprintf(stderr,"       %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]);
+        fprintf(stderr,"\n");
+        fprintf(stderr,"      -r: R #              - read number\n");
+        fprintf(stderr,"      -h: H # string       - original file name string (header)\n");
+        fprintf(stderr,"          L # # #          - location: well, pulse start, pulse end\n");
+        fprintf(stderr,"          Q #              - quality of read (#/1000)\n");
+        fprintf(stderr,"      -s: S # string       - sequence string\n");
+        fprintf(stderr,"      -a: N # # # #        - SNR of ACGT channels (#/100)\n");
+        fprintf(stderr,"          A # string       - arrow pulse-width string\n");
+        fprintf(stderr,"      -i: I # string       - intrinsic quality vector (as an ASCII string)\n");
+        fprintf(stderr,"      -q: d # string       - Quiva deletion values (as an ASCII string)\n");
+        fprintf(stderr,"          c # string       - Quiva deletion character string\n");
+        fprintf(stderr,"          i # string       - Quiva insertion value string\n");
+        fprintf(stderr,"          m # string       - Quiva merge value string\n");
+        fprintf(stderr,"          s # string       - Quiva substitution value string\n");
+        fprintf(stderr,"      -p: P # string       - repeat profile vector (as an ASCII string)\n");
+        fprintf(stderr,"      -m: Tx #n (#b #e)^#n - x'th track on command line, #n intervals all on same line\n");
+        fprintf(stderr,"\n");
+        fprintf(stderr,"      -u: Dump entire untrimmed database.\n");
+        fprintf(stderr,"      -U: Output base pairs in upper case letters\n");
         exit (1);
       }
 
diff --git a/DBshow.c b/DBshow.c
index c376ed4..5114665 100644
--- a/DBshow.c
+++ b/DBshow.c
@@ -435,6 +435,8 @@ int main(int argc, char *argv[])
       }
     if (DOARR)
       arrow = New_Read_Buffer(db);
+    else
+      arrow = NULL;
 
     if (UPPER == 1)
       { hilight = 'A'-'a';
diff --git a/DBtrim.c b/DBtrim.c
new file mode 100644
index 0000000..778123d
--- /dev/null
+++ b/DBtrim.c
@@ -0,0 +1,185 @@
+/*******************************************************************************************
+ *
+ *  Reset the trimming parameters for a .db:
+ *     Rewrite the .db or .dam file with the new thresholds and the new read counts for
+ *     each trimmed block.
+ *
+ *  Author:  Gene Myers
+ *  Date  :  September 2017
+ *
+ ********************************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "DB.h"
+
+#ifdef HIDE_FILES
+#define PATHSEP "/."
+#else
+#define PATHSEP "/"
+#endif
+
+static char *Usage = "[-af] [-x<int>] <path:db|dam>";
+
+int main(int argc, char *argv[])
+{ HITS_DB    db, dbs;
+  int64      dbpos;
+  FILE      *dbfile, *ixfile;
+  int        nblocks;
+  int        status;
+
+  int        FORCE;
+  int        ALL;
+  int        CUTOFF;
+
+  { int   i, j, k;
+    int   flags[128];
+    char *eptr;
+    float size;
+
+    ARG_INIT("DBtrim")
+
+    CUTOFF = 0;
+    size   = 200;
+
+    j = 1;
+    for (i = 1; i < argc; i++)
+      if (argv[i][0] == '-')
+        switch (argv[i][1])
+        { default:
+            ARG_FLAGS("af")
+            break;
+          case 'x':
+            ARG_NON_NEGATIVE(CUTOFF,"Min read length cutoff")
+            break;
+        }
+      else
+        argv[j++] = argv[i];
+    argc = j;
+
+    ALL   = flags['a'];
+    FORCE = flags['f'];
+
+    if (argc != 2)
+      { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
+        exit (1);
+      }
+  }
+
+  //  Open db
+
+  status = Open_DB(argv[1],&db);
+  if (status < 0)
+    exit (1);
+  if (db.part > 0)
+    { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]);
+      exit (1);
+    }
+
+  { char    *pwd, *root;
+    char     buffer[2*MAX_NAME+100];
+    int      nfiles;
+    int      all, cutoff;
+    int64    size;
+    int      i;
+
+    pwd  = PathTo(argv[1]);
+    if (status)
+      { root   = Root(argv[1],".dam");
+        dbfile = Fopen(Catenate(pwd,"/",root,".dam"),"r+");
+      }
+    else
+      { root   = Root(argv[1],".db");
+        dbfile = Fopen(Catenate(pwd,"/",root,".db"),"r+");
+      }
+    ixfile = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+");
+    if (dbfile == NULL || ixfile == NULL)
+      exit (1);
+    free(pwd);
+    free(root);
+
+    if (fscanf(dbfile,DB_NFILE,&nfiles) != 1)
+      SYSTEM_ERROR
+    for (i = 0; i < nfiles; i++)
+      if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL)
+        SYSTEM_ERROR
+
+    if (fread(&dbs,sizeof(HITS_DB),1,ixfile) != 1)
+      SYSTEM_ERROR
+
+    if (dbs.cutoff >= 0)
+      { if (!FORCE)
+          { printf("You are about to reset the thresholds for the trimmed DB.\n");
+            printf("This will invalidate any .las files produced by daligner\n");
+            printf("Are you sure you want to proceed? [Y/N] ");
+            fflush(stdout);
+            if (fgets(buffer,100,stdin) == NULL)
+              SYSTEM_ERROR
+            if (index(buffer,'n') != NULL || index(buffer,'N') != NULL)
+              { printf("Aborted\n");
+                fflush(stdout);
+                fclose(dbfile);
+                exit (1);
+              }
+          }
+      }
+    else
+      { fprintf(stderr,"%s: DB has not yet been split, use DBsplit\n",Prog_Name);
+        exit (1);
+      }
+
+    fscanf(dbfile,DB_NBLOCK,&nblocks);
+
+    dbpos = ftello(dbfile);
+    fscanf(dbfile,DB_PARAMS,&size,&cutoff,&all);
+    fseeko(dbfile,dbpos,SEEK_SET);
+    fprintf(dbfile,DB_PARAMS,size,CUTOFF,ALL);
+  }
+
+  { HITS_READ *reads  = db.reads;
+    int        uread, tread;
+    int        rlen;
+    int        b, u, t;
+
+    u = 0;
+    t = 0;
+    fprintf(dbfile,DB_BDATA,0,0);
+    for (b = 0; b < nblocks; b++)
+      { dbpos = ftello(dbfile);
+        fscanf(dbfile,DB_BDATA,&uread,&tread);
+
+        if (ALL)
+          while (u < uread)
+            { rlen = reads[u++].rlen;
+              if (rlen >= CUTOFF)
+                t += 1;
+            }
+        else
+          while (u < uread)
+            { rlen = reads[u].rlen;
+              if (rlen >= CUTOFF && (reads[u].flags & DB_BEST) != 0)
+                t += 1;
+              u += 1;
+            }
+
+        fseeko(dbfile,dbpos,SEEK_SET);
+        fprintf(dbfile,DB_BDATA,uread,t);
+      }
+
+    dbs.cutoff = CUTOFF;
+    if (ALL)
+      dbs.allarr |= DB_ALL;
+    dbs.treads = t;
+    rewind(ixfile);
+    fwrite(&dbs,sizeof(HITS_DB),1,ixfile);
+  }
+
+  fclose(ixfile);
+  fclose(dbfile);
+  Close_DB(&db);
+
+  exit (0);
+}
diff --git a/Makefile b/Makefile
index 873296f..8ed8f68 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@ DEST_DIR = ~/bin
 CFLAGS = -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing
 
 ALL = fasta2DB DB2fasta quiva2DB DB2quiva DBsplit DBdust Catrack DBshow DBstats DBrm simulator \
-      fasta2DAM DAM2fasta DBdump rangen arrow2DB DB2arrow DBwipe
+      fasta2DAM DAM2fasta DBdump rangen arrow2DB DB2arrow DBwipe DBtrim
 
 all: $(ALL)
 
@@ -28,6 +28,9 @@ arrow2DB: arrow2DB.c DB.c QV.c DB.h QV.h
 DBsplit: DBsplit.c DB.c DB.h QV.c QV.h
 	gcc $(CFLAGS) -o DBsplit DBsplit.c DB.c QV.c -lm
 
+DBtrim: DBtrim.c DB.c DB.h QV.c QV.h
+	gcc $(CFLAGS) -o DBtrim DBtrim.c DB.c QV.c -lm
+
 DBdust: DBdust.c DB.c DB.h QV.c QV.h
 	gcc $(CFLAGS) -o DBdust DBdust.c DB.c QV.c -lm
 
diff --git a/README.md b/README.md
index ba37408..0967a17 100644
--- a/README.md
+++ b/README.md
@@ -279,7 +279,14 @@ question has previously bin split, i.e. one is not interactively asked if they w
 to proceed.
 
 ```
-10. DBdust [-b] [-w<int(64)>] [-t<double(2.)>] [-m<int(10)>] <path:db|dam>
+10. DBtrim [-af] [-x<int>] <path:db|dam>
+```
+
+Exactly like DBsplit except that it only resets the trimming parameters (and not the split
+partition itself).
+
+```
+11. DBdust [-b] [-w<int(64)>] [-t<double(2.)>] [-m<int(10)>] <path:db|dam>
 ```
 
 Runs the symmetric DUST algorithm over the reads in the untrimmed DB \<path\>.db or
@@ -300,16 +307,18 @@ This permits job parallelism in block-sized chunks, and the resulting sequence o
 block tracks can then be merged into a track for the entire untrimmed DB with Catrack.
 
 ```
-11. Catrack [-v] <path:db|dam> <track:name>
+12. Catrack [-vf] <path:db|dam> <track:name>
 ```
 
 Find all block tracks of the form .\<path\>.#.\<track\>... and merge them into a single
 track, .\<path\>.\<track\>..., for the given DB or DAM.   The block track files must all
 encode the same kind of track data (this is checked), and the files must exist for
-block 1, 2, 3, ... up to the last block number.
+block 1, 2, 3, ... up to the last block number.  If the -f option is set, then the
+concatenation takes place regardless of whether or not the single, combined track
+exists or not.
 
 ```
-12. DBshow [-unqaUQA] [-w<int(80)>] [-m<track>]+
+13. DBshow [-unqaUQA] [-w<int(80)>] [-m<track>]+
                       <path:db|dam> [ <reads:FILE> | <reads:range> ... ]
 ```
 
@@ -348,7 +357,7 @@ fasta2DB, quiva2D, and arrow2DB, giving one a simple way to make a DB of a subse
 the reads for testing purposes.
 
 ```
-13. DBdump [-rhsaqip] [-uU] [-m<track>]+
+14. DBdump [-rhsaqip] [-uU] [-m<track>]+
                       <path:db|dam> [ <reads:FILE> | <reads:range> ... ]
 ```
 
@@ -422,7 +431,7 @@ Arrow pulse width strings are identical to that for the sequence as they are all
 the same length for any given entry.
 
 ```
-14. DBstats [-nu] [-b<int(1000)] [-m<track>]+ <path:db|dam>
+15. DBstats [-nu] [-b<int(1000)] [-m<track>]+ <path:db|dam>
 ```
 
 Show overview statistics for all the reads in the trimmed data base \<path\>.db or
@@ -434,7 +443,7 @@ intervals along the read can be specified with the -m option in which case a sum
 and a histogram of the interval lengths is displayed.
 
 ```
-15. DBrm <path:db|dam> ...
+16. DBrm <path:db|dam> ...
 ```
 
 Delete all the files for the given data bases.  Do not use rm to remove a database, as
@@ -442,7 +451,7 @@ there are at least two and often several secondary files for each DB including t
 files, and all of these are removed by DBrm.
 
 ```
-16. DBwipe <path:db|dam> ...
+17. DBwipe <path:db|dam> ...
 ```
 
 Delete any Arrow or Quiver data from the given databases.  This removes the .arw or
@@ -450,7 +459,7 @@ Delete any Arrow or Quiver data from the given databases.  This removes the .arw
 or Quiver.  Basically, converts an A-DB or Q-DB back to a simple S-DB.
 
 ```
-17.  simulator <genome:dam> [-CU] [-m<int(10000)>] [-s<int(2000)>] [-e<double(.15)]
+18.  simulator <genome:dam> [-CU] [-m<int(10000)>] [-s<int(2000)>] [-e<double(.15)]
                                   [-c<double(50.)>] [-f<double(.5)>] [-x<int(4000)>]
                                   [-w<int(80)>] [-r<int>] [-M<file>]
 ```
@@ -485,7 +494,7 @@ a read is say 's b e' then if b \< e the read is a perturbed copy of s[b,e] in t
 forward direction, and a perturbed copy s[e,b] in the reverse direction otherwise.
 
 ```
-18. rangen <genlen:double> [-U] [-b<double(.5)>] [-w<int(80)>] [-r<int>]
+19. rangen <genlen:double> [-U] [-b<double(.5)>] [-w<int(80)>] [-r<int>]
 ```
 
 Generate a random DNA sequence of length genlen*1Mbp that has an AT-bias of -b.
diff --git a/arrow2DB.c b/arrow2DB.c
index 19975aa..d7890e9 100644
--- a/arrow2DB.c
+++ b/arrow2DB.c
@@ -222,6 +222,7 @@ int main(int argc, char *argv[])
           goto error;
       }
 
+    eof = 0;
     for (cell = 0; cell < nfiles; cell++)
       { char  prolog[MAX_NAME], fname[MAX_NAME];
 
diff --git a/simulator.c b/simulator.c
index cf1d694..57cd956 100644
--- a/simulator.c
+++ b/simulator.c
@@ -41,6 +41,8 @@
 #include <unistd.h>
 #include <math.h>
 
+#define PACBIO
+
 #include "DB.h"
 
 static char *Usage[] = { "<genome:dam> [-CU] [-m<int(10000)>]  [-s<int(2000)>] [-e<double(.15)>]",
@@ -61,10 +63,26 @@ static int    HASR;       // -r option is set?
 static int    SEED;       // -r option
 static FILE  *MAP;        // -M option
 
-#define INS_RATE  .73333  // insert rate
+#ifdef PACBIO
+
+#define INS_RATE  .73333  // insert rate (for PB data)
 #define DEL_RATE  .20000  // deletion rate
 #define IDL_RATE  .93333  // insert + delete rate
 
+#elif ILLUMINA
+
+#define INS_RATE  .1  // insert rate (for Illumina data)
+#define DEL_RATE  .1  // deletion rate
+#define IDL_RATE  .2  // insert + delete rate
+
+#else
+
+#define INS_RATE  .33333  // insert rate (equal weighting)
+#define DEL_RATE  .33333  // deletion rate
+#define IDL_RATE  .66666  // insert + delete rate
+
+#endif
+
 //  Complement (in the DNA sense) string *s*.
 
 static void complement(int elen, char *s)

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/dazzdb.git