[med-svn] [pirs] 01/03: Imported Upstream version 2.0.2+dfsg

Andreas Tille tille at debian.org
Wed Aug 17 08:10:14 UTC 2016


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository pirs.

commit c0f993b844932dfbe2bc965f66ed7687209963e3
Author: Andreas Tille <tille at debian.org>
Date:   Wed Aug 17 10:08:17 2016 +0200

    Imported Upstream version 2.0.2+dfsg
---
 .gitignore                                         |  1 +
 src/pirs/MaskQvalsByEamss.cpp                      | 21 ++-----
 src/pirs/MaskQvalsByEamss.h                        | 20 ++-----
 .../baseCalling_Matrix_calculator.pl               | 43 ++++++++------
 .../baseCalling_Matrix_calculator.txt              | 69 ++++++++++++++++++++++
 5 files changed, 107 insertions(+), 47 deletions(-)

diff --git a/.gitignore b/.gitignore
index 58f5301..dbdcdc3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 src/afterclean.lst
 
+.DS_Store
 *.bak
 *.md5
 ._*
diff --git a/src/pirs/MaskQvalsByEamss.cpp b/src/pirs/MaskQvalsByEamss.cpp
index 4d5e276..21283c1 100644
--- a/src/pirs/MaskQvalsByEamss.cpp
+++ b/src/pirs/MaskQvalsByEamss.cpp
@@ -1,27 +1,18 @@
 /**
- ** Copyright (c) 2008-2010 Illumina, Inc.
- **
- ** This software is covered by the "Illumina Genome Analyzer Software
- ** License Agreement" and the "Illumina Source Code License Agreement",
- ** and certain third party copyright/licenses, and any user of this
- ** source file is bound by the terms therein (see accompanying files
- ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
- ** Illumina_Source_Code_License_Agreement.pdf and third party
- ** copyright/license notices).
- **
- ** This file is part of the Consensus Assessment of Sequence And VAriation
+ ** This file is based on the Consensus Assessment of Sequence And VAriation
  ** (CASAVA) software package.
  **
- ** \file MaskQvalsByEamss.cpp
+ ** \file ./CASAVA_v1.8.2/src/c++/lib/demultiplex/MaskQvalsByEamss.cpp
  **
  ** \brief Masks quality values of a read, using the EAMSS.
  **
  ** \author Come Raczy
+ **
+ ** \modifed_by Eric Biggers
+ **
+ ** \changed Use vectors instead fo strings; also got rid of the boost header.
  **/
 
-// Modified for pIRS: Use vectors instead fo strings; also got rid of the boost
-// header.
-
 #include <string>
 #include <vector>
 #include <string.h>
diff --git a/src/pirs/MaskQvalsByEamss.h b/src/pirs/MaskQvalsByEamss.h
index 61c912b..413d14b 100644
--- a/src/pirs/MaskQvalsByEamss.h
+++ b/src/pirs/MaskQvalsByEamss.h
@@ -1,26 +1,18 @@
 /**
- ** Copyright (c) 2008-2010 Illumina, Inc.
- **
- ** This software is covered by the "Illumina Genome Analyzer Software
- ** License Agreement" and the "Illumina Source Code License Agreement",
- ** and certain third party copyright/licenses, and any user of this
- ** source file is bound by the terms therein (see accompanying files
- ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
- ** Illumina_Source_Code_License_Agreement.pdf and third party
- ** copyright/license notices).
- **
- ** This file is part of the Consensus Assessment of Sequence And VAriation
+ ** This file is based on the Consensus Assessment of Sequence And VAriation
  ** (CASAVA) software package.
  **
- ** \file MaskQvalsByEamss.cpp
+ ** \file ./CASAVA_v1.8.2/src/c++/include/demultiplex/MaskQvalsByEamss.hh
  **
  ** \brief Masks quality values of a read, using the EAMSS.
  **
  ** \author Come Raczy
+ **
+ ** \modifed_by Eric Biggers
+ **
+ ** \changed Pass qValues and baseCalls as C arrays (on the stack)
  **/
 
-// Modified for pIRS: Pass qValues and baseCalls as C arrays (on the stack)
-
 #ifndef CASAVA_DEMULTIPLEX_MASK_QVALS_BY_EAMSS_H
 #define CASAVA_DEMULTIPLEX_MASK_QVALS_BY_EAMSS_H
 
diff --git a/src/stator/baseCallingMatrix/baseCalling_Matrix_calculator.pl b/src/stator/baseCallingMatrix/baseCalling_Matrix_calculator.pl
index d91c626..db6c764 100755
--- a/src/stator/baseCallingMatrix/baseCalling_Matrix_calculator.pl
+++ b/src/stator/baseCallingMatrix/baseCalling_Matrix_calculator.pl
@@ -1,7 +1,7 @@
 #!/usr/bin/env perl
 =pod
 Author: Hu Xuesong @ BGI <huxuesong at genomics.org.cn>
-Version: 0.1.2 @ 20140115
+Version: 0.1.5 @ 20140207
 =cut
 use strict;
 use warnings;
@@ -35,11 +35,10 @@ sub main::HELP_MESSAGE() {
 	$main::help =~ s|\n|\033[32;1m\n|g;
 	$main::ARG_DESC='[PROGRAM_ARG1 ...]' unless $main::ARG_DESC;
 	print STDERR <<EOH;
-\nUsage: \033[0;1m$0\033[0;0m [-OPTIONS [-MORE_OPTIONS]]
+\nUsage: \033[0;1m$0\033[0;0m -i xxx.bam -r ref.fa.gz -o xxx [-MORE_OPTIONS]
 
 The following single-character options are accepted:
-\033[32;1m$main::help\033[0;0mOptions may be merged together.
-Space is not required between options and their arguments.
+\033[32;1m$main::help\033[0;0m
 EOH
 }
 sub main::VERSION_MESSAGE() {
@@ -59,24 +58,28 @@ EOH
 	}
 }
 
-$main::VERSION=1.0.0;
-our $opts='i:r:o:l:s:c:t:qb';
-our($opt_i, $opt_o, $opt_r, $opt_l, $opt_s, $opt_c, $opt_t, $opt_q, $opt_b);
+$main::VERSION=1.0.1;
+our $opts='i:r:o:l:s:c:t:m:q';
+our($opt_i, $opt_o, $opt_r, $opt_l, $opt_s, $opt_c, $opt_t, $opt_q, $opt_m);
 
 #our $desc='';
 our $help=<<EOH;
 \t-i Input Pair-End SAM/BAM files [used with "samtools view xxx"] 
-\t-r ref fasta file (./ref/human.fa) [.{gz,bz2} is OK]
-\t-s trim SNP positions from (<filename>) in format /^ChrID\\tPos/. VCF file with only SNP is OK.
+\t-r Reference FASTA file [.{gz,bz2} is OK]
+\t-s skip SNP positions from (<filename>) in format /^ChrID\\tPos/. VCF file with only SNP is OK.
+\t-m minimal accepted MAPQ (60)
 \t-l read length of reads (int) [Optional. Specify to override auto detected value.]
 \t-o output prefix (./matrix).{count,ratio}.matrix and .{stat,info}
-\t-c ChrID list file [Useful to analyse only autosomes]
+\t-c ChrID list file [to specify a subset of chromosomes, one per line]
 \t-q Use Qascii=64 for sam files instead of 33
 \t-t Trim ChrID in ref fasta file to match alignment results (none) [use RegEx for s/\$ARG//;]
-\t-b No 5s Pause for batch runs
 EOH
 
 ShowHelp();
+unless ($opt_r) {
+	die "[x]-r Reference FASTA file not specified !\n"
+} else { die "[x]-r $opt_r not exists !\n" unless -f $opt_r; }
+
 unless ($opt_i) {
 	die "[x]-i Input.bam not specified !\n"
 } else {
@@ -85,7 +88,7 @@ unless ($opt_i) {
 		warn "[!]Use user specified Read Length=",$opt_l,"\n";
 	} else {
 		my ($READLEN,$lines)=(0,0);
-		open IN,"-|","$SAMTOOLSBIN view -f 3 -F 1792 $opt_i" or die "Error opening $opt_i: $!\n";
+		open IN,"-|","$SAMTOOLSBIN view -f 3 -F 3840 $opt_i" or die "Error opening $opt_i: $!\n";
 		while (<IN>) {
 			#next if /^@\w\w\t\w\w:/;
 			#chomp;
@@ -101,19 +104,18 @@ unless ($opt_i) {
 		$opt_l = $READLEN;
 		warn "[!]Use detected Read Length=",$opt_l," from beginning $MAXREADStoCHECK proper lines of input file.\n";
 	}
-	open( INSAM,"-|","$SAMTOOLSBIN view -f 3 -F 1792 -h $opt_i") or die "Error opening $opt_i: $!\n";
+	open( INSAM,"-|","$SAMTOOLSBIN view -f 3 -F 3840 -h $opt_i") or die "Error opening $opt_i: $!\n";
 }
-$opt_r='./ref/human.fa' if ! $opt_r;
 $opt_o='./matrix' if ! $opt_o;
+$opt_m=60 if (! $opt_m) or $opt_m < 0;
 die "[x]-r $opt_r not exists !\n" unless -f $opt_r;
 if ($opt_s) {die "[x]-s $opt_s not exists !\n" unless -f $opt_s;}
 
-print STDERR "From [$opt_i] of [$opt_l] with [$opt_r] to [$opt_o]\n";
+print STDERR "From [$opt_i][$opt_m] of [$opt_l] with [$opt_r] to [$opt_o]\n";
 print STDERR "ChrID will be trimed by s/$opt_t//;\n" if $opt_t;
 print STDERR "ChrID list:[$opt_c]\n" if $opt_c;
 print STDERR "SNP skipping list:[$opt_s]\n" if $opt_s;
 print STDERR "SAM files with Qascii=64\n" if $opt_q;
-unless ($opt_b) {print STDERR "Wait 3 seconds to continue...\n"; sleep 5;}
 
 #my $start_time = [gettimeofday];
 #BEGIN
@@ -129,6 +131,7 @@ if ($opt_c) {
 		++$Genome{$_};
 	}
 	close C;
+	print STDERR "[!]ChrID list:[",join('],[',(sort keys %Genome)),"].\n";
 }
 warn "[!]Reading Reference Genome:\n";
 if ($opt_r =~ /.bz2$/) {
@@ -326,9 +329,11 @@ while (<INSAM>) {
 	#next unless ($read1[1] & 3) == 3;  # paired + mapped in a proper pair; samtools view -f 3
 	#next if $read1[1] >= 256;   # not primary || QC failure || optical or PCR duplicate; samtools view -F 1792
 	next unless $read1[6] eq '=';   # same Reference sequence NAME
+	next if $read1[4] < $opt_m;	# 5,MAPQ: MAPping Quality (Phred-scaled)
 	#next if $read1[11] eq 'XT:A:R'; # Type: Unique/Repeat/N/Mate-sw, N not found.
 	my $OPT = join("\t", at read1[11 .. $#read1]);
 	next if $OPT =~ /\bXT:A:R\b/;
+	next if $OPT =~ /\bXA:Z:/;	# bwa mem use XA:Z for Alternative hits, SA:Z for Chimeric reads. We need skip those with Alternative hits.
 	my $ref1=uc getBases($read1[2],$read1[3],$READLEN) or print join("\t", at read1),"\n";
 	my $ReadCycle;
 	if ($read1[1] & 64) {
@@ -406,6 +411,7 @@ my @BaseQ;
 for my $base (@BaseOrder) {
 	push @BaseQ,"$base-$_" for (0..$MaxQ);
 }
+=pod
 $tmp .= "\n
 [Info]
 Date = $date
@@ -429,7 +435,8 @@ MismatchBase = $MisBase
 QB_Bases = $QBbase
 QB_Mismatches = $QBmis
 <<END
-
+=cut
+$tmp .= "\n
 [DistMatrix]
 #".join("\t",'Ref','Cycle', at BaseQ);
 print OA $tmp;
@@ -604,5 +611,5 @@ TODO:
 add reference masked len.
 
 Example for bam file:
-samtools view -f 3 -F 1792 -h GA0146.sort.bam | ./pirs/baseCalling_Matrix_calculator -p sam -r ref.fa -s GA0146.SNPs.filter.vcf -l 125 -o GA0146.matrix -b >GA0146.matrix.log 2>GA0146.matrix.err
+samtools view -f 3 -F 3840 -h GA0146.sort.bam | ./pirs/baseCalling_Matrix_calculator -p sam -r ref.fa -s GA0146.SNPs.filter.vcf -l 125 -o GA0146.matrix -b >GA0146.matrix.log 2>GA0146.matrix.err
 # log should be empty in normal situation.
diff --git a/src/stator/baseCallingMatrix/baseCalling_Matrix_calculator.txt b/src/stator/baseCallingMatrix/baseCalling_Matrix_calculator.txt
new file mode 100644
index 0000000..d1d53b6
--- /dev/null
+++ b/src/stator/baseCallingMatrix/baseCalling_Matrix_calculator.txt
@@ -0,0 +1,69 @@
+Input file: samtools view -f 3 -F 1792 -h xxx.bam
+
+SAM FORMAT:
+	┌────┬───────┬──────────────────────────────────────────────────────────┐
+	│Col │ Field │                       Description                        │
+	├────┼───────┼──────────────────────────────────────────────────────────┤
+	│ 1  │ QNAME │ Query (pair) NAME                                        │
+	│ 2  │ FLAG  │ bitwise FLAG                                             │
+	│ 3  │ RNAME │ Reference sequence NAME                                  │
+	│ 4  │ POS   │ 1-based leftmost POSition/coordinate of clipped sequence │
+	│ 5  │ MAPQ  │ MAPping Quality (Phred-scaled)                           │
+	│ 6  │ CIAGR │ extended CIGAR string                                    │
+	│ 7  │ MRNM  │ Mate Reference sequence NaMe (`=' if same as RNAME)      │
+	│ 8  │ MPOS  │ 1-based Mate POSistion                                   │
+	│ 9  │ ISIZE │ Inferred insert SIZE                                     │
+	│10  │ SEQ   │ query SEQuence on the same strand as the reference       │
+	│11  │ QUAL  │ query QUALity (ASCII-33 gives the Phred base quality)    │
+	│12  │ OPT   │ variable OPTional fields in the format TAG:VTYPE:VALUE   │
+	└────┴───────┴──────────────────────────────────────────────────────────┘
+
+SAM FLAG:
+	┌────┬────────┬───────────────────────────────────────┐
+	│Chr │  Flag  │              Description              │
+	├────┼────────┼───────────────────────────────────────┤
+	│ p +│ 0x0001 │ the read is paired in sequencing      │
+	│ P +│ 0x0002 │ the read is mapped in a proper pair   │
+	│ u  │ 0x0004 │ the query sequence itself is unmapped │
+	│ U  │ 0x0008 │ the mate is unmapped                  │
+	│ r  │ 0x0010 │ strand of the query (1 for reverse)   │
+	│ R  │ 0x0020 │ strand of the mate                    │
+	│ 1  │ 0x0040 │ the read is the first read in a pair  │
+	│ 2  │ 0x0080 │ the read is the second read in a pair │
+	│ s -│ 0x0100 │ the alignment is not primary          │
+	│ f -│ 0x0200 │ QC failure                            │
+	│ d -│ 0x0400 │ optical or PCR duplicate              │
+	└────┴────────┴───────────────────────────────────────┘
+
+Options:
+	-i Input Pair-End SAM/BAM files [used with "samtools view xxx"] 
+	-r Reference FASTA file [.{gz,bz2} is OK]
+	-s skip SNP positions from (<filename>) in format /^ChrID\tPos/. VCF file with only SNP is OK.
+	-m minimal accepted MAPQ (50)
+	-l read length of reads (int) [Optional. Specify to override auto detected value.]
+	-o output prefix (./matrix).{count,ratio}.matrix and .{stat,info}
+	-c ChrID list file [to specify a subset of chromosomes, one per line]
+	-q Use Qascii=64 for sam files instead of 33
+	-t Trim ChrID in ref fasta file to match alignment results (none) [use RegEx for s/\$ARG//;]
+
+Should be set: i,r,o
+Better to have: s
+With reasonable defaults: m
+Up to you: c
+Histrical Reason: l,q,t
+
+补充:
+-q 没它就按33来,有它就按64来。
+-t 适用于参考序列是用"chr01",但soap文件是用"01"的情况。在华大遇到过人的soap文件,染色体名只有数字和X、Y。此时 -t chr 即可。
+
+All filters on input SAM/BAM file:
+1. samtools view -f 3 -F 1792	含义见上面“SAM FLAG”表格第一列的加号和减号。即必须有加号的项、没有减号的项。
+2. (CIAGR =~ /(\d+)M/) && ($1 != $READLEN);
+3. RNAME not in given Reference file and Chr List
+4. MRNM not '='
+5. MAPQ < "-m minimal accepted MAPQ (50)"
+6. OPT =~ /\bXT:A:R\b/;
+7. OPT =~ /\bXA:Z:/;
+
+Example CLI:
+perl baseCalling_Matrix_calculator.pl -b -r ref.fa -i bam/xxx.sort.bam -s vcf/xxx.SNPs.filter.vcf -o matrix/xxx.matrix 2>matrix/xxx.matrix.err

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/pirs.git



More information about the debian-med-commit mailing list