[med-svn] r1156 - trunk/community/infrastructure

smoe-guest at alioth.debian.org smoe-guest at alioth.debian.org
Thu Jan 17 23:03:52 UTC 2008


Author: smoe-guest
Date: 2008-01-17 23:03:51 +0000 (Thu, 17 Jan 2008)
New Revision: 1156

Added:
   trunk/community/infrastructure/getData.pl
Log:
Concept study for a mirroring system that could possibly be shared between multiple applications. Comments welcome. Have a look at the source and try

	getData.pl --list

or

	getData.pl --man



Added: trunk/community/infrastructure/getData.pl
===================================================================
--- trunk/community/infrastructure/getData.pl	                        (rev 0)
+++ trunk/community/infrastructure/getData.pl	2008-01-17 23:03:51 UTC (rev 1156)
@@ -0,0 +1,266 @@
+#!/usr/bin/perl -w
+
+=head1 NAME
+
+getData.pl - retrieves databases from the Internet
+
+=cut
+
+# This script shall help maintaining sets of frequently changing databases
+# of various sorts. It is motivated by demands in bioinformatics and
+# astronomy.
+
+# Copyright (c) 2008 Steffen Moeller <moeller at debian.org>
+# 
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+# Or else go to GNU Web pages http://www.gnu.org and follow the white rabbit.
+#
+
+# Default root directory for mirrored data; the --rootdir option overwrites
+# it (the variable is bound to that option in %options further down).
+my $rootdir = "/var/lib/mirrored";
+
+=head1 SYNOPSIS
+
+getData.pl [ --rootdir <path> ] <list of db names>
+getData.pl --list
+
+=head1 DESCRIPTION
+
+Bioinformatics has the intrinsic problem of bringing the biological data
+to the end user. Astronomers have the equivalent problem and particle
+physicists, well, they have come up with (first) the web and (second)
+the computational grids to access their problems. Debian helps with the
+programs but will not provide such huge datasets that are even frequently
+updated. Not even in volatile.debian.org. Most bioinformatics researchers
+will not need too many of such databases. And even more will gladly
+continue using public services remotely.
+
+For those who need a set of databases on a regular basis, this script 
+shall be a start to automate the burden to download the data and update
+indices and the like. The world has seen such magic before with the
+Lion Biosciences Prisma tool (http://bib.oxfordjournals.org/cgi/reprint/3/4/389.pdf)
+but how about something simpler (as a start) that at least gets close
+to what we desire and is Free. The aim must be to address the needs of
+all (most) communities, not only of the bioinformatics world. The seed was
+hence made with databases from astronomy.
+
+=head1 OPTIONS
+
+=over 4
+
+=item --help
+
+    this help
+
+=item --man
+
+    Present a more detailed description in form of a man page.
+
+=item --rootdir <path>
+
+    Specifies destination directory. The data will be mirrored to $rootdir/$dbname/
+
+=item --list
+
+    Lists all databases that may be requested to be installed.
+
+=item <list of db names>
+
+    Only those databases that are explicitly requested to be downloaded will be downloaded. Such databases may require considerable bandwidth, so please make sure you know you are doing the right thing.    
+
+=back
+
+=head1 EXAMPLES
+
+./getData.pl --list | head -n 4
+
+=head1 TODO
+
+We now need a mechanism with which packages can specify hooks that
+shall be called upon an update of a database. But we cannot assume that
+every indexing that can be performed because of the installation of some
+package is also desired by the user. How to configure this properly is
+left to be decided.
+
+=head1 SEE ALSO
+
+http://debian-med.alioth.debian.org, http://wiki.debian.org/DebianMed
+
+=head1 AUTHORS
+
+Steffen Moeller <moeller at debian.org>, ... and ?, from the Debian-Med packaging initiative.
+
+=cut
+
+
+use strict;
+
+use Getopt::Long;
+use Cwd;
+
+# Catalogue of the databases this script knows about, keyed by the name
+# that is given on the command line. Each entry is a hash with:
+#   name            - human-readable description, printed by --list
+#   source          - shell command that fetches the data; it is executed
+#                     from within $rootdir/<key>/
+#   "post-download" - optional shell command intended to run after the
+#                     fetch (e.g. to unpack or link the mirrored files)
+# Entries that are commented out are not offered.
+my %toBeMirrored = (
+
+#	"tycho2" => {
+#		name => "Tycho2 Star Coordinates",
+#		source => "wget --mirror ftp://cdsarc.u-strasbg.fr/pub/cats/I/259/tyc2.dat*",
+#		"post-download" => "[ -r tyc2.dat -a -z \"`find .  -cnewer tyc2.dat "
+#		   . "-a ! -name .listing`\" ] "
+#		   . "&& echo \"No mirrored file newer than previously created index.\" "
+#		   . "|| zcat cdsarc.u-strasbg.fr/pub/cats/I/259/tyc2.dat* > tycho2.dat"
+#	},
+#
+#	"astorb" => {
+#		name => "asteroid orbits",
+#		source => "wget --mirror ftp://ftp.lowell.edu/pub/elgb/astorb.dat.gz",
+#		"post-download" => "[ -r astorb.dat "
+#		   . "-a ftp.lowell.edu/pub/elgb/astorb.dat.gz -nt astorb.dat ] "
+#		   . "|| zcat ftp.lowell.edu/pub/elgb/astorb.dat.gz > astorb.dat"
+#	},
+#
+#	"DE405" => {
+#		name => "DE405",
+#		source => "wget --mirror ftp://ssd.jpl.nasa.gov/pub/eph/export/unix/unxp2[01]*.405",
+##		"debian-depends" => "jpl-eph-tools",
+#		"post-download" => "ln -s ssd.jpl.nasa.gov/pub/eph/export/unix/unxp*.405 ."
+#	},
+#
+#	"swiss.dat" => {
+#		name => "UniProt - SwissProt in EMBL format",
+#		source => "wget --mirror ftp://ftp.ebi.ac.uk/pub/databases/swissprot/release_compressed/uniprot_sprot.dat.gz ftp://ftp.ebi.ac.uk/pub/databases/swissprot/updates_compressed/*.dat.gz"
+#	},
+#
+	"swiss.fasta" => {
+		name => "UniProt - SWISS-PROT in FASTA format",
+		source => "wget --mirror ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz"
+	},
+
+	"trembl.fasta" => {
+		name => "UniProt - TrEMBL in FASTA format",
+		source => "wget --mirror ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz"
+	},
+
+#	"jaspar.sites" => {
+#		name => "Sites subfolder of JASPAR database",
+#		source => "wget --mirror http://jaspar.genereg.net/html/DOWNLOAD/SITES/*/*.sites"
+#	},
+#
+#	"jaspar.MatrixDir" => {
+#		name => "MatrixDir subfolder of JASPAR database",
+#		source => "wget --mirror http://jaspar.genereg.net/html/DOWNLOAD/MatrixDir/*/*.sites"
+#	}
+);
+
+# Flags filled in by GetOptions(); all default to "not requested".
+my ($list,$help,$man)=(0,0,0);
+
+# Getopt::Long specification mapping option names to their variables.
+# "rootdir=s" makes the path argument mandatory, as the SYNOPSIS documents;
+# the former "rootdir:s" accepted a bare --rootdir and silently replaced
+# the default root directory with the empty string.
+my %options = (
+	"rootdir=s" => \$rootdir,
+	"list" => \$list,
+	"help"=>\$help,
+	"man"=>\$man);
+
+# Prints usage information taken from this file's POD via pod2usage().
+# Without --man the brief default usage message is produced; with --man
+# the complete manual page is shown (-verbose => 2). Pod::Usage is only
+# loaded when help is actually requested.
+sub help () {
+    require Pod::Usage;
+    Pod::Usage->import();
+    # $man starts out as 0 and is therefore always defined; test its truth
+    # rather than its definedness, otherwise the brief usage is unreachable.
+    pod2usage() unless $man;
+    pod2usage( -verbose => 2 );
+}
+
+
+# Bail out with the usage text if the command line could not be parsed
+# or if --help / --man was given.
+if ( !GetOptions(%options) || $help || $man ) {
+	help();
+}
+
+
+# --list: one line per known database (key padded to ten columns, a tab,
+# then the descriptive name), after which the program ends successfully.
+if ($list) {
+	for my $key (sort keys %toBeMirrored) {
+		# more information like the expected size should be printed here.
+		printf "%-10s", $key;
+		print "\t", $toBeMirrored{$key}{"name"}, "\n";
+	}
+	exit 0;
+}
+
+# Nothing can be mirrored without write access to the destination root.
+unless ( -w $rootdir ) {
+	die "Cannot write to root destination directory at '$rootdir'.\n";
+}
+
+# Debugging aid: print join(", ", @ARGV)."\n";
+
+# Working directory at start-up.
+my $d = getcwd();
+
+# Report every requested name that is not a key of %toBeMirrored.
+for my $db (@ARGV) {
+	next if exists $toBeMirrored{$db};
+	print STDERR "Unknown database: '$db'\n";
+}
+
+# Second option pass: reset the flags and rebuild the option specification.
+# These variables are already declared earlier in this file, so they are
+# assigned here instead of being re-declared with "my" (which, under -w,
+# warns that the variable masks an earlier declaration in the same scope).
+($list,$help,$man)=(0,0,0);
+
+# "rootdir=s": the path argument is mandatory, matching the SYNOPSIS.
+%options=("rootdir=s" => \$rootdir, "list" => \$list, "help"=>\$help, "man"=>\$man);
+
+# Prints usage information taken from this file's POD via pod2usage().
+# Without --man the brief default usage message is produced; with --man
+# the complete manual page is shown (-verbose => 2). Pod::Usage is only
+# loaded when help is actually requested.
+sub myhelp () {
+    require Pod::Usage;
+    Pod::Usage->import();
+    # $man starts out as 0 and is therefore always defined; test its truth
+    # rather than its definedness, otherwise the brief usage is unreachable.
+    pod2usage() unless $man;
+    pod2usage( -verbose => 2 );
+}
+
+
+# Show the usage text when option parsing fails or --help / --man is set.
+myhelp() if !GetOptions(%options) or $help or $man;
+
+
+# --list: print every known database key with its description, then stop.
+if ($list) {
+	foreach my $db (sort keys %toBeMirrored) {
+		printf "%-10s",$db;
+		# more information like the expected size should be printed here.
+		print "\t".$toBeMirrored{$db}{"name"};
+		print "\n";
+	}
+	exit 0;
+}
+
+die "Cannot write to root destination directory at '$rootdir'.\n" unless ( -w "$rootdir" );
+
+#print join(", ", @ARGV)."\n";
+
+# Remember the starting directory: every database is mirrored from inside
+# its own subdirectory and we return here afterwards. $d is already
+# declared earlier in this file, so it is only assigned.
+$d=getcwd();
+
+# Mirror each requested database into $rootdir/<name>/.
+foreach my $db (@ARGV) {
+	unless(exists($toBeMirrored{$db})) {
+		print STDERR "Unknown database: '$db'\n";
+		next;
+	}
+
+	print STDERR "\"$db\" -> \"$rootdir\"\n";
+	print "Mirroring ".$toBeMirrored{$db}{"name"}." ($db)\n";
+
+	unless ( -d "$rootdir/$db" ) {
+		print "  creating directory $rootdir/$db\n";
+		mkdir("$rootdir/$db") or die "Could not create directory \"$rootdir/$db\"\n";
+	}
+	chdir("$rootdir/$db") or die "Could not change directory to \"$rootdir/$db\"\n";
+
+	# system() returns non-zero on failure, hence "and die".
+	my $cmd = $toBeMirrored{$db}{"source"}."\n";
+	print STDERR "$cmd\n";
+	system($cmd) and die "Experienced problem.";
+
+	# The post-download hook is optional. Inspect the raw value before
+	# appending anything to it and compare it as a string: the previous
+	# numeric test ("" != $cmd) compared 0 with 0 for any command that
+	# does not start with a number, so the hook was never executed.
+	my $post = $toBeMirrored{$db}{"post-download"};
+	if ( defined($post) and "" ne $post ) {
+		$cmd = $post."\n";
+		print STDERR "$cmd\n";
+		system($cmd) and die "Experienced problem.";
+	}
+	else {
+		print STDERR "$db: No post-download command defined.\n";
+	}
+	chdir($d) or die "Could not change back to dir '$d'.\n";
+}


Property changes on: trunk/community/infrastructure/getData.pl
___________________________________________________________________
Name: svn:executable
   + *




More information about the debian-med-commit mailing list