[med-svn] r22173 - trunk/community/edam

Steffen Möller moeller at moszumanska.debian.org
Wed Jun 22 19:50:52 UTC 2016


Author: moeller
Date: 2016-06-22 19:50:52 +0000 (Wed, 22 Jun 2016)
New Revision: 22173

Added:
   trunk/community/edam/debian2edam
Modified:
   trunk/community/edam/registry-tool.py
Log:
Synchronisation with past Debian Sprint


Added: trunk/community/edam/debian2edam
===================================================================
--- trunk/community/edam/debian2edam	                        (rev 0)
+++ trunk/community/edam/debian2edam	2016-06-22 19:50:52 UTC (rev 22173)
@@ -0,0 +1,259 @@
+#!/bin/bash -e
+
+
+# A routine to facilitate the output to STDERR instead of the default STDOUT
+function STDERR () {
+	cat - 1>&2
+}
+
+# echoindent [COUNT [TEXT]]: write COUNT blanks to STDOUT (a single blank when
+# COUNT is omitted or empty), then TEXT followed by a newline if TEXT is given.
+function echoindent () {
+	local i    # keep the loop counter out of the global namespace
+	for i in $(seq 1 ${1:-1}); do
+		echo -n " "
+	done
+	if [ -n "$2" ]; then
+		echo "$2"    # quoted so TEXT is neither word-split nor glob-expanded
+	fi
+}
+
+level=0
+# helper to properly close an open parenthesis
+function closeParenthesis () {
+	level=$((level - 1))
+	echoindent $level
+	# The closing brace is followed on the same line by an optional "# <note>".
+	local suffix=""
+	if [ -n "$1" ]; then
+		suffix="# $1"
+	fi
+	echo "}$suffix"
+}
+
+
+function echoTerm(){
+        level=$((level - 1))    # NOTE(review): lowers the global level on every call - confirm intended
+        echoindent $level
+        printf '{"uri": "%s", "term": "Pippi Langstrumpf"}\n' "$1"    # the term is a fixed placeholder string
+}
+
+# Key argument indicating the debian directory from which to retrieve all the
+# information
+pathToDebian=$1
+#verbose="yes"
+verbose=""
+
+# Variable keeping usage information
+USAGE=<<EOUSAGE
+debian2edam [--upload] <path to 'debian' directory> 
+
+server=https://
+Environment variables:
+elixir_cat_username
+elixir_cat_password 
+
+EOUSAGE
+
+filename=$(basename "$pathToDebian")
+if [ "edam" = "$filename" ]; then
+	pathToDebian=$(dirname "$pathToDebian") # upstream
+	pathToDebian=$(dirname "$pathToDebian") # debian
+fi
+
+if [ -z "$pathToDebian" ]; then
+	echo "$USAGE" | STDERR
+	echo "E: Please specify debian directory in which to find EDAM annotation." | STDERR
+	exit -1
+fi
+
+if [ ! -d "$pathToDebian" ]; then
+	echo "$USAGE" | STDERR
+	echo "E: Could not find directory '$pathToDebian'" | STDERR
+	exit -1
+fi
+
+if [ ! -r "$pathToDebian/changelog" ]; then
+	echo "$USAGE" | STDERR
+	echo "E: Could not find a changelog file expected at '$pathToDebian/changelog'" | STDERR
+	exit -1
+fi
+
+cd "$(dirname "$pathToDebian")"    # quoted so a path containing blanks stays one argument; later paths are relative to this directory
+
+edamfile="debian/upstream/edam"
+if [ ! -r "$edamfile" ]; then
+	echo "$USAGE" | STDERR
+	echo "E: Could not access file '$edamfile' from $(pwd)" | STDERR
+	exit -1
+fi
+
+sourcepackage=$(dpkg-parsechangelog | sed -n -e 's/^Source: //p')    # bare value of the Source field
+version=$(dpkg-parsechangelog | sed -n -e 's/^Version: //p' | sed -e 's/-[^-]*$//')    # strip the Debian revision, i.e. everything after the last hyphen
+
+declare -a descriptions
+declare -a packages
+
+if [ -n "$debug" ]; then cat debian/control; fi
+
+while read pack; do
+	p=$(echo "$pack"|sed -e 's/^[^:]*: *//')
+	echo Package: $p
+	packages[${#packages[*]}]="$p"
+done < <(grep "^Package:" debian/control )
+
+while read desc; do
+	d=$(echo "$desc"|sed -e 's/^[^:]*: *//')
+	echo Description: $d
+	descriptions[${#descriptions[*]}]="$d"
+	#descriptions[1]="$d"
+	#descriptions="$d"
+done < <(grep "^Description:" debian/control )
+
+#echo "DESCRIPTIONS: ${descriptions[*]}"
+#echo "PACKAGES: ${packages[*]}"
+#echo "DESCRIPTIONS: $descriptions}"
+#echo "PACKAGES: $packages"
+
+if [ ${#packages[*]} -ne ${#descriptions[*]} ]; then    # each Package: stanza is expected to carry one Description:
+	echo "E: Internal error - expected same number of packages (${#packages[*]}) as for their descriptions (${#descriptions[*]})" | STDERR
+	exit -1
+fi
+
+(
+if [ -n "$verbose" ]; then    # debugging aid: list every package with its description on STDERR
+	for packageno in $(seq 0 $((${#descriptions[*]} - 1)))    # indices run from 0 to count-1
+	do
+		echo "# $packageno"
+		echo "Packages[$packageno]: ${packages[$packageno]}"
+		echo "Descriptions[$packageno]: ${descriptions[$packageno]}"
+	done
+fi
+) | STDERR
+
+prevstate="start";
+previndent=0
+currentscope=""
+currenttopic=""
+opentopic=0
+openfunction=0
+openscope=0
+indentlen=0
+
+# Core part of the program
+# It reads every line of the EDAM file (see end of loop for the redirection)
+# and decides what to print to STDOUT.
+
+while IFS='' read -r line 
+do
+	if [ -z "$line" ]; then
+		echo "Read empty line"
+		continue
+	fi
+
+	if [ -n "$verbose" ]; then
+		echo "line: '$line'" | STDERR
+	fi
+
+	# split "<blanks><type>: <val> ..." into the indentation, the key and the first word of the value
+	blanks=$(echo "$line"|sed -e 's/^\( *\)\([^ :]\+\): *\([^ ]\+\).*$/\1/')
+	type=$(echo   "$line"|sed -e 's/^\( *\)\([^ :]\+\): *\([^ ]\+\).*$/\2/')
+	val=$(echo    "$line"|sed -e 's/^\( *\)\([^ :]\+\): *\([^ ]\+\).*$/\3/')
+
+	if echo "$val" | grep -q : ; then
+		echo "W: found colon in ID of line '$line' - transscribing to underscore" | STDERR
+		val=$(echo "$val"|tr ":" "_")
+	fi
+
+	#echo "Indent='$blanks'"
+	#echo "Indentlength='$indentlen'"
+	#echo "Type='$type'"
+	#echo "Val='$val'"
+
+	if [  -n "$currentscope" -a "*" != "$currentscope" -a "summary" != "$currentscope" -a "scope" != "$type" ]; then
+		echo "I: Wrong scope ($currentscope) - ignored '$line'" | STDERR
+		continue
+	fi
+	indentlen=${#blanks}
+
+	if [ "scope" = "$type" ]; then
+		if [ $openfunction -gt 0 ]; then closeParenthesis "openfunction($openfunction) in scope"; openfunction=0; fi
+		currentscope="$val"
+		resourcename=$sourcepackage
+		if [ "*" != "$val" ] && [ "summary" != "$val" ]; then    # blanks around != are required for a real comparison
+			resourcename=$val
+		fi
+
+		if [ "summary" != "$val" -a "*" != "$val" ]; then
+			echo "I: treatment of multiple scopes not yet implemented" | STDERR
+		else
+			echo "{"
+			# A proper comparison of package names against the scope is not implemented yet
+			level=$((level+1))
+			echoindent    # called without a count: emits a single blank
+			echo "Package $resourcename"
+			echoindent
+			echo "\"version\": \"$version\","
+			echoindent
+			echo "\"description\": \"${descriptions[0]}\","
+			echoindent
+			echo "\"topic\": \"{$currenttopic}\""
+			openscope=1
+		fi
+	elif [ "topic" = "$type" ]; then
+		if [ $openfunction -gt 0 ]; then closeParenthesis "openfunction($openfunction) in topic"; openfunction=0; fi
+		if [ $openscope -gt 0 ]; then closeParenthesis "openscope($openscope) after loop"; openscope=0; fi
+		if [ "start" != "$prevstate" ]; then
+			closeParenthesis "topic with prior state - weird"
+		fi
+		currenttopic="$val"
+		# at some later point of the implementation, bits generated here would be cached and then distributed
+		# to various lower-level scopes
+	elif [ "function" = "$type" ]; then
+		if [ $openfunction -gt 0 ]; then
+			closeParenthesis "openfunction($openfunction) in function"
+			openfunction=0
+		fi
+		echoindent $level
+		echo "{function: [ { \"functionName\":  ["
+                echoTerm $val
+                echo "] }],"
+		level=$((level+1))
+		openfunction=1
+	elif [ "input" = "$type" -o "output" = "$type" ]; then
+                if [ "$prevstate" = "$type" ]; then    # quoted: prevstate may hold several words after an unparsable line
+                        echo "},{"
+                fi
+                if [ "$prevstate" = 'function' ]; then
+                        echo "\"$type\": [{"
+                fi
+		echoindent $level
+		echo "($type $val)"
+	else
+		echo "W: unknown type '$type' - ignored" | STDERR
+	fi
+	prevstate=$type
+	#echo "indentlen='$indentlen'"
+done < $edamfile 
+
+if [ $openfunction -gt 0 ]; then
+	closeParenthesis "openfunction($openfunction) after loop"
+	openfunction=0
+fi
+
+if [ $openscope -gt 0 ]; then
+	#echo "I: treatment of multiple scopes not yet implemented"|STDERR
+	closeParenthesis "openscope($openscope) after loop"
+	openscope=0
+fi
+
+#echo "indentlen='$indentlen'" | STDERR
+
+if [ $opentopic -gt 0 ]; then
+	opentopic=0
+fi
+
+#for i in $(seq $(($indentlen-$openfunction-$openscope-$opentopic)) -1 1)
+#do
+#	closeParenthesis "indent $i"
+#done

Modified: trunk/community/edam/registry-tool.py
===================================================================
--- trunk/community/edam/registry-tool.py	2016-06-22 14:34:40 UTC (rev 22172)
+++ trunk/community/edam/registry-tool.py	2016-06-22 19:50:52 UTC (rev 22173)
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import json
 import yaml
 import argparse
@@ -5,18 +6,33 @@
 import os.path
 import getpass
 import re
-import random # for uri2term, to be removed
 
-def uri2term(uri):
-    """The routine is meant to retrieve the human-readable term name for a URI provided.
+from lxml import etree
 
-    The current implementation merely produces a combination of the first and
-    last name of Pipi Longstocking as named in different languages.
-    """
-    pipi= [["Pippi","Langstrumpf"],["Pippi","Longstocking"],["Inger","Nilsson"],["Fifi","Brindacier"],
-           ["Pippi","Långstrump"],["Pippi","Langstrømpe"],["Pippi","Calcesllargues"],["Pipi","Ŝtrumpolonga"],["Pippi","Uzunçorap"]]
-    return(random.choice(pipi)[0]+" "+random.choice(pipi)[1])
+#parsing and declaring namespaces...
+EDAM_NS = {'owl' : 'http://www.w3.org/2002/07/owl#',
+           'rdf':"http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+           'rdfs':"http://www.w3.org/2000/01/rdf-schema#",
+           'oboInOwl': "http://www.geneontology.org/formats/oboInOwl#"}
 
+EDAM_DOC = doc = etree.parse("/home/hmenager/edamontology/EDAM_1.13_dev.owl")
+
+def check_id(label, axis):
+    xpath_query = "//owl:Class[translate(rdfs:label/text(),'abcdefghijklmnopqrstuvwxyz','ABCDEFGHIJKLMNOPQRSTUVWXYZ')=translate('" + label\
+          + "','abcdefghijklmnopqrstuvwxyz','ABCDEFGHIJKLMNOPQRSTUVWXYZ') and starts-with(@rdf:about, 'http://edamontology.org/" + axis + "')]/@rdf:about"
+    matching_terms = EDAM_DOC.xpath(xpath_query, namespaces=EDAM_NS)
+    if len(matching_terms)==0:
+        print("ERROR - No matching " + axis + " term for label " + label + "!")
+        print(xpath_query)
+    elif len(matching_terms)>1:
+        print("ERROR - More than one " + axis + " term for label " + label + "!")
+    else:
+        term_id = matching_terms[0]
+        if len(EDAM_DOC.xpath("//owl:Class[@rdf:about='"+ term_id +"' and owl:deprecated='true']", namespaces=EDAM_NS))>0:
+            print("ERROR - Term " + term_id + " term for label " + label + " is deprecated!")
+        else:
+            return term_id            
+        
 def doc_to_dict(pack_dir):
     debian_path = os.path.join(pack_dir, 'debian')
     control_path = os.path.join(debian_path, 'control')
@@ -31,14 +47,13 @@
     version_upstream = m.groups()[m.lastindex-1]
     edam = yaml.load(open(edam_path))
     metadata = yaml.load(open(metadata_path))
-
     resource = {'name': control.get('Source'),
                 'homepage': control.get('Homepage'),
                 'version': version_debian,
                 'collection': 'debian',
                 'interface': {}, #TODO
                 'description': control.get('Description'),
-                'topic': [{'uri':uri,'term':uri2term(el['data'])} for uri in edam.get('topic')],
+                'topic': [{'uri':check_id(topic_label,'topic')} for topic_label in edam.get('topic')],
                 'sourceRegistry': '',
                 'publications': [{'publicationsOtherID': [i['DOI'] for i in metadata['Reference']]}],
                 'function': []
@@ -46,68 +61,32 @@
     for scope in edam['scopes']:
         function = {}
         function['functionHandle'] = scope['name']
-        function['functionName'] = [{'uri':uri,'term':uri2term(el['data'])} for uri in scope.get('function')]
+        function['functionName'] = [{'uri':check_id(function_label,'operation')} for function_label in scope.get('function')]
         function['input'] = []
         for el in scope.get('inputs'):
             function['input'].append({
-                                      'dataType': {'uri':el['data'],'term':uri2term(el['data'])},
-                                      'dataFormat' : [{'uri':format_el,'term':uri2term(el['data'])} for format_el in el['formats']]
+                                      'dataType': {'uri':check_id(el['data'],'data')},
+                                      'dataFormat' : [{'uri':check_id(format_el,'format')} for format_el in el['formats']]
                                      })
         function['output'] = []
         for el in scope.get('outputs'):
             function['output'].append({
-                                      'dataType': {'uri':el['data'],'term':uri2term(el['data'])},
-                                      'dataFormat' : [{'uri':format_el,'term':uri2term(el['data'])} for format_el in el['formats']]
+                                      'dataType': {'uri':check_id(el['data'],'data')},
+                                      'dataFormat' : [{'uri':check_id(format_el,'format')} for format_el in el['formats']]
                                      })
         resource['function'].append(function)
     return resource
  
-def auth(login):
-    password = getpass.getpass()
-    resp = requests.post('https://elixir-registry.cbs.dtu.dk/api/auth/login','{"username": "%s","password": "%s"}' % (login, password), headers={'Accept':'application/json', 'Content-type':'application/json'}).text
-    return json.loads(resp)['token']
 
 if __name__ == '__main__':
-    # 1. Import XML files from a Mobyle server or from a folder containing XML files
-    # 2. Convert to BTR XML
-    # 3. Convert to BTR JSON
-    # 4. Register to Elixir BTR
     parser = argparse.ArgumentParser(
                  description='ELIXIR registry tool for Debian Med packages')
-    group = parser.add_mutually_exclusive_group()
-    parser.add_argument('--package_dirs', help="Debian package directory", nargs='+')
-    parser.add_argument('--json_dir', help="target directory for JSON files")
-    parser.add_argument('--login', help="registry login")
+    parser.add_argument('package_dirs', help="Debian package directory", nargs='+')
     args = parser.parse_args()
     if args.package_dirs:
         package_dirs = args.package_dirs
-    params = {'mobyle_root':"'http://mobyle.pasteur.fr'",
-              'mobyle_contact':"'mobyle at pasteur.fr'"}
-    if args.login:
-        print "authenticating..."
-        token = auth(args.login)
-        print "authentication ok"
-        ok_cnt = 0
-        ko_cnt = 0
-        #print "attempting to delete all registered services..."
-        #resp = requests.delete('https://elixir-registry.cbs.dtu.dk/api/tool/%s' % args.login, headers={'Accept':'application/json', 'Content-type':'application/json', 'Authorization': 'Token %s' % token})
-        #print resp
     for package_dir in package_dirs:
-        print "processing %s..." % package_dirs
+        print "processing %s..." % package_dir
         res = doc_to_dict(package_dir)
         print json.dumps(res, indent=True)
-        resource_name = res['name']
-        if args.json_dir:
-            json_path = os.path.join(args.json_dir, resource_name + '.json')
-            json.dump(res, open(json_path, 'w'), indent=True)
-        if args.login and args:
-            resp = requests.post('https://elixir-registry.cbs.dtu.dk/api/tool', json.dumps(res), headers={'Accept':'application/json', 'Content-type':'application/json', 'Authorization': 'Token %s' % token})
-            #print resp.status_code
-            if resp.status_code==201:
-                print "%s ok" % resource_name
-                ok_cnt += 1
-            else:
-                print "%s ko, error: %s" % (resource_name, resp.text)
-                ko_cnt += 1
-    if args.login:
-        print "import finished, ok=%s, ko=%s" % (ok_cnt, ko_cnt)
+        print "done processing %s..." % package_dir




More information about the debian-med-commit mailing list