[Collab-qa-commits] r1408 - udd/udd
tille at alioth.debian.org
tille at alioth.debian.org
Mon Mar 16 10:21:12 UTC 2009
Author: tille
Date: 2009-03-16 10:21:12 +0000 (Mon, 16 Mar 2009)
New Revision: 1408
Added:
udd/udd/ddtp_gatherer.py
Modified:
udd/udd/ftpnew_gatherer.py
Log:
Forgot to add ddtp_gatherer in latest commit; fixed some issues in description parsing for ftpnew
Added: udd/udd/ddtp_gatherer.py
===================================================================
--- udd/udd/ddtp_gatherer.py (rev 0)
+++ udd/udd/ddtp_gatherer.py 2009-03-16 10:21:12 UTC (rev 1408)
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+
+"""
+This script imports translations from the Debian Description
+translation project into the database. It parses the translation
+files at
+ http://ddtp.debian.net/Translation_udd
+which are enriched by the version numbers of the packages that
+are described which makes it quite simple to associate a primary
+key to the translation even if it might be redundant information
+because you have the MD5sum of the descriptions
+"""
+
+from aux import quote
+from gatherer import gatherer
+import re
+from debian_bundle import deb822
+from os import listdir, access, F_OK
+from sys import stderr, exit
+import gzip
+# import bz2
+from psycopg2 import IntegrityError, InternalError
+
+online=0
+
+def get_gatherer(connection, config, source):
+ return ddtp_gatherer(connection, config, source)
+
+class ddtp():
+ def __init__(self, package, release, language):
+ self.package = package
+ self.distribution = 'debian' # No DDTP translations for debian-backports / debian-volatile
+ self.release = release # sid for the moment
+ self.component = 'main' # Only main translated for the moment
+ self.language = language
+ self.description = ''
+ self.long_description = ''
+ self.md5sum = ''
+ self.version = ''
+
+ def __str__(self):
+ return "Package %s: %s, %s\n%s" % \
+ (self.package, self.language, self.description, self.long_description)
+
+class ddtp_gatherer(gatherer):
+ # DDTP translations
+
+ select_language_re = re.compile('^Translation-(\w+)\.gz$')
+
+ def __init__(self, connection, config, source):
+ gatherer.__init__(self, connection, config, source)
+ self.assert_my_config('path', 'files', 'table', 'releases')
+ my_config = self.my_config
+
+ cur = self.cursor()
+ query = "DELETE FROM %s" % my_config['table']
+ cur.execute(query)
+ query = """PREPARE ddtp_insert AS INSERT INTO %s
+ (package, distribution, component, release, language, version, description, long_description, md5sum)
+ VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)""" % (my_config['table'])
+ cur.execute(query)
+
+ # Query for english package description, its md5 sum and package version
+# Not used any more because the Translation files now contain version numbers
+# but keep the query as comment to store the knowledge how to calculate MD5 sums
+# for the descriptions for possible later use
+# query = """PREPARE ddtp_packages_recieve_description_md5 AS
+# SELECT md5(full_description || E'\n' ) AS md5,
+# full_description, MAX(version) AS version FROM (
+# SELECT DISTINCT
+# description || E'\n' || long_description AS full_description,
+# version
+# FROM packages
+# WHERE package = $1 AND distribution = $2 AND component = $3 AND
+# release = $4
+# ) AS tmp GROUP BY full_description"""
+# cur.execute(query)
+
+ pkg = None
+
+ def run(self):
+ my_config = self.my_config
+ #start harassing the DB, preparing the final inserts and making place
+ #for the new data:
+ cur = self.cursor()
+
+ releases=my_config['releases'].split(' ')
+ for rel in releases:
+ dir = my_config['path']+'/'+rel+'/'
+ if not access(dir, F_OK):
+ print >>stderr, "Directory %s for release %s does not exist" % (dir, rel)
+ continue
+ for filename in listdir(dir):
+ match = ddtp_gatherer.select_language_re.match(filename)
+ if not match:
+ continue
+ lang = match.groups()[0]
+ descstring = 'Description-'+lang
+ g = gzip.GzipFile(dir + filename)
+ try:
+ for stanza in deb822.Sources.iter_paragraphs(g, shared_storage=False):
+ self.pkg = ddtp(stanza['package'], rel, lang)
+ self.pkg.md5sum = stanza['Description-md5']
+ self.pkg.version = stanza['Version']
+ desc = stanza[descstring]
+ lines = desc.splitlines()
+ self.pkg.description = lines[0]
+ for line in lines[1:]:
+ self.pkg.long_description += line + "\n"
+ query = "EXECUTE ddtp_insert (%s, '%s', '%s', '%s', '%s', '%s', %s, %s, %s)" % \
+ (quote(self.pkg.package), self.pkg.distribution, self.pkg.component, self.pkg.release, \
+ self.pkg.language, self.pkg.version, quote(self.pkg.description), \
+ quote(self.pkg.long_description), \
+ quote(self.pkg.md5sum))
+ try:
+ cur.execute(query)
+ except IntegrityError, err:
+ print "Key is duplicated but not fetched before:", key
+ except IOError, err:
+ print >>stderr, "Error reading %s (%s)" % (dir+filename, err)
+
+ cur.execute("DEALLOCATE ddtp_insert")
+
+if __name__ == '__main__':
+ main()
+
+# vim:set et tabstop=2:
+
Modified: udd/udd/ftpnew_gatherer.py
===================================================================
--- udd/udd/ftpnew_gatherer.py 2009-03-16 09:19:26 UTC (rev 1407)
+++ udd/udd/ftpnew_gatherer.py 2009-03-16 10:21:12 UTC (rev 1408)
@@ -20,12 +20,16 @@
def get_gatherer(connection, config, source):
return ftpnew_gatherer(connection, config, source)
+DEBUG=0
+
# When parsing src html pages we have to get rid of certain html strings
def de_html(string):
- string= re.sub("</?span[^>]*>", "", string)
- string= re.sub("&lt;", "<", string)
- string= re.sub("&gt;", ">", string)
- string= re.sub("</?pre>", "", string)
+ string= re.sub("</?span[^>]*>", '', string)
+ string= re.sub("&quot;", '"', string)
+ string= re.sub("&amp;", '&', string)
+ string= re.sub("&lt;", '<', string)
+ string= re.sub("&gt;", '>', string)
+ string= re.sub("</?pre>", '', string)
return string
# These fields are not forewarded to UDD tables for the moment
@@ -145,8 +149,9 @@
cur.execute(query)
in_udd = cur.fetchone()[0]
if in_udd:
- print >>stderr, "Binary package %s is %i times in UDD - no interest in just known binaries (queue = %s)" \
- % (value, int(in_udd), queue)
+ if DEBUG != 0:
+ print >>stderr, "Binary package %s is %i times in UDD - no interest in just known binaries (queue = %s)" \
+ % (value, int(in_udd), queue)
return 1
return 0
@@ -201,8 +206,9 @@
cur.execute(query)
in_udd = cur.fetchone()[0]
if in_udd:
- print >>stderr, "%s is %i times in UDD - no interest in just known sources (queue = %s)" \
- % (srcpkg.s['Source'], int(in_udd), srcpkg.s['Source'])
+ if DEBUG != 0:
+ print >>stderr, "%s is %i times in UDD - no interest in just known sources (queue = %s)" \
+ % (srcpkg.s['Source'], int(in_udd), srcpkg.s['Source'])
continue
src_info_base = srcpkg.s['Source'] + '_' + srcpkg.s['Version']
@@ -249,9 +255,9 @@
print >>srco, "%s: %s" % (field, value)
elif field == 'Description':
if in_source:
- srcpkg.s[field] = value
+ srcpkg.s[field] = de_html(value)
else:
- binpkg.b[field] = value
+ binpkg.b[field] = de_html(value)
print >>srco, "%s: %s" % (field, value)
elif field == 'Architecture':
if in_source:
@@ -368,21 +374,20 @@
if match:
if match.groups()[0][0] != ' ':
description += ' '
- description += match.groups()[0]
+ description += de_html(match.groups()[0])
in_description = 0
if not in_source: # binpkg and binpkg.b:
- binpkg.b['Description'] = description
- binpkg.b['Long_Description'] = description.split("\n",1)[1]
- print >>srco, "Description: %s\n" % (description)
+ (binpkg.b['Description'], binpkg.b['Long_Description']) = description.split("\n",1)
+ print >>srco, "Description: %s\n%s" % (binpkg.b['Description'], binpkg.b['Long_Description'])
else:
if line[0] != ' ':
description += ' '
- description += line
+ description += de_html(line)
else:
match = ftpnew_gatherer.src_html_has_description_start_re.match(line)
if match:
in_description = 1
- description = match.groups()[0] + "\n"
+ description = de_html(match.groups()[0]) + "\n"
srci.close()
srco.close()
# cur.execute("EXECUTE ftpnew_insert (%s, %s, %s, %s)"\
More information about the Collab-qa-commits
mailing list