[Collab-qa-commits] r1408 - udd/udd

Mon Mar 16 10:21:12 UTC 2009

Author: tille
Date: 2009-03-16 10:21:12 +0000 (Mon, 16 Mar 2009)
New Revision: 1408

Added:
   udd/udd/ddtp_gatherer.py
Modified:
   udd/udd/ftpnew_gatherer.py
Log:
Forgot to add ddtp_gatherer in latest commit; fixed some issues in description parsing for ftpnew


Added: udd/udd/ddtp_gatherer.py
===================================================================

--- udd/udd/ddtp_gatherer.py	                        (rev 0)
+++ udd/udd/ddtp_gatherer.py	2009-03-16 10:21:12 UTC (rev 1408)
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+
+"""
+This script imports translations from the Debian Description
+translation project into the database.  It parses the translation
+files at
+     http://ddtp.debian.net/Translation_udd
+which are enriched by the version numbers of the packages that
+are described, which makes it quite simple to associate a primary
+key with the translation even if it might be redundant information
+because you have the MD5sum of the descriptions.
+"""
+
+from aux import quote
+from gatherer import gatherer
+import re
+from debian_bundle import deb822
+from os import listdir, access, F_OK
+from sys import stderr, exit
+import gzip
+# import bz2
+from psycopg2 import IntegrityError, InternalError
+
+online=0
+
+def get_gatherer(connection, config, source):
+  return ddtp_gatherer(connection, config, source)
+
+class ddtp():
+  def __init__(self, package, release, language):
+    self.package          = package
+    self.distribution     = 'debian' # No DDTP translations for debian-backports / debian-volatile
+    self.release          = release  # sid for the moment
+    self.component        = 'main'   # Only main translated for the moment
+    self.language         = language
+    self.description      = ''
+    self.long_description = ''
+    self.md5sum           = ''
+    self.version          = ''
+
+  def __str__(self):
+    return "Package %s: %s, %s\n%s" % \
+        (self.package, self.language, self.description, self.long_description)
+
+class ddtp_gatherer(gatherer):
+  # DDTP translations
+
+  select_language_re    = re.compile('^Translation-(\w+)\.gz$')
+
+  def __init__(self, connection, config, source):
+    gatherer.__init__(self, connection, config, source)
+    self.assert_my_config('path', 'files', 'table', 'releases')
+    my_config = self.my_config
+
+    cur = self.cursor()
+    query = "DELETE FROM %s" % my_config['table']
+    cur.execute(query)
+    query = """PREPARE ddtp_insert AS INSERT INTO %s
+                   (package, distribution, component, release, language, version, description, long_description, md5sum)
+                    VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)""" % (my_config['table'])
+    cur.execute(query)
+
+    # Query for English package description, its MD5 sum and package version
+# Not used any more because the Translation files now contain version numbers,
+# but keep the query as a comment to preserve the knowledge of how to calculate MD5 sums
+# for the descriptions for possible later use
+#    query = """PREPARE ddtp_packages_recieve_description_md5 AS 
+#               SELECT md5(full_description || E'\n' ) AS md5,
+#               full_description, MAX(version) AS version FROM (
+#                 SELECT DISTINCT
+#                   description || E'\n' || long_description AS full_description,
+#                   version
+#                  FROM packages
+#                  WHERE package = $1 AND distribution = $2 AND component = $3 AND
+#                  release = $4
+#               ) AS tmp GROUP BY full_description"""
+#    cur.execute(query)
+
+    pkg = None
+
+  def run(self):
+    my_config = self.my_config
+    #start harassing the DB, preparing the final inserts and making place
+    #for the new data:
+    cur = self.cursor()
+
+    releases=my_config['releases'].split(' ')
+    for rel in releases:
+      dir = my_config['path']+'/'+rel+'/'
+      if not access(dir, F_OK):
+        print >>stderr, "Directory %s for release %s does not exist" % (dir, rel)
+        continue
+      for filename in listdir(dir):
+        match = ddtp_gatherer.select_language_re.match(filename)
+        if not match:
+          continue
+        lang = match.groups()[0]
+        descstring = 'Description-'+lang
+        g = gzip.GzipFile(dir + filename)
+        try:
+          for stanza in deb822.Sources.iter_paragraphs(g, shared_storage=False):
+            self.pkg             = ddtp(stanza['package'], rel, lang)
+            self.pkg.md5sum      = stanza['Description-md5']
+            self.pkg.version     = stanza['Version']
+            desc                 = stanza[descstring]
+            lines                = desc.splitlines()
+            self.pkg.description = lines[0]
+            for line in lines[1:]:
+              self.pkg.long_description += line + "\n"
+    	    query = "EXECUTE ddtp_insert (%s, '%s', '%s', '%s', '%s', '%s', %s, %s, %s)" % \
+                        (quote(self.pkg.package), self.pkg.distribution, self.pkg.component, self.pkg.release, \
+                         self.pkg.language, self.pkg.version, quote(self.pkg.description), \
+                         quote(self.pkg.long_description), \
+                         quote(self.pkg.md5sum))
+    	    try:
+              cur.execute(query)
+            except IntegrityError, err:
+              print "Key is duplicated but not fetched before:", key
+        except IOError, err:
+          print >>stderr, "Error reading %s (%s)" % (dir+filename, err)
+
+    cur.execute("DEALLOCATE ddtp_insert")
+
+if __name__ == '__main__':
+  main()
+
+# vim:set et tabstop=2:
+

Modified: udd/udd/ftpnew_gatherer.py
===================================================================
--- udd/udd/ftpnew_gatherer.py	2009-03-16 09:19:26 UTC (rev 1407)
+++ udd/udd/ftpnew_gatherer.py	2009-03-16 10:21:12 UTC (rev 1408)
@@ -20,12 +20,16 @@
 def get_gatherer(connection, config, source):
   return ftpnew_gatherer(connection, config, source)
 
+DEBUG=0
+
 # When parsing src html pages we have to get rid of certain html strings
 def de_html(string):
-  string= re.sub("</?span[^>]*>", "", string)
-  string= re.sub("&lt;", "<", string)
-  string= re.sub("&gt;", ">", string)
-  string= re.sub("</?pre>", "", string)
+  string= re.sub("</?span[^>]*>", '',  string)
+  string= re.sub("&quot;",        '"', string)
+  string= re.sub("&amp;",         '&', string)
+  string= re.sub("&lt;",          '<', string)
+  string= re.sub("&gt;",          '>', string)
+  string= re.sub("</?pre>",       '',  string)
   return string
 
 # These fields are not forewarded to UDD tables for the moment
@@ -145,8 +149,9 @@
       cur.execute(query)
       in_udd = cur.fetchone()[0]
       if in_udd:
-        print >>stderr, "Binary package %s is %i times in UDD - no interest in just known binaries (queue = %s)" \
-    	             % (value, int(in_udd), queue)
+        if DEBUG != 0:
+    	  print >>stderr, "Binary package %s is %i times in UDD - no interest in just known binaries (queue = %s)" \
+      	             % (value, int(in_udd), queue)
     	return 1
     return 0
 
@@ -201,8 +206,9 @@
       cur.execute(query)
       in_udd = cur.fetchone()[0]
       if in_udd:
-        print >>stderr, "%s is %i times in UDD - no interest in just known sources (queue = %s)" \
-    	    % (srcpkg.s['Source'], int(in_udd), srcpkg.s['Source'])
+        if DEBUG != 0:
+          print >>stderr, "%s is %i times in UDD - no interest in just known sources (queue = %s)" \
+    	                  % (srcpkg.s['Source'], int(in_udd), srcpkg.s['Source'])
         continue
 
       src_info_base = srcpkg.s['Source'] + '_' + srcpkg.s['Version']
@@ -249,9 +255,9 @@
             print >>srco, "%s: %s" % (field, value)
     	  elif field == 'Description':
             if in_source:
-              srcpkg.s[field]  = value
+              srcpkg.s[field]  = de_html(value)
             else:
-              binpkg.b[field]  = value
+              binpkg.b[field]  = de_html(value)
             print >>srco, "%s: %s" % (field, value)
     	  elif field == 'Architecture':
             if in_source:
@@ -368,21 +374,20 @@
           if match:
             if match.groups()[0][0] != ' ':
               description += ' '
-            description += match.groups()[0]
+            description += de_html(match.groups()[0])
             in_description = 0
             if not in_source: # binpkg and binpkg.b:
-              binpkg.b['Description']      = description
-              binpkg.b['Long_Description'] = description.split("\n",1)[1]
-            print >>srco, "Description: %s\n" % (description)
+              (binpkg.b['Description'], binpkg.b['Long_Description']) = description.split("\n",1)
+              print >>srco, "Description: %s\n%s" % (binpkg.b['Description'], binpkg.b['Long_Description'])
           else:
             if line[0] != ' ':
               description += ' '
-            description += line
+            description += de_html(line)
         else:
           match = ftpnew_gatherer.src_html_has_description_start_re.match(line)
           if match:
             in_description = 1
-            description = match.groups()[0] + "\n"
+            description = de_html(match.groups()[0]) + "\n"
       srci.close()
       srco.close()      
 #        cur.execute("EXECUTE ftpnew_insert (%s, %s, %s, %s)"\