[Collab-qa-commits] r1406 - udd/udd
tille at alioth.debian.org
tille at alioth.debian.org
Mon Mar 16 06:35:13 UTC 2009
Author: tille
Date: 2009-03-16 06:35:13 +0000 (Mon, 16 Mar 2009)
New Revision: 1406
Added:
udd/udd/ftpnew_gatherer.py
Log:
First shot at an ftpnew gatherer
Added: udd/udd/ftpnew_gatherer.py
===================================================================
--- udd/udd/ftpnew_gatherer.py (rev 0)
+++ udd/udd/ftpnew_gatherer.py 2009-03-16 06:35:13 UTC (rev 1406)
@@ -0,0 +1,424 @@
+#!/usr/bin/env python
+
+"""
+This script imports information from ftp new queue into the database
+See http://ftp-master.debian.org/new.822 and
+ http://ftp-master.debian.org/new.html
+"""
+
+from debian_bundle import deb822
+from os import access, mkdir, unlink, W_OK
+from sys import stderr
+import aux
+from aux import quote
+from gatherer import gatherer
+import email.Utils
+import re
+from time import ctime
+from psycopg2 import IntegrityError
+
def get_gatherer(connection, config, source):
    """Factory hook used by the UDD framework to obtain this gatherer."""
    gatherer_instance = ftpnew_gatherer(connection, config, source)
    return gatherer_instance
+
+# When parsing src html pages we have to get rid of certain html strings
# When parsing src html pages we have to get rid of certain html strings
def de_html(string):
    """Strip the HTML markup used in the ftp-master new.html source pages.

    Removes <span>/<pre> wrappers and decodes the &lt;/&gt; entities so the
    field values read as plain text.
    """
    string = re.sub("</?span[^>]*>", "", string)
    # BUGFIX: the previous substitutions replaced "<" with "<" and ">" with
    # ">" (no-ops); the intent is clearly to decode the HTML entities.
    string = re.sub("&lt;", "<", string)
    string = re.sub("&gt;", ">", string)
    string = re.sub("</?pre>", "", string)
    return string
+
# Fields parsed from the per-package HTML pages that are echoed to the
# generated .822 file but are not stored in the UDD tables for the moment
fields_to_pass = ('Format',
                  'Date',
                  'Changed-By',
                  'Files',
                  'Uploaders',
                  'Standards-Version',
                  'Priority',
                  'Urgency',
                  'Dm-Upload-Allowed',
                  'Autobuild',
                  'Build-Depends',
                  'Build-Depends-Indep',
                  'Build-Conflicts',
                  'Python-Version')
                  # fields whose name starts with 'Npp-' are treated the same way

# Binary package relationship fields that are accepted from the HTML pages
# and stored in the packages table
dependencies_to_accept = ( 'Depends', 'Recommends', 'Suggests', 'Enhances', 'Pre-Depends',
                           'Breaks', 'Replaces', 'Provides', 'Conflicts')
+
class src_pkg():
    """Accumulator for the source-package fields of one NEW queue entry.

    All parsed fields live in the dict ``self.s``; keys follow the control
    field names, plus a few synthetic ones ('Queue', 'Last_modified',
    'maintainer_name'/'maintainer_email' and 'Closes' for the WNPP bug).
    """

    def __init__(self, source):
        self.s = {}
        self.s['Source'] = source
        # > 0 if new.822 listed more than one version for this source; some
        # consistency checks against the HTML page are skipped in that case
        self.has_several_versions = 0
        self.s['Bin'] = ()           # comma separated list of binaries created from the source
        self.s['Architecture'] = ()  # architecture(s separated by blanks)
        # Just define Vcs fields in case it is not provided in the control
        self.s['Vcs-Type'] = None
        self.s['Vcs-Url'] = None
        # preset WNPP bug
        self.s['Closes'] = 0

    def check_dict(self):
        "Make sure that non-mandatory fields at least get a '' value"
        for field in ftpnew_gatherer.s_non_mandatory:
            # 'in' instead of the deprecated dict.has_key()
            if field not in self.s:
                self.s[field] = ''

    def __str__(self):
        # renamed local from 'str' to avoid shadowing the builtin
        text = "Source %(Source)s: %(Version)s, (%(Architecture)s), %(Last_modified)s, %(Queue)s, %(Distribution)s" % \
               (self.s)
        text += " %(maintainer_name)s <%(maintainer_email)s>, %(Closes)i" % (self.s)
        return text
+
class bin_pkg():
    """Accumulator for one binary package built from a NEW source package.

    Parsed fields live in the dict ``self.b``, keyed by control field name.
    """

    def __init__(self, package, source):
        self.b = {}
        self.b['Package'] = package
        self.b['Source'] = source
        # sensible defaults for fields that may be missing in the HTML page
        self.b['Installed-Size'] = 0
        self.b['License'] = ''

    def check_dict(self):
        "Make sure that non-mandatory fields at least get a '' value"
        for field in ftpnew_gatherer.b_non_mandatory:
            # 'in' instead of the deprecated dict.has_key()
            if field not in self.b:
                self.b[field] = ''

    def __str__(self):
        return "Package %s: %s, %s, %s, %s, %s" % \
            (self.b['Package'], self.b['Version'], self.b['Architecture'], self.b['Maintainer'],
             self.b['Description'], self.b['Long_Description'])
+
class ftpnew_gatherer(gatherer):
    "This class imports the data from New queue into the database"

    # The following class-level dicts are used as cheap membership sets
    # (only the keys matter; the 0 values are never read).

    # Fields every source stanza is expected to carry.
    s_mandatory = {'Source': 0, 'Format': 0, 'Maintainer': 0, 'Package': 0, 'Version': 0, 'Files': 0,
                   'Queue': 0, 'Last_modified': 0}
    # Source fields that may be absent; src_pkg.check_dict() defaults them to ''.
    s_non_mandatory = {'Uploaders': 0, 'Bin': 0, 'Architecture': 0,
                       'Homepage': 0, 'Build-Depends': 0, 'Vcs-Arch': 0, 'Vcs-Bzr': 0,
                       'Vcs-Cvs': 0, 'Vcs-Darcs': 0, 'Vcs-Git': 0, 'Vcs-Hg': 0, 'Vcs-Svn': 0,
                       'Vcs-Mtn':0, 'Vcs-Browser': 0, 'License': 0
                       }
    # Fields that may appear in the HTML pages but are deliberately ignored.
    # NOTE(review): 'Vcs-Cvs:' carries a stray trailing colon (probably a
    # typo), and this dict is not referenced anywhere in this revision --
    # confirm whether it is still needed.
    s_ignorable = {'X-Vcs-Browser': 0, 'X-Vcs-Bzr': 0, 'X-Vcs-Darcs': 0, 'X-Vcs-Svn': 0, 'X-Vcs-Hg':0, 'X-Vcs-Git':0,
                   'Directory':0, 'Comment':0, 'Origin':0, 'Url':0, 'X-Collab-Maint':0, 'Autobuild':0, 'Vcs-Cvs:':0,
                   'Python-Standards-Version':0, 'url':0, 'originalmaintainer':0, 'Originalmaintainer':0,
                   'Build-Recommends':0,
                   'Build-Depends-Indep': 0, 'Build-Conflicts': 0, 'Build-Conflicts-Indep': 0,
                   'Priority': 0, 'Section': 0, 'Python-Version': 0, 'Checksums-Sha1':0,
                   'Checksums-Sha256':0, 'Original-Maintainer':0, 'Dm-Upload-Allowed':0,
                   'Standards-Version': 0,
                   }

    # Binary fields that may be absent; bin_pkg.check_dict() defaults them to ''.
    b_non_mandatory = {'Source': 0, 'Essential': 0, 'Depends': 0, 'Recommends': 0,
                       'Suggests': 0, 'Enhances': 0, 'Pre-Depends': 0, 'Breaks':0, 'Installed-Size': 0,
                       'Homepage': 0, 'Size': 0, 'Build-Essential':0, 'Origin':0,
                       'SHA1':0, 'Replaces':0, 'Section':0, 'MD5sum':0, 'Bugs':0, 'Priority':0,
                       'Tag':0, 'Task':0, 'Python-Version':0, 'Provides':0, 'Conflicts':0,
                       'SHA256':0, 'Original-Maintainer':0}

    # NOTE(review): s_ignorable_re and s_vcs are also unreferenced in this
    # revision.
    s_ignorable_re = re.compile("^(Original-|Origianl-|Orginal-|Debian-|X-Original-|Upstream-)")
    s_vcs = { 'Arch':0, 'Bzr':0, 'Cvs':0, 'Darcs':0, 'Git':0, 'Hg':0, 'Svn':0, 'Mtn':0}

    # The server answers with this page when a per-package HTML file does
    # not exist.
    src_html_failed_re = re.compile("^<p>The requested URL /new/.+\.html was not found on this server\.</p>")
    # One "key: value" table row of the per-package HTML page.
    src_html_has_tag_re = re.compile('^\s*<tr><td class="key">([-\w]+):</td><td class="val">(.+)</td></tr>$')
    # First / last line of the (possibly multi-line) Description cell.
    src_html_has_description_start_re = re.compile('^\s*<tr><td class="key">Description:</td><td class="val"><pre>(.+)')
    src_html_has_description_end_re = re.compile('(.+)</pre></td></tr>')
    # Bug titles that identify a packaging-intent (WNPP) bug.
    closes_is_itp_re = re.compile('^\s*(ITP|RFP|ITA)')
    # Recognise any Vcs-* control field and capture the VCS type.
    vcs_type_re = re.compile('Vcs-(Svn|Git|Bzr|Darcs|Hg|Cvs|Arch|Mtn)')

    def __init__(self, connection, config, source):
        # The base class keeps connection/config; here we only verify that
        # the configuration provides everything run() relies upon.
        gatherer.__init__(self, connection, config, source)
        self.assert_my_config('path', 'table_sources', 'table_packages', 'ftpmasterURL', 'releases_ignore')


    def check_existing_binaries(self, values, queue):
        """Return 1 if any of the given binary package names already exists
        in UDD (ignoring the releases listed in 'releases_ignore'), else 0.

        Sometimes the source package name has changed, but the binary
        package name is known in UDD -- we are not interested in these
        packages.
        """
        cur = self.cursor()
        for value in values:
            # query = "SELECT count(*) FROM packages WHERE package = '%s'" % (value)
            # NOTE(review): the package name is interpolated directly into
            # the EXECUTE statement rather than passed as a parameter.
            query = "EXECUTE ftpnew_check_existing_package ('%s')" % (value)
            cur.execute(query)
            in_udd = cur.fetchone()[0]
            if in_udd:
                print >>stderr, "Binary package %s is %i times in UDD - no interest in just known binaries (queue = %s)" \
                      % (value, int(in_udd), queue)
                return 1
        return 0

    def run(self):
        """Parse new.822 plus the per-package HTML pages and (re)fill the
        new_sources / new_packages tables."""
        my_config = self.my_config

        #start harassing the DB, preparing the final inserts and making place
        #for the new data:
        cur = self.cursor()

        # if we check whether a package already exists in UDD we ignore
        # oldstable (currently etch) but other dists might have to be
        # ignored as well, hence the configurable 'releases_ignore' list
        cur.execute("PREPARE ftpnew_check_existing_package AS SELECT COUNT(*) FROM packages WHERE package = $1 AND release NOT IN (%s)" \
                    % self.my_config["releases_ignore"])
        # For some reason the code tries to add binary packages twice - just verify whether the package is
        # already included to make sure we do not trigger conflicting primary keys
        # NOTE(review): this prepared statement is never EXECUTEd (nor
        # DEALLOCATEd) in this revision -- confirm whether it is needed.
        cur.execute("PREPARE ftpnew_check_just_added_package AS SELECT COUNT(*) FROM new_packages WHERE package = $1 AND version = $2 AND architecture = $3")

        # the tables are rebuilt from scratch on every run
        cur.execute("DELETE FROM %s" % my_config["table_sources"])
        cur.execute("DELETE FROM %s" % my_config["table_packages"])

        query = """PREPARE ftpnew_insert_source
                   AS INSERT INTO %s (source, version, maintainer, maintainer_name, maintainer_email, binaries,
                        changed_by, architecture, homepage,
                        vcs_type, vcs_url, vcs_browser, distribution, closes, license, last_modified, queue)
                      VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17)""" % (my_config['table_sources'])
        cur.execute(query)
        query = """PREPARE ftpnew_insert_package
                   AS INSERT INTO %s (package, version, architecture, maintainer, description, source,
                        depends, recommends, suggests, enhances, pre_depends, breaks, replaces, provides, conflicts,
                        installed_size, homepage, section, long_description, license)
                      VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20)""" % (my_config['table_packages'])
        cur.execute(query)

        ftpnew_data = open(my_config['path']+'/new.822')

        for stanza in deb822.Sources.iter_paragraphs(ftpnew_data, shared_storage=False):
            # packages that left the NEW queue are of no interest here
            if stanza['queue'] == 'accepted' or stanza['queue'] == 'proposedupdates' :
                continue
            srcpkg = src_pkg(stanza['source'])
            versions = stanza['version'].split(' ') # the page lists more than one version
            srcpkg.has_several_versions = len(versions)-1 # some tests below fail if more than one version in in queue
            srcpkg.s['Version'] = versions[srcpkg.has_several_versions]
            srcpkg.s['Architecture'] = stanza['architectures']
            srcpkg.s['Queue'] = stanza['queue']
            srcpkg.s['Last_modified'] = ctime(int(stanza['last-modified'])) # We want a real time object instead of an epoch
            srcpkg.s['Distribution'] = stanza['distribution']
            srcpkg.s['Changed-By'] = stanza['changed-by']

            # Check UDD for existing source packages of this name
            query = "SELECT count(*) FROM sources WHERE source = '%s'" % (srcpkg.s['Source'])
            cur.execute(query)
            in_udd = cur.fetchone()[0]
            if in_udd:
                # NOTE(review): the last %s prints the source name again
                # instead of the queue -- probably meant srcpkg.s['Queue'].
                print >>stderr, "%s is %i times in UDD - no interest in just known sources (queue = %s)" \
                      % (srcpkg.s['Source'], int(in_udd), srcpkg.s['Source'])
                continue

            # per-package HTML page (expected to exist locally -- fetched
            # elsewhere) and the .822 file that is generated from it below
            src_info_base = srcpkg.s['Source'] + '_' + srcpkg.s['Version']
            src_info_html = my_config['path'] + '/' + src_info_base + '.html'
            src_info_822 = my_config['path'] + '/' + src_info_base + '.822'

            try:
                srci = open(src_info_html, 'r')
            except IOError, err:
                print >>stderr, "No html info for package %s in queue %s (%s)." % (srcpkg.s['Source'], stanza['queue'], err)
                continue
            srco = open(src_info_822, 'w')
            in_description = 0  # inside a multi-line <pre> Description cell
            in_source = 1       # fields before the first 'Package:' row belong to the source
            binpkgs = []        # completed bin_pkg objects
            binpkg = None       # bin_pkg currently being filled
            # NOTE(review): the bin_pkg still being filled when the file ends
            # is never appended to binpkgs, so the last binary package of
            # each source appears to be dropped -- confirm.
            for line in srci.readlines():
                if ftpnew_gatherer.src_html_failed_re.match(line):
                    print >>stderr, "File %s not found." % (src_info_html)
                    # NOTE(review): src_info_not_found is set but never read
                    src_info_not_found = 1
                    break
                match = ftpnew_gatherer.src_html_has_tag_re.match(line)
                if match:
                    field = match.groups()[0]
                    value = de_html(match.groups()[1])
                    if field == 'Package':
                        # Here begins a new binary package
                        if self.check_existing_binaries((value,), srcpkg.s['Queue']):
                            srcpkg.s['Queue'] = 'ignore'
                            break
                        if in_source:
                            in_source = 0
                        if binpkg:
                            binpkgs.append(binpkg)
                        binpkg = bin_pkg(value, srcpkg.s['Source'])
                        print >>srco, "\nPackage: %s" % (value)
                    elif field == 'Maintainer':
                        # print "DEBUG %s: %s" % (field, value)
                        if in_source:
                            srcpkg.s[field] = value
                            # split "Name <email>" for the separate DB columns
                            srcpkg.s['maintainer_name'], srcpkg.s['maintainer_email'] = email.Utils.parseaddr(srcpkg.s['Maintainer'])
                        else:
                            binpkg.b[field] = value
                        print >>srco, "%s: %s" % (field, value)
                    elif field == 'Description':
                        if in_source:
                            srcpkg.s[field] = value
                        else:
                            binpkg.b[field] = value
                        print >>srco, "%s: %s" % (field, value)
                    elif field == 'Architecture':
                        if in_source:
                            srcpkg.s[field] = value
                        else:
                            binpkg.b[field] = value
                        print >>srco, "%s: %s" % (field, value)
                    elif field == 'Source':
                        if in_source:
                            if value != srcpkg.s['Source']:
                                print >>stderr, "Incompatible source names between new.822(%s) and %s.html (%s)" % \
                                      (srcpkg.s['Source'], src_info_base, value)
                            srcpkg.s['Source'] = value
                        print >>srco, "%s: %s" % (field, value)
                    elif field == 'Version':
                        if in_source:
                            if srcpkg.has_several_versions == 0 and value != srcpkg.s[field]:
                                print >>stderr, "Incompatible version numbers between new.822(%s) and %s.html (%s)" % \
                                      (srcpkg.s[field], src_info_base, value)
                            srcpkg.s[field] = value
                        else:
                            binpkg.b[field] = value
                        print >>srco, "%s: %s" % (field, value)
                    elif field == 'Closes':
                        # Try to find exactly one WNPP (ITP/RFP/ITA) bug
                        # among the closed bugs; it ends up in 'closes'.
                        values = value.split(' ')
                        found_itp = 0
                        for val in values:
                            ival = int(val)
                            query = "SELECT title from bugs where id = %i and package = 'wnpp' and source = 'wnpp'" % (ival)
                            cur.execute(query)
                            try:
                                wnpp_title = cur.fetchone()[0]
                            except TypeError, err:
                                # fetchone() returned None -> not a wnpp bug
                                query = "SELECT id, package, source, title FROM bugs WHERE id = %i" % (ival)
                                cur.execute(query)
                                bug_info = cur.fetchone()
                                if not bug_info:
                                    print >>stderr, "Bug %i which source package %s claims to close does not exist." % (ival, srcpkg.s['Source'])
                                else:
                                    print >>stderr, "Bug #%i of package %s and source %s is not against pseudopackage 'wnpp' and hast title '%s'" % bug_info
                                # NOTE(review): after this except branch
                                # wnpp_title may be unbound (first iteration)
                                # or stale (later iterations) when the match
                                # below is evaluated -- confirm intended flow.
                            if not ftpnew_gatherer.closes_is_itp_re.match(wnpp_title):
                                print >>stderr, "Closed bug %i seems to be not ITPed (queue = %s; title = %s)" % (ival, srcpkg.s['Queue'], wnpp_title)
                            else:
                                if found_itp:
                                    # second ITP bug: keep the first, but
                                    # check whether the two are merged in BTS
                                    print >>stderr, "Warning: Package %s seems to have more than one ITP bugs (%i, %i). Only %i is stored in UDD" % \
                                          (srcpkg.s['Source'], srcpkg.s['Closes'], ival, srcpkg.s['Closes'])
                                    query = "SELECT count(*) FROM bugs_merged_with WHERE id = %i OR id = %i" % (srcpkg.s['Closes'], ival)
                                    cur.execute(query)
                                    is_merged = cur.fetchone()[0]
                                    if is_merged != 2:
                                        print >>stderr, " --> Bugs should be merged in BTS!"
                                else: # stay with the ITP found first
                                    srcpkg.s['Closes'] = int(ival)
                                    found_itp = 1
                        if not found_itp:
                            print >>stderr, "Most probably %s is not new." % (srcpkg.s['Source'])
                        print >>srco, "%s: %s\n" % (field, value)
                    elif field == 'Distribution':
                        if in_source:
                            if srcpkg.has_several_versions == 0 and value != srcpkg.s['Distribution']:
                                print >>stderr, "Incompatible distributions between new.822(%s) and %s.html (%s)" % \
                                      (srcpkg.s['Distribution'], src_info_base, value)
                            srcpkg.s['Distribution'] = value
                            print >>srco, "%s: %s" % (field, value)
                        else:
                            print >>stderr, "Binary should not mention distribution field in %s.html (%s)" % \
                                  (src_info_base, value)
                    elif field == 'Binary':
                        if in_source:
                            # Binaries are mentioned in different syntax in *.changes and *.dsc
                            value = re.sub(", +", " ", value)
                            if self.check_existing_binaries(value.split(' '), srcpkg.s['Queue']):
                                srcpkg.s['Queue'] = 'ignore'
                                break
                            if in_source:
                                if srcpkg.s['Bin'] != () and value != srcpkg.s['Bin']:
                                    print >>stderr, "Incompatible binaries between new.822(%s) and %s.html (%s)" % \
                                          (srcpkg.s['Bin'], src_info_base, value)
                                srcpkg.s['Bin'] = value
                                print >>srco, "%s: %s" % (field, value)
                        else:
                            print >>stderr, "Binary should not mention Binary field in %s.html (%s)" % \
                                  (src_info_base, value)
                    elif field == 'Installed-Size':
                        if not in_source:
                            binpkg.b[field] = int(value)
                    elif field == 'Homepage':
                        if not in_source:
                            binpkg.b[field] = value
                    elif field == 'Section':
                        if not in_source:
                            # NOTE(review): the section value itself is never
                            # stored; only this sanity check is performed
                            if not binpkg:
                                print >>stderr, "This should not happen", srcpkg, field, value
                                exit(-1)
                    elif field == 'Vcs-Browser':
                        srcpkg.s[field] = value
                    elif binpkg != None and field in dependencies_to_accept:
                        binpkg.b[field] = value
                        print >>srco, "%s: %s" % (field, value)
                    elif field in fields_to_pass or field.startswith('Npp-'):
                        # forwarded to the .822 file only, not stored in UDD
                        print >>srco, "%s: %s" % (field, value)
                    else:
                        # any remaining Vcs-* field defines the VCS type/URL
                        matchvcs = ftpnew_gatherer.vcs_type_re.match(field)
                        if matchvcs:
                            srcpkg.s['Vcs-Type'] = matchvcs.groups()[0]
                            srcpkg.s['Vcs-Url'] = value
                            print >>srco, "%s: %s" % (field, value)
                        else:
                            print >>stderr, "Unknown field in %s: %s" % (srcpkg.s['Source'], field)
                            print >>srco, "*%s: %s" % (field, value)
                    # a tag row was handled; description handling below is
                    # only for continuation lines
                    continue
                if in_description:
                    match = ftpnew_gatherer.src_html_has_description_end_re.match(line)
                    if match:
                        # closing </pre> found: finish the description
                        if match.groups()[0][0] != ' ':
                            description += ' '
                        description += match.groups()[0]
                        in_description = 0
                        if not in_source: # binpkg and binpkg.b:
                            binpkg.b['Description'] = description
                            # everything after the first line is the long description
                            binpkg.b['Long_Description'] = description.split("\n",1)[1]
                            print >>srco, "Description: %s\n" % (description)
                    else:
                        if line[0] != ' ':
                            description += ' '
                        description += line
                else:
                    match = ftpnew_gatherer.src_html_has_description_start_re.match(line)
                    if match:
                        in_description = 1
                        description = match.groups()[0] + "\n"
            srci.close()
            srco.close()
#            cur.execute("EXECUTE ftpnew_insert (%s, %s, %s, %s)"\
#                        % (quote(pkg), pkg_type, quote(tag), quote(ftpnew_gatherer.code_to_tag_type_map[code])));
            if srcpkg.s['Queue'] != 'ignore':
                # print srcpkg
                srcpkg.check_dict()
                query = """EXECUTE ftpnew_insert_source (%(Source)s, %(Version)s,
                           %(Maintainer)s, %(maintainer_name)s, %(maintainer_email)s,
                           %(Bin)s, %(Changed-By)s, %(Architecture)s, %(Homepage)s,
                           %(Vcs-Type)s, %(Vcs-Url)s, %(Vcs-Browser)s,
                           %(Distribution)s, %(Closes)s, %(License)s,
                           %(Last_modified)s, %(Queue)s)"""
                cur.execute(query, srcpkg.s)
                for binpkg in binpkgs:
                    # print binpkg
                    binpkg.check_dict()
                    query = """EXECUTE ftpnew_insert_package (%(Package)s, %(Version)s,
                               %(Architecture)s, %(Maintainer)s, %(Description)s, %(Source)s,
                               %(Depends)s, %(Recommends)s, %(Suggests)s, %(Enhances)s,
                               %(Pre-Depends)s, %(Breaks)s, %(Replaces)s, %(Provides)s, %(Conflicts)s,
                               %(Installed-Size)s, %(Homepage)s, %(Section)s,
                               %(Long_Description)s, %(License)s)"""
                    try:
                        cur.execute(query, binpkg.b)
                    except IntegrityError, err:
                        # duplicate primary key: report and keep going
                        print >>stderr, err, src_info_html
                        print >>stderr, binpkg
                        print >>stderr, binpkg.b
                        continue

        cur.execute("DEALLOCATE ftpnew_insert_source")
        cur.execute("DEALLOCATE ftpnew_insert_package")
        cur.execute("DEALLOCATE ftpnew_check_existing_package")
+
if __name__ == '__main__':
    # NOTE(review): main() is not defined anywhere in this module, so running
    # the script directly raises NameError.  The module is presumably meant
    # to be driven through get_gatherer() by the UDD framework -- confirm
    # before relying on direct invocation.
    main()
+
+# vim:set et tabstop=2:
More information about the Collab-qa-commits
mailing list