r392 - /debtorrent/trunk/uniquely_projectb.py

camrdale-guest at users.alioth.debian.org camrdale-guest at users.alioth.debian.org
Wed Jun 25 20:28:54 UTC 2008


Author: camrdale-guest
Date: Wed Jun 25 20:28:54 2008
New Revision: 392

URL: http://svn.debian.org/wsvn/debtorrent/?sc=1&rev=392
Log:
Add the new uniquely script that uses the projectb DB.

Added:
    debtorrent/trunk/uniquely_projectb.py   (with props)

Added: debtorrent/trunk/uniquely_projectb.py
URL: http://svn.debian.org/wsvn/debtorrent/debtorrent/trunk/uniquely_projectb.py?rev=392&op=file
==============================================================================
--- debtorrent/trunk/uniquely_projectb.py (added)
+++ debtorrent/trunk/uniquely_projectb.py Wed Jun 25 20:28:54 2008
@@ -1,0 +1,431 @@
+#!/usr/bin/env python
+
+"""Process a Release file, creating, finding and updating any torrent files."""
+
+import sha
+import sys
+import gzip
+import pgdb
+from bz2 import BZ2File
+from math import ceil
+from os import remove, rename, system
+from os.path import exists
+from time import strftime, gmtime
+from debian_bundle import deb822
+from tempfile import mkstemp
+
+# The piece size to use (must match the '-extrapieces' file's piece size)
+DEFAULT_PIECESIZE = 512*1024
+
+# The Packages files to read
+EXTENSION = ".gz"
+
+# The fields to hash to determine the torrent identifier
+# (can not contain Date, Infohash, NextPiece or OriginalPieces)
+DEFAULT_HASH_FIELDS = ["Codename", "Suite", "Component", "Architecture",
+                       "PieceSize", "OriginalDate"]
+
+# The tracker announce URL to use
+DEFAULT_TRACKER = "http://dttracker.debian.net:6969/announce"
+
+# The order to write the headers in (headers not listed won't be written)
+HEADER_ORDER = ["Torrent", "Infohash", "InfohashArchs", "OriginalDate", "Date",
+                "PieceSize", "NextPiece", "OriginalPieces", "Codename", "Suite",
+                "Component", "Architecture", "Tracker", "TorrentHashFields"]
+
+# The maximum increase in the size of the torrent before it is reset
+MAX_SIZE_INCREASE = 2.0
+
+class Torrent(deb822._multivalued):
+    """For reading piece numbers from a unique piece number torrent file.
+    
+    All parsing is done by the deb822 base class; this subclass only
+    declares which field is multivalued and how its lines are split.
+    NOTE(review): deb822._multivalued is a private class of the
+    python-debian package -- confirm it still exists before upgrading.
+    """
+    # Maps the multivalued field name (lower-cased) to the names of the
+    # sub-fields parsed from each line of its value: each 'PieceNumbers'
+    # line holds a starting piece number and a file name.
+    _multivalued_fields = {
+        "piecenumbers": [ "number", "file" ],
+    }
+
+def read_release(filename):
+    """Read the headers and Packages file names from a Release file.
+    
+    @type filename: C{string}
+    @param filename: the Release file to read
+    @rtype: C{dictionary}, C{list} of C{string}
+    @return: the headers and full file names of Packages files
+    
+    """
+
+    # Initialize the Release file variables
+    release_dir = filename.rsplit('/', 1)[0]
+    read_packages = False
+    headers = {}
+    packages = []
+    
+    f = open(filename, 'r')
+    
+    rel = deb822.Release(f)
+    for header in rel:
+        if header.lower() not in ["md5sum", "sha1", "sha256"]:
+            # Read the headers from the file
+            headers[header] = rel[header]
+            
+    # Read the Packages file names
+    for file in rel.get('MD5Sum', []):
+        if file['name'].endswith("Packages"+EXTENSION) and release_dir + "/" + file['name'] not in packages:
+            packages.append(release_dir + "/" + file['name'])
+    for file in rel.get('SHA1', []):
+        if file['name'].endswith("Packages"+EXTENSION) and release_dir + "/" + file['name'] not in packages:
+            packages.append(release_dir + "/" + file['name'])
+    for file in rel.get('SHA256', []):
+        if file['name'].endswith("Packages"+EXTENSION) and release_dir + "/" + file['name'] not in packages:
+            packages.append(release_dir + "/" + file['name'])
+    
+    f.close()
+    
+    return headers, packages
+
+def get_old(old_file):
+    """Read the headers and piece ordering data from an old file.
+    
+    @type old_file: C{string}
+    @param old_file: the old piece ordering file to open
+    @rtype: C{dictionary}, C{dictionary}
+    @return: the old piece ordering (keys are the file names, values are the
+        starting piece number) and headers
+    
+    """
+
+    pieces = {}
+    headers = {}
+    
+    try:
+        f = gzip.open(old_file, 'r')
+    
+        tor = Torrent(f)
+        for header in tor:
+            if header.lower() != 'piecenumbers':
+                # Read the headers from the file
+                headers[header] = tor[header]
+                
+        # Read the piece ordering data from the file
+        for piece in tor['PieceNumbers']:
+            pieces[piece['file']] = int(piece['number'])
+
+        f.close()
+    except:
+        # Delete the file and return empty variables to create a new torrent
+        if exists(old_file):
+            remove(old_file)
+    
+    return pieces, headers
+
+def update_headers(headers, release_headers, component, arch):
+    """Update the headers with new fields from the Release file.
+    
+    @type headers: C{dictionary}
+    @param headers: the headers from the piece ordering file
+    @type release_headers: C{dictionary}
+    @param release_headers: the headers from the Release file
+    @type component: C{string}
+    @param component: the component name (e.g. main, contrib, non-free)
+    @type arch: C{string}
+    @param arch: the architecture name (e.g. i386, amd64, all)
+    @rtype: C{boolean}
+    @return: whether a new torrent has been created
+    
+    """
+
+    # Set any required Release headers
+    if len(release_headers.get("Date", "")) == 0:
+        # Use today's date
+        release_headers["Date"] = strftime('%a, %d %b %Y %H:%M:%S +0000', gmtime())
+    
+    # Create/update the headers
+    headers.setdefault("OriginalDate", release_headers["Date"])
+    headers["Date"] = release_headers["Date"]
+    headers.setdefault("PieceSize", str(DEFAULT_PIECESIZE))
+    headers.setdefault("NextPiece", str(0))
+    headers["Codename"] = release_headers.get("Codename", "")
+    headers["Suite"] = release_headers.get("Suite", "")
+    headers["Component"] = component
+    headers["Architecture"] = arch
+    headers.setdefault("Tracker", DEFAULT_TRACKER)
+    headers.setdefault("TorrentHashFields", " ".join(DEFAULT_HASH_FIELDS))
+    
+    # Calculate the new hash
+    sha1 = sha.new()
+    for header in headers["TorrentHashFields"].split():
+        sha1.update(headers[header])
+    new_hash = sha1.hexdigest()
+    
+    # Check if the hash has changed or the torrent is too big
+    if (headers.get("Torrent", "") == new_hash and
+        int(headers.get("NextPiece")) < 
+        int(headers.get("OriginalPieces", "0"))*MAX_SIZE_INCREASE):
+        return False
+    else:
+        # If it has, then reset the torrent to create a new one
+        headers["OriginalDate"] = release_headers["Date"]
+        headers["NextPiece"] = str(0)
+        headers.pop("OriginalPieces", "")
+        sha1 = sha.new()
+        for header in headers["TorrentHashFields"].split():
+            sha1.update(headers[header])
+        headers["Torrent"] = sha1.hexdigest()
+
+        return True
+
+def get_new(db, suite, codename, component, arch, old_files):
+    """Read the new piece data from a Packages file.
+    
+    Reads the Packages file, finding old files in it and copying their data to
+    the new ordering, and adding any new files found to the end of the
+    ordering. The old_files input is modified by removing the found files from
+    it, and the 'NextPiece' header in the input headers is changed.
+    
+    @type db: C{DB-APIv2 connection}
+    @param db: an open connection to the projectb database
+    @type suite: C{string}
+    @param suite: the suite name (e.g. testing, unstable)
+    @type codename: C{string}
+    @param codename: the codename of the suite (e.g. sid, lenny)
+    @type component: C{string}
+    @param component: the component name (e.g. main, contrib, non-free)
+    @type arch: C{string}
+    @param arch: the architecture name (e.g. i386, amd64, all)
+    @type old_files: C{dictionary}
+    @param old_files: the original piece ordering, keys are the file names,
+        values are the starting piece number
+    @rtype: C{dictionary}
+    @return: the new piece ordering, keys are the starting piece numbers,
+        values are the file names
+    
+    """
+
+    c = db.cursor()
+    c.execute("select location.path, files.filename, files.size " + 
+              "from binaries join files on binaries.file = files.id " + 
+                            "join location on files.location = location.id " +
+                            "join architecture on binaries.architecture = architecture.id " + 
+                            "join bin_associations on binaries.id = bin_associations.bin " +
+                            "join suite on bin_associations.suite = suite.id " +
+                            "join component on location.component = component.id " +
+              "where suite_name = '" + suite +"' and component.name = '" + component + "' and " +
+                     "arch_string = '" + arch + "' " +
+              "order by location.path, files.filename")
+
+    pieces = {}
+    new_pieces = []
+    
+    res = c.fetchone()
+    while res:
+        filename = '/'.join(res[0].split('/')[-2:]) + res[1]
+        size = res[2]
+        if filename.endswith('.deb'):
+            # Check which torrent to add the info to
+            if filename in old_files:
+                # Found old file, so add it
+                pieces[old_files[filename]] = filename
+                del old_files[filename]
+            else:
+                # Found new file, save it for later processing
+                new_pieces.append((filename, long(size)))
+        res = c.fetchone()
+        
+    c.close()
+
+    return pieces, new_pieces
+
+def add_new(pieces, new_pieces, headers):
+    """Read the new piece data from a Packages file.
+    
+    Adds new files to the end of the piece ordering. The 'pieces' input is 
+    modified by having the new pieces added to it. The 'new_pieces' input
+    list is sorted. The 'NextPiece' header in the input 'headers' is updated.
+    
+    @type pieces: C{dictionary}
+    @param pieces: the current piece ordering, keys are the starting piece
+        numbers, values are the file names
+    @type new_pieces: C{list} of (C{string}, C{long})
+    @param new_pieces: the file name and file size of the new files that have
+        been found and are to be added to the pirce ordering
+    @type headers: C{dictionary}
+    @param headers: the headers from the piece ordering file
+    
+    """
+
+    # Get the needed header information
+    next_piece = int(headers["NextPiece"])
+    piece_size = int(headers["PieceSize"])
+    
+    new_pieces.sort()
+    old_file = ""
+    old_size = 0L
+    for (file, size) in new_pieces:
+        if file == old_file:
+            if size != old_size:
+                print "WARNING: multiple files with different size:", file
+        else:
+            pieces[next_piece] = file
+            next_piece += int(ceil(size/float(piece_size)))
+            
+        old_file = file
+        old_size = size
+
+    # Set the final header values
+    headers["NextPiece"] = str(next_piece)
+    headers.setdefault("OriginalPieces", headers["NextPiece"])
+
+def write_file(filename, pieces, headers):
+    """Print the new data to the file.
+    
+    @type filename: C{string}
+    @param filename: the file to write to
+    @type pieces: C{dictionary}
+    @param pieces: the current piece ordering, keys are the starting piece
+        numbers, values are the file names
+    @type headers: C{dictionary}
+    @param headers: the headers from the piece ordering file
+    
+    """
+
+    f = gzip.open(filename + '.new', 'w')
+    
+    # Write the headers
+    for header in HEADER_ORDER:
+        if header in headers:
+            f.write("%s: %s\n" % (header, headers[header]))
+    f.write("PieceNumbers:\n")
+    
+    # Write the starting piece numbers
+    ps = pieces.keys()
+    ps.sort()
+    format_string = " %"+str(len(str(max(ps))))+"d %s\n"
+    for p in ps:
+        f.write(format_string % (p, pieces[p]))
+    
+    f.close()
+    rename(filename + '.new', filename)
+
+def run(db, releasefile):
+    """Process a single Release file.
+    
+    For each component listed in the Release file, first the architecture
+    independent ('all') piece ordering file is updated from the database,
+    then the piece ordering file for each listed architecture.  Orderings
+    that would be empty are removed instead of written.
+    
+    @type db: C{DB-APIv2 connection}
+    @param db: an open connection to the projectb database
+    @type releasefile: C{string}
+    @param releasefile: the Release file to process
+
+    """
+    
+    # Process the Release file
+    print "Processing: %s" % releasefile
+    release_headers, packages = read_release(releasefile)
+    
+    suite = release_headers['Suite']
+    codename = release_headers["Codename"]
+    # Piece ordering files mirror the dists directory layout, e.g.
+    # dists_<codename>_<component>_binary-<arch>_Packages-torrent.gz
+    torrent_prefix = "dists_" + codename + "_"
+    torrent_suffix = "_Packages-torrent.gz"
+    
+    for component in release_headers["Components"].split():
+        # Get the old 'all' data
+        all_file = torrent_prefix + component + "_binary-all" + torrent_suffix
+        print all_file + ": reading ...",
+        sys.stdout.flush()
+        old_all_pieces, all_headers = get_old(all_file)
+    
+        # First update the 'all' headers
+        if update_headers(all_headers, release_headers, component, "all"):
+            # If it has, then reset the torrent
+            print "new torrent created ...",
+            sys.stdout.flush()
+            old_all_pieces = {}
+    
+        # Parse the database for the new data
+        print "updating ...",
+        sys.stdout.flush()
+        all_pieces, all_new_pieces = get_new(db, suite, codename, component, 'all',
+                                             old_all_pieces)
+
+        # Add the old removed pieces so out-of-date mirrors will work too
+        for file in old_all_pieces:
+            all_pieces[old_all_pieces[file]] = file
+
+        # If there were 'all' files found
+        if all_pieces or all_new_pieces:
+            # Process the new 'all' files found
+            add_new(all_pieces, all_new_pieces, all_headers)
+        
+            # Write the all_headers
+            print "writing ...",
+            sys.stdout.flush()
+            write_file(all_file, all_pieces, all_headers)
+        else:
+            # No 'all' packages in this component: remove any stale file
+            print "empty ...",
+            if exists(all_file):
+                remove(all_file)
+    
+        print "done."
+    
+        for arch in release_headers["Architectures"].split():
+            torrent_file = torrent_prefix + component + "_binary-" + arch + torrent_suffix
+    
+            # Find the Packages file that will be parsed
+            # NOTE(review): this is substring matching, so a component or
+            # arch whose name is a substring of another could pick the
+            # wrong Packages file -- confirm the archive layout rules
+            # such names out
+            found = False
+            for filename in packages:
+                if (filename.find(component) >= 0 and 
+                    filename.find("binary-"+arch) >= 0):
+                    found = True
+                    break
+            if not found:
+                print "WARNING: no matching Packages file for component %s, arch %s" % (component, arch)
+                if exists(torrent_file):
+                    remove(torrent_file)
+                continue
+            # Remove the used Packages file so leftovers are reported below
+            packages.pop(packages.index(filename))
+    
+            # Get the old data for this torrent, if any existed
+            print torrent_file + ": reading ...",
+            sys.stdout.flush()
+            old_pieces, headers = get_old(torrent_file)
+    
+            # Update the headers from the Release file ones
+            if update_headers(headers, release_headers, component, arch):
+                print "new torrent created ...",
+                sys.stdout.flush()
+                old_pieces = {}
+    
+            # Parse the database for the new data
+            print "updating ...",
+            sys.stdout.flush()
+            pieces, new_pieces = get_new(db, suite, codename, component, arch,
+                                         old_pieces)
+
+            # Add the old removed pieces so out-of-date mirrors will work too
+            for file in old_pieces:
+                pieces[old_pieces[file]] = file
+    
+            if pieces or new_pieces:
+                # Add any new pieces to the end of pieces
+                add_new(pieces, new_pieces, headers)
+                
+                # Write the headers
+                print "writing ...",
+                sys.stdout.flush()
+                write_file(torrent_file, pieces, headers)
+            else:
+                print "empty ...",
+                if exists(torrent_file):
+                    remove(torrent_file)
+                
+            print "done."
+    
+    # Report any Packages files that matched no component/arch pair
+    if packages:
+        print "The following packages files were not used:"
+        for package in packages:
+            print "    %s" % package
+
+if __name__ == '__main__':
+    if len(sys.argv) >= 2:
+        db = pgdb.connect(database = 'projectb')
+        for file in sys.argv[1:]:
+            run(db, file)
+        db.close()
+    else:
+        print "Usage: " + sys.argv[0] + " Releasefile [Releasefile ...]"

Propchange: debtorrent/trunk/uniquely_projectb.py
------------------------------------------------------------------------------
    svn:executable = *




More information about the Debtorrent-commits mailing list