[Debtags-commits] [svn] r1556 - tagcoll/trunk/tagcoll

Enrico Zini enrico at costa.debian.org
Thu Feb 9 22:25:05 UTC 2006


Author: enrico
Date: Thu Feb  9 22:25:03 2006
New Revision: 1556

Added:
   tagcoll/trunk/tagcoll/IntIndex.cc
      - copied, changed from r1555, tagcoll/trunk/tagcoll/TDBIndexer.cc
   tagcoll/trunk/tagcoll/IntIndex.h
      - copied, changed from r1555, tagcoll/trunk/tagcoll/TDBIndexer.h
Modified:
   tagcoll/trunk/tagcoll/Makefile.am
Log:
Added IntIndex as a base to implement a faster mmapped mapping index

Copied: tagcoll/trunk/tagcoll/IntIndex.cc (from r1555, tagcoll/trunk/tagcoll/TDBIndexer.cc)
==============================================================================
--- tagcoll/trunk/tagcoll/TDBIndexer.cc	(original)
+++ tagcoll/trunk/tagcoll/IntIndex.cc	Thu Feb  9 22:25:03 2006
@@ -1,7 +1,7 @@
 /*
- * Fast index for tag data
+ * Fast index for tag data, based on integer indexes
  *
- * Copyright (C) 2005  Enrico Zini <enrico at debian.org>
+ * Copyright (C) 2006  Enrico Zini <enrico at debian.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -18,175 +18,151 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
-#include <tagcoll/TDBIndexer.h>
+#include <tagcoll/IntIndex.h>
 
-#include <tdb.h>
-#include <fcntl.h>	// O_RDONLY
-#include <string.h>	// strlen
-#include <errno.h>
-#include <assert.h>
-
-/*
 #include <stdlib.h>
-*/
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
 
 using namespace std;
 using namespace Tagcoll;
 
-
-static void set_tdb(TDB_CONTEXT* db, const std::string& key, OpSet<std::string> vals) throw (SystemException)
+IntIndex::IntIndex(const std::string& filename) : m_filename(filename), m_fd(-1), m_buf(0)
 {
-	TDB_DATA k, v;
-	string payload;
-
-	k.dptr  = const_cast<char*>(key.data());
-	k.dsize = key.size();
+	// Open the file
+	if ((m_fd = open(m_filename.c_str(), O_RDONLY)) == -1)
+		throw SystemException(errno, "opening index file " + filename);
 
-	// Serialize the string list
-	for (OpSet<std::string>::const_iterator i = vals.begin(); i != vals.end(); i++)
-		payload += *i + '\0';
-
-	v.dptr = const_cast<char*>(payload.data());
-	v.dsize = payload.size();
-
-	if (tdb_store(db, k, v, TDB_REPLACE) == -1)
-		throw SystemException(errno, "Writing key " + key + " to on-disk index (tdb says: " + tdb_errorstr(db) + ")");
+	m_size = lseek(m_fd, 0, SEEK_END);
+	if (m_size == (off_t)-1)
+	{
+		close(m_fd);
+		throw SystemException(errno, "reading the size of index file " + m_filename);
+	}
+	
+	// Map the file into memory
+	if ((m_buf = (const int*)mmap(0, m_size, PROT_READ, MAP_PRIVATE, m_fd, 0)) == MAP_FAILED)
+	{
+		close(m_fd);
+		throw SystemException(errno, string("mmapping file ") + m_filename);
+	}
 }
 
-template<class ITEM, class TAG>
-void TDBIndexer<ITEM, TAG>::writeIndex(
-		Converter<ITEM, std::string>& itemconv,
-		Converter<TAG, std::string>& tagconv,
-		const std::string& pkgidx, const std::string& tagidx) const
+IntIndex::~IntIndex()
 {
-	TDB_CONTEXT* db;
-
-	db = tdb_open(pkgidx.c_str(), 0, 0, O_RDWR | O_CREAT, 0666);
-	if (db == NULL)
-			throw SystemException(errno, "opening index file " + pkgidx);
-	for (typename map<ITEM, OpSet<TAG> >::const_iterator i = items.begin();
-			i != items.end(); i++)
-		set_tdb(db, itemconv(i->first), tagconv(i->second));
-	tdb_close(db);
-
-	db = tdb_open(tagidx.c_str(), 0, 0, O_RDWR | O_CREAT, 0666);
-	if (db == NULL)
-			throw SystemException(errno, "opening index file " + tagidx);
-	for (typename map<TAG, OpSet<ITEM> >::const_iterator i = tags.begin();
-			i != tags.end(); i++)
-		set_tdb(db, tagconv(i->first), itemconv(i->second));
-
-	tdb_close(db);
+	// Unmap and close the file
+	munmap((void*)m_buf, m_size);
+	close(m_fd);
 }
 
 
-template<class ITEM, class TAG>
-OpSet<ITEM> TDBIndexer<ITEM, TAG>::getItemsHavingTag(const TAG& tag) const
+int IntIndexer::encodedSize() const
 {
-	typename map<TAG, OpSet<ITEM> >::const_iterator i = tags.find(tag);
-	if (i != tags.end())
-		return i->second;
-	else
-		return OpSet<ITEM>();
-}
+	// First the offset table plus one length field per array (one int each)
+	int bufsize = size() * 2;
+	
+	// Then the number of ints stored in each of the arrays
+	for (const_iterator i = begin(); i != end(); i++)
+		bufsize += i->size();
 
-template<class ITEM, class TAG>
-OpSet<TAG> TDBIndexer<ITEM, TAG>::getTagsOfItem(const ITEM& item) const
-{
-	typename map<ITEM, OpSet<TAG> >::const_iterator i = items.find(item);
-	if (i != items.end())
-		return i->second;
-	else
-		return OpSet<TAG>();
+	return bufsize * sizeof(int);
 }
 
-template<class ITEM, class TAG>
-OpSet<ITEM> TDBIndexer<ITEM, TAG>::getTaggedItems() const
+void IntIndexer::encode(int* buf) const
 {
-	OpSet<ITEM> res;
-	for (typename map<ITEM, OpSet<TAG> >::const_iterator i = items.begin();
-			i != items.end(); i++)
-		res += i->first;
-	return res;
+	int pos = size();
+	for (size_t i = 0; i < size(); i++)
+	{
+		buf[i] = pos;
+		buf[pos++] = (*this)[i].size();
+		for (set<int>::const_iterator j = (*this)[i].begin(); j != (*this)[i].end(); j++)
+			buf[pos++] = *j;
+	}
 }
 
-template<class ITEM, class TAG>
-OpSet<TAG> TDBIndexer<ITEM, TAG>::getAllTags() const
+void IntIndexer::write(const std::string& filename)
 {
-	OpSet<TAG> res;
-	for (typename map<TAG, OpSet<ITEM> >::const_iterator i = tags.begin();
-			i != tags.end(); i++)
-		res += i->first;
-	return res;
-}
+	// Create a temporary file next to the target file
+	char name[filename.size() + 8];
+	memcpy(name, filename.data(), filename.size());
+	memcpy(name + filename.size(), ".XXXXXX", 8);
+	int fd = mkstemp(name);
+	if (fd == -1)
+		throw SystemException(errno, "creating temporary file " + filename + ".XXXXXX");
+
+#if 0
+	int size = encodedSize();
+	int buf[size];
+	encode(buf);
+#else
+	// Enlarge the temporary file to fit the data
+	int size = encodedSize();
+	if (lseek(fd, size - 1, SEEK_SET) == -1)
+	{
+		close(fd);
+		unlink(name);
+		throw SystemException(errno, string("enlarging file (seek) ") + name);
+	}
 
-template<class ITEM, class TAG>
-void TDBIndexer<ITEM, TAG>::output(Consumer<ITEM, TAG>& consumer) const
-{
-	for (typename map<ITEM, OpSet<TAG> >::const_iterator i = items.begin();
-			i != items.end(); i++)
-		consumer.consume(i->first, i->second);
-}
+	// Write one byte at the end to actually resize the file
+	if (::write(fd, &fd, 1) == -1)
+	{
+		close(fd);
+		unlink(name);
+		throw SystemException(errno, string("enlarging file (write) ") + name);
+	}
 
-template<class ITEM, class TAG>
-void TDBIndexer<ITEM, TAG>::applyChange(const PatchList<ITEM, TAG>& change)
-{
-	for (typename PatchList<ITEM, TAG>::const_iterator i = change.begin(); i != change.end(); i++)
+	// Map the file into memory
+	void* buf = mmap(0, size, PROT_WRITE, MAP_SHARED, fd, 0);
+	if (buf == MAP_FAILED)
 	{
-		// Save the previous tagset in `rev'
-		OpSet<TAG> prevTags = getTags(i->first);
-		OpSet<TAG> nextTags = i->second.apply(prevTags);
+		close(fd);
+		unlink(name);
+		throw SystemException(errno, string("mmapping file ") + name);
+	}
 
-		// Set the new tagset in the item
-		items[i->first] = nextTags;
+	// Write the index data to the file
+	encode((int*)buf);
 
-		// Fix the itemsets in the involved tags
-		OpSet<TAG> t = prevTags - nextTags;
-		for (typename OpSet<TAG>::const_iterator j = t.begin(); j != t.end(); j++)
-		{
-			OpSet<TAG> items = getItems(*j) - i->first;
-			if (items.empty())
-				tags.erase(*j);
-			else
-				tags[*j] = items;
-		}
-		t = nextTags - prevTags;
-		for (typename OpSet<TAG>::const_iterator j  = t.begin(); j != t.end(); j++)
-			tags[*j] += i->first;
+	// Unmap the file
+	if (munmap(buf, size) == -1)
+	{
+		close(fd);
+		unlink(name);
+		throw SystemException(errno, string("munmapping file ") + name);
 	}
-}
-
+#endif
 
-template<class ITEM, class TAG>
-void TDBIndexer<ITEM, TAG>::consumeItem(const ITEM& item, const OpSet<TAG>& tags)
-{
-	// Add the tags to the item
-	items[item] += tags;
+	// Give the file the right permissions according to umask
 
-	// Add the item to the tags
-	for (typename OpSet<TAG>::const_iterator i = tags.begin(); i != tags.end(); i++)
-		this->tags[*i] += item;
-}
+	// Read the current umask
+	mode_t mask = umask(0);
+	umask(mask);
+	// Set the file permissions
+	if (fchmod(fd, 0666 & ~mask) == -1)
+	{
+		close(fd);
+		unlink(name);
+		throw SystemException(errno, string("setting permissions on file ") + name);
+	}
 
-template<class ITEM, class TAG>
-void TDBIndexer<ITEM, TAG>::consumeItems(const OpSet<ITEM>& items, const OpSet<TAG>& tags)
-{
-	for (typename OpSet<ITEM>::const_iterator i = items.begin(); i != items.end(); i++)
-		// Add the tags to the item
-		this->items[*i] += tags;
+	// Close the file
+	close(fd);
 
-	for (typename OpSet<TAG>::const_iterator i = tags.begin(); i != tags.end(); i++)
-		// Add the items to the tag
-		this->tags[*i] += items;
+	// Rename to the final file name, performing the atomic update
+	if (rename(name, filename.c_str()) == -1)
+	{
+		unlink(name);
+		throw SystemException(errno, string("renaming file ") + name + " into " + filename);
+	}
 }
 
-#ifndef INSTANTIATING_TEMPLATES
-#include <string>
-
-namespace Tagcoll {
-	template class TDBIndexer<std::string, std::string>;
-}
-#endif
 
 #ifdef COMPILE_TESTSUITE
 
@@ -195,16 +171,47 @@
 namespace tut {
 using namespace tut_tagcoll;
 
-struct tagcoll_tdbindexer_shar {
+struct tagcoll_intindex_shar {
 };
-TESTGRP(tagcoll_tdbindexer);
+TESTGRP(tagcoll_intindex);
 
 template<> template<>
 void to::test<1>()
 {
-	TDBIndexer<string, string> coll;
+	static const char* fname = "tagcoll_intindex.tmp";
+
+	// Create the index
+	IntIndexer indexer;
+	indexer.map(4, 1);
+	indexer.map(4, 2);
+	indexer.map(2, 1);
+	indexer.map(0, 5);
+	indexer.map(0, 8);
+	indexer.map(0, 1);
+	indexer.map(0, 7);
+	indexer.write(fname);
+
+	// Read the index
+	IntIndex index(fname);
+
+	// Check that the arrays have the right size
+	ensure_equals(index.size(0), 4);
+	ensure_equals(index.size(1), 0);
+	ensure_equals(index.size(2), 1);
+	ensure_equals(index.size(3), 0);
+	ensure_equals(index.size(4), 2);
+
+	// Check that the arrays are sorted and contain the right data
+	ensure_equals(index.data(0)[0], 1);
+	ensure_equals(index.data(0)[1], 5);
+	ensure_equals(index.data(0)[2], 7);
+	ensure_equals(index.data(0)[3], 8);
+	ensure_equals(index.data(2)[0], 1);
+	ensure_equals(index.data(4)[0], 1);
+	ensure_equals(index.data(4)[1], 2);
 
-	test_tagged_collection(coll);
+	// Delete the test index
+	//unlink(fname);
 }
 
 }

Copied: tagcoll/trunk/tagcoll/IntIndex.h (from r1555, tagcoll/trunk/tagcoll/TDBIndexer.h)
==============================================================================
--- tagcoll/trunk/tagcoll/TDBIndexer.h	(original)
+++ tagcoll/trunk/tagcoll/IntIndex.h	Thu Feb  9 22:25:03 2006
@@ -1,12 +1,12 @@
-#ifndef TDB_INDEXER_H
-#define TDB_INDEXER_H
+#ifndef TAGCOLL_INT_INDEX_H
+#define TAGCOLL_INT_INDEX_H
 
 /** \file
- * Fast index for tag data
+ * Fast index for tag data, based on integer indexes
  */
 
 /*
- * Copyright (C) 2005  Enrico Zini <enrico at debian.org>
+ * Copyright (C) 2006  Enrico Zini <enrico at debian.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -23,64 +23,69 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
-#include <tagcoll/Collection.h>
-#include <tagcoll/Serializer.h>
 #include <tagcoll/Exception.h>
-#include <map>
+#include <vector>
+#include <set>
 
 namespace Tagcoll
 {
 
 /**
- * In-memory collection whose representation is organised similarly to the
- * on-disk representation used by TDBDiskIndex.
+ * MMap-based index of a -> [x1, x2, x3] mappings
  *
- * It can be used as a working collection, and it can also write its contents
- * into an on-disk index that can later be used by TDBDiskIndex.
+ * The layout is:
  *
- * On-disk indexes produced by TDBIndexer are written in a single, optimised
- * run and tend to be more compact than the ones created by TDBDiskIndex.
+ * [offset of mapping for item 0, offset of mapping for item 1...]
+ * [size of array][sorted array of ints pointed to by index 0]
+ * [size of array][sorted array of ints pointed to by index 1]
+ * [size of array][sorted array of ints pointed to by index 2]
+ * [...]
+ *
+ * This allows fast lookups, as well as fast computation of unions or
+ * intersections of mapped arrays.
+ *
+ * This class tries to be a simple and fast component for building indexes, and
+ * thus no bound checking is performed.
  */
-template<class ITEM, class TAG>
-class TDBIndexer : public Collection<ITEM, TAG>
+class IntIndex
 {
 protected:
-	std::map<ITEM, OpSet<TAG> > items;
-	std::map<TAG, OpSet<ITEM> > tags;
-
-	virtual void consumeItem(const ITEM& item, const OpSet<TAG>& tags);
-	virtual void consumeItems(const OpSet<ITEM>& items, const OpSet<TAG>& tags);
+	std::string m_filename;
+	int m_size;
+	int m_fd;
+	const int* m_buf;
+	
+public:
+	IntIndex(const std::string& filename);
+	~IntIndex();
 
-	virtual OpSet<ITEM> getItemsHavingTag(const TAG& tag) const;
-	virtual OpSet<TAG> getTagsOfItem(const ITEM& item) const;
+	const int* data(unsigned int val) { return m_buf + m_buf[val] + 1; }
+	int size(unsigned int val) { return m_buf[m_buf[val]]; }
+};
 
+/**
+ * Creates an on-disk index to use for IntIndex
+ */
+class IntIndexer : public std::vector<std::set<int> >
+{
 public:
-	virtual ~TDBIndexer() {}
+	/// Store the key->val mapping into the indexer
+	void map(unsigned int key, int val)
+	{
+		if (size() <= key)
+			resize(key + 1);
+		(*this)[key].insert(val);
+	}
+	
+	/// Return the size of the encoded index data
+	int encodedSize() const;
+
+	/// Write the index data into the given buffer, which must be at least
+	/// encodedSize() bytes long
+	void encode(int* buf) const;
 
-    virtual bool hasItem(const ITEM& item) const { return items.find(item) != items.end(); }
-    virtual bool hasTag(const TAG& tag) const { return tags.find(tag) != tags.end(); }
-	virtual OpSet<ITEM> getTaggedItems() const;
-	virtual OpSet<TAG> getAllTags() const;
-	virtual void output(Consumer<ITEM, TAG>& consumer) const;
-	virtual void applyChange(const PatchList<ITEM, TAG>& change);
-
-	/**
-	 * Write all collected informations to a disk index
-	 *
-	 * \param itemconv
-	 *   Converter than can convert an ITEM to a string
-	 * \param tagconv
-	 *   Converter than can convert a TAG to a string
-	 * \param pkgidx
-	 *   File name for the package index to write
-	 * \param tagidx
-	 *   File name for the tag index to write
-	 */
-	void writeIndex(
-			Converter<ITEM, std::string>& itemconv,
-			Converter<TAG, std::string>& tagconv,
-			const std::string& pkgidx,
-			const std::string& tagidx) const;
+	/// Atomically write the index to the given file
+	void write(const std::string& filename);
 };
 
 };

Modified: tagcoll/trunk/tagcoll/Makefile.am
==============================================================================
--- tagcoll/trunk/tagcoll/Makefile.am	(original)
+++ tagcoll/trunk/tagcoll/Makefile.am	Thu Feb  9 22:25:03 2006
@@ -38,6 +38,8 @@
 		TDBDiskIndex.cc \
 		TDBReadonlyDiskIndex.h \
 		\
+		IntIndex.h \
+		\
 		Filters.h \
 		Implications.h \
 		Implications.cc \
@@ -75,6 +77,8 @@
 		TDBDiskIndex.cc \
 		TDBReadonlyDiskIndex.cc \
 		\
+		IntIndex.cc \
+		\
 		Filters.cc \
 		Implications.cc \
 		\



More information about the Debtags-commits mailing list