[Debtags-commits] [svn] r1556 - tagcoll/trunk/tagcoll
Enrico Zini
enrico at costa.debian.org
Thu Feb 9 22:25:05 UTC 2006
Author: enrico
Date: Thu Feb 9 22:25:03 2006
New Revision: 1556
Added:
tagcoll/trunk/tagcoll/IntIndex.cc
- copied, changed from r1555, tagcoll/trunk/tagcoll/TDBIndexer.cc
tagcoll/trunk/tagcoll/IntIndex.h
- copied, changed from r1555, tagcoll/trunk/tagcoll/TDBIndexer.h
Modified:
tagcoll/trunk/tagcoll/Makefile.am
Log:
Added IntIndex as a base to implement a faster mmapped mapping index
Copied: tagcoll/trunk/tagcoll/IntIndex.cc (from r1555, tagcoll/trunk/tagcoll/TDBIndexer.cc)
==============================================================================
--- tagcoll/trunk/tagcoll/TDBIndexer.cc (original)
+++ tagcoll/trunk/tagcoll/IntIndex.cc Thu Feb 9 22:25:03 2006
@@ -1,7 +1,7 @@
/*
- * Fast index for tag data
+ * Fast index for tag data, based on integer indexes
*
- * Copyright (C) 2005 Enrico Zini <enrico at debian.org>
+ * Copyright (C) 2006 Enrico Zini <enrico at debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -18,175 +18,151 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-#include <tagcoll/TDBIndexer.h>
+#include <tagcoll/IntIndex.h>
-#include <tdb.h>
-#include <fcntl.h> // O_RDONLY
-#include <string.h> // strlen
-#include <errno.h>
-#include <assert.h>
-
-/*
#include <stdlib.h>
-*/
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
using namespace std;
using namespace Tagcoll;
-
-static void set_tdb(TDB_CONTEXT* db, const std::string& key, OpSet<std::string> vals) throw (SystemException)
+IntIndex::IntIndex(const std::string& filename) : m_filename(filename), m_fd(-1), m_buf(0)
{
- TDB_DATA k, v;
- string payload;
-
- k.dptr = const_cast<char*>(key.data());
- k.dsize = key.size();
+ // Open the file
+ if ((m_fd = open(m_filename.c_str(), O_RDONLY)) == -1)
+ throw SystemException(errno, "opening index file " + filename);
- // Serialize the string list
- for (OpSet<std::string>::const_iterator i = vals.begin(); i != vals.end(); i++)
- payload += *i + '\0';
-
- v.dptr = const_cast<char*>(payload.data());
- v.dsize = payload.size();
-
- if (tdb_store(db, k, v, TDB_REPLACE) == -1)
- throw SystemException(errno, "Writing key " + key + " to on-disk index (tdb says: " + tdb_errorstr(db) + ")");
+ m_size = lseek(m_fd, 0, SEEK_END);
+ if (m_size == (off_t)-1)
+ {
+ close(m_fd);
+ throw SystemException(errno, "reading the size of index file " + m_filename);
+ }
+
+ // Map the file into memory
+ if ((m_buf = (const int*)mmap(0, m_size, PROT_READ, MAP_PRIVATE, m_fd, 0)) == MAP_FAILED)
+ {
+ close(m_fd);
+ throw SystemException(errno, string("mmapping file ") + m_filename);
+ }
}
-template<class ITEM, class TAG>
-void TDBIndexer<ITEM, TAG>::writeIndex(
- Converter<ITEM, std::string>& itemconv,
- Converter<TAG, std::string>& tagconv,
- const std::string& pkgidx, const std::string& tagidx) const
+IntIndex::~IntIndex()
{
- TDB_CONTEXT* db;
-
- db = tdb_open(pkgidx.c_str(), 0, 0, O_RDWR | O_CREAT, 0666);
- if (db == NULL)
- throw SystemException(errno, "opening index file " + pkgidx);
- for (typename map<ITEM, OpSet<TAG> >::const_iterator i = items.begin();
- i != items.end(); i++)
- set_tdb(db, itemconv(i->first), tagconv(i->second));
- tdb_close(db);
-
- db = tdb_open(tagidx.c_str(), 0, 0, O_RDWR | O_CREAT, 0666);
- if (db == NULL)
- throw SystemException(errno, "opening index file " + tagidx);
- for (typename map<TAG, OpSet<ITEM> >::const_iterator i = tags.begin();
- i != tags.end(); i++)
- set_tdb(db, tagconv(i->first), itemconv(i->second));
-
- tdb_close(db);
+ // Unmap and close the file
+ munmap((void*)m_buf, m_size);
+ close(m_fd);
}
-template<class ITEM, class TAG>
-OpSet<ITEM> TDBIndexer<ITEM, TAG>::getItemsHavingTag(const TAG& tag) const
+int IntIndexer::encodedSize() const
{
- typename map<TAG, OpSet<ITEM> >::const_iterator i = tags.find(tag);
- if (i != tags.end())
- return i->second;
- else
- return OpSet<ITEM>();
-}
+ // First the offset array (one slot per item), plus one length slot per array
+ int bufsize = size() * 2;
+
+ // Then the elements of all the arrays
+ for (const_iterator i = begin(); i != end(); i++)
+ bufsize += i->size();
-template<class ITEM, class TAG>
-OpSet<TAG> TDBIndexer<ITEM, TAG>::getTagsOfItem(const ITEM& item) const
-{
- typename map<ITEM, OpSet<TAG> >::const_iterator i = items.find(item);
- if (i != items.end())
- return i->second;
- else
- return OpSet<TAG>();
+ return bufsize * sizeof(int);
}
-template<class ITEM, class TAG>
-OpSet<ITEM> TDBIndexer<ITEM, TAG>::getTaggedItems() const
+void IntIndexer::encode(int* buf) const
{
- OpSet<ITEM> res;
- for (typename map<ITEM, OpSet<TAG> >::const_iterator i = items.begin();
- i != items.end(); i++)
- res += i->first;
- return res;
+ int pos = size();
+ for (size_t i = 0; i < size(); i++)
+ {
+ buf[i] = pos;
+ buf[pos++] = (*this)[i].size();
+ for (set<int>::const_iterator j = (*this)[i].begin(); j != (*this)[i].end(); j++)
+ buf[pos++] = *j;
+ }
}
-template<class ITEM, class TAG>
-OpSet<TAG> TDBIndexer<ITEM, TAG>::getAllTags() const
+void IntIndexer::write(const std::string& filename)
{
- OpSet<TAG> res;
- for (typename map<TAG, OpSet<ITEM> >::const_iterator i = tags.begin();
- i != tags.end(); i++)
- res += i->first;
- return res;
-}
+ // Create a temporary file next to the target file
+ char name[filename.size() + 8];
+ memcpy(name, filename.data(), filename.size());
+ memcpy(name + filename.size(), ".XXXXXX", 8);
+ int fd = mkstemp(name);
+ if (fd == -1)
+ throw SystemException(errno, "creating temporary file " + filename + ".XXXXXX");
+
+#if 0
+ int size = encodedSize();
+ int buf[size];
+ encode(buf);
+#else
+ // Enlarge the temporary file to fit the data
+ int size = encodedSize();
+ if (lseek(fd, size - 1, SEEK_SET) == -1)
+ {
+ close(fd);
+ unlink(name);
+ throw SystemException(errno, string("enlarging file (seek) ") + name);
+ }
-template<class ITEM, class TAG>
-void TDBIndexer<ITEM, TAG>::output(Consumer<ITEM, TAG>& consumer) const
-{
- for (typename map<ITEM, OpSet<TAG> >::const_iterator i = items.begin();
- i != items.end(); i++)
- consumer.consume(i->first, i->second);
-}
+ // Write one byte at the end to actually resize the file
+ if (::write(fd, &fd, 1) == -1)
+ {
+ close(fd);
+ unlink(name);
+ throw SystemException(errno, string("enlarging file (write) ") + name);
+ }
-template<class ITEM, class TAG>
-void TDBIndexer<ITEM, TAG>::applyChange(const PatchList<ITEM, TAG>& change)
-{
- for (typename PatchList<ITEM, TAG>::const_iterator i = change.begin(); i != change.end(); i++)
+ // Map the file into memory
+ void* buf = mmap(0, size, PROT_WRITE, MAP_SHARED, fd, 0);
+ if (buf == MAP_FAILED)
{
- // Save the previous tagset in `rev'
- OpSet<TAG> prevTags = getTags(i->first);
- OpSet<TAG> nextTags = i->second.apply(prevTags);
+ close(fd);
+ unlink(name);
+ throw SystemException(errno, string("mmapping file ") + name);
+ }
- // Set the new tagset in the item
- items[i->first] = nextTags;
+ // Write the index data to the file
+ encode((int*)buf);
- // Fix the itemsets in the involved tags
- OpSet<TAG> t = prevTags - nextTags;
- for (typename OpSet<TAG>::const_iterator j = t.begin(); j != t.end(); j++)
- {
- OpSet<TAG> items = getItems(*j) - i->first;
- if (items.empty())
- tags.erase(*j);
- else
- tags[*j] = items;
- }
- t = nextTags - prevTags;
- for (typename OpSet<TAG>::const_iterator j = t.begin(); j != t.end(); j++)
- tags[*j] += i->first;
+ // Unmap the file
+ if (munmap(buf, size) == -1)
+ {
+ close(fd);
+ unlink(name);
+ throw SystemException(errno, string("munmapping file ") + name);
}
-}
-
+#endif
-template<class ITEM, class TAG>
-void TDBIndexer<ITEM, TAG>::consumeItem(const ITEM& item, const OpSet<TAG>& tags)
-{
- // Add the tags to the item
- items[item] += tags;
+ // Give the file the right permissions according to umask
- // Add the item to the tags
- for (typename OpSet<TAG>::const_iterator i = tags.begin(); i != tags.end(); i++)
- this->tags[*i] += item;
-}
+ // Read the current umask
+ mode_t mask = umask(0);
+ umask(mask);
+ // Set the file permissions
+ if (fchmod(fd, 0666 & ~mask) == -1)
+ {
+ close(fd);
+ unlink(name);
+ throw SystemException(errno, string("setting permissions on file ") + name);
+ }
-template<class ITEM, class TAG>
-void TDBIndexer<ITEM, TAG>::consumeItems(const OpSet<ITEM>& items, const OpSet<TAG>& tags)
-{
- for (typename OpSet<ITEM>::const_iterator i = items.begin(); i != items.end(); i++)
- // Add the tags to the item
- this->items[*i] += tags;
+ // Close the file
+ close(fd);
- for (typename OpSet<TAG>::const_iterator i = tags.begin(); i != tags.end(); i++)
- // Add the items to the tag
- this->tags[*i] += items;
+ // Rename to the final file name, performing the atomic update
+ if (rename(name, filename.c_str()) == -1)
+ {
+ unlink(name);
+ throw SystemException(errno, string("renaming file ") + name + " into " + filename);
+ }
}
-#ifndef INSTANTIATING_TEMPLATES
-#include <string>
-
-namespace Tagcoll {
- template class TDBIndexer<std::string, std::string>;
-}
-#endif
#ifdef COMPILE_TESTSUITE
@@ -195,16 +171,47 @@
namespace tut {
using namespace tut_tagcoll;
-struct tagcoll_tdbindexer_shar {
+struct tagcoll_intindex_shar {
};
-TESTGRP(tagcoll_tdbindexer);
+TESTGRP(tagcoll_intindex);
template<> template<>
void to::test<1>()
{
- TDBIndexer<string, string> coll;
+ static const char* fname = "tagcoll_intindex.tmp";
+
+ // Create the index
+ IntIndexer indexer;
+ indexer.map(4, 1);
+ indexer.map(4, 2);
+ indexer.map(2, 1);
+ indexer.map(0, 5);
+ indexer.map(0, 8);
+ indexer.map(0, 1);
+ indexer.map(0, 7);
+ indexer.write(fname);
+
+ // Read the index
+ IntIndex index(fname);
+
+ // Check that the arrays have the right size
+ ensure_equals(index.size(0), 4);
+ ensure_equals(index.size(1), 0);
+ ensure_equals(index.size(2), 1);
+ ensure_equals(index.size(3), 0);
+ ensure_equals(index.size(4), 2);
+
+ // Check that the arrays are sorted and contain the right data
+ ensure_equals(index.data(0)[0], 1);
+ ensure_equals(index.data(0)[1], 5);
+ ensure_equals(index.data(0)[2], 7);
+ ensure_equals(index.data(0)[3], 8);
+ ensure_equals(index.data(2)[0], 1);
+ ensure_equals(index.data(4)[0], 1);
+ ensure_equals(index.data(4)[1], 2);
- test_tagged_collection(coll);
+ // Delete the test index
+ //unlink(fname);
}
}
Copied: tagcoll/trunk/tagcoll/IntIndex.h (from r1555, tagcoll/trunk/tagcoll/TDBIndexer.h)
==============================================================================
--- tagcoll/trunk/tagcoll/TDBIndexer.h (original)
+++ tagcoll/trunk/tagcoll/IntIndex.h Thu Feb 9 22:25:03 2006
@@ -1,12 +1,12 @@
-#ifndef TDB_INDEXER_H
-#define TDB_INDEXER_H
+#ifndef TAGCOLL_INT_INDEX_H
+#define TAGCOLL_INT_INDEX_H
/** \file
- * Fast index for tag data
+ * Fast index for tag data, based on integer indexes
*/
/*
- * Copyright (C) 2005 Enrico Zini <enrico at debian.org>
+ * Copyright (C) 2006 Enrico Zini <enrico at debian.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -23,64 +23,69 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-#include <tagcoll/Collection.h>
-#include <tagcoll/Serializer.h>
#include <tagcoll/Exception.h>
-#include <map>
+#include <vector>
+#include <set>
namespace Tagcoll
{
/**
- * In-memory collection whose representation is organised similarly to the
- * on-disk representation used by TDBDiskIndex.
+ * MMap-based index of a -> [x1, x2, x3] mappings
*
- * It can be used as a working collection, and it can also write its contents
- * into an on-disk index that can later be used by TDBDiskIndex.
+ * The layout is:
*
- * On-disk indexes produced by TDBIndexer are written in a single, optimised
- * run and tend to be more compact than the ones created by TDBDiskIndex.
+ * [offset of mapping for item 0, offset of mapping for item 1...]
+ * [size of array][sorted array of ints pointed by index 0]
+ * [size of array][sorted array of ints pointed by index 1]
+ * [size of array][sorted array of ints pointed by index 2]
+ * [...]
+ *
+ * This allows fast lookups, as well as fast lookups of unions or intersections
+ * of mapped arrays.
+ *
+ * This class tries to be a simple and fast component for building indexes, and
+ * thus no bounds checking is performed.
*/
-template<class ITEM, class TAG>
-class TDBIndexer : public Collection<ITEM, TAG>
+class IntIndex
{
protected:
- std::map<ITEM, OpSet<TAG> > items;
- std::map<TAG, OpSet<ITEM> > tags;
-
- virtual void consumeItem(const ITEM& item, const OpSet<TAG>& tags);
- virtual void consumeItems(const OpSet<ITEM>& items, const OpSet<TAG>& tags);
+ std::string m_filename;
+ int m_size;
+ int m_fd;
+ const int* m_buf;
+
+public:
+ IntIndex(const std::string& filename);
+ ~IntIndex();
- virtual OpSet<ITEM> getItemsHavingTag(const TAG& tag) const;
- virtual OpSet<TAG> getTagsOfItem(const ITEM& item) const;
+ const int* data(unsigned int val) { return m_buf + m_buf[val] + 1; }
+ int size(unsigned int val) { return m_buf[m_buf[val]]; }
+};
+/**
+ * Creates an on-disk index to use for IntIndex
+ */
+class IntIndexer : public std::vector<std::set<int> >
+{
public:
- virtual ~TDBIndexer() {}
+ /// Store the key->val mapping into the indexer
+ void map(unsigned int key, int val)
+ {
+ if (size() <= key)
+ resize(key + 1);
+ (*this)[key].insert(val);
+ }
+
+ /// Return the size of the encoded index data
+ int encodedSize() const;
+
+ /// Write the index data into the given buffer, which must be at least
+ /// encodedSize() bytes long
+ void encode(int* buf) const;
- virtual bool hasItem(const ITEM& item) const { return items.find(item) != items.end(); }
- virtual bool hasTag(const TAG& tag) const { return tags.find(tag) != tags.end(); }
- virtual OpSet<ITEM> getTaggedItems() const;
- virtual OpSet<TAG> getAllTags() const;
- virtual void output(Consumer<ITEM, TAG>& consumer) const;
- virtual void applyChange(const PatchList<ITEM, TAG>& change);
-
- /**
- * Write all collected informations to a disk index
- *
- * \param itemconv
- * Converter than can convert an ITEM to a string
- * \param tagconv
- * Converter than can convert a TAG to a string
- * \param pkgidx
- * File name for the package index to write
- * \param tagidx
- * File name for the tag index to write
- */
- void writeIndex(
- Converter<ITEM, std::string>& itemconv,
- Converter<TAG, std::string>& tagconv,
- const std::string& pkgidx,
- const std::string& tagidx) const;
+ /// Atomically write the index to the given file
+ void write(const std::string& filename);
};
};
Modified: tagcoll/trunk/tagcoll/Makefile.am
==============================================================================
--- tagcoll/trunk/tagcoll/Makefile.am (original)
+++ tagcoll/trunk/tagcoll/Makefile.am Thu Feb 9 22:25:03 2006
@@ -38,6 +38,8 @@
TDBDiskIndex.cc \
TDBReadonlyDiskIndex.h \
\
+ IntIndex.h \
+ \
Filters.h \
Implications.h \
Implications.cc \
@@ -75,6 +77,8 @@
TDBDiskIndex.cc \
TDBReadonlyDiskIndex.cc \
\
+ IntIndex.cc \
+ \
Filters.cc \
Implications.cc \
\
More information about the Debtags-commits
mailing list