[Debtags-commits] [svn] r1380 - in tagcoll/trunk: . tagcoll tests
Enrico Zini
enrico at costa.debian.org
Tue Sep 27 15:26:34 UTC 2005
Author: enrico
Date: Tue Sep 27 15:26:33 2005
New Revision: 1380
Added:
tagcoll/trunk/tests/normalize.cc
Removed:
tagcoll/trunk/tests/normalize.cpp
Modified:
tagcoll/trunk/ (props changed)
tagcoll/trunk/README
tagcoll/trunk/tagcoll/experiments.cc
tagcoll/trunk/tagcoll/experiments.h
tagcoll/trunk/tests/Makefile.am
Log:
r5426 at viaza: enrico | 2005-09-26 07:56:58 -0500
First attempt at a normalization algorithm for tagsets
Modified: tagcoll/trunk/README
==============================================================================
--- tagcoll/trunk/README (original)
+++ tagcoll/trunk/README Tue Sep 27 15:26:33 2005
@@ -196,6 +196,8 @@
be assigned the same weight (like use::* -> 10, implemented-in::*->1)
This should normalise the 'special' items somehow.
+ - Replace ParserInputs with C++ iostreams
+
- Merge ItemGrouper and TDBIndexer
- Add example code
Modified: tagcoll/trunk/tagcoll/experiments.cc
==============================================================================
--- tagcoll/trunk/tagcoll/experiments.cc (original)
+++ tagcoll/trunk/tagcoll/experiments.cc Tue Sep 27 15:26:33 2005
@@ -22,73 +22,173 @@
#include <vector>
+
+#include <iostream>
+
+namespace std {
+
+template<typename TAG, typename _Traits>
+basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>& out, const Tagcoll::OpSet<TAG>& tags)
+{
+ for (typename Tagcoll::OpSet<TAG>::const_iterator i = tags.begin();
+ i != tags.end(); i++)
+ if (i == tags.begin())
+ out << *i;
+ else
+ out << ", " << *i;
+ return out;
+}
+
+}
+
using namespace std;
namespace Tagcoll {
template<typename ITEM, typename TAG>
-bool Normalizer<ITEM,TAG>::mergeTagsets(const OpSet<TAG>& newTagset)
+void Normalizer<ITEM,TAG>::buildGraph()
{
- vector< OpSet<TAG> > involved;
- int card;
+ distGraph.clear();
- // Collect all tagsets that would be merged
- for (tagsets_t::const_iterator i = tagsets.begin();
- i != tagsets.end(); i++)
- if (i->first - newTagset == OpSet<TAG>())
+ for (typename tagsets_t::const_iterator i = this->tagsets.begin();
+ i != this->tagsets.end(); i++)
+ for (typename OpSet<TAG>::const_iterator j = i->first.begin();
+ j != i->first.end(); j++)
{
- involved.push_back(i->first);
- card += i->second;
+ OpSet<TAG> test = i->first - *j;
+ if (this->tagsets.find(test) != this->tagsets.end())
+ {
+ distGraph[test].push_back(i->first);
+ distGraph[i->first].push_back(test);
+ }
}
+}
+
+template<typename ITEM, typename TAG>
+void Normalizer<ITEM,TAG>::removeAfterMerge(const OpSet<TAG>& ts, const OpSet<TAG>& merged)
+{
+ int size = this->tagsets[ts].size();
+
+ this->tagsets.erase(ts);
+
+ OpSet<TAG> removed = ts - merged;
+ for (typename OpSet<TAG>::const_iterator i = removed.begin(); i != removed.end(); i++)
+ this->tags.del(*i, size);
+
+ // Erase ts from all arcs that point to it
+ typename distgraph_t::iterator near = distGraph.find(ts);
+ if (near != distGraph.end())
+ for (typename vector< OpSet<TAG> >::const_iterator i = near->second.begin();
+ i != near->second.end(); i++)
+ {
+ typename distgraph_t::iterator other = distGraph.find(*i);
+ if (other != distGraph.end())
+ for (typename vector< OpSet<TAG> >::iterator j = other->second.begin();
+ j != other->second.end(); j++)
+ if (*j == ts)
+ {
+ other->second.erase(j);
+ break;
+ }
+ }
+
+ // Erase ts from distgraph
+ distGraph.erase(ts);
+}
+
+template<typename ITEM, typename TAG>
+bool Normalizer<ITEM,TAG>::mergeTagsets(const OpSet<TAG>& ts1, const OpSet<TAG>& ts2)
+{
+ OpSet<TAG> merge = ts1 ^ ts2;
+ OpSet<ITEM> items1 = this->tagsets[ts1];
+ OpSet<ITEM> items2 = this->tagsets[ts2];
+ OpSet<ITEM> itemsm = this->tagsets[merge];
// Don't merge if the result would be too big
- if (card > max_threshold)
+ if (items1.size() + items2.size() + itemsm.size() > max_threshold)
return false;
// Merge
+
+ removeAfterMerge(ts1, merge);
+ removeAfterMerge(ts2, merge);
- // Merge the items
- OpSet<ITEM> newItems;
- for (vector< OpSet<TAG> >::const_iterator i = involved.begin();
- i != involved.end(); i++)
- {
- newItems += tagsets[*i];
- tagsets.erase(*i);
- }
+ this->tagsets[merge] += items1;
+ this->tagsets[merge] += items2;
+
+ /*
+ cerr << "Rebuilding graph..." << endl;
+ buildGraph();
+ cerr << "Built graph." << endl;
+ */
- tagsets[newTagset] = newItems;
return true;
}
-
template<typename ITEM, typename TAG>
void Normalizer<ITEM,TAG>::normalize()
{
+ cerr << "Building graph..." << endl;
+ buildGraph();
+ cerr << "Built graph." << endl;
+
bool done = false;
while (!done)
{
+ done = true;
+
+ cerr << "Starting run." << endl;
+
vector< OpSet<TAG> > smallTagsets;
// Collect the small tagsets
- for (tagsets_t::const_iterator i = tagsets.begin();
- i != tagsets.end(); i++)
+ for (typename tagsets_t::const_iterator i = this->tagsets.begin();
+ i != this->tagsets.end(); i++)
if (i->second.size() < merge_threshold)
- smallTagsets += i->first;
+ smallTagsets.push_back(i->first);
- for (int i = 0; i < smallTagsets.size(); i++)
- for (int j = i + 1; j < smallTagsets.size(); j++)
- if (smallTagsets[i].distance(smallTagsets[j]) == 1)
- if (mergeTagsets(smallTagsets[i] + smallTagsets[j]))
- goto next;
- done = true;
-next:
+ for (size_t i = 0; i < smallTagsets.size(); i++)
+ {
+ typename distgraph_t::const_iterator near = distGraph.find(smallTagsets[i]);
+ if (near == distGraph.end())
+ continue;
+
+ // See which of the nearest sets is the smallest
+ OpSet<TAG> smallest;
+ size_t smallest_size = 1000;
+ for (size_t j = 0; j < near->second.size(); j++)
+ {
+ size_t size = this->tagsets[near->second[j]].size();
+ if (size < smallest_size)
+ {
+ smallest = near->second[j];
+ smallest_size = size;
+ }
+ }
+
+ if (mergeTagsets(smallTagsets[i], smallest))
+ {
+ cerr << i << "/" << smallTagsets.size() << " Merged " << smallTagsets[i] << " and " << smallest << endl;
+ //smallTagsets.erase(smallTagsets[i]);
+ //smallTagsets.erase(smallest);
+ done = false;
+ }
+ }
}
}
}
+#ifndef INSTANTIATING_TEMPLATES
+#include <string>
+
+namespace Tagcoll {
+ template class Normalizer<std::string, std::string>;
+}
+#endif
+
#ifdef COMPILE_TESTSUITE
@@ -97,7 +197,7 @@
namespace tut {
using namespace tut_tagcoll;
-struct tagcoll_experiments {
+struct tagcoll_experiments_shar {
};
TESTGRP(tagcoll_experiments);
Modified: tagcoll/trunk/tagcoll/experiments.h
==============================================================================
--- tagcoll/trunk/tagcoll/experiments.h (original)
+++ tagcoll/trunk/tagcoll/experiments.h Tue Sep 27 15:26:33 2005
@@ -25,6 +25,8 @@
#include <tagcoll/CardinalityStore.h>
+#include <vector>
+
namespace Tagcoll
{
@@ -32,11 +34,19 @@
class Normalizer : public CardinalityStore<ITEM, TAG>
{
protected:
- int max_threshold;
- int merge_threshold;
- int min_threshold;
+ unsigned int max_threshold;
+ unsigned int merge_threshold;
+ unsigned int min_threshold;
+
+ typedef CardinalityStore<ITEM, TAG> tagsets_t;
+
+ typedef std::map< OpSet<TAG>, std::vector< OpSet<TAG> > > distgraph_t;
+ distgraph_t distGraph;
+
+ void buildGraph();
+ bool mergeTagsets(const OpSet<TAG>& ts1, const OpSet<TAG>& ts2);
+ void removeAfterMerge(const OpSet<TAG>& ts, const OpSet<TAG>& merged);
- bool mergeTagsets(const OpSet<TAG>& newTagset);
public:
Normalizer() :
Modified: tagcoll/trunk/tests/Makefile.am
==============================================================================
--- tagcoll/trunk/tests/Makefile.am (original)
+++ tagcoll/trunk/tests/Makefile.am Tue Sep 27 15:26:33 2005
@@ -3,13 +3,17 @@
libtagcoll_test_SOURCES = tut-main.cpp test-textformat.cc test-tdbdiskindex.cc test-tdbreadonlydiskindex.cc
libtagcoll_test_LDADD = -dlpreopen ../tagcoll/libtagcoll.la ../tagcoll/tagexpr/libtagexpr.la -ltdb
-noinst_PROGRAMS = dump-tdbdi
+noinst_PROGRAMS = dump-tdbdi normalize
# test-tagset
dump_tdbdi_SOURCES = \
dump-tdbdi.cc
dump_tdbdi_LDADD = ../tagcoll/libtagcoll.la -ltdb
+normalize_SOURCES = \
+ normalize.cc
+normalize_LDADD = ../tagcoll/libtagcoll.la -ltdb
+
#test_tagset_SOURCES = \
#test-tagset.cc
#test_tagset_LDADD = ../tagcoll/libtagcoll.la
More information about the Debtags-commits
mailing list