[Debtags-commits] [svn] r1380 - in tagcoll/trunk: . tagcoll tests

Enrico Zini enrico at costa.debian.org
Tue Sep 27 15:26:34 UTC 2005


Author: enrico
Date: Tue Sep 27 15:26:33 2005
New Revision: 1380

Added:
   tagcoll/trunk/tests/normalize.cc
Removed:
   tagcoll/trunk/tests/normalize.cpp
Modified:
   tagcoll/trunk/   (props changed)
   tagcoll/trunk/README
   tagcoll/trunk/tagcoll/experiments.cc
   tagcoll/trunk/tagcoll/experiments.h
   tagcoll/trunk/tests/Makefile.am
Log:
 r5426 at viaza:  enrico | 2005-09-26 07:56:58 -0500
 First attempt at a normalization algorithm for tagsets


Modified: tagcoll/trunk/README
==============================================================================
--- tagcoll/trunk/README	(original)
+++ tagcoll/trunk/README	Tue Sep 27 15:26:33 2005
@@ -196,6 +196,8 @@
       be assigned the same weight (like use::* -> 10, implemented-in::*->1)
    This should normalise the 'special' items somehow.
 
+ - Replace ParserInputs with C++ iostreams
+
  - Merge ItemGrouper and TDBIndexer
    
  - Add example code

Modified: tagcoll/trunk/tagcoll/experiments.cc
==============================================================================
--- tagcoll/trunk/tagcoll/experiments.cc	(original)
+++ tagcoll/trunk/tagcoll/experiments.cc	Tue Sep 27 15:26:33 2005
@@ -22,73 +22,173 @@
 
 #include <vector>
 
+
+#include <iostream>
+
+namespace std {
+
+template<typename TAG, typename _Traits>
+basic_ostream<char, _Traits>& operator<<(basic_ostream<char, _Traits>& out, const Tagcoll::OpSet<TAG>& tags)
+{
+	for (typename Tagcoll::OpSet<TAG>::const_iterator i = tags.begin();
+			i != tags.end(); i++)
+		if (i == tags.begin())
+			out << *i;
+		else
+			out << ", " << *i;
+	return out;
+}
+
+}
+
 using namespace std;
 
 namespace Tagcoll {
 
 template<typename ITEM, typename TAG>
-bool Normalizer<ITEM,TAG>::mergeTagsets(const OpSet<TAG>& newTagset)
+void Normalizer<ITEM,TAG>::buildGraph()
 {
-	vector< OpSet<TAG> > involved;
-	int card;
+	distGraph.clear();
 
-	// Collect all tagsets that would be merged
-	for (tagsets_t::const_iterator i = tagsets.begin();
-			i != tagsets.end(); i++)
-		if (i->first - newTagset == OpSet<TAG>())
+	for (typename tagsets_t::const_iterator i = this->tagsets.begin();
+			i != this->tagsets.end(); i++)
+		for (typename OpSet<TAG>::const_iterator j = i->first.begin();
+				j != i->first.end(); j++)
 		{
-			involved.push_back(i->first);
-			card += i->second;
+			OpSet<TAG> test = i->first - *j;
+			if (this->tagsets.find(test) != this->tagsets.end())
+			{
+				distGraph[test].push_back(i->first);
+				distGraph[i->first].push_back(test);
+			}
 		}
+}
+
+template<typename ITEM, typename TAG>
+void Normalizer<ITEM,TAG>::removeAfterMerge(const OpSet<TAG>& ts, const OpSet<TAG>& merged)
+{
+	int size = this->tagsets[ts].size();
+
+	this->tagsets.erase(ts);
+
+	OpSet<TAG> removed = ts - merged;
+	for (typename OpSet<TAG>::const_iterator i = removed.begin(); i != removed.end(); i++)
+		this->tags.del(*i, size);
+
+	// Erase ts from all arcs that point to it
+	typename distgraph_t::iterator near = distGraph.find(ts);
+	if (near != distGraph.end())
+		for (typename vector< OpSet<TAG> >::const_iterator i = near->second.begin();
+				i != near->second.end(); i++)
+		{
+			typename distgraph_t::iterator other = distGraph.find(*i);
+			if (other != distGraph.end())
+				for (typename vector< OpSet<TAG> >::iterator j = other->second.begin();
+						j != other->second.end(); j++)
+					if (*j == ts)
+					{
+						other->second.erase(j);
+						break;
+					}
+		}
+
+	// Erase ts from distgraph
+	distGraph.erase(ts);
+}
+
+template<typename ITEM, typename TAG>
+bool Normalizer<ITEM,TAG>::mergeTagsets(const OpSet<TAG>& ts1, const OpSet<TAG>& ts2)
+{
+	OpSet<TAG> merge = ts1 ^ ts2;
+	OpSet<ITEM> items1 = this->tagsets[ts1];
+	OpSet<ITEM> items2 = this->tagsets[ts2];
+	OpSet<ITEM> itemsm = this->tagsets[merge];
 
 	// Don't merge if the result would be too big
-	if (card > max_threshold)
+	if (items1.size() + items2.size() + itemsm.size() > max_threshold)
 		return false;
 
 	// Merge
+
+	removeAfterMerge(ts1, merge);
+	removeAfterMerge(ts2, merge);
 	
-	// Merge the items
-	OpSet<ITEM> newItems;
-	for (vector< OpSet<TAG> >::const_iterator i = involved.begin();
-			i != involved.end(); i++)
-	{
-		newItems += tagsets[*i];
-		tagsets.erase(*i);
-	}
+	this->tagsets[merge] += items1;
+	this->tagsets[merge] += items2;
+
+	/*
+	cerr << "Rebuilding graph..." << endl;
+	buildGraph();
+	cerr << "Built graph." << endl;
+	*/
 
-	tagsets[newTagset] = newItems;
 	return true;
 }
 
-
 template<typename ITEM, typename TAG>
 void Normalizer<ITEM,TAG>::normalize()
 {
+	cerr << "Building graph..." << endl;
+	buildGraph();
+	cerr << "Built graph." << endl;
+
 	bool done = false;
 
 	while (!done)
 	{
+		done = true;
+
+		cerr << "Starting run." << endl;
+
 		vector< OpSet<TAG> > smallTagsets;
 
 		// Collect the small tagsets
-		for (tagsets_t::const_iterator i = tagsets.begin();
-				i != tagsets.end(); i++)
+		for (typename tagsets_t::const_iterator i = this->tagsets.begin();
+				i != this->tagsets.end(); i++)
 			if (i->second.size() < merge_threshold)
-				smallTagsets += i->first;
+				smallTagsets.push_back(i->first);
 
-		for (int i = 0; i < smallTagsets.size(); i++)
-			for (int j = i + 1; j < smallTagsets.size(); j++)
-				if (smallTagsets[i].distance(smallTagsets[j]) == 1)
-					if (mergeTagsets(smallTagsets[i] + smallTagsets[j]))
-						goto next;
-		done = true;
-next:
+		for (size_t i = 0; i < smallTagsets.size(); i++)
+		{
+			typename distgraph_t::const_iterator near = distGraph.find(smallTagsets[i]);
+			if (near == distGraph.end())
+				continue;
+
+			// See which of the nearest sets is the smallest
+			OpSet<TAG> smallest;
+			size_t smallest_size = 1000;
+			for (size_t j = 0; j < near->second.size(); j++)
+			{
+				size_t size = this->tagsets[near->second[j]].size();
+				if (size < smallest_size)
+				{
+					smallest = near->second[j];
+					smallest_size = size;
+				}
+			}
+
+			if (mergeTagsets(smallTagsets[i], smallest))
+			{
+				cerr << i << "/" << smallTagsets.size() << " Merged " << smallTagsets[i] << " and " << smallest << endl;
+				//smallTagsets.erase(smallTagsets[i]);
+				//smallTagsets.erase(smallest);
+				done = false;
+			}
+		}
 	}
 }
 
 }
 
 
+#ifndef INSTANTIATING_TEMPLATES
+#include <string>
+
+namespace Tagcoll {
+	    template class Normalizer<std::string, std::string>;
+}
+#endif
+
 
 #ifdef COMPILE_TESTSUITE
 
@@ -97,7 +197,7 @@
 namespace tut {
 using namespace tut_tagcoll;
 
-struct tagcoll_experiments {
+struct tagcoll_experiments_shar {
 };
 TESTGRP(tagcoll_experiments);
 

Modified: tagcoll/trunk/tagcoll/experiments.h
==============================================================================
--- tagcoll/trunk/tagcoll/experiments.h	(original)
+++ tagcoll/trunk/tagcoll/experiments.h	Tue Sep 27 15:26:33 2005
@@ -25,6 +25,8 @@
 
 #include <tagcoll/CardinalityStore.h>
 
+#include <vector>
+
 namespace Tagcoll
 {
 
@@ -32,11 +34,19 @@
 class Normalizer : public CardinalityStore<ITEM, TAG>
 {
 protected:
-	int max_threshold;
-	int merge_threshold;
-	int min_threshold;
+	unsigned int max_threshold;
+	unsigned int merge_threshold;
+	unsigned int min_threshold;
+
+	typedef CardinalityStore<ITEM, TAG> tagsets_t;
+
+	typedef std::map< OpSet<TAG>, std::vector< OpSet<TAG> > > distgraph_t;
+	distgraph_t distGraph;
+
+	void buildGraph();
+	bool mergeTagsets(const OpSet<TAG>& ts1, const OpSet<TAG>& ts2);
+	void removeAfterMerge(const OpSet<TAG>& ts, const OpSet<TAG>& merged);
 
-	bool mergeTagsets(const OpSet<TAG>& newTagset);
 
 public:
 	Normalizer() :

Modified: tagcoll/trunk/tests/Makefile.am
==============================================================================
--- tagcoll/trunk/tests/Makefile.am	(original)
+++ tagcoll/trunk/tests/Makefile.am	Tue Sep 27 15:26:33 2005
@@ -3,13 +3,17 @@
 libtagcoll_test_SOURCES = tut-main.cpp test-textformat.cc test-tdbdiskindex.cc test-tdbreadonlydiskindex.cc
 libtagcoll_test_LDADD = -dlpreopen ../tagcoll/libtagcoll.la ../tagcoll/tagexpr/libtagexpr.la -ltdb
 
-noinst_PROGRAMS = dump-tdbdi
+noinst_PROGRAMS = dump-tdbdi normalize
 # test-tagset
 
 dump_tdbdi_SOURCES = \
 	dump-tdbdi.cc
 dump_tdbdi_LDADD = ../tagcoll/libtagcoll.la -ltdb
 
+normalize_SOURCES = \
+	normalize.cc
+normalize_LDADD = ../tagcoll/libtagcoll.la -ltdb
+
 #test_tagset_SOURCES = \
 	#test-tagset.cc
 #test_tagset_LDADD = ../tagcoll/libtagcoll.la



More information about the Debtags-commits mailing list