[Debtags-commits] [svn] r1812 - in debtags/1.6.0: . tools

Enrico Zini enrico at costa.debian.org
Fri Jul 7 17:00:52 UTC 2006


Author: enrico
Date: Fri Jul  7 17:00:51 2006
New Revision: 1812

Modified:
   debtags/1.6.0/   (props changed)
   debtags/1.6.0/tools/Printer.h
   debtags/1.6.0/tools/debtags.cc
Log:
 r2991 at viaza:  enrico | 2006-07-07 19:00:26 +0200
 Refactored the smart search
 


Modified: debtags/1.6.0/tools/Printer.h
==============================================================================
--- debtags/1.6.0/tools/Printer.h	(original)
+++ debtags/1.6.0/tools/Printer.h	Fri Jul  7 17:00:51 2006
@@ -33,6 +33,7 @@
 #include <ept/forward.h>
 #include <ept/cache/tag.h>
 #include <ept/cache/package.h>
+#include <ept/cache/apt/packages.h>
 #include <tagcoll/TextFormat.h>
 #include <tagcoll/coll/fast.h>
 

Modified: debtags/1.6.0/tools/debtags.cc
==============================================================================
--- debtags/1.6.0/tools/debtags.cc	(original)
+++ debtags/1.6.0/tools/debtags.cc	Fri Jul  7 17:00:51 2006
@@ -29,7 +29,7 @@
 
 #include <ept/init.h>
 #include <ept/forward.h>
-#include <ept/cache/cache.h>
+//#include <ept/cache/cache.h>
 #include <ept/cache/debtags/serializer.h>
 #include <ept/cache/debtags/vocabulary.h>
 #include <ept/cache/debtags/tagmap.h>
@@ -37,8 +37,8 @@
 #include <ept/cache/tag.h>
 #include <ept/cache/package.h>
 #include <ept/cache/version.h>
-#include <ept/predicate/factory.h>
-#include <ept/predicate/predicate.h>
+//#include <ept/predicate/factory.h>
+//#include <ept/predicate/predicate.h>
 
 #include <tagcoll/input/stdio.h>
 #include <tagcoll/stream/filters.h>
@@ -728,6 +728,153 @@
 };
 #endif
 
+template<typename Tag, typename Number = int>
+class TagMetrics : public std::map<Tag, Number>
+{
+	class MetricsOrder
+	{
+		const TagMetrics& metric;
+	public:
+		MetricsOrder(const TagMetrics& metric) : metric(metric) {}
+		bool operator()(const Tag& t1, const Tag& t2)
+		{
+			// Returns true if t1 precedes t2, and false otherwise
+			return metric.get(t1) < metric.get(t2);
+		}
+	};
+	class DiscriminanceOrder
+	{
+		const TagMetrics& metric;
+		int itemCount;
+	public:
+		DiscriminanceOrder(const TagMetrics& metric, int itemCount)
+			: metric(metric), itemCount(itemCount) {}
+		bool operator()(const Tag& t1, const Tag& t2)
+		{
+			// Returns true if t1 precedes t2, and false otherwise
+			return abs(itemCount / 2 - metric.get(t1)) < abs(itemCount / 2 - metric.get(t2));
+		}
+	};
+
+public:
+	void add(const Tag& tag, const Number& val)
+	{
+		typename TagMetrics::iterator i = this->find(tag);
+		if (i == this->end())
+			insert(make_pair(tag, val));
+		else
+			i->second += val;
+	}
+
+	Number get(const Tag& tag) const
+	{
+		typename TagMetrics::const_iterator i = this->find(tag);
+		if (i == this->end())
+			return 0;
+		else
+			return i->second;
+	}
+
+	TagMetrics<Tag, Number> rankMetrics() const
+	{
+		vector<Tag> sorted = tagsSortedByMetrics();
+		TagMetrics<Tag, Number> res;
+		for (size_t i = 0; i < sorted.size(); ++i)
+			res.add(sorted[i], sorted.size() - i);
+		return res;
+	}
+
+	TagMetrics<Tag, Number> jumpsFrom(const TagMetrics<Tag, Number>& other) const
+	{
+		// Compute rank metrics
+		TagMetrics<Tag, Number> rank1 = other.rankMetrics();
+		TagMetrics<Tag, Number> rank2 = rankMetrics();
+
+		TagMetrics<Tag, Number> res;
+		typename TagMetrics::const_iterator i1 = rank1.begin();
+		typename TagMetrics::const_iterator i2 = rank2.begin();
+		while (i1 != rank1.end() || i2 != rank2.end())
+		{
+			if (i1->first == i2->first)
+			{
+				res.add(i1->first, i1->second - i2->second);
+				++i1;
+				++i2;
+			} else if (i1 == rank1.end() || (i2 != rank2.end() && i2->first < i1->first)) {
+				res.add(i2->first, rank2.size() - i2->second);
+				++i2;
+			} else {
+				res.add(i1->first, i1->second - rank1.size());
+				++i1;
+			}
+		}
+		return res;
+	}
+
+	vector<Tag> tagsSortedByMetrics() const
+	{
+		vector<Tag> res;
+		for (typename TagMetrics::const_iterator i = this->begin(); i != this->end(); ++i)
+			res.push_back(i->first);
+		std::sort(res.begin(), res.end(), MetricsOrder(*this));
+		return res;
+	}
+
+	vector<Tag> tagsSortedByDiscriminance(int itemCount) const
+	{
+		vector<Tag> res;
+		for (typename TagMetrics::const_iterator i = this->begin(); i != this->end(); ++i)
+			res.push_back(i->first);
+		std::sort(res.begin(), res.end(), DiscriminanceOrder(*this, itemCount));
+		return res;
+	}
+
+	template<typename COLL>
+	static TagMetrics<Tag, Number> computeFromTags(const COLL& coll)
+	{
+		TagMetrics<Tag, Number> res;
+		for (typename COLL::const_tag_iterator i = coll.tagBegin();
+				i != coll.tagEnd(); ++i)
+			res.add(i->first, i->second.size());
+		return res;
+	}
+
+	void dump(const std::string& prefix, ostream& out)
+	{
+		TagMetrics<Tag, Number> rm = rankMetrics();
+		vector<Tag> tags = tagsSortedByMetrics();
+		int rank = 0;
+		for (typename vector<Tag>::const_iterator i = tags.begin(); i != tags.end(); ++i, ++rank)
+			out << prefix << tags.size() - rank << ":" << rm.get(*i) << ") " << i->fullname() << ": " << this->get(*i) << endl;
+	}
+};
+
+template<typename Metrics>
+class TagMetricsInserter : public wibble::mixin::OutputIterator< TagMetricsInserter<Metrics> >
+{
+	Metrics& m;
+
+public:
+	TagMetricsInserter(Metrics& m) : m(m) {}
+
+	template<typename Items, typename Tags>
+	TagMetricsInserter<Metrics>& operator=(const std::pair<Items, Tags>& data)
+	{
+		int size = data.first.size();
+		for (typename Tags::const_iterator i = data.second.begin();
+				i != data.second.end(); ++i)
+			m.add(*i, size);
+		return *this;
+	}
+};
+
+template<typename Tag, typename Number>
+TagMetricsInserter< TagMetrics<Tag, Number> > tagMetricsInserter(TagMetrics<Tag, Number>& out)
+{
+	return TagMetricsInserter< TagMetrics<Tag, Number> >(out);
+}
+
+
 class SmartSearcher
 {
 protected:
@@ -738,13 +885,16 @@
 	//	2) iterare usando la nuova funzione di ranking
 	Ept& ept;
 
-	coll::Simple<Package, Tag> fullColl;
+	coll::Fast<Package, Tag> fullColl;
 	coll::Fast<Package, Tag> coll;
 
 	std::string pattern;
 	std::set<Tag> wanted;
 	std::set<Tag> unwanted;
 	std::set<Tag> ignored;
+	vector<Tag> interesting;
+
+	vector<Tag> tagsInMenu;
 
 	bool patternMatch(const Package& pkg)
 	{
@@ -767,94 +917,14 @@
 		return true;
 	}
 
-	struct Ranker
-	{
-		map<Tag, int> cardFull;
-		map<Tag, int> cardFilt;
-		int totalFull;
-		int totalFilt;
-
-		Ranker() : totalFull(0), totalFilt(0) {}
-
-		void inc(std::map<Tag, int>& map, const Tag& tag)
-		{
-			std::map<Tag, int>::iterator i = map.find(tag);
-			if (i == map.end())
-				map.insert(make_pair(tag, 1));
-			else
-				++i->second;
-		}
-		void incFull(const Tag& tag) { inc(cardFull, tag); }
-		void incFilt(const Tag& tag) { inc(cardFilt, tag); }
-
-
-		template<typename A, typename B>
-		map<B, A> reverse(const map<A, B>& m)
-		{
-			// Sort by cardinality
-			map<B, A> res;
-			for (typename std::map<A, B>::const_iterator i = m.begin();
-					i != m.end(); ++i)
-				res.insert(make_pair(i->second, i->first));
-			return res;
-		}
-
-		map<Tag, float> rank(const map<Tag, int>& cards, int tot)
-		{
-			// Compute ranks
-			map<Tag, float> ranks;
-			for (std::map<Tag, int>::const_iterator i = cards.begin();
-					i != cards.end(); ++i)
-				ranks[i->first] = (float)i->second / tot;
-			return ranks;
-		}
-
-		struct relLess
-		{
-			bool operator()(const std::pair<Tag, float>& a, const std::pair<Tag, float>& b)
-			{
-				return a.second < b.second;
-			}
-		};
-
-		set<Tag> topTags(SmartSearcher& s, size_t count = 5)
-		{
-			vector< pair<Tag, float> > relevance;
-			for (map<Tag, int>::const_iterator i = cardFilt.begin(); i != cardFilt.end(); ++i)
-			{
-				float rankNew = (float)i->second / totalFilt;
-				float rankOld = (float)cardFull[i->first] / totalFull;
-
-				relevance.push_back(make_pair(i->first, rankNew - rankOld));
-			}
-
-			std::sort(relevance.begin(), relevance.end(), relLess());
-
-			// Set the 5 topmost tags as 'wanted'
-	//		for (map<float, Tag>::const_iterator i = relevance.begin();
-	//				i != relevance.end(); ++i)
-	//			cout << "REL " << i->second.fullname() << ": " << i->first << endl;
-
-			set<Tag> res;
-//			for (vector< pair<Tag, float> >::const_reverse_iterator i = relevance.rbegin();
-//					i != relevance.rend() && res.size() < count; ++i)
-			for (int i = relevance.size() - 1; i >= 0 && res.size() < count; --i)
-				if (!utils::set_contains(s.ignored, relevance[i].first) &&
-					!utils::set_contains(s.wanted, relevance[i].first))
-					res.insert(relevance[i].first);
-			return res;
-		}
-	};
-
 	template<typename OUT>
 	class Filter : public wibble::mixin::OutputIterator< Filter<OUT> >
 	{
 		SmartSearcher& s;
-		Ranker& r;
 		OUT out;
 
 	public:
-		Filter(SmartSearcher& s, Ranker& r, const OUT& out) : s(s), r(r), out(out) {}
+		Filter(SmartSearcher& s, const OUT& out) : s(s), out(out) {}
 
 		template<typename ITEMS, typename TAGS>
 		Filter<OUT>& operator=(const std::pair<ITEMS, TAGS>& data)
@@ -864,6 +934,7 @@
 			{
 				if (!s.tagMatch(*i))
 					continue;
+#if 0
 				bool matches = s.patternMatch(*i);
 				for (typename TAGS::const_iterator t = data.second.begin();
 						t != data.second.end(); ++t)
@@ -876,7 +947,7 @@
 						++r.totalFilt;
 					}
 				}
-
+#endif
 				*out = data;
 				++out;
 			}
@@ -884,17 +955,111 @@
 		}
 	};
 	template<typename OUT>
-	Filter<OUT> filter(Ranker& r, const OUT& out)
+	Filter<OUT> filter(const OUT& out)
+	{
+		return Filter<OUT>(*this, out);
+	}
+
+	void autoSelect(const std::vector<Tag>& tags, size_t maxAuto = 5, size_t maxUser = 7)
+	{
+		interesting.clear();
+		if (tags.empty())
+			return;
+
+		size_t autoCount = (tags.size() - maxUser) / 2;
+		if (autoCount > maxAuto) autoCount = maxAuto;
+		size_t userCount = tags.size() - (2 * autoCount);
+		if (userCount > maxUser) userCount = maxUser;
+
+		// Use the bottom autoCount tags as unwanted
+		for (size_t i = 0; i < autoCount; ++i)
+			unwanted.insert(tags[i]);
+
+		// Use the top autoCount tags as wanted
+		for (size_t i = tags.size() - 1; i >= tags.size() - autoCount; --i)
+			wanted.insert(tags[i]);
+
+		// Get the next userCount tags as interesting
+		for (size_t i = tags.size() - autoCount; i >= tags.size() - autoCount - userCount; --i)
+			interesting.push_back(tags[i]);
+	}
+
+	void showTags()
 	{
-		return Filter<OUT>(*this, r, out);
+		tagsInMenu.clear();
+		int idx = 1;
+
+		for (std::set<Tag>::const_iterator i = wanted.begin();
+				i != wanted.end(); ++i)
+		{
+			cout << idx << ") " << i->fullname() << " (wanted)" << endl;
+			tagsInMenu.push_back(*i);
+			++idx;
+		}
+
+		for (std::set<Tag>::const_iterator i = unwanted.begin();
+				i != unwanted.end(); ++i)
+		{
+			cout << idx << ") " << i->fullname() << " (unwanted)" << endl;
+			tagsInMenu.push_back(*i);
+			++idx;
+		}
+
+		for (std::set<Tag>::const_iterator i = ignored.begin();
+				i != ignored.end(); ++i)
+		{
+			cout << idx << ") " << i->fullname() << " (ignored)" << endl;
+			tagsInMenu.push_back(*i);
+			++idx;
+		}
+
+		for (std::vector<Tag>::const_iterator i = interesting.begin();
+				i != interesting.end(); ++i)
+		{
+			cout << idx << ") " << i->fullname() << endl;
+			tagsInMenu.push_back(*i);
+			++idx;
+		}
+	}
+
+	void refilter()
+	{
+		// Regenerate coll
+		coll = coll::Fast<Package, Tag>();
+		fullColl.output(filter(inserter(coll)));
+
+		// Compute the most interesting tags
+		TagMetrics<Tag, int> collMetrics = TagMetrics<Tag, int>::computeFromTags(coll);
+		vector<Tag> tags = collMetrics.tagsSortedByDiscriminance(coll.itemCount());
+
+		// Select them as interesting to be displayed
+		autoSelect(tags, 0);
 	}
 
 public:
 	SmartSearcher(Ept& ept, const std::string& pattern) : ept(ept), pattern(pattern)
 	{
+		// Perform the initial filtering using the keyword search
 		for (Packages::iterator i = ept.packages().begin();
 				i != ept.packages().end(); ++i)
+		{
 			fullColl.insert(wibble::singleton(*i), ept.tagmap().getTagsOfItem(*i));
+			if (patternMatch(*i))
+				coll.insert(wibble::singleton(*i), ept.tagmap().getTagsOfItem(*i));
+		}
+
+		// Compute the set of tags that best represent the keyword search
+		TagMetrics<Tag, int> metrics1 = TagMetrics<Tag, int>::computeFromTags(fullColl);
+		TagMetrics<Tag, int> metrics2 = TagMetrics<Tag, int>::computeFromTags(coll);
+		TagMetrics<Tag, int> jumps = metrics2.jumpsFrom(metrics1);
+		vector<Tag> tags = jumps.tagsSortedByMetrics();
+
+		//metrics1.dump("BEF ", cout);
+		//metrics2.dump("AFT ", cout);
+		//jumps.dump("JMP ", cout);
+
+		//autoSelect(tags, 1);
+		autoSelect(tags, 0);
 	}
 
 #if 0
@@ -922,50 +1087,69 @@
 		bool done = false;
 		while (!done)
 		{
-			Ranker r;
-			coll = coll::Fast<Package, Tag>();
-			fullColl.output(filter(r, inserter(coll)));
-			std::set<Tag> top = r.topTags(*this, 1);
-
-			if (top.empty())
-			{
-				coll.output(PackagePrinter(ept.tagmap(), PackagePrinter::SHORT));
-				done = true;
-			} else {
-				Tag topTag = *top.begin();
-				int card = coll.getCardinality(topTag);
-
-				if (!wanted.empty())
-					cout << "[VERB] Wanted: " << wanted << endl;
-				if (!unwanted.empty())
-					cout << "[VERB] Unwanted: " << unwanted << endl;
-				if (!ignored.empty())
-					cout << "[VERB] Ignored: " << ignored << endl;
-				cout << " * " << coll.itemCount() << " packages.  Top tag: " << topTag.fullname() << " (attached to " << card << " items)" << endl << endl;
-				string ans;
-				bool badAnswer = true;
-				do {
-					badAnswer = false;
-					cout << "Do you want tag " << topTag.fullname() << "? (Yes/No/Ignore/View/Done/Quit)> ";
-					cin >> ans;
-
-					if (ans == "Y" || ans == "y")
-						wanted.insert(topTag);
-					else if (ans == "N" || ans == "n")
-						unwanted.insert(topTag);
-					else if (ans == "I" || ans == "i")
-						ignored.insert(topTag);
-					else if (ans == "V" || ans == "v")
-						coll.output(PackagePrinter(ept.tagmap(), PackagePrinter::SHORT));
-					else if (ans == "D" || ans == "d")
+			cout << "Tag selection:" << endl;
+			showTags();
+			cout << coll.itemCount() << " packages selected so far." << endl;
+			string ans;
+			bool badAnswer = true;
+			do {
+				badAnswer = false;
+				// TODO: allow adding tags based on a keyword search on coll
+				cout << "Your choice (+#, -#, =#, View, Done, Quit): ";
+				cin >> ans;
+
+				if (ans == "")
+					badAnswer = true;
+				else if (ans[0] == '+') {
+					int idx = strtoul(ans.substr(1).c_str(), NULL, 10);
+					if (idx < 0 || idx >= tagsInMenu.size())
+						badAnswer = true;
+					else
 					{
-						coll.output(PackagePrinter(ept.tagmap(), PackagePrinter::SHORT));
-						done = true;
+						Tag tag = tagsInMenu[idx - 1];
+						cout << "Selected: " << tag.fullname() << endl;
+						wanted.insert(tag);
+						unwanted.erase(tag);
+						ignored.erase(tag);
+						refilter();
 					}
-					else if (ans == "Q" || ans == "q")
-						done = true;
-				} while (badAnswer);
-			}
+				} else if (ans[0] == '-') {
+					int idx = strtoul(ans.substr(1).c_str(), NULL, 10);
+					if (idx < 0 || idx >= tagsInMenu.size())
+						badAnswer = true;
+					else
+					{
+						Tag tag = tagsInMenu[idx - 1];
+						cout << "Selected: " << tag.fullname() << endl;
+						wanted.erase(tag);
+						unwanted.insert(tag);
+						ignored.erase(tag);
+						refilter();
+					}
+				} else if (ans[0] == '=') {
+					int idx = strtoul(ans.substr(1).c_str(), NULL, 10);
+					if (idx < 0 || idx >= tagsInMenu.size())
+						badAnswer = true;
+					else
+					{
+						Tag tag = tagsInMenu[idx - 1];
+						cout << "Selected: " << tag.fullname() << endl;
+						wanted.erase(tag);
+						unwanted.erase(tag);
+						ignored.insert(tag);
+						refilter();
+					}
+				} else if (ans == "V" || ans == "v") {
+					coll.output(PackagePrinter(ept.tagmap(), PackagePrinter::SHORT));
+				} else if (ans == "D" || ans == "d") {
+					coll.output(PackagePrinter(ept.tagmap(), PackagePrinter::SHORT));
+					done = true;
+				} else if (ans == "Q" || ans == "q") {
+					done = true;
+				} else
+					badAnswer = true;
+			} while (badAnswer);
+
 		}
 	}
 
@@ -2109,6 +2293,7 @@
 	}
 }
 
+#include <ept/cache/apt/packages.tcc>
 #include <ept/cache/debtags/tagmap.tcc>
 #include <tagcoll/coll/fast.tcc>
 



More information about the Debtags-commits mailing list