[Debtags-commits] [svn] r1812 - in debtags/1.6.0: . tools
Enrico Zini
enrico at costa.debian.org
Fri Jul 7 17:00:52 UTC 2006
Author: enrico
Date: Fri Jul 7 17:00:51 2006
New Revision: 1812
Modified:
debtags/1.6.0/ (props changed)
debtags/1.6.0/tools/Printer.h
debtags/1.6.0/tools/debtags.cc
Log:
r2991 at viaza: enrico | 2006-07-07 19:00:26 +0200
Refactored the smart search
Modified: debtags/1.6.0/tools/Printer.h
==============================================================================
--- debtags/1.6.0/tools/Printer.h (original)
+++ debtags/1.6.0/tools/Printer.h Fri Jul 7 17:00:51 2006
@@ -33,6 +33,7 @@
#include <ept/forward.h>
#include <ept/cache/tag.h>
#include <ept/cache/package.h>
+#include <ept/cache/apt/packages.h>
#include <tagcoll/TextFormat.h>
#include <tagcoll/coll/fast.h>
Modified: debtags/1.6.0/tools/debtags.cc
==============================================================================
--- debtags/1.6.0/tools/debtags.cc (original)
+++ debtags/1.6.0/tools/debtags.cc Fri Jul 7 17:00:51 2006
@@ -29,7 +29,7 @@
#include <ept/init.h>
#include <ept/forward.h>
-#include <ept/cache/cache.h>
+//#include <ept/cache/cache.h>
#include <ept/cache/debtags/serializer.h>
#include <ept/cache/debtags/vocabulary.h>
#include <ept/cache/debtags/tagmap.h>
@@ -37,8 +37,8 @@
#include <ept/cache/tag.h>
#include <ept/cache/package.h>
#include <ept/cache/version.h>
-#include <ept/predicate/factory.h>
-#include <ept/predicate/predicate.h>
+//#include <ept/predicate/factory.h>
+//#include <ept/predicate/predicate.h>
#include <tagcoll/input/stdio.h>
#include <tagcoll/stream/filters.h>
@@ -728,6 +728,153 @@
};
#endif
+/**
+ * Map from tags to a numeric metric, with helpers to rank tags and to
+ * compare two sets of metrics.
+ *
+ * Tag must be usable as a std::map key (ordered by operator<).  Number must
+ * behave like an integer: it is added to, subtracted, compared with < and
+ * initialised from container sizes.  A tag that is not in the map reads as 0.
+ */
+template<typename Tag, typename Number = int>
+class TagMetrics : public std::map<Tag, Number>
+{
+    /// Strict weak ordering of tags by increasing metric value.
+    class MetricsOrder
+    {
+        const TagMetrics& metric;
+    public:
+        MetricsOrder(const TagMetrics& metric) : metric(metric) {}
+        bool operator()(const Tag& t1, const Tag& t2) const
+        {
+            // Returns true if t1 precedes t2, and false otherwise
+            return metric.get(t1) < metric.get(t2);
+        }
+    };
+
+    /// Orders tags by how close their metric is to half of itemCount
+    /// (closest first).
+    class DiscriminanceOrder
+    {
+        const TagMetrics& metric;
+        int itemCount;
+    public:
+        DiscriminanceOrder(const TagMetrics& metric, int itemCount)
+            : metric(metric), itemCount(itemCount) {}
+        bool operator()(const Tag& t1, const Tag& t2) const
+        {
+            // Returns true if t1 precedes t2, and false otherwise
+            return abs(itemCount / 2 - metric.get(t1)) < abs(itemCount / 2 - metric.get(t2));
+        }
+    };
+
+public:
+    /// Add val to the metric of tag, creating the entry if it is missing.
+    void add(const Tag& tag, const Number& val)
+    {
+        typename TagMetrics::iterator i = this->find(tag);
+        if (i == this->end())
+            // insert() lives in a dependent base class, so it has to be
+            // reached through this-> to be found by two-phase name lookup.
+            this->insert(std::make_pair(tag, val));
+        else
+            i->second += val;
+    }
+
+    /// Return the metric of tag, or 0 if the tag has no entry.
+    Number get(const Tag& tag) const
+    {
+        typename TagMetrics::const_iterator i = this->find(tag);
+        if (i == this->end())
+            return 0;
+        else
+            return i->second;
+    }
+
+    /**
+     * Replace every metric with its rank.
+     *
+     * With N tags, the tag with the lowest metric gets rank N and the tag
+     * with the highest metric gets rank 1.  The relative rank of tags with
+     * equal metrics is unspecified (std::sort is not stable).
+     */
+    TagMetrics<Tag, Number> rankMetrics() const
+    {
+        std::vector<Tag> sorted = tagsSortedByMetrics();
+        TagMetrics<Tag, Number> res;
+        for (size_t i = 0; i < sorted.size(); ++i)
+            res.add(sorted[i], sorted.size() - i);
+        return res;
+    }
+
+    /**
+     * Compute, for every tag in either set, how much its rank changed going
+     * from `other` to this set of metrics.
+     *
+     * - Tag in both sets: rank in `other` minus rank in this set.
+     * - Tag only in this set: size of this ranking minus its rank here.
+     * - Tag only in `other`: its rank there minus the size of that ranking.
+     */
+    TagMetrics<Tag, Number> jumpsFrom(const TagMetrics<Tag, Number>& other) const
+    {
+        // Compute rank metrics
+        TagMetrics<Tag, Number> rank1 = other.rankMetrics();
+        TagMetrics<Tag, Number> rank2 = rankMetrics();
+
+        TagMetrics<Tag, Number> res;
+        typename TagMetrics::const_iterator i1 = rank1.begin();
+        typename TagMetrics::const_iterator i2 = rank2.begin();
+
+        // Merge the two key-ordered maps.  Each branch checks for end()
+        // before dereferencing, so an exhausted iterator is never read.
+        while (i1 != rank1.end() || i2 != rank2.end())
+        {
+            if (i1 == rank1.end() || (i2 != rank2.end() && i2->first < i1->first)) {
+                // Tag present only in this set
+                res.add(i2->first, static_cast<Number>(rank2.size()) - i2->second);
+                ++i2;
+            } else if (i2 == rank2.end() || i1->first < i2->first) {
+                // Tag present only in the other set
+                res.add(i1->first, i1->second - static_cast<Number>(rank1.size()));
+                ++i1;
+            } else {
+                // Same tag on both sides
+                res.add(i1->first, i1->second - i2->second);
+                ++i1;
+                ++i2;
+            }
+        }
+        return res;
+    }
+
+    /// Return all tags, sorted by increasing metric.
+    std::vector<Tag> tagsSortedByMetrics() const
+    {
+        std::vector<Tag> res;
+        for (typename TagMetrics::const_iterator i = this->begin(); i != this->end(); ++i)
+            res.push_back(i->first);
+        std::sort(res.begin(), res.end(), MetricsOrder(*this));
+        return res;
+    }
+
+    /// Return all tags, sorted so that those whose metric is closest to
+    /// itemCount / 2 come first.
+    std::vector<Tag> tagsSortedByDiscriminance(int itemCount) const
+    {
+        std::vector<Tag> res;
+        for (typename TagMetrics::const_iterator i = this->begin(); i != this->end(); ++i)
+            res.push_back(i->first);
+        std::sort(res.begin(), res.end(), DiscriminanceOrder(*this, itemCount));
+        return res;
+    }
+
+    /// Build metrics from a collection: each tag yielded by
+    /// coll.tagBegin()..coll.tagEnd() gets the size of its associated
+    /// `second` member as its metric.
+    template<typename COLL>
+    static TagMetrics<Tag, Number> computeFromTags(const COLL& coll)
+    {
+        TagMetrics<Tag, Number> res;
+        for (typename COLL::const_tag_iterator i = coll.tagBegin();
+                i != coll.tagEnd(); ++i)
+            res.add(i->first, i->second.size());
+        return res;
+    }
+
+    /// Debugging aid: print one line per tag to out, in increasing metric
+    /// order, showing position, rank, tag name and metric value.
+    void dump(const std::string& prefix, std::ostream& out) const
+    {
+        TagMetrics<Tag, Number> rm = rankMetrics();
+        std::vector<Tag> tags = tagsSortedByMetrics();
+        int rank = 0;
+        for (typename std::vector<Tag>::const_iterator i = tags.begin(); i != tags.end(); ++i, ++rank)
+            out << prefix << tags.size() - rank << ":" << rm.get(*i) << ") " << i->fullname() << ": " << this->get(*i) << std::endl;
+    }
+};
+
+/**
+ * Output iterator that accumulates (items, tags) pairs into a metrics object.
+ *
+ * Every pair assigned to it adds the number of items in the pair to the
+ * metric of each tag in the pair, through Metrics::add().
+ */
+template<typename Metrics>
+class TagMetricsInserter : public wibble::mixin::OutputIterator< TagMetricsInserter<Metrics> >
+{
+    Metrics& m;
+
+public:
+    TagMetricsInserter(Metrics& m) : m(m) {}
+
+    template<typename Items, typename Tags>
+    TagMetricsInserter<Metrics>& operator=(const std::pair<Items, Tags>& data)
+    {
+        typedef typename Tags::const_iterator TagIter;
+
+        // Weight contributed to every tag of this pair
+        int itemCount = data.first.size();
+
+        TagIter cur = data.second.begin();
+        while (cur != data.second.end())
+        {
+            m.add(*cur, itemCount);
+            ++cur;
+        }
+        return *this;
+    }
+};
+
+/// Convenience factory: build a TagMetricsInserter feeding `out`, deducing
+/// the template arguments from the metrics object.
+template<typename Tag, typename Number>
+TagMetricsInserter< TagMetrics<Tag, Number> > tagMetricsInserter(TagMetrics<Tag, Number>& out)
+{
+    typedef TagMetricsInserter< TagMetrics<Tag, Number> > Inserter;
+    Inserter res(out);
+    return res;
+}
+
+
class SmartSearcher
{
protected:
@@ -738,13 +885,16 @@
// 2) iterare usando la nuova funzione di ranking
Ept& ept;
- coll::Simple<Package, Tag> fullColl;
+ coll::Fast<Package, Tag> fullColl;
coll::Fast<Package, Tag> coll;
std::string pattern;
std::set<Tag> wanted;
std::set<Tag> unwanted;
std::set<Tag> ignored;
+ vector<Tag> interesting;
+
+ vector<Tag> tagsInMenu;
bool patternMatch(const Package& pkg)
{
@@ -767,94 +917,14 @@
return true;
}
- struct Ranker
- {
- map<Tag, int> cardFull;
- map<Tag, int> cardFilt;
- int totalFull;
- int totalFilt;
-
- Ranker() : totalFull(0), totalFilt(0) {}
-
- void inc(std::map<Tag, int>& map, const Tag& tag)
- {
- std::map<Tag, int>::iterator i = map.find(tag);
- if (i == map.end())
- map.insert(make_pair(tag, 1));
- else
- ++i->second;
- }
- void incFull(const Tag& tag) { inc(cardFull, tag); }
- void incFilt(const Tag& tag) { inc(cardFilt, tag); }
-
-
- template<typename A, typename B>
- map<B, A> reverse(const map<A, B>& m)
- {
- // Sort by cardinality
- map<B, A> res;
- for (typename std::map<A, B>::const_iterator i = m.begin();
- i != m.end(); ++i)
- res.insert(make_pair(i->second, i->first));
- return res;
- }
-
- map<Tag, float> rank(const map<Tag, int>& cards, int tot)
- {
- // Compute ranks
- map<Tag, float> ranks;
- for (std::map<Tag, int>::const_iterator i = cards.begin();
- i != cards.end(); ++i)
- ranks[i->first] = (float)i->second / tot;
- return ranks;
- }
-
- struct relLess
- {
- bool operator()(const std::pair<Tag, float>& a, const std::pair<Tag, float>& b)
- {
- return a.second < b.second;
- }
- };
-
- set<Tag> topTags(SmartSearcher& s, size_t count = 5)
- {
- vector< pair<Tag, float> > relevance;
- for (map<Tag, int>::const_iterator i = cardFilt.begin(); i != cardFilt.end(); ++i)
- {
- float rankNew = (float)i->second / totalFilt;
- float rankOld = (float)cardFull[i->first] / totalFull;
-
- relevance.push_back(make_pair(i->first, rankNew - rankOld));
- }
-
- std::sort(relevance.begin(), relevance.end(), relLess());
-
- // Set the 5 topmost tags as 'wanted'
- // for (map<float, Tag>::const_iterator i = relevance.begin();
- // i != relevance.end(); ++i)
- // cout << "REL " << i->second.fullname() << ": " << i->first << endl;
-
- set<Tag> res;
-// for (vector< pair<Tag, float> >::const_reverse_iterator i = relevance.rbegin();
-// i != relevance.rend() && res.size() < count; ++i)
- for (int i = relevance.size() - 1; i >= 0 && res.size() < count; --i)
- if (!utils::set_contains(s.ignored, relevance[i].first) &&
- !utils::set_contains(s.wanted, relevance[i].first))
- res.insert(relevance[i].first);
- return res;
- }
- };
-
template<typename OUT>
class Filter : public wibble::mixin::OutputIterator< Filter<OUT> >
{
SmartSearcher& s;
- Ranker& r;
OUT out;
public:
- Filter(SmartSearcher& s, Ranker& r, const OUT& out) : s(s), r(r), out(out) {}
+ Filter(SmartSearcher& s, const OUT& out) : s(s), out(out) {}
template<typename ITEMS, typename TAGS>
Filter<OUT>& operator=(const std::pair<ITEMS, TAGS>& data)
@@ -864,6 +934,7 @@
{
if (!s.tagMatch(*i))
continue;
+#if 0
bool matches = s.patternMatch(*i);
for (typename TAGS::const_iterator t = data.second.begin();
t != data.second.end(); ++t)
@@ -876,7 +947,7 @@
++r.totalFilt;
}
}
-
+#endif
*out = data;
++out;
}
@@ -884,17 +955,111 @@
}
};
template<typename OUT>
- Filter<OUT> filter(Ranker& r, const OUT& out)
+ Filter<OUT> filter(const OUT& out)
+ {
+ return Filter<OUT>(*this, out);
+ }
+
+ /**
+  * Split a list of tags, sorted by increasing relevance, into automatic
+  * and user-facing choices.
+  *
+  * The lowest autoCount tags are added to `unwanted`, the highest autoCount
+  * tags are added to `wanted`, and the next userCount tags below the wanted
+  * ones replace the contents of `interesting`, most relevant first.
+  *
+  * @param tags     candidate tags, least relevant first
+  * @param maxAuto  upper bound for the tags selected automatically at each end
+  * @param maxUser  upper bound for the tags offered in `interesting`
+  */
+ void autoSelect(const std::vector<Tag>& tags, size_t maxAuto = 5, size_t maxUser = 7)
+ {
+     interesting.clear();
+     if (tags.empty())
+         return;
+
+     const size_t total = tags.size();
+
+     // Only auto-select when there are more tags than the user can be shown;
+     // the guard also keeps the unsigned subtraction from wrapping around.
+     size_t autoCount = total > maxUser ? (total - maxUser) / 2 : 0;
+     if (autoCount > maxAuto) autoCount = maxAuto;
+
+     // 2 * autoCount <= total holds here, so this cannot underflow
+     size_t userCount = total - (2 * autoCount);
+     if (userCount > maxUser) userCount = maxUser;
+
+     // Use the bottom autoCount tags as unwanted
+     for (size_t i = 0; i < autoCount; ++i)
+         unwanted.insert(tags[i]);
+
+     // Use the top autoCount tags as wanted.  The loops below count down
+     // with a one-past index so that an unsigned counter never has to go
+     // below zero.
+     for (size_t i = total; i > total - autoCount; --i)
+         wanted.insert(tags[i - 1]);
+
+     // Get the next userCount tags, right below the wanted ones, as interesting
+     const size_t userEnd = total - autoCount;
+     for (size_t i = userEnd; i > userEnd - userCount; --i)
+         interesting.push_back(tags[i - 1]);
+ }
+
+ void showTags()
{
- return Filter<OUT>(*this, r, out);
+ tagsInMenu.clear();
+ int idx = 1;
+
+ for (std::set<Tag>::const_iterator i = wanted.begin();
+ i != wanted.end(); ++i)
+ {
+ cout << idx << ") " << i->fullname() << " (wanted)" << endl;
+ tagsInMenu.push_back(*i);
+ ++idx;
+ }
+
+ for (std::set<Tag>::const_iterator i = unwanted.begin();
+ i != unwanted.end(); ++i)
+ {
+ cout << idx << ") " << i->fullname() << " (unwanted)" << endl;
+ tagsInMenu.push_back(*i);
+ ++idx;
+ }
+
+ for (std::set<Tag>::const_iterator i = ignored.begin();
+ i != ignored.end(); ++i)
+ {
+ cout << idx << ") " << i->fullname() << " (ignored)" << endl;
+ tagsInMenu.push_back(*i);
+ ++idx;
+ }
+
+ for (std::vector<Tag>::const_iterator i = interesting.begin();
+ i != interesting.end(); ++i)
+ {
+ cout << idx << ") " << i->fullname() << endl;
+ tagsInMenu.push_back(*i);
+ ++idx;
+ }
+ }
+
+ void refilter()
+ {
+ // Regenerate coll
+ coll = coll::Fast<Package, Tag>();
+ fullColl.output(filter(inserter(coll)));
+
+ // Compute the most interesting tags
+ TagMetrics<Tag, int> collMetrics = TagMetrics<Tag, int>::computeFromTags(coll);
+ vector<Tag> tags = collMetrics.tagsSortedByDiscriminance(coll.itemCount());
+
+ // Select them as interesting to be displayed
+ autoSelect(tags, 0);
}
public:
SmartSearcher(Ept& ept, const std::string& pattern) : ept(ept), pattern(pattern)
{
+ // Perform the initial filtering using the keyword search
for (Packages::iterator i = ept.packages().begin();
i != ept.packages().end(); ++i)
+ {
fullColl.insert(wibble::singleton(*i), ept.tagmap().getTagsOfItem(*i));
+ if (patternMatch(*i))
+ coll.insert(wibble::singleton(*i), ept.tagmap().getTagsOfItem(*i));
+ }
+
+ // Compute the set of tags that better represent the keyword search
+ TagMetrics<Tag, int> metrics1 = TagMetrics<Tag, int>::computeFromTags(fullColl);
+ TagMetrics<Tag, int> metrics2 = TagMetrics<Tag, int>::computeFromTags(coll);
+ TagMetrics<Tag, int> jumps = metrics2.jumpsFrom(metrics1);
+ vector<Tag> tags = jumps.tagsSortedByMetrics();
+
+ //metrics1.dump("BEF ", cout);
+ //metrics2.dump("AFT ", cout);
+ //jumps.dump("JMP ", cout);
+
+ //autoSelect(tags, 1);
+ autoSelect(tags, 0);
}
#if 0
@@ -922,50 +1087,69 @@
bool done = false;
while (!done)
{
- Ranker r;
- coll = coll::Fast<Package, Tag>();
- fullColl.output(filter(r, inserter(coll)));
- std::set<Tag> top = r.topTags(*this, 1);
-
- if (top.empty())
- {
- coll.output(PackagePrinter(ept.tagmap(), PackagePrinter::SHORT));
- done = true;
- } else {
- Tag topTag = *top.begin();
- int card = coll.getCardinality(topTag);
-
- if (!wanted.empty())
- cout << "[VERB] Wanted: " << wanted << endl;
- if (!unwanted.empty())
- cout << "[VERB] Unwanted: " << unwanted << endl;
- if (!ignored.empty())
- cout << "[VERB] Ignored: " << ignored << endl;
- cout << " * " << coll.itemCount() << " packages. Top tag: " << topTag.fullname() << " (attached to " << card << " items)" << endl << endl;
- string ans;
- bool badAnswer = true;
- do {
- badAnswer = false;
- cout << "Do you want tag " << topTag.fullname() << "? (Yes/No/Ignore/View/Done/Quit)> ";
- cin >> ans;
-
- if (ans == "Y" || ans == "y")
- wanted.insert(topTag);
- else if (ans == "N" || ans == "n")
- unwanted.insert(topTag);
- else if (ans == "I" || ans == "i")
- ignored.insert(topTag);
- else if (ans == "V" || ans == "v")
- coll.output(PackagePrinter(ept.tagmap(), PackagePrinter::SHORT));
- else if (ans == "D" || ans == "d")
+ cout << "Tag selection:" << endl;
+ showTags();
+ cout << coll.itemCount() << " packages selected so far." << endl;
+ string ans;
+ bool badAnswer = true;
+ do {
+ badAnswer = false;
+ // TODO: allow to add tags based on a keyword search on coll
+ cout << "Your choice (+#, -#, =#, View, Done, Quit): ";
+ cin >> ans;
+
+ if (ans == "")
+ badAnswer = true;
+ else if (ans[0] == '+') {
+ int idx = strtoul(ans.substr(1).c_str(), NULL, 10);
+ if (idx < 0 || idx >= tagsInMenu.size())
+ badAnswer = true;
+ else
{
- coll.output(PackagePrinter(ept.tagmap(), PackagePrinter::SHORT));
- done = true;
+ Tag tag = tagsInMenu[idx - 1];
+ cout << "Selected: " << tag.fullname() << endl;
+ wanted.insert(tag);
+ unwanted.erase(tag);
+ ignored.erase(tag);
+ refilter();
}
- else if (ans == "Q" || ans == "q")
- done = true;
- } while (badAnswer);
- }
+ } else if (ans[0] == '-') {
+ int idx = strtoul(ans.substr(1).c_str(), NULL, 10);
+ if (idx < 0 || idx >= tagsInMenu.size())
+ badAnswer = true;
+ else
+ {
+ Tag tag = tagsInMenu[idx - 1];
+ cout << "Selected: " << tag.fullname() << endl;
+ wanted.erase(tag);
+ unwanted.insert(tag);
+ ignored.erase(tag);
+ refilter();
+ }
+ } else if (ans[0] == '=') {
+ int idx = strtoul(ans.substr(1).c_str(), NULL, 10);
+ if (idx < 0 || idx >= tagsInMenu.size())
+ badAnswer = true;
+ else
+ {
+ Tag tag = tagsInMenu[idx - 1];
+ cout << "Selected: " << tag.fullname() << endl;
+ wanted.erase(tag);
+ unwanted.erase(tag);
+ ignored.insert(tag);
+ refilter();
+ }
+ } else if (ans == "V" || ans == "v") {
+ coll.output(PackagePrinter(ept.tagmap(), PackagePrinter::SHORT));
+ } else if (ans == "D" || ans == "d") {
+ coll.output(PackagePrinter(ept.tagmap(), PackagePrinter::SHORT));
+ done = true;
+ } else if (ans == "Q" || ans == "q") {
+ done = true;
+ } else
+ badAnswer = true;
+ } while (badAnswer);
+
}
}
@@ -2109,6 +2293,7 @@
}
}
+#include <ept/cache/apt/packages.tcc>
#include <ept/cache/debtags/tagmap.tcc>
#include <tagcoll/coll/fast.tcc>
More information about the Debtags-commits
mailing list