[Debtags-commits] [svn] r1984 - in daemon: . src src/lib
Enrico Zini
enrico at costa.debian.org
Mon Oct 2 22:16:17 UTC 2006
Author: enrico
Date: Mon Oct 2 22:16:17 2006
New Revision: 1984
Modified:
daemon/ (props changed)
daemon/CMakeLists.txt
daemon/src/CMakeLists.txt
daemon/src/debtagsd.cpp
daemon/src/lib/FullText.cpp
daemon/src/lib/FullText.h
Log:
r3415 at viaza: enrico | 2006-09-25 18:50:24 +0200
Added xapian search for similar packages
Modified: daemon/CMakeLists.txt
==============================================================================
--- daemon/CMakeLists.txt (original)
+++ daemon/CMakeLists.txt Mon Oct 2 22:16:17 2006
@@ -3,6 +3,6 @@
#include (/usr/share/CMake/Modules/UsePkgConfig.cmake)
include (UsePkgConfig)
PKGCONFIG("libwibble" WIBBLE_INCLUDE_DIR WIBBLE_LIB_DIR WIBBLE_LIBS WIBBLE_CFLAGS)
-PKGCONFIG("libtagcoll" TAGCOLL_INCLUDE_DIR TAGCOLL_LIB_DIR TAGCOLL_LIBS TAGCOLL_CFLAGS)
+PKGCONFIG("libtagcoll2" TAGCOLL_INCLUDE_DIR TAGCOLL_LIB_DIR TAGCOLL_LIBS TAGCOLL_CFLAGS)
ADD_SUBDIRECTORY( src )
Modified: daemon/src/CMakeLists.txt
==============================================================================
--- daemon/src/CMakeLists.txt (original)
+++ daemon/src/CMakeLists.txt Mon Oct 2 22:16:17 2006
@@ -4,7 +4,7 @@
ADD_EXECUTABLE( debtagsd debtagsd.cpp config.cpp ${libsrc} )
-TARGET_LINK_LIBRARIES( debtagsd wibble tagcoll xapian z )
+TARGET_LINK_LIBRARIES( debtagsd wibble tagcoll2 xapian z )
set_target_properties( debtagsd PROPERTIES COMPILE_FLAGS "${TAGCOLL_CFLAGS} ${WIBBLE_CFLAGS} -gstabs+ -Wall -O0 ")
set_target_properties( debtagsd PROPERTIES LINK_FLAGS "-static")
Modified: daemon/src/debtagsd.cpp
==============================================================================
--- daemon/src/debtagsd.cpp (original)
+++ daemon/src/debtagsd.cpp Mon Oct 2 22:16:17 2006
@@ -9,6 +9,7 @@
#include <wibble/regexp.h>
#include <wibble/sys/childprocess.h>
#include <wibble/sys/process.h>
+#include <wibble/operators.h>
#include <tagcoll/coll/fast.h>
#include <tagcoll/input/stdio.h>
@@ -263,6 +264,14 @@
conn.write(str.str());
}
+ /// Send one line of the form "<tag> <card>" to the client
+ void outputSTag(const std::string& tag, size_t card, Connection& conn)
+ {
+     // Format the whole line first, so it reaches the connection in a
+     // single write
+     std::ostringstream line;
+     line << tag;
+     line << " " << card;
+     line << "\n";
+     conn.write(line.str());
+ }
+
/// Interact with one client
void handleConnection(Connection& conn)
{
@@ -313,8 +322,62 @@
for (set<string>::const_iterator i = pkgs.begin(); i != pkgs.end(); ++i)
subcoll.insert(wibble::singleton(*i), coll.getTagsOfItem(*i));
vector<string> tags = subcoll.tagsInRelevanceOrder(coll);
+ set<string> inters;
for (vector<string>::const_reverse_iterator i = tags.rbegin(); i != tags.rend(); ++i)
- outputTag(*i, conn);
+ {
+ using namespace wibble::operators;
+ if (i == tags.rbegin())
+ inters = coll.getItemsHavingTag(*i);
+ else
+ inters &= coll.getItemsHavingTag(*i);
+ outputSTag(*i, inters.size(), conn);
+ }
+ }
+ else if (cmd == "SIM")
+ {
+ std::vector<std::string> pkgs = fts.similar(line);
+ for (vector<string>::const_iterator i = pkgs.begin(); i != pkgs.end(); ++i)
+ outputPackage(*i, conn);
+ }
+ else if (cmd == "SUGG")
+ {
+     // Output the list of suggested tags for the package named in `line`
+     std::set<std::string> tags = coll.getTagsOfItem(line);
+     if (!tags.empty())
+     {
+         using namespace wibble::operators;
+
+         // i->second is intersected with the package's tags below, so it
+         // is treated as the tag set of one item of the collection.
+         // sets[k] accumulates the tags of every item that has at least
+         // k tags in common with the requested package.
+         map< int, set<string> > sets;
+         for (tagcoll::coll::Fast<std::string, std::string>::const_iterator i = coll.begin();
+                 i != coll.end(); ++i)
+         {
+             set<string> inters = tags & i->second;
+             for (set<string>::const_iterator j = i->second.begin();
+                     j != i->second.end(); ++j)
+                 for (size_t k = 0; k <= inters.size(); ++k)
+                     sets[k].insert(*j);
+         }
+         if (!sets.empty())
+         {
+             // Choose the set with the lowest score. The score is 0 for a
+             // set of exactly 50 tags and grows as the size moves away
+             // from 50; sets scoring 1000000 or more are never chosen.
+             double minscore = 1000000;
+             map< int, set<string> >::const_iterator cand = sets.end();
+             for (map< int, set<string> >::const_iterator i = sets.begin();
+                     i != sets.end(); ++i)
+             {
+                 // Entries of `sets` are only ever created by inserting a
+                 // tag, so hits is at least 1 and the division is safe.
+                 // The score is computed in floating point: with integer
+                 // arithmetic the quotient truncates to 0 for every size
+                 // near 50, making those candidates indistinguishable.
+                 double hits = i->second.size();
+                 double score = (hits-50)*(hits-50)/hits;
+                 if (score < minscore)
+                 {
+                     minscore = score;
+                     cand = i;
+                 }
+             }
+
+             // Send the tags of the chosen set, one per line
+             if (cand != sets.end())
+                 for (set<string>::const_iterator j = cand->second.begin();
+                         j != cand->second.end(); ++j)
+                     conn.write(*j + "\n");
+         }
+     }
+ }
else
conn.write("Command not recognized: " + cmd + "\n");
Modified: daemon/src/lib/FullText.cpp
==============================================================================
--- daemon/src/lib/FullText.cpp (original)
+++ daemon/src/lib/FullText.cpp Mon Oct 2 22:16:17 2006
@@ -70,6 +70,44 @@
cfg.log() << "Finised indexing package descriptions." << std::endl;
//database.commit_transaction();
}
+
+ /// Return the id of the top document matching the term `pkg`, or 0 when
+ /// the database has no match for it
+ Xapian::docid getDocID(const std::string& pkg)
+ {
+     Xapian::Enquire lookup(database);
+     lookup.set_query(Xapian::Query(pkg, 1, 1));
+
+     // Only the first match is of interest
+     Xapian::MSet hits = lookup.get_mset(0, 1);
+     if (!hits.empty())
+         return *hits.begin();
+     return 0;
+ }
+
+#if 0 // Disabled: alternative that builds a similarity query from the words of the package's description
+ Xapian::Query similarityQuery(const std::string& pkg)
+ {
+ using namespace std;
+ tagcoll::input::Stdio in(cfg.pkgdb); // Input is opened from cfg.pkgdb and parsed record by record
+ DebDBParser parser(in);
+ DebDBParser::Record rec;
+ vector<string> postings;
+ postings.push_back(pkg); // The package name itself is the first query term
+ cfg.log() << "SQT " << pkg << std::endl;
+ while (parser.nextRecord(rec))
+ {
+ if (rec["Package"] != pkg) // Only records whose Package field equals pkg contribute terms
+ continue;
+
+ wibble::Tokenizer tok(rec["Description"], "[A-Za-z0-9_-]+", REG_EXTENDED); // Split the Description field into word-like tokens
+ for (wibble::Tokenizer::const_iterator i = tok.begin(); i != tok.end(); ++i)
+ {
+ cfg.log() << "SQT " << normalise(*i) << std::endl;
+ postings.push_back(normalise(*i)); // Each normalised token becomes one more query term
+ }
+ }
+ return Xapian::Query(Xapian::Query::OP_ELITE_SET, postings.begin(), postings.end()); // Combine all collected terms with OP_ELITE_SET
+ }
+#endif
};
@@ -106,4 +144,61 @@
return res;
}
+/**
+ * Find packages similar to `pkg`.
+ *
+ * The document of `pkg` is used as the only relevant document to obtain up
+ * to 7 expansion terms; the database is then searched for documents
+ * matching any of those terms.
+ *
+ * @param pkg  the package to find neighbours for
+ * @return the document data of the matches, in the order returned by the
+ *         search and without entries equal to `pkg`. The data is compared
+ *         against `pkg`, so it presumably holds the package name. The
+ *         result is empty when `pkg` has no document or no expansion
+ *         terms are found.
+ */
+std::vector<std::string> FullTextSearch::similar(const std::string& pkg)
+{
+    using namespace std;
+    vector<string> res;
+
+    // getDocID returns 0 when the package is not in the index
+    Xapian::docid id = data->getDocID(pkg);
+    if (id == 0)
+        return res;
+
+    // Ask for the expansion terms of the package's own document
+    Xapian::Enquire enquire(data->database);
+    Xapian::RSet rset;
+    rset.add_document(id);
+    Xapian::ESet eset = enquire.get_eset(7, rset);
+    if (eset.empty())
+        return res;
+
+    // Search for documents matching any of the expansion terms
+    Xapian::Query query(Xapian::Query::OP_OR, eset.begin(), eset.end());
+    enquire.set_query(query);
+    Xapian::MSet matches = enquire.get_mset(0, 100);
+    for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i)
+    {
+        // Cut off poor results: once more than 20 results are held, stop
+        // at the first match scoring below 60%
+        if (res.size() > 20 && i.get_percent() < 60) break;
+
+        // Fetch the document data once per match instead of once for the
+        // comparison and again for the insertion
+        const string name = i.get_document().get_data();
+        if (name != pkg)
+            res.push_back(name);
+    }
+    return res;
+}
+#if 0 // Disabled: alternative implementation of similar() that searches with similarityQuery() and logs debug output
+std::set<std::string> FullTextSearch::similar(const std::string& pkg)
+{
+ using namespace std;
+ Xapian::Enquire enquire(data->database);
+ enquire.set_query(data->similarityQuery(pkg));
+ Xapian::MSet matches = enquire.get_mset(0, 100); // Consider at most 100 matches
+ std::set<std::string> res;
+ for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i)
+ {
+ // Cut off poor results
+ if (res.size() > 10 && i.get_percent() < 60) break; // Stop at the first match below 60% once more than 10 results are held
+ cfg.log() << i.get_document().get_data() << ": " << i.get_percent() << "% " << *i << endl;
+ res.insert(i.get_document().get_data());
+ }
+
+ Xapian::RSet rset;
+ rset.add_document(1863); // NOTE(review): hard-coded document id; looks like a debugging experiment, confirm before re-enabling
+ Xapian::ESet eset = enquire.get_eset(100, rset);
+ cfg.log() << "ES " << eset.size() << endl;
+ for (Xapian::ESetIterator i = eset.begin(); i != eset.end(); ++i)
+ cfg.log() << "EI " << *i << endl; // Log every expansion term; the terms are not used for the result
+ return res;
+}
+#endif
+
// vim:set ts=4 sw=4:
Modified: daemon/src/lib/FullText.h
==============================================================================
--- daemon/src/lib/FullText.h (original)
+++ daemon/src/lib/FullText.h Mon Oct 2 22:16:17 2006
@@ -19,6 +19,7 @@
~FullTextSearch();
std::set<std::string> search(const std::vector<std::string>& keys);
+ std::vector<std::string> similar(const std::string& pkg);
};
// vim:set ts=4 sw=4:
More information about the Debtags-commits
mailing list