[Debtags-commits] [svn] r1984 - in daemon: . src src/lib

Enrico Zini enrico at costa.debian.org
Mon Oct 2 22:16:17 UTC 2006


Author: enrico
Date: Mon Oct  2 22:16:17 2006
New Revision: 1984

Modified:
   daemon/   (props changed)
   daemon/CMakeLists.txt
   daemon/src/CMakeLists.txt
   daemon/src/debtagsd.cpp
   daemon/src/lib/FullText.cpp
   daemon/src/lib/FullText.h
Log:
 r3415 at viaza:  enrico | 2006-09-25 18:50:24 +0200
 Added xapian search for similar packages


Modified: daemon/CMakeLists.txt
==============================================================================
--- daemon/CMakeLists.txt	(original)
+++ daemon/CMakeLists.txt	Mon Oct  2 22:16:17 2006
@@ -3,6 +3,6 @@
 #include (/usr/share/CMake/Modules/UsePkgConfig.cmake)
 include (UsePkgConfig)
 PKGCONFIG("libwibble" WIBBLE_INCLUDE_DIR WIBBLE_LIB_DIR WIBBLE_LIBS WIBBLE_CFLAGS)
-PKGCONFIG("libtagcoll" TAGCOLL_INCLUDE_DIR TAGCOLL_LIB_DIR TAGCOLL_LIBS TAGCOLL_CFLAGS)
+PKGCONFIG("libtagcoll2" TAGCOLL_INCLUDE_DIR TAGCOLL_LIB_DIR TAGCOLL_LIBS TAGCOLL_CFLAGS)
 
 ADD_SUBDIRECTORY( src )

Modified: daemon/src/CMakeLists.txt
==============================================================================
--- daemon/src/CMakeLists.txt	(original)
+++ daemon/src/CMakeLists.txt	Mon Oct  2 22:16:17 2006
@@ -4,7 +4,7 @@
 
 ADD_EXECUTABLE( debtagsd debtagsd.cpp config.cpp ${libsrc} )
 
-TARGET_LINK_LIBRARIES( debtagsd wibble tagcoll xapian z )
+TARGET_LINK_LIBRARIES( debtagsd wibble tagcoll2 xapian z )
 
 set_target_properties( debtagsd PROPERTIES COMPILE_FLAGS "${TAGCOLL_CFLAGS} ${WIBBLE_CFLAGS} -gstabs+ -Wall -O0 ")
 set_target_properties( debtagsd PROPERTIES LINK_FLAGS "-static")

Modified: daemon/src/debtagsd.cpp
==============================================================================
--- daemon/src/debtagsd.cpp	(original)
+++ daemon/src/debtagsd.cpp	Mon Oct  2 22:16:17 2006
@@ -9,6 +9,7 @@
 #include <wibble/regexp.h>
 #include <wibble/sys/childprocess.h>
 #include <wibble/sys/process.h>
+#include <wibble/operators.h>
 
 #include <tagcoll/coll/fast.h>
 #include <tagcoll/input/stdio.h>
@@ -263,6 +264,14 @@
 		conn.write(str.str());
 	}
 
+	void outputSTag(const std::string& tag, size_t card, Connection& conn)
+	{
+		// Send one reply line of the form "<tag> <card>\n" over the connection
+		std::ostringstream reply;
+		reply << tag << ' ' << card << '\n';
+		conn.write(reply.str());
+	}
+
 	/// Interact with one client
 	void handleConnection(Connection& conn)
 	{
@@ -313,8 +322,62 @@
 				for (set<string>::const_iterator i = pkgs.begin(); i != pkgs.end(); ++i)
 					subcoll.insert(wibble::singleton(*i), coll.getTagsOfItem(*i));
 				vector<string> tags = subcoll.tagsInRelevanceOrder(coll);
+				set<string> inters;
 				for (vector<string>::const_reverse_iterator i = tags.rbegin(); i != tags.rend(); ++i)
-					outputTag(*i, conn);
+				{
+					using namespace wibble::operators;
+					if (i == tags.rbegin())
+						inters = coll.getItemsHavingTag(*i);
+					else
+						inters &= coll.getItemsHavingTag(*i);
+					outputSTag(*i, inters.size(), conn);
+				}
+			}
+			else if (cmd == "SIM")
+			{
+				std::vector<std::string> pkgs = fts.similar(line);
+				for (vector<string>::const_iterator i = pkgs.begin(); i != pkgs.end(); ++i)
+					outputPackage(*i, conn);
+			}
+			else if (cmd == "SUGG")
+			{
+				// Output the list of suggested tags
+				std::set<std::string> tags = coll.getTagsOfItem(line);
+				if (!tags.empty())
+				{
+					using namespace wibble::operators;
+					map< int, set<string> > sets;
+					for (tagcoll::coll::Fast<std::string, std::string>::const_iterator i = coll.begin();
+							i != coll.end(); ++i)
+					{
+						set<string> inters = tags & i->second;
+						for (set<string>::const_iterator j = i->second.begin();
+								j != i->second.end(); ++j)
+							for (size_t k = 0; k <= inters.size(); ++k)
+								sets[k].insert(*j);
+					}
+					if (!sets.empty())
+					{
+						double minscore = 1000000;
+						map< int, set<string> >::const_iterator cand = sets.end();
+						for (map< int, set<string> >::const_iterator i = sets.begin();
+								i != sets.end(); ++i)
+						{
+							int hits = i->second.size();
+							double score = (hits-50)*(hits-50)/hits;
+							if (score < minscore)
+							{
+								minscore = score;
+								cand = i;
+							}
+						}
+
+						if (cand != sets.end())
+							for (set<string>::const_iterator j = cand->second.begin();
+									j != cand->second.end(); ++j)
+								conn.write(*j + "\n");
+					}
+				}
 			}
 			else
 				conn.write("Command not recognized: " + cmd + "\n");

Modified: daemon/src/lib/FullText.cpp
==============================================================================
--- daemon/src/lib/FullText.cpp	(original)
+++ daemon/src/lib/FullText.cpp	Mon Oct  2 22:16:17 2006
@@ -70,6 +70,44 @@
 		cfg.log() << "Finised indexing package descriptions." << std::endl;
 		//database.commit_transaction();
 	}
+
+	Xapian::docid getDocID(const std::string& pkg)
+	{
+		// Query the database for the term `pkg`, asking for at most one match; 0 means no match
+		Xapian::Enquire lookup(database);
+		lookup.set_query(Xapian::Query(pkg, 1, 1));
+		Xapian::MSet hits = lookup.get_mset(0, 1);
+		//	cfg.log() << "DOC " << hits.begin().get_document().get_data() << endl;
+		if (!hits.empty())
+			return *hits.begin();
+		return 0;
+	}
+
+#if 0 // Disabled alternative: builds the similarity query from the package's description text
+	Xapian::Query similarityQuery(const std::string& pkg) // query over pkg plus its normalised description tokens
+	{
+		using namespace std;
+		tagcoll::input::Stdio in(cfg.pkgdb); // input is the file named by cfg.pkgdb
+		DebDBParser parser(in);
+		DebDBParser::Record rec;
+		vector<string> postings;
+		postings.push_back(pkg); // the package name itself is the first term
+		cfg.log() << "SQT " << pkg << std::endl;
+		while (parser.nextRecord(rec))
+		{
+			if (rec["Package"] != pkg) // only records whose Package field equals pkg contribute terms
+				continue;
+
+			wibble::Tokenizer tok(rec["Description"], "[A-Za-z0-9_-]+", REG_EXTENDED); // split the description into runs of letters, digits, '_' and '-'
+			for (wibble::Tokenizer::const_iterator i = tok.begin(); i != tok.end(); ++i)
+			{
+				cfg.log() << "SQT " << normalise(*i) << std::endl;
+				postings.push_back(normalise(*i)); // each token is logged, then added in normalised form
+			}
+		}
+		return Xapian::Query(Xapian::Query::OP_ELITE_SET, postings.begin(), postings.end()); // combine all collected terms with OP_ELITE_SET
+	}
+#endif
 };
 
 
@@ -106,4 +144,61 @@
 	return res;
 }
 
+std::vector<std::string> FullTextSearch::similar(const std::string& pkg)
+{
+	// Collects the data of documents matching the expansion terms of pkg's document, skipping pkg itself
+	std::vector<std::string> res;
+	const Xapian::docid id = data->getDocID(pkg);
+	if (id == 0)
+		return res;
+	Xapian::Enquire enquire(data->database);
+	Xapian::RSet rset;
+	rset.add_document(id);
+	// Request up to 7 expansion terms from a relevance set holding only pkg's document
+	Xapian::ESet eset = enquire.get_eset(7, rset);
+	if (eset.empty())
+		return res;
+//	for (Xapian::ESetIterator i = eset.begin(); i != eset.end(); ++i)
+//		cfg.log() << "EI " << *i << ": " << i.get_weight() << endl;
+
+	enquire.set_query(Xapian::Query(Xapian::Query::OP_OR, eset.begin(), eset.end()));
+	//Xapian::MSet matches = enquire.get_mset(0, 10);
+	Xapian::MSet matches = enquire.get_mset(0, 100);
+	for (Xapian::MSetIterator hit = matches.begin(); hit != matches.end(); ++hit)
+	{
+		// Cut off poor results
+		if (res.size() > 20 && hit.get_percent() < 60) break;
+		//cfg.log() << hit.get_document().get_data() << ": " << hit.get_percent() << "%" << endl;
+		const std::string name = hit.get_document().get_data();
+		if (name != pkg)
+			res.push_back(name);
+	}
+	return res;
+}
+#if 0 // Disabled alternative variant of similar() that queries via data->similarityQuery()
+std::set<std::string> FullTextSearch::similar(const std::string& pkg) // returns a set, unlike the active vector-returning version
+{
+	using namespace std;
+	Xapian::Enquire enquire(data->database);
+	enquire.set_query(data->similarityQuery(pkg));
+	Xapian::MSet matches = enquire.get_mset(0, 100); // fetch at most 100 matches
+	std::set<std::string> res;
+	for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i)
+	{
+		// Cut off poor results
+		if (res.size() > 10 && i.get_percent() < 60) break; // stop once more than 10 results are kept and the match percentage is below 60
+		cfg.log() << i.get_document().get_data() << ": " << i.get_percent() << "% " << *i << endl;
+		res.insert(i.get_document().get_data());
+	}
+
+	Xapian::RSet rset;
+	rset.add_document(1863); // NOTE(review): hard-coded document id; looks like a debugging experiment — confirm
+	Xapian::ESet eset = enquire.get_eset(100, rset);
+	cfg.log() << "ES " << eset.size() << endl;
+	for (Xapian::ESetIterator i = eset.begin(); i != eset.end(); ++i)
+		cfg.log() << "EI " << *i << endl; // the expand-set terms are only logged; they do not affect res
+	return res;
+}
+#endif
+
 // vim:set ts=4 sw=4:

Modified: daemon/src/lib/FullText.h
==============================================================================
--- daemon/src/lib/FullText.h	(original)
+++ daemon/src/lib/FullText.h	Mon Oct  2 22:16:17 2006
@@ -19,6 +19,7 @@
 	~FullTextSearch();
 
 	std::set<std::string> search(const std::vector<std::string>& keys);
+	std::vector<std::string> similar(const std::string& pkg);
 };
 
 // vim:set ts=4 sw=4:



More information about the Debtags-commits mailing list