[ucto] 01/03: New upstream version 0.10

Maarten van Gompel proycon-guest at moszumanska.debian.org
Tue Nov 7 21:36:21 UTC 2017


This is an automated email from the git hooks/post-receive script.

proycon-guest pushed a commit to branch master
in repository ucto.

commit 15e2b95161ee96e0f19281fb6348139cb6ea64ce
Author: proycon <proycon at anaproy.nl>
Date:   Tue Nov 7 22:32:41 2017 +0100

    New upstream version 0.10
---
 ChangeLog               | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 NEWS                    |  4 ++++
 configure               | 38 +++++++++++++++++++-------------------
 configure.ac            |  4 ++--
 include/ucto/tokenize.h |  5 ++++-
 src/tokenize.cxx        | 43 +++++++++++++++++++++++++++++++------------
 6 files changed, 104 insertions(+), 35 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 0ba4c32..d649859 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,12 +1,55 @@
+2017-11-07  Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* NEWS: new NEWS prior to yar (yet another release)
+
 2017-11-06  Ko van der Sloot <K.vanderSloot at let.ru.nl>
 
-	* NEWS: typo
+	* : commit 49f6aa8d553e988c92654a828f96be925d4cf52f Author: Ko van
+	der Sloot <K.vanderSloot at let.ru.nl> Date:   Mon Nov 6 17:24:11 2017
+	+0100
 
 2017-11-06  Ko van der Sloot <K.vanderSloot at let.ru.nl>
 
 	* NEWS, configure.ac, src/Makefile.am: Minor fix: bumped the .so
 	version to 3.0.0
 
+2017-11-02  Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* configure.ac, include/ucto/tokenize.h, src/tokenize.cxx: some
+	refactoring
+
+2017-10-24  Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* tests/test.de.tok.V: German test result is changed (improved)
+	after fix in NOSPACE handling
+
+2017-10-24  Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* src/tokenize.cxx: fixed a problem with NOSPACE inside recursive
+	rules
+
+2017-10-23  Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* tests/smileys.nl.tok.V: added a test
+
+2017-10-23  Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* src/tokenize.cxx, tests/smileys.nl.txt: added detection of
+	UBLOCK_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS
+
+2017-10-23  Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* src/tokenize.cxx: when adding 'extra' text, skip <br> nodes. They
+	don't take <t>
+
+2017-10-23  Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* configure.ac: bumped version after release
+
+2017-10-23  Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+	* NEWS: NEWS
+
 2017-10-23  Ko van der Sloot <K.vanderSloot at let.ru.nl>
 
 	* tests/testutt, tests/testutt.ok, tests/utt2.xml: added anothrer
diff --git a/NEWS b/NEWS
index c560f85..856467a 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,7 @@
+0.10 2017-11-07
+[Ko van der Sloot]
+New release due to outdated files in the previous release.
+
 0.9.9 2017-11-06
 [Ko van der Sloot]
 Minor fix:
diff --git a/configure b/configure
index 453becb..8b9e3c1 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for ucto 0.9.9.
+# Generated by GNU Autoconf 2.69 for ucto 0.10.
 #
 # Report bugs to <lamasoftware at science.ru.nl>.
 #
@@ -590,8 +590,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='ucto'
 PACKAGE_TARNAME='ucto'
-PACKAGE_VERSION='0.9.9'
-PACKAGE_STRING='ucto 0.9.9'
+PACKAGE_VERSION='0.10'
+PACKAGE_STRING='ucto 0.10'
 PACKAGE_BUGREPORT='lamasoftware at science.ru.nl'
 PACKAGE_URL=''
 
@@ -1363,7 +1363,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures ucto 0.9.9 to adapt to many kinds of systems.
+\`configure' configures ucto 0.10 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1434,7 +1434,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of ucto 0.9.9:";;
+     short | recursive ) echo "Configuration of ucto 0.10:";;
    esac
   cat <<\_ACEOF
 
@@ -1568,7 +1568,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-ucto configure 0.9.9
+ucto configure 0.10
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2188,7 +2188,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by ucto $as_me 0.9.9, which was
+It was created by ucto $as_me 0.10, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -3051,7 +3051,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='ucto'
- VERSION='0.9.9'
+ VERSION='0.10'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -16847,12 +16847,12 @@ if test -n "$ticcutils_CFLAGS"; then
     pkg_cv_ticcutils_CFLAGS="$ticcutils_CFLAGS"
  elif test -n "$PKG_CONFIG"; then
     if test -n "$PKG_CONFIG" && \
-    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ticcutils >= 0.6 \""; } >&5
-  ($PKG_CONFIG --exists --print-errors "ticcutils >= 0.6 ") 2>&5
+    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ticcutils >= 0.17 \""; } >&5
+  ($PKG_CONFIG --exists --print-errors "ticcutils >= 0.17 ") 2>&5
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; then
-  pkg_cv_ticcutils_CFLAGS=`$PKG_CONFIG --cflags "ticcutils >= 0.6 " 2>/dev/null`
+  pkg_cv_ticcutils_CFLAGS=`$PKG_CONFIG --cflags "ticcutils >= 0.17 " 2>/dev/null`
 		      test "x$?" != "x0" && pkg_failed=yes
 else
   pkg_failed=yes
@@ -16864,12 +16864,12 @@ if test -n "$ticcutils_LIBS"; then
     pkg_cv_ticcutils_LIBS="$ticcutils_LIBS"
  elif test -n "$PKG_CONFIG"; then
     if test -n "$PKG_CONFIG" && \
-    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ticcutils >= 0.6 \""; } >&5
-  ($PKG_CONFIG --exists --print-errors "ticcutils >= 0.6 ") 2>&5
+    { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ticcutils >= 0.17 \""; } >&5
+  ($PKG_CONFIG --exists --print-errors "ticcutils >= 0.17 ") 2>&5
   ac_status=$?
   $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
   test $ac_status = 0; }; then
-  pkg_cv_ticcutils_LIBS=`$PKG_CONFIG --libs "ticcutils >= 0.6 " 2>/dev/null`
+  pkg_cv_ticcutils_LIBS=`$PKG_CONFIG --libs "ticcutils >= 0.17 " 2>/dev/null`
 		      test "x$?" != "x0" && pkg_failed=yes
 else
   pkg_failed=yes
@@ -16890,14 +16890,14 @@ else
         _pkg_short_errors_supported=no
 fi
         if test $_pkg_short_errors_supported = yes; then
-	        ticcutils_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "ticcutils >= 0.6 " 2>&1`
+	        ticcutils_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "ticcutils >= 0.17 " 2>&1`
         else
-	        ticcutils_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "ticcutils >= 0.6 " 2>&1`
+	        ticcutils_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "ticcutils >= 0.17 " 2>&1`
         fi
 	# Put the nasty error message in config.log where it belongs
 	echo "$ticcutils_PKG_ERRORS" >&5
 
-	as_fn_error $? "Package requirements (ticcutils >= 0.6 ) were not met:
+	as_fn_error $? "Package requirements (ticcutils >= 0.17 ) were not met:
 
 $ticcutils_PKG_ERRORS
 
@@ -17637,7 +17637,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by ucto $as_me 0.9.9, which was
+This file was extended by ucto $as_me 0.10, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -17703,7 +17703,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-ucto config.status 0.9.9
+ucto config.status 0.10
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.ac b/configure.ac
index a745d40..1808674 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,7 @@
 # Process this file with autoconf to produce a configure script.
 
 AC_PREREQ(2.59)
-AC_INIT([ucto], [0.9.9], [lamasoftware at science.ru.nl])
+AC_INIT([ucto], [0.10], [lamasoftware at science.ru.nl])
 AM_INIT_AUTOMAKE([foreign])
 AC_CONFIG_SRCDIR([configure.ac])
 AC_CONFIG_MACRO_DIR([m4])
@@ -90,7 +90,7 @@ PKG_CHECK_MODULES([folia], [folia >= 1.10] )
 CXXFLAGS="$folia_CFLAGS $CXXFLAGS"
 LIBS="$folia_LIBS $LIBS"
 
-PKG_CHECK_MODULES([ticcutils], [ticcutils >= 0.6] )
+PKG_CHECK_MODULES([ticcutils], [ticcutils >= 0.17] )
 CXXFLAGS="$CXXFLAGS $ticcutils_CFLAGS"
 LIBS="$LIBS $ticcutils_LIBS"
 
diff --git a/include/ucto/tokenize.h b/include/ucto/tokenize.h
index da70e1b..f3a0682 100644
--- a/include/ucto/tokenize.h
+++ b/include/ucto/tokenize.h
@@ -28,12 +28,15 @@
 #define UCTO_TOKENIZE_H
 
 #include <vector>
+#include <set>
 #include <map>
 #include <sstream>
 #include <stdexcept>
+#include "libfolia/folia.h"
 #include "ucto/unicode.h"
 #include "ucto/setting.h"
 #include "ticcutils/LogStream.h"
+#include "ticcutils/Unicode.h"
 
 class TextCat;
 
@@ -215,7 +218,7 @@ namespace Tokenizer {
     void setLanguage( const std::string& l ){ default_language = l; };
 
     // set eos marker
-    UnicodeString setEosMarker( const std::string& s = "<utt>") { UnicodeString t = eosmark; eosmark =  folia::UTF8ToUnicode(s); return t; };
+    UnicodeString setEosMarker( const std::string& s = "<utt>") { UnicodeString t = eosmark; eosmark = TiCC::UnicodeFromUTF8(s); return t; };
     UnicodeString getEosMarker( ) const { return eosmark; }
 
     bool setNormSet( const std::string& );
diff --git a/src/tokenize.cxx b/src/tokenize.cxx
index 274e1b8..f8c0bdb 100644
--- a/src/tokenize.cxx
+++ b/src/tokenize.cxx
@@ -25,6 +25,8 @@
 
 */
 
+#include "ucto/tokenize.h"
+
 #include <unistd.h>
 #include <iostream>
 #include <fstream>
@@ -33,11 +35,9 @@
 #include "unicode/schriter.h"
 #include "ticcutils/StringOps.h"
 #include "ticcutils/PrettyPrint.h"
-#include "libfolia/folia.h"
+#include "ticcutils/Unicode.h"
 #include "ucto/unicode.h"
 #include "ucto/textcat.h"
-#include "ucto/setting.h"
-#include "ucto/tokenize.h"
 
 #define DO_READLINE
 #ifdef HAVE_LIBREADLINE
@@ -111,6 +111,7 @@ namespace Tokenizer {
   const UnicodeString type_space = "SPACE";
   const UnicodeString type_currency = "CURRENCY";
   const UnicodeString type_emoticon = "EMOTICON";
+  const UnicodeString type_picto = "PICTOGRAM";
   const UnicodeString type_word = "WORD";
   const UnicodeString type_symbol = "SYMBOL";
   const UnicodeString type_punctuation = "PUNCTUATION";
@@ -120,7 +121,9 @@ namespace Tokenizer {
   Token::Token( const UnicodeString& _type,
 		const UnicodeString& _s,
 		TokenRole _role, const string& _lc ):
-    type(_type), us(_s), role(_role), lc(_lc) {}
+    type(_type), us(_s), role(_role), lc(_lc) {
+    //    cerr << "Created " << *this << endl;
+  }
 
 
   std::string Token::texttostring() { return folia::UnicodeToUTF8(us); }
@@ -640,6 +643,10 @@ namespace Tokenizer {
       // there is already text, bail out.
       return;
     }
+    if ( root->isSubClass( folia::Linebreak_t ) ){
+      // exception
+      return;
+    }
     UnicodeString utxt = root->text( outputclass, false, false );
     // so get Untokenized text from the children, and set it
     root->settext( folia::UnicodeToUTF8(utxt), outputclass );
@@ -1769,7 +1776,7 @@ namespace Tokenizer {
 	  }
 	  if ( doPunctFilter
 	       && ( type == type_punctuation || type == type_currency ||
-		    type == type_emoticon ) ) {
+		    type == type_emoticon || type == type_picto ) ) {
 	    if (tokDebug >= 2 ){
 	      LOG << "   [passThruLine] skipped PUNCTUATION ["
 			      << input << "]" << endl;
@@ -1832,7 +1839,7 @@ namespace Tokenizer {
 	}
 	if ( doPunctFilter
 	     && ( type == type_punctuation || type == type_currency ||
-		  type == type_emoticon ) ) {
+		  type == type_emoticon || type == type_picto ) ) {
 	  if (tokDebug >= 2 ){
 	    LOG << "   [passThruLine] skipped PUNCTUATION ["
 			    << input << "]" << endl;
@@ -1908,6 +1915,11 @@ namespace Tokenizer {
     return s == UBLOCK_EMOTICONS;
   }
 
+  bool u_ispicto( UChar32 c ){
+    UBlockCode s = ublock_getCode(c);
+    return s == UBLOCK_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS ;
+  }
+
   bool u_iscurrency( UChar32 c ){
     return u_charType( c ) == U_CURRENCY_SYMBOL;
   }
@@ -1932,6 +1944,9 @@ namespace Tokenizer {
     else if ( u_isemo( c ) ) {
       return type_emoticon;
     }
+    else if ( u_ispicto( c ) ) {
+      return type_picto;
+    }
     else if ( u_isalpha(c)) {
       return type_word;
     }
@@ -2223,12 +2238,13 @@ namespace Tokenizer {
     if ( tokDebug > 2 ){
       if ( recurse ){
 	LOG << "   [tokenizeWord] Recurse Input: (" << inpLen << ") "
-			<< "word=[" << input << "], type=" << assigned_type << endl;
+	    << "word=[" << input << "], type=" << assigned_type
+	    << " Space=" << (space?"TRUE":"FALSE") << endl;
       }
       else {
 	LOG << "   [tokenizeWord] Input: (" << inpLen << ") "
-			<< "word=[" << input << "]" << endl;
-      }
+	    << "word=[" << input << "]"
+	    << " Space=" << (space?"TRUE":"FALSE") << endl;      }
     }
     if ( input == eosmark ) {
       if (tokDebug >= 2){
@@ -2255,7 +2271,7 @@ namespace Tokenizer {
       }
       if ( doPunctFilter
 	   && ( type == type_punctuation || type == type_currency ||
-		type == type_emoticon ) ) {
+		type == type_emoticon || type == type_picto ) ) {
 	if (tokDebug >= 2 ){
 	  LOG << "   [tokenizeWord] skipped PUNCTUATION ["
 			  << input << "]" << endl;
@@ -2336,8 +2352,8 @@ namespace Tokenizer {
 	    }
 	    for ( int m=0; m < max; ++m ){
 	      if ( tokDebug >= 4 ){
-		LOG << "\tTOKEN match[" << m << "] = "
-				<< matches[m] << endl;
+		LOG << "\tTOKEN match[" << m << "] = " << matches[m]
+		    << " Space=" << (space?"TRUE":"FALSE") << endl;
 	      }
 	      if ( doPunctFilter
 		   && (&rule->id)->startsWith("PUNCTUATION") ){
@@ -2355,6 +2371,9 @@ namespace Tokenizer {
 		if ( post.length() > 0 ) {
 		  internal_space = false;
 		}
+		else if ( m < max-1 ){
+		  internal_space = false;
+		}
 		UnicodeString word = matches[m];
 		if ( norm_set.find( type ) != norm_set.end() ){
 		  word = "{{" + type + "}}";

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/ucto.git



More information about the debian-science-commits mailing list