[ucto] 01/03: New upstream version 0.10
Maarten van Gompel
proycon-guest at moszumanska.debian.org
Tue Nov 7 21:36:21 UTC 2017
This is an automated email from the git hooks/post-receive script.
proycon-guest pushed a commit to branch master
in repository ucto.
commit 15e2b95161ee96e0f19281fb6348139cb6ea64ce
Author: proycon <proycon at anaproy.nl>
Date: Tue Nov 7 22:32:41 2017 +0100
New upstream version 0.10
---
ChangeLog | 45 ++++++++++++++++++++++++++++++++++++++++++++-
NEWS | 4 ++++
configure | 38 +++++++++++++++++++-------------------
configure.ac | 4 ++--
include/ucto/tokenize.h | 5 ++++-
src/tokenize.cxx | 43 +++++++++++++++++++++++++++++++------------
6 files changed, 104 insertions(+), 35 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 0ba4c32..d649859 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,12 +1,55 @@
+2017-11-07 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * NEWS: new NEWS prior to yar (yet another release)
+
2017-11-06 Ko van der Sloot <K.vanderSloot at let.ru.nl>
- * NEWS: typo
+ * : commit 49f6aa8d553e988c92654a828f96be925d4cf52f Author: Ko van
+ der Sloot <K.vanderSloot at let.ru.nl> Date: Mon Nov 6 17:24:11 2017
+ +0100
2017-11-06 Ko van der Sloot <K.vanderSloot at let.ru.nl>
* NEWS, configure.ac, src/Makefile.am: Minor fix: bumped the .so
version to 3.0.0
+2017-11-02 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * configure.ac, include/ucto/tokenize.h, src/tokenize.cxx: some
+ refactoring
+
+2017-10-24 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * tests/test.de.tok.V: German test result is changed (improved)
+ after fix in NOSPACE handling
+
+2017-10-24 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/tokenize.cxx: fixed a problem with NOSPACE inside recursive
+ rules
+
+2017-10-23 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * tests/smileys.nl.tok.V: added a test
+
+2017-10-23 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/tokenize.cxx, tests/smileys.nl.txt: added detection of
+ UBLOCK_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS
+
+2017-10-23 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * src/tokenize.cxx: when adding 'extra' text, skip <br> nodes. They
+ don't take <t>
+
+2017-10-23 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * configure.ac: bumped version after release
+
+2017-10-23 Ko van der Sloot <K.vanderSloot at let.ru.nl>
+
+ * NEWS: NEWS
+
2017-10-23 Ko van der Sloot <K.vanderSloot at let.ru.nl>
* tests/testutt, tests/testutt.ok, tests/utt2.xml: added anothrer
diff --git a/NEWS b/NEWS
index c560f85..856467a 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,7 @@
+0.10 2017-11-07
+[Ko van der Sloot]
+New release due to outdated files in the previous release.
+
0.9.9 2017-11-06
[Ko van der Sloot]
Minor fix:
diff --git a/configure b/configure
index 453becb..8b9e3c1 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for ucto 0.9.9.
+# Generated by GNU Autoconf 2.69 for ucto 0.10.
#
# Report bugs to <lamasoftware at science.ru.nl>.
#
@@ -590,8 +590,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='ucto'
PACKAGE_TARNAME='ucto'
-PACKAGE_VERSION='0.9.9'
-PACKAGE_STRING='ucto 0.9.9'
+PACKAGE_VERSION='0.10'
+PACKAGE_STRING='ucto 0.10'
PACKAGE_BUGREPORT='lamasoftware at science.ru.nl'
PACKAGE_URL=''
@@ -1363,7 +1363,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures ucto 0.9.9 to adapt to many kinds of systems.
+\`configure' configures ucto 0.10 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1434,7 +1434,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of ucto 0.9.9:";;
+ short | recursive ) echo "Configuration of ucto 0.10:";;
esac
cat <<\_ACEOF
@@ -1568,7 +1568,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-ucto configure 0.9.9
+ucto configure 0.10
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2188,7 +2188,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by ucto $as_me 0.9.9, which was
+It was created by ucto $as_me 0.10, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -3051,7 +3051,7 @@ fi
# Define the identity of the package.
PACKAGE='ucto'
- VERSION='0.9.9'
+ VERSION='0.10'
cat >>confdefs.h <<_ACEOF
@@ -16847,12 +16847,12 @@ if test -n "$ticcutils_CFLAGS"; then
pkg_cv_ticcutils_CFLAGS="$ticcutils_CFLAGS"
elif test -n "$PKG_CONFIG"; then
if test -n "$PKG_CONFIG" && \
- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ticcutils >= 0.6 \""; } >&5
- ($PKG_CONFIG --exists --print-errors "ticcutils >= 0.6 ") 2>&5
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ticcutils >= 0.17 \""; } >&5
+ ($PKG_CONFIG --exists --print-errors "ticcutils >= 0.17 ") 2>&5
ac_status=$?
$as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
test $ac_status = 0; }; then
- pkg_cv_ticcutils_CFLAGS=`$PKG_CONFIG --cflags "ticcutils >= 0.6 " 2>/dev/null`
+ pkg_cv_ticcutils_CFLAGS=`$PKG_CONFIG --cflags "ticcutils >= 0.17 " 2>/dev/null`
test "x$?" != "x0" && pkg_failed=yes
else
pkg_failed=yes
@@ -16864,12 +16864,12 @@ if test -n "$ticcutils_LIBS"; then
pkg_cv_ticcutils_LIBS="$ticcutils_LIBS"
elif test -n "$PKG_CONFIG"; then
if test -n "$PKG_CONFIG" && \
- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ticcutils >= 0.6 \""; } >&5
- ($PKG_CONFIG --exists --print-errors "ticcutils >= 0.6 ") 2>&5
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"ticcutils >= 0.17 \""; } >&5
+ ($PKG_CONFIG --exists --print-errors "ticcutils >= 0.17 ") 2>&5
ac_status=$?
$as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
test $ac_status = 0; }; then
- pkg_cv_ticcutils_LIBS=`$PKG_CONFIG --libs "ticcutils >= 0.6 " 2>/dev/null`
+ pkg_cv_ticcutils_LIBS=`$PKG_CONFIG --libs "ticcutils >= 0.17 " 2>/dev/null`
test "x$?" != "x0" && pkg_failed=yes
else
pkg_failed=yes
@@ -16890,14 +16890,14 @@ else
_pkg_short_errors_supported=no
fi
if test $_pkg_short_errors_supported = yes; then
- ticcutils_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "ticcutils >= 0.6 " 2>&1`
+ ticcutils_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "ticcutils >= 0.17 " 2>&1`
else
- ticcutils_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "ticcutils >= 0.6 " 2>&1`
+ ticcutils_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "ticcutils >= 0.17 " 2>&1`
fi
# Put the nasty error message in config.log where it belongs
echo "$ticcutils_PKG_ERRORS" >&5
- as_fn_error $? "Package requirements (ticcutils >= 0.6 ) were not met:
+ as_fn_error $? "Package requirements (ticcutils >= 0.17 ) were not met:
$ticcutils_PKG_ERRORS
@@ -17637,7 +17637,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by ucto $as_me 0.9.9, which was
+This file was extended by ucto $as_me 0.10, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -17703,7 +17703,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-ucto config.status 0.9.9
+ucto config.status 0.10
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
diff --git a/configure.ac b/configure.ac
index a745d40..1808674 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,7 +2,7 @@
# Process this file with autoconf to produce a configure script.
AC_PREREQ(2.59)
-AC_INIT([ucto], [0.9.9], [lamasoftware at science.ru.nl])
+AC_INIT([ucto], [0.10], [lamasoftware at science.ru.nl])
AM_INIT_AUTOMAKE([foreign])
AC_CONFIG_SRCDIR([configure.ac])
AC_CONFIG_MACRO_DIR([m4])
@@ -90,7 +90,7 @@ PKG_CHECK_MODULES([folia], [folia >= 1.10] )
CXXFLAGS="$folia_CFLAGS $CXXFLAGS"
LIBS="$folia_LIBS $LIBS"
-PKG_CHECK_MODULES([ticcutils], [ticcutils >= 0.6] )
+PKG_CHECK_MODULES([ticcutils], [ticcutils >= 0.17] )
CXXFLAGS="$CXXFLAGS $ticcutils_CFLAGS"
LIBS="$LIBS $ticcutils_LIBS"
diff --git a/include/ucto/tokenize.h b/include/ucto/tokenize.h
index da70e1b..f3a0682 100644
--- a/include/ucto/tokenize.h
+++ b/include/ucto/tokenize.h
@@ -28,12 +28,15 @@
#define UCTO_TOKENIZE_H
#include <vector>
+#include <set>
#include <map>
#include <sstream>
#include <stdexcept>
+#include "libfolia/folia.h"
#include "ucto/unicode.h"
#include "ucto/setting.h"
#include "ticcutils/LogStream.h"
+#include "ticcutils/Unicode.h"
class TextCat;
@@ -215,7 +218,7 @@ namespace Tokenizer {
void setLanguage( const std::string& l ){ default_language = l; };
// set eos marker
- UnicodeString setEosMarker( const std::string& s = "<utt>") { UnicodeString t = eosmark; eosmark = folia::UTF8ToUnicode(s); return t; };
+ UnicodeString setEosMarker( const std::string& s = "<utt>") { UnicodeString t = eosmark; eosmark = TiCC::UnicodeFromUTF8(s); return t; };
UnicodeString getEosMarker( ) const { return eosmark; }
bool setNormSet( const std::string& );
diff --git a/src/tokenize.cxx b/src/tokenize.cxx
index 274e1b8..f8c0bdb 100644
--- a/src/tokenize.cxx
+++ b/src/tokenize.cxx
@@ -25,6 +25,8 @@
*/
+#include "ucto/tokenize.h"
+
#include <unistd.h>
#include <iostream>
#include <fstream>
@@ -33,11 +35,9 @@
#include "unicode/schriter.h"
#include "ticcutils/StringOps.h"
#include "ticcutils/PrettyPrint.h"
-#include "libfolia/folia.h"
+#include "ticcutils/Unicode.h"
#include "ucto/unicode.h"
#include "ucto/textcat.h"
-#include "ucto/setting.h"
-#include "ucto/tokenize.h"
#define DO_READLINE
#ifdef HAVE_LIBREADLINE
@@ -111,6 +111,7 @@ namespace Tokenizer {
const UnicodeString type_space = "SPACE";
const UnicodeString type_currency = "CURRENCY";
const UnicodeString type_emoticon = "EMOTICON";
+ const UnicodeString type_picto = "PICTOGRAM";
const UnicodeString type_word = "WORD";
const UnicodeString type_symbol = "SYMBOL";
const UnicodeString type_punctuation = "PUNCTUATION";
@@ -120,7 +121,9 @@ namespace Tokenizer {
Token::Token( const UnicodeString& _type,
const UnicodeString& _s,
TokenRole _role, const string& _lc ):
- type(_type), us(_s), role(_role), lc(_lc) {}
+ type(_type), us(_s), role(_role), lc(_lc) {
+ // cerr << "Created " << *this << endl;
+ }
std::string Token::texttostring() { return folia::UnicodeToUTF8(us); }
@@ -640,6 +643,10 @@ namespace Tokenizer {
// there is already text, bail out.
return;
}
+ if ( root->isSubClass( folia::Linebreak_t ) ){
+ // exception
+ return;
+ }
UnicodeString utxt = root->text( outputclass, false, false );
// so get Untokenized text from the children, and set it
root->settext( folia::UnicodeToUTF8(utxt), outputclass );
@@ -1769,7 +1776,7 @@ namespace Tokenizer {
}
if ( doPunctFilter
&& ( type == type_punctuation || type == type_currency ||
- type == type_emoticon ) ) {
+ type == type_emoticon || type == type_picto ) ) {
if (tokDebug >= 2 ){
LOG << " [passThruLine] skipped PUNCTUATION ["
<< input << "]" << endl;
@@ -1832,7 +1839,7 @@ namespace Tokenizer {
}
if ( doPunctFilter
&& ( type == type_punctuation || type == type_currency ||
- type == type_emoticon ) ) {
+ type == type_emoticon || type == type_picto ) ) {
if (tokDebug >= 2 ){
LOG << " [passThruLine] skipped PUNCTUATION ["
<< input << "]" << endl;
@@ -1908,6 +1915,11 @@ namespace Tokenizer {
return s == UBLOCK_EMOTICONS;
}
+ bool u_ispicto( UChar32 c ){
+ UBlockCode s = ublock_getCode(c);
+ return s == UBLOCK_MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS ;
+ }
+
bool u_iscurrency( UChar32 c ){
return u_charType( c ) == U_CURRENCY_SYMBOL;
}
@@ -1932,6 +1944,9 @@ namespace Tokenizer {
else if ( u_isemo( c ) ) {
return type_emoticon;
}
+ else if ( u_ispicto( c ) ) {
+ return type_picto;
+ }
else if ( u_isalpha(c)) {
return type_word;
}
@@ -2223,12 +2238,13 @@ namespace Tokenizer {
if ( tokDebug > 2 ){
if ( recurse ){
LOG << " [tokenizeWord] Recurse Input: (" << inpLen << ") "
- << "word=[" << input << "], type=" << assigned_type << endl;
+ << "word=[" << input << "], type=" << assigned_type
+ << " Space=" << (space?"TRUE":"FALSE") << endl;
}
else {
LOG << " [tokenizeWord] Input: (" << inpLen << ") "
- << "word=[" << input << "]" << endl;
- }
+ << "word=[" << input << "]"
+ << " Space=" << (space?"TRUE":"FALSE") << endl; }
}
if ( input == eosmark ) {
if (tokDebug >= 2){
@@ -2255,7 +2271,7 @@ namespace Tokenizer {
}
if ( doPunctFilter
&& ( type == type_punctuation || type == type_currency ||
- type == type_emoticon ) ) {
+ type == type_emoticon || type == type_picto ) ) {
if (tokDebug >= 2 ){
LOG << " [tokenizeWord] skipped PUNCTUATION ["
<< input << "]" << endl;
@@ -2336,8 +2352,8 @@ namespace Tokenizer {
}
for ( int m=0; m < max; ++m ){
if ( tokDebug >= 4 ){
- LOG << "\tTOKEN match[" << m << "] = "
- << matches[m] << endl;
+ LOG << "\tTOKEN match[" << m << "] = " << matches[m]
+ << " Space=" << (space?"TRUE":"FALSE") << endl;
}
if ( doPunctFilter
&& (&rule->id)->startsWith("PUNCTUATION") ){
@@ -2355,6 +2371,9 @@ namespace Tokenizer {
if ( post.length() > 0 ) {
internal_space = false;
}
+ else if ( m < max-1 ){
+ internal_space = false;
+ }
UnicodeString word = matches[m];
if ( norm_set.find( type ) != norm_set.end() ){
word = "{{" + type + "}}";
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/ucto.git
More information about the debian-science-commits mailing list