[Python-modules-commits] [python-jellyfish] 01/06: import python-jellyfish_0.5.6.orig.tar.gz

Diego M. Rodriguez diegom-guest at moszumanska.debian.org
Thu Sep 15 15:33:33 UTC 2016


This is an automated email from the git hooks/post-receive script.

diegom-guest pushed a commit to branch master
in repository python-jellyfish.

commit ea4a1ce384845f8bf6cec0ad6db58ec01b975c96
Author: Diego M. Rodriguez <diego.plan9 at gmail.com>
Date:   Thu Sep 15 16:15:29 2016 +0200

    import python-jellyfish_0.5.6.orig.tar.gz
---
 LICENSE                                 |    25 +
 MANIFEST.in                             |     2 +
 PKG-INFO                                |    85 +
 README.rst                              |    66 +
 cjellyfish/damerau_levenshtein.c        |    86 +
 cjellyfish/hamming.c                    |    25 +
 cjellyfish/jaro.c                       |   145 +
 cjellyfish/jellyfish.h                  |    40 +
 cjellyfish/jellyfishmodule.c            |   441 +
 cjellyfish/levenshtein.c                |    47 +
 cjellyfish/metaphone.c                  |   197 +
 cjellyfish/mra.c                        |   115 +
 cjellyfish/nysiis.c                     |   191 +
 cjellyfish/porter.c                     |   395 +
 cjellyfish/soundex.c                    |    70 +
 docs/Makefile                           |   177 +
 docs/changelog.rst                      |    91 +
 docs/comparison.rst                     |    77 +
 docs/conf.py                            |   259 +
 docs/index.rst                          |    33 +
 docs/phonetic.rst                       |    62 +
 docs/stemming.rst                       |    15 +
 jellyfish.egg-info/PKG-INFO             |    85 +
 jellyfish.egg-info/SOURCES.txt          |    43 +
 jellyfish.egg-info/dependency_links.txt |     1 +
 jellyfish.egg-info/top_level.txt        |     1 +
 jellyfish/__init__.py                   |     4 +
 jellyfish/_jellyfish.py                 |   488 +
 jellyfish/compat.py                     |    13 +
 jellyfish/porter.py                     |   218 +
 jellyfish/test.py                       |   213 +
 setup.cfg                               |     5 +
 setup.py                                |   124 +
 testdata/README.md                      |     1 +
 testdata/damerau_levenshtein.csv        |     9 +
 testdata/hamming.csv                    |     8 +
 testdata/jaro_distance.csv              |     4 +
 testdata/jaro_winkler.csv               |     7 +
 testdata/levenshtein.csv                |     6 +
 testdata/match_rating_codex.csv         |     9 +
 testdata/match_rating_comparison.csv    |     6 +
 testdata/metaphone.csv                  |    28 +
 testdata/nysiis.csv                     |    33 +
 testdata/porter.csv                     | 23531 ++++++++++++++++++++++++++++++
 testdata/soundex.csv                    |     9 +
 45 files changed, 27490 insertions(+)

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..b563a37
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,25 @@
+Copyright (c) 2015, James Turk
+Copyright (c) 2015, Sunlight Foundation
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimer in the documentation
+      and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..c7c6f27
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+include LICENSE *.rst *.py cjellyfish/*.c cjellyfish/*.h docs/* testdata/*
+global-exclude .git
diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..cf4b132
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,85 @@
+Metadata-Version: 1.1
+Name: jellyfish
+Version: 0.5.6
+Summary: a library for doing approximate and phonetic matching of strings.
+Home-page: http://github.com/jamesturk/jellyfish
+Author: UNKNOWN
+Author-email: UNKNOWN
+License: UNKNOWN
+Description: =========
+        jellyfish
+        =========
+        
+        .. image:: https://travis-ci.org/jamesturk/jellyfish.svg?branch=master
+            :target: https://travis-ci.org/jamesturk/jellyfish
+        
+        .. image:: https://coveralls.io/repos/jamesturk/jellyfish/badge.png?branch=master
+            :target: https://coveralls.io/r/jamesturk/jellyfish
+        
+        .. image:: https://img.shields.io/pypi/v/jellyfish.svg
+            :target: https://pypi.python.org/pypi/jellyfish
+        
+        .. image:: https://readthedocs.org/projects/jellyfish/badge/?version=latest
+            :target: https://readthedocs.org/projects/jellyfish/?badge=latest
+            :alt: Documentation Status
+        
+        .. image:: https://ci.appveyor.com/api/projects/status/t5o03rqcusxhhe41/branch/master?svg=true
+            :target: https://ci.appveyor.com/project/jamesturk/jellyfish/
+        
+        Jellyfish is a python library for doing approximate and phonetic matching of strings.
+        
+        Written by James Turk <james.p.turk at gmail.com> and Michael Stephens.
+        
+        See https://github.com/jamesturk/jellyfish/graphs/contributors for contributors.
+        
+        Source is available at http://github.com/jamesturk/jellyfish.
+        
+        Included Algorithms
+        ===================
+        
+        String comparison:
+        
+          * Levenshtein Distance
+          * Damerau-Levenshtein Distance
+          * Jaro Distance
+          * Jaro-Winkler Distance
+          * Match Rating Approach Comparison
+          * Hamming Distance
+        
+        Phonetic encoding:
+        
+          * American Soundex
+          * Metaphone
+          * NYSIIS (New York State Identification and Intelligence System)
+          * Match Rating Codex
+        
+        Example Usage
+        =============
+        
+        >>> import jellyfish
+        >>> jellyfish.levenshtein_distance(u'jellyfish', u'smellyfish')
+        2
+        >>> jellyfish.jaro_distance(u'jellyfish', u'smellyfish')
+        0.89629629629629637
+        >>> jellyfish.damerau_levenshtein_distance(u'jellyfish', u'jellyfihs')
+        1
+        
+        >>> jellyfish.metaphone(u'Jellyfish')
+        'JLFX'
+        >>> jellyfish.soundex(u'Jellyfish')
+        'J412'
+        >>> jellyfish.nysiis(u'Jellyfish')
+        'JALYF'
+        >>> jellyfish.match_rating_codex(u'Jellyfish')
+        'JLLFSH'
+        
+Platform: any
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 2.7
+Classifier: Programming Language :: Python :: 3.3
+Classifier: Programming Language :: Python :: 3.4
+Classifier: Topic :: Text Processing :: Linguistic
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..41ffcd9
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,66 @@
+=========
+jellyfish
+=========
+
+.. image:: https://travis-ci.org/jamesturk/jellyfish.svg?branch=master
+    :target: https://travis-ci.org/jamesturk/jellyfish
+
+.. image:: https://coveralls.io/repos/jamesturk/jellyfish/badge.png?branch=master
+    :target: https://coveralls.io/r/jamesturk/jellyfish
+
+.. image:: https://img.shields.io/pypi/v/jellyfish.svg
+    :target: https://pypi.python.org/pypi/jellyfish
+
+.. image:: https://readthedocs.org/projects/jellyfish/badge/?version=latest
+    :target: https://readthedocs.org/projects/jellyfish/?badge=latest
+    :alt: Documentation Status
+
+.. image:: https://ci.appveyor.com/api/projects/status/t5o03rqcusxhhe41/branch/master?svg=true
+    :target: https://ci.appveyor.com/project/jamesturk/jellyfish/
+
+Jellyfish is a python library for doing approximate and phonetic matching of strings.
+
+Written by James Turk <james.p.turk at gmail.com> and Michael Stephens.
+
+See https://github.com/jamesturk/jellyfish/graphs/contributors for contributors.
+
+Source is available at http://github.com/jamesturk/jellyfish.
+
+Included Algorithms
+===================
+
+String comparison:
+
+  * Levenshtein Distance
+  * Damerau-Levenshtein Distance
+  * Jaro Distance
+  * Jaro-Winkler Distance
+  * Match Rating Approach Comparison
+  * Hamming Distance
+
+Phonetic encoding:
+
+  * American Soundex
+  * Metaphone
+  * NYSIIS (New York State Identification and Intelligence System)
+  * Match Rating Codex
+
+Example Usage
+=============
+
+>>> import jellyfish
+>>> jellyfish.levenshtein_distance(u'jellyfish', u'smellyfish')
+2
+>>> jellyfish.jaro_distance(u'jellyfish', u'smellyfish')
+0.89629629629629637
+>>> jellyfish.damerau_levenshtein_distance(u'jellyfish', u'jellyfihs')
+1
+
+>>> jellyfish.metaphone(u'Jellyfish')
+'JLFX'
+>>> jellyfish.soundex(u'Jellyfish')
+'J412'
+>>> jellyfish.nysiis(u'Jellyfish')
+'JALYF'
+>>> jellyfish.match_rating_codex(u'Jellyfish')
+'JLLFSH'
diff --git a/cjellyfish/damerau_levenshtein.c b/cjellyfish/damerau_levenshtein.c
new file mode 100644
index 0000000..43f5a43
--- /dev/null
+++ b/cjellyfish/damerau_levenshtein.c
@@ -0,0 +1,86 @@
+#include "jellyfish.h"
+#include <string.h>
+#include <stdio.h>
+#include <wchar.h>
+
+
+int damerau_levenshtein_distance(const JFISH_UNICODE *s1, const JFISH_UNICODE *s2, size_t len1, size_t len2)
+{   // Damerau-Levenshtein distance of s1/s2; returns -1 on allocation failure, -2 if a code unit is >= 256.
+    size_t infinite = len1 + len2;   // sentinel written into the outermost row and column of dist
+    size_t cols = len2 + 2;          // row stride of the flattened (len1+2) x (len2+2) matrix
+
+    size_t i, j, i1, j1;
+    size_t db;                       // column of the latest match found in the current row (0 = none yet)
+    size_t d1, d2, d3, d4, result;
+    size_t da_idx;
+    unsigned short cost;
+
+    size_t *dist = NULL;
+
+    const size_t len_da = 256;       // da only has slots for code units 0..255
+    size_t *da = calloc(len_da, sizeof(size_t));   // da[c]: last 1-based row of s1 holding c; 0 = not seen
+    if (!da) {
+        return -1;
+    }
+
+    dist = malloc((len1 + 2) * cols * sizeof(size_t));   // NOTE(review): product is not checked for overflow
+    if (!dist) {
+        free(da);
+        return -1;
+    }
+
+    dist[0] = infinite;
+
+    for (i = 0; i <= len1; i++) {
+        dist[((i + 1) * cols) + 0] = infinite;   // column 0 of every row: sentinel
+        dist[((i + 1) * cols) + 1] = i;          // column 1: row index i
+    }
+
+    for (i = 0; i <= len2; i++) {
+        dist[i + 1] = infinite;       // row 0 (0*cols + column): sentinel
+        dist[cols + i + 1] = i;       // row 1 (1*cols + column): column index i
+    }
+
+    for (i = 1; i <= len1; i++) {
+        db = 0;
+        for (j = 1; j <= len2; j++) {
+            da_idx = (JFISH_UNICODE)s2[j-1];
+            if (da_idx >= len_da) {   // code unit does not fit the 256-entry table
+                free(dist);
+                free(da);
+                return -2;
+            }
+            i1 = da[da_idx];          // last row in which s1 contained s2[j-1]
+            j1 = db;                  // last column in this row where the characters matched
+
+            if (s1[i - 1] == s2[j - 1]) {
+                cost = 0;
+                db = j;
+            } else {
+                cost = 1;
+            }
+
+            d1 = dist[(i * cols) + j] + cost;         // diagonal neighbour plus match/substitution cost
+            d2 = dist[((i + 1) * cols) + j] + 1;      // neighbour to the left, plus one edit
+            d3 = dist[(i * cols) + j + 1] + 1;        // neighbour above, plus one edit
+            d4 = dist[(i1 * cols) + j1] + (i - i1 - 1) + 1 + (j - j1 - 1);   // transposition reaching back to (i1, j1)
+
+            dist[((i+1)*cols) + j + 1] = MIN(MIN(d1, d2), MIN(d3, d4));
+        }
+
+        da_idx = (JFISH_UNICODE)s1[i-1];
+        if (da_idx >= len_da) {       // same 256-entry limit applies to s1
+            free(dist);
+            free(da);
+            return -2;
+        }
+        da[da_idx] = i;               // remember this row as the latest occurrence of s1[i-1]
+    }
+
+    result = dist[((len1+1) * cols) + len2 + 1];   // bottom-right cell of the matrix
+
+    free(dist);
+    free(da);
+
+    return result;   // NOTE(review): size_t is narrowed to int here
+}
diff --git a/cjellyfish/hamming.c b/cjellyfish/hamming.c
new file mode 100644
index 0000000..d48540e
--- /dev/null
+++ b/cjellyfish/hamming.c
@@ -0,0 +1,25 @@
+#include "jellyfish.h"
+#include <ctype.h>
+
+size_t hamming_distance(const Py_UNICODE *s1, int len1,   // differing positions plus the length difference
+                        const Py_UNICODE *s2, int len2) {  // NOTE(review): jellyfish.h declares these as JFISH_UNICODE
+    unsigned distance = 0;
+    int i1 = 0;
+    int i2 = 0;
+
+    for (; i1 < len1 && i2 < len2; i1++, i2++, s1++, s2++) {   // compare over the shared length
+        if (*s1 != *s2) {
+            distance++;
+        }
+    }
+
+    for ( ; i1 < len1; i1++, s1++) {   // each leftover element of s1 counts as one difference
+        distance++;
+    }
+
+    for ( ; i2 < len2; i2++, s2++) {   // likewise for leftover elements of s2
+        distance++;
+    }
+
+    return distance;
+}
diff --git a/cjellyfish/jaro.c b/cjellyfish/jaro.c
new file mode 100644
index 0000000..7f02928
--- /dev/null
+++ b/cjellyfish/jaro.c
@@ -0,0 +1,145 @@
+#include <ctype.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "jellyfish.h"
+
+#define NOTNUM(c)   ((c>57) || (c<48))
+
+/* borrowed heavily from strcmp95.c
+ *    http://www.census.gov/geo/msb/stand/strcmp.c
+ */
+double _jaro_winkler(const JFISH_UNICODE *ying, int ying_length,
+                     const JFISH_UNICODE *yang, int yang_length,
+                     int long_tolerance, int winklerize)
+{   // Jaro weight of ying/yang, Winkler-adjusted when winklerize is set; 0 for an empty input, -100 on allocation failure.
+    /* Arguments:
+
+       ying
+       yang
+         pointers to the 2 strings to be compared.
+
+       long_tolerance
+         Increase the probability of a match when the number of matched
+         characters is large.  This option allows for a little more
+         tolerance when the strings are large.  It is not an appropriate
+         test when comparing fixed length fields such as phone and
+         social security numbers.
+    */
+    JFISH_UNICODE *ying_flag=0, *yang_flag=0;   // per-position "already matched" markers
+
+    double weight;
+
+    long min_len;
+    long search_range;
+    long lowlim, hilim;
+    long trans_count, common_chars;
+
+    int i, j, k;
+
+    // ensure that neither string is blank
+    if (!ying_length || !yang_length) return 0;
+
+    search_range = min_len = (ying_length > yang_length) ? ying_length : yang_length;   // NOTE(review): min_len receives the LARGER length here, despite its name
+
+    // Blank out the flags
+    ying_flag = calloc((ying_length + 1), sizeof(JFISH_UNICODE));
+    if (!ying_flag) {
+        return -100;   // error sentinel, well outside the normal result range
+    }
+
+    yang_flag = calloc((yang_length + 1), sizeof(JFISH_UNICODE));
+    if (!yang_flag) {
+        free(ying_flag);
+        return -100;
+    }
+
+    search_range = (search_range/2) - 1;   // half the longer length, minus one
+    if (search_range < 0) search_range = 0;
+
+
+    // Looking only within the search range, count and flag the matched pairs.
+    common_chars = 0;
+    for (i = 0; i < ying_length; i++) {
+        lowlim = (i >= search_range) ? i - search_range : 0;   // clamp the window to the start of yang
+        hilim = (i + search_range <= yang_length-1) ? (i + search_range) : yang_length-1;   // and to its end
+        for (j = lowlim; j <= hilim; j++)  {
+            if (!yang_flag[j] && yang[j] == ying[i]) {   // first unflagged equal character in the window
+                yang_flag[j] = 1;
+                ying_flag[i] = 1;
+                common_chars++;
+                break;
+            }
+        }
+    }
+
+    // If no characters in common - return
+    if (!common_chars) {
+        free(ying_flag);
+        free(yang_flag);
+        return 0;
+    }
+
+    // Count the number of transpositions
+    k = trans_count = 0;
+    for (i = 0; i < ying_length; i++) {
+        if (ying_flag[i]) {
+            for (j = k; j < yang_length; j++) {   // advance to the next flagged position in yang
+                if (yang_flag[j]) {
+                    k = j + 1;
+                    break;
+                }
+            }
+            if (ying[i] != yang[j]) {   // matched characters that appear in a different order
+                trans_count++;
+            }
+        }
+    }
+    trans_count /= 2;   // each out-of-order pair was counted twice
+
+    // adjust for similarities in nonmatched characters
+
+    // Main weight computation.
+    weight= common_chars / ((double) ying_length) + common_chars / ((double) yang_length)
+        + ((double) (common_chars - trans_count)) / ((double) common_chars);
+    weight /=  3.0;
+
+    // Continue to boost the weight if the strings are similar
+    if (winklerize && weight > 0.7 && ying_length > 3 && yang_length > 3) {
+
+        // Adjust for having up to the first 4 characters in common
+        j = (min_len >= 4) ? 4 : min_len;
+        for (i=0; ((i<j) && (ying[i] == yang[i]) && (NOTNUM(ying[i]))); i++);   // i = length of the shared leading run outside code points 48..57
+        if (i) {
+            weight += i * 0.1 * (1.0 - weight);
+        }
+
+        /* Optionally adjust for long strings. */
+        /* After agreeing beginning chars, at least two more must agree and
+           the agreeing characters must be > .5 of remaining characters.
+        */
+        if ((long_tolerance) && (min_len>4) && (common_chars>i+1) && (2*common_chars>=min_len+i)) {   // NOTE(review): min_len here is the larger length (see assignment above)
+            if (NOTNUM(ying[0])) {
+                weight += (double) (1.0-weight) *
+                    ((double) (common_chars-i-1) / ((double) (ying_length+yang_length-i*2+2)));
+            }
+        }
+    }
+
+    free(ying_flag);
+    free(yang_flag);
+    return weight;
+}
+
+
+double jaro_winkler(const JFISH_UNICODE *ying, int ying_len,
+        const JFISH_UNICODE *yang, int yang_len,
+        int long_tolerance)
+{   // Public wrapper: Jaro weight with the Winkler adjustment enabled.
+    return _jaro_winkler(ying, ying_len, yang, yang_len, long_tolerance, 1);   // last argument: winklerize = 1
+}
+
+double jaro_distance(const JFISH_UNICODE *ying, int ying_len, const JFISH_UNICODE *yang, int yang_len)
+{   // Public wrapper: plain Jaro weight, without the Winkler adjustment.
+    return _jaro_winkler(ying, ying_len, yang, yang_len, 0, 0);   // long_tolerance = 0, winklerize = 0
+}
diff --git a/cjellyfish/jellyfish.h b/cjellyfish/jellyfish.h
new file mode 100644
index 0000000..3d50d14
--- /dev/null
+++ b/cjellyfish/jellyfish.h
@@ -0,0 +1,40 @@
+#ifndef _JELLYFISH_H_
+#define _JELLYFISH_H_
+
+#include <stdlib.h>
+
+#if CJELLYFISH_PYTHON
+#include <Python.h>
+#define JFISH_UNICODE Py_UNICODE
+#endif
+
+#ifndef MIN
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+double jaro_winkler(const JFISH_UNICODE *str1, int len1, const JFISH_UNICODE *str2, int len2, int long_tolerance);
+double jaro_distance(const JFISH_UNICODE *str1, int len1, const JFISH_UNICODE *str2, int len2);
+
+size_t hamming_distance(const JFISH_UNICODE *str1, int len1,
+        const JFISH_UNICODE *str2, int len2);
+
+int levenshtein_distance(const JFISH_UNICODE *str1, int len1, const JFISH_UNICODE *str2, int len2);
+
+int damerau_levenshtein_distance(const JFISH_UNICODE *str1, const JFISH_UNICODE *str2,
+        size_t len1, size_t len2);
+
+char* soundex(const char *str);
+
+char* metaphone(const char *str);
+
+JFISH_UNICODE *nysiis(const JFISH_UNICODE *str, int len);
+
+JFISH_UNICODE* match_rating_codex(const JFISH_UNICODE *str, size_t len);
+int match_rating_comparison(const JFISH_UNICODE *str1, size_t len1, const JFISH_UNICODE *str2, size_t len2);
+
+struct stemmer;
+extern struct stemmer * create_stemmer(void);
+extern void free_stemmer(struct stemmer * z);
+extern int stem(struct stemmer * z, JFISH_UNICODE * b, int k);
+
+#endif
diff --git a/cjellyfish/jellyfishmodule.c b/cjellyfish/jellyfishmodule.c
new file mode 100644
index 0000000..5c60429
--- /dev/null
+++ b/cjellyfish/jellyfishmodule.c
@@ -0,0 +1,441 @@
+#include <Python.h>
+#include <math.h>
+#include "jellyfish.h"
+
+struct jellyfish_state {   // state reached through GETSTATE(m)
+    PyObject *unicodedata_normalize;   // callable invoked by normalize() as f("NFKD", str); presumably unicodedata.normalize - assigned outside this view
+};
+
+#if PY_MAJOR_VERSION >= 3
+#define GETSTATE(m) ((struct jellyfish_state*)PyModule_GetState(m))
+#else
+#define GETSTATE(m) (&_state)
+static struct jellyfish_state _state;
+#endif
+
+#ifdef _MSC_VER
+#define INLINE __inline
+#else
+#define INLINE inline
+#endif
+
+#if PY_MAJOR_VERSION >= 3
+#define UTF8_BYTES(s) (PyBytes_AS_STRING(s))
+#define NO_BYTES_ERR_STR "expected str, got bytes"
+#else
+#define UTF8_BYTES(s) (PyString_AS_STRING(s))
+#define NO_BYTES_ERR_STR "expected unicode, got str"
+#endif
+
+#define UNSUPPORTED_CODEPOINT "Encountered unsupported code point in string."
+
+
+/* Returns a new reference to a PyString (python < 3) or
+ * PyBytes (python >= 3.0).
+ *
+ * If passed a PyUnicode, the returned object will be NFKD UTF-8.
+ * If passed a PyString or PyBytes no conversion is done.
+ */
+static INLINE PyObject* normalize(PyObject *mod, const Py_UNICODE *pystr) {   // NFKD-normalise pystr, then encode to UTF-8; NULL on failure
+    PyObject *unicodedata_normalize;
+    PyObject *normalized;
+    PyObject *utf8;
+
+    unicodedata_normalize = GETSTATE(mod)->unicodedata_normalize;   // callable kept in the module state
+    normalized = PyObject_CallFunction(unicodedata_normalize,
+                                       "su", "NFKD", pystr);   // NOTE(review): "u" carries no length, so pystr is presumably read up to its first NUL - confirm
+    if (!normalized) {
+        return NULL;
+    }
+    utf8 = PyUnicode_AsUTF8String(normalized);   // result may be NULL; it is returned to the caller as-is
+    Py_DECREF(normalized);
+    return utf8;
+}
+
+static PyObject * jellyfish_jaro_winkler(PyObject *self, PyObject *args, PyObject *kw)
+{   // Python entry point: jaro_winkler(s1, s2, long_tolerance=0) -> float
+    const Py_UNICODE *s1, *s2;
+    int len1, len2;
+    double result;
+    int long_tolerance = 0;   // default when the optional argument is omitted
+    static char *keywords[] = {"s1", "s2", "long_tolerance", NULL};
+
+    if (!PyArg_ParseTupleAndKeywords(args, kw, "u#u#|i", keywords, &s1, &len1, &s2, &len2, &long_tolerance)) {
+        PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR);   // every parse failure is reported with this one message
+        return NULL;
+    }
+
+    result = jaro_winkler(s1, len1, s2, len2, long_tolerance);
+    // jaro returns a big negative number on error, don't use
+    // 0 here in case there's floating point inaccuracy
+    // .. used to use NaN but different compilers (*cough*MSVC*cough)
+    // handle it really poorly
+    if (result < -1) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    return Py_BuildValue("d", result);
+}
+
+static PyObject * jellyfish_jaro_distance(PyObject *self, PyObject *args)
+{   // Python entry point: jaro_distance(s1, s2) -> float
+    const Py_UNICODE *s1, *s2;
+    int len1, len2;
+    double result;
+
+    if (!PyArg_ParseTuple(args, "u#u#", &s1, &len1, &s2, &len2)) {
+        PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR);   // every parse failure is reported with this one message
+        return NULL;
+    }
+
+    result = jaro_distance(s1, len1, s2, len2);
+    // a result below -1 is the error sentinel of the C routine and is
+    // reported to Python as MemoryError
+    if (result < -1) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    return Py_BuildValue("d", result);
+}
+
+static PyObject * jellyfish_hamming_distance(PyObject *self, PyObject *args)
+{   // Python entry point: hamming_distance(s1, s2) -> int
+    const Py_UNICODE *s1, *s2;
+    int len1, len2;
+    unsigned result;   // hamming_distance() is declared to return size_t; stored here as unsigned
+
+    if (!PyArg_ParseTuple(args, "u#u#", &s1, &len1, &s2, &len2)) {
+        PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR);   // every parse failure is reported with this one message
+        return NULL;
+    }
+
+    result = hamming_distance(s1, len1, s2, len2);
+
+    return Py_BuildValue("I", result);   // "I": build from an unsigned int
+}
+
+static PyObject* jellyfish_levenshtein_distance(PyObject *self, PyObject *args)
+{   // Python entry point: levenshtein_distance(s1, s2) -> int
+    const Py_UNICODE *s1, *s2;
+    int len1, len2;
+    int result;
+
+    if (!PyArg_ParseTuple(args, "u#u#", &s1, &len1, &s2, &len2)) {
+        PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR);   // every parse failure is reported with this one message
+        return NULL;
+    }
+
+    result = levenshtein_distance(s1, len1, s2, len2);
+    if (result == -1) {
+        // levenshtein_distance only returns failure code (-1) on
+        // failed malloc
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    return Py_BuildValue("i", result);
+}
+
+
+static PyObject* jellyfish_damerau_levenshtein_distance(PyObject *self,
+                                                        PyObject *args)
+{   // Python entry point: damerau_levenshtein_distance(s1, s2) -> int
+    Py_UNICODE *s1, *s2;
+    int len1, len2;
+    int result;
+
+    if (!PyArg_ParseTuple(args, "u#u#", &s1, &len1, &s2, &len2)) {
+        PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR);   // every parse failure is reported with this one message
+        return NULL;
+    }
+
+    result = damerau_levenshtein_distance(s1, s2, len1, len2);   // note the argument order: both strings, then both lengths
+    if (result == -1) {   // -1 is treated as an allocation failure
+        PyErr_NoMemory();
+        return NULL;
+    }
+    else if (result == -2) {   // -2 is treated as an unsupported code point
+        PyErr_SetString(PyExc_ValueError, UNSUPPORTED_CODEPOINT);
+        return NULL;
+    }
+
+    return Py_BuildValue("i", result);
+}
+
+static PyObject* jellyfish_soundex(PyObject *self, PyObject *args)
+{   // Python entry point: soundex(string) -> str, computed on the NFKD/UTF-8 form of the input
+    const Py_UNICODE *str;
+    int len;
+    PyObject *normalized;
+    PyObject* ret;
+    char *result;
+
+    if (!PyArg_ParseTuple(args, "u#", &str, &len)) {
+        PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR);   // every parse failure is reported with this one message
+        return NULL;
+    }
+
+    normalized = normalize(self, str);   // new reference holding the UTF-8 bytes; len is not passed on
+    if (!normalized) {
+        return NULL;
+    }
+
+    result = soundex(UTF8_BYTES(normalized));
+    Py_DECREF(normalized);   // the byte buffer is not used after this point
+
+    if (!result) {
+        // soundex only fails on bad malloc
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    ret = Py_BuildValue("s", result);
+    free(result);   // the buffer returned by soundex() is released here with free()
+
+    return ret;
+}
+
+static PyObject* jellyfish_metaphone(PyObject *self, PyObject *args)
+{   // Python entry point: metaphone(string) -> str, computed on the NFKD/UTF-8 form of the input
+    const Py_UNICODE *str;
+    int len;
+    PyObject *normalized;
+    PyObject *ret;
+    char *result;
+
+    if (!PyArg_ParseTuple(args, "u#", &str, &len)) {
+        PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR);   // every parse failure is reported with this one message
+        return NULL;
+    }
+
+    normalized = normalize(self, str);   // new reference holding the UTF-8 bytes; len is not passed on
+    if (!normalized) {
+        return NULL;
+    }
+
+    result = metaphone((const char*)UTF8_BYTES(normalized));
+    Py_DECREF(normalized);   // the byte buffer is not used after this point
+
+    if (!result) {
+        // metaphone only fails on bad malloc
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    ret = Py_BuildValue("s", result);
+    free(result);   // the buffer returned by metaphone() is released here with free()
+
+    return ret;
+}
+
+static PyObject* jellyfish_match_rating_codex(PyObject *self, PyObject *args)
+{   // Python entry point: match_rating_codex(string) -> unicode string
+    const Py_UNICODE *str;
+    int len;
+    Py_UNICODE *result;
+    PyObject *ret;
+
+    if (!PyArg_ParseTuple(args, "u#", &str, &len)) {
+        PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR);   // every parse failure is reported with this one message
+        return NULL;
+    }
+
+    result = match_rating_codex(str, len);
+    if (!result) {   // a NULL result is reported as MemoryError
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    ret = Py_BuildValue("u", result);   // "u" takes no length, so result is read as a NUL-terminated buffer
+    free(result);   // the buffer returned by match_rating_codex() is released here with free()
+
+    return ret;
+}
+
+static PyObject* jellyfish_match_rating_comparison(PyObject *self,
+                                                   PyObject *args)
+{   // Python entry point: match_rating_comparison(s1, s2) -> True, False or None
+    const Py_UNICODE *str1, *str2;
+    int len1, len2;
+    int result;
+
+    if (!PyArg_ParseTuple(args, "u#u#", &str1, &len1, &str2, &len2)) {
+        PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR);   // every parse failure is reported with this one message
+        return NULL;
+    }
+
+    result = match_rating_comparison(str1, len1, str2, len2);
+
+    if (result == -1) {   // -1 maps to None
+        Py_RETURN_NONE;
+    } else if (result) {   // any other non-zero value maps to True
+        Py_RETURN_TRUE;
+    } else {               // zero maps to False
+        Py_RETURN_FALSE;
+    }
+}
+
+static PyObject* jellyfish_nysiis(PyObject *self, PyObject *args)
+{   // Python entry point: nysiis(string) -> unicode string
+    const Py_UNICODE *str;
+    Py_UNICODE *result;
+    int len;
+    PyObject *ret;
+
+    if (!PyArg_ParseTuple(args, "u#", &str, &len)) {
+        PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR);   // every parse failure is reported with this one message
+        return NULL;
+    }
+
+    result = nysiis(str, len);
+    if (!result) {   // a NULL result is reported as MemoryError
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    ret = Py_BuildValue("u", result);   // "u" takes no length, so result is read as a NUL-terminated buffer
+    free(result);   // the buffer returned by nysiis() is released here with free()
+
+    return ret;
+}
+
+static PyObject* jellyfish_porter_stem(PyObject *self, PyObject *args)
+{   // Python entry point: porter_stem(string) -> unicode string, stemmed on a private copy of the input
+    const Py_UNICODE *str;
+    int len;
+    Py_UNICODE *result;
+    PyObject *ret;
+    struct stemmer *z;
+    int end;
+
+    if (!PyArg_ParseTuple(args, "u#", &str, &len)) {
+        PyErr_SetString(PyExc_TypeError, NO_BYTES_ERR_STR);   // every parse failure is reported with this one message
+        return NULL;
+    }
+
+    z = create_stemmer();
+    if (!z) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    result = malloc((len+1) * sizeof(Py_UNICODE));   // one extra slot for the terminator written below
+    if (!result) {
+        free_stemmer(z);
+        PyErr_NoMemory();
+        return NULL;
+    }
+    memcpy(result, str, len * sizeof(Py_UNICODE));   // stem() receives this writable copy, not the caller's buffer
+
+    end = stem(z, result, len - 1);   // third argument is the index of the last character (-1 for an empty string)
+    result[end + 1] = '\0';           // presumably end is the index of the stem's last character - confirm against stem()
+
+    ret = Py_BuildValue("u", result);
+
+    free(result);
+    free_stemmer(z);
+
+    return ret;
+}
+
+static PyMethodDef jellyfish_methods[] = {   // method table: Python name, C function, calling flags, docstring
+    {"jaro_winkler", (PyCFunction)jellyfish_jaro_winkler, METH_VARARGS|METH_KEYWORDS,   // the only entry that accepts keyword arguments
+     "jaro_winkler(string1, string2, long_tolerance)\n\n"
+     "Do a Jaro-Winkler string comparison between string1 and string2."},
+
+    {"jaro_distance", jellyfish_jaro_distance, METH_VARARGS,
+     "jaro_distance(string1, string2)\n\n"
+     "Get a Jaro string distance metric for string1 and string2."},
+
+    {"hamming_distance", jellyfish_hamming_distance, METH_VARARGS,
+     "hamming_distance(string1, string2)\n\n"
+     "Compute the Hamming distance between string1 and string2."},
+
+    {"levenshtein_distance", jellyfish_levenshtein_distance, METH_VARARGS,
+     "levenshtein_distance(string1, string2)\n\n"
+     "Compute the Levenshtein distance between string1 and string2."},
+
+    {"damerau_levenshtein_distance", jellyfish_damerau_levenshtein_distance,
+     METH_VARARGS,
+     "damerau_levenshtein_distance(string1, string2)\n\n"
+     "Compute the Damerau-Levenshtein distance between string1 and string2."},
+
+    {"soundex", jellyfish_soundex, METH_VARARGS,
+     "soundex(string)\n\n"
+     "Calculate the soundex code for a given name."},
+
+    {"metaphone", jellyfish_metaphone, METH_VARARGS,
+     "metaphone(string)\n\n"
+     "Calculate the metaphone representation of a given string."},
+
+    {"match_rating_codex", jellyfish_match_rating_codex, METH_VARARGS,
+     "match_rating_codex(string)\n\n"
+     "Calculate the Match Rating Approach representation of a given string."},
+
+    {"match_rating_comparison", jellyfish_match_rating_comparison, METH_VARARGS,
+     "match_rating_comparison(string, string)\n\n"
+     "Compute the Match Rating Approach similarity between string1 and"   // NOTE(review): no trailing space, so the docstring reads "andstring2."
+     "string2."},
+
+    {"nysiis", jellyfish_nysiis, METH_VARARGS,
+     "nysiis(string)\n\n"
+     "Compute the NYSIIS (New York State Identification and Intelligence\n"
+     "System) code for a string."},
+
+    {"porter_stem", jellyfish_porter_stem, METH_VARARGS,
+     "porter_stem(string)\n\n"
+     "Return the result of running the Porter stemming algorithm on "
+     "a single-word string."},
+
+    {NULL, NULL, 0, NULL}   // sentinel entry terminating the table
+};
+
+#if PY_MAJOR_VERSION >= 3
+#define INITERROR return NULL
+
+static struct PyModuleDef moduledef = {   // module definition; only compiled when PY_MAJOR_VERSION >= 3
+    PyModuleDef_HEAD_INIT,
+    "jellyfish.cjellyfish",           // module name
+    NULL,                             // no module docstring
+    sizeof(struct jellyfish_state),   // size of the per-module state read through GETSTATE()
+    jellyfish_methods,                // method table
+    NULL,
+    NULL,
+    NULL,
+    NULL
+};
+
+PyObject* PyInit_cjellyfish(void)
+#else
+
+#define INITERROR return
+
+PyMODINIT_FUNC initcjellyfish(void)
+#endif
+{
+    PyObject *unicodedata;
+
+#if PY_MAJOR_VERSION >= 3
... 26814 lines suppressed ...

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/python-modules/packages/python-jellyfish.git



More information about the Python-modules-commits mailing list