From 60a9ec9315c6928db663d09aea6d67af3120ec04 Mon Sep 17 00:00:00 2001
From: Jeremie Dimino <dimino@crans.org>
Date: Sat, 8 Dec 2007 23:32:07 +0100
Subject: [PATCH] Module pour la desaccentuation des chaines.

darcs-hash:20071208223207-af139-b9b22a6c22216acf66f1214a5a7f94406e28fcb5.gz
---
 gestion/unicode2ascii.py | 155 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)
 create mode 100755 gestion/unicode2ascii.py

diff --git a/gestion/unicode2ascii.py b/gestion/unicode2ascii.py
new file mode 100755
index 00000000..61eb60fe
--- /dev/null
+++ b/gestion/unicode2ascii.py
@@ -0,0 +1,155 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# This source code is distributed under GNU GPL v2 license
+# written by Victor Stinner <victor.stinner AT haypocalc.com>
+#                            http://www.haypocalc.com/
+# creatied: 2006-08-14 -- last change: 2007-08-17
+
+# Convert any unicode string to ASCII string:
+#  - Remove diacriticals
+#  - Replace special letter with similar ASCII character (similar glyph)
+#
+# Support greek, cyrillic, some latin letters and some signs.
+
+from unicodedata import normalize
+
+UNICODE_TO_ASCII = {
+    # Latin letters
+    u"Æ": u"AE",   # U+00C6 (latin capital ligature ae)
+    u"Ø": u"O",    # U+00D8 (latin capital letter o with stroke)
+    u"ß": u"ss",   # U+00DF (latin small letter sharp s)
+    u"æ": u"ae",   # U+00E6 (latin small ligature ae)
+    u"ø": u"o",    # U+00F8 (latin small letter o with stroke)
+    u"ł": u"l",    # U+0142 (latin small letter l with stroke)
+    u"Œ": u"OE",   # U+0152 (latin capital ligature oe)
+    u"œ": u"oe",   # U+0153 (latin small ligature oe)
+
+    # Various signs
+    u"¡": u"!",    # U+00A1 (inverted exclamation mark)
+    u"©": u"(c)",  # U+00A9 (copyright sign)
+    u"«": u'"',    # U+00AB (left-pointing double angle quotation mark)
+    u"®": u"(r)",  # U+00AE (registred sign)
+    u"²": u"2",    # U+00B2 (superscript two)
+    u"»": u'"',    # U+00BB (right-pointing double angle quotation mark)
+    u"⁄": u"/",    # U+2044 (fraction slash)
+
+    # Greek
+    u"Α": u"A",    # U+0391 (capital alpha)
+    u"Β": u"B",    # U+0392 (capital beta)
+    u"Ε": u"E",    # U+0395 (capital epsilon)
+    u"Ζ": u"Z",    # U+0396 (capital zeta)
+    u"Η": u"H",    # U+0397 (capital eta)
+    u"Θ": u"O",    # U+0398 (captial theta)
+    u"Ι": u"I",    # U+0399 (capital iota)
+    u"Κ": u"K",    # U+039A (capital kappa)
+    u"Μ": u"M",    # U+039C (capital mu)
+    u"Ν": u"N",    # U+039D (capital nu)
+    u"Ο": u"O",    # U+039F (capital omicron)
+    u"Ρ": u"P",    # U+03A1 (capital rho)
+    u"Τ": u"T",    # U+03A4 (capital tau)
+    u"Υ": u"Y",    # U+03A5 (capital upsilon)
+    u"Χ": u"X",    # U+03A7 (capital chi)
+    u"α": u"a",    # U+03B1 (small alpha)
+    u"β": u"b",    # U+03B2 (small beta)
+    u"γ": u"y",    # U+03B2 (small gamma)
+    u"ε": u"e",    # U+03B5 (small espilon)
+    u"η": u"n",    # U+03B7 (small eta)
+    u"ο": u"o",    # U+03BF (small omicron)
+    u"ρ": u"p",    # U+03C1 (small rho)
+    u"υ": u"v",    # U+03C1 (small upsilon)
+
+    # Cyrillic
+    u"І": u"I",    # U+0406 (capital byelorussian-ukrainian i)
+    u"Ј": u"J",    # U+0408 (capital je)
+    u"В": u"B",    # U+0412 (capital ve)
+    u"Е": u"E",    # U+0415 (capital ie)
+    u"И": u"N",    # U+0418 (capital i)
+    u"З": u"3",    # U+0417 (capital ze)
+    u"К": u"K",    # U+041A (capital ka)
+    u"М": u"M",    # U+041C (capital em)
+    u"Н": u"H",    # U+041D (capital en)
+    u"О": u"O",    # U+041E (capital o)
+    u"Р": u"P",    # U+0420 (capital er)
+    u"С": u"C",    # U+0421 (capital es)
+    u"Т": u"T",    # U+0422 (capital te)
+    u"У": u"Y",    # U+0423 (capital u)
+    u"Х": u"X",    # U+0425 (capital ha)
+    u"Я": u"R",    # U+042F (capital ya)
+    u"а": u"a",    # U+0430 (small a)
+    u"в": u"b",    # U+0432 (small ve)
+    u"е": u"e",    # U+0435 (small ie)
+    u"з": u"3",    # U+0437 (small ze)
+    u"к": u"k",    # U+043A (small ka)
+    u"м": u"m",    # U+043C (small em)
+    u"н": u"h",    # U+043D (small en)
+    u"о": u"o",    # U+043E (small o)
+    u"р": u"p",    # U+0440 (small er)
+    u"с": u"c",    # U+0441 (small es)
+    u"т": u"T",    # U+0442 (small te)
+    u"у": u"y",    # U+0443 (small u)
+    u"х": u"x",    # U+0445 (small ha)
+    u"я": u"R",    # U+044F (small ya)
+    u"і": u"i",    # U+0456 (small byelorussian-ukrainian i)
+    u"ј": u"j",    # U+0458 (small je)
+}
+
+def unicode2ascii(text, replace=False):
+    """
+    Convert an unicode string (type 'unicode') to ascii string (type 'str').
+    Try to keep same visual result.
+
+    You can specify an ASCII character to replace non-ASCII character
+    in 'replace' argument (eg. replace='?').
+
+    >>> unicode2ascii(unicode("¡ Hé hø « español » ! Pythøn", "UTF-8"))
+    '! He ho " espanol " ! Python'
+    >>> unicode2ascii(unicode("L'œuf de læticia", "UTF-8"))
+    "L'oeuf de laeticia"
+    >>> unicode2ascii(unicode("ῙΈΌΑΒΓΔΕΖΗΘΙΚΛΝΜΞΟΥάήαγδεζημ", "UTF-8"), u'?')
+    'IEOAB??EZHOIK?NM?OYanay?e?n?'
+    >>> unicode2ascii(unicode("ЀЁЄЅІЇЈЌЍАВЕЗИКМНОРСТУХавезмнопрстухѐёіїјк", "UTF-8"), u'?')
+    'EE??IIJKN?BE3NKMHOPCTYXabe3mho?pcTyxeeiijk'
+    """
+    assert isinstance(text, unicode)
+    if replace:
+        if isinstance(replace, str):
+            replace = unicode(replace, "latin-1")
+        if not isinstance(replace, unicode) \
+        or len(replace) != 1 \
+        or not (32 <= ord(replace) <= 127):
+            raise ValueError(
+                "invalid replace character (%r): "
+                "need one ascii printable character" % replace)
+
+    ascii = []
+    for char in text:
+        # Remove diacriticals
+        char = normalize("NFKD", char)[0]
+
+        # Known values
+        if char in UNICODE_TO_ASCII:
+            ascii.append(UNICODE_TO_ASCII[char])
+            continue
+
+        if ord(char) <= 127:
+            # Add valid ASCII
+            ascii.append(char)
+        elif replace:
+            # non-ASCII character
+            ascii.append(replace)
+        # else: ignore it
+
+    text = ''.join(ascii)
+    return text.encode("ascii", "strict")
+
+if __name__ == "__main__":
+    from doctest import testmod
+    from sys import exit
+    failure, total = testmod()
+    if failure:
+        print "%s failure on %s tests" % (failure, total)
+        exit(1)
+    else:
+        print "All tests are OK (count=%s)" % total
+