From 60a9ec9315c6928db663d09aea6d67af3120ec04 Mon Sep 17 00:00:00 2001 From: Jeremie Dimino Date: Sat, 8 Dec 2007 23:32:07 +0100 Subject: [PATCH] Module pour la desaccentuation des chaines. darcs-hash:20071208223207-af139-b9b22a6c22216acf66f1214a5a7f94406e28fcb5.gz --- gestion/unicode2ascii.py | 155 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100755 gestion/unicode2ascii.py diff --git a/gestion/unicode2ascii.py b/gestion/unicode2ascii.py new file mode 100755 index 00000000..61eb60fe --- /dev/null +++ b/gestion/unicode2ascii.py @@ -0,0 +1,155 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# This source code is distributed under GNU GPL v2 license +# written by Victor Stinner +# http://www.haypocalc.com/ +# created: 2006-08-14 -- last change: 2007-08-17 + +# Convert any unicode string to an ASCII string: +# - Remove diacriticals +# - Replace special letters with similar ASCII characters (similar glyph) +# +# Supports Greek, Cyrillic, some Latin letters and some signs. 
from unicodedata import normalize

try:
    _TEXT_TYPE = unicode  # Python 2: text is 'unicode', 'str' holds bytes
except NameError:
    _TEXT_TYPE = str  # Python 3: 'str' is already the text type

# Maps one non-ASCII character to the ASCII text whose glyph looks most
# like it.  Keys are written as escapes because many of them are visually
# indistinguishable from the ASCII letters they map to; the glyph and its
# name are given in the trailing comment.
UNICODE_TO_ASCII = {
    # Latin letters
    u"\u00c6": u"AE",   # Æ latin capital ligature ae
    u"\u00d8": u"O",    # Ø latin capital letter o with stroke
    u"\u00df": u"ss",   # ß latin small letter sharp s
    u"\u00e6": u"ae",   # æ latin small ligature ae
    u"\u00f8": u"o",    # ø latin small letter o with stroke
    u"\u0142": u"l",    # ł latin small letter l with stroke
    u"\u0152": u"OE",   # Œ latin capital ligature oe
    u"\u0153": u"oe",   # œ latin small ligature oe

    # Various signs
    u"\u00a1": u"!",    # ¡ inverted exclamation mark
    u"\u00a9": u"(c)",  # © copyright sign
    u"\u00ab": u'"',    # « left-pointing double angle quotation mark
    u"\u00ae": u"(r)",  # ® registered sign
    u"\u00b2": u"2",    # ² superscript two
    u"\u00bb": u'"',    # » right-pointing double angle quotation mark
    u"\u2044": u"/",    # ⁄ fraction slash

    # Greek
    u"\u0391": u"A",    # Α capital alpha
    u"\u0392": u"B",    # Β capital beta
    u"\u0395": u"E",    # Ε capital epsilon
    u"\u0396": u"Z",    # Ζ capital zeta
    u"\u0397": u"H",    # Η capital eta
    u"\u0398": u"O",    # Θ capital theta
    u"\u0399": u"I",    # Ι capital iota
    u"\u039a": u"K",    # Κ capital kappa
    u"\u039c": u"M",    # Μ capital mu
    u"\u039d": u"N",    # Ν capital nu
    u"\u039f": u"O",    # Ο capital omicron
    u"\u03a1": u"P",    # Ρ capital rho
    u"\u03a4": u"T",    # Τ capital tau
    u"\u03a5": u"Y",    # Υ capital upsilon
    u"\u03a7": u"X",    # Χ capital chi
    u"\u03b1": u"a",    # α small alpha
    u"\u03b2": u"b",    # β small beta
    u"\u03b3": u"y",    # γ small gamma
    u"\u03b5": u"e",    # ε small epsilon
    u"\u03b7": u"n",    # η small eta
    u"\u03bf": u"o",    # ο small omicron
    u"\u03c1": u"p",    # ρ small rho
    u"\u03c5": u"v",    # υ small upsilon

    # Cyrillic
    # NOTE(review): U+0410 (capital a) has no entry although U+0430
    # (small a) does; the doctest in unicode2ascii() pins the resulting
    # '?' for it, so the mapping is left as it is.
    u"\u0406": u"I",    # І capital byelorussian-ukrainian i
    u"\u0408": u"J",    # Ј capital je
    u"\u0412": u"B",    # В capital ve
    u"\u0415": u"E",    # Е capital ie
    u"\u0417": u"3",    # З capital ze
    u"\u0418": u"N",    # И capital i
    u"\u041a": u"K",    # К capital ka
    u"\u041c": u"M",    # М capital em
    u"\u041d": u"H",    # Н capital en
    u"\u041e": u"O",    # О capital o
    u"\u0420": u"P",    # Р capital er
    u"\u0421": u"C",    # С capital es
    u"\u0422": u"T",    # Т capital te
    u"\u0423": u"Y",    # У capital u
    u"\u0425": u"X",    # Х capital ha
    u"\u042f": u"R",    # Я capital ya
    u"\u0430": u"a",    # а small a
    u"\u0432": u"b",    # в small ve
    u"\u0435": u"e",    # е small ie
    u"\u0437": u"3",    # з small ze
    u"\u043a": u"k",    # к small ka
    u"\u043c": u"m",    # м small em
    u"\u043d": u"h",    # н small en
    u"\u043e": u"o",    # о small o
    u"\u0440": u"p",    # р small er
    u"\u0441": u"c",    # с small es
    u"\u0442": u"T",    # т small te
    u"\u0443": u"y",    # у small u
    u"\u0445": u"x",    # х small ha
    u"\u044f": u"R",    # я small ya
    u"\u0456": u"i",    # і small byelorussian-ukrainian i
    u"\u0458": u"j",    # ј small je
}


def _char_to_ascii(char):
    """Return the ASCII rendering of one character, or u"" if it has none."""
    # A direct table hit wins over any decomposition.
    if char in UNICODE_TO_ASCII:
        return UNICODE_TO_ASCII[char]

    # NFKD splits accented letters into base letter + combining marks and
    # expands compatibility characters (ligatures, fractions, ...).  Every
    # resulting code point is examined, not just the first one, so that
    # multi-letter expansions are kept whole.
    pieces = []
    for part in normalize("NFKD", char):
        if part in UNICODE_TO_ASCII:
            pieces.append(UNICODE_TO_ASCII[part])
        elif ord(part) <= 127:
            pieces.append(part)
        # else: combining mark or unmapped code point, nothing to emit
    return u"".join(pieces)


def unicode2ascii(text, replace=False):
    u"""
    Convert a unicode string to an ASCII-only string of the native 'str'
    type, trying to keep the same visual result.

    Diacritics are removed, compatibility characters are expanded and the
    characters listed in UNICODE_TO_ASCII are replaced by their look-alike.

    text: the string to convert (type 'unicode' on Python 2, 'str' on
    Python 3).

    replace: optional single printable ASCII character (code 32 to 126)
    emitted once for every input character that has no ASCII rendering
    (eg. replace='?').  A byte string is decoded as latin-1.  When it is
    false (the default) such characters are dropped.

    Raise TypeError if 'text' is not a unicode string and ValueError if
    'replace' is not one printable ASCII character.

    >>> unicode2ascii(u"¡ Hé hø « español » ! Pythøn")
    '! He ho " espanol " ! Python'
    >>> unicode2ascii(u"L'œuf de læticia")
    "L'oeuf de laeticia"
    >>> unicode2ascii(u"ῙΈΌΑΒΓΔΕΖΗΘΙΚΛΝΜΞΟΥάήαγδεζημ", u'?')
    'IEOAB??EZHOIK?NM?OYanay?e?n?'
    >>> unicode2ascii(u"ЀЁЄЅІЇЈЌЍАВЕЗИКМНОРСТУХавезмнопрстухѐёіїјк", u'?')
    'EE??IIJKN?BE3NKMHOPCTYXabe3mho?pcTyxeeiijk'
    >>> unicode2ascii(u"½ ﬁn")
    '1/2 fin'
    """
    # Explicit check instead of 'assert', which disappears under -O.
    if not isinstance(text, _TEXT_TYPE):
        raise TypeError(
            "text must be a unicode string, not %s" % type(text).__name__)

    if replace:
        if isinstance(replace, bytes):
            replace = replace.decode("latin-1")
        # 127 (DEL) is a control character, hence the upper bound of 126.
        if not isinstance(replace, _TEXT_TYPE) \
        or len(replace) != 1 \
        or not (32 <= ord(replace) <= 126):
            raise ValueError(
                "invalid replace character (%r): "
                "need one ascii printable character" % (replace,))

    converted = []
    for char in text:
        ascii_text = _char_to_ascii(char)
        if ascii_text:
            converted.append(ascii_text)
        elif replace:
            # No ASCII rendering at all: one replacement per input character
            converted.append(replace)
        # else: ignore it

    result = u"".join(converted)
    if str is bytes:
        # Python 2: the native 'str' is a byte string
        return result.encode("ascii", "strict")
    return result


if __name__ == "__main__":
    import sys
    from doctest import testmod

    failure, total = testmod()
    if failure:
        print("%s failure on %s tests" % (failure, total))
        sys.exit(1)
    else:
        print("All tests are OK (count=%s)" % total)