Module pour la desaccentuation des chaines.

darcs-hash:20071208223207-af139-b9b22a6c22216acf66f1214a5a7f94406e28fcb5.gz
This commit is contained in:
Jeremie Dimino 2007-12-08 23:32:07 +01:00
parent 1edec6e4c6
commit 60a9ec9315

155
gestion/unicode2ascii.py Executable file
View file

@ -0,0 +1,155 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# This source code is distributed under GNU GPL v2 license
# written by Victor Stinner <victor.stinner AT haypocalc.com>
# http://www.haypocalc.com/
# created: 2006-08-14 -- last change: 2007-08-17
# Convert any unicode string to an ASCII string:
# - Remove diacritics
# - Replace special letters with similar ASCII characters (similar glyph)
#
# Supports Greek, Cyrillic, some Latin letters and some signs.
from unicodedata import combining, normalize
# Lookup table: one non-ASCII character -> its ASCII stand-in.
#
# Every key is a single Unicode character and every value is a (possibly
# multi-character) ASCII string.  Greek and Cyrillic letters are mapped by
# visual resemblance of the glyph, not by sound (e.g. Cyrillic "Н" -> "H").
# Accented letters are not listed here: unicode2ascii() strips diacritics
# with NFKD normalization before consulting this table.
UNICODE_TO_ASCII = {
    # Latin letters
    u"Æ": u"AE",  # U+00C6 (latin capital ligature ae)
    u"Ø": u"O",  # U+00D8 (latin capital letter o with stroke)
    u"ß": u"ss",  # U+00DF (latin small letter sharp s)
    u"æ": u"ae",  # U+00E6 (latin small ligature ae)
    u"ø": u"o",  # U+00F8 (latin small letter o with stroke)
    u"ł": u"l",  # U+0142 (latin small letter l with stroke)
    u"Œ": u"OE",  # U+0152 (latin capital ligature oe)
    u"œ": u"oe",  # U+0153 (latin small ligature oe)
    # Various signs
    u"¡": u"!",  # U+00A1 (inverted exclamation mark)
    u"©": u"(c)",  # U+00A9 (copyright sign)
    u"«": u'"',  # U+00AB (left-pointing double angle quotation mark)
    u"®": u"(r)",  # U+00AE (registered sign)
    u"²": u"2",  # U+00B2 (superscript two)
    u"»": u'"',  # U+00BB (right-pointing double angle quotation mark)
    # Written as an escape so the key cannot be lost or mangled by an editor:
    # an empty-string key would never match any character.
    u"\u2044": u"/",  # U+2044 (fraction slash)
    # Greek
    u"Α": u"A",  # U+0391 (capital alpha)
    u"Β": u"B",  # U+0392 (capital beta)
    u"Ε": u"E",  # U+0395 (capital epsilon)
    u"Ζ": u"Z",  # U+0396 (capital zeta)
    u"Η": u"H",  # U+0397 (capital eta)
    u"Θ": u"O",  # U+0398 (capital theta)
    u"Ι": u"I",  # U+0399 (capital iota)
    u"Κ": u"K",  # U+039A (capital kappa)
    u"Μ": u"M",  # U+039C (capital mu)
    u"Ν": u"N",  # U+039D (capital nu)
    u"Ο": u"O",  # U+039F (capital omicron)
    u"Ρ": u"P",  # U+03A1 (capital rho)
    u"Τ": u"T",  # U+03A4 (capital tau)
    u"Υ": u"Y",  # U+03A5 (capital upsilon)
    u"Χ": u"X",  # U+03A7 (capital chi)
    u"α": u"a",  # U+03B1 (small alpha)
    u"β": u"b",  # U+03B2 (small beta)
    u"γ": u"y",  # U+03B3 (small gamma)
    u"ε": u"e",  # U+03B5 (small epsilon)
    u"η": u"n",  # U+03B7 (small eta)
    u"ο": u"o",  # U+03BF (small omicron)
    u"ρ": u"p",  # U+03C1 (small rho)
    u"υ": u"v",  # U+03C5 (small upsilon)
    # Cyrillic
    u"І": u"I",  # U+0406 (capital byelorussian-ukrainian i)
    u"Ј": u"J",  # U+0408 (capital je)
    u"В": u"B",  # U+0412 (capital ve)
    u"Е": u"E",  # U+0415 (capital ie)
    u"З": u"3",  # U+0417 (capital ze)
    u"И": u"N",  # U+0418 (capital i)
    u"К": u"K",  # U+041A (capital ka)
    u"М": u"M",  # U+041C (capital em)
    u"Н": u"H",  # U+041D (capital en)
    u"О": u"O",  # U+041E (capital o)
    u"Р": u"P",  # U+0420 (capital er)
    u"С": u"C",  # U+0421 (capital es)
    u"Т": u"T",  # U+0422 (capital te)
    u"У": u"Y",  # U+0423 (capital u)
    u"Х": u"X",  # U+0425 (capital ha)
    u"Я": u"R",  # U+042F (capital ya)
    u"а": u"a",  # U+0430 (small a)
    u"в": u"b",  # U+0432 (small ve)
    u"е": u"e",  # U+0435 (small ie)
    u"з": u"3",  # U+0437 (small ze)
    u"к": u"k",  # U+043A (small ka)
    u"м": u"m",  # U+043C (small em)
    u"н": u"h",  # U+043D (small en)
    u"о": u"o",  # U+043E (small o)
    u"р": u"p",  # U+0440 (small er)
    u"с": u"c",  # U+0441 (small es)
    u"т": u"T",  # U+0442 (small te)
    u"у": u"y",  # U+0443 (small u)
    u"х": u"x",  # U+0445 (small ha)
    u"я": u"R",  # U+044F (small ya)
    u"і": u"i",  # U+0456 (small byelorussian-ukrainian i)
    u"ј": u"j",  # U+0458 (small je)
}
def unicode2ascii(text, replace=False):
    """
    Convert a unicode string (type 'unicode') to an ASCII string (type 'str').

    Try to keep the same visual result: every character is decomposed with
    NFKD normalization, combining marks (accents, ...) are dropped, and each
    remaining character is either looked up in UNICODE_TO_ASCII or kept as is
    when it is already ASCII.  The whole decomposition is used, so a ligature
    such as U+FB01 yields both of its letters.

    Characters that cannot be converted are silently removed, unless
    'replace' is given: you can specify one printable ASCII character
    (eg. replace='?') which is then emitted at most once for each source
    character that could not be converted.  A byte string 'replace' is
    decoded as latin-1 before being validated.

    Raise TypeError if 'text' is not a unicode string, and ValueError if
    'replace' is not a single printable ASCII character (code 32 to 126).

    >>> unicode2ascii(unicode("¡ Hé hø « español » ! Pythøn", "UTF-8"))
    '! He ho " espanol " ! Python'
    >>> unicode2ascii(unicode("L'œuf de læticia", "UTF-8"))
    "L'oeuf de laeticia"
    >>> unicode2ascii(unicode("ﬁche № 5", "UTF-8"))
    'fiche No 5'
    >>> unicode2ascii(unicode("ῙΈΌΑΒΓΔΕΖΗΘΙΚΛΝΜΞΟΥάήαγδεζημ", "UTF-8"), u'?')
    'IEOAB??EZHOIK?NM?OYanay?e?n?'
    >>> unicode2ascii(unicode("ЀЁЄЅІЇЈЌЍАВЕЗИКМНОРСТУХавезмнопрстухѐёіїјк", "UTF-8"), u'?')
    'EE??IIJKN?BE3NKMHOPCTYXabe3mho?pcTyxeeiijk'
    """
    # Explicit check instead of an assert: asserts vanish under "python -O".
    if not isinstance(text, unicode):
        raise TypeError(
            "text must be a unicode string, not %s" % type(text).__name__)
    if replace:
        if isinstance(replace, str):
            replace = unicode(replace, "latin-1")
        # 126 ('~') is the last printable ASCII character; 127 is DEL.
        if not isinstance(replace, unicode) \
                or len(replace) != 1 \
                or not (32 <= ord(replace) <= 126):
            # Wrap in a tuple so that a tuple argument is formatted with %r
            # instead of being unpacked as the format arguments.
            raise ValueError(
                "invalid replace character (%r): "
                "need one ascii printable character" % (replace,))
    pieces = []
    for source_char in text:
        # Emit the replacement at most once per source character, even when
        # its decomposition contains several unconvertible characters.
        replaced = False
        for char in normalize("NFKD", source_char):
            if combining(char):
                # Remove diacritics (also when the input is already
                # decomposed, e.g. "e" followed by U+0301)
                continue
            if char in UNICODE_TO_ASCII:
                # Known values
                pieces.append(UNICODE_TO_ASCII[char])
            elif ord(char) <= 127:
                # Add valid ASCII
                pieces.append(char)
            elif replace and not replaced:
                # non-ASCII character
                pieces.append(replace)
                replaced = True
            # else: ignore it
    return u"".join(pieces).encode("ascii", "strict")
# Script entry point: run the doctests of this module and report the result.
if __name__ == "__main__":
    import doctest
    import sys

    failed_count, test_count = doctest.testmod()
    if not failed_count:
        print("All tests are OK (count=%s)" % test_count)
    else:
        print("%s failure on %s tests" % (failed_count, test_count))
        # Non-zero exit status so callers can detect the failure.
        sys.exit(1)