scripts/gestion/unicode2ascii.py
Antoine Durand-Gasselin 4a68475e34 [wiki-lenny] suppression de static/
darcs-hash:20090314092631-bd074-b01256aeaf71e935851b3ecdbd623eaae8c9e8a1.gz
2009-03-14 10:26:31 +01:00

155 lines
5.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/python
# -*- coding: utf-8 -*-
# This source code is distributed under GNU GPL v2 license
# written by Victor Stinner <victor.stinner AT haypocalc.com>
# http://www.haypocalc.com/
# created: 2006-08-14 -- last change: 2007-08-17
# Convert any unicode string to ASCII string:
# - Remove diacriticals
# - Replace special letter with similar ASCII character (similar glyph)
#
# Support greek, cyrillic, some latin letters and some signs.
from unicodedata import normalize
# Lookup table: one non-ASCII character -> ASCII replacement string.
# Keys are single code points that have no useful NFKD decomposition (or
# whose decomposition is still non-ASCII); values may be longer than one
# character (e.g. ligatures) but are always pure ASCII.
# Greek and Cyrillic entries are chosen by glyph similarity, not by sound.
UNICODE_TO_ASCII = {
    # Latin letters
    u"Æ": u"AE",  # U+00C6 (latin capital ligature ae)
    u"Ø": u"O",   # U+00D8 (latin capital letter o with stroke)
    u"ß": u"ss",  # U+00DF (latin small letter sharp s)
    u"æ": u"ae",  # U+00E6 (latin small ligature ae)
    u"ø": u"o",   # U+00F8 (latin small letter o with stroke)
    u"ł": u"l",   # U+0142 (latin small letter l with stroke)
    u"Œ": u"OE",  # U+0152 (latin capital ligature oe)
    u"œ": u"oe",  # U+0153 (latin small ligature oe)
    # Various signs
    u"¡": u"!",    # U+00A1 (inverted exclamation mark)
    u"©": u"(c)",  # U+00A9 (copyright sign)
    u"«": u'"',    # U+00AB (left-pointing double angle quotation mark)
    u"®": u"(r)",  # U+00AE (registered sign)
    u"²": u"2",    # U+00B2 (superscript two)
    u"»": u'"',    # U+00BB (right-pointing double angle quotation mark)
    # Written as an escape on purpose: the literal character is easily lost
    # or rendered invisibly, which would leave an empty (never matching) key.
    u"\u2044": u"/",  # U+2044 (fraction slash)
    # Greek
    u"Α": u"A",  # U+0391 (capital alpha)
    u"Β": u"B",  # U+0392 (capital beta)
    u"Ε": u"E",  # U+0395 (capital epsilon)
    u"Ζ": u"Z",  # U+0396 (capital zeta)
    u"Η": u"H",  # U+0397 (capital eta)
    u"Θ": u"O",  # U+0398 (capital theta)
    u"Ι": u"I",  # U+0399 (capital iota)
    u"Κ": u"K",  # U+039A (capital kappa)
    u"Μ": u"M",  # U+039C (capital mu)
    u"Ν": u"N",  # U+039D (capital nu)
    u"Ο": u"O",  # U+039F (capital omicron)
    u"Ρ": u"P",  # U+03A1 (capital rho)
    u"Τ": u"T",  # U+03A4 (capital tau)
    u"Υ": u"Y",  # U+03A5 (capital upsilon)
    u"Χ": u"X",  # U+03A7 (capital chi)
    u"α": u"a",  # U+03B1 (small alpha)
    u"β": u"b",  # U+03B2 (small beta)
    u"γ": u"y",  # U+03B3 (small gamma)
    u"ε": u"e",  # U+03B5 (small epsilon)
    u"η": u"n",  # U+03B7 (small eta)
    u"ο": u"o",  # U+03BF (small omicron)
    u"ρ": u"p",  # U+03C1 (small rho)
    u"υ": u"v",  # U+03C5 (small upsilon)
    # Cyrillic
    u"І": u"I",  # U+0406 (capital byelorussian-ukrainian i)
    u"Ј": u"J",  # U+0408 (capital je)
    u"В": u"B",  # U+0412 (capital ve)
    u"Е": u"E",  # U+0415 (capital ie)
    u"З": u"3",  # U+0417 (capital ze)
    u"И": u"N",  # U+0418 (capital i)
    u"К": u"K",  # U+041A (capital ka)
    u"М": u"M",  # U+041C (capital em)
    u"Н": u"H",  # U+041D (capital en)
    u"О": u"O",  # U+041E (capital o)
    u"Р": u"P",  # U+0420 (capital er)
    u"С": u"C",  # U+0421 (capital es)
    u"Т": u"T",  # U+0422 (capital te)
    u"У": u"Y",  # U+0423 (capital u)
    u"Х": u"X",  # U+0425 (capital ha)
    u"Я": u"R",  # U+042F (capital ya)
    u"а": u"a",  # U+0430 (small a)
    u"в": u"b",  # U+0432 (small ve)
    u"е": u"e",  # U+0435 (small ie)
    u"з": u"3",  # U+0437 (small ze)
    u"к": u"k",  # U+043A (small ka)
    u"м": u"m",  # U+043C (small em)
    u"н": u"h",  # U+043D (small en)
    u"о": u"o",  # U+043E (small o)
    u"р": u"p",  # U+0440 (small er)
    u"с": u"c",  # U+0441 (small es)
    u"т": u"T",  # U+0442 (small te)
    u"у": u"y",  # U+0443 (small u)
    u"х": u"x",  # U+0445 (small ha)
    u"я": u"R",  # U+044F (small ya)
    u"і": u"i",  # U+0456 (small byelorussian-ukrainian i)
    u"ј": u"j",  # U+0458 (small je)
}
def unicode2ascii(text, replace=False):
    """
    Convert a unicode string (type 'unicode') to an ASCII string (type 'str').
    Try to keep the same visual result.

    Each character is NFKD-normalized and reduced to the first code point
    of its decomposition (which drops diacritical marks), then looked up in
    UNICODE_TO_ASCII. Characters that are still non-ASCII afterwards are
    dropped, unless 'replace' is given.

    You can specify an ASCII character to replace non-ASCII characters
    in the 'replace' argument (eg. replace='?'). It may be a 'str' or a
    'unicode' object and must be exactly one printable ASCII character
    (code 32 to 126); otherwise ValueError is raised. Any false value
    (the default) disables replacement.

    'text' must be a 'unicode' object (checked with an assertion).

    >>> unicode2ascii(unicode("¡ Hé hø « español » ! Pythøn", "UTF-8"))
    '! He ho " espanol " ! Python'
    >>> unicode2ascii(unicode("L'œuf de læticia", "UTF-8"))
    "L'oeuf de laeticia"
    >>> unicode2ascii(unicode("ῙΈΌΑΒΓΔΕΖΗΘΙΚΛΝΜΞΟΥάήαγδεζημ", "UTF-8"), u'?')
    'IEOAB??EZHOIK?NM?OYanay?e?n?'
    >>> unicode2ascii(unicode("ЀЁЄЅІЇЈЌЍАВЕЗИКМНОРСТУХавезмнопрстухѐёіїјк", "UTF-8"), u'?')
    'EE??IIJKN?BE3NKMHOPCTYXabe3mho?pcTyxeeiijk'
    """
    assert isinstance(text, unicode)
    if replace:
        if isinstance(replace, str):
            replace = unicode(replace, "latin-1")
        # Printable ASCII is 0x20..0x7E; 0x7F (DEL) is a control character
        # and is rejected like any other non-printable value.
        if not isinstance(replace, unicode) \
                or len(replace) != 1 \
                or not (32 <= ord(replace) <= 126):
            raise ValueError(
                "invalid replace character (%r): "
                "need one ascii printable character" % replace)
    pieces = []
    for char in text:
        # Remove diacriticals: in NFKD form the base letter comes first and
        # the combining marks follow, so only the first code point is kept.
        # NOTE(review): this also truncates multi-letter compatibility
        # decompositions (a ligature keeps only its first letter).
        char = normalize("NFKD", char)[0]
        # Known values
        if char in UNICODE_TO_ASCII:
            pieces.append(UNICODE_TO_ASCII[char])
            continue
        if ord(char) <= 127:
            # Add valid ASCII
            pieces.append(char)
        elif replace:
            # non-ASCII character
            pieces.append(replace)
        # else: ignore it
    text = ''.join(pieces)
    # Every piece is ASCII at this point, so the strict encoder cannot fail;
    # it only converts the 'unicode' result to 'str'.
    return text.encode("ascii", "strict")
if __name__ == "__main__":
from doctest import testmod
from sys import exit
failure, total = testmod()
if failure:
print "%s failure on %s tests" % (failure, total)
exit(1)
else:
print "All tests are OK (count=%s)" % total