#!/usr/bin/python # -*- coding: utf-8 -*- # This source code is distributed under GNU GPL v2 license # written by Victor Stinner # http://www.haypocalc.com/ # creatied: 2006-08-14 -- last change: 2007-08-17 # Convert any unicode string to ASCII string: # - Remove diacriticals # - Replace special letter with similar ASCII character (similar glyph) # # Support greek, cyrillic, some latin letters and some signs. from unicodedata import normalize UNICODE_TO_ASCII = { # Latin letters u"Æ": u"AE", # U+00C6 (latin capital ligature ae) u"Ø": u"O", # U+00D8 (latin capital letter o with stroke) u"ß": u"ss", # U+00DF (latin small letter sharp s) u"æ": u"ae", # U+00E6 (latin small ligature ae) u"ø": u"o", # U+00F8 (latin small letter o with stroke) u"ł": u"l", # U+0142 (latin small letter l with stroke) u"Œ": u"OE", # U+0152 (latin capital ligature oe) u"œ": u"oe", # U+0153 (latin small ligature oe) # Various signs u"¡": u"!", # U+00A1 (inverted exclamation mark) u"©": u"(c)", # U+00A9 (copyright sign) u"«": u'"', # U+00AB (left-pointing double angle quotation mark) u"®": u"(r)", # U+00AE (registred sign) u"²": u"2", # U+00B2 (superscript two) u"»": u'"', # U+00BB (right-pointing double angle quotation mark) u"⁄": u"/", # U+2044 (fraction slash) # Greek u"Α": u"A", # U+0391 (capital alpha) u"Β": u"B", # U+0392 (capital beta) u"Ε": u"E", # U+0395 (capital epsilon) u"Ζ": u"Z", # U+0396 (capital zeta) u"Η": u"H", # U+0397 (capital eta) u"Θ": u"O", # U+0398 (captial theta) u"Ι": u"I", # U+0399 (capital iota) u"Κ": u"K", # U+039A (capital kappa) u"Μ": u"M", # U+039C (capital mu) u"Ν": u"N", # U+039D (capital nu) u"Ο": u"O", # U+039F (capital omicron) u"Ρ": u"P", # U+03A1 (capital rho) u"Τ": u"T", # U+03A4 (capital tau) u"Υ": u"Y", # U+03A5 (capital upsilon) u"Χ": u"X", # U+03A7 (capital chi) u"α": u"a", # U+03B1 (small alpha) u"β": u"b", # U+03B2 (small beta) u"γ": u"y", # U+03B2 (small gamma) u"ε": u"e", # U+03B5 (small espilon) u"η": u"n", # U+03B7 (small eta) u"ο": u"o", # U+03BF (small omicron) u"ρ": u"p", # U+03C1 (small rho) u"υ": u"v", # U+03C1 (small upsilon) # Cyrillic u"І": u"I", # U+0406 (capital byelorussian-ukrainian i) u"Ј": u"J", # U+0408 (capital je) u"В": u"B", # U+0412 (capital ve) u"Е": u"E", # U+0415 (capital ie) u"И": u"N", # U+0418 (capital i) u"З": u"3", # U+0417 (capital ze) u"К": u"K", # U+041A (capital ka) u"М": u"M", # U+041C (capital em) u"Н": u"H", # U+041D (capital en) u"О": u"O", # U+041E (capital o) u"Р": u"P", # U+0420 (capital er) u"С": u"C", # U+0421 (capital es) u"Т": u"T", # U+0422 (capital te) u"У": u"Y", # U+0423 (capital u) u"Х": u"X", # U+0425 (capital ha) u"Я": u"R", # U+042F (capital ya) u"а": u"a", # U+0430 (small a) u"в": u"b", # U+0432 (small ve) u"е": u"e", # U+0435 (small ie) u"з": u"3", # U+0437 (small ze) u"к": u"k", # U+043A (small ka) u"м": u"m", # U+043C (small em) u"н": u"h", # U+043D (small en) u"о": u"o", # U+043E (small o) u"р": u"p", # U+0440 (small er) u"с": u"c", # U+0441 (small es) u"т": u"T", # U+0442 (small te) u"у": u"y", # U+0443 (small u) u"х": u"x", # U+0445 (small ha) u"я": u"R", # U+044F (small ya) u"і": u"i", # U+0456 (small byelorussian-ukrainian i) u"ј": u"j", # U+0458 (small je) } def unicode2ascii(text, replace=False): """ Convert an unicode string (type 'unicode') to ascii string (type 'str'). Try to keep same visual result. You can specify an ASCII character to replace non-ASCII character in 'replace' argument (eg. replace='?'). >>> unicode2ascii(unicode("¡ Hé hø « español » ! Pythøn", "UTF-8")) '! He ho " espanol " ! Python' >>> unicode2ascii(unicode("L'œuf de læticia", "UTF-8")) "L'oeuf de laeticia" >>> unicode2ascii(unicode("ῙΈΌΑΒΓΔΕΖΗΘΙΚΛΝΜΞΟΥάήαγδεζημ", "UTF-8"), u'?') 'IEOAB??EZHOIK?NM?OYanay?e?n?' >>> unicode2ascii(unicode("ЀЁЄЅІЇЈЌЍАВЕЗИКМНОРСТУХавезмнопрстухѐёіїјк", "UTF-8"), u'?') 'EE??IIJKN?BE3NKMHOPCTYXabe3mho?pcTyxeeiijk' """ assert isinstance(text, unicode) if replace: if isinstance(replace, str): replace = unicode(replace, "latin-1") if not isinstance(replace, unicode) \ or len(replace) != 1 \ or not (32 <= ord(replace) <= 127): raise ValueError( "invalid replace character (%r): " "need one ascii printable character" % replace) ascii = [] for char in text: # Remove diacriticals char = normalize("NFKD", char)[0] # Known values if char in UNICODE_TO_ASCII: ascii.append(UNICODE_TO_ASCII[char]) continue if ord(char) <= 127: # Add valid ASCII ascii.append(char) elif replace: # non-ASCII character ascii.append(replace) # else: ignore it text = ''.join(ascii) return text.encode("ascii", "strict") if __name__ == "__main__": from doctest import testmod from sys import exit failure, total = testmod() if failure: print "%s failure on %s tests" % (failure, total) exit(1) else: print "All tests are OK (count=%s)" % total