|
""" |
|
This gives other modules access to the gritty details about characters and the |
|
encodings that use them. |
|
""" |
|
|
|
from __future__ import annotations |
|
|
|
import html |
|
import itertools |
|
import re |
|
import unicodedata |
|
|
|
|
|
|
|
# Names of the single-byte ("charmap") encodings this module knows about.
# _build_regexes() walks this list, in this order, and compiles one detector
# regex per entry.
# NOTE(review): the "sloppy-*" names are not standard-library codecs --
# presumably they are registered elsewhere in the project before this module
# is imported; confirm.
CHARMAP_ENCODINGS = [
    "latin-1",
    "sloppy-windows-1252",
    "sloppy-windows-1251",
    "sloppy-windows-1250",
    "sloppy-windows-1253",
    "sloppy-windows-1254",
    "sloppy-windows-1257",
    "iso-8859-2",
    "macroman",
    "cp437",
]
|
|
|
# Matches one apostrophe-like character: U+02BC, or any code point in the
# range U+2018..U+201B.
SINGLE_QUOTE_RE = re.compile("[\u02bc\u2018-\u201b]")
# Matches one character in the range U+201C..U+201F (the double-quote
# counterparts of the range above).
DOUBLE_QUOTE_RE = re.compile("[\u201c-\u201f]")
|
|
|
|
|
def _build_regexes() -> dict[str, re.Pattern[str]]:
    """
    Compile one whole-string detector regex per known encoding.

    The returned dict maps an encoding name to a pattern that matches a
    string only if every character in it could have come from that encoding.
    The "ascii" entry simply accepts U+0000 through U+007F; each name in
    CHARMAP_ENCODINGS gets the ASCII characters plus whatever its high bytes
    decode to.
    """
    # Bytes 0x80-0xFF plus 0x1A. They are run through each codec instead of
    # being assumed to map to themselves, so whatever characters the codec
    # produces for them are exactly the ones the detector will allow.
    high_bytes = bytes([*range(0x80, 0x100), 0x1A])

    detectors = {"ascii": re.compile("^[\x00-\x7f]*$")}
    for codec_name in CHARMAP_ENCODINGS:
        decoded = high_bytes.decode(codec_name)
        # The remaining ASCII characters (U+0000-U+0019 and U+001B-U+007F)
        # are added as plain ranges; U+001A is left out of the ranges because
        # it is covered by the decoded byte 0x1A above.
        detectors[codec_name] = re.compile(f"^[\x00-\x19\x1b-\x7f{decoded}]*$")
    return detectors
|
|
|
|
|
# Built once at import time: maps "ascii" and every name in CHARMAP_ENCODINGS
# to its compiled detector regex. Looked up by possible_encoding().
ENCODING_REGEXES = _build_regexes()
|
|
|
|
|
def _build_html_entities() -> dict[str, str]: |
|
entities = {} |
|
|
|
|
|
|
|
for name, char in html.entities.html5.items(): |
|
if name.endswith(";"): |
|
entities["&" + name] = char |
|
|
|
|
|
|
|
|
|
if name == name.lower(): |
|
name_upper = name.upper() |
|
entity_upper = "&" + name_upper |
|
if html.unescape(entity_upper) == entity_upper: |
|
entities[entity_upper] = char.upper() |
|
return entities |
|
|
|
|
|
# Matches text shaped like an HTML entity: "&", an optional "#", 1 to 24
# ASCII letters or digits, then ";".
HTML_ENTITY_RE = re.compile(r"&#?[0-9A-Za-z]{1,24};")
# Built once at import time: entity text (with "&" and ";") -> replacement.
HTML_ENTITIES = _build_html_entities()
|
|
|
|
|
def possible_encoding(text: str, encoding: str) -> bool:
    """
    Report whether `text` could have been decoded from `encoding`.

    `encoding` must be "ascii" or one of the single-byte encodings that has
    an entry in ENCODING_REGEXES; any other name raises KeyError. The answer
    is True when every character of `text` is one that the encoding can
    produce (possibly sloppily), which is the same as saying the text could
    be encoded back into it.
    """
    detector = ENCODING_REGEXES[encoding]
    return detector.match(text) is not None
|
|
|
|
|
def _build_control_char_mapping() -> dict[int, None]: |
|
""" |
|
Build a translate mapping that strips likely-unintended control characters. |
|
See :func:`ftfy.fixes.remove_control_chars` for a description of these |
|
codepoint ranges and why they should be removed. |
|
""" |
|
control_chars: dict[int, None] = {} |
|
|
|
for i in itertools.chain( |
|
range(0x00, 0x09), |
|
[0x0B], |
|
range(0x0E, 0x20), |
|
[0x7F], |
|
range(0x206A, 0x2070), |
|
[0xFEFF], |
|
range(0xFFF9, 0xFFFD), |
|
): |
|
control_chars[i] = None |
|
|
|
return control_chars |
|
|
|
|
|
# Built once at import time: code point -> None, i.e. a str.translate() table
# that deletes the control characters chosen by _build_control_char_mapping().
CONTROL_CHARS = _build_control_char_mapping()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Bytes pattern: a UTF-8 lead byte with a literal space (0x20) sitting in one
# of the positions where a continuation byte (0x80-0xBF) would normally be.
# One alternative per lead-byte length and per position of the space. In the
# three-byte alternatives the other continuation byte may not be 0x85 or
# 0xA0 (those two values are cut out of the 0x80-0xBF range).
# NOTE(review): presumably the space stands in for a continuation byte that
# some earlier processing step altered -- confirm against the caller.
ALTERED_UTF8_RE = re.compile(
    b"[\xc2\xc3\xc5\xce\xd0\xd9][ ]"
    b"|[\xe2\xe3][ ][\x80-\x84\x86-\x9f\xa1-\xbf]"
    b"|[\xe0-\xe3][\x80-\x84\x86-\x9f\xa1-\xbf][ ]"
    b"|[\xf0][ ][\x80-\xbf][\x80-\xbf]"
    b"|[\xf0][\x80-\xbf][ ][\x80-\xbf]"
    b"|[\xf0][\x80-\xbf][\x80-\xbf][ ]"
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Bytes pattern: a UTF-8 lead byte (0xC2-0xF4) followed by the number of
# continuation slots its length calls for, where at least one slot holds
# 0x1A or "?" instead of a real continuation byte (0x80-0xBF). The last
# alternative matches a bare 0x1A on its own.
# - Two-byte leads: 0x1A after any of 0xC2-0xDF; "?" only after 0xC2-0xC3.
# - The two 0xED alternatives match a pair of three-byte sequences
#   (0xED 0xA0-0xAF .. followed by 0xED 0xB0-0xBF ..).
#   NOTE(review): this looks like a CESU-8 style surrogate pair -- confirm.
# NOTE(review): presumably 0x1A and "?" are what lossy upstream decoding left
# in place of bytes it could not represent -- confirm against the caller.
LOSSY_UTF8_RE = re.compile(
    b"[\xc2-\xdf][\x1a]"
    b"|[\xc2-\xc3][?]"
    b"|\xed[\xa0-\xaf][\x1a?]\xed[\xb0-\xbf][\x1a?\x80-\xbf]"
    b"|\xed[\xa0-\xaf][\x1a?\x80-\xbf]\xed[\xb0-\xbf][\x1a?]"
    b"|[\xe0-\xef][\x1a?][\x1a\x80-\xbf]"
    b"|[\xe0-\xef][\x1a\x80-\xbf][\x1a?]"
    b"|[\xf0-\xf4][\x1a?][\x1a\x80-\xbf][\x1a\x80-\xbf]"
    b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a?][\x1a\x80-\xbf]"
    b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a\x80-\xbf][\x1a?]"
    b"|\x1a"
)
|
|
|
|
|
|
|
|
|
# Matches any single character in U+0080..U+009F (the C1 control range).
C1_CONTROL_RE = re.compile(r"[\x80-\x9f]")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Translate table (code point -> replacement string) that splits ligature
# and digraph characters into their separate letters; suitable for
# str.translate().
#
# Keys and non-ASCII replacement letters are written as explicit code points
# / escapes rather than literal characters, so the table cannot be corrupted
# by the file being read or saved with the wrong encoding (which would leave
# ord() receiving multi-character strings and raising TypeError at import).
LIGATURES = {
    # Dutch ligatures
    0x0132: "IJ",  # LATIN CAPITAL LIGATURE IJ
    0x0133: "ij",  # LATIN SMALL LIGATURE IJ
    # "n preceded by apostrophe" -> U+02BC MODIFIER LETTER APOSTROPHE + "n"
    0x0149: "\u02bcn",
    # DZ / DZ-with-caron / LJ / NJ digraph characters
    0x01F1: "DZ",
    0x01F2: "Dz",
    0x01F3: "dz",
    0x01C4: "D\u017d",  # D + Z WITH CARON (capital)
    0x01C5: "D\u017e",  # D + z with caron (small)
    0x01C6: "d\u017e",  # d + z with caron (small)
    0x01C7: "LJ",
    0x01C8: "Lj",
    0x01C9: "lj",
    0x01CA: "NJ",
    0x01CB: "Nj",
    0x01CC: "nj",
    # Latin typographical ligatures (U+FB00..U+FB06)
    0xFB00: "ff",
    0xFB01: "fi",
    0xFB02: "fl",
    0xFB03: "ffi",
    0xFB04: "ffl",
    0xFB05: "\u017ft",  # LONG S + t
    0xFB06: "st",
}
|
|
|
|
|
def _build_width_map() -> dict[int, str]: |
|
""" |
|
Build a translate mapping that replaces halfwidth and fullwidth forms |
|
with their standard-width forms. |
|
""" |
|
|
|
|
|
|
|
width_map = {0x3000: " "} |
|
for i in range(0xFF01, 0xFFF0): |
|
char = chr(i) |
|
alternate = unicodedata.normalize("NFKC", char) |
|
if alternate != char: |
|
width_map[i] = alternate |
|
return width_map |
|
|
|
|
|
# Built once at import time: code point -> standard-width replacement string,
# i.e. a str.translate() table produced by _build_width_map().
WIDTH_MAP = _build_width_map()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Character lists that UTF8_DETECTOR_RE interpolates into regex character
# classes ("[...]"). Each value is one long string built by implicit
# concatenation of adjacent literals.
# NOTE(review): judging by the key names, these are the characters that the
# bytes of a UTF-8 sequence turn into when they are mis-decoded with one of
# the single-byte encodings; the individual byte values were not re-verified
# here.
UTF8_CLUES: dict[str, str] = {
    # Characters standing in for the first byte of a 2-byte sequence.
    "utf8_first_of_2": (
        "\N{LATIN CAPITAL LETTER A WITH BREVE}"
        "\N{LATIN CAPITAL LETTER A WITH CIRCUMFLEX}"
        "\N{LATIN CAPITAL LETTER A WITH DIAERESIS}"
        "\N{LATIN CAPITAL LETTER A WITH MACRON}"
        "\N{LATIN CAPITAL LETTER A WITH RING ABOVE}"
        "\N{LATIN CAPITAL LETTER A WITH TILDE}"
        "\N{LATIN CAPITAL LETTER AE}"
        "\N{LATIN CAPITAL LETTER C WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER C WITH CARON}"
        "\N{LATIN CAPITAL LETTER C WITH CEDILLA}"
        "\N{LATIN CAPITAL LETTER D WITH CARON}"
        "\N{LATIN CAPITAL LETTER D WITH STROKE}"
        "\N{LATIN CAPITAL LETTER E WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER E WITH CARON}"
        "\N{LATIN CAPITAL LETTER E WITH CIRCUMFLEX}"
        "\N{LATIN CAPITAL LETTER E WITH DIAERESIS}"
        "\N{LATIN CAPITAL LETTER E WITH DOT ABOVE}"
        "\N{LATIN CAPITAL LETTER E WITH GRAVE}"
        "\N{LATIN CAPITAL LETTER E WITH MACRON}"
        "\N{LATIN CAPITAL LETTER E WITH OGONEK}"
        "\N{LATIN CAPITAL LETTER ETH}"
        "\N{LATIN CAPITAL LETTER G WITH BREVE}"
        "\N{LATIN CAPITAL LETTER G WITH CEDILLA}"
        "\N{LATIN CAPITAL LETTER I WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER I WITH CIRCUMFLEX}"
        "\N{LATIN CAPITAL LETTER I WITH DIAERESIS}"
        "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}"
        "\N{LATIN CAPITAL LETTER I WITH GRAVE}"
        "\N{LATIN CAPITAL LETTER I WITH MACRON}"
        "\N{LATIN CAPITAL LETTER K WITH CEDILLA}"
        "\N{LATIN CAPITAL LETTER L WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER L WITH CEDILLA}"
        "\N{LATIN CAPITAL LETTER L WITH STROKE}"
        "\N{LATIN CAPITAL LETTER N WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER N WITH CARON}"
        "\N{LATIN CAPITAL LETTER N WITH CEDILLA}"
        "\N{LATIN CAPITAL LETTER N WITH TILDE}"
        "\N{LATIN CAPITAL LETTER O WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER O WITH CIRCUMFLEX}"
        "\N{LATIN CAPITAL LETTER O WITH DIAERESIS}"
        "\N{LATIN CAPITAL LETTER O WITH DOUBLE ACUTE}"
        "\N{LATIN CAPITAL LETTER O WITH GRAVE}"
        "\N{LATIN CAPITAL LETTER O WITH MACRON}"
        "\N{LATIN CAPITAL LETTER O WITH STROKE}"
        "\N{LATIN CAPITAL LETTER O WITH TILDE}"
        "\N{LATIN CAPITAL LETTER R WITH CARON}"
        "\N{LATIN CAPITAL LETTER S WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER S WITH CARON}"
        "\N{LATIN CAPITAL LETTER S WITH CEDILLA}"
        "\N{LATIN CAPITAL LETTER T WITH CEDILLA}"
        "\N{LATIN CAPITAL LETTER THORN}"
        "\N{LATIN CAPITAL LETTER U WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER U WITH CIRCUMFLEX}"
        "\N{LATIN CAPITAL LETTER U WITH DIAERESIS}"
        "\N{LATIN CAPITAL LETTER U WITH DOUBLE ACUTE}"
        "\N{LATIN CAPITAL LETTER U WITH GRAVE}"
        "\N{LATIN CAPITAL LETTER U WITH MACRON}"
        "\N{LATIN CAPITAL LETTER U WITH OGONEK}"
        "\N{LATIN CAPITAL LETTER U WITH RING ABOVE}"
        "\N{LATIN CAPITAL LETTER Y WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER Z WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER Z WITH CARON}"
        "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}"
        "\N{LATIN SMALL LETTER SHARP S}"
        "\N{MULTIPLICATION SIGN}"
        "\N{GREEK CAPITAL LETTER BETA}"
        "\N{GREEK CAPITAL LETTER GAMMA}"
        "\N{GREEK CAPITAL LETTER DELTA}"
        "\N{GREEK CAPITAL LETTER EPSILON}"
        "\N{GREEK CAPITAL LETTER ZETA}"
        "\N{GREEK CAPITAL LETTER ETA}"
        "\N{GREEK CAPITAL LETTER THETA}"
        "\N{GREEK CAPITAL LETTER IOTA}"
        "\N{GREEK CAPITAL LETTER KAPPA}"
        "\N{GREEK CAPITAL LETTER LAMDA}"
        "\N{GREEK CAPITAL LETTER MU}"
        "\N{GREEK CAPITAL LETTER NU}"
        "\N{GREEK CAPITAL LETTER XI}"
        "\N{GREEK CAPITAL LETTER OMICRON}"
        "\N{GREEK CAPITAL LETTER PI}"
        "\N{GREEK CAPITAL LETTER RHO}"
        "\N{GREEK CAPITAL LETTER SIGMA}"
        "\N{GREEK CAPITAL LETTER TAU}"
        "\N{GREEK CAPITAL LETTER UPSILON}"
        "\N{GREEK CAPITAL LETTER PHI}"
        "\N{GREEK CAPITAL LETTER CHI}"
        "\N{GREEK CAPITAL LETTER PSI}"
        "\N{GREEK CAPITAL LETTER OMEGA}"
        "\N{GREEK CAPITAL LETTER IOTA WITH DIALYTIKA}"
        "\N{GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA}"
        "\N{GREEK SMALL LETTER ALPHA WITH TONOS}"
        "\N{GREEK SMALL LETTER EPSILON WITH TONOS}"
        "\N{GREEK SMALL LETTER ETA WITH TONOS}"
        "\N{GREEK SMALL LETTER IOTA WITH TONOS}"
        "\N{CYRILLIC CAPITAL LETTER VE}"
        "\N{CYRILLIC CAPITAL LETTER GHE}"
        "\N{CYRILLIC CAPITAL LETTER DE}"
        "\N{CYRILLIC CAPITAL LETTER IE}"
        "\N{CYRILLIC CAPITAL LETTER ZHE}"
        "\N{CYRILLIC CAPITAL LETTER ZE}"
        "\N{CYRILLIC CAPITAL LETTER I}"
        "\N{CYRILLIC CAPITAL LETTER SHORT I}"
        "\N{CYRILLIC CAPITAL LETTER KA}"
        "\N{CYRILLIC CAPITAL LETTER EL}"
        "\N{CYRILLIC CAPITAL LETTER EM}"
        "\N{CYRILLIC CAPITAL LETTER EN}"
        "\N{CYRILLIC CAPITAL LETTER O}"
        "\N{CYRILLIC CAPITAL LETTER PE}"
        "\N{CYRILLIC CAPITAL LETTER ER}"
        "\N{CYRILLIC CAPITAL LETTER ES}"
        "\N{CYRILLIC CAPITAL LETTER TE}"
        "\N{CYRILLIC CAPITAL LETTER U}"
        "\N{CYRILLIC CAPITAL LETTER EF}"
        "\N{CYRILLIC CAPITAL LETTER HA}"
        "\N{CYRILLIC CAPITAL LETTER TSE}"
        "\N{CYRILLIC CAPITAL LETTER CHE}"
        "\N{CYRILLIC CAPITAL LETTER SHA}"
        "\N{CYRILLIC CAPITAL LETTER SHCHA}"
        "\N{CYRILLIC CAPITAL LETTER HARD SIGN}"
        "\N{CYRILLIC CAPITAL LETTER YERU}"
        "\N{CYRILLIC CAPITAL LETTER SOFT SIGN}"
        "\N{CYRILLIC CAPITAL LETTER E}"
        "\N{CYRILLIC CAPITAL LETTER YU}"
        "\N{CYRILLIC CAPITAL LETTER YA}"
    ),
    # Characters standing in for the first byte of a 3-byte sequence.
    "utf8_first_of_3": (
        "\N{LATIN SMALL LETTER A WITH ACUTE}"
        "\N{LATIN SMALL LETTER A WITH BREVE}"
        "\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}"
        "\N{LATIN SMALL LETTER A WITH DIAERESIS}"
        "\N{LATIN SMALL LETTER A WITH GRAVE}"
        "\N{LATIN SMALL LETTER A WITH MACRON}"
        "\N{LATIN SMALL LETTER A WITH OGONEK}"
        "\N{LATIN SMALL LETTER A WITH RING ABOVE}"
        "\N{LATIN SMALL LETTER A WITH TILDE}"
        "\N{LATIN SMALL LETTER AE}"
        "\N{LATIN SMALL LETTER C WITH ACUTE}"
        "\N{LATIN SMALL LETTER C WITH CARON}"
        "\N{LATIN SMALL LETTER C WITH CEDILLA}"
        "\N{LATIN SMALL LETTER D WITH CARON}"
        "\N{LATIN SMALL LETTER E WITH ACUTE}"
        "\N{LATIN SMALL LETTER E WITH CARON}"
        "\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}"
        "\N{LATIN SMALL LETTER E WITH DIAERESIS}"
        "\N{LATIN SMALL LETTER E WITH DOT ABOVE}"
        "\N{LATIN SMALL LETTER E WITH GRAVE}"
        "\N{LATIN SMALL LETTER E WITH MACRON}"
        # NOTE(review): the next entry appears twice in a row. A repeated
        # character is harmless inside a regex character class, but check
        # whether a different character was intended.
        "\N{LATIN SMALL LETTER E WITH OGONEK}"
        "\N{LATIN SMALL LETTER E WITH OGONEK}"
        "\N{LATIN SMALL LETTER G WITH CEDILLA}"
        "\N{LATIN SMALL LETTER I WITH ACUTE}"
        "\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}"
        "\N{LATIN SMALL LETTER I WITH DIAERESIS}"
        "\N{LATIN SMALL LETTER I WITH GRAVE}"
        "\N{LATIN SMALL LETTER I WITH MACRON}"
        "\N{LATIN SMALL LETTER I WITH OGONEK}"
        "\N{LATIN SMALL LETTER K WITH CEDILLA}"
        "\N{LATIN SMALL LETTER L WITH ACUTE}"
        "\N{LATIN SMALL LETTER L WITH CEDILLA}"
        "\N{LATIN SMALL LETTER R WITH ACUTE}"
        "\N{LATIN SMALL LETTER Z WITH ACUTE}"
        "\N{GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS}"
        "\N{GREEK SMALL LETTER ALPHA}"
        "\N{GREEK SMALL LETTER BETA}"
        "\N{GREEK SMALL LETTER GAMMA}"
        "\N{GREEK SMALL LETTER DELTA}"
        "\N{GREEK SMALL LETTER EPSILON}"
        "\N{GREEK SMALL LETTER ZETA}"
        "\N{GREEK SMALL LETTER ETA}"
        "\N{GREEK SMALL LETTER THETA}"
        "\N{GREEK SMALL LETTER IOTA}"
        "\N{GREEK SMALL LETTER KAPPA}"
        "\N{GREEK SMALL LETTER LAMDA}"
        "\N{GREEK SMALL LETTER MU}"
        "\N{GREEK SMALL LETTER NU}"
        "\N{GREEK SMALL LETTER XI}"
        "\N{GREEK SMALL LETTER OMICRON}"
        "\N{CYRILLIC SMALL LETTER A}"
        "\N{CYRILLIC SMALL LETTER BE}"
        "\N{CYRILLIC SMALL LETTER VE}"
        "\N{CYRILLIC SMALL LETTER GHE}"
        "\N{CYRILLIC SMALL LETTER DE}"
        "\N{CYRILLIC SMALL LETTER IE}"
        "\N{CYRILLIC SMALL LETTER ZHE}"
        "\N{CYRILLIC SMALL LETTER ZE}"
        "\N{CYRILLIC SMALL LETTER I}"
        "\N{CYRILLIC SMALL LETTER SHORT I}"
        "\N{CYRILLIC SMALL LETTER KA}"
        "\N{CYRILLIC SMALL LETTER EL}"
        "\N{CYRILLIC SMALL LETTER EM}"
        "\N{CYRILLIC SMALL LETTER EN}"
        "\N{CYRILLIC SMALL LETTER O}"
        "\N{CYRILLIC SMALL LETTER PE}"
    ),
    # Characters standing in for the first byte of a 4-byte sequence.
    "utf8_first_of_4": (
        "\N{LATIN SMALL LETTER D WITH STROKE}"
        "\N{LATIN SMALL LETTER ETH}"
        "\N{LATIN SMALL LETTER G WITH BREVE}"
        "\N{LATIN SMALL LETTER O WITH ACUTE}"
        "\N{LATIN SMALL LETTER S WITH CARON}"
        "\N{GREEK SMALL LETTER PI}"
        "\N{GREEK SMALL LETTER SIGMA}"
        "\N{CYRILLIC SMALL LETTER ER}"
        "\N{CYRILLIC SMALL LETTER U}"
    ),
    # Characters standing in for a continuation byte. The leading
    # "\x80-\xbf" is three characters (U+0080, "-", U+00BF) that become a
    # range once the string is placed inside "[...]". A plain space is
    # accepted here too.
    "utf8_continuation": (
        "\x80-\xbf"
        "\N{SPACE}"
        "\N{LATIN CAPITAL LETTER A WITH OGONEK}"
        "\N{LATIN CAPITAL LETTER AE}"
        "\N{LATIN CAPITAL LETTER L WITH CARON}"
        "\N{LATIN CAPITAL LETTER L WITH STROKE}"
        "\N{LATIN CAPITAL LETTER O WITH STROKE}"
        "\N{LATIN CAPITAL LETTER R WITH CEDILLA}"
        "\N{LATIN CAPITAL LETTER S WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER S WITH CARON}"
        "\N{LATIN CAPITAL LETTER S WITH CEDILLA}"
        "\N{LATIN CAPITAL LETTER T WITH CARON}"
        "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}"
        "\N{LATIN CAPITAL LETTER Z WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER Z WITH CARON}"
        "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}"
        "\N{LATIN CAPITAL LIGATURE OE}"
        "\N{LATIN SMALL LETTER A WITH OGONEK}"
        "\N{LATIN SMALL LETTER AE}"
        "\N{LATIN SMALL LETTER F WITH HOOK}"
        "\N{LATIN SMALL LETTER L WITH CARON}"
        "\N{LATIN SMALL LETTER L WITH STROKE}"
        "\N{LATIN SMALL LETTER O WITH STROKE}"
        "\N{LATIN SMALL LETTER R WITH CEDILLA}"
        "\N{LATIN SMALL LETTER S WITH ACUTE}"
        "\N{LATIN SMALL LETTER S WITH CARON}"
        "\N{LATIN SMALL LETTER S WITH CEDILLA}"
        "\N{LATIN SMALL LETTER T WITH CARON}"
        "\N{LATIN SMALL LETTER Z WITH ACUTE}"
        "\N{LATIN SMALL LETTER Z WITH CARON}"
        "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}"
        "\N{LATIN SMALL LIGATURE OE}"
        "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}"
        "\N{CARON}"
        "\N{BREVE}"
        "\N{OGONEK}"
        "\N{SMALL TILDE}"
        "\N{DOUBLE ACUTE ACCENT}"
        "\N{GREEK TONOS}"
        "\N{GREEK DIALYTIKA TONOS}"
        "\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}"
        "\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}"
        "\N{GREEK CAPITAL LETTER ETA WITH TONOS}"
        "\N{GREEK CAPITAL LETTER IOTA WITH TONOS}"
        "\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}"
        "\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}"
        "\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}"
        "\N{CYRILLIC CAPITAL LETTER IO}"
        "\N{CYRILLIC CAPITAL LETTER DJE}"
        "\N{CYRILLIC CAPITAL LETTER GJE}"
        "\N{CYRILLIC CAPITAL LETTER UKRAINIAN IE}"
        "\N{CYRILLIC CAPITAL LETTER DZE}"
        "\N{CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I}"
        "\N{CYRILLIC CAPITAL LETTER YI}"
        "\N{CYRILLIC CAPITAL LETTER JE}"
        "\N{CYRILLIC CAPITAL LETTER LJE}"
        "\N{CYRILLIC CAPITAL LETTER NJE}"
        "\N{CYRILLIC CAPITAL LETTER TSHE}"
        "\N{CYRILLIC CAPITAL LETTER KJE}"
        "\N{CYRILLIC CAPITAL LETTER SHORT U}"
        "\N{CYRILLIC CAPITAL LETTER DZHE}"
        "\N{CYRILLIC SMALL LETTER IO}"
        "\N{CYRILLIC SMALL LETTER DJE}"
        "\N{CYRILLIC SMALL LETTER GJE}"
        "\N{CYRILLIC SMALL LETTER UKRAINIAN IE}"
        "\N{CYRILLIC SMALL LETTER DZE}"
        "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}"
        "\N{CYRILLIC SMALL LETTER YI}"
        "\N{CYRILLIC SMALL LETTER JE}"
        "\N{CYRILLIC SMALL LETTER LJE}"
        "\N{CYRILLIC SMALL LETTER NJE}"
        "\N{CYRILLIC SMALL LETTER TSHE}"
        "\N{CYRILLIC SMALL LETTER KJE}"
        "\N{CYRILLIC SMALL LETTER SHORT U}"
        "\N{CYRILLIC SMALL LETTER DZHE}"
        "\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}"
        "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}"
        "\N{EN DASH}"
        "\N{EM DASH}"
        "\N{HORIZONTAL BAR}"
        "\N{LEFT SINGLE QUOTATION MARK}"
        "\N{RIGHT SINGLE QUOTATION MARK}"
        "\N{SINGLE LOW-9 QUOTATION MARK}"
        "\N{LEFT DOUBLE QUOTATION MARK}"
        "\N{RIGHT DOUBLE QUOTATION MARK}"
        "\N{DOUBLE LOW-9 QUOTATION MARK}"
        "\N{DAGGER}"
        "\N{DOUBLE DAGGER}"
        "\N{BULLET}"
        "\N{HORIZONTAL ELLIPSIS}"
        "\N{PER MILLE SIGN}"
        "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}"
        "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}"
        "\N{EURO SIGN}"
        "\N{NUMERO SIGN}"
        "\N{TRADE MARK SIGN}"
    ),
    # Same list as "utf8_continuation" with these entries left out: SPACE,
    # EN DASH, EM DASH, HORIZONTAL BAR, the six single/double quotation
    # marks, BULLET and HORIZONTAL ELLIPSIS. UTF8_DETECTOR_RE uses it only
    # in its leading negative lookbehind.
    "utf8_continuation_strict": (
        "\x80-\xbf"
        "\N{LATIN CAPITAL LETTER A WITH OGONEK}"
        "\N{LATIN CAPITAL LETTER AE}"
        "\N{LATIN CAPITAL LETTER L WITH CARON}"
        "\N{LATIN CAPITAL LETTER L WITH STROKE}"
        "\N{LATIN CAPITAL LETTER O WITH STROKE}"
        "\N{LATIN CAPITAL LETTER R WITH CEDILLA}"
        "\N{LATIN CAPITAL LETTER S WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER S WITH CARON}"
        "\N{LATIN CAPITAL LETTER S WITH CEDILLA}"
        "\N{LATIN CAPITAL LETTER T WITH CARON}"
        "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}"
        "\N{LATIN CAPITAL LETTER Z WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER Z WITH CARON}"
        "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}"
        "\N{LATIN CAPITAL LIGATURE OE}"
        "\N{LATIN SMALL LETTER A WITH OGONEK}"
        "\N{LATIN SMALL LETTER AE}"
        "\N{LATIN SMALL LETTER F WITH HOOK}"
        "\N{LATIN SMALL LETTER L WITH CARON}"
        "\N{LATIN SMALL LETTER L WITH STROKE}"
        "\N{LATIN SMALL LETTER O WITH STROKE}"
        "\N{LATIN SMALL LETTER R WITH CEDILLA}"
        "\N{LATIN SMALL LETTER S WITH ACUTE}"
        "\N{LATIN SMALL LETTER S WITH CARON}"
        "\N{LATIN SMALL LETTER S WITH CEDILLA}"
        "\N{LATIN SMALL LETTER T WITH CARON}"
        "\N{LATIN SMALL LETTER Z WITH ACUTE}"
        "\N{LATIN SMALL LETTER Z WITH CARON}"
        "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}"
        "\N{LATIN SMALL LIGATURE OE}"
        "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}"
        "\N{CARON}"
        "\N{BREVE}"
        "\N{OGONEK}"
        "\N{SMALL TILDE}"
        "\N{DOUBLE ACUTE ACCENT}"
        "\N{GREEK TONOS}"
        "\N{GREEK DIALYTIKA TONOS}"
        "\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}"
        "\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}"
        "\N{GREEK CAPITAL LETTER ETA WITH TONOS}"
        "\N{GREEK CAPITAL LETTER IOTA WITH TONOS}"
        "\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}"
        "\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}"
        "\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}"
        "\N{CYRILLIC CAPITAL LETTER IO}"
        "\N{CYRILLIC CAPITAL LETTER DJE}"
        "\N{CYRILLIC CAPITAL LETTER GJE}"
        "\N{CYRILLIC CAPITAL LETTER UKRAINIAN IE}"
        "\N{CYRILLIC CAPITAL LETTER DZE}"
        "\N{CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I}"
        "\N{CYRILLIC CAPITAL LETTER YI}"
        "\N{CYRILLIC CAPITAL LETTER JE}"
        "\N{CYRILLIC CAPITAL LETTER LJE}"
        "\N{CYRILLIC CAPITAL LETTER NJE}"
        "\N{CYRILLIC CAPITAL LETTER TSHE}"
        "\N{CYRILLIC CAPITAL LETTER KJE}"
        "\N{CYRILLIC CAPITAL LETTER SHORT U}"
        "\N{CYRILLIC CAPITAL LETTER DZHE}"
        "\N{CYRILLIC SMALL LETTER IO}"
        "\N{CYRILLIC SMALL LETTER DJE}"
        "\N{CYRILLIC SMALL LETTER GJE}"
        "\N{CYRILLIC SMALL LETTER UKRAINIAN IE}"
        "\N{CYRILLIC SMALL LETTER DZE}"
        "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}"
        "\N{CYRILLIC SMALL LETTER YI}"
        "\N{CYRILLIC SMALL LETTER JE}"
        "\N{CYRILLIC SMALL LETTER LJE}"
        "\N{CYRILLIC SMALL LETTER NJE}"
        "\N{CYRILLIC SMALL LETTER TSHE}"
        "\N{CYRILLIC SMALL LETTER KJE}"
        "\N{CYRILLIC SMALL LETTER SHORT U}"
        "\N{CYRILLIC SMALL LETTER DZHE}"
        "\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}"
        "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}"
        "\N{DAGGER}"
        "\N{DOUBLE DAGGER}"
        "\N{PER MILLE SIGN}"
        "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}"
        "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}"
        "\N{EURO SIGN}"
        "\N{NUMERO SIGN}"
        "\N{TRADE MARK SIGN}"
    ),
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Verbose-mode regex assembled from UTF8_CLUES with str.format(); the doubled
# braces in the template are format escapes that come out as the regex
# quantifiers {2} and {3}.
#
# It matches one or more back-to-back groups, each being
#   - a "first of 2" character followed by 1 continuation character, or
#   - a "first of 3" character followed by 2 continuation characters, or
#   - a "first of 4" character followed by 3 continuation characters,
# and the whole run must not be immediately preceded by a character from the
# strict continuation list (negative lookbehind).
# Whitespace in the template is ignored because of re.VERBOSE; whitespace
# inside the interpolated character classes (the SPACE in
# "utf8_continuation") is still significant.
UTF8_DETECTOR_RE = re.compile(
    """
    (?<! [{utf8_continuation_strict}])
    (
        [{utf8_first_of_2}] [{utf8_continuation}]
        |
        [{utf8_first_of_3}] [{utf8_continuation}]{{2}}
        |
        [{utf8_first_of_4}] [{utf8_continuation}]{{3}}
    )+
    """.format(**UTF8_CLUES),
    re.VERBOSE,
)
|
|