# Source code for extract_msg.encoding

"""
File for handling specialized encoding tasks or information.
"""

__all__ = [
    'lookupCodePage',
]


# This adds additional encodings to python.
import ebcdic as _
import codecs

from ..exceptions import UnknownCodepageError, UnsupportedEncodingError


# This is a dictionary matching the code page number to its encoding name.
# The list used to make this can be found here:
# https://docs.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
### TODO:
# Many of these code pages are not supported by Python. As such, we should
# really implement them ourselves to make sure that if someone wants to use an
# MSG file with one of those encodings, they are able to. Perhaps we should
# create a separate module for that?
# Code pages that currently don't have a supported encoding will be preceded by
# `# UNSUPPORTED`.
# For some of these, it is also possible that the name we are trying to find
# them with is not known to Python. I have already confirmed this for a few of
# them, and adjusted their names to ones that Python would recognize. It is
# possible I missed a few.
_CODE_PAGES = {
    37: 'IBM037', # IBM EBCDIC US-Canada
    437: 'IBM437', # OEM United States
    500: 'IBM500', # IBM EBCDIC International
    708: 'ASMO-708', # Arabic (ASMO 708)
    # UNSUPPORTED.
    709: '', # Arabic (ASMO-449+, BCON V4)
    # UNSUPPORTED.
    710: '', # Arabic - Transparent Arabic
    # UNSUPPORTED.
    720: 'DOS-720', # Arabic (Transparent ASMO); Arabic (DOS)
    737: 'cp737', # OEM Greek (formerly 437G); Greek (DOS)
    775: 'ibm775', # OEM Baltic; Baltic (DOS)
    850: 'ibm850', # OEM Multilingual Latin 1; Western European (DOS)
    852: 'ibm852', # OEM Latin 2; Central European (DOS)
    855: 'IBM855', # OEM Cyrillic (primarily Russian)
    857: 'ibm857', # OEM Turkish; Turkish (DOS)
    858: 'cp858', # OEM Multilingual Latin 1 + Euro symbol
    860: 'IBM860', # OEM Portuguese; Portuguese (DOS)
    861: 'ibm861', # OEM Icelandic; Icelandic (DOS)
    862: 'cp862', # OEM Hebrew; Hebrew (DOS)
    863: 'IBM863', # OEM French Canadian; French Canadian (DOS)
    864: 'IBM864', # OEM Arabic; Arabic (864)
    865: 'IBM865', # OEM Nordic; Nordic (DOS)
    866: 'cp866', # OEM Russian; Cyrillic (DOS)
    869: 'ibm869', # OEM Modern Greek; Greek, Modern (DOS)
    870: 'cp870', # IBM870 # IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
    874: 'windows-874', # ANSI/OEM Thai (ISO 8859-11); Thai (Windows)
    875: 'cp875', # IBM EBCDIC Greek Modern
    932: 'shift_jis', # ANSI/OEM Japanese; Japanese (Shift-JIS)
    936: 'gb2312', # ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
    949: 'ks_c_5601-1987', # ANSI/OEM Korean (Unified Hangul Code)
    # We *must* use a custom encoding because the Python implementation differs
    # from the Microsoft implementation.
    950: 'windows-950', # ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
    1026: 'IBM1026', # IBM EBCDIC Turkish (Latin 5)
    1047: 'cp1047', # IBM EBCDIC Latin 1/Open System
    1140: 'cp1140', # IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)
    1141: 'cp1141', # IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
    1142: 'cp1142', # IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)
    1143: 'cp1143', # IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)
    1144: 'cp1144', # IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)
    1145: 'cp1145', # IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)
    1146: 'cp1146', # IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)
    1147: 'cp1147', # IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)
    1148: 'cp1148ms', # IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)
    1149: 'cp1149', # IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)
    1200: 'utf-16-le', # Unicode UTF-16, little endian byte order (BMP of ISO 10646);
    1201: 'utf-16-be', # Unicode UTF-16, big endian byte order;
    1250: 'windows-1250', # ANSI Central European; Central European (Windows)
    1251: 'windows-1251', # ANSI Cyrillic; Cyrillic (Windows)
    1252: 'windows-1252', # ANSI Latin 1; Western European (Windows)
    1253: 'windows-1253', # ANSI Greek; Greek (Windows)
    1254: 'windows-1254', # ANSI Turkish; Turkish (Windows)
    1255: 'windows-1255', # ANSI Hebrew; Hebrew (Windows)
    1256: 'windows-1256', # ANSI Arabic; Arabic (Windows)
    1257: 'windows-1257', # ANSI Baltic; Baltic (Windows)
    1258: 'windows-1258', # ANSI/OEM Vietnamese; Vietnamese (Windows)
    1361: 'Johab', # Korean (Johab)
    10000: 'macintosh', # MAC Roman; Western European (Mac)
    10001: 'x-mac-japanese', # Japanese (Mac)
    # UNSUPPORTED.
    10002: 'x-mac-chinesetrad', # MAC Traditional Chinese (Big5); Chinese Traditional (Mac)
    10003: 'x-mac-korean', # Korean (Mac)
    # UNSUPPORTED.
    10004: 'x-mac-arabic', # Arabic (Mac)
    # UNSUPPORTED.
    10005: 'x-mac-hebrew', # Hebrew (Mac)
    10006: 'x-mac-greek', # Greek (Mac)
    10007: 'x-mac-cyrillic', # Cyrillic (Mac)
    # UNSUPPORTED.
    10008: 'x-mac-chinesesimp', # MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)
    # UNSUPPORTED.
    10010: 'x-mac-romanian', # Romanian (Mac)
    # UNSUPPORTED.
    10017: 'x-mac-ukrainian', # Ukrainian (Mac)
    # UNSUPPORTED.
    10021: 'x-mac-thai', # Thai (Mac)
    10029: 'x-mac-ce', # MAC Latin 2; Central European (Mac)
    10079: 'x-mac-icelandic', # Icelandic (Mac)
    10081: 'x-mac-turkish', # Turkish (Mac)
    # UNSUPPORTED.
    10082: 'x-mac-croatian', # Croatian (Mac)
    # Use the explicit little-endian codec (as 1200 does for UTF-16): plain
    # 'utf-32' prepends a BOM when encoding and falls back to the platform's
    # native byte order when decoding data that has no BOM.
    12000: 'utf-32-le', # Unicode UTF-32, little endian byte order
    12001: 'utf-32BE', # Unicode UTF-32, big endian byte order
    # UNSUPPORTED.
    20000: 'x-Chinese_CNS', # CNS Taiwan; Chinese Traditional (CNS)
    # UNSUPPORTED.
    20001: 'x-cp20001', # TCA Taiwan
    # UNSUPPORTED.
    20002: 'x_Chinese-Eten', # Eten Taiwan; Chinese Traditional (Eten)
    # UNSUPPORTED.
    20003: 'x-cp20003', # IBM5550 Taiwan
    # UNSUPPORTED.
    20004: 'x-cp20004', # TeleText Taiwan
    # UNSUPPORTED.
    20005: 'x-cp20005', # Wang Taiwan
    # UNSUPPORTED.
    20105: 'x-IA5', # IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)
    # UNSUPPORTED.
    20106: 'x-IA5-German', # IA5 German (7-bit)
    # UNSUPPORTED.
    20107: 'x-IA5-Swedish', # IA5 Swedish (7-bit)
    # UNSUPPORTED.
    20108: 'x-IA5-Norwegian', # IA5 Norwegian (7-bit)
    20127: 'us-ascii', # US-ASCII (7-bit)
    # UNSUPPORTED.
    20261: 'x-cp20261', # T.61
    # UNSUPPORTED.
    20269: 'x-cp20269', # ISO 6937 Non-Spacing Accent
    20273: 'IBM273', # IBM EBCDIC Germany
    20277: 'cp277', # IBM EBCDIC Denmark-Norway
    20278: 'cp278', # IBM EBCDIC Finland-Sweden
    20280: 'cp280', # IBM EBCDIC Italy
    20284: 'cp284', # IBM EBCDIC Latin America-Spain
    20285: 'cp285', # IBM EBCDIC United Kingdom
    20290: 'cp290', # IBM EBCDIC Japanese Katakana Extended
    20297: 'cp297', # IBM EBCDIC France
    20420: 'cp420', # IBM EBCDIC Arabic
    # UNSUPPORTED.
    20423: 'IBM423', # IBM EBCDIC Greek
    20424: 'IBM424', # IBM EBCDIC Hebrew
    20833: 'cp833', # IBM EBCDIC Korean Extended
    20838: 'cp838', # IBM EBCDIC Thai
    20866: 'koi8-r', # Russian (KOI8-R); Cyrillic (KOI8-R)
    20871: 'cp871', # IBM EBCDIC Icelandic
    # UNSUPPORTED.
    20880: 'IBM880', # IBM EBCDIC Cyrillic Russian
    # UNSUPPORTED.
    20905: 'IBM905', # IBM EBCDIC Turkish
    # UNSUPPORTED.
    20924: 'IBM00924', # IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
    20932: 'EUC-JP', # Japanese (JIS 0208-1990 and 0212-1990)
    # UNSUPPORTED.
    20936: 'x-cp20936', # Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
    # UNSUPPORTED.
    20949: 'x-cp20949', # Korean Wansung
    21025: 'cp1025', # IBM EBCDIC Cyrillic Serbian-Bulgarian
    # UNSUPPORTED.
    21027: '', # (deprecated)
    21866: 'koi8-u', # Ukrainian (KOI8-U); Cyrillic (KOI8-U)
    28591: 'iso-8859-1', # ISO 8859-1 Latin 1; Western European (ISO)
    28592: 'iso-8859-2', # ISO 8859-2 Central European; Central European (ISO)
    28593: 'iso-8859-3', # ISO 8859-3 Latin 3
    28594: 'iso-8859-4', # ISO 8859-4 Baltic
    28595: 'iso-8859-5', # ISO 8859-5 Cyrillic
    28596: 'iso-8859-6', # ISO 8859-6 Arabic
    28597: 'iso-8859-7', # ISO 8859-7 Greek
    28598: 'iso-8859-8', # ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
    28599: 'iso-8859-9', # ISO 8859-9 Turkish
    28603: 'iso-8859-13', # ISO 8859-13 Estonian
    28605: 'iso-8859-15', # ISO 8859-15 Latin 9
    # UNSUPPORTED.
    29001: 'x-Europa', # Europa 3
    # UNSUPPORTED.
    38598: 'iso-8859-8-i', # ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
    50220: 'iso-2022-jp', # ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
    50221: 'csISO2022JP', # ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
    50222: 'iso-2022-jp', # ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)
    50225: 'iso-2022-kr', # ISO 2022 Korean
    # UNSUPPORTED.
    50227: 'x-cp50227', # ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)
    # UNSUPPORTED.
    50229: '', # ISO 2022 Traditional Chinese
    # UNSUPPORTED.
    50930: '', # EBCDIC Japanese (Katakana) Extended
    # UNSUPPORTED.
    50931: '', # EBCDIC US-Canada and Japanese
    # UNSUPPORTED.
    50933: '', # EBCDIC Korean Extended and Korean
    # UNSUPPORTED.
    50935: '', # EBCDIC Simplified Chinese Extended and Simplified Chinese
    # UNSUPPORTED.
    50936: '', # EBCDIC Simplified Chinese
    # UNSUPPORTED.
    50937: '', # EBCDIC US-Canada and Traditional Chinese
    # UNSUPPORTED.
    50939: '', # EBCDIC Japanese (Latin) Extended and Japanese
    51932: 'euc-jp', # EUC Japanese
    51936: 'EUC-CN', # EUC Simplified Chinese; Chinese Simplified (EUC)
    51949: 'euc-kr', # EUC Korean
    # UNSUPPORTED.
    51950: '', # EUC Traditional Chinese
    52936: 'hz-gb-2312', # HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
    54936: 'GB18030', # Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
    # UNSUPPORTED.
    57002: 'x-iscii-de', # ISCII Devanagari
    # UNSUPPORTED.
    57003: 'x-iscii-be', # ISCII Bangla
    # UNSUPPORTED.
    57004: 'x-iscii-ta', # ISCII Tamil
    # UNSUPPORTED.
    57005: 'x-iscii-te', # ISCII Telugu
    # UNSUPPORTED.
    57006: 'x-iscii-as', # ISCII Assamese
    # UNSUPPORTED.
    57007: 'x-iscii-or', # ISCII Odia
    # UNSUPPORTED.
    57008: 'x-iscii-ka', # ISCII Kannada
    # UNSUPPORTED.
    57009: 'x-iscii-ma', # ISCII Malayalam
    # UNSUPPORTED.
    57010: 'x-iscii-gu', # ISCII Gujarati
    # UNSUPPORTED.
    57011: 'x-iscii-pa', # ISCII Punjabi
    65000: 'utf-7', # Unicode (UTF-7)
    65001: 'utf-8', # Unicode (UTF-8)
}

# Register new encodings.

def lookupCodePage(id_: int) -> str:
    """
    Converts a code page id into its encoding name.

    :param id_: The Windows code page identifier to look up.

    :returns: The encoding name stored for that code page in ``_CODE_PAGES``.

    :raises UnknownCodepageError: The code page was not recognized.
    :raises UnsupportedEncodingError: The code page was recognized, but no
        encoding exists in the environment with support for it.
    """
    # A missing key yields None, while a known-but-unsupported code page is
    # stored as an empty string, so one lookup distinguishes all three cases.
    page = _CODE_PAGES.get(id_)
    if page is None:
        raise UnknownCodepageError(f'Unknown code page {id_}.')
    if not page:
        raise UnsupportedEncodingError(f'Code page {id_} is unsupported.')
    return page

def _lookupEncoding(name):
    """
    Search function handed to ``codecs.register``.

    Returns the entry stored in ``_codecsInfo`` under ``name``, or ``None``
    when this module has no codec by that name.
    """
    try:
        return _codecsInfo[name]
    except KeyError:
        # Not one of ours.
        return None

from .utils import createSBEncoding as _sb, createVBEncoding as _vb
from ._dt import (
        _mac_ce, _mac_cyrillic, _mac_greek, _mac_iceland, _mac_turkish,
        _win874_dec, _win950_dec
    )

# Codecs supplied by this package, built from the decoding tables imported
# from `._dt`. This is the table that `_lookupEncoding` consults.
# NOTE(review): the keys use underscores where the names in `_CODE_PAGES` use
# hyphens -- presumably to match the normalized name that `codecs` passes to
# search functions; confirm against the minimum supported Python version.
# NOTE(review): `_sb` / `_vb` are `createSBEncoding` / `createVBEncoding` from
# `.utils`; they presumably build single-byte and variable-byte codecs
# respectively -- verify against that module.
_codecsInfo = {
    'x_mac_ce': _sb('x-mac-ce', _mac_ce.decodingTable),
    'x_mac_cyrillic': _sb('x-mac-cyrillic', _mac_cyrillic.decodingTable),
    'x_mac_greek': _sb('x-mac-greek', _mac_greek.decodingTable),
    'x_mac_icelandic': _sb('x-mac-icelandic', _mac_iceland.decodingTable),
    'x_mac_turkish': _sb('x-mac-turkish', _mac_turkish.decodingTable),
    'windows_950': _vb('windows-950', _win950_dec.decodingTable),
    'windows_874': _sb('windows-874', _win874_dec.decodingTable),
}

# Install the lookup as a codec search function (import-time side effect) so
# the names above can be resolved through the `codecs` machinery.
codecs.register(_lookupEncoding)