Module gatenlp.serialization.default

Module that implements the various ways of how to save and load documents and change logs.

Expand source code
"""
Module that implements the various ways of how to save and load documents and change logs.
"""
import os
from gatenlp.serialization.default_json import JsonSerializer
from gatenlp.serialization.default_msgpack import MsgPackSerializer
from gatenlp.serialization.default_pickle import PickleSerializer
from gatenlp.serialization.default_plaintext import PlainTextSerializer
from gatenlp.serialization.default_tweetv1 import TweetV1Serializer
from gatenlp.serialization.default_yaml import YamlSerializer
from gatenlp.serialization.default_gatexml import GateXmlLoader
from gatenlp.serialization.default_htmlannviewer import HtmlAnnViewerSerializer
from gatenlp.serialization.default_htmlloader import HtmlLoader

# TODO: when loading from a URL, allow for deciding on the format based on the mime type!
# So if we do not have the format, we should get the header for the file, check the mime type and see
# if  we have a loder registered for that and then let the loader do the rest of the work. This may
# need loaders to be able to use an already open stream.


def determine_loader(
    clazz, from_ext=None, from_mem=None, offset_mapper=None, gzip=False, **kwargs
):
    """

    Args:
      clazz:
      from_ext: (Default value = None)
      from_mem: (Default value = None)
      offset_mapper: (Default value = None)
      gzip: (Default value = False)
      **kwargs:

    Returns:

    """
    if from_mem:
        first = from_mem[0]
    else:
        with open(from_ext, "rt") as infp:
            first = infp.read(1)
    if first == "{":
        return JsonSerializer.load(
            clazz,
            from_ext=from_ext,
            from_mem=from_mem,
            offset_mapper=offset_mapper,
            gzip=gzip,
            **kwargs,
        )
    else:
        return MsgPackSerializer.load(
            clazz,
            from_ext=from_ext,
            from_mem=from_mem,
            offset_mapper=offset_mapper,
            gzip=gzip,
            **kwargs,
        )


DOCUMENT_SAVERS = {
    "text/plain": PlainTextSerializer.save,
    "text/plain+gzip": PlainTextSerializer.save_gzip,
    "text": PlainTextSerializer.save,
    "json": JsonSerializer.save,
    "jsongz": JsonSerializer.save_gzip,
    "bdocjs": JsonSerializer.save,
    "pickle": PickleSerializer.save,
    "bdocjsgz": JsonSerializer.save_gzip,
    "text/bdocjs": JsonSerializer.save,
    "text/bdocjs+gzip": JsonSerializer.save_gzip,
    "yaml": YamlSerializer.save,
    "bdocym": YamlSerializer.save,
    "yamlgz": YamlSerializer.save_gzip,
    "text/bdocym": YamlSerializer.save,
    "text/bdocym+gzip+": YamlSerializer.save_gzip,
    "msgpack": MsgPackSerializer.save,
    "bdocmp": MsgPackSerializer.save,
    "tweet-v1": TweetV1Serializer.save,
    "text/bdocmp": MsgPackSerializer.save,
    "application/msgpack": MsgPackSerializer.save,
    "html-ann-viewer": HtmlAnnViewerSerializer.save,
}
DOCUMENT_LOADERS = {
    "json": JsonSerializer.load,
    "jsongz": JsonSerializer.load_gzip,
    "bdocjs": JsonSerializer.load,
    "bdocjsgz": JsonSerializer.load_gzip,
    "text/bdocjs": JsonSerializer.load,
    "text/bdocjs+gzip": JsonSerializer.load_gzip,
    "yaml": YamlSerializer.load,
    "yamlgz": YamlSerializer.load_gzip,
    "bdocym": YamlSerializer.load,
    "bdocymzg: ": YamlSerializer.load_gzip,
    "text/bdocym": YamlSerializer.load,
    "text/bdocym+gzip": YamlSerializer.load_gzip,
    "msgpack": MsgPackSerializer.load,
    "bdocmp": MsgPackSerializer.load,
    "application/msgpack": MsgPackSerializer.load,
    "text/bdocmp": MsgPackSerializer.load,
    "jsonormsgpack": determine_loader,
    "text/plain": PlainTextSerializer.load,
    "text/plain+gzip": PlainTextSerializer.load_gzip,
    "text": PlainTextSerializer.load,
    "text/html": HtmlLoader.load,
    "html": HtmlLoader.load,
    "html-rendered": HtmlLoader.load_rendered,
    "gatexml": GateXmlLoader.load,
    "tweet-v1": TweetV1Serializer.load,
    "pickle": PickleSerializer.load,
}
CHANGELOG_SAVERS = {
    "json": JsonSerializer.save,
    "text/bdocjs+gzip": JsonSerializer.save_gzip,
    "text/bdocjs": JsonSerializer.save,
}
CHANGELOG_LOADERS = {
    "json": JsonSerializer.load,
    "text/bdocjs+gzip": JsonSerializer.load_gzip,
    "text/bdocjs": JsonSerializer.load,
}

# map extensions to document types
EXTENSIONS = {
    "bdocjs": "json",
    "bdocym": "yaml",
    "bdocym.gz": "text/bdocym+gzip",
    "bdoc.gz": "text/bdocjs+gzip",  # lets assume it is compressed json
    "bdoc": "jsonormsgpack",
    "bdocjs.gz": "text/bdocjs+gzip",
    "bdocjson": "json",
    "bdocmp": "msgpack",
    "txt": "text/plain",
    "txt.gz": "text/plain+gzip",
    "html": "text/html",
    "htm": "text/html",
    "pickle": "pickle",
}


def get_handler(filespec, fmt, handlers, saveload, what):
    """

    Args:
      filespec:
      fmt:
      handlers:
      saveload:
      what:

    Returns:

    """
    msg = f"Could not determine how to {saveload} {what} for format {fmt} in module gatenlp.serialization.default"
    if fmt:
        handler = handlers.get(fmt)
        if not handler:
            raise Exception(msg)
        return handler
    else:
        if not filespec:  # in case of save_mem
            raise Exception(msg)
        if isinstance(filespec, os.PathLike):
            wf = os.fspath(filespec)
        elif isinstance(filespec, str):
            wf = filespec
        else:
            raise Exception(msg)
        name, ext = os.path.splitext(wf)
        if ext == ".gz":
            ext2 = os.path.splitext(name)[1]
            if ext2:
                ext2 = ext2[1:]
            ext = ext2 + ext
        elif ext:
            ext = ext[1:]
        fmt = EXTENSIONS.get(ext)
        msg = f"Could not determine how to {saveload} {what} for format {fmt} and with " \
              "extension {ext} in module gatenlp.serialization.default"
        if not fmt:
            raise Exception(msg)
        handler = handlers.get(fmt)
        if not handler:
            raise Exception(msg)
        return handler


def get_document_saver(filespec, fmt):
    """

    Args:
      filespec:
      fmt:

    Returns:

    """
    return get_handler(filespec, fmt, DOCUMENT_SAVERS, "save", "document")


def get_document_loader(filespec, fmt):
    """

    Args:
      filespec:
      fmt:

    Returns:

    """
    return get_handler(filespec, fmt, DOCUMENT_LOADERS, "load", "document")


def get_changelog_saver(filespec, fmt):
    """

    Args:
      filespec:
      fmt:

    Returns:

    """
    return get_handler(filespec, fmt, CHANGELOG_SAVERS, "save", "changelog")


def get_changelog_loader(filespec, fmt):
    """

    Args:
      filespec:
      fmt:

    Returns:

    """
    return get_handler(filespec, fmt, CHANGELOG_LOADERS, "load", "changelog")

Functions

def determine_loader(clazz, from_ext=None, from_mem=None, offset_mapper=None, gzip=False, **kwargs)

Args

clazz:
from_ext
(Default value = None)
from_mem
(Default value = None)
offset_mapper
(Default value = None)
gzip
(Default value = False)

**kwargs: Returns:

Expand source code
def determine_loader(
    clazz, from_ext=None, from_mem=None, offset_mapper=None, gzip=False, **kwargs
):
    """

    Args:
      clazz:
      from_ext: (Default value = None)
      from_mem: (Default value = None)
      offset_mapper: (Default value = None)
      gzip: (Default value = False)
      **kwargs:

    Returns:

    """
    if from_mem:
        first = from_mem[0]
    else:
        with open(from_ext, "rt") as infp:
            first = infp.read(1)
    if first == "{":
        return JsonSerializer.load(
            clazz,
            from_ext=from_ext,
            from_mem=from_mem,
            offset_mapper=offset_mapper,
            gzip=gzip,
            **kwargs,
        )
    else:
        return MsgPackSerializer.load(
            clazz,
            from_ext=from_ext,
            from_mem=from_mem,
            offset_mapper=offset_mapper,
            gzip=gzip,
            **kwargs,
        )
def get_changelog_loader(filespec, fmt)

Args

filespec: fmt: Returns:

Expand source code
def get_changelog_loader(filespec, fmt):
    """

    Args:
      filespec:
      fmt:

    Returns:

    """
    return get_handler(filespec, fmt, CHANGELOG_LOADERS, "load", "changelog")
def get_changelog_saver(filespec, fmt)

Args

filespec: fmt: Returns:

Expand source code
def get_changelog_saver(filespec, fmt):
    """

    Args:
      filespec:
      fmt:

    Returns:

    """
    return get_handler(filespec, fmt, CHANGELOG_SAVERS, "save", "changelog")
def get_document_loader(filespec, fmt)

Args

filespec: fmt: Returns:

Expand source code
def get_document_loader(filespec, fmt):
    """

    Args:
      filespec:
      fmt:

    Returns:

    """
    return get_handler(filespec, fmt, DOCUMENT_LOADERS, "load", "document")
def get_document_saver(filespec, fmt)

Args

filespec: fmt: Returns:

Expand source code
def get_document_saver(filespec, fmt):
    """

    Args:
      filespec:
      fmt:

    Returns:

    """
    return get_handler(filespec, fmt, DOCUMENT_SAVERS, "save", "document")
def get_handler(filespec, fmt, handlers, saveload, what)

Args

filespec: fmt: handlers: saveload: what: Returns:

Expand source code
def get_handler(filespec, fmt, handlers, saveload, what):
    """

    Args:
      filespec:
      fmt:
      handlers:
      saveload:
      what:

    Returns:

    """
    msg = f"Could not determine how to {saveload} {what} for format {fmt} in module gatenlp.serialization.default"
    if fmt:
        handler = handlers.get(fmt)
        if not handler:
            raise Exception(msg)
        return handler
    else:
        if not filespec:  # in case of save_mem
            raise Exception(msg)
        if isinstance(filespec, os.PathLike):
            wf = os.fspath(filespec)
        elif isinstance(filespec, str):
            wf = filespec
        else:
            raise Exception(msg)
        name, ext = os.path.splitext(wf)
        if ext == ".gz":
            ext2 = os.path.splitext(name)[1]
            if ext2:
                ext2 = ext2[1:]
            ext = ext2 + ext
        elif ext:
            ext = ext[1:]
        fmt = EXTENSIONS.get(ext)
        msg = f"Could not determine how to {saveload} {what} for format {fmt} and with " \
              "extension {ext} in module gatenlp.serialization.default"
        if not fmt:
            raise Exception(msg)
        handler = handlers.get(fmt)
        if not handler:
            raise Exception(msg)
        return handler