Module gatenlp.serialization.default_gatexml

Module that implements the GATE XML format loader.

Expand source code
"""
Module that implements the GATE XML format loader.
"""
import sys
from decimal import Decimal
import xml.etree.ElementTree as ET
from gatenlp.document import Document
from gatenlp.utils import init_logger
from gatenlp.urlfileutils import is_url, get_str_from_url, stream_from

logger = init_logger()


class GateXmlLoader:
    """
    Loader for the Java GATE XML format.

    Supports document and annotation feature values of the following types:
    String, int, float, boolean, and the following containers containing recursively any of
    the supported types: Map, Array, List, Set (converted to list).
    """

    # xstream names of the map types which get converted to a Python dict
    _MAP_TYPES = ("linked-hash-map", "hash-map", "map")

    @staticmethod
    def context2txt(context):
        """
        Generate the text which describes where a warning or error occurred.

        Args:
            context: dict with the keys "fname", "node" and "ftype"; unless ftype is "doc" the
                keys "atype", "aset" and "offset" are read as well.

        Returns:
            the description text that gets appended to warning and error messages
        """
        fname = context["fname"]
        node = context["node"]
        nodetxt = "" if node is None else ET.tostring(node, encoding="unicode", method="text")
        if context["ftype"] == "doc":
            return f"document feature {fname},\n    node={nodetxt}"
        atype = context["atype"]
        aset = context["aset"]
        offset = context["offset"]
        return (
            f"annotation feature {fname} for ann type {atype} in set '{aset}' "
            f"at offset {offset},\n    node={nodetxt}"
        )

    @staticmethod
    def warning(txt, options, context):
        """
        Emit a warning or stay silent.

        Args:
            txt: the warning message
            options: options dictionary, the warning is only logged if "show_warnings" is true
            context: context information to add to the message
        """
        # only build the context text (which serializes the XML node) if it is actually shown
        if options["show_warnings"]:
            ctx = GateXmlLoader.context2txt(context)
            logger.warning(f"{txt} for {ctx}")

    @staticmethod
    def error(txt, options, context):
        """
        Handle an error: downgrade it to a warning if errors are ignored, otherwise raise.

        Args:
            txt: the error message
            options: options dictionary, "ignore_errors" decides between warning and exception
            context: context information to add to the message

        Raises:
            Exception: if options["ignore_errors"] is false
        """
        if options["ignore_errors"]:
            GateXmlLoader.warning(txt, options, context)
        else:
            ctx = GateXmlLoader.context2txt(context)
            raise Exception(f"{txt} for {ctx}")

    @staticmethod
    def _xstream_list(node, options, context):
        """Convert every child element of node and return the results as a list."""
        return [GateXmlLoader.xstream2python(el, options, context) for el in node]

    @staticmethod
    def _xstream_map(node, options, context):
        """Convert the children of node, each holding a key and a value element, to a dict."""
        ret = {}
        for entry in node:
            items = list(entry)
            if len(items) != 2:
                GateXmlLoader.error(f"Not exactly two children for map content, but {len(items)}",
                                    options, context)
                # if the error got ignored, return what has been converted so far
                break
            key = GateXmlLoader.xstream2python(items[0], options, context)
            value = GateXmlLoader.xstream2python(items[1], options, context)
            ret[key] = value
        return ret

    @staticmethod
    def xstream2python(node, options, context):
        """
        Convert an xstream node to a python object.

        The node is either a gate.corpora.ObjectWrapper element with a single "value" child,
        a "value" element whose "class" attribute names a container type or date, or a nested
        element whose tag names the type (e.g. <string>, <int>, <list>).

        Args:
            node: the XML element to convert
            options: options dictionary to influence error/warning behavior
            context: context information to add to warnings/error messages

        Returns:
            the converted value, or None if the node could not get converted and errors
            are ignored

        Raises:
            Exception: if the node is not supported and errors are not ignored
        """
        if node.tag == "gate.corpora.ObjectWrapper":
            children = list(node)
            if len(children) != 1:
                GateXmlLoader.error(
                    f"ObjectWrapper node with not exactly one child for {node} but: {len(children)}: {children}",
                    options, context)
                return None
            child = children[0]
            if child.tag != "value":
                GateXmlLoader.error(f"Child of Value tag is not value but {child.tag}", options, context)
                return None
            return GateXmlLoader.xstream2python(child, options, context)

        if node.tag == "value":
            # for a value element, the type is given by the class attribute
            valueclass = node.attrib["class"]
            if valueclass == "set":
                GateXmlLoader.warning("Converting set to list", options, context)
                return GateXmlLoader._xstream_list(node, options, context)
            if valueclass == "list" or valueclass.endswith("-array"):
                return GateXmlLoader._xstream_list(node, options, context)
            if valueclass == "date":
                return node.text
            if valueclass in GateXmlLoader._MAP_TYPES:
                return GateXmlLoader._xstream_map(node, options, context)
            GateXmlLoader.error(f"Unsupported xstreaqm type: {valueclass}", options, context)
            return None

        # this is a node for nested values which are not ObjectWrapper, e.g. <string>:
        # here the type is given by the tag itself
        tag = node.tag
        if tag == "string":
            # an empty element has no text, which represents the empty string
            return "" if node.text is None else node.text
        if tag == "int" or tag == "long":
            return int(node.text)
        if tag == "boolean":
            return node.text == "true"
        if tag == "set":
            GateXmlLoader.warning("Converting set to list", options, context)
            return GateXmlLoader._xstream_list(node, options, context)
        if tag == "list" or tag.endswith("-array"):
            return GateXmlLoader._xstream_list(node, options, context)
        if tag == "date":
            return node.text
        if tag == "big-decimal":
            GateXmlLoader.warning("Converting BigDecimal to float", options, context)
            return float(Decimal(node.text))
        if tag in GateXmlLoader._MAP_TYPES:
            return GateXmlLoader._xstream_map(node, options, context)
        GateXmlLoader.error(f"Unknown type, nested tag: {node.tag}", options, context)
        return None

    @staticmethod
    def value4objectwrapper(xmlstr, options, context):
        """
        Convert some xstream-converted Java values to Python values.

        Args:
            xmlstr: the xstream serialization of the value as encountered in the GATE XML
            options: options dictionary to influence error/warning behavior
            context: context information to add to warnings/error messages

        Returns:
            a python value. The value is None if the value could not get converted but the class is configured
            to ignore unknown types.

        Raises:
            Exception: if a value cannot be converted to Python and the class is configured to not ignore
                unknown types.
        """
        tree = ET.fromstring(xmlstr)
        return GateXmlLoader.xstream2python(tree, options, context)

    @staticmethod
    def _parse_features(feats, options, ftype="Unknown", atype="Unknown", aset="Unknown", offset=None):
        """
        Parse Feature nodes into a feature dictionary.

        Args:
            feats: iterable of Feature nodes, each containing a Name and a Value element
            options: options dictionary to influence error/warning behavior
            ftype: "doc" for document features, anything else for annotation features
            atype: annotation type, only used for warning/error messages
            aset: annotation set name, only used for warning/error messages
            offset: annotation start offset, only used for warning/error messages

        Returns:
            dict mapping feature names to converted values; features without a name or
            without a convertible value are left out

        Raises:
            Exception: if a feature name is not a java.lang.String, or if a value cannot be
                converted and errors are not ignored
        """
        features = {}
        context = dict(
            ftype=ftype,
            atype=atype,
            aset=aset,
            offset=offset,
            fname=None,
            node=None
        )
        for feat in feats:
            name = None
            value = None
            for el in feat:
                context["node"] = el
                context["fname"] = name
                if el.tag == "Name":
                    cls_name = el.get("className")
                    if cls_name != "java.lang.String":
                        # f-string so that a missing className attribute (None) can be reported
                        raise Exception(f"Odd Feature Name type: {cls_name}")
                    name = el.text
                elif el.tag == "Value":
                    cls_name = el.get("className")
                    if cls_name == "java.lang.String":
                        # an empty element has no text: keep the feature with an empty string
                        # instead of dropping it
                        value = "" if el.text is None else el.text
                    elif cls_name == "java.lang.Integer" or cls_name == "java.lang.Long":
                        value = int(el.text)
                    elif cls_name == "java.math.BigDecimal":
                        value = float(el.text)
                    elif cls_name == "java.lang.Boolean":
                        # bool("false") would be True, so compare the text instead
                        value = el.text is not None and el.text.strip().lower() == "true"
                    elif cls_name == "gate.corpora.ObjectWrapper":
                        value = GateXmlLoader.value4objectwrapper(el.text, options, context)
                    else:
                        GateXmlLoader.error(f"Feature with unknown serialization type: {cls_name}",
                                            options, context)
                        value = None
            if name is not None and value is not None:
                features[name] = value
        return features

    @staticmethod
    def load(clazz,
             from_ext=None,
             ignore_errors=True,
             show_warnings=True,
             debug=False):
        """
        Load a document from a GATE XML file or URL.

        Args:
            clazz: not used by this method
            from_ext: (Default value = None) the file path or URL to load from, passed to is_url
                to decide how to read it
            ignore_errors: (default: True) if True, ignore errors and try to load what we can for the document,
                if False, throw an exception.
            show_warnings: (default: True) If an error occurs but ignore_errors is True, or if some conversion
                is carried out, show a warning.
            debug: stored in the options dictionary; no code in this class currently reads it

        Returns:
            Loaded document

        Raises:
            Exception: if the root element is not GateDocument, or for feature conversion problems
                (see ignore_errors)
        """
        options = dict(
            ignore_errors=ignore_errors,
            show_warnings=show_warnings,
            debug=debug
        )

        # the XML is always read as bytes so that the parser handles the declared encoding
        isurl, extstr = is_url(from_ext)
        if isurl:
            infp = stream_from(extstr, encoding=None)
            try:
                root = ET.fromstring(infp.read())
            finally:
                # close the stream even if reading or parsing fails
                infp.close()
        else:
            with open(extstr, "rb") as infp:
                root = ET.fromstring(infp.read())

        # check we do have a GATE document (explicit check: an assert is removed under -O)
        if root.tag != "GateDocument":
            raise Exception(f"Not a GATE XML document, root element is {root.tag}")
        # NOTE: there are docs around where version is less than 3 and
        # also where encoding="windows-1252" !!!
        # so the version attribute is deliberately not checked

        # get the document features
        docfeatures = GateXmlLoader._parse_features(
            root.findall("./GateDocumentFeatures/Feature"), options, ftype="doc")

        # collect the text pieces and record the text offset of each Node element
        textparts = []
        node2offset = {}
        curoff = 0
        for item in root.findall("./TextWithNodes"):
            if item.text:
                textparts.append(item.text)
                curoff += len(item.text)
            for node in item:
                node2offset[node.get("id")] = curoff
                if node.tail:
                    textparts.append(node.tail)
                    curoff += len(node.tail)
        text = "".join(textparts)

        annotation_sets = {}  # map name - set
        for annset in root.findall("./AnnotationSet"):
            # the default annotation set has no (or an empty) Name and is stored as ""
            setname = annset.get("Name") or ""
            annotations = []
            maxannid = 0
            for ann in annset.findall("./Annotation"):
                annid = int(ann.attrib["Id"])
                maxannid = max(maxannid, annid)
                anntype = ann.attrib["Type"]
                startoff = node2offset[ann.attrib["StartNode"]]
                endoff = node2offset[ann.attrib["EndNode"]]
                features = GateXmlLoader._parse_features(
                    ann.findall("./Feature"), options,
                    ftype="ann", atype=anntype, aset=setname, offset=startoff)
                if len(features) == 0:
                    features = None
                annotations.append({
                    "id": annid,
                    "type": anntype,
                    "start": startoff,
                    "end": endoff,
                    "features": features,
                })
            annotation_sets[setname] = {
                "name": setname,
                "annotations": annotations,
                "next_annid": maxannid + 1,
            }

        docmap = {
            "text": text,
            "features": docfeatures,
            "offset_type": "p",
            "annotation_sets": annotation_sets,
        }

        return Document.from_dict(docmap)

Classes

class GateXmlLoader

Loader for the Java GATE XML format. This supports document and annotation feature values for the following types: String, int, float, boolean, and the following containers containing recursively any of the supported types: Map, Array, List, Set (converted to list).

Expand source code
class GateXmlLoader:
    """
    Loader for the Java GATE XML format.

    Supports document and annotation feature values of the following types:
    String, int, float, boolean, and the following containers containing recursively any of
    the supported types: Map, Array, List, Set (converted to list).
    """

    # xstream names of the map types which get converted to a Python dict
    _MAP_TYPES = ("linked-hash-map", "hash-map", "map")

    @staticmethod
    def context2txt(context):
        """
        Generate the text which describes where a warning or error occurred.

        Args:
            context: dict with the keys "fname", "node" and "ftype"; unless ftype is "doc" the
                keys "atype", "aset" and "offset" are read as well.

        Returns:
            the description text that gets appended to warning and error messages
        """
        fname = context["fname"]
        node = context["node"]
        nodetxt = "" if node is None else ET.tostring(node, encoding="unicode", method="text")
        if context["ftype"] == "doc":
            return f"document feature {fname},\n    node={nodetxt}"
        atype = context["atype"]
        aset = context["aset"]
        offset = context["offset"]
        return (
            f"annotation feature {fname} for ann type {atype} in set '{aset}' "
            f"at offset {offset},\n    node={nodetxt}"
        )

    @staticmethod
    def warning(txt, options, context):
        """
        Emit a warning or stay silent.

        Args:
            txt: the warning message
            options: options dictionary, the warning is only logged if "show_warnings" is true
            context: context information to add to the message
        """
        # only build the context text (which serializes the XML node) if it is actually shown
        if options["show_warnings"]:
            ctx = GateXmlLoader.context2txt(context)
            logger.warning(f"{txt} for {ctx}")

    @staticmethod
    def error(txt, options, context):
        """
        Handle an error: downgrade it to a warning if errors are ignored, otherwise raise.

        Args:
            txt: the error message
            options: options dictionary, "ignore_errors" decides between warning and exception
            context: context information to add to the message

        Raises:
            Exception: if options["ignore_errors"] is false
        """
        if options["ignore_errors"]:
            GateXmlLoader.warning(txt, options, context)
        else:
            ctx = GateXmlLoader.context2txt(context)
            raise Exception(f"{txt} for {ctx}")

    @staticmethod
    def _xstream_list(node, options, context):
        """Convert every child element of node and return the results as a list."""
        return [GateXmlLoader.xstream2python(el, options, context) for el in node]

    @staticmethod
    def _xstream_map(node, options, context):
        """Convert the children of node, each holding a key and a value element, to a dict."""
        ret = {}
        for entry in node:
            items = list(entry)
            if len(items) != 2:
                GateXmlLoader.error(f"Not exactly two children for map content, but {len(items)}",
                                    options, context)
                # if the error got ignored, return what has been converted so far
                break
            key = GateXmlLoader.xstream2python(items[0], options, context)
            value = GateXmlLoader.xstream2python(items[1], options, context)
            ret[key] = value
        return ret

    @staticmethod
    def xstream2python(node, options, context):
        """
        Convert an xstream node to a python object.

        The node is either a gate.corpora.ObjectWrapper element with a single "value" child,
        a "value" element whose "class" attribute names a container type or date, or a nested
        element whose tag names the type (e.g. <string>, <int>, <list>).

        Args:
            node: the XML element to convert
            options: options dictionary to influence error/warning behavior
            context: context information to add to warnings/error messages

        Returns:
            the converted value, or None if the node could not get converted and errors
            are ignored

        Raises:
            Exception: if the node is not supported and errors are not ignored
        """
        if node.tag == "gate.corpora.ObjectWrapper":
            children = list(node)
            if len(children) != 1:
                GateXmlLoader.error(
                    f"ObjectWrapper node with not exactly one child for {node} but: {len(children)}: {children}",
                    options, context)
                return None
            child = children[0]
            if child.tag != "value":
                GateXmlLoader.error(f"Child of Value tag is not value but {child.tag}", options, context)
                return None
            return GateXmlLoader.xstream2python(child, options, context)

        if node.tag == "value":
            # for a value element, the type is given by the class attribute
            valueclass = node.attrib["class"]
            if valueclass == "set":
                GateXmlLoader.warning("Converting set to list", options, context)
                return GateXmlLoader._xstream_list(node, options, context)
            if valueclass == "list" or valueclass.endswith("-array"):
                return GateXmlLoader._xstream_list(node, options, context)
            if valueclass == "date":
                return node.text
            if valueclass in GateXmlLoader._MAP_TYPES:
                return GateXmlLoader._xstream_map(node, options, context)
            GateXmlLoader.error(f"Unsupported xstreaqm type: {valueclass}", options, context)
            return None

        # this is a node for nested values which are not ObjectWrapper, e.g. <string>:
        # here the type is given by the tag itself
        tag = node.tag
        if tag == "string":
            # an empty element has no text, which represents the empty string
            return "" if node.text is None else node.text
        if tag == "int" or tag == "long":
            return int(node.text)
        if tag == "boolean":
            return node.text == "true"
        if tag == "set":
            GateXmlLoader.warning("Converting set to list", options, context)
            return GateXmlLoader._xstream_list(node, options, context)
        if tag == "list" or tag.endswith("-array"):
            return GateXmlLoader._xstream_list(node, options, context)
        if tag == "date":
            return node.text
        if tag == "big-decimal":
            GateXmlLoader.warning("Converting BigDecimal to float", options, context)
            return float(Decimal(node.text))
        if tag in GateXmlLoader._MAP_TYPES:
            return GateXmlLoader._xstream_map(node, options, context)
        GateXmlLoader.error(f"Unknown type, nested tag: {node.tag}", options, context)
        return None

    @staticmethod
    def value4objectwrapper(xmlstr, options, context):
        """
        Convert some xstream-converted Java values to Python values.

        Args:
            xmlstr: the xstream serialization of the value as encountered in the GATE XML
            options: options dictionary to influence error/warning behavior
            context: context information to add to warnings/error messages

        Returns:
            a python value. The value is None if the value could not get converted but the class is configured
            to ignore unknown types.

        Raises:
            Exception: if a value cannot be converted to Python and the class is configured to not ignore
                unknown types.
        """
        tree = ET.fromstring(xmlstr)
        return GateXmlLoader.xstream2python(tree, options, context)

    @staticmethod
    def _parse_features(feats, options, ftype="Unknown", atype="Unknown", aset="Unknown", offset=None):
        """
        Parse Feature nodes into a feature dictionary.

        Args:
            feats: iterable of Feature nodes, each containing a Name and a Value element
            options: options dictionary to influence error/warning behavior
            ftype: "doc" for document features, anything else for annotation features
            atype: annotation type, only used for warning/error messages
            aset: annotation set name, only used for warning/error messages
            offset: annotation start offset, only used for warning/error messages

        Returns:
            dict mapping feature names to converted values; features without a name or
            without a convertible value are left out

        Raises:
            Exception: if a feature name is not a java.lang.String, or if a value cannot be
                converted and errors are not ignored
        """
        features = {}
        context = dict(
            ftype=ftype,
            atype=atype,
            aset=aset,
            offset=offset,
            fname=None,
            node=None
        )
        for feat in feats:
            name = None
            value = None
            for el in feat:
                context["node"] = el
                context["fname"] = name
                if el.tag == "Name":
                    cls_name = el.get("className")
                    if cls_name != "java.lang.String":
                        # f-string so that a missing className attribute (None) can be reported
                        raise Exception(f"Odd Feature Name type: {cls_name}")
                    name = el.text
                elif el.tag == "Value":
                    cls_name = el.get("className")
                    if cls_name == "java.lang.String":
                        # an empty element has no text: keep the feature with an empty string
                        # instead of dropping it
                        value = "" if el.text is None else el.text
                    elif cls_name == "java.lang.Integer" or cls_name == "java.lang.Long":
                        value = int(el.text)
                    elif cls_name == "java.math.BigDecimal":
                        value = float(el.text)
                    elif cls_name == "java.lang.Boolean":
                        # bool("false") would be True, so compare the text instead
                        value = el.text is not None and el.text.strip().lower() == "true"
                    elif cls_name == "gate.corpora.ObjectWrapper":
                        value = GateXmlLoader.value4objectwrapper(el.text, options, context)
                    else:
                        GateXmlLoader.error(f"Feature with unknown serialization type: {cls_name}",
                                            options, context)
                        value = None
            if name is not None and value is not None:
                features[name] = value
        return features

    @staticmethod
    def load(clazz,
             from_ext=None,
             ignore_errors=True,
             show_warnings=True,
             debug=False):
        """
        Load a document from a GATE XML file or URL.

        Args:
            clazz: not used by this method
            from_ext: (Default value = None) the file path or URL to load from, passed to is_url
                to decide how to read it
            ignore_errors: (default: True) if True, ignore errors and try to load what we can for the document,
                if False, throw an exception.
            show_warnings: (default: True) If an error occurs but ignore_errors is True, or if some conversion
                is carried out, show a warning.
            debug: stored in the options dictionary; no code in this class currently reads it

        Returns:
            Loaded document

        Raises:
            Exception: if the root element is not GateDocument, or for feature conversion problems
                (see ignore_errors)
        """
        options = dict(
            ignore_errors=ignore_errors,
            show_warnings=show_warnings,
            debug=debug
        )

        # the XML is always read as bytes so that the parser handles the declared encoding
        isurl, extstr = is_url(from_ext)
        if isurl:
            infp = stream_from(extstr, encoding=None)
            try:
                root = ET.fromstring(infp.read())
            finally:
                # close the stream even if reading or parsing fails
                infp.close()
        else:
            with open(extstr, "rb") as infp:
                root = ET.fromstring(infp.read())

        # check we do have a GATE document (explicit check: an assert is removed under -O)
        if root.tag != "GateDocument":
            raise Exception(f"Not a GATE XML document, root element is {root.tag}")
        # NOTE: there are docs around where version is less than 3 and
        # also where encoding="windows-1252" !!!
        # so the version attribute is deliberately not checked

        # get the document features
        docfeatures = GateXmlLoader._parse_features(
            root.findall("./GateDocumentFeatures/Feature"), options, ftype="doc")

        # collect the text pieces and record the text offset of each Node element
        textparts = []
        node2offset = {}
        curoff = 0
        for item in root.findall("./TextWithNodes"):
            if item.text:
                textparts.append(item.text)
                curoff += len(item.text)
            for node in item:
                node2offset[node.get("id")] = curoff
                if node.tail:
                    textparts.append(node.tail)
                    curoff += len(node.tail)
        text = "".join(textparts)

        annotation_sets = {}  # map name - set
        for annset in root.findall("./AnnotationSet"):
            # the default annotation set has no (or an empty) Name and is stored as ""
            setname = annset.get("Name") or ""
            annotations = []
            maxannid = 0
            for ann in annset.findall("./Annotation"):
                annid = int(ann.attrib["Id"])
                maxannid = max(maxannid, annid)
                anntype = ann.attrib["Type"]
                startoff = node2offset[ann.attrib["StartNode"]]
                endoff = node2offset[ann.attrib["EndNode"]]
                features = GateXmlLoader._parse_features(
                    ann.findall("./Feature"), options,
                    ftype="ann", atype=anntype, aset=setname, offset=startoff)
                if len(features) == 0:
                    features = None
                annotations.append({
                    "id": annid,
                    "type": anntype,
                    "start": startoff,
                    "end": endoff,
                    "features": features,
                })
            annotation_sets[setname] = {
                "name": setname,
                "annotations": annotations,
                "next_annid": maxannid + 1,
            }

        docmap = {
            "text": text,
            "features": docfeatures,
            "offset_type": "p",
            "annotation_sets": annotation_sets,
        }

        return Document.from_dict(docmap)

Static methods

def context2txt(context)

Generate text from context info

Expand source code
@staticmethod
def context2txt(context):
    """Generate text from context info"""
    fname = context["fname"]
    node = context["node"]
    if node is None:
        nodetxt = ""
    else:
        nodetxt = ET.tostring(node, encoding='unicode', method="text")
    if context["ftype"] == "doc":
        return f"document feature {fname},\n    node={nodetxt}"
    else:
        atype = context["atype"]
        aset = context["aset"]
        offset = context["offset"]
        return f"annotation feature {fname} for ann type {atype} in set '{aset}' at offset {offset},\n    node={nodetxt}"
def error(txt, options, context)

Handle an error

Expand source code
@staticmethod
def error(txt, options, context):
    """
    Handle an error: raise it, or downgrade it to a warning if errors are ignored.

    Args:
        txt: the error message
        options: options dictionary, "ignore_errors" decides between warning and exception
        context: context information to add to the message

    Raises:
        Exception: if options["ignore_errors"] is false
    """
    if not options["ignore_errors"]:
        where = GateXmlLoader.context2txt(context)
        raise Exception(f"{txt} for {where}")
    GateXmlLoader.warning(txt, options, context)
def load(clazz, from_ext=None, ignore_errors=True, show_warnings=True, debug=False)

Args

clazz:
from_ext
(Default value = None)
ignore_errors
(default: True) if True, ignore errors and try to load what we can for the document; if False, throw an exception.
show_warnings
(default: True) If an error occurs but ignore_errors is True, or if some conversion is carried out, show a warning.
debug
if True, output detailed information about unsupported elements in the input to stderr

Returns

Loaded document

Expand source code
@staticmethod
def load(clazz,
         from_ext=None,
         ignore_errors=True,
         show_warnings=True,
         debug=False):
    """
    Load a document from a GATE XML file or URL.

    Args:
        clazz: not used by this method
        from_ext: (Default value = None) the file path or URL to load from, passed to is_url
            to decide how to read it
        ignore_errors: (default: True) if True, ignore errors and try to load what we can for the document,
            if False, throw an exception.
        show_warnings: (default: True) If an error occurs but ignore_errors is True, or if some conversion
            is carried out, show a warning.
        debug: stored in the options dictionary; this method does not read it

    Returns:
        Loaded document

    Raises:
        Exception: if the root element is not GateDocument, or for feature conversion problems
            (see ignore_errors)
    """
    options = dict(
        ignore_errors=ignore_errors,
        show_warnings=show_warnings,
        debug=debug
    )

    # the XML is always read as bytes so that the parser handles the declared encoding
    isurl, extstr = is_url(from_ext)
    if isurl:
        infp = stream_from(extstr, encoding=None)
        try:
            root = ET.fromstring(infp.read())
        finally:
            # close the stream even if reading or parsing fails
            infp.close()
    else:
        with open(extstr, "rb") as infp:
            root = ET.fromstring(infp.read())

    # check we do have a GATE document (explicit check: an assert is removed under -O)
    if root.tag != "GateDocument":
        raise Exception(f"Not a GATE XML document, root element is {root.tag}")
    # NOTE: there are docs around where version is less than 3 and
    # also where encoding="windows-1252" !!!
    # so the version attribute is deliberately not checked

    def parsefeatures(feats, ftype="Unknown", atype="Unknown", aset="Unknown", offset=None):
        """
        Parse Feature nodes into a feature dictionary.

        Args:
          feats: iterable of Feature nodes
          ftype: "doc" for document features, anything else for annotation features
          atype, aset, offset: annotation type, set name and start offset, only used for
            warning/error messages

        Returns:
            The features; features without a name or without a convertible value are left out
        """
        features = {}
        context = dict(
            ftype=ftype,
            atype=atype,
            aset=aset,
            offset=offset,
            fname=None,
            node=None
        )
        for feat in feats:
            name = None
            value = None
            for el in feat:
                context["node"] = el
                context["fname"] = name
                if el.tag == "Name":
                    cls_name = el.get("className")
                    if cls_name != "java.lang.String":
                        # f-string so that a missing className attribute (None) can be reported
                        raise Exception(f"Odd Feature Name type: {cls_name}")
                    name = el.text
                elif el.tag == "Value":
                    cls_name = el.get("className")
                    if cls_name == "java.lang.String":
                        # an empty element has no text: keep the feature with an empty string
                        # instead of dropping it
                        value = "" if el.text is None else el.text
                    elif cls_name == "java.lang.Integer" or cls_name == "java.lang.Long":
                        value = int(el.text)
                    elif cls_name == "java.math.BigDecimal":
                        value = float(el.text)
                    elif cls_name == "java.lang.Boolean":
                        # bool("false") would be True, so compare the text instead
                        value = el.text is not None and el.text.strip().lower() == "true"
                    elif cls_name == "gate.corpora.ObjectWrapper":
                        value = GateXmlLoader.value4objectwrapper(el.text, options, context)
                    else:
                        GateXmlLoader.error(f"Feature with unknown serialization type: {cls_name}",
                                            options, context)
                        value = None
            if name is not None and value is not None:
                features[name] = value
        return features

    # get the document features
    docfeatures = parsefeatures(root.findall("./GateDocumentFeatures/Feature"), ftype="doc")

    # collect the text pieces and record the text offset of each Node element
    textparts = []
    node2offset = {}
    curoff = 0
    for item in root.findall("./TextWithNodes"):
        if item.text:
            textparts.append(item.text)
            curoff += len(item.text)
        for node in item:
            node2offset[node.get("id")] = curoff
            if node.tail:
                textparts.append(node.tail)
                curoff += len(node.tail)
    text = "".join(textparts)

    annotation_sets = {}  # map name - set
    for annset in root.findall("./AnnotationSet"):
        # the default annotation set has no (or an empty) Name and is stored as ""
        setname = annset.get("Name") or ""
        annotations = []
        maxannid = 0
        for ann in annset.findall("./Annotation"):
            annid = int(ann.attrib["Id"])
            maxannid = max(maxannid, annid)
            anntype = ann.attrib["Type"]
            startoff = node2offset[ann.attrib["StartNode"]]
            endoff = node2offset[ann.attrib["EndNode"]]
            features = parsefeatures(
                ann.findall("./Feature"), ftype="ann", atype=anntype, aset=setname, offset=startoff)
            if len(features) == 0:
                features = None
            annotations.append({
                "id": annid,
                "type": anntype,
                "start": startoff,
                "end": endoff,
                "features": features,
            })
        annotation_sets[setname] = {
            "name": setname,
            "annotations": annotations,
            "next_annid": maxannid + 1,
        }

    docmap = {
        "text": text,
        "features": docfeatures,
        "offset_type": "p",
        "annotation_sets": annotation_sets,
    }

    return Document.from_dict(docmap)
def value4objectwrapper(xmlstr, options, context)

Convert some xstream-converted Java values to Python values.

Args

xmlstr
the xstream serialization of the value as encountered in the GATE XML
options
options dictionary to influence error/warning behavior
context
context information to add to warnings/error messages

Returns

a python value. The value is None if the value could not get converted but the class is configured to ignore unknown types.

Throws

Exception if a value cannot be converted to Python and the class is configured to not ignore unknown types.

Expand source code
@staticmethod
def value4objectwrapper(xmlstr, options, context):
    """
    Turn the xstream serialization of a Java value into a Python value.

    Args:
        xmlstr: the xstream serialization of the value as encountered in the GATE XML
        options: options dictionary to influence error/warning behavior
        context: context information to add to warnings/error messages

    Returns:
        Whatever GateXmlLoader.xstream2python produces for the root element of xmlstr.
        This is None if the value could not get converted but the options are set to
        ignore errors.

    Throws:
        Exception if a value cannot be converted to Python and the options are not set
        to ignore errors.
    """
    # Parse the serialized value and hand its root element to the recursive converter.
    return GateXmlLoader.xstream2python(ET.fromstring(xmlstr), options, context)
def warning(txt, options, context)

Emit a warning or stay silent

Expand source code
@staticmethod
def warning(txt, options, context):
    """Emit a warning or stay silent"""
    ctx = GateXmlLoader.context2txt(context)
    if options["show_warnings"]:
        logger.warning(f"{txt} for {ctx}")
def xstream2python(node, options, context)

Convert an XStream node to a Python object. The node should only have a single child. This is either a single Value.className=gate.corpora.ObjectWrapper node with a single value child, or a list of value nodes to initialize a container, or something not supported.

Expand source code
@staticmethod
def xstream2python(node, options, context):
    """
    Convert a xstream node to a python object. The node should only have a single child.
    This is either a single Value.className=gate.corpora.ObjectWrapper node with a single value child, or
    a list of value nodes to initialize a container, or something not supported.
    """
    # if options["debug"]:
    #     print(f"DEBUG: Got element ({node.tag}):", ET.tostring(node))
    if node.tag == "gate.corpora.ObjectWrapper":
        children = list(node)
        if len(children) != 1:
            GateXmlLoader.error(
                f"ObjectWrapper node with not exactly one child for {node} but: {len(children)}: {children}",
                options, context)
            return None
        child = children[0]
        if child.tag != "value":
            GateXmlLoader.error(f"Child of Value tag is not value but {child.tag}", options, context)
            return None
        return GateXmlLoader.xstream2python(child, options, context)
    elif node.tag == "value":
        valueclass = node.attrib["class"]
        ret = None
        if valueclass == "set":
            GateXmlLoader.warning(f"Converting set to list", options, context)
            ret = []
            for el in node:
                ret.append(GateXmlLoader.xstream2python(el, options, context))
        elif valueclass == "list" or valueclass.endswith("-array"):
            ret = []
            for el in node:
                ret.append(GateXmlLoader.xstream2python(el, options, context))
        elif valueclass == "date":
            ret = node.text
        elif valueclass == "linked-hash-map" or valueclass == "hash-map" or valueclass == "map":
            ret = {}
            for el in node:
                items = list(el)
                if len(items) != 2:
                    GateXmlLoader.error(f"Not exactly two children for map content, but {len(items)}",
                                        options, context)
                    break
                else:
                    key = GateXmlLoader.xstream2python(items[0], options, context)
                    value = GateXmlLoader.xstream2python(items[1], options, context)
                    ret[key] = value
        else:
            GateXmlLoader.error(f"Unsupported xstreaqm type: {valueclass}", options, context)
            ret = None
    else:
        # this is a node for nested values which are not ObjectWrapper, e.g. <string>
        ret = None
        if node.tag == "string":
            ret = node.text
            if ret is None:
                ret = ""
        elif node.tag == "int":
            ret = int(node.text)
        elif node.tag == "long":
            ret = int(node.text)
        elif node.tag == "boolean":
            ret = (node.text == "true")
        elif node.tag == "set":
            GateXmlLoader.warning(f"Converting set to list", options, context)
            ret = []
            for el in node:
                ret.append(GateXmlLoader.xstream2python(el, options, context))
        elif node.tag == "list" or node.tag.endswith("-array"):
            ret = []
            for el in node:
                ret.append(GateXmlLoader.xstream2python(el, options, context))
        elif node.tag == "date":
            ret = node.text
        elif node.tag == "big-decimal":
            GateXmlLoader.warning(f"Converting BigDecimal to float", options, context)
            ret = float(Decimal(node.text))
        elif node.tag == "linked-hash-map" or node.tag == "hash-map" or node.tag == "map":
            ret = {}
            for el in node:
                items = list(el)
                if len(items) != 2:
                    GateXmlLoader.error(f"Not exactly two children for map content, but {len(items)}",
                                        options, context)
                    break
                else:
                    key = GateXmlLoader.xstream2python(items[0], options, context)
                    value = GateXmlLoader.xstream2python(items[1], options, context)
                    ret[key] = value
        else:
            GateXmlLoader.error(f"Unknown type, nested tag: {node.tag}", options, context)
            ret = None
    return ret