Module gatenlp.document
Module that implements the Document class for representing gatenlp documents with features and annotation sets.
Expand source code
"""
Module that implements the Document class for representing gatenlp documents with
features and annotation sets.
"""
from typing import Callable, Union, List
import logging
import importlib
from typing import Iterable
import copy as lib_copy
from gatenlp.annotation_set import AnnotationSet
from gatenlp.annotation import Annotation
from gatenlp.offsetmapper import OffsetMapper, OFFSET_TYPE_PYTHON, OFFSET_TYPE_JAVA
from gatenlp.features import Features
from gatenlp.utils import in_notebook, in_colab
from gatenlp.changelog import ChangeLog
from gatenlp.changelog_consts import (
ACTION_ADD_ANN,
ACTION_ADD_ANNSET,
ACTION_CLEAR_ANNS,
ADDANN_UPDATE_FEATURES,
ACTION_CLEAR_ANN_FEATURES,
ACTION_CLEAR_DOC_FEATURES,
ACTION_DEL_ANN,
ACTION_DEL_ANN_FEATURE,
ACTION_DEL_DOC_FEATURE,
ACTION_SET_ANN_FEATURE,
ACTION_SET_DOC_FEATURE,
ADDANN_ADD_NEW_FEATURES,
ADDANN_ADD_WITH_NEW_ID,
ADDANN_IGNORE,
ADDANN_REPLACE_ANNOTATION,
ADDANN_REPLACE_FEATURES,
)
# Module-level logging setup, executed once at import time.
# basicConfig() installs a default handler on the root logger if the root logger has
# no handlers yet; it is a no-op otherwise.
logging.basicConfig()
# Logger named after this module, used for all log output of this module.
logger = logging.getLogger(__name__)
# Emit INFO and above for this module's logger.
logger.setLevel(logging.INFO)
class Document:
    """
    Represent a GATE document. This is different from the original Java GATE representation in
    several ways:

    * the text cannot be re-assigned once it has been set: assigning to the `text` property of a
      document which already has text raises an exception. The only way to change existing text
      is the `edit` method, which also passes the edits on to all annotation sets.
    * as a feature bearer, all the methods to set, get and manipulate features are part of this
      class, there is no separate "FeatureMap" to store them
    * does not support listener callbacks
    * there is no separate abstraction for "content", the only content possible is text which
      is a unicode string that can be accessed with the `text` property
    * Spans of text can be directly accessed using doc[from:to]
    * Features may only have string keys and values which can be json-serialised
    * Annotation offsets by default are number of Unicode code points, this is different from Java
      where the offsets are UTF-16 Unicode code units
    * Offsets of all annotations can be changed from/to Java (from python index of unicode
      codepoint to Java index of UTF-16 code unit and back)
    * No part of the document has to be present, not even the text (this allows saving just
      the annotations separately from the text)

    Args:
        text: the text of the document. The text can be None to indicate that no initial text
            should be set. Once the text has been set for a document, it cannot be
            re-assigned.
        features: the initial document features to set, a sequence of key/value tuples
        changelog: a ChangeLog instance to use to log changes.
    """
def __init__(self, text: str = None, features=None, changelog: ChangeLog = None):
    """
    Create a new document.

    Args:
        text: the document text or None for a document without text.
        features: initial document features, handed on to Features.
        changelog: the ChangeLog to record changes in, or None to not record changes.
    """
    # Both optional arguments are type-checked only when actually given.
    assert text is None or isinstance(text, str)
    assert changelog is None or isinstance(changelog, ChangeLog)
    self._text = text
    self._name = ""
    # a new document always starts out with Python offsets
    self.offset_type = OFFSET_TYPE_PYTHON
    self._annotation_sets = {}
    # The changelog is stored before the features get created because the feature
    # change callback reads it.
    self._changelog = changelog
    self._features = Features(features, _change_logger=self._log_feature_change)
@property
def name(self):
""" """
return self._name
@name.setter
def name(self, val):
"""
Args:
val:
Returns:
"""
if val is None:
val = ""
if not isinstance(val, str):
raise Exception("Name must be a string")
self._name = val
if self._changelog is not None:
ch = {"command": "name:set"}
ch["name"] = val
self._changelog.append(ch)
def _ensure_type_python(self) -> None:
    """
    Make sure the document currently uses Python offsets.

    Raises:
        Exception: if the offset type of the document is not OFFSET_TYPE_PYTHON.
    """
    if self.offset_type != OFFSET_TYPE_PYTHON:
        # The message names to_offset_type, the conversion method of this class.
        raise Exception(
            "Document cannot be used if it is not type PYTHON, "
            + "use to_offset_type(OFFSET_TYPE_PYTHON) first"
        )
def _fixup_annotations(self, method: Callable) -> None:
"""
Args:
method: Callable:
Returns:
"""
annset_names = self._annotation_sets.keys()
for annset_name in annset_names:
annset = self._annotation_sets[annset_name]
if annset._annotations is not None:
for ann in annset._annotations.values():
ann._start = method(ann._start)
ann._end = method(ann._end)
def to_offset_type(self, offsettype: str) -> Union[OffsetMapper, None]:
    """
    Convert the offsets of all annotations of this document to the requested offset type.

    Nothing happens if the document already uses the requested type.
    NOTE: a ChangeLog of the document is NOT converted!

    Args:
        offsettype: the target type, either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON

    Returns:
        the offset mapper used for the conversion, or None if nothing had to be converted

    Raises:
        Exception: if the current/requested types are not a python/java combination.
    """
    current = self.offset_type
    if offsettype == current:
        return None
    python_to_java = offsettype == OFFSET_TYPE_JAVA and current == OFFSET_TYPE_PYTHON
    java_to_python = offsettype == OFFSET_TYPE_PYTHON and current == OFFSET_TYPE_JAVA
    if not (python_to_java or java_to_python):
        raise Exception("Odd offset type")
    mapper = OffsetMapper(self._text)
    if python_to_java:
        self._fixup_annotations(mapper.convert_to_java)
        self.offset_type = OFFSET_TYPE_JAVA
    else:
        self._fixup_annotations(mapper.convert_to_python)
        self.offset_type = OFFSET_TYPE_PYTHON
    return mapper
def apply_changes(self, changes, handle_existing_anns=ADDANN_ADD_WITH_NEW_ID):
    """
    Apply changes from a ChangeLog to this document. `changes` can be a ChangeLog instance,
    a sequence of change objects (dicts) as stored in a ChangeLog instance, or a single
    change object.

    The document is modified in-place.

    Args:
        changes: one or more changes
        handle_existing_anns: what to do if the change from the changelog tries to
            add an annotation with an annotation id that already exists in the target set.
            One of the ADDANN_* constants, any other value leaves the existing annotation
            untouched. (Default value = ADDANN_ADD_WITH_NEW_ID)

    Raises:
        Exception: if a change has no "command" field or the command is not known.
        AssertionError: if a change lacks a field which its command requires.
    """
    if isinstance(changes, dict):
        changes = [changes]
    elif isinstance(changes, ChangeLog):
        changes = changes.changes
    for change in changes:
        cmd = change.get("command")
        fname = change.get("feature")
        fvalue = change.get("value")
        features = change.get("features")
        sname = change.get("set")
        annid = change.get("id")
        if cmd is None:
            raise Exception("Change without field 'command'")
        if cmd == ACTION_ADD_ANNSET:
            assert sname is not None
            self.annset(sname)
        elif cmd == ACTION_ADD_ANN:
            assert sname is not None
            assert annid is not None
            anns = self.annset(sname)
            ann = anns.get(annid)
            start = change.get("start")
            end = change.get("end")
            anntype = change.get("type")
            if ann is None:
                anns.add(start, end, anntype, annid=annid, features=features)
            elif handle_existing_anns == ADDANN_IGNORE:
                pass
            elif handle_existing_anns == ADDANN_ADD_WITH_NEW_ID:
                # let the set choose a fresh id, but keep the features of the change
                anns.add(start, end, anntype, features=features)
            elif handle_existing_anns == ADDANN_REPLACE_ANNOTATION:
                anns.remove(annid)
                # id and features are passed by keyword, exactly as for a new annotation
                anns.add(start, end, anntype, annid=annid, features=features)
            elif handle_existing_anns == ADDANN_UPDATE_FEATURES:
                # a change without features updates nothing
                ann.features.update(features or {})
            elif handle_existing_anns == ADDANN_REPLACE_FEATURES:
                ann.features.clear()
                ann.features.update(features or {})
            elif handle_existing_anns == ADDANN_ADD_NEW_FEATURES:
                fns = ann.features.names()
                for f, val in (features or {}).items():
                    if f not in fns:
                        ann.features[f] = val
        elif cmd == ACTION_CLEAR_ANNS:
            assert sname is not None
            anns = self.annset(sname)
            anns.clear()
        elif cmd == ACTION_CLEAR_ANN_FEATURES:
            assert sname is not None
            assert annid is not None
            anns = self.annset(sname)
            ann = anns.get(annid)
            # a missing annotation is ignored, could happen with a detached annotation
            if ann is not None:
                ann.features.clear()
        elif cmd == ACTION_CLEAR_DOC_FEATURES:
            self.features.clear()
        elif cmd == ACTION_SET_ANN_FEATURE:
            assert fname is not None
            assert sname is not None
            assert annid is not None
            # NOTE(review): unlike the clear/delete commands a missing annotation is not
            # ignored here, the feature assignment fails if get() finds no annotation.
            ann = self.annset(sname).get(annid)
            ann.features[fname] = fvalue
        elif cmd == ACTION_DEL_ANN_FEATURE:
            assert sname is not None
            assert annid is not None
            anns = self.annset(sname)
            ann = anns.get(annid)
            # a missing annotation is ignored, could happen with a detached annotation
            if ann is not None and fname is not None:
                ann.features.pop(fname, None)
        elif cmd == ACTION_DEL_DOC_FEATURE:
            assert fname is not None
            self.features.pop(fname, None)
        elif cmd == ACTION_DEL_ANN:
            assert sname is not None
            assert annid is not None
            anns = self.annset(sname)
            anns.remove(annid)
        elif cmd == ACTION_SET_DOC_FEATURE:
            assert fname is not None
            self.features[fname] = fvalue
        else:
            raise Exception(f"Unknown ChangeLog action: {cmd}")
@property
def features(self):
"""Accesses the features as a FeatureViewer instance. Changes made on this object are
reflected in the document and recorded in the change log, if there is one.
:return: A FeatureViewer view of the document features.
Args:
Returns:
"""
return self._features
@property
def changelog(self):
"""Get the ChangeLog or None if no ChangeLog has been set.
:return: the changelog
Args:
Returns:
"""
return self._changelog
@changelog.setter
def changelog(self, chlog):
"""Make the document use the given changelog to record all changes
from this moment on.
Args:
chlog: the new changelog to use or None to not use any
Returns:
the changelog used previously or None
"""
self._changelog = chlog
@property
def text(self) -> str:
    """
    The text of the document. For a partial document, the text may be None.

    Returns:
        the text of the document

    Raises:
        Exception: if the document does not currently use Python offsets.
    """
    self._ensure_type_python()
    return self._text
@text.setter
def text(self, value: str) -> None:
"""
Set the text of the document. This is only possible as long as it has not been set
yet, after that, the text is immutable.
IMPORTANT: it is possible to add arbitrary annotations to a document which does not have any
text. This is meant to allow handling of annotation-only representations.
However, if the text is set after annotations have been added, annotation offsets are not
checked and it is possible to thus create an invalid document where annotations refer to
text ranges that do not exist!
Args:
value: the text for the document
value: str:
Returns:
"""
if self._text is None:
self._text = value
else:
raise NotImplementedError("Text cannot be modified")
def _log_feature_change(
self, command: str, feature: str = None, value=None
) -> None:
"""
Args:
command: str:
feature: str: (Default value = None)
value: (Default value = None)
Returns:
"""
if self._changelog is None:
return
command = "doc-" + command
ch = {"command": command}
if command == "doc-feature:set":
ch["feature"] = feature
ch["value"] = value
self._changelog.append(ch)
def __len__(self) -> int:
    """
    Return the length of the document text, 0 if the document has no text.

    Note: the document must use Python offsets, otherwise an exception is raised.

    Returns:
        the length of the document text
    """
    self._ensure_type_python()
    return 0 if self._text is None else len(self._text)
def __getitem__(self, span) -> str:
    """
    Get the text for the given span.

    Args:
        span: an annotation, an annotation set, any object with `start` and `end`
            attributes, or anything a string can be indexed with directly (a single
            offset or an offset range of the form from:to).

    Returns:
        the text of the span
    """
    self._ensure_type_python()
    if isinstance(span, Annotation):
        key = slice(span._start, span._end)
    elif isinstance(span, AnnotationSet):
        # for a set, start and end are methods
        key = slice(span.start(), span.end())
    elif hasattr(span, "start") and hasattr(span, "end"):
        key = slice(span.start, span.end)
    else:
        # plain index or slice object: hand it to the string unchanged
        key = span
    return self.text[key]
def annset(self, name: str = "") -> AnnotationSet:
    """
    Get the named annotation set, if name is not given or the empty string,
    the default annotation set.
    If the annotation set does not already exist, it is created and the creation is
    recorded in the changelog, if there is one.

    Args:
        name: the annotation set name, the empty string is used for the
            "default annotation set". (Default value = "")

    Returns:
        the specified annotation set.

    Raises:
        Exception: if the document does not currently use Python offsets.
    """
    self._ensure_type_python()
    if name in self._annotation_sets:
        return self._annotation_sets[name]
    annset = AnnotationSet(owner_doc=self, name=name)
    self._annotation_sets[name] = annset
    # Compare against None like the other logging code of this class: a truthiness test
    # would skip logging for a changelog object which evaluates to False while empty.
    if self._changelog is not None:
        self._changelog.append({"command": "annotations:add", "set": name})
    return annset
def annset_names(self) -> List[str]:
    """
    Get the names of all annotation sets of the document.

    Returns:
        a new list with the annotation set names

    Raises:
        Exception: if the document does not currently use Python offsets.
    """
    self._ensure_type_python()
    return list(self._annotation_sets)
def remove_annset(self, name: str):
"""Completely remove the annotation set.
Args:
name: name of the annotation set to remove
name: str:
Returns:
"""
if name not in self._annotation_sets:
raise Exception(f"AnnotationSet with name {name} does not exist")
del self._annotation_sets[name]
if self._changelog:
self._changelog.append({"command": "annotations:remove", "set": name})
def anns(self, annspec):
    """
    Return a detached annotation set with all annotations which match the annotation
    specification. The set is built by AnnotationSet.create_from from the annotations
    which yield_anns produces for the specification.

    Args:
        annspec: either a single string which is interpreted as an annotation set name, or a
            list where each element is either a string (annotation set name) or a tuple. If an
            element is a tuple, the first element of the tuple must be the annotation set name
            and the second element either a type name or a list of type names.

    Returns:
        a set with all the annotations matching the annotation specification
    """
    matching = self.yield_anns(annspec)
    return AnnotationSet.create_from(matching)
def yield_anns(self, annspec):
"""
Yield all annotations which match the annotation specification.
The order of the annotations is unespecified.
Args:
annspec: either a single string which is interpreted as an annotation set name, or a list where
each element is either a string (annotation set name) or a tuple. If an element is a tuple, the
first element of the tuple must be the annotation set name and the second element either a type
name or a list of type names.
Yields:
all the annotations matching the annotation specification
"""
if isinstance(annspec, str):
tmpset = self._annotation_sets.get(annspec)
if tmpset is not None:
for ann in tmpset._annotations.values():
yield ann
return
for spec in annspec:
if isinstance(spec, str):
tmpset = self._annotation_sets.get(spec)
if tmpset is not None:
for ann in tmpset._annotations.values():
yield ann
else:
setname, types = spec
if isinstance(types, str):
types = [types]
tmpset = self._annotation_sets.get(setname)
if tmpset is not None:
for ann in tmpset._annotations.values():
if ann.type in types:
yield ann
def __repr__(self) -> str:
    """
    String representation of the document, showing all content: the text, the features
    and the representation of all annotation sets.

    Returns:
        string representation
    """
    text = self.text
    features = self._features
    annsets = repr(self._annotation_sets)
    return "Document({},features={},anns={})".format(text, features, annsets)
def __str__(self) -> str:
    """
    Short string representation of the document: the text, the features and, for each
    annotation set, its name with the number of annotations in it.

    Returns:
        string representation
    """
    counts = ",".join(
        [f"'{name}':{len(annset)}" for name, annset in self._annotation_sets.items()]
    )
    asets = "[" + counts + "]"
    return "Document({},features={},anns={})".format(self.text, self._features, asets)
def to_dict(self, offset_type=None, annspec=None, **kwargs):
    """Convert this instance to a dictionary that can be used to re-create the instance with
    from_dict.
    NOTE: if there is an active changelog, it is not included in the output as this
    field is considered a transient field!

    Args:
        offset_type: convert to the given offset type on the fly (Default value = None)
        annspec: if not None, a list of annotation set/type specifications: each element
            is either a string, the name of the annotation set to include, or a tuple where the
            first element is the annotation set name and the second element is either a
            type name or a list of type names. The same annotation set name should not be used
            in more than one specification, a later specification replaces the entry
            of an earlier one. Sets which do not exist are skipped.
        **kwargs: get passed on to the to_dict methods of included objects.

    Returns:
        the dictionary representation of this instance
    """
    # if the specified offset type is equal to what we have, do nothing, otherwise
    # create an offset mapper and pass it down to where we actually convert the annotations
    if offset_type is not None:
        assert offset_type == OFFSET_TYPE_JAVA or offset_type == OFFSET_TYPE_PYTHON
        if offset_type != self.offset_type:
            # without text no mapper can be created
            if self._text is not None:
                om = OffsetMapper(self._text)
                kwargs["offset_mapper"] = om
            # NOTE(review): nesting of this assignment reconstructed from flattened
            # source; assumed to apply whenever the types differ - confirm.
            kwargs["offset_type"] = offset_type
    else:
        # no conversion requested: report the offset type the document currently has
        offset_type = self.offset_type
    # create the annotation sets map
    if annspec is not None:
        annsets_dict = {}
        for spec in annspec:
            if isinstance(spec, str):
                # plain set name: include the whole set
                tmpset = self._annotation_sets.get(spec)
                if tmpset is not None:
                    annsets_dict[spec] = tmpset.to_dict(**kwargs)
            else:
                # (set name, type or list of types): include only those types
                setname, types = spec
                if isinstance(types, str):
                    types = [types]
                tmpset = self._annotation_sets.get(setname)
                if tmpset is not None:
                    annsets_dict[setname] = self._annotation_sets[setname].to_dict(
                        anntypes=types, **kwargs
                    )
    else:
        # no specification: include all annotation sets
        annsets_dict = {
            name: aset.to_dict(**kwargs)
            for name, aset in self._annotation_sets.items()
        }
    return {
        "annotation_sets": annsets_dict,
        "text": self._text,
        "features": self._features.to_dict(),
        "offset_type": offset_type,
        "name": self.name,
    }
@staticmethod
def from_dict(dictrepr, **_kwargs):
    """
    Return a Document instance as represented by the dictionary dictrepr.

    Args:
        dictrepr: the dictionary representation of a document, the keys "text", "features",
            "name", "offset_type" and "annotation_sets" are used, all of them are optional.
        **_kwargs: not used, ignored

    Returns:
        the initialized Document instance

    Raises:
        Exception: if the offset type in the dictionary is not one of the known types.
    """
    doc = Document(dictrepr.get("text"), features=dictrepr.get("features", {}))
    doc.name = dictrepr.get("name")
    # a missing offset type means Python offsets
    offset_type = dictrepr.get("offset_type")
    if offset_type is None:
        offset_type = OFFSET_TYPE_PYTHON
    doc.offset_type = offset_type
    if not (offset_type == OFFSET_TYPE_JAVA or offset_type == OFFSET_TYPE_PYTHON):
        raise Exception("Invalid offset type, cannot load: ", doc.offset_type)
    set_dicts = dictrepr.get("annotation_sets", {})
    doc._annotation_sets = {
        name: AnnotationSet.from_dict(adict, owner_doc=doc)
        for name, adict in set_dicts.items()
    }
    return doc
def save(
self,
destination,
fmt=None,
offset_type=None,
mod="gatenlp.serialization.default",
annspec=None,
**kwargs,
):
"""Save the document to the destination file.
Args:
destination: either a file name or something that has a write(string) method.
fmt: serialization format, by default the format is inferred from the file extension.
offset_type: store using the given offset type or keep the current if None
(Default value = None)
mod: module where the document saver is implemented.
(Default value = "gatenlp.serialization.default")
annspec: if not None, a list of annotation set names or tuples of set name and a
list of annotation types to include in the serialized document.
kwargs: additional parameters for the document saver.
"""
if annspec is not None:
kwargs["annspec"] = annspec
if fmt is None or isinstance(fmt, str):
m = importlib.import_module(mod)
saver = m.get_document_saver(destination, fmt)
saver(Document, self, to_ext=destination, offset_type=offset_type, **kwargs)
else:
# assume fmt is a callable to get used directly
fmt(Document, self, to_ext=destination, offset_type=offset_type, **kwargs)
def save_mem(
self,
fmt="json",
offset_type=None,
mod="gatenlp.serialization.default",
**kwargs,
):
"""Serialize to a string or bytes in the given format.
Args:
fmt: serialization format to use. (Default value = "json")
offset_type: store using the given offset type or keep the current if None
(Default value = None)
mod: module where the document saver is implemented.
(Default value = "gatenlp.serialization.default")
kwargs: additional parameters for the format.
"""
if not fmt:
raise Exception("Format required.")
if isinstance(fmt, str):
m = importlib.import_module(mod)
saver = m.get_document_saver(None, fmt)
return saver(Document, self, to_mem=True, offset_type=offset_type, **kwargs)
else:
fmt(Document, self, to_mem=True, offset_type=offset_type, **kwargs)
@staticmethod
def load(source, fmt=None, mod="gatenlp.serialization.default", **kwargs):
    """
    Load or import a document from the given source. The source can be a file path or
    file name or a URL. If the type of the source is str, then if it starts with
    "http[s]://" it will get treated as a URL. In order to deliberately use a file instead of
    a URL, create a pathlib Path, in order to deliberately use URL instead of a file parse
    the URL using urllib.

    Example: `Document.load(urllib.parse.urlparse(someurl), fmt=theformat)`

    Example: `Document.load(pathlib.Path(somepath), fmt=theformat)`

    NOTE: the offset type of the document is always converted to PYTHON when loading!

    Args:
        source: the URL or file path to load from.
        fmt: the format of the source. By default the format is inferred by the file extension.
            The format can be a format mnemonic like "json", "html", or a known mime type
            like "text/bdocjs". If fmt is neither None nor a string it is assumed to be
            a callable which gets used as the loader directly.
        mod: the name of a module where the document loader is implemented.
            (Default value = "gatenlp.serialization.default")
        kwargs: additional format specific keyword arguments to pass to the loader

    Returns:
        the loaded document
    """
    if fmt is None or isinstance(fmt, str):
        loader = importlib.import_module(mod).get_document_loader(source, fmt)
    else:
        loader = fmt
    doc = loader(Document, from_ext=source, **kwargs)
    # documents stored with Java offsets are converted right after loading
    if doc.offset_type == OFFSET_TYPE_JAVA:
        doc.to_offset_type(OFFSET_TYPE_PYTHON)
    return doc
@staticmethod
def load_mem(source, fmt="json", mod="gatenlp.serialization.default", **kwargs):
    """
    Create a document from the in-memory serialization in source. Source can be a string or
    bytes, depending on the format.

    Note: the offset type is always converted to PYTHON when loading!

    Args:
        source: the string/bytes to deserialize
        fmt: if string, the format identifier or mime type (Default value = "json"), otherwise
            assumed to be a callable that retrieves and returns the document
        mod: the name of the module where the loader is implemented
            (Default value = "gatenlp.serialization.default")
        kwargs: additional arguments to pass to the loader

    Returns:
        the loaded document

    Raises:
        Exception: if no format is given.
    """
    if not fmt:
        raise Exception("Format required.")
    if isinstance(fmt, str):
        loader = importlib.import_module(mod).get_document_loader(None, fmt)
    else:
        loader = fmt
    doc = loader(Document, from_mem=source, **kwargs)
    # documents stored with Java offsets are converted right after loading
    if doc.offset_type == OFFSET_TYPE_JAVA:
        doc.to_offset_type(OFFSET_TYPE_PYTHON)
    return doc
def __copy__(self):
    """
    Creates a shallow copy except the changelog which is set to None. The document feature
    map is a new instance, so features added in one copy will not show up in the other.
    However if feature values of copied features are objects, they are shared between the
    copies. Annotation sets are separate but the features of shared annotations are shared.
    The name of the document is copied as well.

    Returns:
        shallow copy of the document
    """
    doc = Document(self._text)
    # The attribute is set directly: the new document has no changelog to notify.
    doc._name = self._name
    doc._annotation_sets = dict()
    for name, aset in self._annotation_sets.items():
        newset = aset.copy()
        # the copied set belongs to the new document
        newset._owner_doc = doc
        doc._annotation_sets[name] = newset
    doc.offset_type = self.offset_type
    doc._features = self._features.copy()
    return doc
def copy(self, annspec=None):
    """
    Creates a shallow copy except the changelog which is set to None. If annspec is specified,
    creates a shallow copy but also limits the annotations to the one specified.
    The name of the document is copied as well.

    Args:
        annspec: if not None, a list of annotation set/type specifications: each element
            is either a string, the name of the annotation set to include, or a tuple where the
            first element is the annotation set name and the second element is either a
            type name or a list of type names. The same annotation set name should not be used
            in more than one specification. Sets which do not exist are skipped.

    Returns:
        shallow copy of the document, optionally with some annotations removed
    """
    if annspec is None:
        doc = self.__copy__()
        # make sure the name survives the copy
        doc._name = self._name
        return doc
    doc = Document(self._text)
    doc._name = self._name
    doc.offset_type = self.offset_type
    doc._features = self._features.copy()
    doc._annotation_sets = dict()
    for spec in annspec:
        if isinstance(spec, str):
            tmpset = self._annotation_sets.get(spec)
            if tmpset is not None:
                newset = tmpset.copy()
                newset._owner_doc = doc
                doc._annotation_sets[spec] = newset
        else:
            setname, types = spec
            if isinstance(types, str):
                types = [types]
            tmpset = self._annotation_sets.get(setname)
            if tmpset is not None:
                annset = AnnotationSet(owner_doc=doc, name=setname)
                # Use the set found above instead of self.annset(): like the plain
                # set-name case this does not depend on the current offset type.
                for ann in tmpset.with_type(types):
                    annset.add_ann(ann)
                doc._annotation_sets[setname] = annset
    return doc
def deepcopy(self, annspec=None, memo=None):
    """
    Creates a deep copy, except the changelog which is set to None. If annspec is not None,
    the annotations in the copy are restricted to the given sets and types.
    The name of the document is copied as well.

    Args:
        annspec: which annotation sets and types to include: a list where each element is
            either a set name or a tuple of set name and a type name or list of type names.
            Sets which do not exist are skipped.
        memo: the memoization dictionary to use.

    Returns:
        a deep copy of the document.
    """
    if memo is None:
        memo = {}
    if self._features is not None:
        fts = lib_copy.deepcopy(self._features.to_dict(), memo)
    else:
        fts = None
    doc = Document(self._text, features=fts)
    doc._changelog = None
    doc._name = self._name
    doc.offset_type = self.offset_type
    # Register the copy before copying the annotation sets so that any reference back
    # to this document found during the deep copy resolves to the new document.
    memo[id(self)] = doc
    if annspec is None:
        doc._annotation_sets = lib_copy.deepcopy(self._annotation_sets, memo)
        # as for the filtered copy below, every copied set belongs to the new document
        for annset in doc._annotation_sets.values():
            annset._owner_doc = doc
    else:
        doc._annotation_sets = dict()
        for spec in annspec:
            if isinstance(spec, str):
                tmpset = self._annotation_sets.get(spec)
                if tmpset is not None:
                    newset = lib_copy.deepcopy(tmpset, memo)
                    newset._owner_doc = doc
                    doc._annotation_sets[spec] = newset
            else:
                setname, types = spec
                if isinstance(types, str):
                    types = [types]
                tmpset = self._annotation_sets.get(setname)
                if tmpset is not None:
                    annset = AnnotationSet(owner_doc=doc, name=setname)
                    for ann in tmpset.with_type(types):
                        annset.add_ann(lib_copy.deepcopy(ann, memo))
                    doc._annotation_sets[setname] = annset
    return doc
def __deepcopy__(self, memo=None):
"""
Creates a deep copy, except the changelog which is set to None.
Args:
memo: the memoization dictionary to use.
Returns:
a deep copy of the document.
"""
return lib_copy.deepcopy(self, memo=memo)
def _repr_html_(self):
    """
    Render function for Jupyter notebooks.

    Returns:
        what the colab or notebook viewer returns for the document (called with
        display=False) when running in colab or a notebook, otherwise the plain
        representation of the document.
    """
    # import within this method to avoid dependencies needed for this in the whole Document module
    from gatenlp.serialization.default_htmlannviewer import show_colab, show_notebook

    if in_colab():
        return show_colab(self, display=False)
    if in_notebook():
        return show_notebook(self, display=False)
    # things like starboard.gg may call _repr_html_ but still not satisfy either in colab or notebook
    return self.__repr__()
def show(
    self,
    to=None,
    htmlid=None,
    annspec=None,
    preselect=None,
    palette=None,
    cols4types=None,
    doc_style=None,
    row1_style=None,
    row2_style=None):
    """
    Show the document, possibly in a Jupyter notebook. This allows to assign a specific
    htmlid so the generated HTML can be directly styled afterwards.
    This directly sends the rendered document to the cell (no display/HTML necessary) if
    the destination is a notebook.

    Args:
        to: if None, try to guess if this is called from within a notebook and if yes, which
            kind. Otherwise, explicitly specify where to show the document to, one of
            "console", "jupyter", "colab". If "console" is specified or automatically chosen,
            all other parameters are irrelevant and ignored.
        htmlid: the HTML id prefix to use for classes and element ids.
        annspec: if not None, a list of annotation set/type specifications.
            Each element is either the name of a set to fully include, or a tuple with the
            name of the set as the first element and with a single type name or a list of
            type names as the second element
        preselect: if not None, the set and type names to pre-select (show). This should have
            the same format as the annspec parameter.
        palette: if not None a list of colour codes (strings) usable in Javascript which will
            be used instead of the default palette.
        cols4types: if not None a dictionary mapping tuples (setname, typename) to a color.
            For the given setname and typename combinations, the colours from the palette
            (default or specified) will be overridden.
        doc_style: if not None, use this as the style for the document text box
        row1_style: if not None, use this for the first row of the HTML viewer, which contains
            the document text and annotation sets and types panes.
            Default is gatenlpconfig.doc_html_repr_row1style_nostretch
        row2_style: if not None, use this for the second row of the HTML viewer, which
            contains the document and annotation features pane.
            Default is gatenlpconfig.doc_html_repr_row2style_nostretch

    Returns:
        the short string representation of the document for the console, None otherwise

    Raises:
        Exception: if `to` is not None and not one of the known destinations.
    """
    # import within this method to avoid dependencies needed for this in the whole Document module
    from gatenlp.serialization.default_htmlannviewer import show_colab, show_notebook

    if to is None:
        # no explicit destination: find out where we are running
        if not in_notebook():
            return self.__str__()
        target = "colab" if in_colab() else "jupyter"
    else:
        target = to
    if target == "colab":
        viewer = show_colab
    elif target == "jupyter":
        viewer = show_notebook
    elif target == "console":
        return self.__str__()
    else:
        raise Exception(f"Not a valid value for parameter to: {to}. Use one of console, jupyter, colab")
    viewer(
        self,
        htmlid=htmlid,
        display=True,
        annspec=annspec,
        preselect=preselect,
        palette=palette,
        cols4types=cols4types,
        doc_style=doc_style,
        row1_style=row1_style,
        row2_style=row2_style,
    )
    return None
def attach(self, annset, name, check=True):
"""
Attach a detached set to the document. This should get used with caution and is mainly
intended for use inside the gatenlp library to allow for fast incremental creation of
new documents and document sets. The set can only be added if a set with the given name
does not yet exist at all.
Args:
annset: the annotation set to attach
name: the name for the annotation set
check: if False, prevent any checking. WARNING: this may create an inconsistent/illegal document!
"""
if name in self._annotation_sets:
raise Exception(f"Cannot attach set, a set with the name {name} already exists")
if check:
# check if the offsets are consistent with the document
mylen = len(self)
for ann in annset._annotations.values():
if ann.end > mylen:
raise Exception(f"Cannot attach set, annotation beyond text end: {ann}")
self._annotation_sets[name] = annset
annset._owner_doc = self
@staticmethod
def _edit_text(oldtext, edits):
"""
Edit helper method: given some old text, applies the edit or edits and returns the changed text.
Args:
oldtext: the text to edit
edits: a single edit (start, end, replacementtext) or a list of such edits. The list must be sorted by
starting offset.
Returns: the new changed text
"""
# note: all the offsets in the edits refer to the current text, so we cannot simply apply one edit
# after the other. Instead, we collect all the text snippets we need and create the final text from them
snippets = []
lastoff = 0
for edit in edits:
start, end, newtext = edit
if end < start:
raise Exception(f"Edit ({start}, {end}, {newtext}): end offset smaller than start offset")
if start > lastoff:
snippets.append(oldtext[lastoff:start])
elif start < lastoff:
raise Exception(f"Edits overlap or not sorted: ({start}, {end}, {newtext})")
if len(newtext) > 0:
snippets.append(newtext)
lastoff = end
if lastoff < len(oldtext):
snippets.append(oldtext[lastoff:])
newtext = "".join(snippets)
return newtext
def edit(self, edits, affected_strategy="keepadapt"):
    """
    Carry out one or more edits of the document text and adapt the annotations.

    If edits is a tuple with the first element not being iterable, it is assumed to be
    a single edit, otherwise it is assumed to be an iterable of edits (list, tuple,
    generator, ...). An edit is a tuple (start, end, newstr) giving the old offset range
    and the string which replaces that range. All offsets refer to the text before any
    of the edits is applied. The edits may be given in any order, they get sorted by
    start offset; the object passed as edits is not modified.

    NOTE: no two edit offset ranges may overlap. Overlapping ranges make the text
    editing helper raise an exception.

    This method adapts the offsets of all annotations after the affected span. If an
    annotation begins or ends within an affected span, what happens depends on the
    affected_strategy:
      delete_all: remove any annotation where the start and/or end offset lies between
        the from/to offsets of the edit
      adapt: any start and/or end offset in between from/to is changed to the from or
        to offset
      keepadapt: any start and/or end offset in between is left unchanged if that offset
        still exists in the new span, otherwise adapted to from/to.

    Args:
        edits: single edit or iterable of edits
        affected_strategy: one of the following strategies: delete_all, adapt, keepadapt
    """
    assert affected_strategy in ["delete_all", "adapt", "keepadapt"]
    # a non-empty tuple that does not start with another edit is a single edit
    if isinstance(edits, tuple) and edits and not isinstance(edits[0], Iterable):
        edits = [edits]
    # sorted() accepts any iterable and returns a new list, so tuples and generators
    # work and the caller's own list is left untouched
    edits = sorted(edits, key=lambda x: x[0])
    # call the static helper through the instance so subclasses may override it
    self._text = self._edit_text(self._text, edits)
    for annset in self._annotation_sets.values():
        annset._edit(edits, affected_strategy=affected_strategy)
def clone(self):
    """
    Create an independent duplicate of this document by serializing it to memory and
    loading the serialized form back, so that no data is shared between the clone and
    the original.

    Returns:
        A copy of the current document
    """
    serialized = self.save_mem()
    duplicate = Document.load_mem(serialized)
    return duplicate
# class MultiDocument(Document):
# """
# NOTE: This is just experimental for now, DO NOT USE!
#
# A MultiDocument can store more than one document, each identified by their ids. One of those
# documents is always the "active" one and the MultiDocument can be used just like a Document
# with that content. In addition, there are methods to make each of the other documents active
# and to create mappings between annotations of pairs of documents.
#
# An AnnotationMapping is something that maps annotations to annotations, either for the same
# document, from the same or different sets, or for different documents. Once an annotation
# becomes part of a mapping, that annotation becomes immutable. Even if the original
# annotation in the document changes or gets removed, the mapping retains the original copy of
# the annotation until the mapping is modified or removed.
# """
#
# # TODO: ALL necessary fields of the document must be references of mutable objects so that
# # if something is changed for the active document the one stored in the documents map is
# # really updated as well, or we must override the updating method to change both!
# # A better way could be to override all methods to always directly change the document in the
# # documents map, and simply pass on all calls to the activated document.
# # In that case, to_dict and from_dict would actually generate the fields for normal document
# # readers and ignore them on restore
# def __init__(
# self, text: str = None, features=None, changelog: ChangeLog = None, docid=0
# ):
# logger.warning("Experimental feature, DO NOT USE")
# self.documents = {} # map from document id to document
# self._mappings = None # TODO: we need to implement this
# self._docid = None
# doc = Document(text, features=features, changelog=changelog)
# self.documents[docid] = doc
# self.activate(docid)
#
# @property
# def docid(self):
# return self._docid
#
# def activate(self, docid=0):
# if docid not in self.documents:
# raise Exception(f"Cannot activate id {docid}, not in MultiDocument")
# doc = self.documents[docid]
# self._changelog = doc._changelog
# self._features = doc._features
# self._annotation_sets = doc._annotation_sets
# self._text = doc._text
# self.offset_type = OFFSET_TYPE_PYTHON
# self._name = doc._name
# self._docid = docid
#
# def add_document(self, doc, docid=None, activate=False):
# if docid is None:
# docid = len(self.documents)
# elif docid in self.documents:
# raise Exception(
# f"Cannot add document to MultiDocument, id {docid} already exists"
# )
# self.documents[docid] = doc
# if activate:
# self.activate(docid)
# return docid
#
# def to_dict(self, offset_type=None, **kwargs):
# # TODO: check what to do with the offset type parameter!
# # The basic strategy is that we simply create the dictionary for the active document plus
# # the entries for the documents map and the annotation mappings. That way, any reader of the
# dict representation which just ignores unknown fields can still read this in as a normal
# # document from the active document.
# # The drawback is that the active document is represented twice, but OK
# thedict = {
# "annotation_sets": {
# name: aset.to_dict() for name, aset in self._annotation_sets.items()
# },
# "text": self._text,
# "features": self._features.to_dict(),
# "offset_type": self.offset_type,
# "name": self.name,
# }
# thedict["documents"] = {
# docid: doc.to_dict() for docid, doc in self.documents.items()
# }
# thedict["docid"] = self._docid
# thedict["mappings"] = self._mappings
# return thedict
#
# @staticmethod
# def from_dict(dictrepr, **kwargs):
# """
# Create a MultiDocument from the dictionary representation.
#
# Args:
# dictrepr: the dictionary representation
# **kwargs: additional kwargs to pass on
#
# Returns:
#
# """
# feats = dictrepr.get("features")
# docid = dictrepr.get("docid")
# doc = MultiDocument(dictrepr.get("text"), features=feats, docid=docid)
# doc.name = dictrepr.get("name")
# doc.offset_type = dictrepr.get("offset_type")
# if (
# doc.offset_type != OFFSET_TYPE_JAVA
# and doc.offset_type != OFFSET_TYPE_PYTHON
# ):
# raise Exception("Invalid offset type, cannot load: ", doc.offset_type)
# annsets = {
# name: AnnotationSet.from_dict(adict, owner_doc=doc)
# for name, adict in dictrepr.get("annotation_sets").items()
# }
# doc._annotation_sets = annsets
# doc.documents = {
# did: Document.from_dict(d)
# for did, d in dictrepr.get("documents", {}).items()
# }
# # TODO: get the mappings back!
# return doc
Classes
class Document (text: str = None, features=None, changelog: ChangeLog = None)
- the text is not mutable and can only be set at creation time, so there is no "edit" method
- as a feature bearer, all the methods to set, get and manipulate features are part of this class, there is no separate "FeatureMap" to store them
- does not support listener callbacks
- there is no separate abstraction for "content", the only content possible is text which is a unicode string that can be accessed with the "text()" method
- Spans of text can be directly accessed using doc[from:to]
- Features may only have string keys and values which can be json-serialised
- Annotation offsets by default are number of Unicode code points, this is different from Java where the offsets are UTF-16 Unicode code units
- Offsets of all annotations can be changed from/to Java (from python index of unicode codepoint to Java index of UTF-16 code unit and back)
- No part of the document has to be present, not even the text (this allows saving just the annotations separately from the text)
- Once the text has been set, it is immutable (no support to edit text and change annotation offsets accordingly)
Args
text
- the text of the document. The text can be None to indicate that no initial text should be set. Once the text has been set for a document, it is immutable and cannot be changed.
features
- the initial document features to set, a sequence of key/value tuples
changelog
- a ChangeLog instance to use to log changes.
Expand source code
class Document: """ Represent a GATE document. This is different from the original Java GATE representation in several ways: * the text is not mutable and can only be set at creation time, so there is no "edit" method * as a feature bearer, all the methods to set, get and manipulate features are part of this class, there is no separate "FeatureMap" to store them * does not support listener callbacks * there is no separate abstraction for "content", the only content possible is text which is a unicode string that can be acessed with the "text()" method * Spans of text can be directly accessed using doc[from:to] * Features may only have string keys and values which can be json-serialised * Annotation offsets by default are number of Unicde code points, this is different from Java where the offsets are UTF-16 Unicode code units * Offsets of all annotations can be changed from/to Java (from python index of unicode codepoint to Java index of UTF-16 code unit and back) * No part of the document has to be present, not even the text (this allows saving just the annotations separately from the text) * Once the text has been set, it is immutable (no support to edit text and change annotation offsets accordingly) Args: text: the text of the document. The text can be None to indicate that no initial text should be set. Once the text has been set for a document, it is immutable and cannot be changed. features: the initial document features to set, a sequence of key/value tuples changelog: a ChangeLog instance to use to log changes. 
""" def __init__(self, text: str = None, features=None, changelog: ChangeLog = None): if text is not None: assert isinstance(text, str) if changelog is not None: assert isinstance(changelog, ChangeLog) self._changelog = changelog self._features = Features(features, _change_logger=self._log_feature_change) self._annotation_sets = dict() self._text = text self.offset_type = OFFSET_TYPE_PYTHON self._name = "" @property def name(self): """ """ return self._name @name.setter def name(self, val): """ Args: val: Returns: """ if val is None: val = "" if not isinstance(val, str): raise Exception("Name must be a string") self._name = val if self._changelog is not None: ch = {"command": "name:set"} ch["name"] = val self._changelog.append(ch) def _ensure_type_python(self) -> None: """ """ if self.offset_type != OFFSET_TYPE_PYTHON: raise Exception( "Document cannot be used if it is not type PYTHON, " + "use to_type(OFFSET_TYPE_PYTHON) first" ) def _fixup_annotations(self, method: Callable) -> None: """ Args: method: Callable: Returns: """ annset_names = self._annotation_sets.keys() for annset_name in annset_names: annset = self._annotation_sets[annset_name] if annset._annotations is not None: for ann in annset._annotations.values(): ann._start = method(ann._start) ann._end = method(ann._end) def to_offset_type(self, offsettype: str) -> Union[OffsetMapper, None]: """Convert all the offsets of all the annotations in this document to the required type, either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON. If the offsets are already of that type, this does nothing. NOTE: if the document has a ChangeLog, it is NOT also converted! The method returns the offset mapper if anything actually was converted, otherwise None. 
Args: offsettype: either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON offsettype: str: Returns: offset mapper or None """ if offsettype == self.offset_type: return None if offsettype == OFFSET_TYPE_JAVA and self.offset_type == OFFSET_TYPE_PYTHON: # convert from currently python to java om = OffsetMapper(self._text) self._fixup_annotations(om.convert_to_java) self.offset_type = OFFSET_TYPE_JAVA elif offsettype == OFFSET_TYPE_PYTHON and self.offset_type == OFFSET_TYPE_JAVA: # convert from currently java to python om = OffsetMapper(self._text) self._fixup_annotations(om.convert_to_python) self.offset_type = OFFSET_TYPE_PYTHON else: raise Exception("Odd offset type") return om def apply_changes(self, changes, handle_existing_anns=ADDANN_ADD_WITH_NEW_ID): """Apply changes from a ChangeLog to this document. `changes` can be a ChangeLog instance, a sequence of change objects (dicts) as stored in a ChangeLog instance, or a single change object. The document is modified in-place. Args: changes: one or more changes handle_existing_anns: what to do if the change from the changelog tries to add an annotation with an annotation id that already exists in the target set. 
(Default value = ADDANN_ADD_WITH_NEW_ID) """ if isinstance(changes, dict): changes = [changes] elif isinstance(changes, ChangeLog): changes = changes.changes for change in changes: cmd = change.get("command") fname = change.get("feature") fvalue = change.get("value") features = change.get("features") sname = change.get("set") annid = change.get("id") if cmd is None: raise Exception("Change without field 'command'") if cmd == ACTION_ADD_ANNSET: assert sname is not None self.annset(sname) elif cmd == ACTION_ADD_ANN: assert sname is not None assert annid is not None anns = self.annset(sname) ann = anns.get(annid) start = change.get("start") end = change.get("end") anntype = change.get("type") if ann is None: anns.add(start, end, anntype, annid=annid, features=features) else: if handle_existing_anns == ADDANN_IGNORE: pass elif handle_existing_anns == ADDANN_ADD_WITH_NEW_ID: anns.add(start, end, anntype) elif handle_existing_anns == ADDANN_REPLACE_ANNOTATION: anns.remove(annid) anns.add(start, end, anntype, annid) elif handle_existing_anns == ADDANN_UPDATE_FEATURES: ann.features.update(features) elif handle_existing_anns == ADDANN_REPLACE_FEATURES: ann.features.clear() ann.features.update(features) elif handle_existing_anns == ADDANN_ADD_NEW_FEATURES: fns = ann.features.names() for f in features.keys(): if f not in fns: ann.features[f] = features[f] elif handle_existing_anns == ADDANN_IGNORE: pass elif cmd == ACTION_CLEAR_ANNS: assert sname is not None anns = self.annset(sname) anns.clear() elif cmd == ACTION_CLEAR_ANN_FEATURES: assert sname is not None assert annid is not None anns = self.annset(sname) ann = anns.get(annid) if ann is not None: ann.features.clear() else: pass # ignore, could happen with a detached annotation elif cmd == ACTION_CLEAR_DOC_FEATURES: self.features.clear() elif cmd == ACTION_SET_ANN_FEATURE: assert fname is not None assert sname is not None assert annid is not None ann = self.annset(sname).get(annid) ann.features[fname] = fvalue elif cmd == 
ACTION_DEL_ANN_FEATURE: assert sname is not None assert annid is not None anns = self.annset(sname) ann = anns.get(annid) if ann is not None: if fname is not None: ann.features.pop(fname, None) else: pass # ignore, could happen with a detached annotation elif cmd == ACTION_DEL_DOC_FEATURE: assert fname is not None self.features.pop(fname, None) elif cmd == ACTION_DEL_ANN: assert sname is not None assert annid is not None anns = self.annset(sname) anns.remove(annid) elif cmd == ACTION_SET_DOC_FEATURE: assert fname is not None self.features[fname] = fvalue elif cmd == ACTION_CLEAR_DOC_FEATURES: self._features.clear() elif cmd == ACTION_DEL_DOC_FEATURE: assert fname is not None del self._features[fname] else: raise Exception("Unknown ChangeLog action: ", cmd) @property def features(self): """Accesses the features as a FeatureViewer instance. Changes made on this object are reflected in the document and recorded in the change log, if there is one. :return: A FeatureViewer view of the document features. Args: Returns: """ return self._features @property def changelog(self): """Get the ChangeLog or None if no ChangeLog has been set. :return: the changelog Args: Returns: """ return self._changelog @changelog.setter def changelog(self, chlog): """Make the document use the given changelog to record all changes from this moment on. Args: chlog: the new changelog to use or None to not use any Returns: the changelog used previously or None """ self._changelog = chlog @property def text(self) -> str: """Get the text of the document. For a partial document, the text may be None. :return: the text of the document Args: Returns: """ self._ensure_type_python() return self._text @text.setter def text(self, value: str) -> None: """ Set the text of the document. This is only possible as long as it has not been set yet, after that, the text is immutable. IMPORTANT: it is possible to add arbitrary annotations to a document which does not have any text. 
This is meant to allow handling of annotation-only representations. However, if the text is set after annotations have been added, annotation offsets are not checked and it is possible to thus create an invalid document where annotations refer to text ranges that do not exist! Args: value: the text for the document value: str: Returns: """ if self._text is None: self._text = value else: raise NotImplementedError("Text cannot be modified") def _log_feature_change( self, command: str, feature: str = None, value=None ) -> None: """ Args: command: str: feature: str: (Default value = None) value: (Default value = None) Returns: """ if self._changelog is None: return command = "doc-" + command ch = {"command": command} if command == "doc-feature:set": ch["feature"] = feature ch["value"] = value self._changelog.append(ch) def __len__(self) -> int: """ Return the length of the text. Note: this will convert the type of the document to python! :return: the length of the document text """ self._ensure_type_python() if self._text is None: return 0 else: return len(self._text) def __getitem__(self, span) -> str: """ Get the text for the given span. :param span: a single number, an offset range of the form from:to or an annotation. If annotation, uses the annotation's offset span. :return: the text of the span """ self._ensure_type_python() if isinstance(span, Annotation): return self.text[span._start:span._end] if isinstance(span, AnnotationSet): return self.text[span.start():span.end()] if hasattr(span, "start") and hasattr(span, "end"): return self.text[span.start:span.end] return self.text[span] def annset(self, name: str = "") -> AnnotationSet: """ Get the named annotation set, if name is not given or the empty string, the default annotation set. If the annotation set does not already exist, it is created. Args: name: the annotation set name, the empty string is used for the "default annotation set". name: str: (Default value = "") Returns: the specified annotation set. 
""" self._ensure_type_python() if name not in self._annotation_sets: annset = AnnotationSet(owner_doc=self, name=name) self._annotation_sets[name] = annset if self._changelog: self._changelog.append({"command": "annotations:add", "set": name}) return annset else: return self._annotation_sets[name] def annset_names(self) -> List[str]: """ Args: Returns: :return: annotation set names """ self._ensure_type_python() return list(self._annotation_sets.keys()) def remove_annset(self, name: str): """Completely remove the annotation set. Args: name: name of the annotation set to remove name: str: Returns: """ if name not in self._annotation_sets: raise Exception(f"AnnotationSet with name {name} does not exist") del self._annotation_sets[name] if self._changelog: self._changelog.append({"command": "annotations:remove", "set": name}) def anns(self, annspec): """ Return a detached annotation set with all annotations which match the annotation specification. Annotation ids are preserved if possible, but if annotations from different sets have duplicate ids, new ids are assigned instead. Args: annspec: either a single string which is interpreted as an annotation set name, or a list where each element is either a string (annotation set name) or a tuple. If an element is a tuple, the first element of the tuple must be the annotation set name and the second element either a type name or a list of type names. Returns: a detached, immutable set with all the annotations matching the annotation specification """ return AnnotationSet.create_from(self.yield_anns(annspec)) def yield_anns(self, annspec): """ Yield all annotations which match the annotation specification. The order of the annotations is unespecified. Args: annspec: either a single string which is interpreted as an annotation set name, or a list where each element is either a string (annotation set name) or a tuple. 
If an element is a tuple, the first element of the tuple must be the annotation set name and the second element either a type name or a list of type names. Yields: all the annotations matching the annotation specification """ if isinstance(annspec, str): tmpset = self._annotation_sets.get(annspec) if tmpset is not None: for ann in tmpset._annotations.values(): yield ann return for spec in annspec: if isinstance(spec, str): tmpset = self._annotation_sets.get(spec) if tmpset is not None: for ann in tmpset._annotations.values(): yield ann else: setname, types = spec if isinstance(types, str): types = [types] tmpset = self._annotation_sets.get(setname) if tmpset is not None: for ann in tmpset._annotations.values(): if ann.type in types: yield ann def __repr__(self) -> str: """ String representation of the document, showing all content. :return: string representation """ return "Document({},features={},anns={})".format( self.text, self._features, self._annotation_sets.__repr__() ) def __str__(self) -> str: asets = ( "[" + ",".join([f"'{k}':{len(v)}" for k, v in self._annotation_sets.items()]) + "]" ) return "Document({},features={},anns={})".format( self.text, self._features, asets ) def to_dict(self, offset_type=None, annspec=None, **kwargs): """Convert this instance to a dictionary that can be used to re-create the instance with from_dict. NOTE: if there is an active changelog, it is not included in the output as this field is considered a transient field! Args: offset_type: convert to the given offset type on the fly (Default value = None) annspec if not None, a list of annotation set/type specifications: each element is either a string, the name of the annotation set to include, or a tuple where the first element is the annotation set name and the second element is either a type name or a list of type names. The same annotation set name should not be used in more than one specification. **kwargs: get passed on to the to_dict methods of included objects. 
Returns: the dictionary representation of this instance """ # if the specified offset type is equal to what we have, do nothing, otherwise # create an offset mapper and pass it down to where we actually convert the annotations if offset_type is not None: assert offset_type == OFFSET_TYPE_JAVA or offset_type == OFFSET_TYPE_PYTHON if offset_type != self.offset_type: if self._text is not None: om = OffsetMapper(self._text) kwargs["offset_mapper"] = om kwargs["offset_type"] = offset_type else: offset_type = self.offset_type # create the annotation sets map if annspec is not None: annsets_dict = {} for spec in annspec: if isinstance(spec, str): tmpset = self._annotation_sets.get(spec) if tmpset is not None: annsets_dict[spec] = tmpset.to_dict(**kwargs) else: setname, types = spec if isinstance(types, str): types = [types] tmpset = self._annotation_sets.get(setname) if tmpset is not None: annsets_dict[setname] = self._annotation_sets[setname].to_dict( anntypes=types, **kwargs ) else: annsets_dict = { name: aset.to_dict(**kwargs) for name, aset in self._annotation_sets.items() } return { "annotation_sets": annsets_dict, "text": self._text, "features": self._features.to_dict(), "offset_type": offset_type, "name": self.name, } @staticmethod def from_dict(dictrepr, **_kwargs): """Return a Document instance as represented by the dictionary dictrepr. 
Args: dictrepr: return: the initialized Document instance **_kwargs: not used, ignored Returns: the initialized Document instance """ feats = dictrepr.get("features", {}) doc = Document(dictrepr.get("text"), features=feats) doc.name = dictrepr.get("name") doc.offset_type = dictrepr.get("offset_type") if doc.offset_type is None: doc.offset_type = OFFSET_TYPE_PYTHON if ( doc.offset_type != OFFSET_TYPE_JAVA and doc.offset_type != OFFSET_TYPE_PYTHON ): raise Exception("Invalid offset type, cannot load: ", doc.offset_type) annsets = { name: AnnotationSet.from_dict(adict, owner_doc=doc) for name, adict in dictrepr.get("annotation_sets", {}).items() } doc._annotation_sets = annsets return doc def save( self, destination, fmt=None, offset_type=None, mod="gatenlp.serialization.default", annspec=None, **kwargs, ): """Save the document to the destination file. Args: destination: either a file name or something that has a write(string) method. fmt: serialization format, by default the format is inferred from the file extension. offset_type: store using the given offset type or keep the current if None (Default value = None) mod: module where the document saver is implemented. (Default value = "gatenlp.serialization.default") annspec: if not None, a list of annotation set names or tuples of set name and a list of annotation types to include in the serialized document. kwargs: additional parameters for the document saver. """ if annspec is not None: kwargs["annspec"] = annspec if fmt is None or isinstance(fmt, str): m = importlib.import_module(mod) saver = m.get_document_saver(destination, fmt) saver(Document, self, to_ext=destination, offset_type=offset_type, **kwargs) else: # assume fmt is a callable to get used directly fmt(Document, self, to_ext=destination, offset_type=offset_type, **kwargs) def save_mem( self, fmt="json", offset_type=None, mod="gatenlp.serialization.default", **kwargs, ): """Serialize to a string or bytes in the given format. 
Args: fmt: serialization format to use. (Default value = "json") offset_type: store using the given offset type or keep the current if None (Default value = None) mod: module where the document saver is implemented. (Default value = "gatenlp.serialization.default") kwargs: additional parameters for the format. """ if not fmt: raise Exception("Format required.") if isinstance(fmt, str): m = importlib.import_module(mod) saver = m.get_document_saver(None, fmt) return saver(Document, self, to_mem=True, offset_type=offset_type, **kwargs) else: fmt(Document, self, to_mem=True, offset_type=offset_type, **kwargs) @staticmethod def load(source, fmt=None, mod="gatenlp.serialization.default", **kwargs): """ Load or import a document from the given source. The source can be a file path or file name or a URL. If the type of the source is str, then if it starts with "http[s]://" it will get treated as a URL. In order to deliberatly use a file instead of a URL, create a pathlib Path, in order to deliberately use URL instead of a file parse the URL using urllib. Example: `Document.load(urllib.parse.urlparse(someurl), fmt=theformat)` Example: `Document.load(pathlib.Path(somepath), fmt=theformat)` NOTE: the offset type of the document is always converted to PYTHON when loading! Args: source: the URL or file path to load from. fmt: the format of the source. By default the format is inferred by the file extension. The format can be a format memnonic like "json", "html", or a known mime type like "text/bdocjs". mod: the name of a module where the document loader is implemented. 
(Default value = "gatenlp.serialization.default") kwargs: additional format specific keyword arguments to pass to the loader Returns: the loaded document """ if fmt is None or isinstance(fmt, str): m = importlib.import_module(mod) loader = m.get_document_loader(source, fmt) doc = loader(Document, from_ext=source, **kwargs) else: doc = fmt(Document, from_ext=source, **kwargs) if doc.offset_type == OFFSET_TYPE_JAVA: doc.to_offset_type(OFFSET_TYPE_PYTHON) return doc @staticmethod def load_mem(source, fmt="json", mod="gatenlp.serialization.default", **kwargs): """ Create a document from the in-memory serialization in source. Source can be a string or bytes, depending on the format. Note: the offset type is always converted to PYTHON when loading! Args: source: the string/bytes to deserialize fmt: if string, the format identifier or mime type (Default value = "json"), otherwise assumed to be a callable that retrieves and returns the document mod: the name of the module where the loader is implemented (Default value = "gatenlp.serialization.default") kwargs: additional arguments to pass to the loader """ if not fmt: raise Exception("Format required.") if isinstance(fmt, str): m = importlib.import_module(mod) loader = m.get_document_loader(None, fmt) doc = loader(Document, from_mem=source, **kwargs) else: doc = fmt(Document, from_mem=source, **kwargs) if doc.offset_type == OFFSET_TYPE_JAVA: doc.to_offset_type(OFFSET_TYPE_PYTHON) return doc def __copy__(self): """ Creates a shallow copy except the changelog which is set to None. The document feature map is a new instance, so features added in one copy will not show up in the other. However if feature values of copied features are objects, they are shared between the copies. Annotation sets are separate but the features of shared annotations are shared. 
Returns: shallow copy of the document """ doc = Document(self._text) doc._annotation_sets = dict() for name, aset in self._annotation_sets.items(): doc._annotation_sets[name] = aset.copy() doc._annotation_sets[name]._owner_doc = doc doc.offset_type = self.offset_type doc._features = self._features.copy() return doc def copy(self, annspec=None): """ Creates a shallow copy except the changelog which is set to None. If annspec is specified, creates a shallow copy but also limits the annotations to the one specified. Args: annspec: if not None, a list of annotation set/type specifications: each element is either a string, the name of the annotation set to include, or a tuple where the first element is the annotation set name and the second element is either a type name or a list of type names. The same annotation set name should not be used in more than one specification. Returns: shallow copy of the document, optionally with some annotations removed """ if annspec is None: return self.__copy__() doc = Document(self._text) doc.offset_type = self.offset_type doc._features = self._features.copy() doc._annotation_sets = dict() for spec in annspec: if isinstance(spec, str): tmpset = self._annotation_sets.get(spec) if tmpset is not None: doc._annotation_sets[spec] = self._annotation_sets[spec].copy() doc._annotation_sets[spec]._owner_doc = doc else: setname, types = spec if isinstance(types, str): types = [types] tmpset = self._annotation_sets.get(setname) if tmpset is not None: annset = AnnotationSet(owner_doc=doc, name=setname) anns = self.annset(setname).with_type(types) for ann in anns: annset.add_ann(ann) doc._annotation_sets[setname] = annset return doc def deepcopy(self, annspec=None, memo=None): """ Creates a deep copy, except the changelog which is set to None. If annset is not None, the annotations in the copy are restricted to the given set. Args: memo: the memoization dictionary to use. 
annspec: which annotation sets and types to include Returns: a deep copy of the document. """ if self._features is not None: fts = lib_copy.deepcopy(self._features.to_dict(), memo) else: fts = None doc = Document(self._text, features=fts) doc._changelog = None doc.offset_type = self.offset_type if annspec is None: doc._annotation_sets = lib_copy.deepcopy(self._annotation_sets, memo) else: doc._annotation_sets = dict() for spec in annspec: if isinstance(spec, str): tmpset = self._annotation_sets.get(spec) if tmpset is not None: doc._annotation_sets[spec] = lib_copy.deepcopy(tmpset, memo) doc._annotation_sets[spec]._owner_doc = doc else: setname, types = spec if isinstance(types, str): types = [types] tmpset = self._annotation_sets.get(setname) if tmpset is not None: annset = AnnotationSet(owner_doc=doc, name=setname) anns = tmpset.with_type(types) for ann in anns: annset.add_ann(lib_copy.deepcopy(ann, memo)) doc._annotation_sets[setname] = annset return doc def __deepcopy__(self, memo=None): """ Creates a deep copy, except the changelog which is set to None. Args: memo: the memoization dictionary to use. Returns: a deep copy of the document. """ return lib_copy.deepcopy(self, memo=memo) def _repr_html_(self): """ Render function for Jupyter notebooks. Returns the html-ann-viewer HTML. This renders the HTML for notebook, for offline mode, but does not add the JS but instead initializes the JS in the notebook unless gatenlp.init_notebook() has been called already. 
""" # import within this method to avoid dependencies needed for this in the whole Document module from gatenlp.serialization.default_htmlannviewer import show_colab, show_notebook if in_colab(): return show_colab(self, display=False) elif in_notebook(): return show_notebook(self, display=False) else: # things like starboard.gg may call _repr_html_ but still not satisfy either in colab or notebook return self.__repr__() def show( self, to=None, htmlid=None, annspec=None, preselect=None, palette=None, cols4types=None, doc_style=None, row1_style=None, row2_style=None): """ Show the document, possibly in a Jupyter notebook. This allows to assign a specific htmlid so the generated HTML can be directly styled afterwards. This directly sends the rendered document to the cell (no display/HTML necessary) if the destination is a notebook. Args: to: if None, try to guess if this is called from within a notebook and if yes, which kind. Otherwise, explicitly specify where to show the document to, one of "console", "jupyter", "colab". If "console" is specified or automatically chosen, the parameters "annspec", "doc_style", "row1_style", and "row2_style" are irrelevant and ignored. htmlid: the HTML id prefix to use for classes and element ids. annspec: if not None, a list of annotation set/type specifications. Each element is either the name of a set to fully include, or a tuple with the name of the set as the first element and with a single type name or a list of type names as the second element preselect: if not None, the set and type names to pre-select (show). This should have the same format as the annspec parameter. palette: if not None a list of colour codes (strings) usable in Javascript which will be used instead of the default palette. cols4types: if not None a dictionary mapping tuples (setname, typename) to a color. For the given setname and typename combinations, the colours from the palette (default or specified) will be overrriden. 
doc_style: if not None, use this as the style for the document text box row1_style: if not None, use this for the first row of the HTML viewer, which contains the document text and annotation sets and types panes. Default is gatenlpconfig.doc_html_repr_row1style_nostretch row2_style: if not None, use this for the second row of the HTML viewer, which contains the document and annottion features pane. Default is gatenlpconfig.doc_html_repr_row2style_nostretch """ # import within this method to avoid dependencies needed for this in the whole Document module from gatenlp.serialization.default_htmlannviewer import show_colab, show_notebook if to == "colab": show_colab(self, htmlid=htmlid, display=True, annspec=annspec, preselect=preselect, palette=palette, cols4types=cols4types, doc_style=doc_style, row1_style=row1_style, row2_style=row2_style) return elif to == "jupyter": show_notebook(self, htmlid=htmlid, display=True, annspec=annspec, preselect=preselect, palette=palette, cols4types=cols4types, doc_style=doc_style, row1_style=row1_style, row2_style=row2_style) return elif to == "console": return self.__str__() elif to is not None: raise Exception(f"Not a valid value for parameter to: {to}. Use one of console, jupyter, colab") if in_notebook(): if in_colab(): show_colab(self, htmlid=htmlid, display=True, annspec=annspec, preselect=preselect, palette=palette, cols4types=cols4types, doc_style=doc_style, row1_style=row1_style, row2_style=row2_style) return else: show_notebook(self, htmlid=htmlid, display=True, annspec=annspec, preselect=preselect, palette=palette, cols4types=cols4types, doc_style=doc_style, row1_style=row1_style, row2_style=row2_style) return else: return self.__str__() def attach(self, annset, name, check=True): """ Attach a detached set to the document. This should get used with caution and is mainly intended for use inside the gatenlp library to allow for fast incremental creation of new documents and document sets. 
The set can only be added if a set with the given name does not yet exist at all. Args: annset: the annotation set to attach name: the name for the annotation set check: if False, prevent any checking. WARNING: this may create an inconsistent/illegal document! """ if name in self._annotation_sets: raise Exception(f"Cannot attach set, a set with the name {name} already exists") if check: # check if the offsets are consistent with the document mylen = len(self) for ann in annset._annotations.values(): if ann.end > mylen: raise Exception(f"Cannot attach set, annotation beyond text end: {ann}") self._annotation_sets[name] = annset annset._owner_doc = self @staticmethod def _edit_text(oldtext, edits): """ Edit helper method: given some old text, applies the edit or edits and returns the changed text. Args: oldtext: the text to edit edits: a single edit (start, end, replacementtext) or a list of such edits. The list must be sorted by starting offset. Returns: the new changed text """ # note: all the offsets in the edits refer to the current text, so we cannot simply apply one edit # after the other. Instead, we collect all the text snippets we need and create the final text from them snippets = [] lastoff = 0 for edit in edits: start, end, newtext = edit if end < start: raise Exception(f"Edit ({start}, {end}, {newtext}): end offset smaller than start offset") if start > lastoff: snippets.append(oldtext[lastoff:start]) elif start < lastoff: raise Exception(f"Edits overlap or not sorted: ({start}, {end}, {newtext})") if len(newtext) > 0: snippets.append(newtext) lastoff = end if lastoff < len(oldtext): snippets.append(oldtext[lastoff:]) newtext = "".join(snippets) return newtext def edit(self, edits, affected_strategy="keepadapt"): """ Carry out one or more edits. If edits is a tuple of length 3 with the first element not being iterable, assume it is a single edit, Otherwise assume it is an iterable of edits. 
An edit is a tuple (start, end, newstr) giving the old offset range and the string which replaces that range. NOTE: no two edit offset ranges may overlap, if ranges do overlap, this method may raise an exception or silently perform unexpected and terrible changes. The method does not check for edit spans to not overlap! This method adapts the offsets of all annotations after the affected span, if an annotation begins or ends within an affected span, what happens depends on the affected_strategy: delete_all: remove any annotation where the start and/or end offset lies between the from/to offsets of the edit adapt: any start and/or end offset in between from/to is changed to the from or to offset keepadapt: any start and/or end offset in between is left unchanged if that offset still exists in the new span, otherwise adapted to from/to. Args: edits: single edit or iterable of edits affected_strategy: one of the following strategies: delete, adapt, keepadapt """ assert affected_strategy in ["delete_all", "adapt", "keepadapt"] if isinstance(edits, tuple) and not isinstance(edits[0], Iterable): edits = [edits] edits.sort(key=lambda x: x[0]) self._text = Document._edit_text(self._text, edits) for annset in self._annotation_sets.values(): annset._edit(edits, affected_strategy=affected_strategy) def clone(self): """ Create a clone of the current document, no data is shared between the clone and the original. Returns: A copy of the current document """ return Document.load_mem(self.save_mem())
Static methods
def from_dict(dictrepr, **_kwargs)
-
Return a Document instance as represented by the dictionary dictrepr.
Args
dictrepr
- the dictionary representation of the document to create
**_kwargs
- not used, ignored
Returns
the initialized Document instance
Expand source code
@staticmethod
def from_dict(dictrepr, **_kwargs):
    """
    Return a Document instance as represented by the dictionary dictrepr.

    Args:
        dictrepr: the dictionary representation of the document. The keys read are
            "text", "name", "features", "offset_type" and "annotation_sets". A missing
            (or None) "offset_type" is treated as OFFSET_TYPE_PYTHON, missing features
            or annotation sets are treated as empty.
        **_kwargs: not used, ignored

    Returns:
        the initialized Document instance

    Raises:
        Exception: if the offset type is neither OFFSET_TYPE_JAVA nor OFFSET_TYPE_PYTHON
    """
    feats = dictrepr.get("features", {})
    doc = Document(dictrepr.get("text"), features=feats)
    doc.name = dictrepr.get("name")
    doc.offset_type = dictrepr.get("offset_type")
    if doc.offset_type is None:
        doc.offset_type = OFFSET_TYPE_PYTHON
    if doc.offset_type not in (OFFSET_TYPE_JAVA, OFFSET_TYPE_PYTHON):
        # Format the value into the message: passing it as a second positional
        # argument made the exception text render as a tuple.
        raise Exception(f"Invalid offset type, cannot load: {doc.offset_type}")
    # Every set is created with the new document as its owner.
    doc._annotation_sets = {
        name: AnnotationSet.from_dict(adict, owner_doc=doc)
        for name, adict in dictrepr.get("annotation_sets", {}).items()
    }
    return doc
def load(source, fmt=None, mod='gatenlp.serialization.default', **kwargs)
-
Load or import a document from the given source. The source can be a file path or file name or a URL. If the type of the source is str, then if it starts with "http[s]://" it will get treated as a URL. In order to deliberately use a file instead of a URL, create a pathlib Path; in order to deliberately use a URL instead of a file, parse the URL using urllib.
Example:
Document.load(urllib.parse.urlparse(someurl), fmt=theformat)
Example:
Document.load(pathlib.Path(somepath), fmt=theformat)
NOTE: the offset type of the document is always converted to PYTHON when loading!
Args
- source: the URL or file path to load from.
- fmt: the format of the source. By default the format is inferred by the file extension.
- The format can be a format mnemonic like "json", "html", or a known mime type
- like "text/bdocjs".
mod
- the name of a module where the document loader is implemented. (Default value = "gatenlp.serialization.default")
kwargs
- additional format specific keyword arguments to pass to the loader
Returns
the loaded document
Expand source code
@staticmethod
def load(source, fmt=None, mod="gatenlp.serialization.default", **kwargs):
    """
    Load or import a document from the given source.

    The source can be a file path, a file name or a URL. A source of type str which
    starts with "http[s]://" is treated as a URL. To deliberately use a file instead
    of a URL, pass a pathlib Path; to deliberately use a URL instead of a file, pass
    the URL parsed with urllib.

    Example: `Document.load(urllib.parse.urlparse(someurl), fmt=theformat)`

    Example: `Document.load(pathlib.Path(somepath), fmt=theformat)`

    NOTE: the offset type of the document is always converted to PYTHON when loading!

    Args:
        source: the URL or file path to load from.
        fmt: the format of the source. If None, the format is inferred from the file
            extension. If a string, a format mnemonic like "json", "html", or a known
            mime type like "text/bdocjs". Anything else is assumed to be a callable
            which is invoked directly to load the document.
        mod: the name of a module where the document loader is implemented.
            (Default value = "gatenlp.serialization.default")
        kwargs: additional format specific keyword arguments to pass to the loader

    Returns:
        the loaded document
    """
    if fmt is not None and not isinstance(fmt, str):
        # fmt is neither missing nor a format name: use it as the loader itself
        reader = fmt
    else:
        reader = importlib.import_module(mod).get_document_loader(source, fmt)
    loaded = reader(Document, from_ext=source, **kwargs)
    if loaded.offset_type == OFFSET_TYPE_JAVA:
        loaded.to_offset_type(OFFSET_TYPE_PYTHON)
    return loaded
def load_mem(source, fmt='json', mod='gatenlp.serialization.default', **kwargs)
-
Create a document from the in-memory serialization in source. Source can be a string or bytes, depending on the format.
Note: the offset type is always converted to PYTHON when loading!
Args
source
- the string/bytes to deserialize
fmt
- if string, the format identifier or mime type (Default value = "json"), otherwise assumed to be a callable that retrieves and returns the document
mod
- the name of the module where the loader is implemented (Default value = "gatenlp.serialization.default")
kwargs
- additional arguments to pass to the loader
Expand source code
@staticmethod
def load_mem(source, fmt="json", mod="gatenlp.serialization.default", **kwargs):
    """
    Create a document from the in-memory serialization in source.

    Source can be a string or bytes, depending on the format.

    Note: the offset type is always converted to PYTHON when loading!

    Args:
        source: the string/bytes to deserialize
        fmt: if a string, the format identifier or mime type (Default value = "json"),
            otherwise assumed to be a callable that retrieves and returns the document
        mod: the name of the module where the loader is implemented
            (Default value = "gatenlp.serialization.default")
        kwargs: additional arguments to pass to the loader

    Returns:
        the deserialized document

    Raises:
        Exception: if fmt is empty or None
    """
    if not fmt:
        raise Exception("Format required.")
    if isinstance(fmt, str):
        reader = importlib.import_module(mod).get_document_loader(None, fmt)
    else:
        reader = fmt
    loaded = reader(Document, from_mem=source, **kwargs)
    if loaded.offset_type == OFFSET_TYPE_JAVA:
        loaded.to_offset_type(OFFSET_TYPE_PYTHON)
    return loaded
Instance variables
var changelog
-
Get the ChangeLog or None if no ChangeLog has been set.
Returns
the changelog, or None if no ChangeLog has been set
Expand source code
@property
def changelog(self):
    """
    Get the ChangeLog or None if no ChangeLog has been set.

    Returns:
        the changelog currently stored for this document
    """
    return self._changelog
var features
-
Accesses the features as a FeatureViewer instance. Changes made on this object are reflected in the document and recorded in the change log, if there is one.
Returns
A FeatureViewer view of the document features.
Expand source code
@property
def features(self):
    """
    Access the features of the document.

    Changes made on the returned object are reflected in the document and recorded
    in the change log, if there is one.
    NOTE(review): this returns the object stored in `_features` directly; earlier
    documentation called it a "FeatureViewer" while the module imports `Features`
    -- confirm the actual type.

    Returns:
        the feature container of the document
    """
    return self._features
var name
-
Expand source code
@property
def name(self):
    """
    Get the name of the document.

    Returns:
        the value stored as the document name
    """
    return self._name
var text : str
-
Get the text of the document. For a partial document, the text may be None.
Returns
the text of the document
Expand source code
@property
def text(self) -> str:
    """
    Get the text of the document. For a partial document, the text may be None.

    The offset-type check (`_ensure_type_python`) runs before the text is handed out.

    Returns:
        the text of the document
    """
    self._ensure_type_python()
    content = self._text
    return content
Methods
def anns(self, annspec)
-
Return a detached annotation set with all annotations which match the annotation specification. Annotation ids are preserved if possible, but if annotations from different sets have duplicate ids, new ids are assigned instead.
Args
annspec
- either a single string which is interpreted as an annotation set name, or a list where each element is either a string (annotation set name) or a tuple. If an element is a tuple, the first element of the tuple must be the annotation set name and the second element either a type name or a list of type names.
Returns
a detached, immutable set with all the annotations matching the annotation specification
Expand source code
def anns(self, annspec):
    """
    Return a detached annotation set with all annotations which match the
    annotation specification.

    Annotation ids are preserved if possible, but if annotations from different
    sets have duplicate ids, new ids are assigned instead.

    Args:
        annspec: either a single string which is interpreted as an annotation set
            name, or a list where each element is either a string (annotation set
            name) or a tuple. If an element is a tuple, the first element of the
            tuple must be the annotation set name and the second element either a
            type name or a list of type names.

    Returns:
        a detached, immutable set with all the annotations matching the annotation
        specification
    """
    matching = self.yield_anns(annspec)
    return AnnotationSet.create_from(matching)
def annset(self, name: str = '') ‑> AnnotationSet
-
Get the named annotation set, if name is not given or the empty string, the default annotation set. If the annotation set does not already exist, it is created.
Args
name
- str: the annotation set name, the empty string is used for the "default annotation set". (Default value = "")
Returns
the specified annotation set.
Expand source code
def annset(self, name: str = "") -> AnnotationSet:
    """
    Get the named annotation set; if name is not given or the empty string, the
    default annotation set. If the annotation set does not already exist, it is
    created, and the creation is recorded in the change log if there is one.

    Args:
        name: the annotation set name, the empty string is used for the
            "default annotation set". (Default value = "")

    Returns:
        the specified annotation set.
    """
    self._ensure_type_python()
    if name in self._annotation_sets:
        return self._annotation_sets[name]
    annset = AnnotationSet(owner_doc=self, name=name)
    self._annotation_sets[name] = annset
    # Compare against None instead of testing truthiness: a change log object which
    # is attached but still empty may evaluate to False (if it defines __len__),
    # and the creation of the set would then silently not get recorded.
    if self._changelog is not None:
        self._changelog.append({"command": "annotations:add", "set": name})
    return annset
def annset_names(self) ‑> List[str]
-
Returns
the list of annotation set names
Expand source code
def annset_names(self) -> List[str]:
    """
    Get the names of all annotation sets of the document.

    Returns:
        a new list with the annotation set names
    """
    self._ensure_type_python()
    names = [setname for setname in self._annotation_sets]
    return names
def apply_changes(self, changes, handle_existing_anns='add-with-new-id')
-
Apply changes from a ChangeLog to this document.
changes can be a ChangeLog instance, a sequence of change objects (dicts) as stored in a ChangeLog instance, or a single change object. The document is modified in-place.
Args
changes
- one or more changes
handle_existing_anns
- what to do if the change from the changelog tries to add an annotation with an annotation id that already exists in the target set. (Default value = ADDANN_ADD_WITH_NEW_ID)
Expand source code
def apply_changes(self, changes, handle_existing_anns=ADDANN_ADD_WITH_NEW_ID):
    """
    Apply changes from a ChangeLog to this document.

    `changes` can be a ChangeLog instance, a sequence of change objects (dicts)
    as stored in a ChangeLog instance, or a single change object. The document is
    modified in-place.

    Commands which refer to an annotation that does not exist in the target set
    (clearing, setting or deleting annotation features) are ignored, since this can
    happen with a detached annotation.

    Args:
        changes: one or more changes
        handle_existing_anns: what to do if the change from the changelog tries to
            add an annotation with an annotation id that already exists in the
            target set. One of the ADDANN_* constants; any other value leaves the
            existing annotation untouched. (Default value = ADDANN_ADD_WITH_NEW_ID)

    Raises:
        Exception: if a change has no "command" field or the command is unknown
        AssertionError: if a change lacks a field required by its command
    """
    if isinstance(changes, dict):
        changes = [changes]
    elif isinstance(changes, ChangeLog):
        changes = changes.changes
    for change in changes:
        cmd = change.get("command")
        fname = change.get("feature")
        fvalue = change.get("value")
        features = change.get("features")
        sname = change.get("set")
        annid = change.get("id")
        if cmd is None:
            raise Exception("Change without field 'command'")
        if cmd == ACTION_ADD_ANNSET:
            assert sname is not None
            self.annset(sname)
        elif cmd == ACTION_ADD_ANN:
            assert sname is not None
            assert annid is not None
            anns = self.annset(sname)
            ann = anns.get(annid)
            start = change.get("start")
            end = change.get("end")
            anntype = change.get("type")
            if ann is None:
                anns.add(start, end, anntype, annid=annid, features=features)
            elif handle_existing_anns == ADDANN_IGNORE:
                pass
            elif handle_existing_anns == ADDANN_ADD_WITH_NEW_ID:
                # keep the features of the change, only the id gets newly assigned
                anns.add(start, end, anntype, features=features)
            elif handle_existing_anns == ADDANN_REPLACE_ANNOTATION:
                anns.remove(annid)
                # pass id and features by keyword, exactly like the call for a
                # non-existing annotation above, so the id cannot end up in the
                # wrong parameter and the features are not lost
                anns.add(start, end, anntype, annid=annid, features=features)
            elif handle_existing_anns == ADDANN_UPDATE_FEATURES:
                if features:
                    ann.features.update(features)
            elif handle_existing_anns == ADDANN_REPLACE_FEATURES:
                ann.features.clear()
                if features:
                    ann.features.update(features)
            elif handle_existing_anns == ADDANN_ADD_NEW_FEATURES:
                fns = ann.features.names()
                # a change without features has nothing to add
                for f, value in (features or {}).items():
                    if f not in fns:
                        ann.features[f] = value
        elif cmd == ACTION_CLEAR_ANNS:
            assert sname is not None
            anns = self.annset(sname)
            anns.clear()
        elif cmd == ACTION_CLEAR_ANN_FEATURES:
            assert sname is not None
            assert annid is not None
            ann = self.annset(sname).get(annid)
            # a missing annotation is ignored, could happen with a detached annotation
            if ann is not None:
                ann.features.clear()
        elif cmd == ACTION_CLEAR_DOC_FEATURES:
            self.features.clear()
        elif cmd == ACTION_SET_ANN_FEATURE:
            assert fname is not None
            assert sname is not None
            assert annid is not None
            ann = self.annset(sname).get(annid)
            # a missing annotation is ignored like for the other annotation feature
            # commands, instead of failing with an AttributeError on None
            if ann is not None:
                ann.features[fname] = fvalue
        elif cmd == ACTION_DEL_ANN_FEATURE:
            assert sname is not None
            assert annid is not None
            ann = self.annset(sname).get(annid)
            # a missing annotation is ignored, could happen with a detached annotation
            if ann is not None and fname is not None:
                ann.features.pop(fname, None)
        elif cmd == ACTION_DEL_DOC_FEATURE:
            assert fname is not None
            self.features.pop(fname, None)
        elif cmd == ACTION_DEL_ANN:
            assert sname is not None
            assert annid is not None
            anns = self.annset(sname)
            anns.remove(annid)
        elif cmd == ACTION_SET_DOC_FEATURE:
            assert fname is not None
            self.features[fname] = fvalue
        else:
            raise Exception(f"Unknown ChangeLog action: {cmd}")
def attach(self, annset, name, check=True)
-
Attach a detached set to the document. This should get used with caution and is mainly intended for use inside the gatenlp library to allow for fast incremental creation of new documents and document sets. The set can only be added if a set with the given name does not yet exist at all.
Args
annset
- the annotation set to attach
name
- the name for the annotation set
check
- if False, prevent any checking. WARNING: this may create an inconsistent/illegal document!
Expand source code
def attach(self, annset, name, check=True):
    """
    Attach a detached set to the document.

    This should get used with caution and is mainly intended for use inside the
    gatenlp library to allow for fast incremental creation of new documents and
    document sets. The set can only be added if a set with the given name does not
    yet exist at all.

    Args:
        annset: the annotation set to attach
        name: the name for the annotation set
        check: if False, prevent any checking of the annotation offsets.
            WARNING: this may create an inconsistent/illegal document!

    Raises:
        Exception: if a set with that name already exists, or, if check is True,
            if an annotation ends beyond the end of the document
    """
    if name in self._annotation_sets:
        raise Exception(f"Cannot attach set, a set with the name {name} already exists")
    if check:
        # every annotation must end within the document
        doc_length = len(self)
        offending = next(
            (cand for cand in annset._annotations.values() if cand.end > doc_length),
            None,
        )
        if offending is not None:
            raise Exception(f"Cannot attach set, annotation beyond text end: {offending}")
    self._annotation_sets[name] = annset
    annset._owner_doc = self
def clone(self)
-
Create a clone of the current document, no data is shared between the clone and the original.
Returns
A copy of the current document
Expand source code
def clone(self):
    """
    Create a clone of the current document by serializing it to memory and loading
    it back, so that no data is shared between the clone and the original.

    Returns:
        A copy of the current document
    """
    serialized = self.save_mem()
    return Document.load_mem(serialized)
def copy(self, annspec=None)
-
Creates a shallow copy except the changelog which is set to None. If annspec is specified, creates a shallow copy but also limits the annotations to the one specified.
Args
annspec
- if not None, a list of annotation set/type specifications: each element is either a string, the name of the annotation set to include, or a tuple where the first element is the annotation set name and the second element is either a type name or a list of type names. The same annotation set name should not be used in more than one specification.
Returns
shallow copy of the document, optionally with some annotations removed
Expand source code
def copy(self, annspec=None):
    """
    Create a shallow copy of the document, except for the changelog which is set
    to None. If annspec is specified, the annotations in the copy are limited to the
    ones specified.

    Args:
        annspec: if not None, a list of annotation set/type specifications: each
            element is either a string, the name of the annotation set to include,
            or a tuple where the first element is the annotation set name and the
            second element is either a type name or a list of type names. The same
            annotation set name should not be used in more than one specification.
            Names of sets which do not exist in this document are skipped.

    Returns:
        shallow copy of the document, optionally with some annotations removed
    """
    if annspec is None:
        return self.__copy__()
    new_doc = Document(self._text)
    new_doc.offset_type = self.offset_type
    new_doc._features = self._features.copy()
    new_doc._annotation_sets = dict()
    for spec in annspec:
        if isinstance(spec, str):
            # plain set name: take over a copy of the whole set
            if self._annotation_sets.get(spec) is None:
                continue
            copied = self._annotation_sets[spec].copy()
            new_doc._annotation_sets[spec] = copied
            copied._owner_doc = new_doc
            continue
        # (set name, type name or list of type names): take over matching annotations
        setname, types = spec
        if isinstance(types, str):
            types = [types]
        if self._annotation_sets.get(setname) is None:
            continue
        restricted = AnnotationSet(owner_doc=new_doc, name=setname)
        for ann in self.annset(setname).with_type(types):
            restricted.add_ann(ann)
        new_doc._annotation_sets[setname] = restricted
    return new_doc
def deepcopy(self, annspec=None, memo=None)
-
Creates a deep copy, except the changelog which is set to None. If annspec is not None, the annotations in the copy are restricted to the given sets and types.
Args
memo
- the memoization dictionary to use.
annspec
- which annotation sets and types to include
Returns
a deep copy of the document.
Expand source code
def deepcopy(self, annspec=None, memo=None):
    """
    Create a deep copy of the document, except for the changelog which is set to
    None. If annspec is not None, the annotations in the copy are restricted to the
    given sets and types.

    Args:
        annspec: if not None, a list of annotation set/type specifications: each
            element is either the name of an annotation set to include, or a tuple
            where the first element is the annotation set name and the second
            element is a type name or a list of type names. Names of sets which do
            not exist in this document are skipped.
        memo: the memoization dictionary to use.

    Returns:
        a deep copy of the document.
    """
    if self._features is not None:
        fts = lib_copy.deepcopy(self._features.to_dict(), memo)
    else:
        fts = None
    doc = Document(self._text, features=fts)
    doc._changelog = None
    doc.offset_type = self.offset_type
    if annspec is None:
        doc._annotation_sets = lib_copy.deepcopy(self._annotation_sets, memo)
        # Make the new document the owner of all copied sets, as is done for the
        # sets selected by name below.
        for copied in doc._annotation_sets.values():
            copied._owner_doc = doc
        return doc
    doc._annotation_sets = dict()
    for spec in annspec:
        if isinstance(spec, str):
            # plain set name: deep copy of the whole set
            tmpset = self._annotation_sets.get(spec)
            if tmpset is not None:
                copied = lib_copy.deepcopy(tmpset, memo)
                copied._owner_doc = doc
                doc._annotation_sets[spec] = copied
        else:
            # (set name, type name or list of type names): deep copy of the
            # matching annotations only
            setname, types = spec
            if isinstance(types, str):
                types = [types]
            tmpset = self._annotation_sets.get(setname)
            if tmpset is not None:
                annset = AnnotationSet(owner_doc=doc, name=setname)
                for ann in tmpset.with_type(types):
                    annset.add_ann(lib_copy.deepcopy(ann, memo))
                doc._annotation_sets[setname] = annset
    return doc
def edit(self, edits, affected_strategy='keepadapt')
-
Carry out one or more edits. If edits is a tuple of length 3 with the first element not being iterable, assume it is a single edit, Otherwise assume it is an iterable of edits. An edit is a tuple (start, end, newstr) giving the old offset range and the string which replaces that range. NOTE: no two edit offset ranges may overlap, if ranges do overlap, this method may raise an exception or silently perform unexpected and terrible changes. The method does not check for edit spans to not overlap!
This method adapts the offsets of all annotations after the affected span, if an annotation begins or ends within an affected span, what happens depends on the affected_strategy:
- delete_all: remove any annotation where the start and/or end offset lies between the from/to offsets of the edit.
- adapt: any start and/or end offset in between from/to is changed to the from or to offset.
- keepadapt: any start and/or end offset in between is left unchanged if that offset still exists in the new span, otherwise adapted to from/to.
Args
edits
- single edit or iterable of edits
affected_strategy
- one of the following strategies: delete_all, adapt, keepadapt
Expand source code
def edit(self, edits, affected_strategy="keepadapt"):
    """
    Carry out one or more edits.

    If edits is a tuple with the first element not being iterable, assume it is a
    single edit, otherwise assume it is an iterable of edits. An edit is a tuple
    (start, end, newstr) giving the old offset range and the string which replaces
    that range. The edits are applied in the order of their start offsets; the
    object passed as `edits` is not modified.

    NOTE: no two edit offset ranges may overlap, if ranges do overlap, this method
    may raise an exception or silently perform unexpected and terrible changes.

    This method adapts the offsets of all annotations after the affected span. If
    an annotation begins or ends within an affected span, what happens depends on
    the affected_strategy:

    * delete_all: remove any annotation where the start and/or end offset lies
      between the from/to offsets of the edit
    * adapt: any start and/or end offset in between from/to is changed to the from
      or to offset
    * keepadapt: any start and/or end offset in between is left unchanged if that
      offset still exists in the new span, otherwise adapted to from/to.

    Args:
        edits: single edit or iterable of edits
        affected_strategy: one of the following strategies: delete_all, adapt,
            keepadapt
    """
    assert affected_strategy in ["delete_all", "adapt", "keepadapt"]
    # the length check avoids an IndexError for an empty tuple of edits
    if isinstance(edits, tuple) and len(edits) > 0 and not isinstance(edits[0], Iterable):
        edits = [edits]
    # sorted() accepts any iterable (tuple, generator, ...) and, unlike list.sort(),
    # leaves the list of the caller untouched; both sorts are stable
    edits = sorted(edits, key=lambda x: x[0])
    self._text = Document._edit_text(self._text, edits)
    for annset in self._annotation_sets.values():
        annset._edit(edits, affected_strategy=affected_strategy)
def remove_annset(self, name: str)
-
Completely remove the annotation set.
Args
name
- str: name of the annotation set to remove
Expand source code
def remove_annset(self, name: str):
    """
    Completely remove the annotation set and record the removal in the change log,
    if there is one.

    Args:
        name: name of the annotation set to remove

    Raises:
        Exception: if no annotation set with that name exists
    """
    if name not in self._annotation_sets:
        raise Exception(f"AnnotationSet with name {name} does not exist")
    del self._annotation_sets[name]
    # Compare against None instead of testing truthiness: a change log object which
    # is attached but still empty may evaluate to False (if it defines __len__),
    # and the removal would then silently not get recorded.
    if self._changelog is not None:
        self._changelog.append({"command": "annotations:remove", "set": name})
def save(self, destination, fmt=None, offset_type=None, mod='gatenlp.serialization.default', annspec=None, **kwargs)
-
Save the document to the destination file.
Args
destination
- either a file name or something that has a write(string) method.
fmt
- serialization format, by default the format is inferred from the file extension.
offset_type
- store using the given offset type or keep the current if None (Default value = None)
mod
- module where the document saver is implemented. (Default value = "gatenlp.serialization.default")
annspec
- if not None, a list of annotation set names or tuples of set name and a list of annotation types to include in the serialized document.
kwargs
- additional parameters for the document saver.
Expand source code
def save(
    self,
    destination,
    fmt=None,
    offset_type=None,
    mod="gatenlp.serialization.default",
    annspec=None,
    **kwargs,
):
    """
    Save the document to the destination file.

    Args:
        destination: either a file name or something that has a write(string) method.
        fmt: serialization format, by default the format is inferred from the file
            extension. If neither None nor a string, it is assumed to be a callable
            which is used directly as the document saver.
        offset_type: store using the given offset type or keep the current if None
            (Default value = None)
        mod: module where the document saver is implemented.
            (Default value = "gatenlp.serialization.default")
        annspec: if not None, a list of annotation set names or tuples of set name
            and a list of annotation types to include in the serialized document.
        kwargs: additional parameters for the document saver.
    """
    if annspec is not None:
        # the annotation specification is handed to the saver as a keyword argument
        kwargs["annspec"] = annspec
    if fmt is not None and not isinstance(fmt, str):
        # assume fmt is a callable to get used directly
        writer = fmt
    else:
        writer = importlib.import_module(mod).get_document_saver(destination, fmt)
    writer(Document, self, to_ext=destination, offset_type=offset_type, **kwargs)
def save_mem(self, fmt='json', offset_type=None, mod='gatenlp.serialization.default', **kwargs)
-
Serialize to a string or bytes in the given format.
Args
fmt
- serialization format to use. (Default value = "json")
offset_type
- store using the given offset type or keep the current if None (Default value = None)
mod
- module where the document saver is implemented. (Default value = "gatenlp.serialization.default")
kwargs
- additional parameters for the format.
Expand source code
def save_mem(
    self,
    fmt="json",
    offset_type=None,
    mod="gatenlp.serialization.default",
    **kwargs,
):
    """
    Serialize to a string or bytes in the given format.

    Args:
        fmt: serialization format to use. If not a string, it is assumed to be a
            callable which is used directly as the document saver.
            (Default value = "json")
        offset_type: store using the given offset type or keep the current if None
            (Default value = None)
        mod: module where the document saver is implemented.
            (Default value = "gatenlp.serialization.default")
        kwargs: additional parameters for the format.

    Returns:
        whatever the saver returns for the in-memory serialization (string or
        bytes, depending on the format)

    Raises:
        Exception: if fmt is empty or None
    """
    if not fmt:
        raise Exception("Format required.")
    if isinstance(fmt, str):
        saver = importlib.import_module(mod).get_document_saver(None, fmt)
    else:
        # assume fmt is a callable to get used directly
        saver = fmt
    # Return the result for both kinds of saver: without the return for the
    # callable case the serialized document was lost and None was returned.
    return saver(Document, self, to_mem=True, offset_type=offset_type, **kwargs)
def show(self, to=None, htmlid=None, annspec=None, preselect=None, palette=None, cols4types=None, doc_style=None, row1_style=None, row2_style=None)
-
Show the document, possibly in a Jupyter notebook. This allows to assign a specific htmlid so the generated HTML can be directly styled afterwards. This directly sends the rendered document to the cell (no display/HTML necessary) if the destination is a notebook.
Args
to
- if None, try to guess if this is called from within a notebook and if yes, which kind. Otherwise, explicitly specify where to show the document to, one of "console", "jupyter", "colab". If "console" is specified or automatically chosen, the parameters "annspec", "doc_style", "row1_style", and "row2_style" are irrelevant and ignored.
htmlid
- the HTML id prefix to use for classes and element ids.
annspec
- if not None, a list of annotation set/type specifications. Each element is either the name of a set to fully include, or a tuple with the name of the set as the first element and with a single type name or a list of type names as the second element
preselect
- if not None, the set and type names to pre-select (show). This should have the same format as the annspec parameter.
palette
- if not None a list of colour codes (strings) usable in Javascript which will be used instead of the default palette.
cols4types
- if not None a dictionary mapping tuples (setname, typename) to a color. For the given setname and typename combinations, the colours from the palette (default or specified) will be overridden.
doc_style
- if not None, use this as the style for the document text box
row1_style
- if not None, use this for the first row of the HTML viewer, which contains the document text and annotation sets and types panes. Default is gatenlpconfig.doc_html_repr_row1style_nostretch
row2_style
- if not None, use this for the second row of the HTML viewer, which contains the document and annotation features pane. Default is gatenlpconfig.doc_html_repr_row2style_nostretch
Expand source code
def show(
    self,
    to=None,
    htmlid=None,
    annspec=None,
    preselect=None,
    palette=None,
    cols4types=None,
    doc_style=None,
    row1_style=None,
    row2_style=None,
):
    """
    Show the document, possibly in a Jupyter notebook. This allows assigning a
    specific htmlid so the generated HTML can be directly styled afterwards.
    This directly sends the rendered document to the cell (no display/HTML
    necessary) if the destination is a notebook.

    Args:
        to: if None, try to guess if this is called from within a notebook and
            if yes, which kind. Otherwise, explicitly specify where to show the
            document to, one of "console", "jupyter", "colab". If "console" is
            specified or automatically chosen, the parameters "annspec",
            "doc_style", "row1_style", and "row2_style" are irrelevant and ignored.
        htmlid: the HTML id prefix to use for classes and element ids.
        annspec: if not None, a list of annotation set/type specifications.
            Each element is either the name of a set to fully include, or a
            tuple with the name of the set as the first element and with a
            single type name or a list of type names as the second element
        preselect: if not None, the set and type names to pre-select (show).
            This should have the same format as the annspec parameter.
        palette: if not None a list of colour codes (strings) usable in
            Javascript which will be used instead of the default palette.
        cols4types: if not None a dictionary mapping tuples (setname, typename)
            to a color. For the given setname and typename combinations, the
            colours from the palette (default or specified) will be overridden.
        doc_style: if not None, use this as the style for the document text box
        row1_style: if not None, use this for the first row of the HTML viewer,
            which contains the document text and annotation sets and types
            panes. Default is gatenlpconfig.doc_html_repr_row1style_nostretch
        row2_style: if not None, use this for the second row of the HTML
            viewer, which contains the document and annotation features pane.
            Default is gatenlpconfig.doc_html_repr_row2style_nostretch

    Returns:
        the string representation of the document when shown on the console,
        otherwise None.

    Raises:
        Exception: if to is neither None nor one of "console", "jupyter", "colab".
    """
    # import within this method to avoid dependencies needed for this in the whole Document module
    from gatenlp.serialization.default_htmlannviewer import show_colab, show_notebook

    if to is None:
        # nothing requested explicitly: figure out where we are running
        if not in_notebook():
            return self.__str__()
        renderer = show_colab if in_colab() else show_notebook
    elif to == "colab":
        renderer = show_colab
    elif to == "jupyter":
        renderer = show_notebook
    elif to == "console":
        return self.__str__()
    else:
        raise Exception(f"Not a valid value for parameter to: {to}. Use one of console, jupyter, colab")

    # both notebook renderers take exactly the same set of options
    renderer(
        self,
        htmlid=htmlid,
        display=True,
        annspec=annspec,
        preselect=preselect,
        palette=palette,
        cols4types=cols4types,
        doc_style=doc_style,
        row1_style=row1_style,
        row2_style=row2_style,
    )
    return
def to_dict(self, offset_type=None, annspec=None, **kwargs)
-
Convert this instance to a dictionary that can be used to re-create the instance with from_dict. NOTE: if there is an active changelog, it is not included in the output as this field is considered a transient field!
Args
offset_type
- convert to the given offset type on the fly (Default value = None)
annspec
- if not None, a list of annotation set/type specifications: each element is either a string, the name of the annotation set to include, or a tuple where the first element is the annotation set name and the second element is either a type name or a list of type names. The same annotation set name should not be used in more than one specification.
**kwargs
- get passed on to the to_dict methods of included objects.
Returns
the dictionary representation of this instance
Expand source code
def to_dict(self, offset_type=None, annspec=None, **kwargs):
    """Build a dictionary from this instance that from_dict can use to re-create it.

    NOTE: if there is an active changelog, it is not included in the output as
    this field is considered a transient field!

    Args:
        offset_type: convert to the given offset type on the fly (Default value = None)
        annspec: if not None, a list of annotation set/type specifications:
            each element is either a string, the name of the annotation set to
            include, or a tuple where the first element is the annotation set
            name and the second element is either a type name or a list of
            type names. The same annotation set name should not be used in
            more than one specification.
        **kwargs: get passed on to the to_dict methods of included objects.

    Returns:
        the dictionary representation of this instance
    """
    if offset_type is None:
        # no conversion requested: report the offset type the document already has
        offset_type = self.offset_type
    else:
        assert offset_type in (OFFSET_TYPE_JAVA, OFFSET_TYPE_PYTHON)
        if offset_type != self.offset_type:
            # a different type is requested: hand an offset mapper down to where
            # the annotations actually get converted
            if self._text is not None:
                kwargs["offset_mapper"] = OffsetMapper(self._text)
            kwargs["offset_type"] = offset_type

    # create the annotation sets map
    annsets_dict = {}
    if annspec is None:
        for set_name, annset in self._annotation_sets.items():
            annsets_dict[set_name] = annset.to_dict(**kwargs)
    else:
        for entry in annspec:
            if isinstance(entry, str):
                # plain set name: include the whole set
                set_name, type_filter = entry, {}
            else:
                set_name, wanted = entry
                if isinstance(wanted, str):
                    wanted = [wanted]
                type_filter = {"anntypes": wanted}
            annset = self._annotation_sets.get(set_name)
            # sets which do not exist in the document are silently skipped
            if annset is not None:
                annsets_dict[set_name] = annset.to_dict(**type_filter, **kwargs)

    return {
        "annotation_sets": annsets_dict,
        "text": self._text,
        "features": self._features.to_dict(),
        "offset_type": offset_type,
        "name": self.name,
    }
def to_offset_type(self, offsettype: str) ‑> Optional[OffsetMapper]
-
Convert all the offsets of all the annotations in this document to the required type, either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON. If the offsets are already of that type, this does nothing.
NOTE: if the document has a ChangeLog, it is NOT also converted!
The method returns the offset mapper if anything actually was converted, otherwise None.
Args
offsettype
- str: either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON
Returns
offset mapper or None
Expand source code
def to_offset_type(self, offsettype: str) -> Union[OffsetMapper, None]:
    """Switch the offsets of all annotations in this document to the requested
    type, either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON. Nothing happens if the
    offsets already have that type.

    NOTE: if the document has a ChangeLog, it is NOT also converted!

    Args:
        offsettype: either OFFSET_TYPE_JAVA or OFFSET_TYPE_PYTHON

    Returns:
        the offset mapper if anything actually was converted, otherwise None

    Raises:
        Exception: if the requested/current offset type combination is neither
            python-to-java nor java-to-python.
    """
    current = self.offset_type
    if offsettype == current:
        return None

    to_java = offsettype == OFFSET_TYPE_JAVA and current == OFFSET_TYPE_PYTHON
    to_python = offsettype == OFFSET_TYPE_PYTHON and current == OFFSET_TYPE_JAVA
    if not (to_java or to_python):
        raise Exception("Odd offset type")

    # one mapper over the document text serves both conversion directions
    mapper = OffsetMapper(self._text)
    if to_java:
        # convert from currently python to java
        self._fixup_annotations(mapper.convert_to_java)
        self.offset_type = OFFSET_TYPE_JAVA
    else:
        # convert from currently java to python
        self._fixup_annotations(mapper.convert_to_python)
        self.offset_type = OFFSET_TYPE_PYTHON
    return mapper
def yield_anns(self, annspec)
-
Yield all annotations which match the annotation specification. The order of the annotations is unspecified.
Args
annspec
- either a single string which is interpreted as an annotation set name, or a list where each element is either a string (annotation set name) or a tuple. If an element is a tuple, the first element of the tuple must be the annotation set name and the second element either a type name or a list of type names.
Yields
all the annotations matching the annotation specification
Expand source code
def yield_anns(self, annspec):
    """
    Generate all annotations selected by the annotation specification. The
    order of the annotations is unspecified.

    Args:
        annspec: either a single string which is interpreted as an annotation
            set name, or a list where each element is either a string
            (annotation set name) or a tuple. If an element is a tuple, the
            first element of the tuple must be the annotation set name and the
            second element either a type name or a list of type names.

    Yields:
        all the annotations matching the annotation specification
    """
    # a bare set name is handled like a one-element specification list
    entries = [annspec] if isinstance(annspec, str) else annspec
    for entry in entries:
        if isinstance(entry, str):
            # whole set requested
            annset = self._annotation_sets.get(entry)
            if annset is not None:
                yield from annset._annotations.values()
            continue
        set_name, wanted = entry
        if isinstance(wanted, str):
            wanted = [wanted]
        annset = self._annotation_sets.get(set_name)
        # sets which do not exist in the document contribute nothing
        if annset is None:
            continue
        for candidate in annset._annotations.values():
            if candidate.type in wanted:
                yield candidate
-