Module gatenlp.annotation_set
Module for AnnotationSet class which represents a named collection of annotations which can arbitrarily overlap.
Expand source code
"""
Module for AnnotationSet class which represents a named collection of
annotations which can arbitrarily overlap.
"""
# TODO: when should two sets be equal? Currently object identity is requried!
from typing import Any, List, Union, Dict, Set, KeysView, Iterator, Generator
# TODO: prior to Python 3.9 we need different Iterable definitions for typing and type checking
from collections.abc import Iterable as abc_Iterable
from typing import Iterable, Optional
from collections import defaultdict
import copy
from gatenlp.span import Span
from gatenlp.annotation import Annotation
from gatenlp.impl import SortedIntvls
from gatenlp.utils import support_annotation_or_set, allowspan
__pdoc__ = {
"AnnotationSet.__iter__": True,
"AnnotationSet.__contains__": True,
"AnnotationSet.__getitem__": True,
"AnnotationSet.__len__": True,
}
class InvalidOffsetError(KeyError):
"""
Error that indicates some invalid offset in an operation.
"""
pass
class AnnotationSet:
"""
Represents a collection of annotations for a document.
"""
def __init__(self, name: str = "", owner_doc=None):
"""
Creates an annotation set. This should not be used directly by the
user, instead the method `Document.annset(name)` should be used to
access the annotation set with a given name from the document.
An annotation set contains an arbitrary number of annotations, which
can overlap in arbitrary ways. Each annotation set has a name and a
document can have as many named annotation sets as needed.
Args:
name: the name of the annotation set, default: the empty string
(default annotation set)
owner_doc: if this is set, the set and all sets created from it
can be queried for the owning document and offsets get checked
against the text of the owning document, if it has text.
Also, the changelog is only updated if an annotation
set has an owning document.
"""
self._name = name
self._owner_doc = owner_doc
self._index_by_offset = None
self._index_by_ol = None
self._index_by_type = None
# internally we represent the annotations as a map from
# annotation id (int) to Annotation
self._annotations = {} # map from annotation id to annotation
self._annset = set() # set containing the annotations itself based on the default hash implementation
self._is_immutable = False
self._next_annid = 0
@property
def name(self):
"""
Returns the name of the annotation set.
Note: the name of a set cannot be changed.
"""
return self._name
@property
def changelog(self):
"""
Returns the changelog or None if no changelog is set.
"""
if self._owner_doc is None:
return None
return self._owner_doc.changelog
def __setattr__(self, key, value):
"""
Prevent immutable fields from getting overridden, once they have been
set.
"""
if key == "name" or key == "owner_doc":
if self.__dict__.get(key, None) is None:
super().__setattr__(key, value)
else:
raise Exception(
"AnnotationSet attribute cannot get changed after being set"
)
else:
super().__setattr__(key, value)
def detach(self, restrict_to=None):
"""
Creates an immutable and detached copy of this set, optionally
restricted to the given annotation ids. A detached annotation
set does not have an owning document and deleting or adding
annotations does not change the annotations stored with the document.
However, the annotations in a detached annotation set
are the same as those stored in the attached set, so updating their
features will modify the annotations in the document as well.
Args:
restrict_to: an iterable of annotation ids, if None, all the
annotations from this set.
Returns:
an immutable annotation set
"""
annset = AnnotationSet(name="detached-from:" + self.name)
annset._is_immutable = True
if restrict_to is None:
annset._annotations = {
annid: self._annotations[annid] for annid in self._annotations.keys()
}
else:
annset._annotations = {
annid: self._annotations[annid] for annid in restrict_to
}
annset._annset.update(annset._annotations.values())
annset._next_annid = self._next_annid
return annset
def detach_from(self, anns: Iterable):
"""
Creates an immutable detached annotation set from the annotations
in anns which could by either a collection of annotations or
annotation ids (int numbers) which are assumed to be the annotation
ids from this set.
The next annotation id for the created set is the highest seen
annotation id from anns plus one.
Args:
anns: an iterable of annotations
Returns:
an immutable detached annotation set
"""
annset = AnnotationSet(name="detached-from:" + self.name)
annset._is_immutable = True
annset._annotations = {}
nextid = -1
for ann in anns:
if isinstance(ann, int):
annset._annotations[ann] = self._annotations[ann]
annid = ann
else:
annset._annotations[id] = ann
annid = ann.id
if annid > nextid:
nextid = annid
annset._next_annid = nextid + 1
annset._annset.update(annset._annotations.values())
return annset
@staticmethod
def create_from(anns: Union[Iterable[Annotation], Annotation], name=None) -> "AnnotationSet":
"""
Creates an immutable detached annotation set from the annotations
in anns. The set contains shallow copies of the annotations and the
annotation id is preserved, unless it is a duplicate in which the next
available id is used.
Args:
anns: an iterable of annotations or a single annotation
name: an optional name for the set
Returns:
An immutable detached annotation set
"""
annset = AnnotationSet(name=name)
annset._is_immutable = True
annset._annotations = {}
annset._next_annid = 0
if isinstance(anns, Annotation):
anns = [anns]
for ann in anns:
# if the id is already in the set, assign the next available one
ann = ann.copy()
if ann.id in annset._annotations:
ann._id = annset._next_annid
annset._annotations[annset._next_annid] = ann
annset._next_annid += 1
else:
# if the id is not yet in the set, keep it and make sure that after adding,
# the next annid is adapted, if necessary!
annset._annotations[ann.id] = ann
if ann.id >= annset._next_annid:
annset._next_annid = ann.id + 1
annset._annset.update(annset._annotations.values())
return annset
@property
def immutable(self) -> bool:
"""
Get or set the immutability of the annotation set. If it is
immutable, annotations cannot be added or removed from the set,
but the annotations themselves can still have their features modified.
All detached annotation sets are immutable when created,
but can be made mutable afterwards.
"""
return self._is_immutable
@immutable.setter
def immutable(self, val: bool) -> None:
self._is_immutable = val
def isdetached(self) -> bool:
"""
Returns True if the annotation set is detached, False otherwise.
"""
return self._owner_doc is None
def _create_index_by_offset(self) -> None:
"""
Generates the offset index, if it does not already exist.
The offset index is an interval tree that stores the annotation
ids for the offset interval of the annotation.
"""
if self._index_by_offset is None:
self._index_by_offset = SortedIntvls()
for ann in self._annotations.values():
self._index_by_offset.add(ann.start, ann.end, ann.id)
def _create_index_by_ol(self) -> None:
"""
Generates an index by start offset, end offset and annotation id
"""
if self._index_by_ol is None:
self._index_by_ol = SortedIntvls(by_ol=True)
for ann in self._annotations.values():
self._index_by_ol.add(ann.start, ann.end, ann.id)
def _create_index_by_type(self) -> None:
"""
Generates the type index, if it does not already exist.
The type index is a map from
annotation type to a set of all annotation ids with that type.
"""
if self._index_by_type is None:
self._index_by_type = defaultdict(set)
for ann in self._annotations.values():
self._index_by_type[ann.type].add(ann.id)
def _add_to_indices(self, annotation: Annotation) -> None:
"""
If we have created the indices, add the annotation to them.
Args:
annotation: the annotation to add to the indices.
annotation: Annotation:
"""
if self._index_by_type is not None:
self._index_by_type[annotation.type].add(annotation.id)
if self._index_by_offset is not None:
self._index_by_offset.add(annotation.start, annotation.end, annotation.id)
def _remove_from_indices(self, annotation: Annotation) -> None:
"""
Remove an annotation from the indices.
Args:
annotation: the annotation to remove.
"""
if self._index_by_offset is not None:
self._index_by_offset.remove(
annotation.start, annotation.end, annotation.id
)
if self._index_by_type is not None:
self._index_by_type[annotation.type].remove(annotation.id)
@staticmethod
def _intvs2idlist(intvs, ignore_id=None) -> List[int]:
"""
Convert an iterable of interval tuples (start, end, id) to a list of ids
Args:
intvs: iterable of interval tuples
ignore_id: (Default value = None) do not include this id
Returns:
list of ids
"""
if ignore_id is not None:
return [i[2] for i in intvs if i[2] != ignore_id]
else:
return [i[2] for i in intvs]
@staticmethod
def _intvs2idset(intvs, ignore_id=None) -> Set[int]:
"""
Convert an iterable of interval tuples (start, end, id) to a
set of ids
Args:
intvs: iterable of interval tuples
ignore_id: (Default value = None) do not include this id
Returns:
set of ids
"""
ret = set()
if ignore_id is not None:
for i in intvs:
if i[2] != ignore_id:
ret.add(i[2])
else:
for i in intvs:
ret.add(i[2])
return ret
def _restrict_intvs(self, intvs, ignore_id=None):
"""
Args:
intvs:
ignore_id: (Default value = None) do not include this id
"""
return self.detach(
restrict_to=AnnotationSet._intvs2idlist(intvs, ignore_id=ignore_id)
)
def __len__(self) -> int:
"""
Return number of annotations in the set.
"""
return len(self._annotations)
@property
def size(self) -> int:
"""
Returns the number of annotations in the annotation set.
"""
return len(self._annotations)
@property
def document(self):
"""
Returns the owning document, if set. If the owning document was not set, returns None.
"""
return self._owner_doc
@support_annotation_or_set
def _check_offsets(self, start: int, end: int) -> None:
"""
Checks the offsets for the given span/annotation against the document boundaries, if we know the owning
document and if the owning document has text.
Args:
start: start offset
end: end offset
"""
if self._owner_doc is None:
return
if self._owner_doc.text is None:
return
doc_size = len(self._owner_doc)
if start < 0:
raise InvalidOffsetError("Annotation starts before 0")
if end < 0:
raise InvalidOffsetError("Annotation ends before 0")
if start > end:
raise InvalidOffsetError("Annotation ends before it starts")
if start > doc_size:
raise InvalidOffsetError(
"Annotation starts after document ends: start={}, docsize={}".format(
start, doc_size
)
)
if end > doc_size:
raise InvalidOffsetError(
"Annotation ends after document ends: end={}, docsize={}".format(
end, doc_size
)
)
@property
def start(self):
"""
Returns the smallest start offset of all annotations, i.e the start
of the span of the whole set. This needs the index and creates
it if necessary.
Throws:
an exception if there are no annotations in the set.
"""
if self.size == 0:
raise Exception("Annotation set is empty, cannot determine start offset")
self._create_index_by_offset()
return self._index_by_offset.min_start()
@property
def end(self):
"""
Returns the end offset of the annotation set, i.e. the biggest end offset of any annotation.
This needs the index and creates it if necessary.
Throws:
an exception if there are no annotations in the set.
"""
if self.size == 0:
raise Exception("Annotation set is empty, cannot determine end offset")
self._create_index_by_offset()
return self._index_by_offset.max_end()
@property
def length(self):
"""
Returns the the length of the annotation set span.
Throws:
an exception if there are no annotations in the set.
"""
return self.end - self.start
@allowspan
def add(
self,
start: int,
end: int,
anntype: str,
features: Dict[str, Any] = None,
annid: int = None,
):
"""
Adds an annotation to the set.
Once an annotation has been added,
the start and end offsets,
the type, and the annotation id of the annotation are immutable.
If an annotation id is specified that already exists in the set, an
exception is raised.
Args:
start: start offset
end: end offset
anntype: the annotation type
features: a map, an iterable of tuples or an existing feature map.
In any case, the features are used
to create a new feature map for this annotation. If the map
is empty or this parameter is None, the
annotation does not store any map at all.
annid: the annotation id, if not specified the next free one
for this set is used. NOTE: the id should
normally left unspecified and get assigned automatically.
Returns:
the new annotation
"""
if annid is not None and not isinstance(annid, int):
raise Exception("Parameter annid must be an int, mixed up with features?")
if features is not None and isinstance(features, int):
raise Exception(
"Parameter features must not be an int: mixed up with annid?"
)
if self._is_immutable:
raise Exception("Cannot add an annotation to an immutable annotation set")
self._check_offsets(start, end)
if annid and annid in self._annotations:
raise Exception(
"Cannot add annotation with id {}, already in set".format(annid)
)
if annid is None:
annid = self._next_annid
self._next_annid = self._next_annid + 1
ann = Annotation(start, end, anntype, features=features, annid=annid)
ann._owner_set = self
if not self._annotations:
self._annotations = {}
self._annotations[annid] = ann
self._annset.add(ann)
self._add_to_indices(ann)
if self.changelog is not None:
entry = {
"command": "annotation:add",
"set": self.name,
"start": ann.start,
"end": ann.end,
"type": ann.type,
"features": ann._features.to_dict(),
"id": ann.id,
}
self.changelog.append(entry)
return ann
def add_ann(self, ann, annid: int = None):
"""
Adds a shallow copy of the given ann to the annotation set,
either with a new annotation id or with the one given.
Args:
ann: the annotation to copy into the set
annid: the annotation id, if not specified the next free one for
this set is used. Note: the id should normally left unspecified
and get assigned automatically.
Returns:
the added annotation
"""
return self.add(ann.start, ann.end, ann.type, ann.features, annid=annid)
# TODO/NOTE: Iterable[Annotation] with Iterable from collections.abc is not possible here prior to Python 3.9
# instead, Iterable must come from typing
def update(self, anns: Iterable[Annotation], annid_from_ann=False):
"""
Adds shallow copies of all annotations from the iterable to the set.
Args:
anns: an iterable of Annotations
annid_from_ann: if True, use the same annotation id as in the annotation, this will raise
an exception if the set already contains and annotation with this id.
If False assign a new id to the added annotation.
"""
for ann in anns:
if annid_from_ann:
self.add(ann.start, ann.end, ann.type, ann.features, annid=ann.id)
else:
self.add(ann.start, ann.end, ann.type, ann.features)
def add_anns(self, anns: Iterable[Annotation], annid_from_ann=False):
"""
DEPRECATED: same as update.
Args:
anns: an iterable of Annotations
annid_from_ann: if True, use the same annotation id as in the annotation, this will raise
an exception if the set already contains and annotation with this id.
If False assign a new id to the added annotation.
"""
self.update(anns, annid_from_ann=annid_from_ann)
def remove(
self, annoriter: Union[int, Annotation, Iterable], raise_on_notexisting=True
) -> None:
"""
Removes the given annotation which is either the id or the annotation
instance or recursively all annotations in the iterable.
Throws:
exception if the annotation set is immutable or the annotation
is not in the set
Args:
annoriter: either the id (int) or the annotation instance
(Annotation) or an iterable of
id or annotation instance or iterable ...
raise_on_notexisting: (default: True) if false, silently accepts
non-existing annotations/ids and does nothing.
Note: if this is True, but the annotation set is immutable,
an Exception is still raised.
"""
if self._is_immutable:
raise Exception(
"Cannot remove an annotation from an immutable annotation set"
)
if isinstance(annoriter, abc_Iterable):
for a in annoriter:
self.remove(a, raise_on_notexisting=raise_on_notexisting)
return
annid = None # make pycharm happy
if isinstance(annoriter, int):
annid = annoriter
if annid not in self._annotations:
raise Exception(
"Annotation with id {} not in annotation set, cannot remove".format(
annid
)
)
ann = self._annotations[annid]
elif isinstance(annoriter, Annotation):
annid = annoriter.id
if annid not in self._annotations:
raise Exception(
"Annotation with id {} does not belong to this set, cannot remove".format(
annid
)
)
ann = annoriter
else:
raise Exception("Should never happen!")
# NOTE: once the annotation has been removed from the set, it could
# still be referenced
# somewhere else and its features could get modified. In order to
# prevent logging of such changes,
# the owning set gets cleared for the annotation
ann._owner_set = None
del self._annotations[annid]
self._annset.remove(ann)
if self.changelog is not None:
self.changelog.append(
{"command": "annotation:remove", "set": self.name, "id": annid}
)
self._remove_from_indices(ann)
def clear(self, reset_annids=False) -> None:
"""
Removes all annotations from the set.
Args:
reset_annids: if True, also reset the next annotation id to 0, after this newly added annotations
will get annotation ids starting from 0. IMPORTANT: this must not be used for code to run in the
Java GATE Python plugin, as Java GATE handles annotation ids differently!
"""
self._annotations.clear()
self._annset.clear()
if reset_annids:
self._next_annid = 0
self._index_by_offset = None
self._index_by_type = None
if self.changelog is not None:
self.changelog.append({"command": "annotations:clear", "set": self.name})
def clone_anns(self, memo=None):
"""
Replaces the annotations in this set with deep copies of the
originals. If this is a detached set,
then this makes sure that any modifications to the annotations do not
affect the original annotations
in the attached set. If this is an attached set, it makes sure that
all other detached sets cannot affect
the annotations in this set any more. The owning set of the
annotations that get cloned is cleared.
Args:
memo: for internal use by our __deepcopy__ implementation.
"""
tmpdict = {}
for annid, ann in self._annotations.items():
newann = copy.deepcopy(ann, memo=memo)
ann._owner_set = None
tmpdict[annid] = newann
for annid, ann in tmpdict.items():
self._annset.remove(self._annotations[annid])
self._annotations[annid] = ann
self._annset.add(ann)
def __copy__(self):
"""
NOTE: creating a copy always creates a detached set, but a mutable one.
"""
c = self.detach()
c._is_immutable = False
return c
def copy(self):
"""
Returns a shallow copy of the annotation set.
"""
return self.__copy__()
def __deepcopy__(self, memo=None):
if memo is None:
memo = {}
c = self.detach()
c._is_immutable = False
c.clone_anns(memo=memo)
return c
def deepcopy(self):
"""
Returns a deep copy of the annotation set.
"""
return copy.deepcopy(self)
def __iter__(self) -> Iterator:
"""
Yields all the annotations of the set.
Important: using the iterator will always create the index if it
is not already there!
For fast iteration use fast_iter() which does not allow sorting or
offset ranges.
Yields:
the annotations in document order
"""
# return iter(self._annotations.values())
return self.iter()
def fast_iter(self) -> Generator:
"""
Yields annotations in insertion order. This is faster then the
default iterator and does not
need to index (so if the index does not exist, it will not be built).
"""
if self._annotations:
for annid, ann in self._annotations.items():
yield ann
def iter(
self,
start_ge: Union[int, None] = None,
start_lt: Union[None, int] = None,
with_type: str = None,
reverse: bool = False,
) -> Generator:
"""
Default iterator.
Yields annotations ordered by increasing starting annotation offset and increasing annotation id,
otionally limited by the other parameters.
Args:
start_ge: the offset from where to start including annotations
start_lt: the last offset to use as the starting offset of an annotation
with_type: only annotations of this type
reverse: process in reverse document order
Yields:
Annotations in default document order, or reverse document order
"""
if with_type is not None:
allowedtypes = set()
if isinstance(type, str):
allowedtypes.add(with_type)
else:
for atype in with_type:
allowedtypes.add(atype)
else:
allowedtypes = None
if not self._annotations:
return
maxoff = None
if start_ge is not None:
assert start_ge >= 0
if start_lt is not None:
assert start_lt >= 1
maxoff = start_lt + 1
if start_lt is not None and start_ge is not None:
assert start_lt > start_ge
self._create_index_by_offset()
for _start, _end, annid in self._index_by_offset.irange(
minoff=start_ge, maxoff=maxoff, reverse=reverse
):
if (
allowedtypes is not None
and self._annotations[annid].type not in allowedtypes
):
continue
yield self._annotations[annid]
def iter_ol(
self,
start_ge: Union[int, None] = None,
start_lt: Union[None, int] = None,
with_type: str = None,
reverse: bool = False,
) -> Generator:
"""
Offset-Length Iterator.
Yields annotations ordered by increasing start offset, by increasing end offset
and increasing annotoation id, otionally limited
by the other parameters.
Args:
start_ge: the offset from where to start including annotations
start_lt: the last offset to use as the starting offset of an annotation
with_type: only annotations of this type
reverse: process in reverse document order
Yields:
Annotations ordered by offset and length.
"""
if with_type is not None:
allowedtypes = set()
if isinstance(type, str):
allowedtypes.add(with_type)
else:
for atype in with_type:
allowedtypes.add(atype)
else:
allowedtypes = None
if not self._annotations:
return
maxoff = None
if start_ge is not None:
assert start_ge >= 0
if start_lt is not None:
assert start_lt >= 1
maxoff = start_lt + 1
if start_lt is not None and start_ge is not None:
assert start_lt > start_ge
self._create_index_by_ol()
for _start, _end, annid in self._index_by_ol.irange(
minoff=start_ge, maxoff=maxoff, reverse=reverse
):
if (
allowedtypes is not None
and self._annotations[annid].type not in allowedtypes
):
continue
yield self._annotations[annid]
def reverse_iter(self, **kwargs):
"""
Same as iter, but with the reverse parameter set to true.
Args:
kwargs: Same as for iter(), with revers=True fixed.
**kwargs: will get passed on the Annotation.iter
Returns:
same result as iter()
"""
return self.iter(reverse=True, **kwargs)
def get(
self, annid: Union[int, Annotation], default=None
) -> Union[Annotation, None]:
"""
Gets the annotation with the given annotation id or returns the given default.
NOTE: for handling cases where legacy code still expects the add method to return
an id and not the annotation, this will accept an annotation so the the frequent
pattern still works:
annid = annset.add(b,e,t).id
ann = annset.get(annid)
If an annotation is passed the annotation from the set with the id of that annotation is
returned, if the annotation is from that set, this will return the same object, if it is
still in the set (or return the default value).
Args:
annid: the annotation id of the annotation to retrieve.
default: what to return if an annotation with the given id is not
found. (Default value = None)
annid: Union[int:
Annotation]:
Returns:
the annotation or the default value.
"""
if isinstance(annid, Annotation):
annid = annid.id
return self._annotations.get(annid, default)
def first(self):
"""
Return the first (or only) annotation in the set by offset.
Returns:
first annotation
"""
sz = len(self._annotations)
if sz == 0:
raise Exception("Empty set, there is no first annotation")
elif sz == 1:
return next(iter(self._annotations.values()))
self._create_index_by_offset()
_, _, annid = next(self._index_by_offset.irange(reverse=False))
return self._annotations[annid]
def last(self):
"""
Return the last (or only) annotation by offset.
Returns:
last annotation
"""
sz = len(self._annotations)
if sz == 0:
raise Exception("Empty set, there is no last annotation")
elif sz == 1:
return next(iter(self._annotations.values()))
self._create_index_by_offset()
_, _, annid = next(self._index_by_offset.irange(reverse=True))
return self._annotations[annid]
def for_idx(self, idx, default=None):
"""
Return the annotation corresponding to the index idx in the set.
This returns the
annotation stored at the index, as added to the set. The order usually
depends on the insertion time.
If no annotation with the given index is specified, the value
specified for `default` is returned.
Args:
idx: index of the annotation in the set
default: default value to return if now annotation with the given index exists
Returns:
the annotation with the given index or the default value
"""
# TODO: we could make this more memory efficient (but slower) by
# iterating over values until getting idxth
tmplist = list(self._annotations.values())
if idx < len(tmplist):
return tmplist[idx]
else:
return default
def __getitem__(self, annid):
"""
Gets the annotation with the given annotation id or throws an exception.
Args:
annid: the annotation id
Returns:
annotation
"""
return self._annotations[annid]
def with_type(self, *anntype: Union[str, Iterable], non_overlapping: bool = False):
"""
Gets annotations of the specified type(s).
Creates the type index if necessary.
Args:
anntype: one or more types or type lists. The union of all types
specified that way is used to filter the annotations. If no type
is specified, an empty detached set is returned.
non_overlapping: if True, only return annotations of any of the
given types which do not overlap with other annotations. If
there are several annotations that start at
the same offset, use the type that comes first in the
parameters, if there are more than one of that type, use the
one that would come first in the usual sort order.
Returns:
a detached immutable annotation set with the matching annotations.
"""
atypes = []
for atype in anntype:
if isinstance(atype, str):
atypes.append(atype)
else:
for t in atype:
atypes.append(t)
if not atypes:
return self.detach(restrict_to=[])
self._create_index_by_type()
annids = set()
for t in atypes:
idxs = self._index_by_type.get(t)
if idxs:
annids.update(idxs)
if non_overlapping:
# need to get annotations grouped by start offset and sorted according to
# what the Annotation class defines
allanns = sorted(annids, key=lambda x: self._annotations[x])
allanns = [self._annotations[x] for x in allanns]
allannsgrouped = []
curstart = None
curset = None
for ann in allanns:
if curstart is None:
curset = [ann]
curstart = ann.start
elif curstart == ann.start:
curset.append(ann)
else:
allannsgrouped.append(curset)
curset = [ann]
curstart = ann.start
if curset:
allannsgrouped.append(curset)
retanns = []
# now go through all the grouped annoations and select the top priority one
# then skip to the next group that does not overlap with the one we just selected
typepriority = dict()
for i, atype in enumerate(atypes):
typepriority[atype] = len(atypes) - i
curminoffset = 0
for group in allannsgrouped:
# instead of sorting, go through the group and find the top priority one
topann = None
if len(group) == 1:
if group[0].start >= curminoffset:
topann = group[0]
elif len(group) == 0:
raise Exception("We should never get a 0 size group here!")
else:
i = 0
for i, ann in enumerate(group):
if ann.start >= curminoffset:
topann = ann
break
for ann in group[i + 1:]:
if ann.start < curminoffset:
continue
if typepriority[ann.type] > typepriority[topann.type]:
topann = ann
elif typepriority[ann.type] == typepriority[topann.type]:
if ann.end > topann.end:
topann = ann
elif ann.end == topann.end:
if ann.id > topann.id:
topann = ann
if topann is not None:
retanns.append(topann)
curminoffset = topann.end
annids = [ann.id for ann in retanns]
return self.detach(restrict_to=annids)
def by_offset(self):
"""
Yields lists of annotations which start at the same offset.
"""
self._create_index_by_offset()
lastoff = -1
curlist = []
for ann in self.iter():
if ann.start != lastoff:
if lastoff != -1:
yield curlist
lastoff = ann.start
curlist = [ann]
else:
curlist.append(ann)
if lastoff != -1:
yield curlist
def by_span(self):
"""
Yields list of annotations with identical spans. Note: first needs
to sort all annotations!
"""
self._create_index_by_offset()
lastsoff = -1
lasteoff = -1
curlist = []
for ann in self.iter_ol():
if ann.start != lastsoff or ann.end != lasteoff:
if lastsoff != -1:
yield curlist
lastsoff = ann.start
lasteoff = ann.end
curlist = [ann]
else:
curlist.append(ann)
if lastsoff != -1:
yield curlist
@property
def type_names(self) -> KeysView[str]:
"""
Gets the names of all types in this set. Creates the type index
if necessary.
"""
self._create_index_by_type()
return self._index_by_type.keys()
@support_annotation_or_set
def startingat(
self, start: int, _end: Any = None, ann=None, include_self: bool = False
):
"""
Gets all annotations starting at the given offset (empty if none) and
returns them in a detached annotation set.
Note: this can be called with an annotation or annotation set instead
of the start offset. If called with an annotation, this annotation is
not included in the result set if `include_self` is `False`
Args:
start: the offset where annotations should start
_end: unused/ignored end offset
ann: any annotation that was specified instead of just the offset
include_self: should annotation passed be included in the result
Returns:
detached annotation set of matching annotations
"""
self._create_index_by_offset()
intvs = self._index_by_offset.starting_at(start)
if not include_self and ann is not None and ann in self:
ignore_id = ann.id
else:
ignore_id = None
return self._restrict_intvs(intvs, ignore_id=ignore_id)
@support_annotation_or_set
def start_min_ge(
self, offset: int, _end: Any = None, ann: Optional["Annotation"] = None, include_self: bool = False
):
"""Gets all annotations starting at the first possible offset
at or after the given offset and returns them in an immutable
annotation set.
Args:
offset: The offset
_end: unused/ignored end offset
ann: any Annotation that was passed
include_self: should annotation passed be included in the result
Returns:
annotation set of matching annotations
"""
self._create_index_by_offset()
intvs = self._index_by_offset.starting_from(offset)
# now select only those first ones which all have the same offset
if not include_self and ann is not None and ann in self:
ignore_id = ann.id
else:
ignore_id = None
retids = set()
startoff = None
for intv in intvs:
if startoff is None:
startoff = intv[0]
if ignore_id is not None:
if ignore_id != intv[2]:
retids.add(intv[2])
else:
retids.add(intv[2])
elif startoff == intv[0]:
if ignore_id is not None:
if ignore_id != intv[2]:
retids.add(intv[2])
else:
retids.add(intv[2])
else:
break
return self.detach(restrict_to=retids)
@support_annotation_or_set
def start_ge(self, start: int, _end: Any = None, ann: Optional["Annotation"] = None,
include_self: bool = False):
"""
Return the annotations that start at or after the given start offset.
Args:
start: Start offset
_end: unusued/ignored end offset
ann: any Annotation passed
include_self: should annotation passed be included in the result
Returns:
an immutable annotation set of the matching annotations
"""
self._create_index_by_offset()
intvs = self._index_by_offset.starting_from(start)
if not include_self and ann is not None and ann in self:
ignore_id = ann.id
else:
ignore_id = None
return self._restrict_intvs(intvs, ignore_id=ignore_id)
@support_annotation_or_set
def start_lt(self, offset: int, _end: Any = None, ann: Any = None):
"""
Returns the annotations that start before the given offset
(or annotation). This also accepts an annotation or set.
Args:
offset: offset before which the annotations should start
_end: unused/ignored end offset
ann: unised/ignored Annotation passed (can never be included!)
Returns:
an immutable annotation set of the matching annotations
"""
self._create_index_by_offset()
intvs = self._index_by_offset.starting_before(offset)
return self._restrict_intvs(intvs)
@support_annotation_or_set
def overlapping(self, start: int, end: int, ann: Optional["Annotation"] = None, include_self: bool = False):
"""
Gets annotations overlapping with the given span. Instead of the
start and end offsets,
also accepts an annotation or annotation set.
For each annotation ann in the result set, ann.overlapping(span)
is True
Args:
start: start offset of the span
end: end offset of the span
ann: the annotation that is passed to this function for checking if it is included in the result.
include_self: if True and the annotation for the span is given,
do not include that annotation in the result set.
Returns:
an immutable annotation set with the matching annotations
"""
self._create_index_by_offset()
intvs = self._index_by_offset.overlapping(start, end)
if not include_self and ann is not None and ann in self:
ignore_id = ann.id
else:
ignore_id = None
return self._restrict_intvs(intvs, ignore_id=ignore_id)
@support_annotation_or_set
def covering(self, start: int, end: int, ann: Optional["Annotation"] = None, include_self: bool = False):
"""
Gets the annotations which contain the given offset range
(or annotation/annotation set), i.e. annotations such that the given
offset range is within the annotation.
For each annotation ann in the result set, ann.covering(span) is True.
Args:
start: the start offset of the span
end: the end offset of the span
ann: the annotation representing the span. (Default value = None)
include_self: if True and the annotation for the span is given,
do not include that annotation in the result set. (Default value = False)
Returns:
an immutable annotation set with the matching annotations, if any
"""
self._create_index_by_offset()
intvs = self._index_by_offset.covering(start, end)
if not include_self and ann is not None and ann in self:
ignore_id = ann.id
else:
ignore_id = None
return self._restrict_intvs(intvs, ignore_id=ignore_id)
@support_annotation_or_set
def within(self, start: int, end: int, ann: Optional["Annotation"] = None, include_self: bool = False):
"""
Gets annotations that fall completely within the given offset range,
i.e. annotations such that the offset range is covering each of the
annotation.
For each annotation ann in the result set, ann.within(span) is True.
Args:
start: start offset of the range
end: end offset of the range
ann: the annotation representing the span. (Default value = None)
include_self: if True and the annotation for the span is given,
do not include that annotation in the result set. (Default value = False)
Returns:
an immutable annotation set with the matching annotations
"""
if start > end:
raise Exception("Invalid offset range: {},{}".format(start, end))
else:
self._create_index_by_offset()
intvs = self._index_by_offset.within(start, end)
if not include_self and ann is not None and ann in self:
ignore_id = ann.id
else:
ignore_id = None
return self._restrict_intvs(intvs, ignore_id=ignore_id)
@support_annotation_or_set
def coextensive(self, start: int, end: int, ann: Optional["Annotation"] = None, include_self: bool = False):
"""
Returns a detached annotation set with all annotations that start and
end at the given offsets.
For each annotation ann in the result set, ann.coextensive(span) is True.
Args:
start: start offset of the span
end: end offset of the span
ann: the annotation representing the span. (Default value = None)
include_self: if True and the annotation for the span is given,
do not include that annotation in the result set.
Returns:
annotation set with all annotations that have the same start
and end offsets.
"""
self._create_index_by_offset()
intvs = self._index_by_offset.at(start, end)
if not include_self and ann is not None and ann in self:
ignore_id = ann.id
else:
ignore_id = None
return self._restrict_intvs(intvs, ignore_id=ignore_id)
@support_annotation_or_set
def before(
self, start: int, end: int, ann: Optional["Annotation"] = None,
include_self: bool = False, immediately: bool = False
):
"""
Returns a detached annotation set with all annotations that end
before the given offsets.
For each annotation ann in the result set, ann.isbefore(span) is True.
Args:
start: start offset of the span
end: end offset of the span
ann: the annotation representing the span. (Default value = None)
include_self: if True and the annotation id for the span is given,
do not include that annotation in the result set.
immediately: if True, the end offset of the annotations return
must coincide with the start offset of the span (default=False)
Returns:
annotation set with all annotations that end before the given span
"""
self._create_index_by_offset()
if immediately:
intvs = self._index_by_offset.ending_at(start)
else:
intvs = self._index_by_offset.ending_to(start)
# we need to filter self if self is zero-length!
if not include_self and ann is not None and ann in self:
ignore_id = ann.id
else:
ignore_id = None
return self._restrict_intvs(intvs, ignore_id=ignore_id)
@support_annotation_or_set
def after(
self, start: int, end: int, ann: Optional["Annotation"] = None,
include_self: bool = False, immediately: bool = False
):
"""
Returns a detached annotation set with all annotations that start
after the given span.
For each annotation ann in the result set, ann.isafter(span) is True.
Args:
start: start offset of the span
end: end offset of the span
ann: the annotation representing the span. (Default value = None)
include_self: if True and the annotation id for the span is given,
do not include that annotation in the result set.
immediately: if True, the start offset of the annotations
returned must coincide with the end offset of the span (default=False)
Returns:
annotation set with all annotations that start after the given span
"""
self._create_index_by_offset()
if immediately:
intvs = self._index_by_offset.starting_at(end)
else:
intvs = self._index_by_offset.starting_from(end)
# we need to filter self if self is zero-length!
if not include_self and ann is not None and ann in self:
ignore_id = ann.id
else:
ignore_id = None
return self._restrict_intvs(intvs, ignore_id=ignore_id)
@property
def span(self) -> Span:
"""
Returns a tuple with the start and end offset the corresponds to the
smallest start offset of any annotation
and the largest end offset of any annotation.
(Builds the offset index)
"""
if len(self._annotations) == 0:
return Span(0, 0)
self._create_index_by_offset()
return Span(self._index_by_offset.min_start(), self._index_by_offset.max_end())
def __contains__(self, annorannid: Union[int, Annotation]) -> bool:
"""
Provides 'annotation in annotation_set' functionality.
Args:
annorannid: the annotation instance or annotation id to check. If this is an id, it is checked if the
id appears in this AnnotationSet. If this is an annotation instance, then True is returned if the
Annotation does have an owning set and the owning set is this AnnotationSet and the annotation id
is present in this AnnotatioSet
Returns:
`True` if the annotation exists in the set, `False` otherwise
"""
if isinstance(annorannid, Annotation):
return annorannid in self._annset
return (
annorannid in self._annotations
) # On the off chance someone passed an ID in directly
contains = __contains__
def __repr__(self) -> str:
"""
Returns the string representation of the set.
"""
return "AnnotationSet({})".format(repr(list(self.iter())))
def to_dict(self, anntypes=None, **kwargs):
"""
Convert an annotation set to its dict representation.
Args:
anntypes: if not None, an iterable of annotation types to include
**kwargs: passed on to the dict creation of contained annotations.
Returns:
the dict representation of the annotation set.
"""
if anntypes is not None:
anntypesset = set(anntypes)
anns_list = list(
val.to_dict(**kwargs)
for val in self._annotations.values()
if val.type in anntypesset
)
else:
anns_list = list(
val.to_dict(**kwargs) for val in self._annotations.values()
)
return {
# NOTE: Changelog is not getting added as it is stored in the document part!
"name": self.name,
"annotations": anns_list,
"next_annid": self._next_annid,
}
@staticmethod
def from_dict(dictrepr, owner_doc=None, **kwargs):
"""
Create an AnnotationSet from its dict representation and optionally
set the owning document.
Args:
dictrepr: the dict representation of the annotation set
owner_doc: the owning document
**kwargs: passed on to the creation of annotations
Returns:
the annotation set
"""
annset = AnnotationSet(dictrepr.get("name"), owner_doc=owner_doc)
annset._next_annid = dictrepr.get("next_annid")
if dictrepr.get("annotations"):
annset._annotations = dict(
(int(a["id"]), Annotation.from_dict(a, owner_set=annset, **kwargs))
for a in dictrepr.get("annotations")
)
annset._annset.update(annset._annotations.values())
else:
annset._annotations = {}
return annset
@staticmethod
def from_anns(anns, deep_copy=False, **kwargs):
"""
Create a detached AnnotationSet from an iterable of annotations.
Args:
anns: an iterable of annotations
deep_copy: if the annotations should get added as copies
(default) or deep copies.
Returns:
the annotation set
"""
annset = AnnotationSet(name="", owner_doc=None)
annset._annotations = dict()
maxid = 0
for ann in anns:
if deep_copy:
addann = ann.deepcopy()
else:
addann = ann.copy()
annset._annotations[addann.id] = addann
if addann.id > maxid:
maxid = addann.id
annset._next_annid = maxid
annset._is_immutable = True
return annset
def _update_offsets(self, id, start, end):
"""
In-place update the offset of the annotation with the given id. THIS IS FOR INTERNAL USE ONLY!
Using this method can lead to many different kinds of hard to debug and surprising bugs!
NOTE: this only updates the by offset index if it already exists. If the offsets are both
are unchanged, this is a NOOP.
Args:
id: id of the annotation to change
start: new start offset
end: new end offset
"""
ann = self._annotations[id]
if ann.start == start and ann.end == end:
return # nothing to do really
# print(f"DEBUG: updating offset for {id} from {ann.start},{ann.end} to {start},{end}")
if self._index_by_offset is not None:
self._index_by_offset.remove(
ann.start, ann.end, ann.id
)
ann._update_offsets(start, end)
if self._index_by_offset is not None:
self._index_by_offset.add(ann.start, ann.end, ann.id)
def _edit_anns(self, edits, affected_strategy):
"""
Edit helper method: takes a list of edits and returns two values: a dictionary annid->(start,end) of
new offset spans for all annotations that remain in the set, and a set of annids for annotations that
have to get deleted.
Args:
edits: the edit(s) to carry out
Returns:
anns: dictionary mapping annotation ids to pairs start,end of new offsets for that annotation
anns2delete: set of annotation ids to delete as a aresult of the edits
"""
# convert the list of edits into a list of lists [startoff, endoff, len, startlist, endlist]
# where the lists will contain later the ids of annotation starting/ending within that span
# This also makes sure that if the edits are mutable, we do not change them in any way
edits = [[l[0], l[1], len(l[2]) if isinstance(l[2], str) else l[2], [], []] for l in edits]
# sort the edits by ending, then starting offsets: since we operate from start to end, as soon as
# processing has moved past some offset, the annotations before that offset do not need to get
# updated any more.
edits.sort(key=lambda x: (x[1], x[0]), reverse=False)
# optimization: instead of recalculating relevant overlaps after each edit, calculate
# them beforehand
# For each edit, add the ids of annotations that start/end in that interval to the start/end lists
# of the edit tuple. Also collect the ids of those annotations in sets for starting or ending, starting, ending
# in any edit. This is necessary because offset adaptions necessary for annotations starting/ending in
# a span need to get handled different from offset adaptations for all other annotations
self._create_index_by_offset()
affectedids_start = set()
affectedids_end = set()
for edit in edits:
# find all annotations which start or end within the span of the edit (or both)
sintvs = self._index_by_offset.starting_within(edit[0], edit[1])
eintvs = self._index_by_offset.ending_within(edit[0], edit[1])
for intv in sintvs:
if affected_strategy != "delete":
edit[3].append(intv[2])
affectedids_start.add(intv[2])
for intv in eintvs:
if affected_strategy != "delete":
edit[4].append(intv[2])
affectedids_end.add(intv[2])
# Any changes of offsets or deletions are not carried out until the very end. For this we
# keep a dictionary with all the annotations id to [start,end] and a set of annotations to delete
anns = {ann.id: [ann.start, ann.end] for ann in self._annotations.values()}
anns2delete = set()
# also keep sorted lists of annid by start and end offset, but only for non-affected annotations (outside of
# edits). The offset is accessed from anns because it can change during the process
idsbystart = []
idsbyend = []
# we are getting the annotations in offset order, so the two lists we create are also in starting
# offset order, for the first list this is what we need
for ann in self.iter():
annid = ann.id
if annid not in affectedids_start:
idsbystart.append(annid)
if annid not in affectedids_end:
idsbyend.append(annid)
# sort the idsbyend list by end offset
idsbyend.sort(key=lambda x: anns[x][1], reverse=False)
# ptr_start/end are indices into the start2ids/end2ids lists: pointing to the first entry for which
# annotation offsets still need to get adapted
ptr_start = None
ptr_end = None
if len(idsbystart) > 0:
ptr_start = 0
if len(idsbyend) > 0:
ptr_end = 0
for idx in range(len(edits)):
edit = edits[idx]
editfrom, editto, editlen, edit_sanns, edit_eanns = edit
newlen = len(edit[2]) if isinstance(edit[2], str) else edit[2]
oldlen = editto - editfrom
delta = newlen - oldlen
editto_new = editto + delta
# in order to process this span we need to do this:
# - adapt all affected annotations, i.e. annotations which start or end in this span,
# according to the strategy.
# - change the offsets of all annotations after this edit if the length of the span changed
# - also change the offsets of all edits after this edit if the length of the span changed
for annid in edit_sanns: # all the ids of anns starting in this edit
if affected_strategy == "delete_all":
if annid in anns:
anns2delete.add(annid)
del anns[annid]
elif affected_strategy == "adapt":
anns[annid][0] = editfrom
elif affected_strategy == "keepadapt":
if anns[annid][0] > editto:
anns[annid][0] = editfrom
for annid in edit_eanns: # all the ids of anns ending in this edit
if affected_strategy == "delete_all":
if annid in anns:
anns2delete.add(annid)
del anns[annid]
elif affected_strategy == "adapt":
anns[annid][1] = editto_new
elif affected_strategy == "keepadapt":
if anns[annid][1] > editto_new:
anns[annid][1] = editto_new
if delta != 0:
for idx2 in range(idx+1, len(edits)):
otheredit = edits[idx2]
otheredit[0] += delta
otheredit[1] += delta
while anns[idsbystart[ptr_start]][0] < editto:
ptr_start += 1
if ptr_start >= len(idsbystart):
ptr_start = None
break
# adapt all the annotations
if ptr_start is not None:
for idx2 in range(ptr_start, len(idsbystart)):
annid = idsbystart[idx2]
if annid not in anns2delete:
# print(f"DEBUG: update start for {annid} from {anns[annid][0]} by {delta}")
anns[annid][0] += delta
# find the first annotation that ends at or after the current edit
while anns[idsbyend[ptr_end]][1] <= editto:
ptr_end += 1
if ptr_end >= len(idsbyend):
ptr_end = None
break
# adapt all the annotations
if ptr_end is not None:
for idx2 in range(ptr_end, len(idsbyend)):
annid = idsbyend[idx2]
if annid not in anns2delete:
# print(f"DEBUG: update end for {annid} from {anns[annid][0]} by {delta}")
anns[annid][1] += delta
return anns, anns2delete
def _edit(self, edits, affected_strategy="keepadapt"):
"""
Carry out one or more edits. If edits is a tuple of length 3 with the first element not being iterable,
assume it is a single edit, Otherwise assume it is an iterable of edits.
An edit is a tuple (start, end, intorstring) giving the old offset range and either the string which
replaces that range or the length that replaces that range. NOTE: no two edit offset ranges may
overlap, if ranges do overlap, this method may raise an exception or silently perform unexpected
and terrible changes. The method does not check for edit spans to not overlap!
This method adapts the offsets of all annotations after the affected span, if an annotation begins or
ends within an affected span, what happens depends on the affected_strategy:
delete_all: remove any annotation where the start and/or end offset lies between the from/to offsets of
the edit
adapt: any start and/or end offset in between from/to is changed to the from or to offset
keepadapt: any start and/or end offset in between is left unchanged if that offset still exists in the
new span, otherwise adapted to from/to.
Args:
edits: single edit or iterable of edits
affected_strategy: one of the following strategies: delete, adapt, keepadapt
"""
if isinstance(edits, tuple) and not isinstance(edits[0], Iterable):
edits = [edits]
anns, anns2delete = self._edit_anns(edits, affected_strategy)
# now delete all annotations to be delete
for annid in anns2delete:
# print(f"DEBUG: removing annotation {self[annid]}")
self.remove(annid)
# and adapt all annotation offsets, if necessary
for annid in anns:
start, end = anns[annid]
self._update_offsets(annid, start, end)
Classes
class AnnotationSet (name: str = '', owner_doc=None)
-
Represents a collection of annotations for a document.
Creates an annotation set. This should not be used directly by the user, instead the method
Document.annset(name)
should be used to access the annotation set with a given name from the document.An annotation set contains an arbitrary number of annotations, which can overlap in arbitrary ways. Each annotation set has a name and a document can have as many named annotation sets as needed.
Args
name
- the name of the annotation set, default: the empty string (default annotation set)
owner_doc
- if this is set, the set and all sets created from it can be queried for the owning document and offsets get checked against the text of the owning document, if it has text. Also, the changelog is only updated if an annotation set has an owning document.
Expand source code
class AnnotationSet: """ Represents a collection of annotations for a document. """ def __init__(self, name: str = "", owner_doc=None): """ Creates an annotation set. This should not be used directly by the user, instead the method `Document.annset(name)` should be used to access the annotation set with a given name from the document. An annotation set contains an arbitrary number of annotations, which can overlap in arbitrary ways. Each annotation set has a name and a document can have as many named annotation sets as needed. Args: name: the name of the annotation set, default: the empty string (default annotation set) owner_doc: if this is set, the set and all sets created from it can be queried for the owning document and offsets get checked against the text of the owning document, if it has text. Also, the changelog is only updated if an annotation set has an owning document. """ self._name = name self._owner_doc = owner_doc self._index_by_offset = None self._index_by_ol = None self._index_by_type = None # internally we represent the annotations as a map from # annotation id (int) to Annotation self._annotations = {} # map from annotation id to annotation self._annset = set() # set containing the annotations itself based on the default hash implementation self._is_immutable = False self._next_annid = 0 @property def name(self): """ Returns the name of the annotation set. Note: the name of a set cannot be changed. """ return self._name @property def changelog(self): """ Returns the changelog or None if no changelog is set. """ if self._owner_doc is None: return None return self._owner_doc.changelog def __setattr__(self, key, value): """ Prevent immutable fields from getting overridden, once they have been set. """ if key == "name" or key == "owner_doc": if self.__dict__.get(key, None) is None: super().__setattr__(key, value) else: raise Exception( "AnnotationSet attribute cannot get changed after being set" ) else: super().__setattr__(key, value) def detach(self, restrict_to=None): """ Creates an immutable and detached copy of this set, optionally restricted to the given annotation ids. A detached annotation set does not have an owning document and deleting or adding annotations does not change the annotations stored with the document. However, the annotations in a detached annotation set are the same as those stored in the attached set, so updating their features will modify the annotations in the document as well. Args: restrict_to: an iterable of annotation ids, if None, all the annotations from this set. Returns: an immutable annotation set """ annset = AnnotationSet(name="detached-from:" + self.name) annset._is_immutable = True if restrict_to is None: annset._annotations = { annid: self._annotations[annid] for annid in self._annotations.keys() } else: annset._annotations = { annid: self._annotations[annid] for annid in restrict_to } annset._annset.update(annset._annotations.values()) annset._next_annid = self._next_annid return annset def detach_from(self, anns: Iterable): """ Creates an immutable detached annotation set from the annotations in anns which could by either a collection of annotations or annotation ids (int numbers) which are assumed to be the annotation ids from this set. The next annotation id for the created set is the highest seen annotation id from anns plus one. Args: anns: an iterable of annotations Returns: an immutable detached annotation set """ annset = AnnotationSet(name="detached-from:" + self.name) annset._is_immutable = True annset._annotations = {} nextid = -1 for ann in anns: if isinstance(ann, int): annset._annotations[ann] = self._annotations[ann] annid = ann else: annset._annotations[id] = ann annid = ann.id if annid > nextid: nextid = annid annset._next_annid = nextid + 1 annset._annset.update(annset._annotations.values()) return annset @staticmethod def create_from(anns: Union[Iterable[Annotation], Annotation], name=None) -> "AnnotationSet": """ Creates an immutable detached annotation set from the annotations in anns. The set contains shallow copies of the annotations and the annotation id is preserved, unless it is a duplicate in which the next available id is used. Args: anns: an iterable of annotations or a single annotation name: an optional name for the set Returns: An immutable detached annotation set """ annset = AnnotationSet(name=name) annset._is_immutable = True annset._annotations = {} annset._next_annid = 0 if isinstance(anns, Annotation): anns = [anns] for ann in anns: # if the id is already in the set, assign the next available one ann = ann.copy() if ann.id in annset._annotations: ann._id = annset._next_annid annset._annotations[annset._next_annid] = ann annset._next_annid += 1 else: # if the id is not yet in the set, keep it and make sure that after adding, # the next annid is adapted, if necessary! annset._annotations[ann.id] = ann if ann.id >= annset._next_annid: annset._next_annid = ann.id + 1 annset._annset.update(annset._annotations.values()) return annset @property def immutable(self) -> bool: """ Get or set the immutability of the annotation set. If it is immutable, annotations cannot be added or removed from the set, but the annotations themselves can still have their features modified. All detached annotation sets are immutable when created, but can be made mutable afterwards. """ return self._is_immutable @immutable.setter def immutable(self, val: bool) -> None: self._is_immutable = val def isdetached(self) -> bool: """ Returns True if the annotation set is detached, False otherwise. """ return self._owner_doc is None def _create_index_by_offset(self) -> None: """ Generates the offset index, if it does not already exist. The offset index is an interval tree that stores the annotation ids for the offset interval of the annotation. """ if self._index_by_offset is None: self._index_by_offset = SortedIntvls() for ann in self._annotations.values(): self._index_by_offset.add(ann.start, ann.end, ann.id) def _create_index_by_ol(self) -> None: """ Generates an index by start offset, end offset and annotation id """ if self._index_by_ol is None: self._index_by_ol = SortedIntvls(by_ol=True) for ann in self._annotations.values(): self._index_by_ol.add(ann.start, ann.end, ann.id) def _create_index_by_type(self) -> None: """ Generates the type index, if it does not already exist. The type index is a map from annotation type to a set of all annotation ids with that type. """ if self._index_by_type is None: self._index_by_type = defaultdict(set) for ann in self._annotations.values(): self._index_by_type[ann.type].add(ann.id) def _add_to_indices(self, annotation: Annotation) -> None: """ If we have created the indices, add the annotation to them. Args: annotation: the annotation to add to the indices. annotation: Annotation: """ if self._index_by_type is not None: self._index_by_type[annotation.type].add(annotation.id) if self._index_by_offset is not None: self._index_by_offset.add(annotation.start, annotation.end, annotation.id) def _remove_from_indices(self, annotation: Annotation) -> None: """ Remove an annotation from the indices. Args: annotation: the annotation to remove. """ if self._index_by_offset is not None: self._index_by_offset.remove( annotation.start, annotation.end, annotation.id ) if self._index_by_type is not None: self._index_by_type[annotation.type].remove(annotation.id) @staticmethod def _intvs2idlist(intvs, ignore_id=None) -> List[int]: """ Convert an iterable of interval tuples (start, end, id) to a list of ids Args: intvs: iterable of interval tuples ignore_id: (Default value = None) do not include this id Returns: list of ids """ if ignore_id is not None: return [i[2] for i in intvs if i[2] != ignore_id] else: return [i[2] for i in intvs] @staticmethod def _intvs2idset(intvs, ignore_id=None) -> Set[int]: """ Convert an iterable of interval tuples (start, end, id) to a set of ids Args: intvs: iterable of interval tuples ignore_id: (Default value = None) do not include this id Returns: set of ids """ ret = set() if ignore_id is not None: for i in intvs: if i[2] != ignore_id: ret.add(i[2]) else: for i in intvs: ret.add(i[2]) return ret def _restrict_intvs(self, intvs, ignore_id=None): """ Args: intvs: ignore_id: (Default value = None) do not include this id """ return self.detach( restrict_to=AnnotationSet._intvs2idlist(intvs, ignore_id=ignore_id) ) def __len__(self) -> int: """ Return number of annotations in the set. """ return len(self._annotations) @property def size(self) -> int: """ Returns the number of annotations in the annotation set. """ return len(self._annotations) @property def document(self): """ Returns the owning document, if set. If the owning document was not set, returns None. """ return self._owner_doc @support_annotation_or_set def _check_offsets(self, start: int, end: int) -> None: """ Checks the offsets for the given span/annotation against the document boundaries, if we know the owning document and if the owning document has text. Args: start: start offset end: end offset """ if self._owner_doc is None: return if self._owner_doc.text is None: return doc_size = len(self._owner_doc) if start < 0: raise InvalidOffsetError("Annotation starts before 0") if end < 0: raise InvalidOffsetError("Annotation ends before 0") if start > end: raise InvalidOffsetError("Annotation ends before it starts") if start > doc_size: raise InvalidOffsetError( "Annotation starts after document ends: start={}, docsize={}".format( start, doc_size ) ) if end > doc_size: raise InvalidOffsetError( "Annotation ends after document ends: end={}, docsize={}".format( end, doc_size ) ) @property def start(self): """ Returns the smallest start offset of all annotations, i.e the start of the span of the whole set. This needs the index and creates it if necessary. Throws: an exception if there are no annotations in the set. """ if self.size == 0: raise Exception("Annotation set is empty, cannot determine start offset") self._create_index_by_offset() return self._index_by_offset.min_start() @property def end(self): """ Returns the end offset of the annotation set, i.e. the biggest end offset of any annotation. This needs the index and creates it if necessary. Throws: an exception if there are no annotations in the set. """ if self.size == 0: raise Exception("Annotation set is empty, cannot determine end offset") self._create_index_by_offset() return self._index_by_offset.max_end() @property def length(self): """ Returns the the length of the annotation set span. Throws: an exception if there are no annotations in the set. """ return self.end - self.start @allowspan def add( self, start: int, end: int, anntype: str, features: Dict[str, Any] = None, annid: int = None, ): """ Adds an annotation to the set. Once an annotation has been added, the start and end offsets, the type, and the annotation id of the annotation are immutable. If an annotation id is specified that already exists in the set, an exception is raised. Args: start: start offset end: end offset anntype: the annotation type features: a map, an iterable of tuples or an existing feature map. In any case, the features are used to create a new feature map for this annotation. If the map is empty or this parameter is None, the annotation does not store any map at all. annid: the annotation id, if not specified the next free one for this set is used. NOTE: the id should normally left unspecified and get assigned automatically. Returns: the new annotation """ if annid is not None and not isinstance(annid, int): raise Exception("Parameter annid must be an int, mixed up with features?") if features is not None and isinstance(features, int): raise Exception( "Parameter features must not be an int: mixed up with annid?" ) if self._is_immutable: raise Exception("Cannot add an annotation to an immutable annotation set") self._check_offsets(start, end) if annid and annid in self._annotations: raise Exception( "Cannot add annotation with id {}, already in set".format(annid) ) if annid is None: annid = self._next_annid self._next_annid = self._next_annid + 1 ann = Annotation(start, end, anntype, features=features, annid=annid) ann._owner_set = self if not self._annotations: self._annotations = {} self._annotations[annid] = ann self._annset.add(ann) self._add_to_indices(ann) if self.changelog is not None: entry = { "command": "annotation:add", "set": self.name, "start": ann.start, "end": ann.end, "type": ann.type, "features": ann._features.to_dict(), "id": ann.id, } self.changelog.append(entry) return ann def add_ann(self, ann, annid: int = None): """ Adds a shallow copy of the given ann to the annotation set, either with a new annotation id or with the one given. Args: ann: the annotation to copy into the set annid: the annotation id, if not specified the next free one for this set is used. Note: the id should normally left unspecified and get assigned automatically. Returns: the added annotation """ return self.add(ann.start, ann.end, ann.type, ann.features, annid=annid) # TODO/NOTE: Iterable[Annotation] with Iterable from collections.abc is not possible here prior to Python 3.9 # instead, Iterable must come from typing def update(self, anns: Iterable[Annotation], annid_from_ann=False): """ Adds shallow copies of all annotations from the iterable to the set. Args: anns: an iterable of Annotations annid_from_ann: if True, use the same annotation id as in the annotation, this will raise an exception if the set already contains and annotation with this id. If False assign a new id to the added annotation. """ for ann in anns: if annid_from_ann: self.add(ann.start, ann.end, ann.type, ann.features, annid=ann.id) else: self.add(ann.start, ann.end, ann.type, ann.features) def add_anns(self, anns: Iterable[Annotation], annid_from_ann=False): """ DEPRECATED: same as update. Args: anns: an iterable of Annotations annid_from_ann: if True, use the same annotation id as in the annotation, this will raise an exception if the set already contains and annotation with this id. If False assign a new id to the added annotation. """ self.update(anns, annid_from_ann=annid_from_ann) def remove( self, annoriter: Union[int, Annotation, Iterable], raise_on_notexisting=True ) -> None: """ Removes the given annotation which is either the id or the annotation instance or recursively all annotations in the iterable. Throws: exception if the annotation set is immutable or the annotation is not in the set Args: annoriter: either the id (int) or the annotation instance (Annotation) or an iterable of id or annotation instance or iterable ... raise_on_notexisting: (default: True) if false, silently accepts non-existing annotations/ids and does nothing. Note: if this is True, but the annotation set is immutable, an Exception is still raised. """ if self._is_immutable: raise Exception( "Cannot remove an annotation from an immutable annotation set" ) if isinstance(annoriter, abc_Iterable): for a in annoriter: self.remove(a, raise_on_notexisting=raise_on_notexisting) return annid = None # make pycharm happy if isinstance(annoriter, int): annid = annoriter if annid not in self._annotations: raise Exception( "Annotation with id {} not in annotation set, cannot remove".format( annid ) ) ann = self._annotations[annid] elif isinstance(annoriter, Annotation): annid = annoriter.id if annid not in self._annotations: raise Exception( "Annotation with id {} does not belong to this set, cannot remove".format( annid ) ) ann = annoriter else: raise Exception("Should never happen!") # NOTE: once the annotation has been removed from the set, it could # still be referenced # somewhere else and its features could get modified. In order to # prevent logging of such changes, # the owning set gets cleared for the annotation ann._owner_set = None del self._annotations[annid] self._annset.remove(ann) if self.changelog is not None: self.changelog.append( {"command": "annotation:remove", "set": self.name, "id": annid} ) self._remove_from_indices(ann) def clear(self, reset_annids=False) -> None: """ Removes all annotations from the set. Args: reset_annids: if True, also reset the next annotation id to 0, after this newly added annotations will get annotation ids starting from 0. IMPORTANT: this must not be used for code to run in the Java GATE Python plugin, as Java GATE handles annotation ids differently! """ self._annotations.clear() self._annset.clear() if reset_annids: self._next_annid = 0 self._index_by_offset = None self._index_by_type = None if self.changelog is not None: self.changelog.append({"command": "annotations:clear", "set": self.name}) def clone_anns(self, memo=None): """ Replaces the annotations in this set with deep copies of the originals. If this is a detached set, then this makes sure that any modifications to the annotations do not affect the original annotations in the attached set. If this is an attached set, it makes sure that all other detached sets cannot affect the annotations in this set any more. The owning set of the annotations that get cloned is cleared. Args: memo: for internal use by our __deepcopy__ implementation. """ tmpdict = {} for annid, ann in self._annotations.items(): newann = copy.deepcopy(ann, memo=memo) ann._owner_set = None tmpdict[annid] = newann for annid, ann in tmpdict.items(): self._annset.remove(self._annotations[annid]) self._annotations[annid] = ann self._annset.add(ann) def __copy__(self): """ NOTE: creating a copy always creates a detached set, but a mutable one. """ c = self.detach() c._is_immutable = False return c def copy(self): """ Returns a shallow copy of the annotation set. """ return self.__copy__() def __deepcopy__(self, memo=None): if memo is None: memo = {} c = self.detach() c._is_immutable = False c.clone_anns(memo=memo) return c def deepcopy(self): """ Returns a deep copy of the annotation set. """ return copy.deepcopy(self) def __iter__(self) -> Iterator: """ Yields all the annotations of the set. Important: using the iterator will always create the index if it is not already there! For fast iteration use fast_iter() which does not allow sorting or offset ranges. Yields: the annotations in document order """ # return iter(self._annotations.values()) return self.iter() def fast_iter(self) -> Generator: """ Yields annotations in insertion order. This is faster then the default iterator and does not need to index (so if the index does not exist, it will not be built). """ if self._annotations: for annid, ann in self._annotations.items(): yield ann def iter( self, start_ge: Union[int, None] = None, start_lt: Union[None, int] = None, with_type: str = None, reverse: bool = False, ) -> Generator: """ Default iterator. Yields annotations ordered by increasing starting annotation offset and increasing annotation id, otionally limited by the other parameters. Args: start_ge: the offset from where to start including annotations start_lt: the last offset to use as the starting offset of an annotation with_type: only annotations of this type reverse: process in reverse document order Yields: Annotations in default document order, or reverse document order """ if with_type is not None: allowedtypes = set() if isinstance(type, str): allowedtypes.add(with_type) else: for atype in with_type: allowedtypes.add(atype) else: allowedtypes = None if not self._annotations: return maxoff = None if start_ge is not None: assert start_ge >= 0 if start_lt is not None: assert start_lt >= 1 maxoff = start_lt + 1 if start_lt is not None and start_ge is not None: assert start_lt > start_ge self._create_index_by_offset() for _start, _end, annid in self._index_by_offset.irange( minoff=start_ge, maxoff=maxoff, reverse=reverse ): if ( allowedtypes is not None and self._annotations[annid].type not in allowedtypes ): continue yield self._annotations[annid] def iter_ol( self, start_ge: Union[int, None] = None, start_lt: Union[None, int] = None, with_type: str = None, reverse: bool = False, ) -> Generator: """ Offset-Length Iterator. Yields annotations ordered by increasing start offset, by increasing end offset and increasing annotoation id, otionally limited by the other parameters. Args: start_ge: the offset from where to start including annotations start_lt: the last offset to use as the starting offset of an annotation with_type: only annotations of this type reverse: process in reverse document order Yields: Annotations ordered by offset and length. """ if with_type is not None: allowedtypes = set() if isinstance(type, str): allowedtypes.add(with_type) else: for atype in with_type: allowedtypes.add(atype) else: allowedtypes = None if not self._annotations: return maxoff = None if start_ge is not None: assert start_ge >= 0 if start_lt is not None: assert start_lt >= 1 maxoff = start_lt + 1 if start_lt is not None and start_ge is not None: assert start_lt > start_ge self._create_index_by_ol() for _start, _end, annid in self._index_by_ol.irange( minoff=start_ge, maxoff=maxoff, reverse=reverse ): if ( allowedtypes is not None and self._annotations[annid].type not in allowedtypes ): continue yield self._annotations[annid] def reverse_iter(self, **kwargs): """ Same as iter, but with the reverse parameter set to true. Args: kwargs: Same as for iter(), with revers=True fixed. **kwargs: will get passed on the Annotation.iter Returns: same result as iter() """ return self.iter(reverse=True, **kwargs) def get( self, annid: Union[int, Annotation], default=None ) -> Union[Annotation, None]: """ Gets the annotation with the given annotation id or returns the given default. NOTE: for handling cases where legacy code still expects the add method to return an id and not the annotation, this will accept an annotation so the the frequent pattern still works: annid = annset.add(b,e,t).id ann = annset.get(annid) If an annotation is passed the annotation from the set with the id of that annotation is returned, if the annotation is from that set, this will return the same object, if it is still in the set (or return the default value). Args: annid: the annotation id of the annotation to retrieve. default: what to return if an annotation with the given id is not found. (Default value = None) annid: Union[int: Annotation]: Returns: the annotation or the default value. """ if isinstance(annid, Annotation): annid = annid.id return self._annotations.get(annid, default) def first(self): """ Return the first (or only) annotation in the set by offset. Returns: first annotation """ sz = len(self._annotations) if sz == 0: raise Exception("Empty set, there is no first annotation") elif sz == 1: return next(iter(self._annotations.values())) self._create_index_by_offset() _, _, annid = next(self._index_by_offset.irange(reverse=False)) return self._annotations[annid] def last(self): """ Return the last (or only) annotation by offset. Returns: last annotation """ sz = len(self._annotations) if sz == 0: raise Exception("Empty set, there is no last annotation") elif sz == 1: return next(iter(self._annotations.values())) self._create_index_by_offset() _, _, annid = next(self._index_by_offset.irange(reverse=True)) return self._annotations[annid] def for_idx(self, idx, default=None): """ Return the annotation corresponding to the index idx in the set. This returns the annotation stored at the index, as added to the set. The order usually depends on the insertion time. If no annotation with the given index is specified, the value specified for `default` is returned. Args: idx: index of the annotation in the set default: default value to return if now annotation with the given index exists Returns: the annotation with the given index or the default value """ # TODO: we could make this more memory efficient (but slower) by # iterating over values until getting idxth tmplist = list(self._annotations.values()) if idx < len(tmplist): return tmplist[idx] else: return default def __getitem__(self, annid): """ Gets the annotation with the given annotation id or throws an exception. Args: annid: the annotation id Returns: annotation """ return self._annotations[annid] def with_type(self, *anntype: Union[str, Iterable], non_overlapping: bool = False): """ Gets annotations of the specified type(s). Creates the type index if necessary. Args: anntype: one or more types or type lists. The union of all types specified that way is used to filter the annotations. If no type is specified, an empty detached set is returned. non_overlapping: if True, only return annotations of any of the given types which do not overlap with other annotations. If there are several annotations that start at the same offset, use the type that comes first in the parameters, if there are more than one of that type, use the one that would come first in the usual sort order. Returns: a detached immutable annotation set with the matching annotations. """ atypes = [] for atype in anntype: if isinstance(atype, str): atypes.append(atype) else: for t in atype: atypes.append(t) if not atypes: return self.detach(restrict_to=[]) self._create_index_by_type() annids = set() for t in atypes: idxs = self._index_by_type.get(t) if idxs: annids.update(idxs) if non_overlapping: # need to get annotations grouped by start offset and sorted according to # what the Annotation class defines allanns = sorted(annids, key=lambda x: self._annotations[x]) allanns = [self._annotations[x] for x in allanns] allannsgrouped = [] curstart = None curset = None for ann in allanns: if curstart is None: curset = [ann] curstart = ann.start elif curstart == ann.start: curset.append(ann) else: allannsgrouped.append(curset) curset = [ann] curstart = ann.start if curset: allannsgrouped.append(curset) retanns = [] # now go through all the grouped annoations and select the top priority one # then skip to the next group that does not overlap with the one we just selected typepriority = dict() for i, atype in enumerate(atypes): typepriority[atype] = len(atypes) - i curminoffset = 0 for group in allannsgrouped: # instead of sorting, go through the group and find the top priority one topann = None if len(group) == 1: if group[0].start >= curminoffset: topann = group[0] elif len(group) == 0: raise Exception("We should never get a 0 size group here!") else: i = 0 for i, ann in enumerate(group): if ann.start >= curminoffset: topann = ann break for ann in group[i + 1:]: if ann.start < curminoffset: continue if typepriority[ann.type] > typepriority[topann.type]: topann = ann elif typepriority[ann.type] == typepriority[topann.type]: if ann.end > topann.end: topann = ann elif ann.end == topann.end: if ann.id > topann.id: topann = ann if topann is not None: retanns.append(topann) curminoffset = topann.end annids = [ann.id for ann in retanns] return self.detach(restrict_to=annids) def by_offset(self): """ Yields lists of annotations which start at the same offset. """ self._create_index_by_offset() lastoff = -1 curlist = [] for ann in self.iter(): if ann.start != lastoff: if lastoff != -1: yield curlist lastoff = ann.start curlist = [ann] else: curlist.append(ann) if lastoff != -1: yield curlist def by_span(self): """ Yields list of annotations with identical spans. Note: first needs to sort all annotations! """ self._create_index_by_offset() lastsoff = -1 lasteoff = -1 curlist = [] for ann in self.iter_ol(): if ann.start != lastsoff or ann.end != lasteoff: if lastsoff != -1: yield curlist lastsoff = ann.start lasteoff = ann.end curlist = [ann] else: curlist.append(ann) if lastsoff != -1: yield curlist @property def type_names(self) -> KeysView[str]: """ Gets the names of all types in this set. Creates the type index if necessary. """ self._create_index_by_type() return self._index_by_type.keys() @support_annotation_or_set def startingat( self, start: int, _end: Any = None, ann=None, include_self: bool = False ): """ Gets all annotations starting at the given offset (empty if none) and returns them in a detached annotation set. Note: this can be called with an annotation or annotation set instead of the start offset. If called with an annotation, this annotation is not included in the result set if `include_self` is `False` Args: start: the offset where annotations should start _end: unused/ignored end offset ann: any annotation that was specified instead of just the offset include_self: should annotation passed be included in the result Returns: detached annotation set of matching annotations """ self._create_index_by_offset() intvs = self._index_by_offset.starting_at(start) if not include_self and ann is not None and ann in self: ignore_id = ann.id else: ignore_id = None return self._restrict_intvs(intvs, ignore_id=ignore_id) @support_annotation_or_set def start_min_ge( self, offset: int, _end: Any = None, ann: Optional["Annotation"] = None, include_self: bool = False ): """Gets all annotations starting at the first possible offset at or after the given offset and returns them in an immutable annotation set. Args: offset: The offset _end: unused/ignored end offset ann: any Annotation that was passed include_self: should annotation passed be included in the result Returns: annotation set of matching annotations """ self._create_index_by_offset() intvs = self._index_by_offset.starting_from(offset) # now select only those first ones which all have the same offset if not include_self and ann is not None and ann in self: ignore_id = ann.id else: ignore_id = None retids = set() startoff = None for intv in intvs: if startoff is None: startoff = intv[0] if ignore_id is not None: if ignore_id != intv[2]: retids.add(intv[2]) else: retids.add(intv[2]) elif startoff == intv[0]: if ignore_id is not None: if ignore_id != intv[2]: retids.add(intv[2]) else: retids.add(intv[2]) else: break return self.detach(restrict_to=retids) @support_annotation_or_set def start_ge(self, start: int, _end: Any = None, ann: Optional["Annotation"] = None, include_self: bool = False): """ Return the annotations that start at or after the given start offset. Args: start: Start offset _end: unusued/ignored end offset ann: any Annotation passed include_self: should annotation passed be included in the result Returns: an immutable annotation set of the matching annotations """ self._create_index_by_offset() intvs = self._index_by_offset.starting_from(start) if not include_self and ann is not None and ann in self: ignore_id = ann.id else: ignore_id = None return self._restrict_intvs(intvs, ignore_id=ignore_id) @support_annotation_or_set def start_lt(self, offset: int, _end: Any = None, ann: Any = None): """ Returns the annotations that start before the given offset (or annotation). This also accepts an annotation or set. Args: offset: offset before which the annotations should start _end: unused/ignored end offset ann: unised/ignored Annotation passed (can never be included!) Returns: an immutable annotation set of the matching annotations """ self._create_index_by_offset() intvs = self._index_by_offset.starting_before(offset) return self._restrict_intvs(intvs) @support_annotation_or_set def overlapping(self, start: int, end: int, ann: Optional["Annotation"] = None, include_self: bool = False): """ Gets annotations overlapping with the given span. Instead of the start and end offsets, also accepts an annotation or annotation set. For each annotation ann in the result set, ann.overlapping(span) is True Args: start: start offset of the span end: end offset of the span ann: the annotation that is passed to this function for checking if it is included in the result. include_self: if True and the annotation for the span is given, do not include that annotation in the result set. Returns: an immutable annotation set with the matching annotations """ self._create_index_by_offset() intvs = self._index_by_offset.overlapping(start, end) if not include_self and ann is not None and ann in self: ignore_id = ann.id else: ignore_id = None return self._restrict_intvs(intvs, ignore_id=ignore_id) @support_annotation_or_set def covering(self, start: int, end: int, ann: Optional["Annotation"] = None, include_self: bool = False): """ Gets the annotations which contain the given offset range (or annotation/annotation set), i.e. annotations such that the given offset range is within the annotation. For each annotation ann in the result set, ann.covering(span) is True. Args: start: the start offset of the span end: the end offset of the span ann: the annotation representing the span. (Default value = None) include_self: if True and the annotation for the span is given, do not include that annotation in the result set. (Default value = False) Returns: an immutable annotation set with the matching annotations, if any """ self._create_index_by_offset() intvs = self._index_by_offset.covering(start, end) if not include_self and ann is not None and ann in self: ignore_id = ann.id else: ignore_id = None return self._restrict_intvs(intvs, ignore_id=ignore_id) @support_annotation_or_set def within(self, start: int, end: int, ann: Optional["Annotation"] = None, include_self: bool = False): """ Gets annotations that fall completely within the given offset range, i.e. annotations such that the offset range is covering each of the annotation. For each annotation ann in the result set, ann.within(span) is True. Args: start: start offset of the range end: end offset of the range ann: the annotation representing the span. (Default value = None) include_self: if True and the annotation for the span is given, do not include that annotation in the result set. (Default value = False) Returns: an immutable annotation set with the matching annotations """ if start > end: raise Exception("Invalid offset range: {},{}".format(start, end)) else: self._create_index_by_offset() intvs = self._index_by_offset.within(start, end) if not include_self and ann is not None and ann in self: ignore_id = ann.id else: ignore_id = None return self._restrict_intvs(intvs, ignore_id=ignore_id) @support_annotation_or_set def coextensive(self, start: int, end: int, ann: Optional["Annotation"] = None, include_self: bool = False): """ Returns a detached annotation set with all annotations that start and end at the given offsets. For each annotation ann in the result set, ann.coextensive(span) is True. Args: start: start offset of the span end: end offset of the span ann: the annotation representing the span. (Default value = None) include_self: if True and the annotation for the span is given, do not include that annotation in the result set. Returns: annotation set with all annotations that have the same start and end offsets. """ self._create_index_by_offset() intvs = self._index_by_offset.at(start, end) if not include_self and ann is not None and ann in self: ignore_id = ann.id else: ignore_id = None return self._restrict_intvs(intvs, ignore_id=ignore_id) @support_annotation_or_set def before( self, start: int, end: int, ann: Optional["Annotation"] = None, include_self: bool = False, immediately: bool = False ): """ Returns a detached annotation set with all annotations that end before the given offsets. For each annotation ann in the result set, ann.isbefore(span) is True. Args: start: start offset of the span end: end offset of the span ann: the annotation representing the span. (Default value = None) include_self: if True and the annotation id for the span is given, do not include that annotation in the result set. immediately: if True, the end offset of the annotations return must coincide with the start offset of the span (default=False) Returns: annotation set with all annotations that end before the given span """ self._create_index_by_offset() if immediately: intvs = self._index_by_offset.ending_at(start) else: intvs = self._index_by_offset.ending_to(start) # we need to filter self if self is zero-length! if not include_self and ann is not None and ann in self: ignore_id = ann.id else: ignore_id = None return self._restrict_intvs(intvs, ignore_id=ignore_id) @support_annotation_or_set def after( self, start: int, end: int, ann: Optional["Annotation"] = None, include_self: bool = False, immediately: bool = False ): """ Returns a detached annotation set with all annotations that start after the given span. For each annotation ann in the result set, ann.isafter(span) is True. Args: start: start offset of the span end: end offset of the span ann: the annotation representing the span. (Default value = None) include_self: if True and the annotation id for the span is given, do not include that annotation in the result set. immediately: if True, the start offset of the annotations returned must coincide with the end offset of the span (default=False) Returns: annotation set with all annotations that start after the given span """ self._create_index_by_offset() if immediately: intvs = self._index_by_offset.starting_at(end) else: intvs = self._index_by_offset.starting_from(end) # we need to filter self if self is zero-length! if not include_self and ann is not None and ann in self: ignore_id = ann.id else: ignore_id = None return self._restrict_intvs(intvs, ignore_id=ignore_id) @property def span(self) -> Span: """ Returns a tuple with the start and end offset the corresponds to the smallest start offset of any annotation and the largest end offset of any annotation. (Builds the offset index) """ if len(self._annotations) == 0: return Span(0, 0) self._create_index_by_offset() return Span(self._index_by_offset.min_start(), self._index_by_offset.max_end()) def __contains__(self, annorannid: Union[int, Annotation]) -> bool: """ Provides 'annotation in annotation_set' functionality. Args: annorannid: the annotation instance or annotation id to check. If this is an id, it is checked if the id appears in this AnnotationSet. If this is an annotation instance, then True is returned if the Annotation does have an owning set and the owning set is this AnnotationSet and the annotation id is present in this AnnotatioSet Returns: `True` if the annotation exists in the set, `False` otherwise """ if isinstance(annorannid, Annotation): return annorannid in self._annset return ( annorannid in self._annotations ) # On the off chance someone passed an ID in directly contains = __contains__ def __repr__(self) -> str: """ Returns the string representation of the set. """ return "AnnotationSet({})".format(repr(list(self.iter()))) def to_dict(self, anntypes=None, **kwargs): """ Convert an annotation set to its dict representation. Args: anntypes: if not None, an iterable of annotation types to include **kwargs: passed on to the dict creation of contained annotations. Returns: the dict representation of the annotation set. """ if anntypes is not None: anntypesset = set(anntypes) anns_list = list( val.to_dict(**kwargs) for val in self._annotations.values() if val.type in anntypesset ) else: anns_list = list( val.to_dict(**kwargs) for val in self._annotations.values() ) return { # NOTE: Changelog is not getting added as it is stored in the document part! "name": self.name, "annotations": anns_list, "next_annid": self._next_annid, } @staticmethod def from_dict(dictrepr, owner_doc=None, **kwargs): """ Create an AnnotationSet from its dict representation and optionally set the owning document. Args: dictrepr: the dict representation of the annotation set owner_doc: the owning document **kwargs: passed on to the creation of annotations Returns: the annotation set """ annset = AnnotationSet(dictrepr.get("name"), owner_doc=owner_doc) annset._next_annid = dictrepr.get("next_annid") if dictrepr.get("annotations"): annset._annotations = dict( (int(a["id"]), Annotation.from_dict(a, owner_set=annset, **kwargs)) for a in dictrepr.get("annotations") ) annset._annset.update(annset._annotations.values()) else: annset._annotations = {} return annset @staticmethod def from_anns(anns, deep_copy=False, **kwargs): """ Create a detached AnnotationSet from an iterable of annotations. Args: anns: an iterable of annotations deep_copy: if the annotations should get added as copies (default) or deep copies. Returns: the annotation set """ annset = AnnotationSet(name="", owner_doc=None) annset._annotations = dict() maxid = 0 for ann in anns: if deep_copy: addann = ann.deepcopy() else: addann = ann.copy() annset._annotations[addann.id] = addann if addann.id > maxid: maxid = addann.id annset._next_annid = maxid annset._is_immutable = True return annset def _update_offsets(self, id, start, end): """ In-place update the offset of the annotation with the given id. THIS IS FOR INTERNAL USE ONLY! Using this method can lead to many different kinds of hard to debug and surprising bugs! NOTE: this only updates the by offset index if it already exists. If the offsets are both are unchanged, this is a NOOP. Args: id: id of the annotation to change start: new start offset end: new end offset """ ann = self._annotations[id] if ann.start == start and ann.end == end: return # nothing to do really # print(f"DEBUG: updating offset for {id} from {ann.start},{ann.end} to {start},{end}") if self._index_by_offset is not None: self._index_by_offset.remove( ann.start, ann.end, ann.id ) ann._update_offsets(start, end) if self._index_by_offset is not None: self._index_by_offset.add(ann.start, ann.end, ann.id) def _edit_anns(self, edits, affected_strategy): """ Edit helper method: takes a list of edits and returns two values: a dictionary annid->(start,end) of new offset spans for all annotations that remain in the set, and a set of annids for annotations that have to get deleted. Args: edits: the edit(s) to carry out Returns: anns: dictionary mapping annotation ids to pairs start,end of new offsets for that annotation anns2delete: set of annotation ids to delete as a aresult of the edits """ # convert the list of edits into a list of lists [startoff, endoff, len, startlist, endlist] # where the lists will contain later the ids of annotation starting/ending within that span # This also makes sure that if the edits are mutable, we do not change them in any way edits = [[l[0], l[1], len(l[2]) if isinstance(l[2], str) else l[2], [], []] for l in edits] # sort the edits by ending, then starting offsets: since we operate from start to end, as soon as # processing has moved past some offset, the annotations before that offset do not need to get # updated any more. edits.sort(key=lambda x: (x[1], x[0]), reverse=False) # optimization: instead of recalculating relevant overlaps after each edit, calculate # them beforehand # For each edit, add the ids of annotations that start/end in that interval to the start/end lists # of the edit tuple. Also collect the ids of those annotations in sets for starting or ending, starting, ending # in any edit. This is necessary because offset adaptions necessary for annotations starting/ending in # a span need to get handled different from offset adaptations for all other annotations self._create_index_by_offset() affectedids_start = set() affectedids_end = set() for edit in edits: # find all annotations which start or end within the span of the edit (or both) sintvs = self._index_by_offset.starting_within(edit[0], edit[1]) eintvs = self._index_by_offset.ending_within(edit[0], edit[1]) for intv in sintvs: if affected_strategy != "delete": edit[3].append(intv[2]) affectedids_start.add(intv[2]) for intv in eintvs: if affected_strategy != "delete": edit[4].append(intv[2]) affectedids_end.add(intv[2]) # Any changes of offsets or deletions are not carried out until the very end. For this we # keep a dictionary with all the annotations id to [start,end] and a set of annotations to delete anns = {ann.id: [ann.start, ann.end] for ann in self._annotations.values()} anns2delete = set() # also keep sorted lists of annid by start and end offset, but only for non-affected annotations (outside of # edits). The offset is accessed from anns because it can change during the process idsbystart = [] idsbyend = [] # we are getting the annotations in offset order, so the two lists we create are also in starting # offset order, for the first list this is what we need for ann in self.iter(): annid = ann.id if annid not in affectedids_start: idsbystart.append(annid) if annid not in affectedids_end: idsbyend.append(annid) # sort the idsbyend list by end offset idsbyend.sort(key=lambda x: anns[x][1], reverse=False) # ptr_start/end are indices into the start2ids/end2ids lists: pointing to the first entry for which # annotation offsets still need to get adapted ptr_start = None ptr_end = None if len(idsbystart) > 0: ptr_start = 0 if len(idsbyend) > 0: ptr_end = 0 for idx in range(len(edits)): edit = edits[idx] editfrom, editto, editlen, edit_sanns, edit_eanns = edit newlen = len(edit[2]) if isinstance(edit[2], str) else edit[2] oldlen = editto - editfrom delta = newlen - oldlen editto_new = editto + delta # in order to process this span we need to do this: # - adapt all affected annotations, i.e. annotations which start or end in this span, # according to the strategy. # - change the offsets of all annotations after this edit if the length of the span changed # - also change the offsets of all edits after this edit if the length of the span changed for annid in edit_sanns: # all the ids of anns starting in this edit if affected_strategy == "delete_all": if annid in anns: anns2delete.add(annid) del anns[annid] elif affected_strategy == "adapt": anns[annid][0] = editfrom elif affected_strategy == "keepadapt": if anns[annid][0] > editto: anns[annid][0] = editfrom for annid in edit_eanns: # all the ids of anns ending in this edit if affected_strategy == "delete_all": if annid in anns: anns2delete.add(annid) del anns[annid] elif affected_strategy == "adapt": anns[annid][1] = editto_new elif affected_strategy == "keepadapt": if anns[annid][1] > editto_new: anns[annid][1] = editto_new if delta != 0: for idx2 in range(idx+1, len(edits)): otheredit = edits[idx2] otheredit[0] += delta otheredit[1] += delta while anns[idsbystart[ptr_start]][0] < editto: ptr_start += 1 if ptr_start >= len(idsbystart): ptr_start = None break # adapt all the annotations if ptr_start is not None: for idx2 in range(ptr_start, len(idsbystart)): annid = idsbystart[idx2] if annid not in anns2delete: # print(f"DEBUG: update start for {annid} from {anns[annid][0]} by {delta}") anns[annid][0] += delta # find the first annotation that ends at or after the current edit while anns[idsbyend[ptr_end]][1] <= editto: ptr_end += 1 if ptr_end >= len(idsbyend): ptr_end = None break # adapt all the annotations if ptr_end is not None: for idx2 in range(ptr_end, len(idsbyend)): annid = idsbyend[idx2] if annid not in anns2delete: # print(f"DEBUG: update end for {annid} from {anns[annid][0]} by {delta}") anns[annid][1] += delta return anns, anns2delete def _edit(self, edits, affected_strategy="keepadapt"): """ Carry out one or more edits. If edits is a tuple of length 3 with the first element not being iterable, assume it is a single edit, Otherwise assume it is an iterable of edits. An edit is a tuple (start, end, intorstring) giving the old offset range and either the string which replaces that range or the length that replaces that range. NOTE: no two edit offset ranges may overlap, if ranges do overlap, this method may raise an exception or silently perform unexpected and terrible changes. The method does not check for edit spans to not overlap! This method adapts the offsets of all annotations after the affected span, if an annotation begins or ends within an affected span, what happens depends on the affected_strategy: delete_all: remove any annotation where the start and/or end offset lies between the from/to offsets of the edit adapt: any start and/or end offset in between from/to is changed to the from or to offset keepadapt: any start and/or end offset in between is left unchanged if that offset still exists in the new span, otherwise adapted to from/to. Args: edits: single edit or iterable of edits affected_strategy: one of the following strategies: delete, adapt, keepadapt """ if isinstance(edits, tuple) and not isinstance(edits[0], Iterable): edits = [edits] anns, anns2delete = self._edit_anns(edits, affected_strategy) # now delete all annotations to be delete for annid in anns2delete: # print(f"DEBUG: removing annotation {self[annid]}") self.remove(annid) # and adapt all annotation offsets, if necessary for annid in anns: start, end = anns[annid] self._update_offsets(annid, start, end)
Static methods
def create_from(anns: Union[Iterable[Annotation], Annotation], name=None) ‑> AnnotationSet
-
Creates an immutable detached annotation set from the annotations in anns. The set contains shallow copies of the annotations and the annotation id is preserved, unless it is a duplicate in which the next available id is used.
Args
anns
- an iterable of annotations or a single annotation
name
- an optional name for the set
Returns
An immutable detached annotation set
Expand source code
@staticmethod def create_from(anns: Union[Iterable[Annotation], Annotation], name=None) -> "AnnotationSet": """ Creates an immutable detached annotation set from the annotations in anns. The set contains shallow copies of the annotations and the annotation id is preserved, unless it is a duplicate in which the next available id is used. Args: anns: an iterable of annotations or a single annotation name: an optional name for the set Returns: An immutable detached annotation set """ annset = AnnotationSet(name=name) annset._is_immutable = True annset._annotations = {} annset._next_annid = 0 if isinstance(anns, Annotation): anns = [anns] for ann in anns: # if the id is already in the set, assign the next available one ann = ann.copy() if ann.id in annset._annotations: ann._id = annset._next_annid annset._annotations[annset._next_annid] = ann annset._next_annid += 1 else: # if the id is not yet in the set, keep it and make sure that after adding, # the next annid is adapted, if necessary! annset._annotations[ann.id] = ann if ann.id >= annset._next_annid: annset._next_annid = ann.id + 1 annset._annset.update(annset._annotations.values()) return annset
def from_anns(anns, deep_copy=False, **kwargs)
-
Create a detached AnnotationSet from an iterable of annotations.
Args
anns
- an iterable of annotations
deep_copy
- if the annotations should get added as copies (default) or deep copies.
Returns
the annotation set
Expand source code
@staticmethod def from_anns(anns, deep_copy=False, **kwargs): """ Create a detached AnnotationSet from an iterable of annotations. Args: anns: an iterable of annotations deep_copy: if the annotations should get added as copies (default) or deep copies. Returns: the annotation set """ annset = AnnotationSet(name="", owner_doc=None) annset._annotations = dict() maxid = 0 for ann in anns: if deep_copy: addann = ann.deepcopy() else: addann = ann.copy() annset._annotations[addann.id] = addann if addann.id > maxid: maxid = addann.id annset._next_annid = maxid annset._is_immutable = True return annset
def from_dict(dictrepr, owner_doc=None, **kwargs)
-
Create an AnnotationSet from its dict representation and optionally set the owning document.
Args
dictrepr
- the dict representation of the annotation set
owner_doc
- the owning document
**kwargs
- passed on to the creation of annotations
Returns
the annotation set
Expand source code
@staticmethod def from_dict(dictrepr, owner_doc=None, **kwargs): """ Create an AnnotationSet from its dict representation and optionally set the owning document. Args: dictrepr: the dict representation of the annotation set owner_doc: the owning document **kwargs: passed on to the creation of annotations Returns: the annotation set """ annset = AnnotationSet(dictrepr.get("name"), owner_doc=owner_doc) annset._next_annid = dictrepr.get("next_annid") if dictrepr.get("annotations"): annset._annotations = dict( (int(a["id"]), Annotation.from_dict(a, owner_set=annset, **kwargs)) for a in dictrepr.get("annotations") ) annset._annset.update(annset._annotations.values()) else: annset._annotations = {} return annset
Instance variables
var changelog
-
Returns the changelog or None if no changelog is set.
Expand source code
@property def changelog(self): """ Returns the changelog or None if no changelog is set. """ if self._owner_doc is None: return None return self._owner_doc.changelog
var document
-
Returns the owning document, if set. If the owning document was not set, returns None.
Expand source code
@property def document(self): """ Returns the owning document, if set. If the owning document was not set, returns None. """ return self._owner_doc
var end
-
Returns the end offset of the annotation set, i.e. the biggest end offset of any annotation. This needs the index and creates it if necessary.
Throws
an exception if there are no annotations in the set.
Expand source code
@property def end(self): """ Returns the end offset of the annotation set, i.e. the biggest end offset of any annotation. This needs the index and creates it if necessary. Throws: an exception if there are no annotations in the set. """ if self.size == 0: raise Exception("Annotation set is empty, cannot determine end offset") self._create_index_by_offset() return self._index_by_offset.max_end()
var immutable : bool
-
Get or set the immutability of the annotation set. If it is immutable, annotations cannot be added or removed from the set, but the annotations themselves can still have their features modified.
All detached annotation sets are immutable when created, but can be made mutable afterwards.
Expand source code
@property def immutable(self) -> bool: """ Get or set the immutability of the annotation set. If it is immutable, annotations cannot be added or removed from the set, but the annotations themselves can still have their features modified. All detached annotation sets are immutable when created, but can be made mutable afterwards. """ return self._is_immutable
var length
-
Returns the the length of the annotation set span.
Throws
an exception if there are no annotations in the set.
Expand source code
@property def length(self): """ Returns the the length of the annotation set span. Throws: an exception if there are no annotations in the set. """ return self.end - self.start
var name
-
Returns the name of the annotation set.
Note: the name of a set cannot be changed.
Expand source code
@property def name(self): """ Returns the name of the annotation set. Note: the name of a set cannot be changed. """ return self._name
var size : int
-
Returns the number of annotations in the annotation set.
Expand source code
@property def size(self) -> int: """ Returns the number of annotations in the annotation set. """ return len(self._annotations)
var span : Span
-
Returns a tuple with the start and end offset the corresponds to the smallest start offset of any annotation and the largest end offset of any annotation. (Builds the offset index)
Expand source code
@property def span(self) -> Span: """ Returns a tuple with the start and end offset the corresponds to the smallest start offset of any annotation and the largest end offset of any annotation. (Builds the offset index) """ if len(self._annotations) == 0: return Span(0, 0) self._create_index_by_offset() return Span(self._index_by_offset.min_start(), self._index_by_offset.max_end())
var start
-
Returns the smallest start offset of all annotations, i.e the start of the span of the whole set. This needs the index and creates it if necessary.
Throws
an exception if there are no annotations in the set.
Expand source code
@property def start(self): """ Returns the smallest start offset of all annotations, i.e the start of the span of the whole set. This needs the index and creates it if necessary. Throws: an exception if there are no annotations in the set. """ if self.size == 0: raise Exception("Annotation set is empty, cannot determine start offset") self._create_index_by_offset() return self._index_by_offset.min_start()
var type_names : KeysView[str]
-
Gets the names of all types in this set. Creates the type index if necessary.
Expand source code
@property def type_names(self) -> KeysView[str]: """ Gets the names of all types in this set. Creates the type index if necessary. """ self._create_index_by_type() return self._index_by_type.keys()
Methods
def __contains__(self, annorannid: Union[int, Annotation]) ‑> bool
-
Provides 'annotation in annotation_set' functionality.
Args
annorannid
- the annotation instance or annotation id to check. If this is an id, it is checked if the id appears in this AnnotationSet. If this is an annotation instance, then True is returned if the Annotation does have an owning set and the owning set is this AnnotationSet and the annotation id is present in this AnnotatioSet
Returns
True
if the annotation exists in the set,False
otherwiseExpand source code
def __contains__(self, annorannid: Union[int, Annotation]) -> bool: """ Provides 'annotation in annotation_set' functionality. Args: annorannid: the annotation instance or annotation id to check. If this is an id, it is checked if the id appears in this AnnotationSet. If this is an annotation instance, then True is returned if the Annotation does have an owning set and the owning set is this AnnotationSet and the annotation id is present in this AnnotatioSet Returns: `True` if the annotation exists in the set, `False` otherwise """ if isinstance(annorannid, Annotation): return annorannid in self._annset return ( annorannid in self._annotations ) # On the off chance someone passed an ID in directly
def __getitem__(self, annid)
-
Gets the annotation with the given annotation id or throws an exception.
Args
annid
- the annotation id
Returns
annotation
Expand source code
def __getitem__(self, annid): """ Gets the annotation with the given annotation id or throws an exception. Args: annid: the annotation id Returns: annotation """ return self._annotations[annid]
def __iter__(self) ‑> Iterator[+T_co]
-
Yields all the annotations of the set.
Important: using the iterator will always create the index if it is not already there! For fast iteration use fast_iter() which does not allow sorting or offset ranges.
Yields
the annotations in document order
Expand source code
def __iter__(self) -> Iterator: """ Yields all the annotations of the set. Important: using the iterator will always create the index if it is not already there! For fast iteration use fast_iter() which does not allow sorting or offset ranges. Yields: the annotations in document order """ # return iter(self._annotations.values()) return self.iter()
def __len__(self) ‑> int
-
Return number of annotations in the set.
Expand source code
def __len__(self) -> int: """ Return number of annotations in the set. """ return len(self._annotations)
def add(self, start: int, end: int, anntype: str, features: Dict[str, Any] = None, annid: int = None)
-
Adds an annotation to the set. Once an annotation has been added, the start and end offsets, the type, and the annotation id of the annotation are immutable.
If an annotation id is specified that already exists in the set, an exception is raised.
Args
start
- start offset
end
- end offset
anntype
- the annotation type
features
- a map, an iterable of tuples or an existing feature map. In any case, the features are used to create a new feature map for this annotation. If the map is empty or this parameter is None, the annotation does not store any map at all.
annid
- the annotation id, if not specified the next free one for this set is used. NOTE: the id should normally left unspecified and get assigned automatically.
Returns
the new annotation
Expand source code
@allowspan def add( self, start: int, end: int, anntype: str, features: Dict[str, Any] = None, annid: int = None, ): """ Adds an annotation to the set. Once an annotation has been added, the start and end offsets, the type, and the annotation id of the annotation are immutable. If an annotation id is specified that already exists in the set, an exception is raised. Args: start: start offset end: end offset anntype: the annotation type features: a map, an iterable of tuples or an existing feature map. In any case, the features are used to create a new feature map for this annotation. If the map is empty or this parameter is None, the annotation does not store any map at all. annid: the annotation id, if not specified the next free one for this set is used. NOTE: the id should normally left unspecified and get assigned automatically. Returns: the new annotation """ if annid is not None and not isinstance(annid, int): raise Exception("Parameter annid must be an int, mixed up with features?") if features is not None and isinstance(features, int): raise Exception( "Parameter features must not be an int: mixed up with annid?" ) if self._is_immutable: raise Exception("Cannot add an annotation to an immutable annotation set") self._check_offsets(start, end) if annid and annid in self._annotations: raise Exception( "Cannot add annotation with id {}, already in set".format(annid) ) if annid is None: annid = self._next_annid self._next_annid = self._next_annid + 1 ann = Annotation(start, end, anntype, features=features, annid=annid) ann._owner_set = self if not self._annotations: self._annotations = {} self._annotations[annid] = ann self._annset.add(ann) self._add_to_indices(ann) if self.changelog is not None: entry = { "command": "annotation:add", "set": self.name, "start": ann.start, "end": ann.end, "type": ann.type, "features": ann._features.to_dict(), "id": ann.id, } self.changelog.append(entry) return ann
def add_ann(self, ann, annid: int = None)
-
Adds a shallow copy of the given ann to the annotation set, either with a new annotation id or with the one given.
Args
ann
- the annotation to copy into the set
annid
- the annotation id, if not specified the next free one for this set is used. Note: the id should normally left unspecified and get assigned automatically.
Returns
the added annotation
Expand source code
def add_ann(self, ann, annid: int = None): """ Adds a shallow copy of the given ann to the annotation set, either with a new annotation id or with the one given. Args: ann: the annotation to copy into the set annid: the annotation id, if not specified the next free one for this set is used. Note: the id should normally left unspecified and get assigned automatically. Returns: the added annotation """ return self.add(ann.start, ann.end, ann.type, ann.features, annid=annid)
def add_anns(self, anns: Iterable[Annotation], annid_from_ann=False)
-
DEPRECATED: same as update.
Args
anns
- an iterable of Annotations
annid_from_ann
- if True, use the same annotation id as in the annotation, this will raise an exception if the set already contains and annotation with this id. If False assign a new id to the added annotation.
Expand source code
def add_anns(self, anns: Iterable[Annotation], annid_from_ann=False): """ DEPRECATED: same as update. Args: anns: an iterable of Annotations annid_from_ann: if True, use the same annotation id as in the annotation, this will raise an exception if the set already contains and annotation with this id. If False assign a new id to the added annotation. """ self.update(anns, annid_from_ann=annid_from_ann)
def after(self, start: int, end: int, ann: Optional[ForwardRef('Annotation')] = None, include_self: bool = False, immediately: bool = False)
-
Returns a detached annotation set with all annotations that start after the given span.
For each annotation ann in the result set, ann.isafter(span) is True.
Args
start
- start offset of the span
end
- end offset of the span
ann
- the annotation representing the span. (Default value = None)
include_self
- if True and the annotation id for the span is given, do not include that annotation in the result set.
immediately
- if True, the start offset of the annotations returned must coincide with the end offset of the span (default=False)
Returns
annotation set with all annotations that start after the given span
Expand source code
@support_annotation_or_set def after( self, start: int, end: int, ann: Optional["Annotation"] = None, include_self: bool = False, immediately: bool = False ): """ Returns a detached annotation set with all annotations that start after the given span. For each annotation ann in the result set, ann.isafter(span) is True. Args: start: start offset of the span end: end offset of the span ann: the annotation representing the span. (Default value = None) include_self: if True and the annotation id for the span is given, do not include that annotation in the result set. immediately: if True, the start offset of the annotations returned must coincide with the end offset of the span (default=False) Returns: annotation set with all annotations that start after the given span """ self._create_index_by_offset() if immediately: intvs = self._index_by_offset.starting_at(end) else: intvs = self._index_by_offset.starting_from(end) # we need to filter self if self is zero-length! if not include_self and ann is not None and ann in self: ignore_id = ann.id else: ignore_id = None return self._restrict_intvs(intvs, ignore_id=ignore_id)
def before(self, start: int, end: int, ann: Optional[ForwardRef('Annotation')] = None, include_self: bool = False, immediately: bool = False)
-
Returns a detached annotation set with all annotations that end before the given offsets.
For each annotation ann in the result set, ann.isbefore(span) is True.
Args
start
- start offset of the span
end
- end offset of the span
ann
- the annotation representing the span. (Default value = None)
include_self
- if True and the annotation id for the span is given, do not include that annotation in the result set.
immediately
- if True, the end offset of the annotations return must coincide with the start offset of the span (default=False)
Returns
annotation set with all annotations that end before the given span
Expand source code
@support_annotation_or_set def before( self, start: int, end: int, ann: Optional["Annotation"] = None, include_self: bool = False, immediately: bool = False ): """ Returns a detached annotation set with all annotations that end before the given offsets. For each annotation ann in the result set, ann.isbefore(span) is True. Args: start: start offset of the span end: end offset of the span ann: the annotation representing the span. (Default value = None) include_self: if True and the annotation id for the span is given, do not include that annotation in the result set. immediately: if True, the end offset of the annotations return must coincide with the start offset of the span (default=False) Returns: annotation set with all annotations that end before the given span """ self._create_index_by_offset() if immediately: intvs = self._index_by_offset.ending_at(start) else: intvs = self._index_by_offset.ending_to(start) # we need to filter self if self is zero-length! if not include_self and ann is not None and ann in self: ignore_id = ann.id else: ignore_id = None return self._restrict_intvs(intvs, ignore_id=ignore_id)
def by_offset(self)
-
Yields lists of annotations which start at the same offset.
Expand source code
def by_offset(self): """ Yields lists of annotations which start at the same offset. """ self._create_index_by_offset() lastoff = -1 curlist = [] for ann in self.iter(): if ann.start != lastoff: if lastoff != -1: yield curlist lastoff = ann.start curlist = [ann] else: curlist.append(ann) if lastoff != -1: yield curlist
def by_span(self)
-
Yields list of annotations with identical spans. Note: first needs to sort all annotations!
Expand source code
def by_span(self): """ Yields list of annotations with identical spans. Note: first needs to sort all annotations! """ self._create_index_by_offset() lastsoff = -1 lasteoff = -1 curlist = [] for ann in self.iter_ol(): if ann.start != lastsoff or ann.end != lasteoff: if lastsoff != -1: yield curlist lastsoff = ann.start lasteoff = ann.end curlist = [ann] else: curlist.append(ann) if lastsoff != -1: yield curlist
def clear(self, reset_annids=False) ‑> None
-
Removes all annotations from the set.
Args
reset_annids
- if True, also reset the next annotation id to 0, after this newly added annotations will get annotation ids starting from 0. IMPORTANT: this must not be used for code to run in the Java GATE Python plugin, as Java GATE handles annotation ids differently!
Expand source code
def clear(self, reset_annids=False) -> None: """ Removes all annotations from the set. Args: reset_annids: if True, also reset the next annotation id to 0, after this newly added annotations will get annotation ids starting from 0. IMPORTANT: this must not be used for code to run in the Java GATE Python plugin, as Java GATE handles annotation ids differently! """ self._annotations.clear() self._annset.clear() if reset_annids: self._next_annid = 0 self._index_by_offset = None self._index_by_type = None if self.changelog is not None: self.changelog.append({"command": "annotations:clear", "set": self.name})
def clone_anns(self, memo=None)
-
Replaces the annotations in this set with deep copies of the originals. If this is a detached set, then this makes sure that any modifications to the annotations do not affect the original annotations in the attached set. If this is an attached set, it makes sure that all other detached sets cannot affect the annotations in this set any more. The owning set of the annotations that get cloned is cleared.
Args
memo
- for internal use by our deepcopy implementation.
Expand source code
def clone_anns(self, memo=None): """ Replaces the annotations in this set with deep copies of the originals. If this is a detached set, then this makes sure that any modifications to the annotations do not affect the original annotations in the attached set. If this is an attached set, it makes sure that all other detached sets cannot affect the annotations in this set any more. The owning set of the annotations that get cloned is cleared. Args: memo: for internal use by our __deepcopy__ implementation. """ tmpdict = {} for annid, ann in self._annotations.items(): newann = copy.deepcopy(ann, memo=memo) ann._owner_set = None tmpdict[annid] = newann for annid, ann in tmpdict.items(): self._annset.remove(self._annotations[annid]) self._annotations[annid] = ann self._annset.add(ann)
def coextensive(self, start: int, end: int, ann: Optional[ForwardRef('Annotation')] = None, include_self: bool = False)
-
Returns a detached annotation set with all annotations that start and end at the given offsets.
For each annotation ann in the result set, ann.coextensive(span) is True.
Args
start
- start offset of the span
end
- end offset of the span
ann
- the annotation representing the span. (Default value = None)
include_self
- if True and the annotation for the span is given, do not include that annotation in the result set.
Returns
annotation set with all annotations that have the same start and end offsets.
Expand source code
@support_annotation_or_set def coextensive(self, start: int, end: int, ann: Optional["Annotation"] = None, include_self: bool = False): """ Returns a detached annotation set with all annotations that start and end at the given offsets. For each annotation ann in the result set, ann.coextensive(span) is True. Args: start: start offset of the span end: end offset of the span ann: the annotation representing the span. (Default value = None) include_self: if True and the annotation for the span is given, do not include that annotation in the result set. Returns: annotation set with all annotations that have the same start and end offsets. """ self._create_index_by_offset() intvs = self._index_by_offset.at(start, end) if not include_self and ann is not None and ann in self: ignore_id = ann.id else: ignore_id = None return self._restrict_intvs(intvs, ignore_id=ignore_id)
def contains(self, annorannid: Union[int, Annotation]) ‑> bool
-
Provides 'annotation in annotation_set' functionality.
Args
annorannid
- the annotation instance or annotation id to check. If this is an id, it is checked if the id appears in this AnnotationSet. If this is an annotation instance, then True is returned if the Annotation does have an owning set and the owning set is this AnnotationSet and the annotation id is present in this AnnotatioSet
Returns
True
if the annotation exists in the set,False
otherwiseExpand source code
def __contains__(self, annorannid: Union[int, Annotation]) -> bool: """ Provides 'annotation in annotation_set' functionality. Args: annorannid: the annotation instance or annotation id to check. If this is an id, it is checked if the id appears in this AnnotationSet. If this is an annotation instance, then True is returned if the Annotation does have an owning set and the owning set is this AnnotationSet and the annotation id is present in this AnnotatioSet Returns: `True` if the annotation exists in the set, `False` otherwise """ if isinstance(annorannid, Annotation): return annorannid in self._annset return ( annorannid in self._annotations ) # On the off chance someone passed an ID in directly
def copy(self)
-
Returns a shallow copy of the annotation set.
Expand source code
def copy(self): """ Returns a shallow copy of the annotation set. """ return self.__copy__()
def covering(self, start: int, end: int, ann: Optional[ForwardRef('Annotation')] = None, include_self: bool = False)
-
Gets the annotations which contain the given offset range (or annotation/annotation set), i.e. annotations such that the given offset range is within the annotation.
For each annotation ann in the result set, ann.covering(span) is True.
Args
start
- the start offset of the span
end
- the end offset of the span
ann
- the annotation representing the span. (Default value = None)
include_self
- if True and the annotation for the span is given, do not include that annotation in the result set. (Default value = False)
Returns
an immutable annotation set with the matching annotations, if any
Expand source code
@support_annotation_or_set def covering(self, start: int, end: int, ann: Optional["Annotation"] = None, include_self: bool = False): """ Gets the annotations which contain the given offset range (or annotation/annotation set), i.e. annotations such that the given offset range is within the annotation. For each annotation ann in the result set, ann.covering(span) is True. Args: start: the start offset of the span end: the end offset of the span ann: the annotation representing the span. (Default value = None) include_self: if True and the annotation for the span is given, do not include that annotation in the result set. (Default value = False) Returns: an immutable annotation set with the matching annotations, if any """ self._create_index_by_offset() intvs = self._index_by_offset.covering(start, end) if not include_self and ann is not None and ann in self: ignore_id = ann.id else: ignore_id = None return self._restrict_intvs(intvs, ignore_id=ignore_id)
def deepcopy(self)
-
Returns a deep copy of the annotation set.
Expand source code
def deepcopy(self): """ Returns a deep copy of the annotation set. """ return copy.deepcopy(self)
def detach(self, restrict_to=None)
-
Creates an immutable and detached copy of this set, optionally restricted to the given annotation ids. A detached annotation set does not have an owning document and deleting or adding annotations does not change the annotations stored with the document. However, the annotations in a detached annotation set are the same as those stored in the attached set, so updating their features will modify the annotations in the document as well.
Args
restrict_to
- an iterable of annotation ids, if None, all the annotations from this set.
Returns
an immutable annotation set
Expand source code
def detach(self, restrict_to=None): """ Creates an immutable and detached copy of this set, optionally restricted to the given annotation ids. A detached annotation set does not have an owning document and deleting or adding annotations does not change the annotations stored with the document. However, the annotations in a detached annotation set are the same as those stored in the attached set, so updating their features will modify the annotations in the document as well. Args: restrict_to: an iterable of annotation ids, if None, all the annotations from this set. Returns: an immutable annotation set """ annset = AnnotationSet(name="detached-from:" + self.name) annset._is_immutable = True if restrict_to is None: annset._annotations = { annid: self._annotations[annid] for annid in self._annotations.keys() } else: annset._annotations = { annid: self._annotations[annid] for annid in restrict_to } annset._annset.update(annset._annotations.values()) annset._next_annid = self._next_annid return annset
def detach_from(self, anns: Iterable[+T_co])
-
Creates an immutable detached annotation set from the annotations in anns which could by either a collection of annotations or annotation ids (int numbers) which are assumed to be the annotation ids from this set.
The next annotation id for the created set is the highest seen annotation id from anns plus one.
Args
anns
- an iterable of annotations
Returns
an immutable detached annotation set
Expand source code
def detach_from(self, anns: Iterable): """ Creates an immutable detached annotation set from the annotations in anns which could by either a collection of annotations or annotation ids (int numbers) which are assumed to be the annotation ids from this set. The next annotation id for the created set is the highest seen annotation id from anns plus one. Args: anns: an iterable of annotations Returns: an immutable detached annotation set """ annset = AnnotationSet(name="detached-from:" + self.name) annset._is_immutable = True annset._annotations = {} nextid = -1 for ann in anns: if isinstance(ann, int): annset._annotations[ann] = self._annotations[ann] annid = ann else: annset._annotations[id] = ann annid = ann.id if annid > nextid: nextid = annid annset._next_annid = nextid + 1 annset._annset.update(annset._annotations.values()) return annset
def fast_iter(self) ‑> Generator[+T_co, -T_contra, +V_co]
-
Yields annotations in insertion order. This is faster then the default iterator and does not need to index (so if the index does not exist, it will not be built).
Expand source code
def fast_iter(self) -> Generator: """ Yields annotations in insertion order. This is faster then the default iterator and does not need to index (so if the index does not exist, it will not be built). """ if self._annotations: for annid, ann in self._annotations.items(): yield ann
def first(self)
-
Return the first (or only) annotation in the set by offset.
Returns
first annotation
Expand source code
def first(self): """ Return the first (or only) annotation in the set by offset. Returns: first annotation """ sz = len(self._annotations) if sz == 0: raise Exception("Empty set, there is no first annotation") elif sz == 1: return next(iter(self._annotations.values())) self._create_index_by_offset() _, _, annid = next(self._index_by_offset.irange(reverse=False)) return self._annotations[annid]
def for_idx(self, idx, default=None)
-
Return the annotation corresponding to the index idx in the set. This returns the annotation stored at the index, as added to the set. The order usually depends on the insertion time. If no annotation with the given index is specified, the value specified for
default
is returned.Args
idx
- index of the annotation in the set
default
- default value to return if now annotation with the given index exists
Returns
the annotation with the given index or the default value
Expand source code
def for_idx(self, idx, default=None): """ Return the annotation corresponding to the index idx in the set. This returns the annotation stored at the index, as added to the set. The order usually depends on the insertion time. If no annotation with the given index is specified, the value specified for `default` is returned. Args: idx: index of the annotation in the set default: default value to return if now annotation with the given index exists Returns: the annotation with the given index or the default value """ # TODO: we could make this more memory efficient (but slower) by # iterating over values until getting idxth tmplist = list(self._annotations.values()) if idx < len(tmplist): return tmplist[idx] else: return default
def get(self, annid: Union[int, Annotation], default=None) ‑> Optional[Annotation]
-
Gets the annotation with the given annotation id or returns the given default.
NOTE: for handling cases where legacy code still expects the add method to return an id and not the annotation, this will accept an annotation so the the frequent pattern still works:
annid = annset.add(b,e,t).id ann = annset.get(annid)
If an annotation is passed the annotation from the set with the id of that annotation is returned, if the annotation is from that set, this will return the same object, if it is still in the set (or return the default value).
Args
annid
- the annotation id of the annotation to retrieve.
default
- what to return if an annotation with the given id is not found. (Default value = None)
annid
- Union[int:
Annotation]:
Returns
the annotation or the default value.
Expand source code
def get( self, annid: Union[int, Annotation], default=None ) -> Union[Annotation, None]: """ Gets the annotation with the given annotation id or returns the given default. NOTE: for handling cases where legacy code still expects the add method to return an id and not the annotation, this will accept an annotation so the the frequent pattern still works: annid = annset.add(b,e,t).id ann = annset.get(annid) If an annotation is passed the annotation from the set with the id of that annotation is returned, if the annotation is from that set, this will return the same object, if it is still in the set (or return the default value). Args: annid: the annotation id of the annotation to retrieve. default: what to return if an annotation with the given id is not found. (Default value = None) annid: Union[int: Annotation]: Returns: the annotation or the default value. """ if isinstance(annid, Annotation): annid = annid.id return self._annotations.get(annid, default)
def isdetached(self) ‑> bool
-
Returns True if the annotation set is detached, False otherwise.
Expand source code
def isdetached(self) -> bool: """ Returns True if the annotation set is detached, False otherwise. """ return self._owner_doc is None
def iter(self, start_ge: Optional[int] = None, start_lt: Optional[int] = None, with_type: str = None, reverse: bool = False) ‑> Generator[+T_co, -T_contra, +V_co]
-
Default iterator. Yields annotations ordered by increasing starting annotation offset and increasing annotation id, otionally limited by the other parameters.
Args
start_ge
- the offset from where to start including annotations
start_lt
- the last offset to use as the starting offset of an annotation
with_type
- only annotations of this type
reverse
- process in reverse document order
Yields
Annotations in default document order, or reverse document order
Expand source code
def iter( self, start_ge: Union[int, None] = None, start_lt: Union[None, int] = None, with_type: str = None, reverse: bool = False, ) -> Generator: """ Default iterator. Yields annotations ordered by increasing starting annotation offset and increasing annotation id, otionally limited by the other parameters. Args: start_ge: the offset from where to start including annotations start_lt: the last offset to use as the starting offset of an annotation with_type: only annotations of this type reverse: process in reverse document order Yields: Annotations in default document order, or reverse document order """ if with_type is not None: allowedtypes = set() if isinstance(type, str): allowedtypes.add(with_type) else: for atype in with_type: allowedtypes.add(atype) else: allowedtypes = None if not self._annotations: return maxoff = None if start_ge is not None: assert start_ge >= 0 if start_lt is not None: assert start_lt >= 1 maxoff = start_lt + 1 if start_lt is not None and start_ge is not None: assert start_lt > start_ge self._create_index_by_offset() for _start, _end, annid in self._index_by_offset.irange( minoff=start_ge, maxoff=maxoff, reverse=reverse ): if ( allowedtypes is not None and self._annotations[annid].type not in allowedtypes ): continue yield self._annotations[annid]
def iter_ol(self, start_ge: Optional[int] = None, start_lt: Optional[int] = None, with_type: str = None, reverse: bool = False) ‑> Generator[+T_co, -T_contra, +V_co]
-
Offset-Length Iterator. Yields annotations ordered by increasing start offset, by increasing end offset and increasing annotoation id, otionally limited by the other parameters.
Args
start_ge
- the offset from where to start including annotations
start_lt
- the last offset to use as the starting offset of an annotation
with_type
- only annotations of this type
reverse
- process in reverse document order
Yields
Annotations ordered by offset and length.
Expand source code
def iter_ol( self, start_ge: Union[int, None] = None, start_lt: Union[None, int] = None, with_type: str = None, reverse: bool = False, ) -> Generator: """ Offset-Length Iterator. Yields annotations ordered by increasing start offset, by increasing end offset and increasing annotoation id, otionally limited by the other parameters. Args: start_ge: the offset from where to start including annotations start_lt: the last offset to use as the starting offset of an annotation with_type: only annotations of this type reverse: process in reverse document order Yields: Annotations ordered by offset and length. """ if with_type is not None: allowedtypes = set() if isinstance(type, str): allowedtypes.add(with_type) else: for atype in with_type: allowedtypes.add(atype) else: allowedtypes = None if not self._annotations: return maxoff = None if start_ge is not None: assert start_ge >= 0 if start_lt is not None: assert start_lt >= 1 maxoff = start_lt + 1 if start_lt is not None and start_ge is not None: assert start_lt > start_ge self._create_index_by_ol() for _start, _end, annid in self._index_by_ol.irange( minoff=start_ge, maxoff=maxoff, reverse=reverse ): if ( allowedtypes is not None and self._annotations[annid].type not in allowedtypes ): continue yield self._annotations[annid]
def last(self)
-
Return the last (or only) annotation by offset.
Returns
last annotation
Expand source code
def last(self): """ Return the last (or only) annotation by offset. Returns: last annotation """ sz = len(self._annotations) if sz == 0: raise Exception("Empty set, there is no last annotation") elif sz == 1: return next(iter(self._annotations.values())) self._create_index_by_offset() _, _, annid = next(self._index_by_offset.irange(reverse=True)) return self._annotations[annid]
def overlapping(self, start: int, end: int, ann: Optional[ForwardRef('Annotation')] = None, include_self: bool = False)
-
Gets annotations overlapping with the given span. Instead of the start and end offsets, also accepts an annotation or annotation set.
For each annotation ann in the result set, ann.overlapping(span) is True
Args
start
- start offset of the span
end
- end offset of the span
ann
- the annotation that is passed to this function for checking if it is included in the result.
include_self
- if True and the annotation for the span is given, do not include that annotation in the result set.
Returns
an immutable annotation set with the matching annotations
Expand source code
@support_annotation_or_set def overlapping(self, start: int, end: int, ann: Optional["Annotation"] = None, include_self: bool = False): """ Gets annotations overlapping with the given span. Instead of the start and end offsets, also accepts an annotation or annotation set. For each annotation ann in the result set, ann.overlapping(span) is True Args: start: start offset of the span end: end offset of the span ann: the annotation that is passed to this function for checking if it is included in the result. include_self: if True and the annotation for the span is given, do not include that annotation in the result set. Returns: an immutable annotation set with the matching annotations """ self._create_index_by_offset() intvs = self._index_by_offset.overlapping(start, end) if not include_self and ann is not None and ann in self: ignore_id = ann.id else: ignore_id = None return self._restrict_intvs(intvs, ignore_id=ignore_id)
def remove(self, annoriter: Union[int, Annotation, Iterable[+T_co]], raise_on_notexisting=True) ‑> None
-
Removes the given annotation which is either the id or the annotation instance or recursively all annotations in the iterable.
Throws
exception if the annotation set is immutable or the annotation is not in the set
Args
annoriter
- either the id (int) or the annotation instance (Annotation) or an iterable of id or annotation instance or iterable …
raise_on_notexisting
- (default: True) if false, silently accepts non-existing annotations/ids and does nothing. Note: if this is True, but the annotation set is immutable, an Exception is still raised.
Expand source code
def remove( self, annoriter: Union[int, Annotation, Iterable], raise_on_notexisting=True ) -> None: """ Removes the given annotation which is either the id or the annotation instance or recursively all annotations in the iterable. Throws: exception if the annotation set is immutable or the annotation is not in the set Args: annoriter: either the id (int) or the annotation instance (Annotation) or an iterable of id or annotation instance or iterable ... raise_on_notexisting: (default: True) if false, silently accepts non-existing annotations/ids and does nothing. Note: if this is True, but the annotation set is immutable, an Exception is still raised. """ if self._is_immutable: raise Exception( "Cannot remove an annotation from an immutable annotation set" ) if isinstance(annoriter, abc_Iterable): for a in annoriter: self.remove(a, raise_on_notexisting=raise_on_notexisting) return annid = None # make pycharm happy if isinstance(annoriter, int): annid = annoriter if annid not in self._annotations: raise Exception( "Annotation with id {} not in annotation set, cannot remove".format( annid ) ) ann = self._annotations[annid] elif isinstance(annoriter, Annotation): annid = annoriter.id if annid not in self._annotations: raise Exception( "Annotation with id {} does not belong to this set, cannot remove".format( annid ) ) ann = annoriter else: raise Exception("Should never happen!") # NOTE: once the annotation has been removed from the set, it could # still be referenced # somewhere else and its features could get modified. In order to # prevent logging of such changes, # the owning set gets cleared for the annotation ann._owner_set = None del self._annotations[annid] self._annset.remove(ann) if self.changelog is not None: self.changelog.append( {"command": "annotation:remove", "set": self.name, "id": annid} ) self._remove_from_indices(ann)
def reverse_iter(self, **kwargs)
-
Same as iter, but with the reverse parameter set to true.
Args
kwargs
- Same as for iter(), with revers=True fixed.
**kwargs
- will get passed on the Annotation.iter
Returns
same result as iter()
Expand source code
def reverse_iter(self, **kwargs): """ Same as iter, but with the reverse parameter set to true. Args: kwargs: Same as for iter(), with revers=True fixed. **kwargs: will get passed on the Annotation.iter Returns: same result as iter() """ return self.iter(reverse=True, **kwargs)
def start_ge(self, start: int, ann: Optional[ForwardRef('Annotation')] = None, include_self: bool = False)
-
Return the annotations that start at or after the given start offset.
Args
start
- Start offset
_end
- unusued/ignored end offset
ann
- any Annotation passed
include_self
- should annotation passed be included in the result
Returns
an immutable annotation set of the matching annotations
Expand source code
@support_annotation_or_set def start_ge(self, start: int, _end: Any = None, ann: Optional["Annotation"] = None, include_self: bool = False): """ Return the annotations that start at or after the given start offset. Args: start: Start offset _end: unusued/ignored end offset ann: any Annotation passed include_self: should annotation passed be included in the result Returns: an immutable annotation set of the matching annotations """ self._create_index_by_offset() intvs = self._index_by_offset.starting_from(start) if not include_self and ann is not None and ann in self: ignore_id = ann.id else: ignore_id = None return self._restrict_intvs(intvs, ignore_id=ignore_id)
def start_lt(self, offset: int, ann: Any = None)
-
Returns the annotations that start before the given offset (or annotation). This also accepts an annotation or set.
Args
offset
- offset before which the annotations should start
_end
- unused/ignored end offset
ann
- unised/ignored Annotation passed (can never be included!)
Returns
an immutable annotation set of the matching annotations
Expand source code
@support_annotation_or_set def start_lt(self, offset: int, _end: Any = None, ann: Any = None): """ Returns the annotations that start before the given offset (or annotation). This also accepts an annotation or set. Args: offset: offset before which the annotations should start _end: unused/ignored end offset ann: unised/ignored Annotation passed (can never be included!) Returns: an immutable annotation set of the matching annotations """ self._create_index_by_offset() intvs = self._index_by_offset.starting_before(offset) return self._restrict_intvs(intvs)
def start_min_ge(self, offset: int, ann: Optional[ForwardRef('Annotation')] = None, include_self: bool = False)
-
Gets all annotations starting at the first possible offset at or after the given offset and returns them in an immutable annotation set.
Args
offset
- The offset
_end
- unused/ignored end offset
ann
- any Annotation that was passed
include_self
- should annotation passed be included in the result
Returns
annotation set of matching annotations
Expand source code
@support_annotation_or_set def start_min_ge( self, offset: int, _end: Any = None, ann: Optional["Annotation"] = None, include_self: bool = False ): """Gets all annotations starting at the first possible offset at or after the given offset and returns them in an immutable annotation set. Args: offset: The offset _end: unused/ignored end offset ann: any Annotation that was passed include_self: should annotation passed be included in the result Returns: annotation set of matching annotations """ self._create_index_by_offset() intvs = self._index_by_offset.starting_from(offset) # now select only those first ones which all have the same offset if not include_self and ann is not None and ann in self: ignore_id = ann.id else: ignore_id = None retids = set() startoff = None for intv in intvs: if startoff is None: startoff = intv[0] if ignore_id is not None: if ignore_id != intv[2]: retids.add(intv[2]) else: retids.add(intv[2]) elif startoff == intv[0]: if ignore_id is not None: if ignore_id != intv[2]: retids.add(intv[2]) else: retids.add(intv[2]) else: break return self.detach(restrict_to=retids)
def startingat(self, start: int, ann=None, include_self: bool = False)
-
Gets all annotations starting at the given offset (empty if none) and returns them in a detached annotation set.
Note: this can be called with an annotation or annotation set instead of the start offset. If called with an annotation, this annotation is not included in the result set if
include_self
isFalse
Args
start
- the offset where annotations should start
_end
- unused/ignored end offset
ann
- any annotation that was specified instead of just the offset
include_self
- should annotation passed be included in the result
Returns
detached annotation set of matching annotations
Expand source code
@support_annotation_or_set def startingat( self, start: int, _end: Any = None, ann=None, include_self: bool = False ): """ Gets all annotations starting at the given offset (empty if none) and returns them in a detached annotation set. Note: this can be called with an annotation or annotation set instead of the start offset. If called with an annotation, this annotation is not included in the result set if `include_self` is `False` Args: start: the offset where annotations should start _end: unused/ignored end offset ann: any annotation that was specified instead of just the offset include_self: should annotation passed be included in the result Returns: detached annotation set of matching annotations """ self._create_index_by_offset() intvs = self._index_by_offset.starting_at(start) if not include_self and ann is not None and ann in self: ignore_id = ann.id else: ignore_id = None return self._restrict_intvs(intvs, ignore_id=ignore_id)
def to_dict(self, anntypes=None, **kwargs)
-
Convert an annotation set to its dict representation.
Args
anntypes
- if not None, an iterable of annotation types to include
**kwargs
- passed on to the dict creation of contained annotations.
Returns
the dict representation of the annotation set.
Expand source code
def to_dict(self, anntypes=None, **kwargs): """ Convert an annotation set to its dict representation. Args: anntypes: if not None, an iterable of annotation types to include **kwargs: passed on to the dict creation of contained annotations. Returns: the dict representation of the annotation set. """ if anntypes is not None: anntypesset = set(anntypes) anns_list = list( val.to_dict(**kwargs) for val in self._annotations.values() if val.type in anntypesset ) else: anns_list = list( val.to_dict(**kwargs) for val in self._annotations.values() ) return { # NOTE: Changelog is not getting added as it is stored in the document part! "name": self.name, "annotations": anns_list, "next_annid": self._next_annid, }
def update(self, anns: Iterable[Annotation], annid_from_ann=False)
-
Adds shallow copies of all annotations from the iterable to the set.
Args
anns
- an iterable of Annotations
annid_from_ann
- if True, use the same annotation id as in the annotation, this will raise an exception if the set already contains and annotation with this id. If False assign a new id to the added annotation.
Expand source code
def update(self, anns: Iterable[Annotation], annid_from_ann=False): """ Adds shallow copies of all annotations from the iterable to the set. Args: anns: an iterable of Annotations annid_from_ann: if True, use the same annotation id as in the annotation, this will raise an exception if the set already contains and annotation with this id. If False assign a new id to the added annotation. """ for ann in anns: if annid_from_ann: self.add(ann.start, ann.end, ann.type, ann.features, annid=ann.id) else: self.add(ann.start, ann.end, ann.type, ann.features)
def with_type(self, *anntype: Union[str, Iterable[+T_co]], non_overlapping: bool = False)
-
Gets annotations of the specified type(s). Creates the type index if necessary.
Args
anntype
- one or more types or type lists. The union of all types specified that way is used to filter the annotations. If no type is specified, an empty detached set is returned.
non_overlapping
- if True, only return annotations of any of the given types which do not overlap with other annotations. If there are several annotations that start at the same offset, use the type that comes first in the parameters, if there are more than one of that type, use the one that would come first in the usual sort order.
Returns
a detached immutable annotation set with the matching annotations.
Expand source code
def with_type(self, *anntype: Union[str, Iterable], non_overlapping: bool = False): """ Gets annotations of the specified type(s). Creates the type index if necessary. Args: anntype: one or more types or type lists. The union of all types specified that way is used to filter the annotations. If no type is specified, an empty detached set is returned. non_overlapping: if True, only return annotations of any of the given types which do not overlap with other annotations. If there are several annotations that start at the same offset, use the type that comes first in the parameters, if there are more than one of that type, use the one that would come first in the usual sort order. Returns: a detached immutable annotation set with the matching annotations. """ atypes = [] for atype in anntype: if isinstance(atype, str): atypes.append(atype) else: for t in atype: atypes.append(t) if not atypes: return self.detach(restrict_to=[]) self._create_index_by_type() annids = set() for t in atypes: idxs = self._index_by_type.get(t) if idxs: annids.update(idxs) if non_overlapping: # need to get annotations grouped by start offset and sorted according to # what the Annotation class defines allanns = sorted(annids, key=lambda x: self._annotations[x]) allanns = [self._annotations[x] for x in allanns] allannsgrouped = [] curstart = None curset = None for ann in allanns: if curstart is None: curset = [ann] curstart = ann.start elif curstart == ann.start: curset.append(ann) else: allannsgrouped.append(curset) curset = [ann] curstart = ann.start if curset: allannsgrouped.append(curset) retanns = [] # now go through all the grouped annoations and select the top priority one # then skip to the next group that does not overlap with the one we just selected typepriority = dict() for i, atype in enumerate(atypes): typepriority[atype] = len(atypes) - i curminoffset = 0 for group in allannsgrouped: # instead of sorting, go through the group and find the top priority one topann = None if len(group) == 1: if group[0].start >= curminoffset: topann = group[0] elif len(group) == 0: raise Exception("We should never get a 0 size group here!") else: i = 0 for i, ann in enumerate(group): if ann.start >= curminoffset: topann = ann break for ann in group[i + 1:]: if ann.start < curminoffset: continue if typepriority[ann.type] > typepriority[topann.type]: topann = ann elif typepriority[ann.type] == typepriority[topann.type]: if ann.end > topann.end: topann = ann elif ann.end == topann.end: if ann.id > topann.id: topann = ann if topann is not None: retanns.append(topann) curminoffset = topann.end annids = [ann.id for ann in retanns] return self.detach(restrict_to=annids)
def within(self, start: int, end: int, ann: Optional[ForwardRef('Annotation')] = None, include_self: bool = False)
-
Gets annotations that fall completely within the given offset range, i.e. annotations such that the offset range is covering each of the annotation.
For each annotation ann in the result set, ann.within(span) is True.
Args
start
- start offset of the range
end
- end offset of the range
ann
- the annotation representing the span. (Default value = None)
include_self
- if True and the annotation for the span is given, do not include that annotation in the result set. (Default value = False)
Returns
an immutable annotation set with the matching annotations
Expand source code
@support_annotation_or_set def within(self, start: int, end: int, ann: Optional["Annotation"] = None, include_self: bool = False): """ Gets annotations that fall completely within the given offset range, i.e. annotations such that the offset range is covering each of the annotation. For each annotation ann in the result set, ann.within(span) is True. Args: start: start offset of the range end: end offset of the range ann: the annotation representing the span. (Default value = None) include_self: if True and the annotation for the span is given, do not include that annotation in the result set. (Default value = False) Returns: an immutable annotation set with the matching annotations """ if start > end: raise Exception("Invalid offset range: {},{}".format(start, end)) else: self._create_index_by_offset() intvs = self._index_by_offset.within(start, end) if not include_self and ann is not None and ann in self: ignore_id = ann.id else: ignore_id = None return self._restrict_intvs(intvs, ignore_id=ignore_id)
class InvalidOffsetError (*args, **kwargs)
-
Error that indicates some invalid offset in an operation.
Expand source code
class InvalidOffsetError(KeyError): """ Error that indicates some invalid offset in an operation. """ pass
Ancestors
- builtins.KeyError
- builtins.LookupError
- builtins.Exception
- builtins.BaseException