Module `gatenlp.annotation_utils`

Module defining several utility functions for annotating documents in various ways.

Expand source code

"""
Module defining several utility functions for annotating documents in various ways.
"""
from typing import List, Optional


def annotate_substrings(doc,
                        substrings: List[str],
                        outset_name: str = "",
                        featureslist: Optional[List[dict]] = None,
                        ann_type: str = "Token",
                        annotate_gaps: bool = False,
                        gap_type: str = "SpaceToken",
                        raise_if_unmatched: bool = True,
                        from_offset: int = 0,
                        to_offset: Optional[int] = None
                        ):
    """
    Annotate the document by matching the strings in the substrings list, in order, to successive locations
    in the document text. Each substring is searched for starting at the end of the previously matched
    substring, so repeated substrings are matched to successive occurrences. If the features list is not
    None it must have the same length as substrings and contain one dict of features per substring, which
    is assigned to the annotation created for that substring. If annotate_gaps is True, any text between
    the end of the previous match (or from_offset for the first match) and the start of the next match is
    annotated with the gap_type type.

    Args:
        doc: the document to annotate
        substrings: a list of substrings to match
        outset_name: the name of the output annotation set, passed to doc.annset()
        featureslist: if not None a list of dicts which are used as features for the annotations created
        ann_type: the type of the annotations created for matching substrings
        annotate_gaps: if True, the gaps between matching substrings are annotated using the gap_type
        gap_type: the type to use for gap annotations
        raise_if_unmatched: if True and a substring in the substrings list cannot be matched, an exception is raised,
            otherwise, the unmatchable substring is ignored.
        from_offset: the offset where to start matching
        to_offset: the offset before which a matching substring must end (if None, the end of the document)

    Returns:
        the annotated doc, identical to the document passed

    Raises:
        AssertionError: if featureslist is given but its length differs from substrings, or if the offsets
            do not satisfy from_offset < to_offset <= len(doc.text)
        Exception: if raise_if_unmatched is True and a substring cannot be found
    """
    outset = doc.annset(outset_name)
    if featureslist is not None:
        assert len(featureslist) == len(substrings)
    else:
        # a fresh dict per substring so annotations never share a features object
        featureslist = [{} for _ in substrings]
    if to_offset is None:
        to_offset = len(doc.text)
    assert from_offset < to_offset
    assert to_offset <= len(doc.text)
    last_end = from_offset
    for substring, features in zip(substrings, featureslist):
        # str.find restricted to [last_end, to_offset) only reports matches that lie
        # completely inside that range, so a match can never extend past to_offset.
        idx = doc.text.find(substring, last_end, to_offset)
        if idx < 0:  # not found
            if raise_if_unmatched:
                raise Exception(f"Unmatched string '{substring}' in {doc.text} from {last_end} to {to_offset}")
            # unmatched substrings are skipped; the search position stays where it was
            continue
        end = idx + len(substring)
        if annotate_gaps and idx > last_end:
            outset.add(last_end, idx, gap_type)
        outset.add(idx, end, ann_type, features=features)
        # continue searching after this match so that later substrings (including
        # repeated ones) are matched in document order and gaps do not overlap
        last_end = end
    return doc

Functions

def annotate_substrings(doc, substrings: List[str], outset_name: str = '', featureslist: Optional[List[str]] = None, ann_type: str = 'Token', annotate_gaps: bool = False, gap_type: str = 'SpaceToken', raise_if_unmatched: bool = True, from_offset: int = 0, to_offset: Optional[int] = None)

Annotate the document by matching the text substrings in the substrings list to the corresponding locations in the text. If the features list is not None it must be of equal length as substrings and contain a dict of features to assign to the annotation to create. If annotate_gaps is True, the gaps between matched substrings will be annotated with the gap_type type.

Args

doc: the document to annotate
outset_name: the name of the output annotation set (default set)
substrings: a list of substrings to match
featureslist: if not None a list of dicts which are used as features for the annotations created
ann_type: the type of the annotations created for matching substrings
annotate_gaps: if True, the gaps between matching substrings are annotated using the gap_type
gap_type: the type to use for gap annotations
raise_if_unmatched: if True and a substring in the substrings list cannot be matched, an exception is raised, otherwise, the unmatchable substring is ignored.
from_offset: the offset where to start matching
to_offset: the offset before which a matching substring must end (if None, the end of the document)

Returns

the annotated doc, identical to the document passed

Expand source code

def annotate_substrings(doc,
                        substrings: List[str],
                        outset_name: str = "",
                        featureslist: Optional[List[dict]] = None,
                        ann_type: str = "Token",
                        annotate_gaps: bool = False,
                        gap_type: str = "SpaceToken",
                        raise_if_unmatched: bool = True,
                        from_offset: int = 0,
                        to_offset: Optional[int] = None
                        ):
    """
    Annotate the document by matching the strings in the substrings list, in order, to successive locations
    in the document text. Each substring is searched for starting at the end of the previously matched
    substring, so repeated substrings are matched to successive occurrences. If the features list is not
    None it must have the same length as substrings and contain one dict of features per substring, which
    is assigned to the annotation created for that substring. If annotate_gaps is True, any text between
    the end of the previous match (or from_offset for the first match) and the start of the next match is
    annotated with the gap_type type.

    Args:
        doc: the document to annotate
        substrings: a list of substrings to match
        outset_name: the name of the output annotation set, passed to doc.annset()
        featureslist: if not None a list of dicts which are used as features for the annotations created
        ann_type: the type of the annotations created for matching substrings
        annotate_gaps: if True, the gaps between matching substrings are annotated using the gap_type
        gap_type: the type to use for gap annotations
        raise_if_unmatched: if True and a substring in the substrings list cannot be matched, an exception is raised,
            otherwise, the unmatchable substring is ignored.
        from_offset: the offset where to start matching
        to_offset: the offset before which a matching substring must end (if None, the end of the document)

    Returns:
        the annotated doc, identical to the document passed

    Raises:
        AssertionError: if featureslist is given but its length differs from substrings, or if the offsets
            do not satisfy from_offset < to_offset <= len(doc.text)
        Exception: if raise_if_unmatched is True and a substring cannot be found
    """
    outset = doc.annset(outset_name)
    if featureslist is not None:
        assert len(featureslist) == len(substrings)
    else:
        # a fresh dict per substring so annotations never share a features object
        featureslist = [{} for _ in substrings]
    if to_offset is None:
        to_offset = len(doc.text)
    assert from_offset < to_offset
    assert to_offset <= len(doc.text)
    last_end = from_offset
    for substring, features in zip(substrings, featureslist):
        # str.find restricted to [last_end, to_offset) only reports matches that lie
        # completely inside that range, so a match can never extend past to_offset.
        idx = doc.text.find(substring, last_end, to_offset)
        if idx < 0:  # not found
            if raise_if_unmatched:
                raise Exception(f"Unmatched string '{substring}' in {doc.text} from {last_end} to {to_offset}")
            # unmatched substrings are skipped; the search position stays where it was
            continue
        end = idx + len(substring)
        if annotate_gaps and idx > last_end:
            outset.add(last_end, idx, gap_type)
        outset.add(idx, end, ann_type, features=features)
        # continue searching after this match so that later substrings (including
        # repeated ones) are matched in document order and gaps do not overlap
        last_end = end
    return doc