Module gatenlp.chunking
Module for chunking-related methods and annotators.
"""
Module for chunking-related methods and annotators.
"""
import re
from typing import Union, List, Optional, Dict, Generator, Tuple
import iobes
from gatenlp import Document, Span, AnnotationSet
SPANENCS = dict(
    BIO=iobes.SpanEncoding.BIO,
    IOB=iobes.SpanEncoding.IOB,
    IOBES=iobes.SpanEncoding.IOBES,
    BILOU=iobes.SpanEncoding.BILOU,
    BMEOW=iobes.SpanEncoding.BMEOW,
    BMEWO=iobes.SpanEncoding.BMEWO,
)
PAT_WS = re.compile(r"\s+")
def normalize_type(typename):
"""
Normalize the type name so it can be used as part of an BIO-like code that is usable in a conll-like dataset:
replace all whitespace with a hyphen.
Args:
typename: the chunk type name
Returns:
normalized chunk type name
"""
return re.sub(PAT_WS, "-", typename)
def doc_to_ibo(
    doc: Document,
    annset_name: str = "",
    sentence_type: Optional[str] = None,
    token_type: str = "Token",
    token_feature: Optional[str] = None,
    chunk_annset_name: Optional[str] = None,
    chunk_types: Optional[List[str]] = None,
    type2code: Optional[Dict] = None,
    scheme: str = "BIO",
    return_rows: bool = True,
) -> Generator[Union[List, Tuple], None, None]:
"""
Extract tokens and corresponding token entity codes.
Args:
doc: The document to process
annset_name: name of the annotation set which contains all the types needed
sentence_type: if None, use whole document, otherwise generate one result per sentence type annotation,
if the sentence contains at least one token.
token_type: type of token annotations to use
token_feature: if not None, use the feature instead of the covered document text
chunk_annset_name: is specified, the annotation set name to use for retrieving the chunk annotations,
otherwise annset_name is used for the chunk annotations too.
chunk_types: a list of annotation types which identify chunks, each chunk type is used as entity type
Note the chunk type annotations must not overlap, but this is currently not checked, for performance
reasons.
type2code: an optionam dict mapping the chunk_type to the type name used in the BIO codes
scheme: the encoding scheme to use, default is BIO, possible: IOB, BIO, IOBES, BILOU, BMEOW, BMEWO
return_rows: if True, return a list of (tokenstring, code) tuples for each sentence, if False return two lists
of equal length, the first with the token strings and the second with the codes
Yields:
either one list of (tokenstring, code) tuples per sentence found or two lists, one with the tokenstrings and
the other with the codes.
"""
    spanenc = SPANENCS[scheme]
    if type2code is None:
        type2code = {}
    if sentence_type is None:
        spans = [Span(0, len(doc))]
    else:
        spans = [a.span for a in doc.annset(annset_name).with_type(sentence_type)]
    all_tokens = doc.annset(annset_name).with_type(token_type)
    if chunk_types is None:
        all_chunks = AnnotationSet()
    else:
        all_chunks = doc.annset(annset_name if chunk_annset_name is None else chunk_annset_name).with_type(chunk_types)
    for span in spans:
        tokens = all_tokens.within(span)
        if len(tokens) == 0:
            continue
        # map token start offsets to token indices
        start2idx = {t.start: idx for idx, t in enumerate(tokens)}
        chunks = all_chunks.within(span)
        # now we want to know which of all the tokens are covered by chunks. So for each chunk, we check
        # which tokens are contained and append an iobes Span that points to the index of the token
        iobes_spans = []
        for chunk in chunks:
            ctokens = list(tokens.within(chunk))
            start = start2idx[ctokens[0].start]
            end = start2idx[ctokens[-1].start] + 1
            iobes_span = iobes.Span(
                type=type2code.get(chunk.type, normalize_type(chunk.type)),
                start=start,
                end=end,
                tokens=tuple(range(start, end)),
            )
            iobes_spans.append(iobes_span)
        codes = iobes.write_tags(iobes_spans, spanenc, length=len(tokens))
        assert len(tokens) == len(codes)
        if token_feature:
            token_strings = [t.features[token_feature] for t in tokens]
        else:
            token_strings = [doc[t] for t in tokens]
        if return_rows:
            yield [(t, c) for t, c in zip(token_strings, codes)]
        else:
            yield token_strings, codes
Functions
def doc_to_ibo(doc: Document, annset_name: str = '', sentence_type: Optional[str] = None, token_type: str = 'Token', token_feature: Optional[str] = None, chunk_annset_name: Optional[str] = None, chunk_types: Optional[List[str]] = None, type2code: Optional[Dict] = None, scheme: str = 'BIO', return_rows: bool = True) -> Generator[Union[List, Tuple], None, None]
Extract tokens and the corresponding token entity codes.
Args
doc
- The document to process
annset_name
- name of the annotation set which contains all the types needed
sentence_type
- if None, use the whole document; otherwise generate one result per sentence-type annotation, provided the sentence contains at least one token.
token_type
- type of token annotations to use
token_feature
- if not None, use the given feature of the token instead of the covered document text
chunk_annset_name
- if specified, the annotation set name to use for retrieving the chunk annotations, otherwise annset_name is used for the chunk annotations too.
chunk_types
- a list of annotation types which identify chunks; each chunk type is used as the entity type. Note that the chunk type annotations must not overlap, but this is currently not checked, for performance reasons.
type2code
- an optional dict mapping a chunk type to the type name used in the BIO codes
scheme
- the encoding scheme to use, default is BIO, possible: IOB, BIO, IOBES, BILOU, BMEOW, BMEWO
return_rows
- if True, yield a list of (tokenstring, code) tuples for each sentence; if False, yield two lists of equal length, the first with the token strings and the second with the codes
Yields
either one list of (tokenstring, code) tuples per sentence found, or two lists, one with the token strings and the other with the codes.
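As a usage illustration (not part of the module source), the following minimal sketch assumes a small document whose Token annotations and two chunk annotation types, Person and Location, have already been added to the default annotation set; with the default BIO scheme and return_rows=True the generator should yield one list of (token, code) tuples for the whole document:

from gatenlp import Document
from gatenlp.chunking import doc_to_ibo

# hypothetical example document with pre-made Token and chunk annotations
doc = Document("Barack Obama visited Paris.")
annset = doc.annset()  # default annotation set ""
for start, end in [(0, 6), (7, 12), (13, 20), (21, 26), (26, 27)]:
    annset.add(start, end, "Token")
annset.add(0, 12, "Person")     # "Barack Obama"
annset.add(21, 26, "Location")  # "Paris"

for rows in doc_to_ibo(doc, chunk_types=["Person", "Location"]):
    for token, code in rows:
        print(token, code)
# expected output (one row per token):
# Barack B-Person
# Obama I-Person
# visited O
# Paris B-Location
# . O

If sentence_type were set, one such list would be yielded per sentence annotation that contains at least one token instead of a single list for the whole document.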
def normalize_type(typename)
Normalize the type name so it can be used as part of a BIO-like code that is usable in a CoNLL-like dataset: replace all whitespace with a hyphen.
Args
typename
- the chunk type name
Returns
normalized chunk type name
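For illustration, a short sketch of the expected behaviour (the example type names are made up):

from gatenlp.chunking import normalize_type

print(normalize_type("Person Name"))        # -> "Person-Name"
print(normalize_type("Job\tTitle  Extra"))  # -> "Job-Title-Extra"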