Module gatenlp.corpora.export

Module that defines DocumentDestination classes for exporting specific formats.

Source code
"""
Module that defines DocumentDestination classes for exporting specific formats.
"""
import os
from typing import Union, Optional, List, Dict, IO
from gatenlp.corpora import DocumentDestination
from gatenlp import Document
from gatenlp.chunking import doc_to_ibo


class Conll2003FileDestination(DocumentDestination):
    """
    Extracts tokens and BIO-like codes from the documents and writes them in CoNLL (2003) format.
    """

    def __init__(
            self,
            file: Union[str, IO],
            annset_name: str = "",
            sentence_type: Optional[str] = None,
            token_type: str = "Token",
            token_feature: Optional[str] = None,
            chunk_annset_name: Optional[str] = None,
            chunk_types: Optional[List[str]] = None,
            type2code: Optional[Dict] = None,
            scheme: str = "BIO",
    ):
        """
        Create a Conll2003FileDestination to write CONLL 2003 format data to a file.

        Args:
            file: either the file path (str) or an open file handle for writing
            annset_name: the annotation set which contains the annotation types needed
            sentence_type: if None, treat the whole document as a single sentence; otherwise generate one
                result per sentence-type annotation that contains at least one token.
            token_type: type of token annotations to use
            token_feature: if not None, use that feature of the token annotation instead of the covered
                document text
            chunk_annset_name: if specified, the annotation set name to use for retrieving the chunk
                annotations; otherwise annset_name is used for the chunk annotations too.
            chunk_types: a list of annotation types which identify chunks; each chunk type is used as the
                entity type. Note that the chunk type annotations must not overlap, but this is currently
                not checked, for performance reasons.
            type2code: an optional dict mapping each chunk type to the type name used in the BIO codes
            scheme: the encoding scheme to use; the default is BIO. Possible values: IOB, BIO, IOBES,
                BILOU, BMEOW, BMEWO
        """
        super().__init__()
        if isinstance(file, str):
            self.fh = open(file, "wt", encoding="utf-8")
        else:
            self.fh = file
        self.annset_name = annset_name
        self.sentence_type = sentence_type
        self.token_type = token_type
        self.token_feature = token_feature
        self.chunk_annset_name = chunk_annset_name
        self.chunk_types = chunk_types
        self.type2code = type2code
        self.scheme = scheme

    def __enter__(self):
        return self

    def __exit__(self, _extype, _value, _traceback):
        self.fh.close()

    def append(self, doc):
        """
        Append a document to the destination.

        Args:
            doc: the document to append; if None, no action is performed.
        """
        if doc is None:
            return
        assert isinstance(doc, Document)
        for sentence_rows in doc_to_ibo(
                doc,
                annset_name=self.annset_name,
                sentence_type=self.sentence_type,
                token_type=self.token_type,
                token_feature=self.token_feature,
                chunk_annset_name=self.chunk_annset_name,
                chunk_types=self.chunk_types,
                type2code=self.type2code,
                scheme=self.scheme
        ):
            for token, code in sentence_rows:
                print(token, code, sep="\t", file=self.fh)
            print(file=self.fh)  # empty line for sentence boundary
        self._n += 1

    def close(self):
        self.fh.close()
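
A minimal usage sketch (the hand-built document, its annotation types, and the output path "out.conll" are assumptions for illustration; in practice the annotations would come from an annotation pipeline):

from gatenlp import Document
from gatenlp.corpora.export import Conll2003FileDestination

# Build a tiny document by hand, purely for illustration.
doc = Document("Barack Obama spoke.")
annset = doc.annset()                 # default annotation set
annset.add(0, 6, "Token")             # "Barack"
annset.add(7, 12, "Token")            # "Obama"
annset.add(13, 18, "Token")           # "spoke"
annset.add(0, 19, "Sentence")
annset.add(0, 12, "Person")           # chunk annotation spanning two tokens

# The destination can be used as a context manager, which closes the
# file when the block is left.
with Conll2003FileDestination(
        "out.conll",
        sentence_type="Sentence",
        chunk_types=["Person"],
) as dest:
    dest.append(doc)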

Classes

class Conll2003FileDestination (file: Union[str, IO], annset_name: str = '', sentence_type: Optional[str] = None, token_type: str = 'Token', token_feature: Optional[str] = None, chunk_annset_name: Optional[str] = None, chunk_types: Optional[List[str]] = None, type2code: Optional[Dict] = None, scheme: str = 'BIO')

Extracts tokens and BIO-like codes from the documents and writes them in CoNLL (2003) format.

Create a Conll2003FileDestination to write CONLL 2003 format data to a file.

Args

file
either the file path (str) or an open file handle for writing
annset_name
the annotation set which contains the annotation types needed
sentence_type
if None, treat the whole document as a single sentence; otherwise generate one result per sentence-type annotation that contains at least one token.
token_type
type of token annotations to use
token_feature
if not None, use that feature of the token annotation instead of the covered document text
chunk_annset_name
if specified, the annotation set name to use for retrieving the chunk annotations; otherwise annset_name is used for the chunk annotations too.
chunk_types
a list of annotation types which identify chunks; each chunk type is used as the entity type. Note that the chunk type annotations must not overlap, but this is currently not checked, for performance reasons.
type2code
an optional dict mapping each chunk type to the type name used in the BIO codes
scheme
the encoding scheme to use; the default is BIO. Possible values: IOB, BIO, IOBES, BILOU, BMEOW, BMEWO
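
For each sentence, one line per token is written, the token text (or its token_feature value) and the code separated by a tab, followed by an empty line as the sentence boundary. As a purely hypothetical illustration, with chunk_types=["Person"] and the default BIO scheme, one sentence could come out as:

Barack	B-Person
Obama	I-Person
spoke	O

(followed by an empty line before the next sentence)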

Ancestors

DocumentDestination

Methods

def append(self, doc)

Append a document to the destination.

Args

doc
the document to append; if None, no action is performed.
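
Because file may also be an already open handle, the destination can write to any text stream. A small sketch using io.StringIO (the document and its annotations are assumed for illustration); note that close() closes the handle even when the caller passed it in, so an in-memory buffer has to be read first:

import io
from gatenlp import Document
from gatenlp.corpora.export import Conll2003FileDestination

doc = Document("Obama spoke.")
annset = doc.annset()
annset.add(0, 5, "Token")            # "Obama"
annset.add(6, 11, "Token")           # "spoke"
annset.add(0, 5, "Person")

buf = io.StringIO()
dest = Conll2003FileDestination(buf, chunk_types=["Person"])
dest.append(doc)                     # a None document would be skipped silently
text = buf.getvalue()                # read before close(), which also closes buf
dest.close()
print(text)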

Inherited members