Module gatenlp.lib_spacy

Support for using spacy: convert from spacy to gatenlp documents and annotations.

"""
Support for using spacy: convert from spacy to gatenlp documents and annotations.
"""

from gatenlp import Document, AnnotationSet
from gatenlp.processing.annotator import Annotator
import spacy

if int(spacy.__version__.split(".")[0]) < 3:
    SPACY_IS_PARSED = lambda doc: doc.is_parsed   # noqa: E731
    SPACY_IS_TAGGED = lambda doc: doc.is_tagged   # noqa: E731
    SPACY_IS_SENTENCED = lambda doc: doc.is_sentenced   # noqa: E731
    SPACY_IS_NERED = lambda doc: doc.is_nered   # noqa: E731
else:
    SPACY_IS_PARSED = lambda doc: doc.has_annotation("DEP")  # noqa: E731
    SPACY_IS_TAGGED = lambda doc: doc.has_annotation("TAG")  # noqa: E731
    SPACY_IS_SENTENCED = lambda doc: doc.has_annotation("SENT_START")  # noqa: E731
    SPACY_IS_NERED = lambda doc: doc.has_annotation("ENT_IOB")  # noqa: E731


class AnnSpacy(Annotator):
    """
    An annotator that runs a Spacy pipeline to annotate a gatenlp document.
    """

    def __init__(
        self,
        pipeline=None,
        outsetname="",
        token_type="Token",
        space_token_type="SpaceToken",
        sentence_type="Sentence",
        nounchunk_type="NounChunk",
        add_tokens=True,
        # add_spacetokens=True, # not sure how to do this yet
        add_entities=True,
        add_sentences=True,
        add_nounchunks=True,
        add_deps=True,
        ent_prefix=None,
        **kwargs
    ):
        """
        Create an annotator for running a spacy pipeline on documents.

        Args:
            pipeline: a pre-configured spacy pipeline to use
            outsetname: the annotation set name where to put the annotations
            token_type: the annotation type for the token annotations
            space_token_type: type of any space token annotations
            sentence_type: the annotation type for the sentence annotations
            nounchunk_type: annotation type for noun chunks
            add_tokens: if token annotations should be added
            add_entities: if true, add entity annotations
            add_sentences: if sentence annotations should be added
            add_nounchunks: if nounchunks should be added
            add_deps: if dependencies should be added
            ent_prefix: the prefix to add to all entity annotation types
            kwargs: if no pipeline is specified, these arguments are passed on to
                spacy.load; use name= to specify the model name (default: en_core_web_sm)
        """
        self.outsetname = outsetname

        self.token_type = token_type
        self.sentence_type = sentence_type
        self.add_entities = add_entities
        self.ent_prefix = ent_prefix
        self.space_token_type = space_token_type
        self.nounchunk_type = nounchunk_type
        self.add_tokens = add_tokens
        self.add_sentences = add_sentences
        self.add_nounchunks = add_nounchunks
        self.add_deps = add_deps
        if pipeline:
            self.pipeline = pipeline
        else:
            # pass any remaining keyword arguments on to spacy.load, as documented;
            # name= selects the model, defaulting to en_core_web_sm
            name = kwargs.pop("name", "en_core_web_sm")
            self.pipeline = spacy.load(name, **kwargs)

    def __call__(self, doc, **kwargs):
        spacy_doc = self.pipeline(doc.text)
        spacy2gatenlp(
            spacy_doc,
            doc,
            setname=self.outsetname,
            token_type=self.token_type,
            space_token_type=self.space_token_type,
            sentence_type=self.sentence_type,
            nounchunk_type=self.nounchunk_type,
            add_tokens=self.add_tokens,
            add_ents=self.add_entities,
            add_nounchunks=self.add_nounchunks,
            add_sents=self.add_sentences,
            add_dep=self.add_deps,
            ent_prefix=self.ent_prefix,
        )
        return doc


def apply_spacy(nlp, gatenlpdoc, setname="", containing_anns=None,
                component_cfg=None, retrieve_spans=None):
    """Run the spacy nlp pipeline on the gatenlp document and transfer the annotations.
    This modifies the gatenlp document in place.

    Args:
        nlp: spacy pipeline
        gatenlpdoc: gatenlp document
        setname: annotation set to receive the annotations (Default value = "")
        containing_anns: annotation set or iterable of annotations. If not None, only the text covered by each
            of the annotations is analyzed. The annotations should not overlap.
        component_cfg: if given, the name of the spacy pipeline component which receives each containing
            annotation's features as its config (only used together with containing_anns)
        retrieve_spans: if not None, a list of additional span types to retrieve from the spacy document

    Returns:
        The modified document.
    """
    if containing_anns:
        component_config = None
        if isinstance(containing_anns, AnnotationSet):
            annsiter = containing_anns.fast_iter()
        else:
            annsiter = containing_anns
        for ann in annsiter:
            if component_cfg:
                component_config = {component_cfg: ann.features.to_dict()}

            # run spacy on just the text covered by the annotation and map the
            # resulting annotations back using the annotation start as the offset
            covered = gatenlpdoc[ann.start:ann.end]
            spacydoc = nlp(covered, component_cfg=component_config)
            spacy2gatenlp(spacydoc, gatenlpdoc=gatenlpdoc, setname=setname,
                          start_offset=ann.start, retrieve_spans=retrieve_spans)
            # copy any spacy document extension attributes (spacydoc._.*) over to
            # the features of the containing annotation
            elems = dir(spacydoc._)
            for elem in elems:
                if elem not in ['get', 'set', 'has']:
                    ann.features[elem] = spacydoc._.get(elem)
        return gatenlpdoc
    else:
        spacydoc = nlp(gatenlpdoc.text)
        return spacy2gatenlp(spacydoc, gatenlpdoc=gatenlpdoc, setname=setname,
                             retrieve_spans=retrieve_spans)


def spacy2gatenlp(
    spacydoc,
    gatenlpdoc=None,
    setname="",
    token_type="Token",
    space_token_type="SpaceToken",
    sentence_type="Sentence",
    nounchunk_type="NounChunk",
    add_tokens=True,
    # add_spacetokens=True, # not sure how to do this yet
    add_ents=True,
    add_sents=True,
    add_nounchunks=True,
    add_dep=True,
    ent_prefix=None,
    start_offset=0,
    retrieve_spans=None
):
    """Convert a spacy document to a gatenlp document. If a gatenlp document is already
    provided, add the annotations from the spacy document to it. In this case the
    original gatenlpdoc is used and gets modified.

    Args:
        spacydoc: a spacy document
        gatenlpdoc: if None, a new gatenlp document is created otherwise this
            document is added to. (Default value = None)
        setname: the annotation set name to which the annotations get added, empty string
            for the default annotation set.
        token_type: the annotation type to use for tokens (Default value = "Token")
        space_token_type: the annotation type to use for space tokens (Default value = "SpaceToken")
        sentence_type: the annotation type to use for sentence annotations (Default value = "Sentence")
        nounchunk_type: the annotation type to use for noun chunk annotations (Default value = "NounChunk")
        add_tokens: should annotations for tokens get added? If not, dependency parser
            info cannot be added either. (Default value = True)
        add_ents: should annotations for entities get added (Default value = True)
        add_sents: should sentence annotations get added (Default value = True)
        add_nounchunks: should noun chunk annotations get added (Default value = True)
        add_dep: should dependency parser information get added (Default value = True)
        ent_prefix: a prefix to add to all entity annotation types (Default value = None)
        start_offset: if a gatenlp document is given, the offset within it where the analyzed
            text starts. This allows part of a document to be processed with spacy and the
            resulting annotations to be added back to the original document at the
            corresponding positions. (Default value = 0)
        retrieve_spans: if not None, a list of additional spacy span types to retrieve (Default value = None)

    Returns:
      the new or modified Document
    """

    # add_spacetokens:  (Default value = True)
    # not sure how to do this yet

    if retrieve_spans is None:
        retrieve_spans = []
    if gatenlpdoc is None:
        retdoc = Document(spacydoc.text)
        start_offset = 0
    else:
        retdoc = gatenlpdoc
    toki2annid = {}
    annset = retdoc.annset(setname)
    for tok in spacydoc:
        from_off = tok.idx
        to_off = tok.idx + len(tok)
        # is_space = tok.is_space
        fm = {
            "_i": tok.i,
            "is_alpha": tok.is_alpha,
            "is_bracket": tok.is_bracket,
            "is_currency": tok.is_currency,
            "is_digit": tok.is_digit,
            "is_left_punct": tok.is_left_punct,
            "is_lower": tok.is_lower,
            "is_oov": tok.is_oov,
            "is_punct": tok.is_punct,
            "is_quote": tok.is_quote,
            "is_right_punct": tok.is_right_punct,
            "is_sent_start": tok.is_sent_start,
            "is_space": tok.is_space,
            "is_stop": tok.is_stop,
            "is_title": tok.is_title,
            "is_upper": tok.is_upper,
            "lang": tok.lang_,
            "lemma": tok.lemma_,
            "like_email": tok.like_email,
            "like_num": tok.like_num,
            "like_url": tok.like_url,
            "orth": tok.orth,
            "pos": tok.pos_,
            "prefix": tok.prefix_,
            "prob": tok.prob,
            "rank": tok.rank,
            "sentiment": tok.sentiment,
            "tag": tok.tag_,
            "shape": tok.shape_,
            "suffix": tok.suffix_,
        }
        if SPACY_IS_NERED(spacydoc) and add_ents:
            fm["ent_type"] = tok.ent_type_
        if SPACY_IS_PARSED(spacydoc) and add_dep:
            fm["dep"] = tok.dep_
        if tok.is_space:
            anntype = space_token_type
        else:
            anntype = token_type
        annid = annset.add(from_off + start_offset, to_off + start_offset, anntype, fm).id
        toki2annid[tok.i] = annid
        # print("Added annotation with id: {} for token {}".format(annid, tok.i))
        ws = tok.whitespace_
        if len(ws) > 0:
            annset.add(to_off + start_offset, to_off + len(ws) + start_offset, space_token_type, {"is_space": True})
    # if we have a dependency parse, now also add the parse edges
    if SPACY_IS_PARSED(spacydoc) and add_tokens and add_dep:
        for tok in spacydoc:
            ann = annset.get(toki2annid[tok.i])
            ann.features["head"] = toki2annid[tok.head.i]
            ann.features["left_edge"] = toki2annid[tok.left_edge.i]
            ann.features["right_edge"] = toki2annid[tok.right_edge.i]
    if spacydoc.ents and add_ents:
        for ent in spacydoc.ents:
            if ent_prefix:
                entname = ent_prefix + ent.label_
            else:
                entname = ent.label_
            annset.add(ent.start_char + start_offset, ent.end_char + start_offset, entname, {"lemma": ent.lemma_})
    # Doc.sents is a generator (always truthy) and raises when the document has no
    # sentence boundaries, so guard with SPACY_IS_SENTENCED instead
    if add_sents and SPACY_IS_SENTENCED(spacydoc):
        for sent in spacydoc.sents:
            annset.add(sent.start_char + start_offset, sent.end_char + start_offset, sentence_type, {})
    # Doc.noun_chunks likewise requires a dependency parse
    if add_nounchunks and SPACY_IS_PARSED(spacydoc):
        for chunk in spacydoc.noun_chunks:
            annset.add(chunk.start_char + start_offset, chunk.end_char + start_offset, nounchunk_type, {})
    for span_type in retrieve_spans:
        for span in spacydoc.spans[span_type]:
            annset.add(span.start_char + start_offset, span.end_char + start_offset, span_type, {})
    return retdoc

Functions

def SPACY_IS_NERED(doc)

Return True if the spacy document has named entity information (doc.is_nered for spacy < 3, doc.has_annotation("ENT_IOB") for spacy >= 3).

def SPACY_IS_PARSED(doc)

Return True if the spacy document has a dependency parse (doc.is_parsed for spacy < 3, doc.has_annotation("DEP") for spacy >= 3).

def SPACY_IS_SENTENCED(doc)

Return True if the spacy document has sentence boundaries (doc.is_sentenced for spacy < 3, doc.has_annotation("SENT_START") for spacy >= 3).

def SPACY_IS_TAGGED(doc)

Return True if the spacy document has part-of-speech tags (doc.is_tagged for spacy < 3, doc.has_annotation("TAG") for spacy >= 3).
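These helpers paper over the API difference between spacy 2 and spacy 3. A minimal sketch of their use (assumes the en_core_web_sm model is installed):

import spacy
from gatenlp.lib_spacy import SPACY_IS_PARSED, SPACY_IS_SENTENCED, SPACY_IS_TAGGED

nlp = spacy.load("en_core_web_sm")
sdoc = nlp("A short example sentence.")
# the default pipeline tags, parses and sentence-splits,
# so all three checks should return True
print(SPACY_IS_TAGGED(sdoc), SPACY_IS_PARSED(sdoc), SPACY_IS_SENTENCED(sdoc))
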
def apply_spacy(nlp, gatenlpdoc, setname='', containing_anns=None, component_cfg=None, retrieve_spans=None)

Run the spacy nlp pipeline on the gatenlp document and transfer the annotations. This modifies the gatenlp document in place.

Args

nlp
spacy pipeline
gatenlpdoc
gatenlp document
setname
annotation set to receive the annotations (Default value = "")
containing_anns
annotation set or iterable of annotations. If not None, only the text covered by each of the annotations is analyzed. The annotations should not overlap.
component_cfg
if given, the name of the spacy pipeline component which receives each containing annotation's features as its config (only used together with containing_anns)
retrieve_spans
if not None, a list of additional span types to retrieve from the spacy document

Returns

The modified document.
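
A minimal usage sketch (the document text is illustrative; assumes the en_core_web_sm model is installed):

import spacy
from gatenlp import Document
from gatenlp.lib_spacy import apply_spacy

nlp = spacy.load("en_core_web_sm")
gdoc = Document("Barack Obama visited Paris in 2009.")
apply_spacy(nlp, gdoc, setname="spacy")
# token, sentence and entity annotations are now in the "spacy" set
for ann in gdoc.annset("spacy"):
    print(ann.type, ann.start, ann.end)

To analyze only parts of a document, pass an annotation set or list of annotations as containing_anns; each covered span is then processed separately and the resulting annotations are placed back at the matching offsets.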

def spacy2gatenlp(spacydoc, gatenlpdoc=None, setname='', token_type='Token', space_token_type='SpaceToken', sentence_type='Sentence', nounchunk_type='NounChunk', add_tokens=True, add_ents=True, add_sents=True, add_nounchunks=True, add_dep=True, ent_prefix=None, start_offset=0, retrieve_spans=None)

Convert a spacy document to a gatenlp document. If a gatenlp document is already provided, add the annotations from the spacy document to it. In this case the original gatenlpdoc is used and gets modified.

Args

spacydoc
a spacy document
gatenlpdoc
if None, a new gatenlp document is created otherwise this document is added to. (Default value = None)
setname
the annotation set name to which the annotations get added, empty string for the default annotation set.
token_type
the annotation type to use for tokens (Default value = "Token")
space_token_type
the annotation type to use for space tokens (Default value = "SpaceToken")
sentence_type
the annotation type to use for sentence annotations (Default value = "Sentence")
nounchunk_type
the annotation type to use for noun chunk annotations (Default value = "NounChunk")
add_tokens
should annotations for tokens get added? If not, dependency parser info cannot be added either. (Default value = True)
add_ents
should annotations for entities get added (Default value = True)
add_sents
should sentence annotations get added (Default value = True)
add_nounchunks
should noun chunk annotations get added (Default value = True)
add_dep
should dependency parser information get added (Default value = True)
ent_prefix
a prefix to add to all entity annotation types (Default value = None)
start_offset
if a gatenlp document is given, the offset within it where the analyzed text starts. This allows part of a document to be processed with spacy and the resulting annotations to be added back to the original document at the corresponding positions. (Default value = 0)
retrieve_spans
if not None, a list of additional spacy span types to retrieve (Default value = None)

Returns

the new or modified Document
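
A minimal conversion sketch (assumes the en_core_web_sm model is installed):

import spacy
from gatenlp.lib_spacy import spacy2gatenlp

nlp = spacy.load("en_core_web_sm")
sdoc = nlp("One sentence. Another sentence.")
# no gatenlpdoc is given, so a new gatenlp Document is created from the spacy text
gdoc = spacy2gatenlp(sdoc, setname="spacy", ent_prefix="spacy_")
print(len(list(gdoc.annset("spacy"))))  # number of annotations created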


Classes

class AnnSpacy (pipeline=None, outsetname='', token_type='Token', space_token_type='SpaceToken', sentence_type='Sentence', nounchunk_type='NounChunk', add_tokens=True, add_entities=True, add_sentences=True, add_nounchunks=True, add_deps=True, ent_prefix=None, **kwargs)

An annotator that runs a Spacy pipeline to annotate a gatenlp document.

Create an annotator for running a spacy pipeline on documents.

Args

pipeline
a pre-configured spacy pipeline to use
outsetname
the annotation set name where to put the annotations
token_type
the annotation type for the token annotations
space_token_type
type of any space token annotations
sentence_type
the annotation type for the sentence annotations
nounchunk_type
annotation type for noun chunks
add_tokens
if token annotations should be added
add_entities
if true, add entity annotations
add_sentences
if sentence annotations should be added
add_nounchunks
if nounchunks should be added
add_deps
if dependencies should be added
ent_prefix
the prefix to add to all entity annotation types
kwargs
if no pipeline is specified, these arguments are passed on to spacy.load; use name= to specify the model name (default: en_core_web_sm)
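
A minimal usage sketch (assumes the en_core_web_sm model is installed):

from gatenlp import Document
from gatenlp.lib_spacy import AnnSpacy

annotator = AnnSpacy(outsetname="spacy", ent_prefix="spacy_")
doc = annotator(Document("Barack Obama visited Paris."))
# entity annotation types carry the prefix, e.g. "spacy_PERSON"
print(sorted({a.type for a in doc.annset("spacy")}))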
Ancestors

gatenlp.processing.annotator.Annotator