Module gatenlp.lib_spacy
Support for using spacy: convert from spacy to gatenlp documents and annotations.
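A minimal usage sketch of the typical round trip (assumes the en_core_web_sm model is installed; any other spacy model works the same way):

import spacy
from gatenlp import Document
from gatenlp.lib_spacy import spacy2gatenlp

nlp = spacy.load("en_core_web_sm")
gdoc = Document("Barack Obama visited Berlin.")
spacydoc = nlp(gdoc.text)
gdoc = spacy2gatenlp(spacydoc, gatenlpdoc=gdoc)  # annotations go to the default set
for ann in gdoc.annset():
    print(ann.type, ann.start, ann.end)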
Source code
"""
Support for using spacy: convert from spacy to gatenlp documents and annotations.
"""
from gatenlp import Document, AnnotationSet
from gatenlp.processing.annotator import Annotator
import spacy
if int(spacy.__version__.split(".")[0]) < 3:
    SPACY_IS_PARSED = lambda doc: doc.is_parsed  # noqa: E731
    SPACY_IS_TAGGED = lambda doc: doc.is_tagged  # noqa: E731
    SPACY_IS_SENTENCED = lambda doc: doc.is_sentenced  # noqa: E731
    SPACY_IS_NERED = lambda doc: doc.is_nered  # noqa: E731
else:
    SPACY_IS_PARSED = lambda doc: doc.has_annotation("DEP")  # noqa: E731
    SPACY_IS_TAGGED = lambda doc: doc.has_annotation("TAG")  # noqa: E731
    SPACY_IS_SENTENCED = lambda doc: doc.has_annotation("SENT_START")  # noqa: E731
    SPACY_IS_NERED = lambda doc: doc.has_annotation("ENT_IOB")  # noqa: E731
class AnnSpacy(Annotator):
    """
    An annotator that runs a Spacy pipeline to annotate a gatenlp document.
    """

    def __init__(
        self,
        pipeline=None,
        outsetname="",
        token_type="Token",
        space_token_type="SpaceToken",
        sentence_type="Sentence",
        nounchunk_type="NounChunk",
        add_tokens=True,
        # add_spacetokens=True, # not sure how to do this yet
        add_entities=True,
        add_sentences=True,
        add_nounchunks=True,
        add_deps=True,
        ent_prefix=None,
        **kwargs
    ):
        """
        Create an annotator for running a spacy pipeline on documents.

        Args:
            pipeline: a pre-configured spacy pipeline to use
            outsetname: the annotation set name where to put the annotations
            token_type: the annotation type for the token annotations
            space_token_type: the annotation type for any space token annotations
            sentence_type: the annotation type for the sentence annotations
            nounchunk_type: the annotation type for noun chunks
            add_tokens: if token annotations should be added
            add_entities: if True, add entity annotations
            add_sentences: if sentence annotations should be added
            add_nounchunks: if noun chunks should be added
            add_deps: if dependencies should be added
            ent_prefix: the prefix to add to all entity annotation types
            kwargs: if no pipeline is specified, pass these arguments to the spacy.load
                method, use name= to specify the model name
        """
        self.outsetname = outsetname
        self.token_type = token_type
        self.sentence_type = sentence_type
        self.add_entities = add_entities
        self.ent_prefix = ent_prefix
        self.space_token_type = space_token_type
        self.nounchunk_type = nounchunk_type
        self.add_tokens = add_tokens
        self.add_sentences = add_sentences
        self.add_nounchunks = add_nounchunks
        self.add_deps = add_deps
        if pipeline:
            self.pipeline = pipeline
        else:
            self.pipeline = spacy.load("en_core_web_sm")

    def __call__(self, doc, **kwargs):
        spacy_doc = self.pipeline(doc.text)
        spacy2gatenlp(
            spacy_doc,
            doc,
            setname=self.outsetname,
            token_type=self.token_type,
            space_token_type=self.space_token_type,
            sentence_type=self.sentence_type,
            nounchunk_type=self.nounchunk_type,
            add_tokens=self.add_tokens,
            add_ents=self.add_entities,
            add_nounchunks=self.add_nounchunks,
            add_sents=self.add_sentences,
            add_dep=self.add_deps,
            ent_prefix=self.ent_prefix,
        )
        return doc
def apply_spacy(nlp, gatenlpdoc, setname="", containing_anns=None,
                component_cfg=None, retrieve_spans=None):
    """Run the spacy nlp pipeline on the gatenlp document and transfer the annotations.
    This modifies the gatenlp document in place.

    Args:
        nlp: spacy pipeline
        gatenlpdoc: gatenlp document
        setname: annotation set to receive the annotations (Default value = "")
        containing_anns: annotation set or iterable of annotations. If not None, only the text
            covered by each of the annotations is analyzed. The annotations should not overlap.
        component_cfg: the component config to use for spacy
        retrieve_spans: if not None, a list of additional span types to retrieve from the
            spacy document

    Returns:
        The modified document.
    """
    if containing_anns:
        component_config = None
        if isinstance(containing_anns, AnnotationSet):
            annsiter = containing_anns.fast_iter()
        else:
            annsiter = containing_anns
        for ann in annsiter:
            if component_cfg:
                component_config = {component_cfg: ann.features.to_dict()}
            covered = gatenlpdoc[ann.start:ann.end]
            spacydoc = nlp(covered, component_cfg=component_config)
            spacy2gatenlp(spacydoc, gatenlpdoc=gatenlpdoc, setname=setname,
                          start_offset=ann.start, retrieve_spans=retrieve_spans)
            # copy any custom extension attributes of the spacy doc onto the containing annotation
            elems = dir(spacydoc._)
            for elem in elems:
                if elem not in ['get', 'set', 'has']:
                    ann.features[elem] = spacydoc._.get(elem)
        return gatenlpdoc
    else:
        spacydoc = nlp(gatenlpdoc.text)
        return spacy2gatenlp(spacydoc, gatenlpdoc=gatenlpdoc, setname=setname)
def spacy2gatenlp(
    spacydoc,
    gatenlpdoc=None,
    setname="",
    token_type="Token",
    space_token_type="SpaceToken",
    sentence_type="Sentence",
    nounchunk_type="NounChunk",
    add_tokens=True,
    # add_spacetokens=True, # not sure how to do this yet
    add_ents=True,
    add_sents=True,
    add_nounchunks=True,
    add_dep=True,
    ent_prefix=None,
    start_offset=0,
    retrieve_spans=None
):
    """Convert a spacy document to a gatenlp document. If a gatenlp document is already
    provided, add the annotations from the spacy document to it. In this case the
    original gatenlpdoc is used and gets modified.

    Args:
        spacydoc: a spacy document
        gatenlpdoc: if None, a new gatenlp document is created, otherwise this
            document is added to. (Default value = None)
        setname: the annotation set name to which the annotations get added, empty string
            for the default annotation set.
        token_type: the annotation type to use for tokens (Default value = "Token")
        space_token_type: the annotation type to use for space tokens (Default value = "SpaceToken")
        sentence_type: the annotation type to use for sentence annotations (Default value = "Sentence")
        nounchunk_type: the annotation type to use for noun chunk annotations (Default value = "NounChunk")
        add_tokens: should annotations for tokens get added? If not, dependency parser
            info cannot be added either. (Default value = True)
        add_ents: should annotations for entities get added (Default value = True)
        add_sents: should sentence annotations get added (Default value = True)
        add_nounchunks: should noun chunk annotations get added (Default value = True)
        add_dep: should dependency parser information get added (Default value = True)
        ent_prefix: an optional prefix to add to all entity annotation type names
            (Default value = None)
        start_offset: if a gatenlp document is provided, the offset in that document where
            the analyzed text starts. This allows analyzing part of a document with spacy
            and adding the resulting annotations back to the document at the corresponding
            positions. (Default value = 0)
        retrieve_spans: if not None, a list of additional spacy span types to retrieve

    Returns:
        the new or modified Document
    """
    # add_spacetokens: (Default value = True)
    # not sure how to do this yet
    if retrieve_spans is None:
        retrieve_spans = []
    if gatenlpdoc is None:
        retdoc = Document(spacydoc.text)
        start_offset = 0
    else:
        retdoc = gatenlpdoc
    toki2annid = {}
    annset = retdoc.annset(setname)
    for tok in spacydoc:
        from_off = tok.idx
        to_off = tok.idx + len(tok)
        # is_space = tok.is_space
        fm = {
            "_i": tok.i,
            "is_alpha": tok.is_alpha,
            "is_bracket": tok.is_bracket,
            "is_currency": tok.is_currency,
            "is_digit": tok.is_digit,
            "is_left_punct": tok.is_left_punct,
            "is_lower": tok.is_lower,
            "is_oov": tok.is_oov,
            "is_punct": tok.is_punct,
            "is_quote": tok.is_quote,
            "is_right_punct": tok.is_right_punct,
            "is_sent_start": tok.is_sent_start,
            "is_space": tok.is_space,
            "is_stop": tok.is_stop,
            "is_title": tok.is_title,
            "is_upper": tok.is_upper,
            "lang": tok.lang_,
            "lemma": tok.lemma_,
            "like_email": tok.like_email,
            "like_num": tok.like_num,
            "like_url": tok.like_url,
            "orth": tok.orth,
            "pos": tok.pos_,
            "prefix": tok.prefix_,
            "prob": tok.prob,
            "rank": tok.rank,
            "sentiment": tok.sentiment,
            "tag": tok.tag_,
            "shape": tok.shape_,
            "suffix": tok.suffix_,
        }
        if SPACY_IS_NERED(spacydoc) and add_ents:
            fm["ent_type"] = tok.ent_type_
        if SPACY_IS_PARSED(spacydoc) and add_dep:
            fm["dep"] = tok.dep_
        if tok.is_space:
            anntype = space_token_type
        else:
            anntype = token_type
        annid = annset.add(from_off + start_offset, to_off + start_offset, anntype, fm).id
        toki2annid[tok.i] = annid
        # print("Added annotation with id: {} for token {}".format(annid, tok.i))
        ws = tok.whitespace_
        if len(ws) > 0:
            annset.add(to_off + start_offset, to_off + len(ws) + start_offset,
                       space_token_type, {"is_space": True})
    # if we have a dependency parse, now also add the parse edges
    if SPACY_IS_PARSED(spacydoc) and add_tokens and add_dep:
        for tok in spacydoc:
            ann = annset.get(toki2annid[tok.i])
            ann.features["head"] = toki2annid[tok.head.i]
            ann.features["left_edge"] = toki2annid[tok.left_edge.i]
            ann.features["right_edge"] = toki2annid[tok.right_edge.i]
    if spacydoc.ents and add_ents:
        for ent in spacydoc.ents:
            if ent_prefix:
                entname = ent_prefix + ent.label_
            else:
                entname = ent.label_
            annset.add(ent.start_char + start_offset, ent.end_char + start_offset,
                       entname, {"lemma": ent.lemma_})
    if spacydoc.sents and add_sents:
        for sent in spacydoc.sents:
            annset.add(sent.start_char + start_offset, sent.end_char + start_offset,
                       sentence_type, {})
    if spacydoc.noun_chunks and add_nounchunks:
        for chunk in spacydoc.noun_chunks:
            annset.add(chunk.start_char + start_offset, chunk.end_char + start_offset,
                       nounchunk_type, {})
    for span_type in retrieve_spans:
        for span in spacydoc.spans[span_type]:
            annset.add(span.start_char + start_offset, span.end_char + start_offset,
                       span_type, {})
    return retdoc
Functions
def SPACY_IS_NERED(doc)
- True if named entities have been set on the spacy document (version-dependent check).
Source code
SPACY_IS_NERED = lambda doc: doc.has_annotation("ENT_IOB") # noqa: E731
def SPACY_IS_PARSED(doc)
- True if the spacy document has a dependency parse (version-dependent check).
Source code
SPACY_IS_PARSED = lambda doc: doc.has_annotation("DEP") # noqa: E731
def SPACY_IS_SENTENCED(doc)
- True if sentence boundaries have been set on the spacy document (version-dependent check).
Source code
SPACY_IS_SENTENCED = lambda doc: doc.has_annotation("SENT_START") # noqa: E731
def SPACY_IS_TAGGED(doc)
- True if the spacy document has POS tags (version-dependent check).
Source code
SPACY_IS_TAGGED = lambda doc: doc.has_annotation("TAG") # noqa: E731
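These helpers are thin version shims: under spacy 2.x they read the doc.is_* flags, under 3.x they call doc.has_annotation(...). A small sketch of how they might be used (the model name is an assumption):

import spacy
from gatenlp.lib_spacy import SPACY_IS_PARSED, SPACY_IS_SENTENCED

nlp = spacy.load("en_core_web_sm")
sdoc = nlp("A short test sentence.")
if SPACY_IS_PARSED(sdoc):
    print("dependency parse present")
if SPACY_IS_SENTENCED(sdoc):
    print(len(list(sdoc.sents)), "sentence(s)")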
def apply_spacy(nlp, gatenlpdoc, setname='', containing_anns=None, component_cfg=None, retrieve_spans=None)
-
Run the spacy nlp pipeline on the gatenlp document and transfer the annotations. This modifies the gatenlp document in place.
Args
nlp
- spacy pipeline
gatenlpdoc
- gatenlp document
setname
- annotation set to receive the annotations (Default value = "")
containing_anns
- annotation set or iterable of annotations. If not None, only the text covered by each of the annotations is analyzed. The annotations should not overlap.
component_cfg
- the component config to use for Spacy
retrieve_spans
- if not None, a list of additional span types to retrieve from the SpaCy document
Returns
The modified document.
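A sketch of restricting the analysis to pre-existing annotations; the "Section" annotations, the set name, and the offsets below are made up for illustration:

import spacy
from gatenlp import Document
from gatenlp.lib_spacy import apply_spacy

nlp = spacy.load("en_core_web_sm")
doc = Document("Intro text. The body mentions Berlin.")
sections = doc.annset("sections")
sections.add(0, 11, "Section", {})
sections.add(12, 37, "Section", {})
# Each Section span is run through spacy separately; the resulting
# annotations are added to the "spacy" set at the original offsets.
apply_spacy(nlp, doc, setname="spacy", containing_anns=sections)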
def spacy2gatenlp(spacydoc, gatenlpdoc=None, setname='', token_type='Token', space_token_type='SpaceToken', sentence_type='Sentence', nounchunk_type='NounChunk', add_tokens=True, add_ents=True, add_sents=True, add_nounchunks=True, add_dep=True, ent_prefix=None, start_offset=0, retrieve_spans=None)
-
Convert a spacy document to a gatenlp document. If a gatenlp document is already provided, add the annotations from the spacy document to it. In this case the original gatenlpdoc is used and gets modified.
Args
spacydoc
- a spacy document
gatenlpdoc
- if None, a new gatenlp document is created otherwise this document is added to. (Default value = None)
setname
- the annotation set name to which the annotations get added, empty string for the default annotation set.
token_type
- the annotation type to use for tokens (Default value = "Token")
space_token_type
- the annotation type to use for space tokens (Default value = "SpaceToken")
sentence_type
- the annotation type to use for sentence annotations (Default value = "Sentence")
nounchunk_type
- the annotation type to use for noun chunk annotations (Default value = "NounChunk")
add_tokens
- should annotations for tokens get added? If not, dependency parser info cannot be added either. (Default value = True)
add_ents
- should annotations for entities get added (Default value = True)
add_sents
- should sentence annotations get added (Default value = True)
add_nounchunks
- should noun chunk annotations get added (Default value = True)
add_dep
- should dependency parser information get added (Default value = True)
ent_prefix
- an optional prefix to add to all entity annotation type names (Default value = None)
start_offset
- if a gatenlp document is provided, the offset in that document where the analyzed text starts. This allows analyzing part of a document with spacy and adding the resulting annotations back to the document at the corresponding positions. (Default value = 0)
retrieve_spans
- if not None, a list of additional Spacy span types to retrieve
Returns
the new or modified Document
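A sketch of the start_offset use case: analyze only part of a gatenlp document and merge the annotations back at the right positions (the example text and its offsets are made up for illustration):

import spacy
from gatenlp import Document
from gatenlp.lib_spacy import spacy2gatenlp

nlp = spacy.load("en_core_web_sm")
gdoc = Document("Skip this part. Angela Merkel met Emmanuel Macron.")
part = gdoc.text[16:]            # analyze only the second sentence
spacydoc = nlp(part)
spacy2gatenlp(spacydoc, gatenlpdoc=gdoc, setname="spacy",
              ent_prefix="spacy_", add_nounchunks=False, start_offset=16)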
Classes
class AnnSpacy (pipeline=None, outsetname='', token_type='Token', space_token_type='SpaceToken', sentence_type='Sentence', nounchunk_type='NounChunk', add_tokens=True, add_entities=True, add_sentences=True, add_nounchunks=True, add_deps=True, ent_prefix=None, **kwargs)
-
An annotator that runs a Spacy pipeline to annotate a gatenlp document.
Create an annotator for running a spacy pipeline on documents.
Args
pipeline
- a pre-configured spacy pipeline to use
outsetname
- the annotation set name where to put the annotations
token_type
- the annotation type for the token annotations
space_token_type
- type of any space token annotations
sentence_type
- the annotation type for the sentence annotations
nounchunk_type
- annotation type for noun chunks
add_tokens
- if token annotations should be added
add_entities
- if true, add entity annotations
add_sentences
- if sentence annotations should be added
add_nounchunks
- if noun chunks should be added
add_deps
- if dependencies should be added
ent_prefix
- the prefix to add to all entity annotation types
kwargs
- if no pipeline is specified, pass these arguments to the spacy.load method, use name= to specify the model name
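Since AnnSpacy is an Annotator, an instance is simply called on a document. A minimal usage sketch (the model name and set name are assumptions):

import spacy
from gatenlp import Document
from gatenlp.lib_spacy import AnnSpacy

annotator = AnnSpacy(pipeline=spacy.load("en_core_web_sm"),
                     outsetname="spacy", ent_prefix="spacy_")
doc = annotator(Document("Apple is looking at buying a U.K. startup."))
print(sorted({ann.type for ann in doc.annset("spacy")}))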
Ancestors
- Annotator
- abc.ABC
Inherited members