Module gatenlp.gateworker.gateworkerannotator
Module for interacting with a Java GATE process.
Expand source code
#!/usr/bin/env python
"""
Module for interacting with a Java GATE process.
"""
from gatenlp.urlfileutils import is_url
from gatenlp.processing.annotator import Annotator
# NOTE: we delay importing py4j to the class initializer. This allows us to make GateWorker available via gatenlp
# but does not force everyone to actually have py4j installed if they do not use the GateWorker
# from py4j.java_gateway import JavaGateway, GatewayParameters
from gatenlp.utils import init_logger
# Module-level logger, created under the name "gateworker-annotator".
logger = init_logger("gateworker-annotator")
# pdoc override table: mapping "GateWorkerAnnotator.__call__" to True requests that
# __call__ be included in the generated API docs (presumably it would be skipped
# otherwise because of its leading underscores -- confirm against the pdoc version in use).
__pdoc__ = {"GateWorkerAnnotator.__call__": True}
class GateWorkerAnnotator(Annotator):
    """
    Annotator that runs a Java GATE pipeline on gatenlp documents via a GateWorker.
    """

    # TODO: parameter to influence how exceptions are handled
    def __init__(
        self,
        pipeline,
        gateworker,
        annspec_send=None,
        annspec_receive=None,
        replace_anns=False,
        update_document=False,
    ):
        """
        Create a GateWorker annotator.

        This loads the pipeline into the given gate worker and can then be used to
        annotate Python gatenlp Document instances with the Java GATE pipeline.

        Note: to make sure that start/finish callbacks on the Java side are invoked, the annotator
        start() method should be invoked once before processing documents and finish() should
        get called once after processing documents. (Any Executor implementation should do this
        automatically)

        If the GateWorkerAnnotator is not used any more, close() should be invoked to terminate
        the Java GATE Worker process.
        NOTE(review): this class defines no close() of its own; presumably close() has to be
        called on the GateWorker instance that was passed in -- confirm.

        Example:

        ```python
        pipeline = GateWorkerAnnotator("annie.xgapp", GateWorker())
        for idx, doc in enumerate(mycorpus):
            mycorpus[idx] = pipeline(doc)
        ```

        Args:
            pipeline: the path or URL of a Java GATE pipeline to load into the GATE worker
            gateworker: the GateWorker instance to use
            annspec_send: a list of either annotation set names, or tuples where the first element
                is the name of an annotation set and the second element is either the name of a type
                or a list of type names. If not None, only the sets/types specified are sent to
                Java GATE. If an empty list is specified, no annotations are sent at all.
            annspec_receive: this only works if update_document is True: same format as
                annspec_send to specify which annotation sets/types are sent back to Python
                after the document has been processed on the Java side.
            replace_anns: this is only relevant if update_document is True: if True and an
                annotation is received which already exists (same set and annotation id)
                then the existing annotation is replaced (if offsets and type are also same,
                only the features are replaced). If False, all received annotations are added
                which may change their annotation id.
            update_document: if True, then existing annotations in the gatenlp document are kept
                and the annotations received from Java GATE are added. In this case, other changes
                to the document, e.g. the document text or document features are not applied to
                the current python document. If False, the existing document is completely
                replaced with what gets received from Java GATE.
        """
        self.pipeline = pipeline
        self.annspec_send = annspec_send
        self.annspec_receive = annspec_receive
        self.replace_anns = replace_anns
        self.update_document = update_document
        self.gateworker = gateworker
        # is_url yields a flag plus the location value that is handed to the matching loader.
        isurl, ext = is_url(pipeline)
        if isurl:
            self.controller = self.gateworker.worker.loadPipelineFromUri(ext)
        else:
            self.controller = self.gateworker.worker.loadPipelineFromFile(ext)
        self.corpus = self.gateworker.worker.newCorpus()
        self.controller.setCorpus(self.corpus)
        # Callbacks are switched off on the controller; start() and finish() below
        # invoke the execution started/finished hooks explicitly instead.
        self.controller.setControllerCallbacksEnabled(False)

    def start(self):
        """
        Invoke the controller execution started method on the GATE controller.
        """
        self.controller.invokeControllerExecutionStarted()

    def finish(self):
        """
        Invoke the controller execution finished method on the GATE controller.
        """
        self.controller.invokeControllerExecutionFinished()

    def __call__(self, doc, **_kwargs):
        """
        Run the GATE controller on the given document.

        This runs the GATE pipeline (controller) on the given document by first sending the
        document to the GATE process and converting it to a GATE document there, running the
        pipeline on it, and transferring the result back to Python. The GATE-side document is
        released again in every case, also when processing raises an exception.

        Args:
            doc: the document to process
            **_kwargs: ignored so far

        Returns:
            the processed gatenlp document: the same instance as `doc`, updated with the
            received annotations, if update_document is True, otherwise the document
            converted back from the GATE document.
        """
        if self.annspec_send is not None:
            # create shallow copy, we only need it for reading!
            tmpdoc = doc.copy(annspec=self.annspec_send)
        else:
            tmpdoc = doc
        gdoc = self.gateworker.pdoc2gdoc(tmpdoc)
        try:
            self.gateworker.worker.run4Document(self.controller, gdoc)
            if self.update_document:
                self.gateworker.gdocanns2pdoc(
                    gdoc, doc, annspec=self.annspec_receive, replace=self.replace_anns
                )
            else:
                doc = self.gateworker.gdoc2pdoc(gdoc)
        finally:
            # Always release the Java-side document, otherwise a failing pipeline run
            # would leave it allocated in the worker.
            self.gateworker.del_resource(gdoc)
        return doc
Classes
class GateWorkerAnnotator (pipeline, gateworker, annspec_send=None, annspec_receive=None, replace_anns=False, update_document=False)
-
Helper class that provides a standard way to create an ABC using inheritance.
Create a GateWorker annotator.
This loads the pipeline into the given gate worker and can then be used to annotate Python gatenlp Document instances with the Java GATE pipeline.
Note: to make sure that start/finish callbacks on the Java side are invoked, the annotator start() method should be invoked once before processing documents and finish() should get called once after processing documents. (Any Executor implementation should do this automatically)
If the GateWorkerAnnotator is not used any more, close() should be invoked to terminate the Java GATE Worker process.
Example
pipeline = GateWorkerAnnotator("annie.xgapp", GateWorker())
for idx, doc in enumerate(mycorpus):
    mycorpus[idx] = pipeline(doc)
Args
pipeline
- the path to a Java GATE pipeline to load into the GATE worker
gateworker
- the GateWorker instance to use
annspec_send
- a list of either annotation set names, or tuples where the first element is the name of an annotation set and the second element is either the name of a type or a list of type names. If not None, only the sets/types specified are sent to Java GATE. If an empty list is specified, no annotations are sent at all.
annspec_receive
- this only works if update_document is True: same format as annspec_send to specify which annotation sets/types are sent back to Python after the document has been processed on the Java side.
replace_anns
- this is only relevant if update_document is True: if True and an annotation is received which already exists (same set and annotation id) then the existing annotation is replaced (if offsets and type are also same, only the features are replaced). If False, all received annotations are added which may change their annotation id.
update_document
- if True, then existing annotations in the gatenlp document are kept and the annotations received from Java GATE are added. In this case, other changes to the document, e.g. the document text or document features are not applied to the current python document. If False, the existing document is completely replaced with what gets received from Java GATE.
Expand source code
class GateWorkerAnnotator(Annotator):
    """
    Annotator that runs a Java GATE pipeline on gatenlp documents via a GateWorker.
    """

    # TODO: parameter to influence how exceptions are handled
    def __init__(
        self,
        pipeline,
        gateworker,
        annspec_send=None,
        annspec_receive=None,
        replace_anns=False,
        update_document=False,
    ):
        """
        Create a GateWorker annotator.

        This loads the pipeline into the given gate worker and can then be used to
        annotate Python gatenlp Document instances with the Java GATE pipeline.

        Note: to make sure that start/finish callbacks on the Java side are invoked, the annotator
        start() method should be invoked once before processing documents and finish() should
        get called once after processing documents. (Any Executor implementation should do this
        automatically)

        If the GateWorkerAnnotator is not used any more, close() should be invoked to terminate
        the Java GATE Worker process.
        NOTE(review): this class defines no close() of its own; presumably close() has to be
        called on the GateWorker instance that was passed in -- confirm.

        Example:

        ```python
        pipeline = GateWorkerAnnotator("annie.xgapp", GateWorker())
        for idx, doc in enumerate(mycorpus):
            mycorpus[idx] = pipeline(doc)
        ```

        Args:
            pipeline: the path or URL of a Java GATE pipeline to load into the GATE worker
            gateworker: the GateWorker instance to use
            annspec_send: a list of either annotation set names, or tuples where the first element
                is the name of an annotation set and the second element is either the name of a type
                or a list of type names. If not None, only the sets/types specified are sent to
                Java GATE. If an empty list is specified, no annotations are sent at all.
            annspec_receive: this only works if update_document is True: same format as
                annspec_send to specify which annotation sets/types are sent back to Python
                after the document has been processed on the Java side.
            replace_anns: this is only relevant if update_document is True: if True and an
                annotation is received which already exists (same set and annotation id)
                then the existing annotation is replaced (if offsets and type are also same,
                only the features are replaced). If False, all received annotations are added
                which may change their annotation id.
            update_document: if True, then existing annotations in the gatenlp document are kept
                and the annotations received from Java GATE are added. In this case, other changes
                to the document, e.g. the document text or document features are not applied to
                the current python document. If False, the existing document is completely
                replaced with what gets received from Java GATE.
        """
        self.pipeline = pipeline
        self.annspec_send = annspec_send
        self.annspec_receive = annspec_receive
        self.replace_anns = replace_anns
        self.update_document = update_document
        self.gateworker = gateworker
        # is_url yields a flag plus the location value that is handed to the matching loader.
        isurl, ext = is_url(pipeline)
        if isurl:
            self.controller = self.gateworker.worker.loadPipelineFromUri(ext)
        else:
            self.controller = self.gateworker.worker.loadPipelineFromFile(ext)
        self.corpus = self.gateworker.worker.newCorpus()
        self.controller.setCorpus(self.corpus)
        # Callbacks are switched off on the controller; start() and finish() below
        # invoke the execution started/finished hooks explicitly instead.
        self.controller.setControllerCallbacksEnabled(False)

    def start(self):
        """
        Invoke the controller execution started method on the GATE controller.
        """
        self.controller.invokeControllerExecutionStarted()

    def finish(self):
        """
        Invoke the controller execution finished method on the GATE controller.
        """
        self.controller.invokeControllerExecutionFinished()

    def __call__(self, doc, **_kwargs):
        """
        Run the GATE controller on the given document.

        This runs the GATE pipeline (controller) on the given document by first sending the
        document to the GATE process and converting it to a GATE document there, running the
        pipeline on it, and transferring the result back to Python. The GATE-side document is
        released again in every case, also when processing raises an exception.

        Args:
            doc: the document to process
            **_kwargs: ignored so far

        Returns:
            the processed gatenlp document: the same instance as `doc`, updated with the
            received annotations, if update_document is True, otherwise the document
            converted back from the GATE document.
        """
        if self.annspec_send is not None:
            # create shallow copy, we only need it for reading!
            tmpdoc = doc.copy(annspec=self.annspec_send)
        else:
            tmpdoc = doc
        gdoc = self.gateworker.pdoc2gdoc(tmpdoc)
        try:
            self.gateworker.worker.run4Document(self.controller, gdoc)
            if self.update_document:
                self.gateworker.gdocanns2pdoc(
                    gdoc, doc, annspec=self.annspec_receive, replace=self.replace_anns
                )
            else:
                doc = self.gateworker.gdoc2pdoc(gdoc)
        finally:
            # Always release the Java-side document, otherwise a failing pipeline run
            # would leave it allocated in the worker.
            self.gateworker.del_resource(gdoc)
        return doc
Ancestors
- Annotator
- abc.ABC
Methods
def __call__(self, doc, **_kwargs)
-
Run the GATE controller on the given document.
This runs the GATE pipeline (controller) on the given document by first sending the document to the GATE process and converting it to a GATE document there, running the pipeline on it, and sending the result back: if update_document is True the received annotations are added to the given document, otherwise the result is converted back to a new gatenlp Document.
Args
doc
- the document to process
**_kwargs
- ignored so far
Returns
the processed gatenlp document
Expand source code
def __call__(self, doc, **_kwargs):
    """
    Run the GATE controller on the given document.

    This runs the GATE pipeline (controller) on the given document by first sending the
    document to the GATE process and converting it to a GATE document there, running the
    pipeline on it, and transferring the result back to Python. The GATE-side document is
    released again in every case, also when processing raises an exception.

    Args:
        doc: the document to process
        **_kwargs: ignored so far

    Returns:
        the processed gatenlp document: the same instance as `doc`, updated with the
        received annotations, if update_document is True, otherwise the document
        converted back from the GATE document.
    """
    if self.annspec_send is not None:
        # create shallow copy, we only need it for reading!
        tmpdoc = doc.copy(annspec=self.annspec_send)
    else:
        tmpdoc = doc
    gdoc = self.gateworker.pdoc2gdoc(tmpdoc)
    try:
        self.gateworker.worker.run4Document(self.controller, gdoc)
        if self.update_document:
            self.gateworker.gdocanns2pdoc(
                gdoc, doc, annspec=self.annspec_receive, replace=self.replace_anns
            )
        else:
            doc = self.gateworker.gdoc2pdoc(gdoc)
    finally:
        # Always release the Java-side document, otherwise a failing pipeline run
        # would leave it allocated in the worker.
        self.gateworker.del_resource(gdoc)
    return doc
def finish(self)
-
Invoke the controller execution finished method on the GATE controller.
Expand source code
def finish(self):
    """
    Tell the GATE controller that execution has finished.

    Delegates to the controller's invokeControllerExecutionFinished method.
    """
    controller = self.controller
    controller.invokeControllerExecutionFinished()
def start(self)
-
Invoke the controller execution started method on the GATE controller.
Expand source code
def start(self):
    """
    Tell the GATE controller that execution has started.

    Delegates to the controller's invokeControllerExecutionStarted method.
    """
    controller = self.controller
    controller.invokeControllerExecutionStarted()
Inherited members