Module gatenlp.processing.client.googlenlp
Google NLP client.
Expand source code
"""
Google NLP client.
"""
import json
from typing import Union, Optional, List
import logging
from google.cloud import language_v1
from gatenlp.features import Features
from gatenlp.processing.annotator import Annotator
from gatenlp.utils import init_logger
# See:
# https://googleapis.dev/python/language/latest/usage.html
# https://cloud.google.com/natural-language/docs/quickstart?hl=en_GB
# https://github.com/googleapis/python-language/tree/main/samples
# https://cloud.google.com/natural-language/docs/samples
# https://cloud.google.com/natural-language/docs/reference/libraries
def set_if_not_none(thedict, thekey, thevalue, suf=None):
    """
    Store thevalue under thekey in thedict, skipping values that carry no information.

    A value is skipped if it is None, or if suf is given and the value is a
    string ending in that suffix (e.g. enum names like "POS_UNKNOWN").

    Args:
        thedict: the mapping to update in place
        thekey: the key to set
        thevalue: the value to store, ignored if None
        suf: if given, string values ending in this suffix are also ignored
    """
    if thevalue is None:
        return
    if suf and isinstance(thevalue, str) and thevalue.endswith(suf):
        return
    thedict[thekey] = thevalue
class GoogleNlpAnnotator(Annotator):
    """
    Annotator that sends the document text to the Google Natural Language API and adds
    the returned sentences, tokens and entity mentions as annotations to the document.

    See https://cloud.google.com/natural-language/docs for the underlying service.
    """

    # Maps the user-facing feature names accepted in which_features to the
    # corresponding flags of the AnnotateTextRequest.Features message.
    FEATURE_MAP = {
        "syntax": "extract_syntax",
        "entities": "extract_entities",
        "document_sentiment": "extract_document_sentiment",
        "entity_sentiment": "extract_entity_sentiment",
        "classify": "classify_text",
    }

    def __init__(
        self,
        which_features: Optional[Union[str, List[str]]] = None,
        lang: Optional[str] = None,
        outset_name: str = "",
        debug: bool = False,
    ):
        """
        Create a GoogleNlpAnnotator.

        Args:
            which_features: a comma separated string or a python list of features to request,
                possible values are: syntax, entities, document_sentiment, entity_sentiment,
                classify. Default is "syntax,entities"
            lang: if not None, the ISO 639-1 code of the language the text is in. If None, the
                service detects the language automatically and it is stored as the document
                feature "language".
                See supported: https://cloud.google.com/natural-language/docs/languages
            outset_name: the name of the annotation set where to create the annotations (default: "")
            debug: if True, enable debug logging

        Raises:
            ValueError: if an unknown feature name is given in which_features
        """
        self.outset_name = outset_name
        self.lang = lang
        self.debug = debug
        self.logger = init_logger(__name__)
        if debug:
            self.logger.setLevel(logging.DEBUG)
        if which_features is None:
            which_features = "syntax,entities"
        if isinstance(which_features, str):
            which_features = [x.strip() for x in which_features.split(",")]
        else:
            which_features = [x.strip() for x in which_features]
        goo_feats = {}
        for feature in which_features:
            try:
                goo_feats[self.FEATURE_MAP[feature]] = True
            except KeyError:
                raise ValueError(f"Processing feature {feature} not allowed") from None
        # NOTE: the attribute name "ibm_features" is kept for backward compatibility
        # (copy-paste heritage from the IBM NLU client); it holds the Google
        # AnnotateTextRequest features selection.
        self.ibm_features = language_v1.types.language_service.AnnotateTextRequest.Features(**goo_feats)
        self.which_features = which_features
        self.client = language_v1.LanguageServiceClient()

    def __call__(self, doc, **kwargs):
        """
        Annotate the document by sending its text to the Google NLP service.

        Args:
            doc: the gatenlp document to annotate
            **kwargs: ignored

        Returns:
            the same document, with annotations and document features added
        """
        goodoc = language_v1.Document(
            content=doc.text,
            type_="PLAIN_TEXT",
            language=self.lang,
        )
        outset = doc.annset(self.outset_name)
        # UTF32 encoding makes the returned begin_offset values usable directly
        # as Python string offsets.
        resp = self.client.annotate_text(document=goodoc, encoding_type="UTF32", features=self.ibm_features)
        if self.debug:
            self.logger.debug(f"Response: {resp}")
        set_if_not_none(doc.features, "language", resp.language)
        if self.ibm_features.extract_document_sentiment:
            set_if_not_none(doc.features, "document_sentiment_magnitude", resp.document_sentiment.magnitude)
            set_if_not_none(doc.features, "document_sentiment_score", resp.document_sentiment.score)
        for sentence in (resp.sentences or []):
            start = sentence.text.begin_offset
            end = start + len(sentence.text.content)
            fmap = {}
            if self.ibm_features.extract_document_sentiment:
                set_if_not_none(fmap, "sentiment_magnitude", sentence.sentiment.magnitude)
                set_if_not_none(fmap, "sentiment_score", sentence.sentiment.score)
            outset.add(start, end, "Sentence", fmap)
        for token in (resp.tokens or []):
            start = token.text.begin_offset
            end = start + len(token.text.content)
            fmap = {}
            # suf="_UNKNOWN" drops the "unset" enum values the API returns
            # (e.g. POS_UNKNOWN) so they do not clutter the annotation features.
            set_if_not_none(fmap, "pos", token.part_of_speech.tag.name, suf="_UNKNOWN")
            set_if_not_none(fmap, "number", token.part_of_speech.number.name, suf="_UNKNOWN")
            set_if_not_none(fmap, "mood", token.part_of_speech.mood.name, suf="_UNKNOWN")
            set_if_not_none(fmap, "person", token.part_of_speech.person.name, suf="_UNKNOWN")
            set_if_not_none(fmap, "gender", token.part_of_speech.gender.name, suf="_UNKNOWN")
            set_if_not_none(fmap, "case", token.part_of_speech.case.name, suf="_UNKNOWN")
            set_if_not_none(fmap, "proper", token.part_of_speech.proper.name, suf="_UNKNOWN")
            set_if_not_none(fmap, "tense", token.part_of_speech.tense.name, suf="_UNKNOWN")
            set_if_not_none(fmap, "head", token.dependency_edge.head_token_index)
            set_if_not_none(fmap, "deprel", token.dependency_edge.label.name, suf="_UNKNOWN")
            set_if_not_none(fmap, "lemma", token.lemma)
            outset.add(start, end, "Token", fmap)
        for entity in (resp.entities or []):
            fmap = dict(
                type=entity.type_.name,
                salience=entity.salience,
                name=entity.name,
            )
            fmap.update(entity.metadata)
            for mention in entity.mentions:
                start = mention.text.begin_offset
                end = start + len(mention.text.content)
                # Pass a fresh copy per mention so the per-mention features added
                # below (mtype, sentiment) can never alias through a shared dict.
                ann = outset.add(start, end, entity.type_.name, dict(fmap))
                ann.features["mtype"] = mention.type_.name
                if self.ibm_features.extract_entity_sentiment:
                    set_if_not_none(ann.features, "sentiment_magnitude", mention.sentiment.magnitude)
                    set_if_not_none(ann.features, "sentiment_score", mention.sentiment.score)
        return doc
Functions
def set_if_not_none(thedict, thekey, thevalue, suf=None)
-
Helper function to set the key of a dict to the given value, if the value is not None and not something ending in _UNKNOWN (if it is a string)
Expand source code
def set_if_not_none(thedict, thekey, thevalue, suf=None): """ Helper function to set the key of a dict to the given value, if the value is not None and not something ending in _UNKNOWN (if it is a string) """ if thevalue is not None: if suf: if isinstance(thevalue, str) and thevalue.endswith(suf): return thedict[thekey] = thevalue
Classes
class GoogleNlpAnnotator (which_features: Union[str, List[str], None] = None, lang: Optional[str] = None, outset_name: str = '', debug: bool = False)
-
Helper class that provides a standard way to create an ABC using inheritance.
Create a GoogleNlpAnnotator.
Args
which_features
- a comma separated list or a python list of features to return, possible values are: syntax, entities, document_sentiment, entity_sentiment, classify. Default is "syntax,entities"
lang
- if not None, the ISO 639-1 code of the language the text is in. If None, the service automatically detects the language and it is stored as the document feature "language". See supported: https://cloud.google.com/natural-language/docs/languages
outset_name
- the name of the annotation set where to create the annotations (default: "")
debug
- if True, enable debugging logging
Expand source code
class GoogleNlpAnnotator(Annotator): def __init__( self, which_features: Optional[Union[str, List[str]]] = None, lang: Optional[str] = None, outset_name: str = "", debug: bool = False, ): """ Create an IbmNluAnnotator. Args: which_features: a comma separated list or a python list of features to return, possible values are: syntax, entities, document_sentiment, entity_sentiment, classify. Default is "syntax,entities" lang: if not None, the ISO 639-1 code of the language the text is in. If None, automatically determines the language and stores it as a document feature according to the doc_feature_map parameter. See supported: https://cloud.google.com/natural-language/docs/languages outset_name: the name of the annotation set where to create the annotations (default: "") debug: if True, enable debugging logging """ self.outset_name = outset_name self.lang = lang self.debug = debug self.logger = init_logger(__name__) if debug: self.logger.setLevel(logging.DEBUG) if which_features is None: which_features = "syntax,entities" if isinstance(which_features,str): which_features = [x.strip() for x in which_features.split(",")] else: which_features = [x.strip() for x in which_features] goo_feats = {} for feature in which_features: if feature == "syntax": goo_feats["extract_syntax"] = True elif feature == "entities": goo_feats["extract_entities"] = True elif feature == "document_sentiment": goo_feats["extract_document_sentiment"] = True elif feature == "entity_sentiment": goo_feats["extract_entity_sentiment"] = True elif feature == "classify": goo_feats["classify_text"] = True else: raise Exception(f"Processing feature {feature} not allowed") self.ibm_features = language_v1.types.language_service.AnnotateTextRequest.Features(**goo_feats) self.which_features = which_features self.client = language_v1.LanguageServiceClient() def __call__(self, doc, **kwargs): goodoc = language_v1.Document( content=doc.text, type_="PLAIN_TEXT", language=self.lang, ) outset = 
doc.annset(self.outset_name) resp = self.client.annotate_text(document=goodoc, encoding_type="UTF32", features=self.ibm_features) if self.debug: self.logger.debug(f"Response: {resp}") set_if_not_none(doc.features, "language", resp.language) if self.ibm_features.extract_document_sentiment: set_if_not_none(doc.features, "document_sentiment_magnitude", resp.document_sentiment.magnitude) set_if_not_none(doc.features, "document_sentiment_score", resp.document_sentiment.score) for sentence in (resp.sentences or []): slen = len(sentence.text.content) start = sentence.text.begin_offset end = start + slen fmap = {} if self.ibm_features.extract_document_sentiment: set_if_not_none(fmap, "sentiment_magnitude", sentence.sentiment.magnitude) set_if_not_none(fmap, "sentiment_score", sentence.sentiment.score) outset.add(start, end, "Sentence", fmap) for token in (resp.tokens or []): tlen = len(token.text.content) start = token.text.begin_offset end = start + tlen fmap = {} set_if_not_none(fmap, "pos", token.part_of_speech.tag.name, suf="_UNKNOWN") set_if_not_none(fmap, "number", token.part_of_speech.number.name, suf="_UNKNOWN") set_if_not_none(fmap, "mood", token.part_of_speech.mood.name, suf="_UNKNOWN") set_if_not_none(fmap, "person", token.part_of_speech.person.name, suf="_UNKNOWN") set_if_not_none(fmap, "gender", token.part_of_speech.gender.name, suf="_UNKNOWN") set_if_not_none(fmap, "case", token.part_of_speech.case.name, suf="_UNKNOWN") set_if_not_none(fmap, "proper", token.part_of_speech.proper.name, suf="_UNKNOWN") set_if_not_none(fmap, "tense", token.part_of_speech.tense.name, suf="_UNKNOWN") set_if_not_none(fmap, "head", token.dependency_edge.head_token_index) set_if_not_none(fmap, "deprel", token.dependency_edge.label.name, suf="_UNKNOWN") set_if_not_none(fmap, "lemma", token.lemma) outset.add(start, end, "Token", fmap) for entity in (resp.entities or []): fmap = dict( type=entity.type_.name, salience=entity.salience, name=entity.name, ) fmap.update(entity.metadata) for 
mention in entity.mentions: mlen = len(mention.text.content) start = mention.text.begin_offset end = start + mlen ann = outset.add(start, end, entity.type_.name, fmap) ann.features["mtype"] = mention.type_.name if self.ibm_features.extract_entity_sentiment: set_if_not_none(ann.features, "sentiment_magnitude", mention.sentiment.magnitude) set_if_not_none(ann.features, "sentiment_score", mention.sentiment.score) return doc
Ancestors
- Annotator
- abc.ABC
Inherited members