Module gatenlp.processing.client.ibmnlu
IBM Natural Language Understanding client.
"""
IBM Natural Language Understanding client.
"""
import json
from typing import Union, Optional, List
import logging
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson.natural_language_understanding_v1 import Features, CategoriesOptions, ConceptsOptions
from ibm_watson.natural_language_understanding_v1 import EmotionOptions, EntitiesOptions, KeywordsOptions
from ibm_watson.natural_language_understanding_v1 import RelationsOptions, SemanticRolesOptions, SentimentOptions
from ibm_watson.natural_language_understanding_v1 import SyntaxOptions, SyntaxOptionsTokens
from gatenlp.processing.annotator import Annotator
from gatenlp.utils import init_logger
def string2int(string, max=999999999, fname=""):
"""Helper function to return string converted to int or raise informative exception"""
if string is None:
return None
try:
val = int(string)
    except (TypeError, ValueError):
        raise Exception(f"Parameter {fname} cannot be converted to int")
if val > max:
raise Exception(f"Parameter {fname} exceeds maximum value {max}")
return val
def which2ibm_features(which):
"""
    Convert the simplified list of features to a dict of IBM feature options (suitable for passing
    as keyword arguments to Features). The which parameter is either a string of comma-separated
    feature names or a list of feature names.
"""
if isinstance(which, str):
which = which.split(",")
parms = {}
for feat in which:
if ":" in feat:
fname, fparm = feat.split(":", 1)
else:
fname, fparm = feat, None
if fname == "syntax":
parms["syntax"] = SyntaxOptions(tokens=SyntaxOptionsTokens(lemma=True, part_of_speech=True), sentences=True)
elif fname == "concepts":
limit = string2int(fparm, max=50, fname=fname)
parms["concepts"] = ConceptsOptions(limit=limit)
elif fname == "emotion":
parms["emotion"] = EmotionOptions()
elif fname == "entities":
limit = string2int(fparm, max=250, fname=fname)
parms["entities"] = EntitiesOptions(limit=limit, mentions=True, sentiment=True, emotion=True)
elif fname == "keywords":
limit = string2int(fparm, max=250, fname=fname)
parms["keywords"] = KeywordsOptions(limit=3, sentiment=True, emotion=True)
elif fname == "relations":
parms["relations"] = RelationsOptions()
elif fname == "semantic_roles":
limit = string2int(fparm, max=250, fname=fname)
parms["semantic_roles"] = SemanticRolesOptions(limit=limit, keywords=True, entities=True)
elif fname == "sentiment":
parms["sentiment"] = SentimentOptions()
elif fname == "categories":
limit = string2int(fparm, max=10, fname=fname)
parms["categories"] = CategoriesOptions(explanation=False, limit=limit)
return parms
def get_nested(thedict, thekey, sep="_"):
"""Helper function to get a dot-separated nested key from a dictionary or return None if the key does not exist"""
# Note: this throws an exception if something that is expected to be a nested dict is not.
keylist = thekey.split(sep)
cur = thedict
for curkey in keylist:
cur = cur.get(curkey)
if cur is None:
return None
return cur
class IbmNluAnnotator(Annotator):
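    """
    Annotator that sends the document text to the IBM Natural Language Understanding service and
    adds the returned analyses as annotations and document features.
    """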
def __init__(
self,
url: str = None,
apikey: str = None,
ibm_features: Optional[Features] = None,
which_features: Optional[Union[str, List[str]]] = None,
lang: Optional[str] = None,
outset_name: str = "",
doc_feature_map: Optional[dict] = None,
entity_type_map: Optional[dict] = None,
debug: bool = False,
):
"""
Create an IbmNluAnnotator.
Args:
url: the IBM service URL to use.
apikey: the IBM service API key to use.
            ibm_features: a pre-initialized IBM Features object that defines which kinds of analyses should be
                carried out.
            which_features: a comma-separated string or a Python list of features to return,
possible values are: concepts, emotion, entities,
keywords, sentiment, categories, syntax.
(NOTE: not yet implemented: relations, semantic_roles)
For some of these features,
it is possible to add additional settings by appending them separated by a colon: concepts:10 (return
a maximum of 10 concepts); keywords:10 (return a maximum of 10 keywords); categories:3 (return
                a maximum of 3 categories, max value is 10). This is ignored if ibm_features is set.
If neither this nor ibm_features is specified, the default is "syntax". The actual ibm_features
used are available in the attribute `ibm_features` of the object after initialization.
lang: if not None, the ISO 639-1 code of the language the text is in. If None, automatically
determines the language and stores it as a document feature according to the doc_feature_map parameter.
outset_name: the name of the annotation set where to create the annotations (default: "")
            doc_feature_map: a map that maps original IBM features to document features. If an IBM feature is mapped
                to None, that feature is not stored. Supported mapping keys are: language, concepts, emotion,
                keywords, sentiment, categories.
entity_type_map: a map that maps original entity types to annotation types. Only the types in the map
are affected, others are used unchanged.
"""
# See https://cloud.ibm.com/apidocs/natural-language-understanding?code=python
if not url or not apikey:
raise Exception("Parameters url and apikey are required.")
authenticator = IAMAuthenticator(apikey)
nlu = NaturalLanguageUnderstandingV1(version="2021-08-01", authenticator=authenticator)
nlu.set_service_url(url)
self.nlu = nlu
self.lang = lang
if doc_feature_map is None:
doc_feature_map = {x: x for x in ["language", "concepts", "emotion", "keywords", "sentiment", "categories"]}
if entity_type_map is None:
self.entity_type_map = {}
else:
self.entity_type_map = entity_type_map
self.doc_feature_map = doc_feature_map
self.outset_name = outset_name
self.debug = debug
self.logger = init_logger(__name__)
if debug:
self.logger.setLevel(logging.DEBUG)
if ibm_features:
self.ibm_features = ibm_features
else:
if which_features is None:
which_features = "syntax"
fparms = which2ibm_features(which_features)
self.ibm_features = Features(**fparms)
def __call__(self, doc, **kwargs):
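        """Analyze the document text with the IBM NLU service and add the results to the document."""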
resp = self.nlu.analyze(
text=doc.text,
features=self.ibm_features,
language=self.lang
).get_result()
outset = doc.annset(self.outset_name)
if self.debug:
tmp = json.dumps(resp, indent=2)
self.logger.debug(f"Result:\n{tmp}")
if "language" in resp:
fname = self.doc_feature_map["language"]
if fname:
doc.features[fname] = resp["language"]
if "entities" in resp:
ents = resp["entities"]
for ent in ents:
etype = ent["type"]
if self.entity_type_map.get(etype) is not None:
etype = self.entity_type_map[etype]
fmap = {}
for fname in ["relevance", "confidence", "sentiment.score", "sentiment.label",
"disambiguation.subtype", "disambiguation.name", "disambiguation.dbpedia_resource"]:
val = get_nested(ent, fname, sep=".")
if val is not None:
fmap[fname] = val
mentions = ent["mentions"]
for mention in mentions:
start, end = mention["location"]
fmap["mention_confidence"] = mention["confidence"]
outset.add(start, end, etype, fmap)
if "concepts" in resp:
fname = self.doc_feature_map["concepts"]
if fname:
doc.features[fname] = resp["concepts"]
if "categories" in resp:
fname = self.doc_feature_map["categories"]
if fname:
doc.features[fname] = resp["categories"]
if "emotion" in resp:
fname = self.doc_feature_map["emotion"]
emotion_dict = resp["emotion"]["document"]["emotion"]
fmap = {}
for kname, value in emotion_dict.items():
doc.features[fname+"_"+kname] = value
if "keywords" in resp:
fname = self.doc_feature_map["keywords"]
if fname:
doc.features[fname] = resp["keywords"]
if "sentiment" in resp:
fname = self.doc_feature_map["sentiment"]
sentiment_dict = resp["sentiment"]["document"]
fmap = {}
for kname, value in sentiment_dict.items():
doc.features[fname+"_"+kname] = value
if "syntax" in resp:
tokens = resp["syntax"]["tokens"]
for token in tokens:
start, end = token["location"]
fmap = {}
for fname in ["part_of_speech", "lemma"]:
if fname in token:
fmap[fname] = token[fname]
outset.add(start, end, "Token", fmap)
if "sentences" in resp["syntax"]:
sentences = resp["syntax"]["sentences"]
else:
sentences = []
for sentence in sentences:
start, end = sentence["location"]
outset.add(start, end, "Sentence")
return doc
Functions
def get_nested(thedict, thekey, sep='_')
-
Helper function to get a nested key (components separated by sep) from a dictionary, or return None if the key does not exist
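A minimal usage sketch (the nested dict shown is illustrative only):

ent = {"sentiment": {"score": 0.8, "label": "positive"}}
get_nested(ent, "sentiment.score", sep=".")      # -> 0.8
get_nested(ent, "disambiguation.name", sep=".")  # -> None (key not present)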
def string2int(string, max=999999999, fname='')
-
Helper function to convert a string to an int, or raise an informative exception
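A minimal usage sketch (the values shown are illustrative only):

string2int(None)                             # -> None
string2int("10", max=50, fname="concepts")   # -> 10
string2int("100", max=50, fname="concepts")  # raises Exception: exceeds maximum value 50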
def which2ibm_features(which)
-
Convert the simplified list of features to a dict of IBM feature options (suitable for passing as keyword arguments to Features). The which parameter is either a string of comma-separated feature names or a list of feature names.
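A minimal usage sketch (the feature specification shown is illustrative only):

parms = which2ibm_features("entities:10,sentiment,syntax")
features = Features(**parms)  # the Features object IbmNluAnnotator builds internally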
Classes
class IbmNluAnnotator (url: str = None, apikey: str = None, ibm_features: Optional[ibm_watson.natural_language_understanding_v1.Features] = None, which_features: Union[str, List[str], None] = None, lang: Optional[str] = None, outset_name: str = '', doc_feature_map: Optional[dict] = None, entity_type_map: Optional[dict] = None, debug: bool = False)
-
Annotator that sends the document text to the IBM Natural Language Understanding service and adds the returned analyses as annotations and document features.
Create an IbmNluAnnotator.
Args
url
- the IBM service URL to use.
apikey
- the IBM service API key to use.
ibm_features
- a pre-initialized IBM Features object that defines which kinds of analyses should be carried out.
which_features
- a comma-separated string or a Python list of features to return;
possible values are: concepts, emotion, entities,
keywords, sentiment, categories, syntax.
(NOTE: not yet implemented: relations, semantic_roles)
For some of these features,
it is possible to add additional settings by appending them separated by a colon: concepts:10 (return
a maximum of 10 concepts); keywords:10 (return a maximum of 10 keywords); categories:3 (return
a maximum of 3 categories, max value is 10). This is ignored if ibm_features is set.
If neither this nor ibm_features is specified, the default is "syntax". The actual ibm_features
used are available in the attribute ibm_features of the object after initialization.
lang
- if not None, the ISO 639-1 code of the language the text is in. If None, automatically determines the language and stores it as a document feature according to the doc_feature_map parameter.
outset_name
- the name of the annotation set where to create the annotations (default: "")
doc_feature_map
- a map that maps original IBM features to document features. If an IBM feature is mapped to None, that feature is not stored. Supported mapping keys are: language, concepts, emotion, keywords, sentiment, categories.
entity_type_map
- a map that maps original entity types to annotation types. Only the types in the map are affected, others are used unchanged.
Ancestors
- Annotator
- abc.ABC
Inherited members