Module gatenlp.processing.client.textrazor
TextRazor client.
"""
TextRazor client.
"""
import logging
import time
import requests
from gatenlp.processing.annotator import Annotator
from gatenlp.utils import init_logger
class TextRazorTextAnnotator(Annotator):
"""
An annotator that sends document text to the TextRazor Annotation service (https://www.textrazor.com/)
and uses the result to annotate the document.
NOTE: this annotator and how it can get parametrized will still change!
"""
def __init__(
self,
url=None, # use default
auth_token=None,
lang=None, # if None/not specified, TextRazor auto-detects
extractors=None,
outset_name="",
min_delay_ms=501,
):
"""
Create a TextRazorTextAnnotator.
Args:
lang: if specified, override the auto-detected language of the text
auth_token: the authentication token needed to use the service
url: the annotation service endpoint, is None, the default endpoint https://api.textrazor.com is used
extractors: a list of extractor names or a string with comma-separated extractor names to add to the
minimum extractors (words, sentences). If None uses words, sentences, entities.
NOTE: currently only words, sentences, entities is supported.!
outset_name: the annotationset to put the new annotations in
min_delay_ms: minimum time in ms to wait between requests to the server
"""
if url is None:
url = "https://api.textrazor.com"
self.url = url
self.lang = lang
self.outset_name = outset_name
self.auth_token = auth_token
self.min_delay_s = min_delay_ms / 1000.0
self.logger = init_logger()
# self.logger.setLevel(logging.DEBUG)
self._last_call_time = 0
if extractors is not None:
if isinstance(extractors, str):
extractors = extractors.split(",")
if isinstance(extractors, list):
allextrs = set()
allextrs.update(extractors)
allextrs.update(["words", "sentences"])
self.extractors = ",".join(list(allextrs))
else:
raise Exception("Odd extractors, must be list of strings or string")
else:
self.extractors = "words,sentences,entities"
def __call__(self, doc, **kwargs):
delay = time.time() - self._last_call_time
if delay < self.min_delay_s:
time.sleep(self.min_delay_s - delay)
text = doc.text
hdrs = {
# 'Content-Type': 'text/plain; charset=UTF-8',
# 'Accept-encoding': 'gzip' # TODO: to enable compressed responses
# 'Content-encoding': 'gzip' # TODO: to enable compressed requests
"X-TextRazor-Key": self.auth_token
}
data = {"text": text.encode("UTF-8")}
if self.extractors:
data["extractors"] = self.extractors
if self.lang:
data["languageOverride"] = self.lang
self.logger.debug(f"Sending request to {self.url}, data={data}, headers={hdrs}")
response = requests.post(
self.url,
# params=params,
data=data,
headers=hdrs,
)
scode = response.status_code
if scode != 200:
raise Exception(f"Something went wrong, received status code {scode}")
json = response.json()
ok = json.get("ok", False)
if not ok:
raise Exception(f"Something went wrong, did not get OK, json: {json}")
self.logger.debug(f"Response JSON: {json}")
resp = json.get("response", {})
entities = resp.get("entities", [])
sentences = resp.get("sentences", [])
categories = resp.get("categories", [])
topics = resp.get("topics", [])
entailments = resp.get("entailments", [])
relations = resp.get("relations", [])
properties = resp.get("properties", [])
nounphrases = resp.get("nounPhrases", [])
language = resp.get("language")
languageIsReliable = resp.get("languageIsReliable")
tok2off = {} # maps token idxs to tuples (start,end)
annset = doc.annset(self.outset_name)
for s in sentences:
sentstart = None
sentend = None
words = s.get("words", [])
end = None
for word in words:
start = word["startingPos"]
end = word["endingPos"]
if sentstart is None:
sentstart = start
tokidx = word["position"]
feats = {}
feats["partOfSpeech"] = word["partOfSpeech"]
feats["lemma"] = word["lemma"]
if word.get("stem"):
feats["stem"] = word["stem"]
annset.add(start, end, "Token", features=feats)
tok2off[tokidx] = (start, end)
if end is not None:
sentend = end
if sentstart is not None and sentend is not None:
annset.add(sentstart, sentend, "Sentence")
for ent in entities:
feats = {}
for fname in [
"wikiLink",
"entityEnglishId",
"wikidataId",
"relevanceScore",
"confidenceScore",
"type",
"freebaseId",
"entityId",
"freebaseTypes",
]:
if fname in ent:
feats[fname] = ent[fname]
annset.add(ent["startingPos"], ent["endingPos"], "Entity", feats)
return doc
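The following is a minimal usage sketch, not part of the module source: it assumes the usual gatenlp top-level Document class and annotation-set API, and the API key, example text and set name are placeholders.

from gatenlp import Document
from gatenlp.processing.client.textrazor import TextRazorTextAnnotator

# Placeholder token; a real TextRazor API key is required for actual requests.
annotator = TextRazorTextAnnotator(auth_token="MY_TEXTRAZOR_KEY", outset_name="TextRazor")

doc = Document("Barack Obama visited Paris in 2015.")
doc = annotator(doc)  # sends the text to the service and adds the annotations

# Token and Sentence annotations are always created; Entity annotations carry
# features such as wikiLink, wikidataId and confidenceScore when returned.
for ann in doc.annset("TextRazor").with_type("Entity"):
    print(ann.start, ann.end, ann.features.get("entityId"))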
Classes
class TextRazorTextAnnotator (url=None, auth_token=None, lang=None, extractors=None, outset_name='', min_delay_ms=501)
An annotator that sends document text to the TextRazor Annotation service (https://www.textrazor.com/) and uses the result to annotate the document.
NOTE: this annotator and the way it is parametrized are still subject to change!
Create a TextRazorTextAnnotator.
Args
lang
- if specified, override the auto-detected language of the text
auth_token
- the authentication token needed to use the service
url
- the annotation service endpoint; if None, the default endpoint https://api.textrazor.com is used
extractors
- a list of extractor names or a string with comma-separated extractor names to add to the minimum extractors (words, sentences). If None, uses words, sentences, entities. NOTE: currently only words, sentences and entities are supported!
outset_name
- the annotation set to put the new annotations in
min_delay_ms
- minimum time in ms to wait between requests to the server
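For illustration, a hedged construction sketch follows (the token value is a placeholder; extractors may be passed as a list or a comma-separated string, and words and sentences are always added to whatever is given):

from gatenlp.processing.client.textrazor import TextRazorTextAnnotator

annotator = TextRazorTextAnnotator(
    auth_token="MY_TEXTRAZOR_KEY",   # placeholder API token
    lang="eng",                      # optional: sent to the service as languageOverride
    extractors=["entities"],         # "words" and "sentences" get added automatically
    outset_name="TextRazor",         # "" would use the default annotation set
    min_delay_ms=1000,               # wait at least one second between requests
)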
Ancestors
- Annotator
- abc.ABC
Inherited members