Module gatenlp.processing.client.tagme
TagMe client.
Expand source code
"""
TagMe client.
"""
import time
import requests
from gatenlp.processing.annotator import Annotator
from gatenlp.utils import init_logger
from gatenlp.offsetmapper import OffsetMapper
class TagMeAnnotator(Annotator):
"""
An annotator that sends text to the TagMe Annotation service
(https://sobigdata.d4science.org/group/tagme/tagme)
and uses the result to annotate the document.
"""
def __init__(
self,
url=None,
auth_token=None,
lang="en",
ann_type="Mention",
task="tag", # or spot
outset_name="",
min_delay_ms=501,
tweet=False,
include_all_spots=False,
long_text=None,
epsilon=None,
link_pattern="https://{0}.wikipedia.org/wiki/{1}",
):
"""
Create a TagMeAnnotator.
Args:
url: the annotation service endpoint, is None, the default endpoint for the task (spot or tag) is used
auth_token: the authentication token needed to use the service
lang: the language of the text, one of 'de', 'en' (default), 'it'
ann_type: the annotation type for the new annotations, default is "Mention"
task: one of "spot" (only find mentions) or "tag" (find mentions and link), default is "tag"
outset_name: the annotationset to put the new annotations in
min_delay_ms: minimum time in ms to wait between requests to the server
tweet: if True, TagMe expects a Tweet (default is False)
include_all_spots: if True, include spots that cannot be linked (default is False)
long_text: if not None, the context length to use (default: None)
epsilon: if not None, the epsilong value (float) to use (default: None)
link_pattern: the URL pattern to use to turn the "title" returned from TagMe into an actual link. The
default is "https://{0}.wikipedia.org/wiki/{1}" where {0} gets replaced with the language code and
{1} gets replaced with the title.
"""
if url is None:
if task == "tag":
url = "https://tagme.d4science.org/tagme/tag"
elif task == "spot":
url = "https://tagme.d4science.org/tagme/spot"
else:
raise Exception("task must be 'tag' or 'spot'")
assert lang in ["en", "de", "it"]
if long_text is not None:
assert isinstance(long_text, int)
if epsilon is not None:
assert isinstance(epsilon, float)
self.long_text = long_text
self.epsilon = epsilon
self.lang = lang
self.auth_token = auth_token
self.url = url
self.tweet = tweet
self.include_all_spots = include_all_spots
self.outset_name = outset_name
self.min_delay_s = min_delay_ms / 1000.0
self.logger = init_logger()
# self.logger.setLevel(logging.DEBUG)
self._last_call_time = 0
self.ann_type = ann_type
self.link_pattern = link_pattern
def __call__(self, doc, **kwargs):
if "tweet" in kwargs:
tweet = kwargs["tweet"]
else:
tweet = self.tweet
delay = time.time() - self._last_call_time
if delay < self.min_delay_s:
time.sleep(self.min_delay_s - delay)
text = doc.text
hdrs = {
"Content-Type": "text/plain; charset=UTF-8",
"Accept": "application/gate+json",
}
params = {
"text": text,
"gcube-token": self.auth_token,
"lang": self.lang,
}
if self.include_all_spots:
params["include_all_spots"] = "true"
if tweet:
params["tweet"] = "true"
if self.long_text is not None:
params["long_text"] = self.long_text
if self.epsilon is not None:
params["epsilon"] = self.epsilon
response = requests.post(self.url, params=params, headers=hdrs)
scode = response.status_code
if scode != 200:
raise Exception(f"Something went wrong, received status code {scode}")
json = response.json()
# self.logger.debug(f"Response JSON: {json}")
ents = json.get("annotations", {})
annset = doc.annset(self.outset_name)
om = OffsetMapper(text)
for ent in ents:
start = ent["start"]
end = ent["end"]
start, end = om.convert_to_python([start, end])
feats = {}
title = ent.get("title")
if title is not None:
if self.link_pattern:
feats["url"] = self.link_pattern.format(self.lang, title)
else:
feats["title"] = title
for fname in ["id", "rho", "link_probability", "lp"]:
fval = ent.get(fname)
if fval is not None:
feats[fname] = fval
# self.logger.debug(f"Adding annotation {start},{end},{feats}")
annset.add(start, end, self.ann_type, features=feats)
return doc
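A minimal usage sketch (not part of the module source): the gcube token and the example text are placeholders, and the Document/annotation calls follow the gatenlp conventions used above.

from gatenlp import Document
from gatenlp.processing.client.tagme import TagMeAnnotator

# placeholder token; a real D4Science gcube token is required to call the service
annotator = TagMeAnnotator(auth_token="YOUR-GCUBE-TOKEN", lang="en", include_all_spots=True)

doc = Document("Barack Obama visited Rome in 2009.")
doc = annotator(doc)

# inspect the "Mention" annotations created in the default annotation set
for ann in doc.annset():
    print(ann.start, ann.end, ann.features.get("url"), ann.features.get("rho"))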
Classes
class TagMeAnnotator (url=None, auth_token=None, lang='en', ann_type='Mention', task='tag', outset_name='', min_delay_ms=501, tweet=False, include_all_spots=False, long_text=None, epsilon=None, link_pattern='https://{0}.wikipedia.org/wiki/{1}')
-
An annotator that sends text to the TagMe Annotation service (https://sobigdata.d4science.org/group/tagme/tagme) and uses the result to annotate the document.
Create a TagMeAnnotator.
Args
url
- the annotation service endpoint; if None, the default endpoint for the task (spot or tag) is used
auth_token
- the authentication token needed to use the service
lang
- the language of the text, one of 'de', 'en' (default), 'it'
ann_type
- the annotation type for the new annotations, default is "Mention"
task
- one of "spot" (only find mentions) or "tag" (find and link mentions), default is "tag"
outset_name
- the annotation set to put the new annotations in
min_delay_ms
- minimum time in ms to wait between requests to the server
tweet
- if True, TagMe expects a Tweet (default is False)
include_all_spots
- if True, include spots that cannot be linked (default is False)
long_text
- if not None, the context length to use (default: None)
epsilon
- if not None, the epsilon value (float) to use (default: None)
link_pattern
- the URL pattern to use to turn the "title" returned from TagMe into an actual link. The default is "https://{0}.wikipedia.org/wiki/{1}" where {0} gets replaced with the language code and {1} gets replaced with the title.
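As an illustration of the parameters above, a hedged configuration sketch; all values (the token, epsilon, context length, and set name) are placeholders chosen for the example, not recommended settings.

# tag task with a tuned epsilon, a longer context window, and a separate output set
annotator = TagMeAnnotator(
    auth_token="YOUR-GCUBE-TOKEN",  # placeholder; use your own D4Science token
    lang="it",
    epsilon=0.3,      # must be a float (see the constructor assertions)
    long_text=10,     # must be an int
    outset_name="TagMe",
    min_delay_ms=1000,
)
# the tweet flag set at construction time can be overridden per call
doc = annotator(doc, tweet=True)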
Ancestors
- Annotator
- abc.ABC
Inherited members