Module `gatenlp.processing.emojis.emojis`

Module that implements the EmojiAnnotator. Emoji information has been converted from https://github.com/eliasdabbas/advertools

Expand source code

"""
Module that implements the EmojiAnnotator.
Emoji information has been converted from https://github.com/eliasdabbas/advertools
"""
import os
import json
import re
from gatenlp.processing.annotator import Annotator

PATH_E2I = os.path.join(os.path.dirname(__file__), "emojis2info.json")
PATH_ER = os.path.join(os.path.dirname(__file__), "emojiregex.txt")

# load the emoji info when the module is loaded and share the info among all annotators
with open(PATH_E2I, "rt", encoding="utf-8") as infp:
    EMOJI2INFO = json.load(infp)

with open(PATH_ER, "rt", encoding="utf-8") as infp:
    PAT_EMOJI_STR = infp.readline().strip("\n\r")

PAT_EMOJI = re.compile(PAT_EMOJI_STR, re.UNICODE)


class EmojiAnnotator(Annotator):
    """
    Annotator to find emojis in the document text.
    """
    def __init__(self, outset_name="", ann_type="Emoji", string_feature=None):
        """
        Initialize the Emoji annotator. For each emoji found, an annotation is created which
        contains the features codepoint, status, name, group, subgroup.

        Args:
            outset_name: the name of the output annotation set
            ann_type: the annotation type for the annotations to create
            string_feature: if None, the original matched string is not added as a feature, otherwise, the
                name of the feature to use for adding.
        """
        self.outset_name = outset_name
        self.ann_type = ann_type
        self.string_feature = string_feature

    def __call__(self, doc, **kwargs):
        txt = doc.text
        if txt is None:
            return doc
        outset = doc.annset(self.outset_name)
        for m in re.finditer(PAT_EMOJI, txt):
            infos = EMOJI2INFO.get(m.group())
            if infos is not None:
                ann = outset.add(m.start(), m.end(), self.ann_type, features=dict(
                    codepoint=infos[0],
                    status=infos[1],
                    emoji=infos[2],
                    name=infos[3],
                    group=infos[4],
                    subgroup=infos[5]
                ))
                if self.string_feature is not None:
                    ann.features[self.string_feature] = doc[ann]
        return doc

Classes

class EmojiAnnotator (outset_name='', ann_type='Emoji', string_feature=None)

Annotator to find emojis in the document text.

Initialize the Emoji annotator. For each emoji found, an annotation is created which contains the features codepoint, status, name, group, subgroup.

Args

outset_name: the name of the output annotation set
ann_type: the annotation type for the annotations to create
string_feature: if None, the original matched string is not added as a feature, otherwise, the name of the feature to use for adding.

Expand source code

class EmojiAnnotator(Annotator):
    """
    Annotator to find emojis in the document text.
    """
    def __init__(self, outset_name="", ann_type="Emoji", string_feature=None):
        """
        Initialize the Emoji annotator. For each emoji found, an annotation is created which
        contains the features codepoint, status, name, group, subgroup.

        Args:
            outset_name: the name of the output annotation set
            ann_type: the annotation type for the annotations to create
            string_feature: if None, the original matched string is not added as a feature, otherwise, the
                name of the feature to use for adding.
        """
        self.outset_name = outset_name
        self.ann_type = ann_type
        self.string_feature = string_feature

    def __call__(self, doc, **kwargs):
        txt = doc.text
        if txt is None:
            return doc
        outset = doc.annset(self.outset_name)
        for m in re.finditer(PAT_EMOJI, txt):
            infos = EMOJI2INFO.get(m.group())
            if infos is not None:
                ann = outset.add(m.start(), m.end(), self.ann_type, features=dict(
                    codepoint=infos[0],
                    status=infos[1],
                    emoji=infos[2],
                    name=infos[3],
                    group=infos[4],
                    subgroup=infos[5]
                ))
                if self.string_feature is not None:
                    ann.features[self.string_feature] = doc[ann]
        return doc

Ancestors

Annotator
abc.ABC

Inherited members

Annotator:
- __call__
- finish
- pipe
- reduce
- start