# Source code for gatelfdata.vocab

"""Module for the Vocab class"""

from collections import Counter, defaultdict
import logging
import gzip
import re
import numpy as np
import math
# TODO: maybe make use of the gensim library optional?
# import gensim
import sys

# Module-wide logger: messages of level INFO and above are written to stderr,
# formatted as "<timestamp> <logger name> <level> <message>".
logger = logging.getLogger(__name__)
formatter = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
streamhandler = logging.StreamHandler(stream=sys.stderr)
streamhandler.setFormatter(formatter)
logger.addHandler(streamhandler)
logger.setLevel(logging.INFO)

# OK, the protocol for using this is this:
# * create a preliminary instance using "Vocab(...)"
#   This can be a completely empty instance or already contain an initial set of counts using counts=
#   NOTE: if an embedding file should be used, it should be specified at creation time
#   NOTE: emb_train can be "yes", "mapping", "no" or "onehot":
# * n times, add to the counts using v.add_counts() and also set embeddings-information using v.set_xxx()
#   and other info (e.g. additional symbols use) using v.set_xxx() to override what was specified
#   These settings should all be settings which only are relevant once finish() is called
# * Once all the counts have been set, finish the datastructure and prepare for use using v.finish()
# * only after v.finish() has been run are the methods for mapping from and to indices, frequencies, or vectors
#   usable; before finish() these methods will abort with an error!
# * In general:
#   * words which should be in the vocab get mapped to their index
#   * words not in the vocab get mapped to the special "OOV" index (1 by default)
#   * there is the special PAD symbol which is 0 by default
# * Once finish has been completed, a numpy matrix with the embeddings can be retrieved using
#   get_embeddings()
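# * The data which is no longer needed (the raw counts and the per-word embeddings dictionary) is cleaned up
#   by v.finish() itself, controlled by its remove_counts= and remove_embs= parameters.
#   NOTE(review): an earlier version of this comment referred to a separate
#   v.cleanup(embeddings=True,frequencies=True) method after which get_embeddings() raises an exception;
#   the Vocab class in this file does not define such a method.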

# Overview of how indices and embeddings are created using the various configuration settings:
#
# train=yes, file=None:
# * all words in the training set not filtered by emb_minfreq or max_size get mapped to an index
# * all other words get mapped to the OOV index
# * if there are k words, there will be k+2 (OOV, PAD) indices
# * Embeddings: all embeddings are some random vectors, PAD is zero
#
# train=yes, file=embfile
# * index for all words in the training set > minfreq, maxsize which are also in the embeddings file
# * OOV for all other words (random vector)
# * Embeddings: all embeddings from file except OOV (random) and PAD(zero)
# * TODO: at some point could average low-freq vectors in the embeddings
#
# train=no, file=None
# * same as for train=yes, file=None
# * only pytorch layer works differently
#
# train=no, file=embfile
# * same as for train=yes, file=embfile
# * only pytorch layer works differently
#
# train=mapping, file=None
# * signal an ERROR
#
# train=mapping, file=embfile
# * index for all words which are either in our vocab or in the embeddings file, EXCEPT those words which
#   occur in our data but have been filtered out (frequency < minfreq or cut off by maxsize)!
# * so we have to load all embeddings from the file, except those which correspond to the words we
#   have filtered. In addition, all the words not filtered and not in the embeddings get random vectors,
#   and we create an OOV vector as well.
#
# train=onehot, file=none
# * ignore minfreq, dims
# * dims is equal to number of values
# * no OOV dim, only padding, unless suppressed
# * embeddings matrix is diagonal matrix


class Vocab(object):
    """Vocabulary mapping between strings, integer indices and embedding vectors.

    Usage protocol:

    * create an instance, optionally with initial ``counts``;
    * add more counts with :meth:`add_counts` and adjust settings with the ``set_*`` methods;
    * call :meth:`finish` exactly once to build the index and the embeddings matrix;
    * only after that use the lookup methods (``string2idx``, ``idx2string``, ``string2emb``,
      ``string2onehot``, ``onehot2string``, ``get_embeddings``, ``size``).

    Unless disabled, index 0 is the padding symbol and index 1 the OOV (out of vocabulary) symbol.
    """

    def __init__(self, counts=None, max_size=None,
                 emb_id=None, emb_train=None, emb_dims=0, emb_file=None, emb_minfreq=1,
                 no_special_indices=False,
                 pad_index_only=False,
                 emb_dir=None,
                 pad_string="", oov_string="<<oov>>"):
        """Create a preliminary vocabulary instance.

        :param counts: optional initial counts, anything accepted by ``collections.Counter``.
        :param max_size: if set, only the ``max_size`` most frequent entries (this number includes
            the special symbols) are retained by finish().
        :param emb_id: identifier of the embeddings, only used in log/debug output.
        :param emb_train: one of "yes", "mapping", "no", "onehot" or None. If None, finish() picks
            "no" when an embeddings file is set and "yes" otherwise.
        :param emb_dims: number of embedding dimensions; 0 means: determine automatically.
        :param emb_file: embeddings file to load in finish(), or None.
        :param emb_minfreq: words with a lower frequency are removed; a falsy value is treated as 1.
        :param no_special_indices: if true, neither a padding nor an OOV index is created; looking
            up a word not in the vocabulary raises an exception.
        :param pad_index_only: if true, index 0 is reserved for padding but no OOV index is created;
            looking up a word not in the vocabulary raises an exception.
        :param emb_dir: stored on the instance; meant as the directory which relative embeddings
            files are relative to. NOTE(review): no method of this class applies it yet.
        :param pad_string: the string for which the padding index (always 0) is returned.
        :param oov_string: the string for which the OOV index (always 1) is returned.
        :raises Exception: for an unknown emb_train value, for "mapping" without an embeddings file
            and for "onehot" with an embeddings file.

        NOTE: if emb_train is "onehot" and no_special_indices is not set, pad_index_only is
        switched on automatically.
        """
        self.freqs = Counter(counts) if counts else Counter()
        if emb_train == "onehot" and not no_special_indices:
            pad_index_only = True
        self.no_special_indices = no_special_indices
        self.pad_index_only = pad_index_only
        self.emb_minfreq = emb_minfreq or 1
        self.max_size = max_size
        self.emb_dims = emb_dims
        self.emb_id = emb_id
        self.emb_file = emb_file
        self.emb_train = emb_train
        self.itos = None    # list: index -> string, built by finish()
        self.stoi = None    # dict: string -> index, built by finish()
        self.stoe = {}      # dict: string -> embedding vector, filled while finishing
        self.emb_dir = emb_dir
        self.n = 0
        self.pad_string = pad_string
        self.oov_string = oov_string
        self.finished = False
        self.embeddings_loaded = False
        self.embeddings = None
        self.oov_emb = None
        if self.emb_train and self.emb_train not in ["yes", "mapping", "no", "onehot"]:
            raise Exception("Vocab emb_train must be one of yes, mapping, no, onehot but is "+str(self.emb_train))
        if not self.emb_file and self.emb_train == "mapping":
            raise Exception("Vocab emb_train 'mapping' not usable without embeddings file, "
                            "got emb_train=%s and emb_file=%s" % (self.emb_train, self.emb_file))
        if self.emb_file and self.emb_train == "onehot":
            raise Exception("Vocab emb_train 'onehot' not usable with embeddings file, "
                            "got emb_train=%s and emb_file=%s" % (self.emb_train, self.emb_file))
        self.have_vocab = False  # this indicates if we have already built the final vocab
        self.have_pad = not no_special_indices
        self.have_oov = not (no_special_indices or pad_index_only)

    def check_finished(self, method="method"):
        """Raise an Exception naming `method` unless finish() has already been called."""
        if not self.finished:
            raise Exception("Cannot call %s unless the finish() method has been called first!" % (method,))

    def check_nonfinished(self, method="method"):
        """Raise an Exception naming `method` if finish() has already been called."""
        if self.finished:
            raise Exception("Cannot call %s after the finish() method has been called!" % (method,))

    def embs4line(self, line, fromidx, dims):
        """Parse `dims` single-space separated floats from `line`.

        :param line: one (stripped) line of a text embeddings file.
        :param fromidx: index of the space which terminates the word; values start at fromidx+1.
        :param dims: number of values to parse; anything after them is ignored.
        :return: list of `dims` floats.
        :raises ValueError: if the line has fewer than `dims` values or a value is not a float.
        """
        # maxsplit=dims keeps any surplus text in one trailing field which we drop
        fields = line[fromidx + 1:].split(" ", dims)[:dims]
        if len(fields) < dims:
            raise ValueError("Expected %s embedding values but found only %s in line starting with %r" %
                             (dims, len(fields), line[:40]))
        return [float(f) for f in fields]

    def load_embeddings(self, emb_file, filterset=None, load_all=False):
        """Load pre-calculated embeddings from the given file into self.stoe. This will update
        emb_dims as needed!

        Currently only the text formats are supported, identified by the extensions ".txt", ".vec"
        (an alias for ".txt") and ".txt.gz". The extensions ".vocab"/".npy" and ".gensim" are
        recognised but raise a "not implemented" exception. The text formats may or may not have a
        first line that indicates the number of words and the number of dimensions.

        :param emb_file: path of the embeddings file.
        :param filterset: set of words which must not be loaded. If this is non-empty, all embeddings
            not in the set are loaded (as if load_all were true).
        :param load_all: if true, load every embedding which is not in filterset; otherwise (and if
            filterset is empty) load only the embeddings of words already in the vocabulary.
        :raises Exception: for unsupported or not yet implemented file formats.

        NOTE: this will not check if the case conventions or other conventions (e.g. hyphens) for
        the tokens in our vocabulary are compatible with the conventions used for the embeddings.
        """
        if filterset is None:
            filterset = set()
        load_all = load_all or bool(filterset)
        if emb_file.endswith((".vocab", ".npy")):
            raise Exception("TODO: format .vocab/.npy not yet implemented!")
        if emb_file.endswith(".gensim"):
            # do not load a (possibly huge) model only to report that it cannot be used yet
            raise Exception(".gensim format for embeddings not yet implemented")
        if not emb_file.endswith((".txt", ".vec", ".txt.gz")):
            raise Exception("Embeddings file must have one of the extensions: "
                            ".txt, .vec, .txt.gz, .vocab, .npy")

        reader = gzip.open if emb_file.endswith(".txt.gz") else open
        n_lines = 0
        n_added = 0
        n_expected = 0
        n_vocab = len(self.itos)
        # TODO: if emb_file is relative, try to make it relative to the directory where the metafile is
        logger.info("Loading embeddings for %s from %s (%s words)", self.emb_id, emb_file, n_vocab)
        with reader(emb_file, 'rt', encoding="utf-8") as infile:
            for line in infile:
                # stopping early is only correct if we just want the words of our own vocabulary
                if not load_all and n_added == n_vocab:
                    logger.info("Got all %s embeddings needed, stopping reading the embeddings file", n_vocab)
                    break
                line = line.strip()
                if not line:
                    # a blank line would otherwise be taken for the (empty) default pad string
                    continue
                if n_lines == 0:
                    m = re.match(r'^\s*([0-9]+)\s+([0-9]+)\s*$', line)
                    if m:
                        n_expected = int(m.group(1))
                        self.emb_dims = int(m.group(2))
                        n_lines += 1
                        continue
                    # assume the first line is already an embedding line and get dims from there
                    self.emb_dims = len(line.split())-1
                    n_expected = -1
                n_lines += 1
                if n_lines % 100000 == 0:
                    logger.info("Read lines from embeddings file: %s of %s, added words: %s of %s",
                                n_lines, n_expected, n_added, n_vocab)
                toidx = line.find(" ")
                if toidx < 0:
                    logger.warning("Ignoring embeddings line %s without any values: %r", n_lines, line[:40])
                    continue
                word = line[0:toidx]
                if load_all:
                    wanted = word not in filterset
                else:
                    wanted = word in self.stoi
                if wanted:
                    if word not in self.stoe:
                        n_added += 1
                    self.stoe[word] = self.embs4line(line, toidx, self.emb_dims)
        self.embeddings_loaded = True
        logger.info("Embeddings for \"%s\" loaded: %s, dims=%s", self.emb_id, n_added, self.emb_dims)

    def get_embeddings(self):
        """Return a numpy matrix of the embeddings in the order of the indices. If this is called
        before finish() an exception is raised"""
        self.check_finished("get_embeddings")
        return self.embeddings

    @staticmethod
    def rnd_vec(dims, strng=None, as_numpy=True):
        """Return a random float32 vector of the given dimensions, each value drawn from a gaussian(0,1).

        If strng is None, the vector depends on (and advances) the current global numpy random
        state. If a string is given, a private random generator is seeded with a checksum of the
        string, so the vector is always the same for that string and number of dimensions and the
        global numpy random state is left untouched.

        :param dims: number of dimensions.
        :param strng: optional string to derive the vector from.
        :param as_numpy: if true return a numpy array, otherwise a list of numpy float32 values.
        """
        if strng is None:
            vec = np.random.randn(dims)
        else:
            import zlib
            # crc32 is stable across processes (unlike hash()) and always a valid 32 bit seed
            seed = zlib.crc32(str(strng).encode("utf-8")) & 0xffffffff
            vec = np.random.RandomState(seed).randn(dims)
        vec = vec.astype(np.float32)
        return vec if as_numpy else list(vec)

    def zero_vec(self, as_numpy=True):
        """Return an all-zero float32 vector with emb_dims dimensions (numpy array or list)."""
        vec = np.zeros(self.emb_dims, np.float32)
        return vec if as_numpy else list(vec)

    def add_counts(self, counts):
        """Incrementally add additional counts to the vocabulary. This can be done only before the finish
        method is called"""
        self.check_nonfinished("add_counts")
        self.freqs.update(counts)

    def set_emb_minfreq(self, min_freq=1):
        """Set the minimum frequency; a falsy value is treated as 1. Only allowed before finish()."""
        self.check_nonfinished("set_emb_minfreq")
        self.emb_minfreq = min_freq or 1

    def set_max_size(self, max_size=None):
        """Set the maximum vocabulary size (None: unlimited). Only allowed before finish()."""
        self.check_nonfinished("set_max_size")
        self.max_size = max_size

    def set_emb_id(self, embid):
        """Set the embeddings id used in log output. Only allowed before finish()."""
        self.check_nonfinished("set_emb_id")
        self.emb_id = embid

    def set_emb_file(self, file):
        """Set the embeddings file to load in finish(). Only allowed before finish()."""
        self.check_nonfinished("set_emb_file")
        self.emb_file = file

    def set_emb_dims(self, dim):
        """Set the number of embedding dimensions. Only allowed before finish()."""
        self.check_nonfinished("set_emb_dims")
        self.emb_dims = dim

    def _special_symbols(self):
        """Return the list of special symbols in index order: pad (if used), then oov (if used)."""
        specials = []
        if self.have_pad:
            specials.append(self.pad_string)
        if self.have_oov:
            specials.append(self.oov_string)
        return specials

    def _reindex(self):
        """Rebuild the string->index map and the size from the current itos list."""
        self.stoi = defaultdict(int)
        for i, s in enumerate(self.itos):
            self.stoi[s] = i
        self.n = len(self.itos)

    def finish(self, remove_counts=True, remove_embs=True):
        """Build the actual vocab instance, it can only be used properly to look-up things after calling
        this method, but no parameters can be changed nor counts added after this.

        :param remove_counts: if true, drop the frequency counts once the vocabulary is built.
        :param remove_embs: if true, drop the per-word embeddings dict (the matrix is kept).
        :raises Exception: if finish() has already been called.
        """
        self.check_nonfinished("finish")

        # if the emb_train parameter was never set use our own default: "no" if an embeddings
        # file is specified, otherwise "yes"
        if not self.emb_train:
            self.emb_train = "no" if self.emb_file else "yes"

        # make sure the special "words" which may be included in the frequencies get ignored:
        # we do this by removing the entries at this point, if they exist
        if self.have_oov and self.oov_string in self.freqs:
            logger.debug("OOV symbol removed from frequencies, freq=%s, id=%s",
                         self.freqs[self.oov_string], self.emb_id)
            del self.freqs[self.oov_string]
        if self.have_pad and self.pad_string in self.freqs:
            logger.debug("Pad symbol removed from frequencies, freq=%s, id=%s",
                         self.freqs[self.pad_string], self.emb_id)
            del self.freqs[self.pad_string]

        # split the words into those satisfying the emb_minfreq limit and the filtered ones
        filtered_words = set()
        kept = []
        print("Finishing vocab ", self.emb_id, "before filtering: ", len(self.freqs), file=sys.stderr)
        for s, freq in self.freqs.items():
            if freq >= self.emb_minfreq:
                kept.append(s)
            else:
                filtered_words.add(s)
        print("Vocab", self.emb_id, "after minfreq filtering: ", len(kept), file=sys.stderr)
        # most frequent first, words with equal frequency in ascending alphabetical order
        kept.sort(key=lambda w: (-self.freqs[w], w))
        # the special symbols, if any, always come first: pad at index 0, oov at index 1
        self.itos = self._special_symbols() + kept
        # trim the itos according to max_size and add any trimmed words to the filtered_words set
        if self.max_size and len(self.itos) > self.max_size:
            filtered_words.update(self.itos[self.max_size:])
            self.itos = self.itos[:self.max_size]
        self._reindex()
        print("Vocab", self.emb_id, "final: ", self.n, file=sys.stderr)
        if self.emb_train == "onehot":
            # one dimension per value, the padding symbol (if any) does not get a dimension
            self.emb_dims = self.n - 1 if self.have_pad else self.n

        if not self.emb_file and not self.emb_dims:
            # heuristic default; max() guards log2 against a completely empty vocabulary
            self.emb_dims = int(math.log2(max(self.n, 1))**1.8)+1

        if self.emb_file:
            if self.emb_train == "mapping":
                # load all embeddings except those of the words we filtered from our own data
                self.load_embeddings(self.emb_file, filterset=filtered_words, load_all=True)
                # words of our vocab which are not in the embeddings file get a random vector
                for s in self.itos:
                    if s not in self.stoe:
                        self.stoe[s] = self.rnd_vec(dims=self.emb_dims, strng=s)
                # words only known from the embeddings file get appended to the index; iterating
                # the dict (not a set) keeps the order of the file and thus reproducible indices
                for s in self.stoe:
                    if s not in self.stoi:
                        self.stoi[s] = len(self.itos)
                        self.itos.append(s)
                self.n = len(self.itos)
            else:
                # load only the embeddings of words in our vocab, then remove the vocab words
                # for which the file has no embedding (they will map to OOV)
                self.load_embeddings(self.emb_file)
                specials = set(self._special_symbols())
                self.itos = [w for w in self.itos if w in specials or w in self.stoe]
                self._reindex()
            # now if necessary add the padding and oov vectors
            if self.have_oov:
                self.stoe[self.oov_string] = self.rnd_vec(dims=self.emb_dims, strng=self.oov_string)
            if self.have_pad:
                self.stoe[self.pad_string] = self.zero_vec()
            # every word in itos now has a vector: create the actual embeddings matrix
            self.embeddings = np.zeros((self.n, self.emb_dims), np.float32)
            for idx, s in enumerate(self.itos):
                self.embeddings[idx] = self.stoe[s]
        elif self.emb_train == "onehot":
            # (shifted) identity matrix: the padding row, if present, stays all zero
            self.embeddings = np.zeros((self.n, self.emb_dims), np.float32)
            first = 1 if self.have_pad else 0
            for col, row in enumerate(range(first, self.n)):
                self.embeddings[row, col] = 1.0
        else:
            self.embeddings = np.random.randn(self.n, self.emb_dims).astype(np.float32)
            # override the padding vector with a zero vector if needed:
            if self.have_pad:
                self.embeddings[0] = np.zeros(self.emb_dims, np.float32)

        # cleanup what we do not need any more
        if remove_embs:
            self.stoe = None
        if remove_counts:
            self.freqs = None
        self.finished = True

    def idx2string(self, idx):
        """Return the string for this index"""
        self.check_finished("idx2string")
        if idx >= len(self.itos):
            raise Exception("Vocab: index larger than vocabulary size")
        return self.itos[idx]

    def string2idx(self, string):
        """Return the index for the string, the OOV index for an unknown string, or raise an
        Exception if the string is unknown and there is no OOV symbol."""
        self.check_finished("string2idx")
        if string in self.stoi:
            return self.stoi[string]  # NOTE: the pad string is in there!
        if self.have_oov:
            return self.stoi[self.oov_string]
        # not a proper word and no oov symbol: for now throw an exception, this should probably never happen
        raise Exception("String not found in vocab and do not have OOV symbol either: %s" % string)

    def string2emb(self, string):
        """Return the embedding vector (a row of the embeddings matrix) for the string, the OOV
        vector for an unknown string, or raise an Exception if there is no OOV symbol."""
        self.check_finished("string2emb")
        if self.embeddings is None:
            raise Exception("Cannot get embedding vector, no embeddings matrix")
        if string in self.stoi:
            return self.embeddings[self.stoi[string]]
        if self.have_oov:
            return self.embeddings[self.stoi[self.oov_string]]
        raise Exception("Cannot return embedding vector, string not found and no OOV symbol: %s" % string)

    def string2onehot(self, thestring):
        """Return a one-hot vector (list of floats) for the string. If we have an oov index, return that
        for unknown words, otherwise raise an exception. If the string is the padding string, return an
        all zero vector.
        NOTE: this can be called even if the emb_train parameter was not equal to 'onehot' when creating the
        vocabulary. In that case, there may be an OOV symbol in the vocab and the onehot vector generated will
        contain it as its first dimension."""
        self.check_finished("string2onehot")
        vec = self.zero_onehotvec()
        if self.have_pad and thestring == self.pad_string:
            return vec
        if thestring in self.stoi:
            pos = self.stoi[thestring]
        elif self.have_oov:
            pos = self.stoi[self.oov_string]
        else:
            raise Exception("String not found in vocab and no OOV symbol: %s" % (thestring,))
        if self.have_pad:
            pos -= 1  # the padding symbol has no dimension of its own
        vec[pos] = 1.0
        return vec

    def zero_onehotvec(self):
        """Return an all-zero list with one element per vocabulary entry, not counting padding."""
        length = len(self.itos)
        if self.have_pad:
            length -= 1
        return [0.0] * length

    def onehot2string(self, vec):
        """Return the string for a one-hot vector (any sequence, e.g. a list or 1-d numpy array).
        An all-zero vector maps to the padding string if there is one; anything which does not
        sum to 1 raises an Exception."""
        self.check_finished("onehot2string")
        values = list(vec)
        total = sum(values)
        if self.have_pad and total == 0.0:
            return self.pad_string
        if total != 1.0:
            raise Exception("Not a proper one-hot vector: %s" % (vec,))
        idx = values.index(1.0)
        if self.have_pad:
            idx += 1
        return self.itos[idx]

    def count(self, strng):
        """Return the count/frequency for the given word, 0 if the word has no count.
        Raises an Exception if the counts have been removed (see remove_counts of finish())."""
        if self.freqs is None:
            raise Exception("Cannot retrieve count, data has been removed")
        return self.freqs.get(strng) or 0

    def size(self):
        """Return the total number of entries in the vocab, including any special symbols"""
        self.check_finished("size")
        return len(self.itos)

    def __str__(self):
        n_entries = len(self.stoi) if self.stoi is not None else 0
        return self.__repr__()+":nentries=%d" % n_entries

    def __repr__(self):
        # itos/stoi are None until finish() has run, so this must also work before that
        tmp_entries = list(self.itos[:20]) if self.itos is not None else []
        n_entries = len(self.stoi) if self.stoi is not None else 0
        return "Vocab(n=%d,emb_id=%r,emb_train=%r,emb_file=%r,emb_dims=%s,entries=%s)" % \
               (n_entries, self.emb_id, self.emb_train, self.emb_file, self.emb_dims, tmp_entries)