"""Module for the Features class"""
import sys
import logging
from gatelfdata.featurenumeric import FeatureNumeric
from gatelfdata.featurenominalembs import FeatureNominalEmbs
from gatelfdata.featureboolean import FeatureBoolean
from gatelfdata.featurengram import FeatureNgram
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
streamhandler = logging.StreamHandler(stream=sys.stderr)
formatter = logging.Formatter(
'%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
streamhandler.setFormatter(formatter)
logger.addHandler(streamhandler)
[docs]class Features(object):
[docs] def make_feature(self, fname, datatype, attribute, featurestats, vocabs):
"""Helper function to create a specific feature gets called as part of __init__"""
kind = attribute["featureCode"]
logger.debug("Making feature for kind/name/type/attr: %r/%r/%r/%r", kind, fname, datatype, attribute)
if kind == "N":
# create an ngram feature, based on a simple feature of type nominal
ret = FeatureNgram(fname, attribute, featurestats, vocabs.get_vocab(attribute))
else:
# create a simple feature of the correct type
if datatype == "nominal":
# create a nominal feature, represented through embeddings or onehot
# We represent both by featurenominalembs, both get converted into a value
# index initiallly. However for onehot, the embedding vectors are just the onehot
# vectors (except for padding which is still an all-zero vector).
logger.debug("About to create feature, vocabs is %s" % (vocabs,))
ret = FeatureNominalEmbs(fname, attribute, featurestats, vocabs.get_vocab(attribute))
elif datatype == "numeric":
# simple numeric feature
ret = FeatureNumeric(fname, attribute, featurestats)
elif datatype == "boolean":
# simple boolean feature
ret = FeatureBoolean(fname, attribute, featurestats)
else:
raise Exception("Odd datatype: ", datatype)
logger.debug("Returning: %r", ret)
return ret
def __init__(self, meta, vocabs):
# initialisation consists of going through the meta info and
# creating all the individual feature instances and storing them
# in here in a list.
# NOTE: we should go through the actual features, not the attributes, so we do
# not really need anything that represents an attributelist since this is
# just a fixed number of simple attributes.
# meta: either a string or the meta information already read in and parsed.
self.meta = meta
self.vocabs = vocabs
self.isSequence = meta["isSequence"]
if self.isSequence:
self.seq_max = meta["sequLengths.max"]
self.seq_avg = meta["sequLengths.mean"]
# now we have the meta, create the list of features
self.features = []
attrs = self.meta["featureInfo"]["attributes"]
stats = self.meta["featureStats"]
# The LF metadata is per feature, not per embedding type of the feature, so
# we first need to combine the counts per feature for each of the types here.
for f in self.meta["features"]:
dt = f["datatype"]
attrnr = f["attrid"]
attrinfo = attrs[attrnr]
# attrcode = attrinfo.get("code")
if dt == "nominal":
self.vocabs.setup_vocab(attrinfo, stats[f["name"]])
self.vocabs.finish()
for f in self.meta["features"]:
dt = f["datatype"]
attrnr = f["attrid"]
fname = f["name"]
# attrkind = f["kind"]
# get a bit more info from the corresponding attribute metadata
attrinfo = attrs[attrnr]
fstats = stats[fname]
thefeature = self.make_feature(fname, dt, attrinfo, fstats, self.vocabs)
logger.debug("Features: appending feature=%r", thefeature)
self.features.append(thefeature)
def _convert_featurevec(self, valuelist, idxs=None, normalize=None):
if not idxs and (len(valuelist) != len(self.features)):
raise Exception("Wrong number of values passed, expected", len(self.features), "got", len(valuelist))
if idxs and len(idxs) > len(valuelist):
raise Exception("Wrong number of idxs passed, got", len(idxs), "but got values:", len(valuelist))
if idxs and len(idxs) > len(self.features):
raise Exception("Wrong number of idxs passed, got", len(idxs), "but got features:", len(self.features))
if idxs:
valuelist = [valuelist[i] for i in idxs]
features = [self.features[i] for i in idxs]
else:
features = self.features
values = []
for i in range(len(features)):
res = features[i](valuelist[i], normalize=normalize)
values.append(res)
return values
def __iter__(self):
return iter(self.features)
def __getitem__(self, item):
return self.features[item]
def __call__(self, valuelist, idxs=None, normalize=None):
# For a feature vector:
# this will go through each input and run it through the stored feature
# instance, and the values will get put into the result list and returned
# Note that for ngram attributes, the "value" to put into the list is itself a list
# (of embedding indices).
# For a sequence of feature vectors: each feature vector gets converted
# in the normal way, targets as well
# NOTE: not sure yet how to handle nominals that are onehot encoded! In some cases
# we want to instances in some we want the vectors .. see featurenominal1ofk
if self.isSequence:
out_indep = []
for fv in valuelist:
out_indep.append(self._convert_featurevec(fv, idxs=idxs))
return out_indep
else:
values = self._convert_featurevec(valuelist, idxs=idxs)
return values
def __call__OLD(self, valuelist, idxs=None):
# For a feature vector:
# this will go through each input and run it through the stored feature
# instance, and the values will get put into the result list and returned
# Note that for ngram attributes, the "value" to put into the list is itself a list
# (of embedding indices).
# For a sequence of feature vectors: will return a list/vector
# for each feature where each element corresponds to a sequence element
# So the representation gets changed from a list of feature vectors
# of values to a list of values for each feature
if self.isSequence:
# for now we do this in an easy to understand but maybe slow way:
# first go convert each of the feature vectors in the sequence
# then convert the resulting list of lists
seqofvecs = []
for el in valuelist:
vals4featurevec = self._convert_featurevec(el, idxs=idxs)
seqofvecs.append(vals4featurevec)
# now each element in sequofvecs should have as many elements
# as there are features, just transpose that matrix
return [l for l in map(list, zip(*seqofvecs))]
else:
values = self._convert_featurevec(valuelist, idxs=idxs)
return values
[docs] def size(self):
return len(self.features)
def __repr__(self):
fl = [f.__repr__() for f in self.features]
return "Features(features=%r)" % fl
def __str__(self):
fl = [f.__str__() for f in self.features]
return "Features("+",".join(fl)+")"
[docs] def pretty_print(self, file=sys.stdout):
print("Features:", file=file)
for f in self.features:
print(" ", f, file=file)